PKG_CONFIG?=pkg-config
INSTALL=install
-CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC \
+CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC \
-Wno-pointer-sign \
-fno-strict-aliasing \
-fno-delete-null-pointer-checks \
CFLAGS+=$(call cc-disable-warning, shift-overflow)
CFLAGS+=$(call cc-disable-warning, enum-conversion)
-PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib liblz4 libzstd libudev"
+PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib liblz4 libzstd libudev libkeyutils"
ifdef BCACHEFS_FUSE
PKGCONFIG_LIBS+="fuse3 >= 3.7"
CFLAGS+=-DBCACHEFS_FUSE
git add include/linux/list_nulls.h
cp $(LINUX_DIR)/include/linux/poison.h include/linux/
git add include/linux/poison.h
+ cp $(LINUX_DIR)/include/linux/generic-radix-tree.h include/linux/
+ git add include/linux/generic-radix-tree.h
+ cp $(LINUX_DIR)/lib/generic-radix-tree.c linux/
+ git add linux/generic-radix-tree.c
+ cp $(LINUX_DIR)/include/linux/kmemleak.h include/linux/
+ git add include/linux/kmemleak.h
+ cp $(LINUX_DIR)/include/linux/printbuf.h include/linux/
+ git add include/linux/printbuf.h
+ cp $(LINUX_DIR)/lib/printbuf.c linux/
+ git add linux/printbuf.c
+ cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
+ git add linux/mean_and_variance.c
+ cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
+ git add include/linux/mean_and_variance.h
+ cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
+ git add linux/int_sqrt.c
cp $(LINUX_DIR)/scripts/Makefile.compiler ./
git add Makefile.compiler
$(RM) libbcachefs/*.mod.c
# automatically cleaned up.
try-run = $(shell set -e; \
TMP=$(TMPOUT)/tmp; \
- mkdir -p $(TMPOUT); \
trap "rm -rf $(TMPOUT)" EXIT; \
+ mkdir -p $(TMPOUT); \
if ($(1)) >/dev/null 2>&1; \
then echo "$(2)"; \
else echo "$(3)"; \
You need to do this before you create a volume.
.Pp
Device specific options must come before corresponding devices, e.g.
-.Dl bcachefs format --group=ssd /dev/sda --label=hdd /dev/sdb
+.Dl bcachefs format --label=ssd /dev/sda --label=hdd /dev/sdb
.Bl -tag -width Ds
.It Fl b , Fl -block Ns = Ns Ar size
block size, in bytes (e.g. 4k)
.El
.It Nm Ic device Ic evacuate Ar device
Move data off of a given device
-.It Nm Ic device Ic set-state Oo Ar options Oc Ar device Ar new-state
+.It Nm Ic device Ic set-state Oo Ar options Oc Ar new-state Ar device
.Bl -tag -width Ds
+.It Ar new-state Ns = Ns ( Ar rw | ro | failed | spare )
.It Fl f , Fl -force
Force, if data redundancy will be degraded
.El
"Superblock commands:\n"
" format Format a new filesystem\n"
" show-super Dump superblock information to stdout\n"
+ " set-option Set a filesystem option\n"
"\n"
"Repair:\n"
" fsck Check an existing filesystem for errors\n"
" device resize-journal Resize journal on a device\n"
"\n"
"Commands for managing subvolumes and snapshots:\n"
- " subvolume create Create a new subvolume\n"
- " subvolume delete Delete an existing subvolume\n"
- " subvolume snapshot Create a snapshot\n"
+ " subvolume create Create a new subvolume\n"
+ " subvolume delete Delete an existing subvolume\n"
+ " subvolume snapshot Create a snapshot\n"
"\n"
"Commands for managing filesystem data:\n"
" data rereplicate Rereplicate degraded data\n"
return cmd_version(argc, argv);
if (!strcmp(cmd, "show-super"))
return cmd_show_super(argc, argv);
+ if (!strcmp(cmd, "set-option"))
+ return cmd_set_option(argc, argv);
if (argc < 2) {
printf("%s: missing command\n", argv[0]);
return cmd_list(argc, argv);
if (!strcmp(cmd, "list_journal"))
return cmd_list_journal(argc, argv);
+ if (!strcmp(cmd, "kill_btree_node"))
+ return cmd_kill_btree_node(argc, argv);
if (!strcmp(cmd, "setattr"))
return cmd_setattr(argc, argv);
+++ /dev/null
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+++ /dev/null
-#include "config.h"
-#include <stdio.h>
-#include <string.h>
-
-#include "ccan/darray/darray.h"
-
-/**
- * darray - Generic resizable arrays
- *
- * darray is a set of macros for managing dynamically-allocated arrays.
- * It removes the tedium of managing realloc'd arrays with pointer, size, and
- * allocated size.
- *
- * Example:
- * #include <ccan/darray/darray.h>
- * #include <stdio.h>
- *
- * int main(void) {
- * darray(int) numbers = darray_new();
- * char buffer[32];
- *
- * for (;;) {
- * int *i;
- * darray_foreach(i, numbers)
- * printf("%d ", *i);
- * if (darray_size(numbers) > 0)
- * puts("");
- *
- * printf("darray> ");
- * fgets(buffer, sizeof(buffer), stdin);
- * if (*buffer == '\0' || *buffer == '\n')
- * break;
- *
- * darray_append(numbers, atoi(buffer));
- * }
- *
- * darray_free(numbers);
- *
- * return 0;
- * }
- *
- * Author: Joey Adams <joeyadams3.14159@gmail.com>
- * License: MIT
- * Version: 0.2
- */
-int main(int argc, char *argv[])
-{
- if (argc != 2)
- return 1;
-
- if (strcmp(argv[1], "depends") == 0) {
- /* Nothing. */
- return 0;
- }
-
- return 1;
-}
+++ /dev/null
-/*
- * Copyright (C) 2011 Joseph Adams <joeyadams3.14159@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef CCAN_DARRAY_H
-#define CCAN_DARRAY_H
-
-#include <stdlib.h>
-#include <string.h>
-#include "config.h"
-
-/*
- * SYNOPSIS
- *
- * Life cycle of a darray (dynamically-allocated array):
- *
- * darray(int) a = darray_new();
- * darray_free(a);
- *
- * struct {darray(int) a;} foo;
- * darray_init(foo.a);
- * darray_free(foo.a);
- *
- * Typedefs for darrays of common types:
- *
- * darray_char, darray_schar, darray_uchar
- * darray_short, darray_int, darray_long
- * darray_ushort, darray_uint, darray_ulong
- *
- * Access:
- *
- * T darray_item(darray(T) arr, size_t index);
- * size_t darray_size(darray(T) arr);
- * size_t darray_alloc(darray(T) arr);
- * bool darray_empty(darray(T) arr);
- *
- * Insertion (single item):
- *
- * void darray_append(darray(T) arr, T item);
- * void darray_prepend(darray(T) arr, T item);
- * void darray_push(darray(T) arr, T item); // same as darray_append
- *
- * Insertion (multiple items):
- *
- * void darray_append_items(darray(T) arr, T *items, size_t count);
- * void darray_prepend_items(darray(T) arr, T *items, size_t count);
- *
- * void darray_appends(darray(T) arr, [T item, [...]]);
- * void darray_prepends(darray(T) arr, [T item, [...]]);
- *
- * // Same functionality as above, but does not require typeof.
- * void darray_appends_t(darray(T) arr, #T, [T item, [...]]);
- * void darray_prepends_t(darray(T) arr, #T, [T item, [...]]);
- *
- * Removal:
- *
- * T darray_pop(darray(T) arr | darray_size(arr) != 0);
- * T* darray_pop_check(darray(T*) arr);
- * void darray_remove(darray(T) arr, size_t index);
- *
- * Replacement:
- *
- * void darray_from_items(darray(T) arr, T *items, size_t count);
- * void darray_from_c(darray(T) arr, T c_array[N]);
- *
- * String buffer:
- *
- * void darray_append_string(darray(char) arr, const char *str);
- * void darray_append_lit(darray(char) arr, char stringLiteral[N+1]);
- *
- * void darray_prepend_string(darray(char) arr, const char *str);
- * void darray_prepend_lit(darray(char) arr, char stringLiteral[N+1]);
- *
- * void darray_from_string(darray(T) arr, const char *str);
- * void darray_from_lit(darray(char) arr, char stringLiteral[N+1]);
- *
- * Size management:
- *
- * void darray_resize(darray(T) arr, size_t newSize);
- * void darray_resize0(darray(T) arr, size_t newSize);
- *
- * void darray_realloc(darray(T) arr, size_t newAlloc);
- * void darray_growalloc(darray(T) arr, size_t newAlloc);
- *
- * void darray_make_room(darray(T) arr, size_t room);
- *
- * Traversal:
- *
- * darray_foreach(T *&i, darray(T) arr) {...}
- * darray_foreach_reverse(T *&i, darray(T) arr) {...}
- *
- * Except for darray_foreach, darray_foreach_reverse, and darray_remove,
- * all macros evaluate their non-darray arguments only once.
- */
-
-/*** Life cycle ***/
-
-#define darray(type) struct {type *item; size_t size; size_t alloc;}
-
-#define darray_new() {0,0,0}
-#define darray_init(arr) do {(arr).item=0; (arr).size=0; (arr).alloc=0;} while(0)
-#define darray_free(arr) do {free((arr).item);} while(0)
-
-
-/*
- * Typedefs for darrays of common types. These are useful
- * when you want to pass a pointer to an darray(T) around.
- *
- * The following will produce an incompatible pointer warning:
- *
- * void foo(darray(int) *arr);
- * darray(int) arr = darray_new();
- * foo(&arr);
- *
- * The workaround:
- *
- * void foo(darray_int *arr);
- * darray_int arr = darray_new();
- * foo(&arr);
- */
-
-typedef darray(char) darray_char;
-typedef darray(signed char) darray_schar;
-typedef darray(unsigned char) darray_uchar;
-
-typedef darray(short) darray_short;
-typedef darray(int) darray_int;
-typedef darray(long) darray_long;
-
-typedef darray(unsigned short) darray_ushort;
-typedef darray(unsigned int) darray_uint;
-typedef darray(unsigned long) darray_ulong;
-
-
-/*** Access ***/
-
-#define darray_item(arr, i) ((arr).item[i])
-#define darray_size(arr) ((arr).size)
-#define darray_alloc(arr) ((arr).alloc)
-#define darray_empty(arr) ((arr).size == 0)
-
-
-/*** Insertion (single item) ***/
-
-#define darray_append(arr, ...) do { \
- darray_resize(arr, (arr).size+1); \
- (arr).item[(arr).size-1] = (__VA_ARGS__); \
- } while(0)
-#define darray_prepend(arr, ...) do { \
- darray_resize(arr, (arr).size+1); \
- memmove((arr).item+1, (arr).item, ((arr).size-1)*sizeof(*(arr).item)); \
- (arr).item[0] = (__VA_ARGS__); \
- } while(0)
-#define darray_push(arr, ...) darray_append(arr, __VA_ARGS__)
-
-
-/*** Insertion (multiple items) ***/
-
-#define darray_append_items(arr, items, count) do { \
- size_t __count = (count), __oldSize = (arr).size; \
- darray_resize(arr, __oldSize + __count); \
- memcpy((arr).item + __oldSize, items, __count * sizeof(*(arr).item)); \
- } while(0)
-
-#define darray_prepend_items(arr, items, count) do { \
- size_t __count = (count), __oldSize = (arr).size; \
- darray_resize(arr, __count + __oldSize); \
- memmove((arr).item + __count, (arr).item, __oldSize * sizeof(*(arr).item)); \
- memcpy((arr).item, items, __count * sizeof(*(arr).item)); \
- } while(0)
-
-#define darray_append_items_nullterminate(arr, items, count) do { \
- size_t __count = (count), __oldSize = (arr).size; \
- darray_resize(arr, __oldSize + __count + 1); \
- memcpy((arr).item + __oldSize, items, __count * sizeof(*(arr).item)); \
- (arr).item[--(arr).size] = 0; \
- } while(0)
-
-#define darray_prepend_items_nullterminate(arr, items, count) do { \
- size_t __count = (count), __oldSize = (arr).size; \
- darray_resize(arr, __count + __oldSize + 1); \
- memmove((arr).item + __count, (arr).item, __oldSize * sizeof(*(arr).item)); \
- memcpy((arr).item, items, __count * sizeof(*(arr).item)); \
- (arr).item[--(arr).size] = 0; \
- } while(0)
-
-#if HAVE_TYPEOF
-#define darray_appends(arr, ...) darray_appends_t(arr, typeof((*(arr).item)), __VA_ARGS__)
-#define darray_prepends(arr, ...) darray_prepends_t(arr, typeof((*(arr).item)), __VA_ARGS__)
-#endif
-
-#define darray_appends_t(arr, type, ...) do { \
- type __src[] = {__VA_ARGS__}; \
- darray_append_items(arr, __src, sizeof(__src)/sizeof(*__src)); \
- } while(0)
-#define darray_prepends_t(arr, type, ...) do { \
- type __src[] = {__VA_ARGS__}; \
- darray_prepend_items(arr, __src, sizeof(__src)/sizeof(*__src)); \
- } while(0)
-
-
-/*** Removal ***/
-
-/* Warning: Do not call darray_pop on an empty darray. */
-#define darray_pop(arr) ((arr).item[--(arr).size])
-#define darray_pop_check(arr) ((arr).size ? darray_pop(arr) : NULL)
-/* Warning, slow: Requires copying all elements after removed item. */
-#define darray_remove(arr, index) do { \
- if (index < arr.size-1) \
- memmove(&(arr).item[index], &(arr).item[index+1], ((arr).size-1-i)*sizeof(*(arr).item)); \
- (arr).size--; \
- } while(0)
-
-
-/*** Replacement ***/
-
-#define darray_from_items(arr, items, count) do {size_t __count = (count); darray_resize(arr, __count); memcpy((arr).item, items, __count*sizeof(*(arr).item));} while(0)
-#define darray_from_c(arr, c_array) darray_from_items(arr, c_array, sizeof(c_array)/sizeof(*(c_array)))
-
-
-/*** String buffer ***/
-
-#define darray_append_string(arr, str) do {const char *__str = (str); darray_append_items(arr, __str, strlen(__str)+1); (arr).size--;} while(0)
-#define darray_append_lit(arr, stringLiteral) do {darray_append_items(arr, stringLiteral, sizeof(stringLiteral)); (arr).size--;} while(0)
-
-#define darray_prepend_string(arr, str) do { \
- const char *__str = (str); \
- darray_prepend_items_nullterminate(arr, __str, strlen(__str)); \
- } while(0)
-#define darray_prepend_lit(arr, stringLiteral) \
- darray_prepend_items_nullterminate(arr, stringLiteral, sizeof(stringLiteral) - 1)
-
-#define darray_from_string(arr, str) do {const char *__str = (str); darray_from_items(arr, __str, strlen(__str)+1); (arr).size--;} while(0)
-#define darray_from_lit(arr, stringLiteral) do {darray_from_items(arr, stringLiteral, sizeof(stringLiteral)); (arr).size--;} while(0)
-
-
-/*** Size management ***/
-
-#define darray_resize(arr, newSize) darray_growalloc(arr, (arr).size = (newSize))
-#define darray_resize0(arr, newSize) do { \
- size_t __oldSize = (arr).size, __newSize = (newSize); \
- (arr).size = __newSize; \
- if (__newSize > __oldSize) { \
- darray_growalloc(arr, __newSize); \
- memset(&(arr).item[__oldSize], 0, (__newSize - __oldSize) * sizeof(*(arr).item)); \
- } \
- } while(0)
-
-#define darray_realloc(arr, newAlloc) do { \
- (arr).item = realloc((arr).item, ((arr).alloc = (newAlloc)) * sizeof(*(arr).item)); \
- } while(0)
-#define darray_growalloc(arr, need) do { \
- size_t __need = (need); \
- if (__need > (arr).alloc) \
- darray_realloc(arr, darray_next_alloc((arr).alloc, __need)); \
- } while(0)
-
-#if HAVE_STATEMENT_EXPR==1
-#define darray_make_room(arr, room) ({size_t newAlloc = (arr).size+(room); if ((arr).alloc<newAlloc) darray_realloc(arr, newAlloc); (arr).item+(arr).size; })
-#endif
-
-static inline size_t darray_next_alloc(size_t alloc, size_t need)
-{
- if (alloc == 0)
- alloc = 1;
- while (alloc < need)
- alloc *= 2;
- return alloc;
-}
-
-
-/*** Traversal ***/
-
-/*
- * darray_foreach(T *&i, darray(T) arr) {...}
- *
- * Traverse a darray. `i` must be declared in advance as a pointer to an item.
- */
-#define darray_foreach(i, arr) \
- for ((i) = &(arr).item[0]; (i) < &(arr).item[(arr).size]; (i)++)
-
-/*
- * darray_foreach_reverse(T *&i, darray(T) arr) {...}
- *
- * Like darray_foreach, but traverse in reverse order.
- */
-#define darray_foreach_reverse(i, arr) \
- for ((i) = &(arr).item[(arr).size]; (i)-- > &(arr).item[0]; )
-
-
-#endif /* CCAN_DARRAY_H */
-
-/*
-
-darray_growalloc(arr, newAlloc) sees if the darray can currently hold newAlloc items;
- if not, it increases the alloc to satisfy this requirement, allocating slack
- space to avoid having to reallocate for every size increment.
-
-darray_from_string(arr, str) copies a string to an darray_char.
-
-darray_push(arr, item) pushes an item to the end of the darray.
-darray_pop(arr) pops it back out. Be sure there is at least one item in the darray before calling.
-darray_pop_check(arr) does the same as darray_pop, but returns NULL if there are no more items left in the darray.
-
-darray_make_room(arr, room) ensures there's 'room' elements of space after the end of the darray, and it returns a pointer to this space.
-Currently requires HAVE_STATEMENT_EXPR, but I plan to remove this dependency by creating an inline function.
-
-The following require HAVE_TYPEOF==1 :
-
-darray_appends(arr, item0, item1...) appends a collection of comma-delimited items to the darray.
-darray_prepends(arr, item0, item1...) prepends a collection of comma-delimited items to the darray.\
-
-
-Examples:
-
- darray(int) arr;
- int *i;
-
- darray_appends(arr, 0,1,2,3,4);
- darray_appends(arr, -5,-4,-3,-2,-1);
- darray_foreach(i, arr)
- printf("%d ", *i);
- printf("\n");
-
- darray_free(arr);
-
-
- typedef struct {int n,d;} Fraction;
- darray(Fraction) fractions;
- Fraction *i;
-
- darray_appends(fractions, {3,4}, {3,5}, {2,1});
- darray_foreach(i, fractions)
- printf("%d/%d\n", i->n, i->d);
-
- darray_free(fractions);
-*/
bch2_opts_usage(OPT_INODE);
puts(" -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
int cmd_setattr(int argc, char *argv[])
" rereplicate Rereplicate degraded data\n"
" job Kick off low level data jobs\n"
"\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
return 0;
}
"\n"
"Options:\n"
" -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
exit(EXIT_SUCCESS);
}
" -s inode:offset start position\n"
" -e inode:offset end position\n"
" -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
exit(EXIT_SUCCESS);
}
" -f, --force Use device even if it appears to already be formatted\n"
" -h, --help Display this help and exit\n"
"\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
int cmd_device_add(int argc, char *argv[])
" -F, --force-metadata Force removal, even if some metadata\n"
" couldn't be migrated\n"
" -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
exit(EXIT_SUCCESS);
}
"Options:\n"
" -h, --help Display this help and exit\n"
"\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
int cmd_device_online(int argc, char *argv[])
" -f, --force Force, if data redundancy will be degraded\n"
" -h, --help Display this help and exit\n"
"\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
int cmd_device_offline(int argc, char *argv[])
"Options:\n"
" -h, --help Display this help and exit\n"
"\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
int cmd_device_evacuate(int argc, char *argv[])
" --force-if-data-lost Force, if data will be lost\n"
" -o, --offline Set state of an offline device\n"
" -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
exit(EXIT_SUCCESS);
}
le64_add_cpu(&sb.sb->seq, 1);
- bch2_super_write(sb.bdev->bd_fd, sb.sb);
+ bch2_super_write(sb.bdev->bd_buffered_fd, sb.sb);
+ ret = fsync(sb.bdev->bd_buffered_fd);
+ if (ret)
+ fprintf(stderr, "error writing superblock: fsync error (%m)\n");
bch2_free_super(&sb);
- return 0;
+ return ret;
}
char *fs_path = arg_pop();
"\n"
"Options:\n"
" -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
exit(EXIT_SUCCESS);
}
"\n"
"Options:\n"
" -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
exit(EXIT_SUCCESS);
}
--- /dev/null
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "qcow2.h"
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/btree_cache.h"
+#include "libbcachefs/btree_iter.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs/extents.h"
+#include "libbcachefs/super.h"
+
+static void dump_usage(void)
+{
+ puts("bcachefs dump - dump filesystem metadata\n"
+ "Usage: bcachefs dump [OPTION]... <devices>\n"
+ "\n"
+ "Options:\n"
+ " -o output Output qcow2 image(s)\n"
+ " -f Force; overwrite when needed\n"
+ " -j Dump entire journal, not just dirty entries\n"
+ " -h Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
+ bool entire_journal)
+{
+ struct bch_sb *sb = ca->disk_sb.sb;
+ ranges data = { 0 };
+ unsigned i;
+ int ret;
+
+ /* Superblock: */
+ range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
+ sizeof(struct bch_sb_layout));
+
+ for (i = 0; i < sb->layout.nr_superblocks; i++)
+ range_add(&data,
+ le64_to_cpu(sb->layout.sb_offset[i]) << 9,
+ vstruct_bytes(sb));
+
+ /* Journal: */
+ for (i = 0; i < ca->journal.nr; i++)
+ if (entire_journal ||
+ ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
+ u64 bucket = ca->journal.buckets[i];
+
+ range_add(&data,
+ bucket_bytes(ca) * bucket,
+ bucket_bytes(ca));
+ }
+
+ /* Btree: */
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ const struct bch_extent_ptr *ptr;
+ struct bkey_ptrs_c ptrs;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct btree *b;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
+ struct btree_node_iter iter;
+ struct bkey u;
+ struct bkey_s_c k;
+
+ for_each_btree_node_key_unpack(b, k, &iter, &u) {
+ ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == ca->dev_idx)
+ range_add(&data,
+ ptr->offset << 9,
+ btree_bytes(c));
+ }
+ }
+
+ if (ret)
+ die("error %s walking btree nodes", strerror(-ret));
+
+ b = c->btree_roots[i].b;
+ if (!btree_node_fake(b)) {
+ ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == ca->dev_idx)
+ range_add(&data,
+ ptr->offset << 9,
+ btree_bytes(c));
+ }
+
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
+ }
+
+ qcow2_write_image(ca->disk_sb.bdev->bd_buffered_fd, fd, &data,
+ max_t(unsigned, btree_bytes(c) / 8, block_bytes(c)));
+ darray_exit(&data);
+}
+
+int cmd_dump(int argc, char *argv[])
+{
+ struct bch_opts opts = bch2_opts_empty();
+ struct bch_dev *ca;
+ char *out = NULL;
+ unsigned i, nr_devices = 0;
+ bool force = false, entire_journal = false;
+ int fd, opt;
+
+ opt_set(opts, nochanges, true);
+ opt_set(opts, norecovery, true);
+ opt_set(opts, degraded, true);
+ opt_set(opts, errors, BCH_ON_ERROR_continue);
+ opt_set(opts, fix_errors, FSCK_OPT_NO);
+
+ while ((opt = getopt(argc, argv, "o:fjvh")) != -1)
+ switch (opt) {
+ case 'o':
+ out = optarg;
+ break;
+ case 'f':
+ force = true;
+ break;
+ case 'j':
+ entire_journal = true;
+ break;
+ case 'v':
+ opt_set(opts, verbose, true);
+ break;
+ case 'h':
+ dump_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ if (!out)
+ die("Please supply output filename");
+
+ if (!argc)
+ die("Please supply device(s) to check");
+
+ struct bch_fs *c = bch2_fs_open(argv, argc, opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
+
+ down_read(&c->gc_lock);
+
+ for_each_online_member(ca, c, i)
+ nr_devices++;
+
+ BUG_ON(!nr_devices);
+
+ for_each_online_member(ca, c, i) {
+ int flags = O_WRONLY|O_CREAT|O_TRUNC;
+
+ if (!force)
+ flags |= O_EXCL;
+
+ if (!c->devs[i])
+ continue;
+
+ char *path = nr_devices > 1
+ ? mprintf("%s.%u.qcow2", out, i)
+ : mprintf("%s.qcow2", out);
+ fd = xopen(path, flags, 0600);
+ free(path);
+
+ dump_one_device(c, ca, fd, entire_journal);
+ close(fd);
+ }
+
+ up_read(&c->gc_lock);
+
+ bch2_fs_stop(c);
+ return 0;
+}
#include <uuid/uuid.h>
-#include "ccan/darray/darray.h"
-
#include "cmds.h"
#include "libbcachefs.h"
#include "crypto.h"
+#include "libbcachefs/darray.h"
#include "libbcachefs/opts.h"
#include "libbcachefs/super-io.h"
#include "libbcachefs/util.h"
x(0, no_initialize, no_argument) \
x('f', force, no_argument) \
x('q', quiet, no_argument) \
+x('v', verbose, no_argument) \
x('h', help, no_argument)
static void usage(void)
"\n"
" -f, --force\n"
" -q, --quiet Only print errors\n"
+ " -v, --verbose Verbose filesystem initialization\n"
" -h, --help Display this help and exit\n"
"\n"
"Device specific options must come before corresponding devices, e.g.\n"
" bcachefs format --label cache /dev/sdb /dev/sdc\n"
"\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
enum {
int cmd_format(int argc, char *argv[])
{
- darray(struct dev_opts) devices;
- darray(char *) device_paths;
+ DARRAY(struct dev_opts) devices = { 0 };
+ DARRAY(char *) device_paths = { 0 };
struct format_opts opts = format_opts_default();
struct dev_opts dev_opts = dev_opts_default(), *dev;
- bool force = false, no_passphrase = false, quiet = false, initialize = true;
+ bool force = false, no_passphrase = false, quiet = false, initialize = true, verbose = false;
unsigned v;
int opt;
- darray_init(devices);
- darray_init(device_paths);
-
struct bch_opt_strs fs_opt_strs =
bch2_cmdline_opts_get(&argc, argv, OPT_FORMAT);
struct bch_opts fs_opts = bch2_parse_opts(fs_opt_strs);
while ((opt = getopt_long(argc, argv,
- "-L:U:g:fqh",
+ "-L:U:g:fqhv",
format_opts,
NULL)) != -1)
switch (opt) {
initialize = false;
break;
case O_no_opt:
- darray_append(device_paths, optarg);
+ darray_push(&device_paths, optarg);
dev_opts.path = optarg;
- darray_append(devices, dev_opts);
+ darray_push(&devices, dev_opts);
dev_opts.size = 0;
break;
case O_quiet:
case 'q':
quiet = true;
break;
+ case 'v':
+ verbose = true;
+ break;
case O_help:
case 'h':
usage();
break;
}
- if (darray_empty(devices))
+ if (!devices.nr)
die("Please supply a device");
if (opts.encrypted && !no_passphrase) {
initialize = false;
}
- darray_foreach(dev, devices)
+ darray_for_each(devices, dev)
dev->fd = open_for_format(dev->path, force);
struct bch_sb *sb =
bch2_format(fs_opt_strs,
fs_opts,
opts,
- devices.item, darray_size(devices));
+ devices.data, devices.nr);
bch2_opt_strs_free(&fs_opt_strs);
- if (!quiet)
- bch2_sb_print(sb, false, 1 << BCH_SB_FIELD_members, HUMAN_READABLE);
+ if (!quiet) {
+ struct printbuf buf = PRINTBUF;
+
+ buf.human_readable_units = true;
+
+ bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members);
+ printf("%s", buf.buf);
+
+ printbuf_exit(&buf);
+ }
free(sb);
if (opts.passphrase) {
free(opts.passphrase);
}
- darray_free(devices);
+ darray_exit(&devices);
if (initialize) {
+ struct bch_opts mount_opts = bch2_opts_empty();
+
+ opt_set(mount_opts, verbose, verbose);
+
/*
* Start the filesystem once, to allocate the journal and create
* the root directory:
*/
- struct bch_fs *c = bch2_fs_open(device_paths.item,
- darray_size(device_paths),
- bch2_opts_empty());
+ struct bch_fs *c = bch2_fs_open(device_paths.data,
+ device_paths.nr,
+ mount_opts);
if (IS_ERR(c))
- die("error opening %s: %s", device_paths.item[0],
+ die("error opening %s: %s", device_paths.data[0],
strerror(-PTR_ERR(c)));
bch2_fs_stop(c);
}
- darray_free(device_paths);
+ darray_exit(&device_paths);
return 0;
}
" -f, --fields=(fields) list of sections to print\n"
" -l, --layout print superblock layout\n"
" -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
exit(EXIT_SUCCESS);
}
if (ret)
die("Error opening %s: %s", dev, strerror(-ret));
- bch2_sb_print(sb.sb, print_layout, fields, HUMAN_READABLE);
+ struct printbuf buf = PRINTBUF;
+
+ buf.human_readable_units = true;
+
+ bch2_sb_to_text(&buf, sb.sb, print_layout, fields);
+ printf("%s", buf.buf);
+
bch2_free_super(&sb);
+ printbuf_exit(&buf);
return 0;
}
#include <uuid/uuid.h>
-#include "ccan/darray/darray.h"
-
#include "linux/sort.h"
#include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/darray.h"
#include "libbcachefs/opts.h"
#include "cmds.h"
#include "libbcachefs.h"
-static void print_dev_usage_type(const char *type,
- unsigned bucket_size,
- u64 buckets, u64 sectors,
- enum units units)
+static void __dev_usage_type_to_text(struct printbuf *out,
+ const char *type,
+ unsigned bucket_size,
+ u64 buckets, u64 sectors, u64 frag)
{
- u64 frag = max((s64) buckets * bucket_size - (s64) sectors, 0LL);
+ prt_printf(out, "%s:", type);
+ prt_tab(out);
+
+ prt_units_u64(out, sectors << 9);
+ prt_tab_rjust(out);
+
+ prt_printf(out, "%llu", buckets);
+ prt_tab_rjust(out);
+
+ if (frag) {
+ prt_units_u64(out, frag << 9);
+ prt_tab_rjust(out);
+ }
+ prt_newline(out);
+}
- printf_pad(20, " %s:", type);
- printf(" %15s %15llu %15s\n",
- pr_units(sectors, units),
- buckets,
- pr_units(frag, units));
+static void dev_usage_type_to_text(struct printbuf *out,
+ struct bch_ioctl_dev_usage *u,
+ enum bch_data_type type)
+{
+ __dev_usage_type_to_text(out, bch2_data_types[type],
+ u->bucket_size,
+ u->d[type].buckets,
+ u->d[type].sectors,
+ u->d[type].fragmented);
}
-static void print_dev_usage(struct bchfs_handle fs,
- struct dev_name *d,
- enum units units)
+static void dev_usage_to_text(struct printbuf *out,
+ struct bchfs_handle fs,
+ struct dev_name *d)
{
struct bch_ioctl_dev_usage u = bchu_dev_usage(fs, d->idx);
unsigned i;
- printf("\n");
- printf_pad(20, "%s (device %u):", d->label ?: "(no label)", d->idx);
- printf("%30s%16s\n", d->dev ?: "(device not found)", bch2_member_states[u.state]);
-
- printf("%-20s%16s%16s%16s\n",
- "", "data", "buckets", "fragmented");
-
- for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++)
- print_dev_usage_type(bch2_data_types[i],
- u.bucket_size,
- u.buckets[i],
- u.sectors[i],
- units);
-
- print_dev_usage_type("erasure coded",
- u.bucket_size,
- u.ec_buckets,
- u.ec_sectors,
- units);
-
- printf_pad(20, " available:");
- printf(" %15s %15llu\n",
- pr_units(u.available_buckets * u.bucket_size, units),
- u.available_buckets);
-
- printf_pad(20, " capacity:");
- printf(" %15s %15llu\n",
- pr_units(u.nr_buckets * u.bucket_size, units),
- u.nr_buckets);
+ prt_newline(out);
+ prt_printf(out, "%s (device %u):", d->label ?: "(no label)", d->idx);
+ prt_tab(out);
+ prt_str(out, d->dev ?: "(device not found)");
+ prt_tab_rjust(out);
+
+ prt_str(out, bch2_member_states[u.state]);
+ prt_tab_rjust(out);
+
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ prt_tab(out);
+
+ prt_str(out, "data");
+ prt_tab_rjust(out);
+
+ prt_str(out, "buckets");
+ prt_tab_rjust(out);
+
+ prt_str(out, "fragmented");
+ prt_tab_rjust(out);
+
+ prt_newline(out);
+
+ for (i = 0; i < BCH_DATA_NR; i++)
+ dev_usage_type_to_text(out, &u, i);
+ __dev_usage_type_to_text(out, "erasure coded",
+ u.bucket_size,
+ u.buckets_ec, u.buckets_ec * u.bucket_size, 0);
+
+ prt_str(out, "capacity:");
+ prt_tab(out);
+
+ prt_units_u64(out, (u.nr_buckets * u.bucket_size) << 9);
+ prt_tab_rjust(out);
+ prt_printf(out, "%llu", u.nr_buckets);
+ prt_tab_rjust(out);
+
+ printbuf_indent_sub(out, 2);
+
+ prt_newline(out);
}
static int dev_by_label_cmp(const void *_l, const void *_r)
{
struct dev_name *dev;
- darray_foreach(dev, *dev_names)
+ darray_for_each(*dev_names, dev)
if (dev->idx == idx)
return dev;
return NULL;
}
-static void print_replicas_usage(const struct bch_replicas_usage *r,
- dev_names *dev_names, enum units units)
+static void replicas_usage_to_text(struct printbuf *out,
+ const struct bch_replicas_usage *r,
+ dev_names *dev_names)
{
unsigned i;
*d++ = ']';
*d++ = '\0';
- printf_pad(16, "%s: ", bch2_data_types[r->r.data_type]);
- printf_pad(16, "%u/%u ", r->r.nr_required, r->r.nr_devs);
- printf_pad(32, "%s ", devs);
- printf(" %s\n", pr_units(r->sectors, units));
+ prt_printf(out, "%s: ", bch2_data_types[r->r.data_type]);
+ prt_tab(out);
+
+ prt_printf(out, "%u/%u ", r->r.nr_required, r->r.nr_devs);
+ prt_tab(out);
+
+ prt_printf(out, "%s ", devs);
+ prt_tab(out);
+
+ prt_units_u64(out, r->sectors << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
}
#define for_each_usage_replica(_u, _r) \
_r = replicas_usage_next(_r), \
BUG_ON((void *) _r > (void *) (_u)->replicas + (_u)->replica_entries_bytes))
-static void print_fs_usage(const char *path, enum units units)
+static void fs_usage_to_text(struct printbuf *out, const char *path)
{
unsigned i;
- char uuid[40];
struct bchfs_handle fs = bcache_fs_open(path);
struct bch_ioctl_fs_usage *u = bchu_fs_usage(fs);
- uuid_unparse(fs.uuid.b, uuid);
- printf("Filesystem %s:\n", uuid);
+ prt_str(out, "Filesystem: ");
+ pr_uuid(out, fs.uuid.b);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 16);
- printf("%-20s%12s\n", "Size:", pr_units(u->capacity, units));
- printf("%-20s%12s\n", "Used:", pr_units(u->used, units));
+ prt_str(out, "Size:");
+ prt_tab(out);
+ prt_units_u64(out, u->capacity << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
- printf("%-20s%12s\n", "Online reserved:", pr_units(u->online_reserved, units));
+ prt_str(out, "Used:");
+ prt_tab(out);
+ prt_units_u64(out, u->used << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
- printf("\n");
- printf("%-16s%-16s%s\n", "Data type", "Required/total", "Devices");
+ prt_str(out, "Online reserved:");
+ prt_tab(out);
+ prt_units_u64(out, u->online_reserved << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 18);
+ printbuf_tabstop_push(out, 18);
+
+ prt_str(out, "Data type");
+ prt_tab(out);
+
+ prt_str(out, "Required/total");
+ prt_tab(out);
+
+ prt_str(out, "Devices");
+ prt_newline(out);
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
if (!u->persistent_reserved[i])
continue;
- printf_pad(16, "%s: ", "reserved");
- printf_pad(16, "%u/%u ", 1, i);
- printf_pad(32, "[] ");
- printf("%s\n", pr_units(u->persistent_reserved[i], units));
+ prt_str(out, "reserved:");
+ prt_tab(out);
+ prt_printf(out, "%u/%u ", 1, i);
+ prt_tab(out);
+ prt_str(out, "[] ");
+ prt_units_u64(out, u->persistent_reserved[i] << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
}
struct bch_replicas_usage *r;
for_each_usage_replica(u, r)
if (r->r.data_type < BCH_DATA_user)
- print_replicas_usage(r, &dev_names, units);
+ replicas_usage_to_text(out, r, &dev_names);
for_each_usage_replica(u, r)
if (r->r.data_type == BCH_DATA_user &&
r->r.nr_required <= 1)
- print_replicas_usage(r, &dev_names, units);
+ replicas_usage_to_text(out, r, &dev_names);
for_each_usage_replica(u, r)
if (r->r.data_type == BCH_DATA_user &&
r->r.nr_required > 1)
- print_replicas_usage(r, &dev_names, units);
+ replicas_usage_to_text(out, r, &dev_names);
for_each_usage_replica(u, r)
if (r->r.data_type > BCH_DATA_user)
- print_replicas_usage(r, &dev_names, units);
+ replicas_usage_to_text(out, r, &dev_names);
free(u);
- sort(&darray_item(dev_names, 0), darray_size(dev_names),
- sizeof(darray_item(dev_names, 0)), dev_by_label_cmp, NULL);
+ sort(dev_names.data, dev_names.nr,
+ sizeof(dev_names.data[0]), dev_by_label_cmp, NULL);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 14);
- darray_foreach(dev, dev_names)
- print_dev_usage(fs, dev, units);
+ darray_for_each(dev_names, dev)
+ dev_usage_to_text(out, fs, dev);
- darray_foreach(dev, dev_names) {
+ darray_for_each(dev_names, dev) {
free(dev->dev);
free(dev->label);
}
- darray_free(dev_names);
+ darray_exit(&dev_names);
bcache_fs_close(fs);
}
int cmd_fs_usage(int argc, char *argv[])
{
- enum units units = BYTES;
+ bool human_readable = false;
+ struct printbuf buf = PRINTBUF;
char *fs;
int opt;
while ((opt = getopt(argc, argv, "h")) != -1)
switch (opt) {
case 'h':
- units = HUMAN_READABLE;
+ human_readable = true;
break;
}
args_shift(optind);
if (!argc) {
- print_fs_usage(".", units);
+ printbuf_reset(&buf);
+ buf.human_readable_units = human_readable;
+ fs_usage_to_text(&buf, ".");
+ printf("%s", buf.buf);
} else {
- while ((fs = arg_pop()))
- print_fs_usage(fs, units);
+ while ((fs = arg_pop())) {
+ printbuf_reset(&buf);
+ buf.human_readable_units = human_readable;
+ fs_usage_to_text(&buf, fs);
+ printf("%s", buf.buf);
+ }
}
+ printbuf_exit(&buf);
return 0;
}
"\n"
"Options:\n"
" -c Check if a device is encrypted\n"
+ " -k (session|user|user_session)\n"
+ " Keyring to add to (default: user)\n"
" -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
int cmd_unlock(int argc, char *argv[])
{
+ const char *keyring = "user";
bool check = false;
int opt;
- while ((opt = getopt(argc, argv, "ch")) != -1)
+ while ((opt = getopt(argc, argv, "ck:h")) != -1)
switch (opt) {
case 'c':
check = true;
break;
+ case 'k':
+ keyring = strdup(optarg);
+ break;
case 'h':
unlock_usage();
exit(EXIT_SUCCESS);
char *passphrase = read_passphrase("Enter passphrase: ");
- bch2_add_key(sb.sb, passphrase);
+ bch2_add_key(sb.sb, "user", keyring, passphrase);
bch2_free_super(&sb);
memzero_explicit(passphrase, strlen(passphrase));
#include "tools-util.h"
#include "libbcachefs/bcachefs.h"
-#include "libbcachefs/bset.h"
#include "libbcachefs/btree_cache.h"
#include "libbcachefs/btree_io.h"
#include "libbcachefs/btree_iter.h"
-#include "libbcachefs/buckets.h"
#include "libbcachefs/checksum.h"
#include "libbcachefs/error.h"
-#include "libbcachefs/journal.h"
-#include "libbcachefs/journal_io.h"
+#include "libbcachefs/extents.h"
#include "libbcachefs/super.h"
-static void dump_usage(void)
-{
- puts("bcachefs dump - dump filesystem metadata\n"
- "Usage: bcachefs dump [OPTION]... <devices>\n"
- "\n"
- "Options:\n"
- " -o output Output qcow2 image(s)\n"
- " -f Force; overwrite when needed\n"
- " -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
-{
- struct bch_sb *sb = ca->disk_sb.sb;
- ranges data;
- unsigned i;
- int ret;
-
- darray_init(data);
-
- /* Superblock: */
- range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
- sizeof(struct bch_sb_layout));
-
- for (i = 0; i < sb->layout.nr_superblocks; i++)
- range_add(&data,
- le64_to_cpu(sb->layout.sb_offset[i]) << 9,
- vstruct_bytes(sb));
-
- /* Journal: */
- for (i = 0; i < ca->journal.nr; i++)
- if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
- u64 bucket = ca->journal.buckets[i];
-
- range_add(&data,
- bucket_bytes(ca) * bucket,
- bucket_bytes(ca));
- }
-
- /* Btree: */
- for (i = 0; i < BTREE_ID_NR; i++) {
- const struct bch_extent_ptr *ptr;
- struct bkey_ptrs_c ptrs;
- struct btree_trans trans;
- struct btree_iter iter;
- struct btree *b;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
- struct btree_node_iter iter;
- struct bkey u;
- struct bkey_s_c k;
-
- for_each_btree_node_key_unpack(b, k, &iter, &u) {
- ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == ca->dev_idx)
- range_add(&data,
- ptr->offset << 9,
- btree_bytes(c));
- }
- }
-
- if (ret)
- die("error %s walking btree nodes", strerror(-ret));
-
- b = c->btree_roots[i].b;
- if (!btree_node_fake(b)) {
- ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == ca->dev_idx)
- range_add(&data,
- ptr->offset << 9,
- btree_bytes(c));
- }
-
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
- }
-
- qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data,
- max_t(unsigned, btree_bytes(c) / 8, block_bytes(c)));
- darray_free(data);
-}
-
-int cmd_dump(int argc, char *argv[])
-{
- struct bch_opts opts = bch2_opts_empty();
- struct bch_dev *ca;
- char *out = NULL;
- unsigned i, nr_devices = 0;
- bool force = false;
- int fd, opt;
-
- opt_set(opts, nochanges, true);
- opt_set(opts, norecovery, true);
- opt_set(opts, degraded, true);
- opt_set(opts, errors, BCH_ON_ERROR_continue);
- opt_set(opts, fix_errors, FSCK_OPT_NO);
-
- while ((opt = getopt(argc, argv, "o:fvh")) != -1)
- switch (opt) {
- case 'o':
- out = optarg;
- break;
- case 'f':
- force = true;
- break;
- case 'v':
- opt_set(opts, verbose, true);
- break;
- case 'h':
- dump_usage();
- exit(EXIT_SUCCESS);
- }
- args_shift(optind);
-
- if (!out)
- die("Please supply output filename");
-
- if (!argc)
- die("Please supply device(s) to check");
-
- struct bch_fs *c = bch2_fs_open(argv, argc, opts);
- if (IS_ERR(c))
- die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
-
- down_read(&c->gc_lock);
-
- for_each_online_member(ca, c, i)
- nr_devices++;
-
- BUG_ON(!nr_devices);
-
- for_each_online_member(ca, c, i) {
- int flags = O_WRONLY|O_CREAT|O_TRUNC;
-
- if (!force)
- flags |= O_EXCL;
-
- if (!c->devs[i])
- continue;
-
- char *path = nr_devices > 1
- ? mprintf("%s.%u", out, i)
- : strdup(out);
- fd = xopen(path, flags, 0600);
- free(path);
-
- dump_one_device(c, ca, fd);
- close(fd);
- }
-
- up_read(&c->gc_lock);
-
- bch2_fs_stop(c);
- return 0;
-}
-
static void list_keys(struct bch_fs *c, enum btree_id btree_id,
struct bpos start, struct bpos end)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- char buf[512];
+ struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_init(&trans, c, 0, 0);
if (bkey_cmp(k.k->p, end) > 0)
break;
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- puts(buf);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ puts(buf.buf);
}
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
+
+ printbuf_exit(&buf);
}
static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigned level,
struct btree_trans trans;
struct btree_iter iter;
struct btree *b;
- char buf[4096];
+ struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_init(&trans, c, 0, 0);
if (bkey_cmp(b->key.k.p, end) > 0)
break;
- bch2_btree_node_to_text(&PBUF(buf), c, b);
- puts(buf);
+ printbuf_reset(&buf);
+ bch2_btree_node_to_text(&buf, c, b);
+ puts(buf.buf);
}
bch2_trans_iter_exit(&trans, &iter);
die("error %s walking btree nodes", strerror(-ret));
bch2_trans_exit(&trans);
+ printbuf_exit(&buf);
}
static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level,
struct btree_trans trans;
struct btree_iter iter;
struct btree *b;
- char buf[4096];
+ struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_init(&trans, c, 0, 0);
if (bkey_cmp(b->key.k.p, end) > 0)
break;
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
- fputs(buf, stdout);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ fputs(buf.buf, stdout);
putchar('\n');
}
bch2_trans_iter_exit(&trans, &iter);
die("error %s walking btree nodes", strerror(-ret));
bch2_trans_exit(&trans);
+ printbuf_exit(&buf);
}
static void print_node_ondisk(struct bch_fs *c, struct btree *b)
struct bch_dev *ca;
struct bio *bio;
unsigned offset = 0;
+ int ret;
if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
printf("error getting device to read from\n");
return;
}
- n_ondisk = malloc(btree_bytes(c));
+ n_ondisk = aligned_alloc(block_bytes(c), btree_bytes(c));
- bio = bio_alloc_bioset(GFP_NOIO,
- buf_pages(n_ondisk, btree_bytes(c)),
- &c->btree_bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_READ|REQ_META;
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_ondisk, btree_bytes(c)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOIO,
+ &c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
bch2_bio_map(bio, n_ondisk, btree_bytes(c));
- submit_bio_wait(bio);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ die("error reading btree node: %i", ret);
bio_put(bio);
percpu_ref_put(&ca->io_ref);
i = &n_ondisk->keys;
if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
- die("unknown checksum type");
+ die("unknown checksum type at offset %u: %llu",
+ offset, BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, offset << 9);
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
break;
if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
- die("unknown checksum type");
+ die("unknown checksum type at offset %u: %llu",
+ offset, BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, offset << 9);
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) {
struct bkey u;
- char buf[4096];
+ struct printbuf buf = PRINTBUF;
+
+ printbuf_indent_add(&buf, 4);
+
+ bch2_bkey_val_to_text(&buf, c, bkey_disassemble(b, k, &u));
+ fprintf(stdout, "%s\n", buf.buf);
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_disassemble(b, k, &u));
- fprintf(stdout, " %s\n", buf);
+ printbuf_exit(&buf);
}
}
struct btree_trans trans;
struct btree_iter iter;
struct btree *b;
- char buf[4096];
+ struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_init(&trans, c, 0, 0);
if (bkey_cmp(b->key.k.p, end) > 0)
break;
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
- fputs(buf, stdout);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ fputs(buf.buf, stdout);
putchar('\n');
print_node_ondisk(c, b);
die("error %s walking btree nodes", strerror(-ret));
bch2_trans_exit(&trans);
+ printbuf_exit(&buf);
}
static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned level,
struct bkey unpacked;
struct bkey_s_c k;
struct btree *b;
- char buf[4096];
+ struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_init(&trans, c, 0, 0);
if (bkey_cmp(b->key.k.p, end) > 0)
break;
- bch2_btree_node_to_text(&PBUF(buf), c, b);
- fputs(buf, stdout);
+ printbuf_reset(&buf);
+ bch2_btree_node_to_text(&buf, c, b);
+ fputs(buf.buf, stdout);
for_each_btree_node_key_unpack(b, k, &node_iter, &unpacked) {
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
putchar('\t');
- puts(buf);
+ puts(buf.buf);
}
}
bch2_trans_iter_exit(&trans, &iter);
die("error %s walking btree nodes", strerror(-ret));
bch2_trans_exit(&trans);
+ printbuf_exit(&buf);
}
static void list_keys_usage(void)
" -f Check (fsck) the filesystem first\n"
" -v Verbose mode\n"
" -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
#define LIST_MODES() \
bch2_fs_stop(c);
return 0;
}
-
-static void list_journal_usage(void)
-{
- puts("bcachefs list_journal - print contents of journal\n"
- "Usage: bcachefs list_journal [OPTION]... <devices>\n"
- "\n"
- "Options:\n"
- " -a Read entire journal, not just dirty entries\n"
- " -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-int cmd_list_journal(int argc, char *argv[])
-{
- struct bch_opts opts = bch2_opts_empty();
- int opt;
-
- opt_set(opts, nochanges, true);
- opt_set(opts, norecovery, true);
- opt_set(opts, degraded, true);
- opt_set(opts, errors, BCH_ON_ERROR_continue);
- opt_set(opts, fix_errors, FSCK_OPT_YES);
- opt_set(opts, keep_journal, true);
-
- while ((opt = getopt(argc, argv, "ah")) != -1)
- switch (opt) {
- case 'a':
- opt_set(opts, read_entire_journal, true);
- break;
- case 'h':
- list_journal_usage();
- exit(EXIT_SUCCESS);
- }
- args_shift(optind);
-
- if (!argc)
- die("Please supply device(s) to open");
-
- struct bch_fs *c = bch2_fs_open(argv, argc, opts);
- if (IS_ERR(c))
- die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
-
- struct journal_replay *p;
- struct jset_entry *entry;
-
- list_for_each_entry(p, &c->journal_entries, list) {
- printf("journal entry %8llu\n"
- " version %8u\n"
- " last seq %8llu\n"
- ,
- le64_to_cpu(p->j.seq),
- le32_to_cpu(p->j.version),
- le64_to_cpu(p->j.last_seq));
-
- vstruct_for_each(&p->j, entry) {
- char _buf[4096];
- struct printbuf buf = PBUF(_buf);
-
- printbuf_indent_push(&buf, 2);
- bch2_journal_entry_to_text(&buf, c, entry);
- printf("%s\n", _buf);
- }
- }
-
- bch2_fs_stop(c);
- return 0;
-}
--- /dev/null
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "qcow2.h"
+#include "tools-util.h"
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/btree_iter.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs/journal_io.h"
+#include "libbcachefs/journal_seq_blacklist.h"
+#include "libbcachefs/super.h"
+
+static void list_journal_usage(void)
+{
+ puts("bcachefs list_journal - print contents of journal\n"
+ "Usage: bcachefs list_journal [OPTION]... <devices>\n"
+ "\n"
+ "Options:\n"
+ " -a Read entire journal, not just dirty entries\n"
+ " -n Number of journal entries to print, starting from the most recent\n"
+ " -v Verbose mode\n"
+ " -h Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+static void star_start_of_lines(char *buf)
+{
+ char *p = buf;
+
+ if (*p == ' ')
+ *p = '*';
+
+ while ((p = strstr(p, "\n ")))
+ p[1] = '*';
+}
+
+int cmd_list_journal(int argc, char *argv[])
+{
+ struct bch_opts opts = bch2_opts_empty();
+ u32 nr_entries = U32_MAX;
+ int opt;
+
+ opt_set(opts, nochanges, true);
+ opt_set(opts, norecovery, true);
+ opt_set(opts, degraded, true);
+ opt_set(opts, errors, BCH_ON_ERROR_continue);
+ opt_set(opts, fix_errors, FSCK_OPT_YES);
+ opt_set(opts, keep_journal, true);
+ opt_set(opts, read_journal_only, true);
+
+ while ((opt = getopt(argc, argv, "an:vh")) != -1)
+ switch (opt) {
+ case 'a':
+ opt_set(opts, read_entire_journal, true);
+ break;
+ case 'n':
+ if (kstrtouint(optarg, 10, &nr_entries))
+ die("error parsing number of entries");
+ opt_set(opts, read_entire_journal, true);
+ break;
+ case 'v':
+ opt_set(opts, verbose, true);
+ break;
+ case 'h':
+ list_journal_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ if (!argc)
+ die("Please supply device(s) to open");
+
+ struct bch_fs *c = bch2_fs_open(argv, argc, opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
+
+ struct journal_replay *p, **_p;
+ struct genradix_iter iter;
+ struct jset_entry *entry;
+ struct printbuf buf = PRINTBUF;
+
+ genradix_for_each(&c->journal_entries, iter, _p) {
+ p = *_p;
+ if (!p)
+ continue;
+
+ if (le64_to_cpu(p->j.seq) + nr_entries < atomic64_read(&c->journal.seq))
+ continue;
+
+ bool blacklisted =
+ bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(p->j.seq), false);
+
+ if (blacklisted)
+ printf("blacklisted ");
+
+ printf("journal entry %llu\n", le64_to_cpu(p->j.seq));
+
+ printbuf_reset(&buf);
+
+ prt_printf(&buf,
+ " version %u\n"
+ " last seq %llu\n"
+ " flush %u\n"
+ " written at ",
+ le32_to_cpu(p->j.version),
+ le64_to_cpu(p->j.last_seq),
+ !JSET_NO_FLUSH(&p->j));
+ bch2_journal_ptrs_to_text(&buf, c, p);
+
+ if (blacklisted)
+ star_start_of_lines(buf.buf);
+ printf("%s\n", buf.buf);
+
+ vstruct_for_each(&p->j, entry) {
+ printbuf_reset(&buf);
+
+ /*
+ * log entries denote the start of a new transaction
+ * commit:
+ */
+ if (entry->type == BCH_JSET_ENTRY_log && !entry->level)
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 4);
+ bch2_journal_entry_to_text(&buf, c, entry);
+
+ if (blacklisted)
+ star_start_of_lines(buf.buf);
+ printf("%s\n", buf.buf);
+ }
+ }
+
+ printbuf_exit(&buf);
+ bch2_fs_stop(c);
+ return 0;
+}
+
+static void kill_btree_node_usage(void)
+{
+ puts("bcachefs kill_btree_node - make btree nodes unreadable\n"
+ "Usage: bcachefs kill_btree_node [OPTION]... <devices>\n"
+ "\n"
+ "Options:\n"
+ " -b (extents|inodes|dirents|xattrs) Btree to delete from\n"
+ " -l level Levle to delete from (0 == leaves)\n"
+ " -i index Index of btree node to kill\n"
+ " -h Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_kill_btree_node(int argc, char *argv[])
+{
+ struct bch_opts opts = bch2_opts_empty();
+ enum btree_id btree_id = 0;
+ unsigned level = 0;
+ u64 node_index = 0;
+ int opt;
+
+ opt_set(opts, read_only, true);
+
+ while ((opt = getopt(argc, argv, "b:l:i:h")) != -1)
+ switch (opt) {
+ case 'b':
+ btree_id = read_string_list_or_die(optarg,
+ bch2_btree_ids, "btree id");
+ break;
+ case 'l':
+ if (kstrtouint(optarg, 10, &level) || level >= BTREE_MAX_DEPTH)
+ die("invalid level");
+ break;
+ case 'i':
+ if (kstrtoull(optarg, 10, &node_index))
+ die("invalid index %s", optarg);
+ break;
+ case 'h':
+ kill_btree_node_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ if (!argc)
+ die("Please supply device(s)");
+
+ struct bch_fs *c = bch2_fs_open(argv, argc, opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
+
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct btree *b;
+ int ret;
+ void *zeroes;
+
+ ret = posix_memalign(&zeroes, c->opts.block_size, c->opts.block_size);
+ if (ret)
+ die("error %s from posix_memalign", strerror(ret));
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
+ if (b->c.level != level)
+ continue;
+
+ if (!node_index) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+ const struct bch_extent_ptr *ptr;
+
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch_info(c, "killing btree node %s", buf.buf);
+ printbuf_exit(&buf);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ ret = pwrite(ca->disk_sb.bdev->bd_fd, zeroes,
+ c->opts.block_size, ptr->offset << 9);
+ if (ret != c->opts.block_size) {
+ bch_err(c, "pwrite error: expected %u got %i %s",
+ c->opts.block_size, ret, strerror(errno));
+ ret = EXIT_FAILURE;
+ goto done;
+ }
+ }
+ goto done;
+ }
+
+ node_index--;
+ }
+ if (ret)
+ bch_err(c, "error %i walking btree nodes", ret);
+ else
+ bch_err(c, "node at specified index not found");
+ ret = EXIT_FAILURE;
+done:
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
+
+ bch2_fs_stop(c);
+ return ret;
+}
struct bkey_inode_buf packed;
int ret;
- bch2_inode_pack(c, &packed, inode);
+ bch2_inode_pack(&packed, inode);
packed.inode.k.p.snapshot = U32_MAX;
ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
NULL, NULL, 0);
closure_init_stack(&cl);
- bio_init(&op.wbio.bio, bv, ARRAY_SIZE(bv));
+ bio_init(&op.wbio.bio, NULL, bv, ARRAY_SIZE(bv), 0);
bch2_bio_map(&op.wbio.bio, buf, len);
bch2_write_op_init(&op, c, bch2_opts_to_inode_opts(c->opts));
struct fiemap_iter iter;
struct fiemap_extent e;
- ranges extents = { NULL };
+ ranges extents = { 0 };
fiemap_for_each(fd, iter, e) {
if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
update_inode(c, &root_inode);
- darray_free(s.extents);
+ darray_exit(&s.extents);
genradix_free(&s.hardlinks);
}
{
struct range *i;
- darray_foreach(i, extents) {
+ darray_for_each(extents, i) {
u64 start = round_up(max(256ULL << 10, i->start),
dev->bucket_size << 9);
u64 end = round_down(i->end,
" --no_passphrase Don't encrypt master encryption key\n"
" -F Force, even if metadata file already exists\n"
" -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
static const struct option migrate_opts[] = {
u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]);
if (format_opts.passphrase)
- bch2_add_key(sb, format_opts.passphrase);
+ bch2_add_key(sb, "user", "user", format_opts.passphrase);
free(sb);
" -d device Device to create superblock for\n"
" -o offset Offset of existing superblock\n"
" -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
int cmd_migrate_superblock(int argc, char *argv[])
--- /dev/null
+/*
+ * Authors: Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * GPLv2
+ */
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <uuid/uuid.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "libbcachefs/opts.h"
+#include "libbcachefs/super-io.h"
+
+static void set_option_usage(void)
+{
+ puts("bcachefs set-option \n"
+ "Usage: bcachefs set-option [OPTION].. device\n"
+ "\n"
+ "Options:\n");
+ bch2_opts_usage(OPT_MOUNT);
+ puts(" -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_set_option(int argc, char *argv[])
+{
+ struct bch_opt_strs new_opt_strs = bch2_cmdline_opts_get(&argc, argv, OPT_MOUNT);
+ struct bch_opts new_opts = bch2_parse_opts(new_opt_strs);
+ struct bch_opts open_opts = bch2_opts_empty();
+ unsigned i;
+ int opt, ret = 0;
+
+ opt_set(open_opts, nostart, true);
+
+ while ((opt = getopt(argc, argv, "h")) != -1)
+ switch (opt) {
+ case 'h':
+ set_option_usage();
+ break;
+ }
+ args_shift(optind);
+
+ if (!argc) {
+ fprintf(stderr, "Please supply device(s)\n");
+ exit(EXIT_FAILURE);
+ }
+
+ for (i = 0; i < argc; i++)
+ if (dev_mounted(argv[i]))
+ goto online;
+
+ struct bch_fs *c = bch2_fs_open(argv, argc, open_opts);
+ if (IS_ERR(c)) {
+ fprintf(stderr, "error opening %s: %s\n", argv[0], strerror(-PTR_ERR(c)));
+ exit(EXIT_FAILURE);
+ }
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ u64 v = bch2_opt_get_by_id(&new_opts, i);
+
+ if (!bch2_opt_defined_by_id(&new_opts, i))
+ continue;
+
+ ret = bch2_opt_check_may_set(c, i, v);
+ if (ret < 0) {
+ fprintf(stderr, "error setting %s: %i\n",
+ bch2_opt_table[i].attr.name, ret);
+ break;
+ }
+
+ bch2_opt_set_sb(c, bch2_opt_table + i, v);
+ bch2_opt_set_by_id(&c->opts, i, v);
+ }
+
+ bch2_fs_stop(c);
+ return ret;
+online:
+ {
+ unsigned dev_idx;
+ struct bchfs_handle fs = bchu_fs_open_by_dev(argv[i], &dev_idx);
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ if (!new_opt_strs.by_id[i])
+ continue;
+
+ char *path = mprintf("options/%s", bch2_opt_table[i].attr.name);
+
+ write_file_str(fs.sysfs_fd, path, new_opt_strs.by_id[i]);
+ free(path);
+ }
+ }
+ return 0;
+}
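+
+/*
+ * Example invocation (the option name is illustrative only; the accepted
+ * options are whatever bch2_opts_usage(OPT_MOUNT) prints above):
+ *
+ *	bcachefs set-option --compression=zstd /dev/sda1
+ */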
int cmd_format(int argc, char *argv[]);
int cmd_show_super(int argc, char *argv[]);
+int cmd_set_option(int argc, char *argv[]);
#if 0
int cmd_assemble(int argc, char *argv[]);
int cmd_dump(int argc, char *argv[]);
int cmd_list(int argc, char *argv[]);
int cmd_list_journal(int argc, char *argv[]);
+int cmd_kill_btree_node(int argc, char *argv[]);
int cmd_migrate(int argc, char *argv[]);
int cmd_migrate_superblock(int argc, char *argv[]);
die("incorrect passphrase");
}
-void bch2_add_key(struct bch_sb *sb, const char *passphrase)
+void bch2_add_key(struct bch_sb *sb,
+ const char *type,
+ const char *keyring_str,
+ const char *passphrase)
{
struct bch_key passphrase_key;
struct bch_encrypted_key sb_key;
+ int keyring;
+
+ if (!strcmp(keyring_str, "session"))
+ keyring = KEY_SPEC_SESSION_KEYRING;
+ else if (!strcmp(keyring_str, "user"))
+ keyring = KEY_SPEC_USER_KEYRING;
+ else if (!strcmp(keyring_str, "user_session"))
+ keyring = KEY_SPEC_USER_SESSION_KEYRING;
+ else
+ die("unknown keyring %s", keyring_str);
bch2_passphrase_check(sb, passphrase,
&passphrase_key,
char *description = mprintf("bcachefs:%s", uuid);
- if (add_key("logon", description,
- &passphrase_key, sizeof(passphrase_key),
- KEY_SPEC_USER_KEYRING) < 0 ||
- add_key("user", description,
+ if (add_key(type,
+ description,
&passphrase_key, sizeof(passphrase_key),
- KEY_SPEC_USER_KEYRING) < 0)
+ keyring) < 0)
die("add_key error: %m");
memzero_explicit(description, strlen(description));
bool bch2_sb_is_encrypted(struct bch_sb *);
void bch2_passphrase_check(struct bch_sb *, const char *,
struct bch_key *, struct bch_encrypted_key *);
-void bch2_add_key(struct bch_sb *, const char *);
+void bch2_add_key(struct bch_sb *, const char *, const char *, const char *);
void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *,
const char *);
--- /dev/null
+#!/bin/sh
+
+set -e
+
+case "$1" in
+ configure)
+ if which update-initramfs >/dev/null; then
+ update-initramfs -u
+ fi
+ ;;
+esac
+
--- /dev/null
+#!/bin/sh
+
+set -e
+
+case "$1" in
+ remove)
+ if which update-initramfs >/dev/null; then
+ update-initramfs -u
+ fi
+ ;;
+esac
+
+bcachefs-tools (23-1) unstable; urgency=medium
+
+ * New upstream release
+ * Update standards version to 4.6.1
+
+ -- Jonathan Carter <jcc@debian.org> Mon, 31 Oct 2022 11:45:25 +0200
+
bcachefs-tools (0.1+git20220216.a1e928a-1) unstable; urgency=medium
* New upstream snapshot
Maintainer: Jonathan Carter <jcc@debian.org>
Section: utils
Priority: optional
-Standards-Version: 4.6.0
+Standards-Version: 4.6.1
Rules-Requires-Root: no
Build-Depends: debhelper-compat (= 13),
pkg-config,
-bcachefs-tools_0.1+git20220216.a1e928a-1_source.buildinfo utils optional
+bcachefs-tools_23-1_source.buildinfo utils optional
{ lib
-, filter
-
+, doCheck ? true
, stdenv
, pkg-config
, attr
, docutils
, nixosTests
-, lastModified
-, versionString ? lastModified
+, versionString ? "0.1"
, inShell ? false
, debugMode ? inShell
version = "v0.1-flake-${versionString}";
VERSION = "v0.1-flake-${versionString}";
-
- src = filter.filter {
- name = "bcachefs-tools";
- root = ./.;
- exclude = [
- ./rust-src
-
- ./.git
- ./nix
-
- ./flake.nix
- ./flake.lock
- ];
- };
+
+ src = (lib.cleanSource (builtins.path { name = "bcachefs-tools-src"; path = ./. ;} ));
postPatch = "patchShebangs --build doc/macro2rst.py";
"INITRAMFS_DIR=${placeholder "out"}/etc/initramfs-tools"
];
- doCheck = true; # needs bcachefs module loaded on builder
+ doCheck = doCheck; # needs bcachefs module loaded on builder
checkInputs = [
python39Packages.pytest
rm tests/test_fuse.py
'';
- dontStrip = debugMode == true;
+ dontStrip = debugMode;
passthru = {
bcachefs_revision = let
file = builtins.readFile ./.bcachefs_revision;
struct bio_set {
unsigned int front_pad;
+ unsigned int back_pad;
+ mempool_t bio_pool;
+ mempool_t bvec_pool;
};
-static inline void bioset_exit(struct bio_set *bs) {}
static inline void bioset_free(struct bio_set *bs)
{
kfree(bs);
}
-static inline int bioset_init(struct bio_set *bs,
- unsigned pool_size,
- unsigned front_pad,
- int flags)
-{
- bs->front_pad = front_pad;
- return 0;
-}
+void bioset_exit(struct bio_set *);
+int bioset_init(struct bio_set *, unsigned, unsigned, int);
extern struct bio_set *bioset_create(unsigned int, unsigned int);
extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
BIOSET_NEED_RESCUER = 1 << 1,
};
-extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
+struct bio *bio_alloc_bioset(struct block_device *, unsigned,
+ unsigned, gfp_t, struct bio_set *);
extern void bio_put(struct bio *);
int bio_add_page(struct bio *, struct page *, unsigned, unsigned);
-extern void __bio_clone_fast(struct bio *, struct bio *);
-extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
-extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
-
-static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
- return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
-}
-
-static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
-{
- return bio_clone_bioset(bio, gfp_mask, NULL);
+struct bio *bio_alloc_clone(struct block_device *, struct bio *,
+ gfp_t, struct bio_set *);
-}
+struct bio *bio_kmalloc(unsigned int, gfp_t);
extern void bio_endio(struct bio *);
extern void bio_advance(struct bio *, unsigned);
-extern void bio_reset(struct bio *);
+extern void bio_reset(struct bio *, struct block_device *, unsigned);
void bio_chain(struct bio *, struct bio *);
extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
atomic_inc(&bio->__bi_remaining);
}
-static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
- return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
-}
-
-static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
-{
- return bio_clone_bioset(bio, gfp_mask, NULL);
-}
-
-static inline void bio_init(struct bio *bio, struct bio_vec *table,
- unsigned short max_vecs)
+static inline void bio_init(struct bio *bio,
+ struct block_device *bdev,
+ struct bio_vec *table,
+ unsigned short max_vecs,
+ unsigned int opf)
{
memset(bio, 0, sizeof(*bio));
+ bio->bi_bdev = bdev;
+ bio->bi_opf = opf;
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
__builtin_popcount(w >> 32);
}
+static inline unsigned long hweight32(u32 w)
+{
+ return __builtin_popcount(w);
+}
+
static inline unsigned long hweight8(unsigned long w)
{
return __builtin_popcountl(w);
struct gendisk __bd_disk;
int bd_fd;
int bd_sync_fd;
+ int bd_buffered_fd;
};
#define bdev_kobj(_bdev) (&((_bdev)->kobj))
#define BLK_STS_AGAIN ((__force blk_status_t)12)
+#define BIO_INLINE_VECS 4
+
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
generic_make_request(bio);
}
-int blkdev_issue_discard(struct block_device *, sector_t,
- sector_t, gfp_t, unsigned long);
+int blkdev_issue_discard(struct block_device *, sector_t, sector_t, gfp_t);
#define bdev_get_queue(bdev) (&((bdev)->queue))
#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
#define SECTOR_MASK (PAGE_SECTORS - 1)
-#define blk_queue_discard(q) ((void) (q), 0)
+#define bdev_max_discard_sectors(bdev) ((void) (bdev), 0)
#define blk_queue_nonrot(q) ((void) (q), 0)
unsigned bdev_logical_block_size(struct block_device *bdev);
#define __TOOLS_LINUX_BUG_H
#include <assert.h>
+#include <stdio.h>
#include <linux/compiler.h>
#ifdef CONFIG_VALGRIND
#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2*!!(cond)]))
-#define BUG() do { assert(0); unreachable(); } while (0)
+#define BUG() do { fflush(stdout); assert(0); unreachable(); } while (0)
#define BUG_ON(cond) assert(!(cond))
#define WARN(cond, fmt, ...) \
--- /dev/null
+#ifndef _LINUX_ERRNAME_H
+#define _LINUX_ERRNAME_H
+
+#include <stdlib.h>
+#include <string.h>
+
+static inline const char *errname(int err)
+{
+ return strerror(abs(err));
+}
+
+#endif /* _LINUX_ERRNAME_H */
#define try_to_freeze()
#define set_freezable()
#define freezing(task) false
-#define freezable_schedule_timeout(_t) schedule_timeout(_t);
+#define freezable_schedule() schedule()
+#define freezable_schedule_timeout(_t) schedule_timeout(_t)
#endif /* __TOOLS_LINUX_FREEZER_H */
#define _LINUX_GENERIC_RADIX_TREE_H
/**
- * DOC: Generic radix trees/sparse arrays:
+ * DOC: Generic radix trees/sparse arrays
*
* Very simple and minimalistic, supporting arbitrary size entries up to
* PAGE_SIZE.
#include <asm/page.h>
#include <linux/bug.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
#include <linux/log2.h>
+#include <linux/math.h>
+#include <linux/types.h>
struct genradix_root;
struct __genradix {
- struct genradix_root __rcu *root;
+ struct genradix_root *root;
};
/*
#define __genradix_cast(_radix) (typeof((_radix)->type[0]) *)
#define __genradix_obj_size(_radix) sizeof((_radix)->type[0])
+#define __genradix_objs_per_page(_radix) \
+ (PAGE_SIZE / sizeof((_radix)->type[0]))
+#define __genradix_page_remainder(_radix) \
+ (PAGE_SIZE % sizeof((_radix)->type[0]))
+
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
#define genradix_iter_peek(_iter, _radix) \
(__genradix_cast(_radix) \
__genradix_iter_peek(_iter, &(_radix)->tree, \
- PAGE_SIZE / __genradix_obj_size(_radix)))
+ __genradix_objs_per_page(_radix)))
+
+void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *,
+ size_t, size_t);
+
+/**
+ * genradix_iter_peek_prev - get first entry at or below iterator's current
+ * position
+ * @_iter: a genradix_iter
+ * @_radix: genradix being iterated over
+ *
+ * If no more entries exist at or below @_iter's current position, returns NULL
+ */
+#define genradix_iter_peek_prev(_iter, _radix) \
+ (__genradix_cast(_radix) \
+ __genradix_iter_peek_prev(_iter, &(_radix)->tree, \
+ __genradix_objs_per_page(_radix), \
+ __genradix_obj_size(_radix) + \
+ __genradix_page_remainder(_radix)))
static inline void __genradix_iter_advance(struct genradix_iter *iter,
size_t obj_size)
{
- size_t new_offset = iter->offset + obj_size;
-
- if (new_offset < iter->offset) {
+ if (iter->offset + obj_size < iter->offset) {
iter->offset = SIZE_MAX;
iter->pos = SIZE_MAX;
return;
#define genradix_iter_advance(_iter, _radix) \
__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
+static inline void __genradix_iter_rewind(struct genradix_iter *iter,
+ size_t obj_size)
+{
+ if (iter->offset == 0 ||
+ iter->offset == SIZE_MAX) {
+ iter->offset = SIZE_MAX;
+ return;
+ }
+
+ if ((iter->offset & (PAGE_SIZE - 1)) == 0)
+ iter->offset -= PAGE_SIZE % obj_size;
+
+ iter->offset -= obj_size;
+ iter->pos--;
+}
+
+#define genradix_iter_rewind(_iter, _radix) \
+ __genradix_iter_rewind(_iter, __genradix_obj_size(_radix))
+
#define genradix_for_each_from(_radix, _iter, _p, _start) \
for (_iter = genradix_iter_init(_radix, _start); \
(_p = genradix_iter_peek(&_iter, _radix)) != NULL; \
#define genradix_for_each(_radix, _iter, _p) \
genradix_for_each_from(_radix, _iter, _p, 0)
+#define genradix_last_pos(_radix) \
+ (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1)
+
+/**
+ * genradix_for_each_reverse - iterate over entries in a genradix, reverse order
+ * @_radix: genradix to iterate over
+ * @_iter: a genradix_iter to track current position
+ * @_p: pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each_reverse(_radix, _iter, _p) \
+ for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\
+ (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\
+ genradix_iter_rewind(&_iter, _radix))
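+
+/*
+ * Usage sketch (illustrative only; "struct foo" is a stand-in type), after
+ * genradix_init() and populating entries with genradix_ptr_alloc():
+ *
+ *	GENRADIX(struct foo) foos;
+ *	struct genradix_iter iter;
+ *	struct foo *f;
+ *
+ *	genradix_for_each_reverse(&foos, iter, f)
+ *		pr_debug("entry at index %zu\n", iter.pos);
+ */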
+
int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
/**
(time_after_eq64(a, b) && \
time_before_eq64(a, c))
+#define time_is_before_jiffies(a) time_after(jiffies, a)
+
#define HZ 1000
static inline u64 jiffies_to_nsecs(const unsigned long j)
return sched_clock();
}
+static inline u64 ktime_get_ns(void)
+{
+ return sched_clock();
+}
+
#define jiffies nsecs_to_jiffies(sched_clock())
#endif
return kstrtoint(s, base, res);
}
+struct printbuf;
+extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list args);
+extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...);
+
+static const char hex_asc[] = "0123456789abcdef";
+#define hex_asc_lo(x) hex_asc[((x) & 0x0f)]
+#define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4]
+static const char hex_asc_upper[] = "0123456789ABCDEF";
+#define hex_asc_upper_lo(x) hex_asc_upper[((x) & 0x0f)]
+#define hex_asc_upper_hi(x) hex_asc_upper[((x) & 0xf0) >> 4]
+
/* The hash is always the low bits of hash_len */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define HASH_LEN_DECLARE u32 hash; u32 len
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * include/linux/kmemleak.h
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ */
+
+#ifndef __KMEMLEAK_H
+#define __KMEMLEAK_H
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+
+extern void kmemleak_init(void) __init;
+extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+ gfp_t gfp) __ref;
+extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+ gfp_t gfp) __ref;
+extern void kmemleak_vmalloc(const struct vm_struct *area, size_t size,
+ gfp_t gfp) __ref;
+extern void kmemleak_free(const void *ptr) __ref;
+extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
+extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
+extern void kmemleak_update_trace(const void *ptr) __ref;
+extern void kmemleak_not_leak(const void *ptr) __ref;
+extern void kmemleak_ignore(const void *ptr) __ref;
+extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
+extern void kmemleak_no_scan(const void *ptr) __ref;
+extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
+ gfp_t gfp) __ref;
+extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref;
+extern void kmemleak_ignore_phys(phys_addr_t phys) __ref;
+
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+ int min_count, slab_flags_t flags,
+ gfp_t gfp)
+{
+ if (!(flags & SLAB_NOLEAKTRACE))
+ kmemleak_alloc(ptr, size, min_count, gfp);
+}
+
+static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
+{
+ if (!(flags & SLAB_NOLEAKTRACE))
+ kmemleak_free(ptr);
+}
+
+static inline void kmemleak_erase(void **ptr)
+{
+ *ptr = NULL;
+}
+
+#else
+
+static inline void kmemleak_init(void)
+{
+}
+static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+ int min_count, slab_flags_t flags,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_vmalloc(const struct vm_struct *area, size_t size,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_free(const void *ptr)
+{
+}
+static inline void kmemleak_free_part(const void *ptr, size_t size)
+{
+}
+static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
+{
+}
+static inline void kmemleak_free_percpu(const void __percpu *ptr)
+{
+}
+static inline void kmemleak_update_trace(const void *ptr)
+{
+}
+static inline void kmemleak_not_leak(const void *ptr)
+{
+}
+static inline void kmemleak_ignore(const void *ptr)
+{
+}
+static inline void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
+{
+}
+static inline void kmemleak_erase(void **ptr)
+{
+}
+static inline void kmemleak_no_scan(const void *ptr)
+{
+}
+static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
+{
+}
+static inline void kmemleak_ignore_phys(phys_addr_t phys)
+{
+}
+
+#endif /* CONFIG_DEBUG_KMEMLEAK */
+
+#endif /* __KMEMLEAK_H */
struct kobj_type {
void (*release)(struct kobject *kobj);
const struct sysfs_ops *sysfs_ops;
- struct attribute **default_attrs;
+ const struct attribute_group **default_groups;
const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
const void *(*namespace)(struct kobject *kobj);
};
struct kobject {
struct kobject *parent;
struct kset *kset;
- struct kobj_type *ktype;
+ const struct kobj_type *ktype;
struct kernfs_node *sd; /* sysfs directory entry */
atomic_t ref;
unsigned int state_initialized:1;
#define kobject_add(...) 0
-static inline void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
+static inline void kobject_init(struct kobject *kobj, const struct kobj_type *ktype)
{
memset(kobj, 0, sizeof(*kobj));
static inline void kobject_cleanup(struct kobject *kobj)
{
- struct kobj_type *t = kobj->ktype;
+ const struct kobj_type *t = kobj->ktype;
/* remove from sysfs if the caller did not do it */
if (kobj->state_in_sysfs)
#define list_add(n, h) cds_list_add(n, h)
#define list_add_tail(n, h) cds_list_add_tail(n, h)
#define __list_del_entry(l) cds_list_del(l)
+#define __list_del(p, n) __cds_list_del(p, n)
#define list_del(l) cds_list_del(l)
#define list_del_init(l) cds_list_del_init(l)
#define list_replace(o, n) cds_list_replace(o, n)
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MEAN_AND_VARIANCE_H_
+#define MEAN_AND_VARIANCE_H_
+
+#include <linux/types.h>
+#include <linux/limits.h>
+#include <linux/math64.h>
+#include <linux/printbuf.h>
+
+#define SQRT_U64_MAX 4294967295ULL
+
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+typedef unsigned __int128 u128;
+
+static inline u128 u64_to_u128(u64 a)
+{
+ return (u128)a;
+}
+
+static inline u64 u128_to_u64(u128 a)
+{
+ return (u64)a;
+}
+
+static inline u64 u128_shr64_to_u64(u128 a)
+{
+ return (u64)(a >> 64);
+}
+
+static inline u128 u128_add(u128 a, u128 b)
+{
+ return a + b;
+}
+
+static inline u128 u128_sub(u128 a, u128 b)
+{
+ return a - b;
+}
+
+static inline u128 u128_shl(u128 i, s8 shift)
+{
+ return i << shift;
+}
+
+static inline u128 u128_shl64_add(u64 a, u64 b)
+{
+ return ((u128)a << 64) + b;
+}
+
+static inline u128 u128_square(u64 i)
+{
+ return i*i;
+}
+
+#else
+
+typedef struct {
+ u64 hi, lo;
+} u128;
+
+static inline u128 u64_to_u128(u64 a)
+{
+ return (u128){ .lo = a };
+}
+
+static inline u64 u128_to_u64(u128 a)
+{
+ return a.lo;
+}
+
+static inline u64 u128_shr64_to_u64(u128 a)
+{
+ return a.hi;
+}
+
+static inline u128 u128_add(u128 a, u128 b)
+{
+ u128 c;
+
+ c.lo = a.lo + b.lo;
+ c.hi = a.hi + b.hi + (c.lo < a.lo);
+ return c;
+}
+
+static inline u128 u128_sub(u128 a, u128 b)
+{
+ u128 c;
+
+ c.lo = a.lo - b.lo;
+ c.hi = a.hi - b.hi - (c.lo > a.lo);
+ return c;
+}
+
+static inline u128 u128_shl(u128 i, s8 shift)
+{
+ u128 r;
+
+ r.lo = i.lo << shift;
+ if (shift < 64)
+ r.hi = (i.hi << shift) | (i.lo >> (64 - shift));
+ else {
+ r.hi = i.lo << (shift - 64);
+ r.lo = 0;
+ }
+ return r;
+}
+
+static inline u128 u128_shl64_add(u64 a, u64 b)
+{
+ return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b));
+}
+
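+/*
+ * Schoolbook multiply: with i = (h << 32) + l,
+ * i^2 = (h*h << 64) + 2*(h*l << 32) + l*l
+ */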
+static inline u128 u128_square(u64 i)
+{
+ u128 r;
+ u64 h = i >> 32, l = i & (u64)U32_MAX;
+
+ r = u128_shl(u64_to_u128(h*h), 64);
+ r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
+ r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
+ r = u128_add(r, u64_to_u128(l*l));
+ return r;
+}
+
+#endif
+
+static inline u128 u128_div(u128 n, u64 d)
+{
+ u128 r;
+ u64 rem;
+ u64 hi = u128_shr64_to_u64(n);
+ u64 lo = u128_to_u64(n);
+ u64 h = hi & ((u64)U32_MAX << 32);
+ u64 l = (hi & (u64)U32_MAX) << 32;
+
+ r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
+ r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
+ r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+ return r;
+}
+
+struct mean_and_variance {
+ s64 n;
+ s64 sum;
+ u128 sum_squares;
+};
+
+/* exponentially weighted variant */
+struct mean_and_variance_weighted {
+ bool init;
+ u8 w;
+ s64 mean;
+ u64 variance;
+};
+
+inline s64 fast_divpow2(s64 n, u8 d);
+
+struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1);
+ s64 mean_and_variance_get_mean(struct mean_and_variance s);
+ u64 mean_and_variance_get_variance(struct mean_and_variance s1);
+ u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+
+struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1);
+ s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+ u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+ u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
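+
+/*
+ * Usage sketch (illustrative only):
+ *
+ *	struct mean_and_variance s = {};
+ *
+ *	s = mean_and_variance_update(s, 10);
+ *	s = mean_and_variance_update(s, 20);
+ *
+ *	mean_and_variance_get_mean(s) should now return 15
+ */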
+
+#endif // MEAN_AND_VARIANCE_H_
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_MM_H
+#define _TOOLS_LINUX_MM_H
+
+#include <linux/types.h>
+
+struct sysinfo {
+ long uptime; /* Seconds since boot */
+ unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
+ unsigned long totalram; /* Total usable main memory size */
+ unsigned long freeram; /* Available memory size */
+ unsigned long sharedram; /* Amount of shared memory */
+ unsigned long bufferram; /* Memory used by buffers */
+ unsigned long totalswap; /* Total swap space size */
+ unsigned long freeswap; /* swap space still available */
+ __u16 procs; /* Number of current processes */
+ __u16 pad; /* Explicit padding for m68k */
+ unsigned long totalhigh; /* Total high memory size */
+ unsigned long freehigh; /* Available high memory size */
+ __u32 mem_unit; /* Memory unit size in bytes */
+};
+
+extern void si_meminfo(struct sysinfo * val);
+
+#endif /* _TOOLS_LINUX_MM_H */
--- /dev/null
+#ifndef _LINUX_PRANDOM_H
+#define _LINUX_PRANDOM_H
+
+#include <linux/random.h>
+
+static inline void prandom_bytes(void *buf, int nbytes)
+{
+ return get_random_bytes(buf, nbytes);
+}
+
+#define prandom_type(type) \
+static inline type prandom_##type(void) \
+{ \
+ type v; \
+ \
+ prandom_bytes(&v, sizeof(v)); \
+ return v; \
+}
+
+prandom_type(int);
+prandom_type(long);
+prandom_type(u32);
+prandom_type(u64);
+#undef prandom_type
+
+#endif /* _LINUX_PRANDOM_H */
+
#define prefetch(p) \
({ __maybe_unused typeof(p) __var = (p); })
+#define prefetchw(p) \
+ ({ __maybe_unused typeof(p) __var = (p); })
+
#endif /* _LINUX_PREFETCH_H */
--- /dev/null
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _LINUX_PRETTY_PRINTERS_H
+#define _LINUX_PRETTY_PRINTERS_H
+
+void prt_string_option(struct printbuf *, const char * const[], size_t);
+void prt_bitflags(struct printbuf *, const char * const[], u64);
+
+#endif /* _LINUX_PRETTY_PRINTERS_H */
--- /dev/null
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _LINUX_PRINTBUF_H
+#define _LINUX_PRINTBUF_H
+
+/*
+ * Printbufs: Simple strings for printing to, with optional heap allocation
+ *
+ * This code has provisions for use in userspace, to aid in making other code
+ * portable between kernelspace and userspace.
+ *
+ * Basic example:
+ * struct printbuf buf = PRINTBUF;
+ *
+ * prt_printf(&buf, "foo=");
+ * foo_to_text(&buf, foo);
+ * printk("%s", buf.buf);
+ * printbuf_exit(&buf);
+ *
+ * Or
+ * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
+ *
+ * We can now write pretty printers instead of writing code that dumps
+ * everything to the kernel log buffer, and then those pretty-printers can be
+ * used by other code that outputs to kernel log, sysfs, debugfs, etc.
+ *
+ * Memory allocation: Outputting to a printbuf may allocate memory. This
+ * allocation is done with GFP_KERNEL, by default: use the newer
+ * memalloc_*_(save|restore) functions as needed.
+ *
+ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
+ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
+ *
+ * It's allowed to grab the output buffer and free it later with kfree() instead
+ * of using printbuf_exit(), if the user just needs a heap allocated string at
+ * the end.
+ *
+ * Memory allocation failures: We don't return errors directly, because on
+ * memory allocation failure we usually don't want to bail out and unwind - we
+ * want to print what we've got, on a best-effort basis. But code that does want
+ * to return -ENOMEM may check printbuf.allocation_failure.
+ *
+ * Indenting, tabstops:
+ *
+ * To aid in writing multi-line pretty printers spread across multiple
+ * functions, printbufs track the current indent level.
+ *
+ * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the
+ * current indent level, respectively.
+ *
+ * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from
+ * start of line. Once set, prt_tab() will output spaces up to the next tabstop.
+ * prt_tab_rjust() will also advance the current line of text up to the next
+ * tabstop, but it does so by shifting text since the previous tabstop up to the
+ * next tabstop - right justifying it.
+ *
+ * Make sure you use prt_newline() instead of \n in the format string for indent
+ * level and tabstops to work correctly.
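+ *
+ * A minimal tabstop sketch (illustrative only), using the helpers declared
+ * below:
+ *
+ *	struct printbuf buf = PRINTBUF;
+ *
+ *	printbuf_tabstop_push(&buf, 24);
+ *	prt_printf(&buf, "compression:");
+ *	prt_tab(&buf);
+ *	prt_str(&buf, "zstd");
+ *	prt_newline(&buf);
+ *	printbuf_exit(&buf);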
+ *
+ * Output units: printbuf->units exists to tell pretty-printers how to output
+ * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as
+ * human readable bytes. prt_units() obeys it.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+enum printbuf_si {
+ PRINTBUF_UNITS_2, /* use binary powers of 2^10 */
+ PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */
+};
+
+#define PRINTBUF_INLINE_TABSTOPS 4
+
+struct printbuf {
+ char *buf;
+ unsigned size;
+ unsigned pos;
+ unsigned last_newline;
+ unsigned last_field;
+ unsigned indent;
+ /*
+ * If nonzero, allocations will be done with GFP_ATOMIC:
+ */
+ u8 atomic;
+ bool allocation_failure:1;
+ bool heap_allocated:1;
+ enum printbuf_si si_units:1;
+ bool human_readable_units:1;
+ bool has_indent_or_tabstops:1;
+ bool suppress_indent_tabstop_handling:1;
+ u8 nr_tabstops;
+
+ /*
+ * Do not modify directly: use printbuf_tabstop_add(),
+ * printbuf_tabstop_get()
+ */
+ u8 cur_tabstop;
+ u8 _tabstops[PRINTBUF_INLINE_TABSTOPS];
+};
+
+int printbuf_make_room(struct printbuf *, unsigned);
+const char *printbuf_str(const struct printbuf *);
+void printbuf_exit(struct printbuf *);
+
+void printbuf_tabstops_reset(struct printbuf *);
+void printbuf_tabstop_pop(struct printbuf *);
+int printbuf_tabstop_push(struct printbuf *, unsigned);
+
+void printbuf_indent_add(struct printbuf *, unsigned);
+void printbuf_indent_sub(struct printbuf *, unsigned);
+
+void prt_newline(struct printbuf *);
+void prt_tab(struct printbuf *);
+void prt_tab_rjust(struct printbuf *);
+
+void prt_bytes_indented(struct printbuf *, const char *, unsigned);
+void prt_human_readable_u64(struct printbuf *, u64);
+void prt_human_readable_s64(struct printbuf *, s64);
+void prt_units_u64(struct printbuf *, u64);
+void prt_units_s64(struct printbuf *, s64);
+
+/* Initializer for a heap allocated printbuf: */
+#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
+
+/* Initializer for a printbuf that points to an external buffer: */
+#define PRINTBUF_EXTERN(_buf, _size) \
+((struct printbuf) { \
+ .buf = _buf, \
+ .size = _size, \
+})
+
+/*
+ * Returns size remaining of output buffer:
+ */
+static inline unsigned printbuf_remaining_size(struct printbuf *out)
+{
+ return out->pos < out->size ? out->size - out->pos : 0;
+}
+
+/*
+ * Returns number of characters we can print to the output buffer - i.e.
+ * excluding the terminating nul:
+ */
+static inline unsigned printbuf_remaining(struct printbuf *out)
+{
+ return out->pos < out->size ? out->size - out->pos - 1 : 0;
+}
+
+static inline unsigned printbuf_written(struct printbuf *out)
+{
+ return out->size ? min(out->pos, out->size - 1) : 0;
+}
+
+/*
+ * Returns true if output was truncated:
+ */
+static inline bool printbuf_overflowed(struct printbuf *out)
+{
+ return out->pos >= out->size;
+}
+
+static inline void printbuf_nul_terminate(struct printbuf *out)
+{
+ printbuf_make_room(out, 1);
+
+ if (out->pos < out->size)
+ out->buf[out->pos] = 0;
+ else if (out->size)
+ out->buf[out->size - 1] = 0;
+}
+
+/* Doesn't call printbuf_make_room(), doesn't nul terminate: */
+static inline void __prt_char_reserved(struct printbuf *out, char c)
+{
+ if (printbuf_remaining(out))
+ out->buf[out->pos] = c;
+ out->pos++;
+}
+
+/* Doesn't nul terminate: */
+static inline void __prt_char(struct printbuf *out, char c)
+{
+ printbuf_make_room(out, 1);
+ __prt_char_reserved(out, c);
+}
+
+static inline void prt_char(struct printbuf *out, char c)
+{
+ __prt_char(out, c);
+ printbuf_nul_terminate(out);
+}
+
+static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
+{
+ unsigned i, can_print = min(n, printbuf_remaining(out));
+
+ for (i = 0; i < can_print; i++)
+ out->buf[out->pos++] = c;
+ out->pos += n - can_print;
+}
+
+static inline void prt_chars(struct printbuf *out, char c, unsigned n)
+{
+ printbuf_make_room(out, n);
+ __prt_chars_reserved(out, c, n);
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
+{
+ unsigned i, can_print;
+
+ printbuf_make_room(out, n);
+
+ can_print = min(n, printbuf_remaining(out));
+
+ for (i = 0; i < can_print; i++)
+ out->buf[out->pos++] = ((char *) b)[i];
+ out->pos += n - can_print;
+
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_str(struct printbuf *out, const char *str)
+{
+ prt_bytes(out, str, strlen(str));
+}
+
+static inline void prt_str_indented(struct printbuf *out, const char *str)
+{
+ prt_bytes_indented(out, str, strlen(str));
+}
+
+static inline void prt_hex_byte(struct printbuf *out, u8 byte)
+{
+ printbuf_make_room(out, 2);
+ __prt_char_reserved(out, hex_asc_hi(byte));
+ __prt_char_reserved(out, hex_asc_lo(byte));
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
+{
+ printbuf_make_room(out, 2);
+ __prt_char_reserved(out, hex_asc_upper_hi(byte));
+ __prt_char_reserved(out, hex_asc_upper_lo(byte));
+ printbuf_nul_terminate(out);
+}
+
+/**
+ * printbuf_reset - re-use a printbuf without freeing and re-initializing it:
+ */
+static inline void printbuf_reset(struct printbuf *buf)
+{
+ buf->pos = 0;
+ buf->allocation_failure = 0;
+ buf->indent = 0;
+ buf->nr_tabstops = 0;
+ buf->cur_tabstop = 0;
+}
+
+/**
+ * printbuf_atomic_inc - mark as entering an atomic section
+ */
+static inline void printbuf_atomic_inc(struct printbuf *buf)
+{
+ buf->atomic++;
+}
+
+/**
+ * printbuf_atomic_dec - mark as leaving an atomic section
+ */
+static inline void printbuf_atomic_dec(struct printbuf *buf)
+{
+ buf->atomic--;
+}
+
+/*
+ * This is used for the %pf(%p) sprintf format extension, where we pass a pretty
+ * printer and arguments to the pretty-printer to sprintf
+ *
+ * Instead of passing a pretty-printer function to sprintf directly, we pass it
+ * a pointer to a struct call_pp, so that sprintf can check that the magic
+ * number is present, which in turn ensures that the CALL_PP() macro has been
+ * used in order to typecheck the arguments to the pretty printer function
+ *
+ * Example usage:
+ * sprintf("%pf(%p)", CALL_PP(prt_bdev, bdev));
+ */
+struct call_pp {
+ unsigned long magic;
+ void *fn;
+};
+
+#define PP_TYPECHECK(fn, ...) \
+ ({ while (0) fn((struct printbuf *) NULL, ##__VA_ARGS__); })
+
+#define CALL_PP_MAGIC (unsigned long) 0xce0b92d22f6b6be4
+
+#define CALL_PP(fn, ...) \
+ (PP_TYPECHECK(fn, ##__VA_ARGS__), \
+ &((struct call_pp) { CALL_PP_MAGIC, fn })), ##__VA_ARGS__
+
+#endif /* _LINUX_PRINTBUF_H */
#define pr_fmt(fmt) fmt
#endif
+#include <linux/compiler.h>
#include <stdarg.h>
#include <stdio.h>
* ratelimited messages with local ratelimit_state,
* no local ratelimit_state used in the !PRINTK case
*/
-#ifdef CONFIG_PRINTK
#define printk_ratelimited(fmt, ...) \
({ \
static DEFINE_RATELIMIT_STATE(_rs, \
if (__ratelimit(&_rs)) \
printk(fmt, ##__VA_ARGS__); \
})
-#else
-#define printk_ratelimited(fmt, ...) \
- no_printk(fmt, ##__VA_ARGS__)
-#endif
#define pr_emerg_ratelimited(fmt, ...) \
printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
BUG_ON(getrandom(buf, nbytes, 0) != nbytes);
}
-static inline void prandom_bytes(void *buf, int nbytes)
-{
- return get_random_bytes(buf, nbytes);
-}
-
#define get_random_type(type) \
static inline type get_random_##type(void) \
{ \
}
#define down_read(l) pthread_rwlock_rdlock(&(l)->lock)
+#define down_read_killable(l) (pthread_rwlock_rdlock(&(l)->lock), 0)
#define down_read_trylock(l) (!pthread_rwlock_tryrdlock(&(l)->lock))
#define up_read(l) pthread_rwlock_unlock(&(l)->lock)
#include <linux/bug.h>
#include <linux/completion.h>
#include <linux/jiffies.h>
+#include <linux/rwsem.h>
#include <linux/time64.h>
#define TASK_RUNNING 0
pid_t pid;
struct bio_list *bio_list;
+
+ struct signal_struct {
+ struct rw_semaphore exec_update_lock;
+ } *signal, _signal;
};
extern __thread struct task_struct *current;
#define current_kernel_time64() current_kernel_time()
#define CURRENT_TIME (current_kernel_time())
+static inline unsigned int stack_trace_save_tsk(struct task_struct *task,
+ unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ return 0;
+}
+
#endif /* __TOOLS_LINUX_SCHED_H */
#define SHRINK_STOP (~0UL)
+struct printbuf;
struct shrinker {
unsigned long (*count_objects)(struct shrinker *,
struct shrink_control *sc);
unsigned long (*scan_objects)(struct shrinker *,
struct shrink_control *sc);
+ void (*to_text)(struct printbuf *, struct shrinker *);
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
struct list_head list;
};
-int register_shrinker(struct shrinker *);
+int register_shrinker(struct shrinker *, const char *, ...);
void unregister_shrinker(struct shrinker *);
-void run_shrinkers(void);
+void run_shrinkers(gfp_t gfp_mask, bool);
#endif /* __TOOLS_LINUX_SHRINKER_H */
*/
#include <linux/lockdep.h>
-#include <linux/osq_lock.h>
#include <linux/sched.h>
#include <linux/types.h>
struct six_lock {
union six_lock_state state;
- unsigned intent_lock_recurse;
struct task_struct *owner;
- struct optimistic_spin_queue osq;
unsigned __percpu *readers;
-
+ unsigned intent_lock_recurse;
+ unsigned long ip;
raw_spinlock_t wait_lock;
- struct list_head wait_list[2];
+ struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
};
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum six_lock_type lock_want;
+ bool lock_acquired;
+ u64 start_time;
+};
+
typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
static __always_inline void __six_lock_init(struct six_lock *lock,
{
atomic64_set(&lock->state.counter, 0);
raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
+ INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
debug_check_no_locks_freed((void *) lock, sizeof(*lock));
lockdep_init_map(&lock->dep_map, name, key, 0);
bool six_trylock_##type(struct six_lock *); \
bool six_relock_##type(struct six_lock *, u32); \
int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
+int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *, \
+ six_lock_should_sleep_fn, void *); \
void six_unlock_##type(struct six_lock *);
__SIX_LOCK(read)
SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
}
+static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p);
+}
+
static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
SIX_LOCK_DISPATCH(type, six_unlock, lock);
void six_lock_wakeup_all(struct six_lock *);
-void six_lock_pcpu_free_rcu(struct six_lock *);
void six_lock_pcpu_free(struct six_lock *);
void six_lock_pcpu_alloc(struct six_lock *);
+struct six_lock_count {
+ unsigned n[3];
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+
#endif /* _LINUX_SIX_H */
#include <linux/kernel.h>
#include <linux/log2.h>
+#include <linux/overflow.h>
#include <linux/page.h>
#include <linux/shrinker.h>
#include <linux/types.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
#define ARCH_KMALLOC_MINALIGN 16
#define KMALLOC_MAX_SIZE SIZE_MAX
void *p;
do {
- run_shrinkers();
+ run_shrinkers(flags, i != 0);
if (size) {
size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE);
return new;
}
+static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags)
+{
+ size_t bytes;
+
+ if (unlikely(check_mul_overflow(new_n, new_size, &bytes)))
+ return NULL;
+
+ return krealloc(p, bytes, flags);
+}
+
#define kzalloc(size, flags) kmalloc(size, flags|__GFP_ZERO)
#define kmalloc_array(n, size, flags) \
((size) != 0 && (n) > SIZE_MAX / (size) \
void *p;
do {
- run_shrinkers();
+ run_shrinkers(flags, i != 0);
p = aligned_alloc(PAGE_SIZE, size);
if (p && (flags & __GFP_ZERO))
#define KMEM_CACHE(_struct, _flags) kmem_cache_create(sizeof(struct _struct))
+#define PAGE_KERNEL 0
+#define PAGE_KERNEL_EXEC 1
+
+#define vfree(p) free(p)
+
+static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask)
+{
+ unsigned i = 0;
+ void *p;
+
+ size = round_up(size, PAGE_SIZE);
+
+ do {
+ run_shrinkers(gfp_mask, i != 0);
+
+ p = aligned_alloc(PAGE_SIZE, size);
+ if (p && gfp_mask & __GFP_ZERO)
+ memset(p, 0, size);
+ } while (!p && i++ < 10);
+
+ return p;
+}
+
+static inline void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
+{
+ void *p;
+
+ p = __vmalloc(size, gfp_mask);
+ if (!p)
+ return NULL;
+
+ if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
+ vfree(p);
+ return NULL;
+ }
+
+ return p;
+}
+
+static inline void *vmalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL);
+}
+
+static inline void *vzalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL|__GFP_ZERO);
+}
+
#endif /* __TOOLS_LINUX_SLAB_H */
#define __TOOLS_LINUX_SPINLOCK_H
#include <linux/atomic.h>
+#include <pthread.h>
typedef struct {
- int count;
+ pthread_mutex_t lock;
} raw_spinlock_t;
-#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .count = 0 }
+#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER }
static inline void raw_spin_lock_init(raw_spinlock_t *lock)
{
- smp_store_release(&lock->count, 0);
+ pthread_mutex_init(&lock->lock, NULL);
+}
+
+static inline bool raw_spin_trylock(raw_spinlock_t *lock)
+{
+ return !pthread_mutex_trylock(&lock->lock);
}
static inline void raw_spin_lock(raw_spinlock_t *lock)
{
- while (xchg_acquire(&lock->count, 1))
- ;
+ pthread_mutex_lock(&lock->lock);
}
static inline void raw_spin_unlock(raw_spinlock_t *lock)
{
- smp_store_release(&lock->count, 0);
+ pthread_mutex_unlock(&lock->lock);
}
#define raw_spin_lock_irq(lock) raw_spin_lock(lock)
#include <linux/types.h> /* for size_t */
extern size_t strlcpy(char *dest, const char *src, size_t size);
+extern ssize_t strscpy(char *dest, const char *src, size_t count);
extern char *strim(char *);
extern void memzero_explicit(void *, size_t);
int match_string(const char * const *, size_t, const char *);
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_STRING_HELPERS_H_
+#define _LINUX_STRING_HELPERS_H_
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+/* Descriptions of the types of units to print in */
+enum string_size_units {
+ STRING_UNITS_10, /* use powers of 10^3 (standard SI) */
+ STRING_UNITS_2, /* use binary powers of 2^10 */
+};
+
+int string_get_size(u64 size, u64 blk_size, enum string_size_units units,
+ char *buf, int len);
+
+#endif
umode_t mode;
};
+struct attribute_group {
+ struct attribute **attrs;
+};
+
struct sysfs_ops {
ssize_t (*show)(struct kobject *, struct attribute *, char *);
ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t);
#include <stdint.h>
#include <fcntl.h>
+#include <sys/stat.h>
#include <sys/types.h>
#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
typedef unsigned gfp_t;
-#define GFP_KERNEL 0
#define GFP_ATOMIC 0
#define GFP_NOFS 0
#define GFP_NOIO 0
#define __GFP_NORETRY 0
#define __GFP_NOFAIL 0
#define __GFP_ZERO 1
+#define GFP_KERNEL 2
#define PAGE_ALLOC_COSTLY_ORDER 6
typedef int (*cmp_func_t)(const void *a, const void *b);
+typedef unsigned int __bitwise slab_flags_t;
+typedef u64 phys_addr_t;
+struct vm_struct;
+
#endif /* _TOOLS_LINUX_TYPES_H_ */
#ifndef __TOOLS_LINUX_VMALLOC_H
#define __TOOLS_LINUX_VMALLOC_H
-#include <stdlib.h>
-#include <sys/mman.h>
-
#include "linux/slab.h"
-#include "tools-util.h"
-
-#define PAGE_KERNEL 0
-#define PAGE_KERNEL_EXEC 1
-
-#define vfree(p) free(p)
-
-static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask)
-{
- unsigned i = 0;
- void *p;
-
- size = round_up(size, PAGE_SIZE);
-
- do {
- run_shrinkers();
-
- p = aligned_alloc(PAGE_SIZE, size);
- if (p && gfp_mask & __GFP_ZERO)
- memset(p, 0, size);
- } while (!p && i++ < 10);
-
- return p;
-}
-
-static inline void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
-{
- void *p;
-
- p = __vmalloc(size, gfp_mask);
- if (!p)
- return NULL;
-
- if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
- vfree(p);
- return NULL;
- }
-
- return p;
-}
-
-static inline void *vmalloc(unsigned long size)
-{
- return __vmalloc(size, GFP_KERNEL);
-}
-
-static inline void *vzalloc(unsigned long size)
-{
- return __vmalloc(size, GFP_KERNEL|__GFP_ZERO);
-}
#endif /* __TOOLS_LINUX_VMALLOC_H */
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd) and
+ * the GPLv2 (found in the COPYING file in the root directory of
+ * https://github.com/facebook/zstd). You may select, at your option, one of the
+ * above-listed licenses.
+ */
+
+#ifndef LINUX_ZSTD_H
+#define LINUX_ZSTD_H
+
+/**
+ * This is a kernel-style API that wraps the upstream zstd API, which cannot be
+ * used directly because the symbols aren't exported. It exposes the minimal
+ * functionality which is currently required by users of zstd in the kernel.
+ * Expose extra functions from lib/zstd/zstd.h as needed.
+ */
+
+/* ====== Dependency ====== */
+#include <linux/types.h>
#include <zstd.h>
+#include <linux/zstd_errors.h>
+
+/* ====== Helper Functions ====== */
+/**
+ * zstd_compress_bound() - maximum compressed size in worst case scenario
+ * @src_size: The size of the data to compress.
+ *
+ * Return: The maximum compressed size in the worst case scenario.
+ */
+size_t zstd_compress_bound(size_t src_size);
+
+/**
+ * zstd_is_error() - tells if a size_t function result is an error code
+ * @code: The function result to check for error.
+ *
+ * Return: Non-zero iff the code is an error.
+ */
+unsigned int zstd_is_error(size_t code);
+
+/**
+ * enum zstd_error_code - zstd error codes
+ */
+typedef ZSTD_ErrorCode zstd_error_code;
+
+/**
+ * zstd_get_error_code() - translates an error function result to an error code
+ * @code: The function result for which zstd_is_error(code) is true.
+ *
+ * Return: A unique error code for this error.
+ */
+zstd_error_code zstd_get_error_code(size_t code);
+
+/**
+ * zstd_get_error_name() - translates an error function result to a string
+ * @code: The function result for which zstd_is_error(code) is true.
+ *
+ * Return: An error string corresponding to the error code.
+ */
+const char *zstd_get_error_name(size_t code);
+
+/**
+ * zstd_min_clevel() - minimum allowed compression level
+ *
+ * Return: The minimum allowed compression level.
+ */
+int zstd_min_clevel(void);
+
+/**
+ * zstd_max_clevel() - maximum allowed compression level
+ *
+ * Return: The maximum allowed compression level.
+ */
+int zstd_max_clevel(void);
+
+/* ====== Parameter Selection ====== */
+
+/**
+ * enum zstd_strategy - zstd compression search strategy
+ *
+ * From faster to stronger. See zstd_lib.h.
+ */
+typedef ZSTD_strategy zstd_strategy;
+
+/**
+ * struct zstd_compression_parameters - zstd compression parameters
+ * @windowLog: Log of the largest match distance. Larger means more
+ * compression, and more memory needed during decompression.
+ * @chainLog: Fully searched segment. Larger means more compression,
+ * slower, and more memory (useless for fast).
+ * @hashLog: Dispatch table. Larger means more compression,
+ * slower, and more memory.
+ * @searchLog: Number of searches. Larger means more compression and slower.
+ * @searchLength: Match length searched. Larger means faster decompression,
+ * sometimes less compression.
+ * @targetLength: Acceptable match size for optimal parser (only). Larger means
+ * more compression, and slower.
+ * @strategy: The zstd compression strategy.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_compressionParameters zstd_compression_parameters;
+
+/**
+ * struct zstd_frame_parameters - zstd frame parameters
+ * @contentSizeFlag: Controls whether content size will be present in the
+ * frame header (when known).
+ * @checksumFlag: Controls whether a 32-bit checksum is generated at the
+ * end of the frame for error detection.
+ * @noDictIDFlag: Controls whether dictID will be saved into the frame
+ * header when using dictionary compression.
+ *
+ * The default value is all fields set to 0. See zstd_lib.h.
+ */
+typedef ZSTD_frameParameters zstd_frame_parameters;
+
+/**
+ * struct zstd_parameters - zstd parameters
+ * @cParams: The compression parameters.
+ * @fParams: The frame parameters.
+ */
+typedef ZSTD_parameters zstd_parameters;
+
+/**
+ * zstd_get_params() - returns zstd_parameters for selected level
+ * @level: The compression level
+ * @estimated_src_size: The estimated source size to compress or 0
+ * if unknown.
+ *
+ * Return: The selected zstd_parameters.
+ */
+zstd_parameters zstd_get_params(int level,
+ unsigned long long estimated_src_size);
+
+/* ====== Single-pass Compression ====== */
+
+typedef ZSTD_CCtx zstd_cctx;
+
+/**
+ * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx
+ * @parameters: The compression parameters to be used.
+ *
+ * If multiple compression parameters might be used, the caller must call
+ * zstd_cctx_workspace_bound() for each set of parameters and use the maximum
+ * size.
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * zstd_init_cctx().
+ */
+size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters);
+
+/**
+ * zstd_init_cctx() - initialize a zstd compression context
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspace_size: The size of workspace. Use zstd_cctx_workspace_bound() to
+ * determine how large the workspace must be.
+ *
+ * Return: A zstd compression context or NULL on error.
+ */
+zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size);
+
+/**
+ * zstd_compress_cctx() - compress src into dst with the initialized parameters
+ * @cctx: The context. Must have been initialized with zstd_init_cctx().
+ * @dst: The buffer to compress src into.
+ * @dst_capacity: The size of the destination buffer. May be any size, but
+ * ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src: The data to compress.
+ * @src_size: The size of the data to compress.
+ * @parameters: The compression parameters to be used.
+ *
+ * Return: The compressed size or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
+ const void *src, size_t src_size, const zstd_parameters *parameters);
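+
+/*
+ * Single-pass compression sketch (illustrative only; src, dst and the sizes
+ * are placeholders, and error handling/workspace freeing are omitted):
+ *
+ *	zstd_parameters params = zstd_get_params(3, src_size);
+ *	size_t wksp_size = zstd_cctx_workspace_bound(&params.cParams);
+ *	void *wksp = kmalloc(wksp_size, GFP_KERNEL);
+ *	zstd_cctx *cctx = zstd_init_cctx(wksp, wksp_size);
+ *	size_t ret = zstd_compress_cctx(cctx, dst, dst_capacity,
+ *					src, src_size, &params);
+ *	if (zstd_is_error(ret))
+ *		return -EIO;
+ */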
+
+/* ====== Single-pass Decompression ====== */
+
+typedef ZSTD_DCtx zstd_dctx;
+
+/**
+ * zstd_dctx_workspace_bound() - max memory needed to initialize a zstd_dctx
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * zstd_init_dctx().
+ */
+size_t zstd_dctx_workspace_bound(void);
+
+/**
+ * zstd_init_dctx() - initialize a zstd decompression context
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspace_size: The size of workspace. Use zstd_dctx_workspace_bound() to
+ * determine how large the workspace must be.
+ *
+ * Return: A zstd decompression context or NULL on error.
+ */
+zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size);
+
+/**
+ * zstd_decompress_dctx() - decompress zstd compressed src into dst
+ * @dctx: The decompression context.
+ * @dst: The buffer to decompress src into.
+ * @dst_capacity: The size of the destination buffer. Must be at least as large
+ * as the decompressed size. If the caller cannot upper bound the
+ * decompressed size, then it's better to use the streaming API.
+ * @src: The zstd compressed data to decompress. Multiple concatenated
+ * frames and skippable frames are allowed.
+ * @src_size: The exact size of the data to decompress.
+ *
+ * Return: The decompressed size or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
+ const void *src, size_t src_size);
+
+/* ====== Streaming Buffers ====== */
+
+/**
+ * struct zstd_in_buffer - input buffer for streaming
+ * @src: Start of the input buffer.
+ * @size: Size of the input buffer.
+ * @pos: Position where reading stopped. Will be updated.
+ * Necessarily 0 <= pos <= size.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_inBuffer zstd_in_buffer;
+
+/**
+ * struct zstd_out_buffer - output buffer for streaming
+ * @dst: Start of the output buffer.
+ * @size: Size of the output buffer.
+ * @pos: Position where writing stopped. Will be updated.
+ * Necessarily 0 <= pos <= size.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_outBuffer zstd_out_buffer;
+
+/* ====== Streaming Compression ====== */
+
+typedef ZSTD_CStream zstd_cstream;
+
+/**
+ * zstd_cstream_workspace_bound() - memory needed to initialize a zstd_cstream
+ * @cparams: The compression parameters to be used for compression.
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * zstd_init_cstream().
+ */
+size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams);
+
+/**
+ * zstd_init_cstream() - initialize a zstd streaming compression context
+ * @parameters:       The zstd parameters to use for compression.
+ * @pledged_src_size: If params.fParams.contentSizeFlag == 1 then the caller
+ * must pass the source size (zero means empty source).
+ * Otherwise, the caller may optionally pass the source
+ * size, or zero if unknown.
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspace_size: The size of workspace.
+ * Use zstd_cstream_workspace_bound(params->cparams) to
+ * determine how large the workspace must be.
+ *
+ * Return: The zstd streaming compression context or NULL on error.
+ */
+zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters,
+ unsigned long long pledged_src_size, void *workspace, size_t workspace_size);
+
+/**
+ * zstd_reset_cstream() - reset the context using parameters from creation
+ * @cstream: The zstd streaming compression context to reset.
+ * @pledged_src_size: Optionally the source size, or zero if unknown.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused. If `pledged_src_size` is non-zero the frame
+ * content size is always written into the frame header.
+ *
+ * Return: Zero or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_reset_cstream(zstd_cstream *cstream,
+ unsigned long long pledged_src_size);
+
+/**
+ * zstd_compress_stream() - streaming compress some of input into output
+ * @cstream: The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ * compressed data was written.
+ * @input: Source buffer. `input->pos` is updated to indicate how much data
+ * was read. Note that it may not consume the entire input, in which
+ * case `input->pos < input->size`, and it's up to the caller to
+ * present remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ *
+ * Return: A hint for the number of bytes to use as the input for the next
+ * function call or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output,
+ zstd_in_buffer *input);
+
+/**
+ * zstd_flush_stream() - flush internal buffers into output
+ * @cstream: The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ * compressed data was written.
+ *
+ * zstd_flush_stream() must be called until it returns 0, meaning all the data
+ * has been flushed. Since zstd_flush_stream() causes a block to be ended,
+ * calling it too often will degrade the compression ratio.
+ *
+ * Return: The number of bytes still present within internal buffers or an
+ * error, which can be checked using zstd_is_error().
+ */
+size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output);
+
+/**
+ * zstd_end_stream() - flush internal buffers into output and end the frame
+ * @cstream: The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ * compressed data was written.
+ *
+ * zstd_end_stream() must be called until it returns 0, meaning all the data has
+ * been flushed and the frame epilogue has been written.
+ *
+ * Return: The number of bytes still present within internal buffers or an
+ * error, which can be checked using zstd_is_error().
+ */
+size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output);
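+
+/*
+ * Streaming compression loop sketch (illustrative only; assumes cstream was
+ * set up with zstd_init_cstream() and in/out point at caller-owned buffers):
+ *
+ *	zstd_in_buffer in = { src, src_size, 0 };
+ *	zstd_out_buffer out = { dst, dst_capacity, 0 };
+ *	size_t left;
+ *
+ *	while (in.pos < in.size)
+ *		if (zstd_is_error(zstd_compress_stream(cstream, &out, &in)))
+ *			return -EIO;
+ *
+ *	do {
+ *		left = zstd_end_stream(cstream, &out);
+ *		if (zstd_is_error(left))
+ *			return -EIO;
+ *	} while (left);
+ */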
+
+/* ====== Streaming Decompression ====== */
+
+typedef ZSTD_DStream zstd_dstream;
+
+/**
+ * zstd_dstream_workspace_bound() - memory needed to initialize a zstd_dstream
+ * @max_window_size: The maximum window size allowed for compressed frames.
+ *
+ * Return: A lower bound on the size of the workspace that is passed
+ * to zstd_init_dstream().
+ */
+size_t zstd_dstream_workspace_bound(size_t max_window_size);
+
+/**
+ * zstd_init_dstream() - initialize a zstd streaming decompression context
+ * @max_window_size: The maximum window size allowed for compressed frames.
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspace_size: The size of workspace.
+ * Use zstd_dstream_workspace_bound(max_window_size) to
+ * determine how large the workspace must be.
+ *
+ * Return: The zstd streaming decompression context.
+ */
+zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
+ size_t workspace_size);
+
+/**
+ * zstd_reset_dstream() - reset the context using parameters from creation
+ * @dstream: The zstd streaming decompression context to reset.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused.
+ *
+ * Return: Zero or an error, which can be checked using zstd_is_error().
+ */
+size_t zstd_reset_dstream(zstd_dstream *dstream);
+
+/**
+ * zstd_decompress_stream() - streaming decompress some of input into output
+ * @dstream: The zstd streaming decompression context.
+ * @output: Destination buffer. `output.pos` is updated to indicate how much
+ * decompressed data was written.
+ * @input: Source buffer. `input.pos` is updated to indicate how much data was
+ * read. Note that it may not consume the entire input, in which case
+ * `input.pos < input.size`, and it's up to the caller to present
+ * remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ * zstd_decompress_stream() will not consume the last byte of the frame until
+ * the entire frame is flushed.
+ *
+ * Return: Returns 0 iff a frame is completely decoded and fully flushed.
+ * Otherwise returns a hint for the number of bytes to use as the
+ * input for the next function call or an error, which can be checked
+ * using zstd_is_error(). The size hint will never load more than the
+ * frame.
+ */
+size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output,
+ zstd_in_buffer *input);
+
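Decompression is symmetric: zstd_decompress_stream() is called until it returns 0 for the frame. A rough sketch under the same assumptions (illustrative helper name; output buffer assumed large enough):

    /*
     * Illustrative sketch only: decode a single frame from src into dst
     * using a dstream from zstd_init_dstream().  Returns the decompressed
     * size, or a zstd error code checkable with zstd_is_error().
     */
    static size_t decompress_one_frame(zstd_dstream *dstream,
                                       void *dst, size_t dst_len,
                                       const void *src, size_t src_len)
    {
            zstd_in_buffer  in  = { .src = src, .size = src_len, .pos = 0 };
            zstd_out_buffer out = { .dst = dst, .size = dst_len, .pos = 0 };
            size_t ret;

            do {
                    ret = zstd_decompress_stream(dstream, &out, &in);
                    if (zstd_is_error(ret))
                            return ret;
            } while (ret != 0);             /* 0: frame decoded and flushed */

            return out.pos;
    }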
+/* ====== Frame Inspection Functions ====== */
+
+/**
+ * zstd_find_frame_compressed_size() - returns the size of a compressed frame
+ * @src: Source buffer. It should point to the start of a zstd encoded
+ * frame or a skippable frame.
+ * @src_size: The size of the source buffer. It must be at least as large as the
+ * size of the frame.
+ *
+ * Return: The compressed size of the frame pointed to by `src` or an error,
+ * which can be checked with zstd_is_error().
+ * Suitable to pass to ZSTD_decompress() or similar functions.
+ */
+size_t zstd_find_frame_compressed_size(const void *src, size_t src_size);
-#define ZSTD_initDCtx(w, s) ZSTD_initStaticDCtx(w, s)
-#define ZSTD_initCCtx(w, s) ZSTD_initStaticCCtx(w, s)
+/**
+ * struct zstd_frame_params - zstd frame parameters stored in the frame header
+ * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not
+ * present.
+ * @windowSize: The window size, or 0 if the frame is a skippable frame.
+ * @blockSizeMax: The maximum block size.
+ * @frameType: The frame type (zstd or skippable)
+ * @headerSize: The size of the frame header.
+ * @dictID: The dictionary id, or 0 if not present.
+ * @checksumFlag: Whether a checksum was used.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_frameHeader zstd_frame_header;
-#define ZSTD_compressCCtx(w, dst, d_len, src, src_len, params) \
- ZSTD_compressCCtx(w, dst, d_len, src, src_len, 0)
+/**
+ * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame
+ * @params: On success the frame parameters are written here.
+ * @src: The source buffer. It must point to a zstd or skippable frame.
+ * @src_size: The size of the source buffer.
+ *
+ * Return: 0 on success. If more data is required it returns how many bytes
+ * must be provided to make forward progress. Otherwise it returns
+ * an error, which can be checked using zstd_is_error().
+ */
+size_t zstd_get_frame_header(zstd_frame_header *params, const void *src,
+ size_t src_size);
-#define ZSTD_CCtxWorkspaceBound(p) ZSTD_estimateCCtxSize(0)
-#define ZSTD_DCtxWorkspaceBound() ZSTD_estimateDCtxSize()
+#endif /* LINUX_ZSTD_H */
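A common use of the frame-inspection helpers is sizing the destination buffer before decompressing; a hedged sketch (helper name and error mapping are illustrative, only the zstd_* calls come from the header above):

    /*
     * Illustrative sketch only: read the frame header to learn the content
     * size.  It may legitimately report ZSTD_CONTENTSIZE_UNKNOWN for
     * frames produced by streaming compression without a pledged size.
     */
    static int peek_frame_content_size(const void *src, size_t src_size,
                                       unsigned long long *content_size)
    {
            zstd_frame_header h;
            size_t ret = zstd_get_frame_header(&h, src, src_size);

            if (ret != 0)           /* error, or more header bytes required */
                    return -EINVAL;

            *content_size = h.frameContentSize;
            return 0;
    }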
--- /dev/null
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+
+/*===== dependency =====*/
+#include <linux/types.h> /* size_t */
+
+
+/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
+#define ZSTDERRORLIB_VISIBILITY
+#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+
+/*-*********************************************
+ * Error codes list
+ *-*********************************************
+ * Error codes _values_ are pinned down since v1.3.1 only.
+ * Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ * Only values < 100 are considered stable.
+ *
+ * note 1 : this API shall be used with static linking only.
+ * dynamic linking is not yet officially supported.
+ * note 2 : Prefer relying on the enum than on its value whenever possible
+ * This is the only supported way to use the error list < v1.3.1
+ * note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+ ZSTD_error_no_error = 0,
+ ZSTD_error_GENERIC = 1,
+ ZSTD_error_prefix_unknown = 10,
+ ZSTD_error_version_unsupported = 12,
+ ZSTD_error_frameParameter_unsupported = 14,
+ ZSTD_error_frameParameter_windowTooLarge = 16,
+ ZSTD_error_corruption_detected = 20,
+ ZSTD_error_checksum_wrong = 22,
+ ZSTD_error_dictionary_corrupted = 30,
+ ZSTD_error_dictionary_wrong = 32,
+ ZSTD_error_dictionaryCreation_failed = 34,
+ ZSTD_error_parameter_unsupported = 40,
+ ZSTD_error_parameter_outOfBound = 42,
+ ZSTD_error_tableLog_tooLarge = 44,
+ ZSTD_error_maxSymbolValue_tooLarge = 46,
+ ZSTD_error_maxSymbolValue_tooSmall = 48,
+ ZSTD_error_stage_wrong = 60,
+ ZSTD_error_init_missing = 62,
+ ZSTD_error_memory_allocation = 64,
+ ZSTD_error_workSpace_tooSmall= 66,
+ ZSTD_error_dstSize_tooSmall = 70,
+ ZSTD_error_srcSize_wrong = 72,
+ ZSTD_error_dstBuffer_null = 74,
+ /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+ ZSTD_error_frameIndex_tooLarge = 100,
+ ZSTD_error_seekableIO = 102,
+ ZSTD_error_dstBuffer_wrong = 104,
+ ZSTD_error_srcBuffer_wrong = 105,
+ ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+ convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+ which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+
+#endif /* ZSTD_ERRORS_H_398273423 */
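Since only the error predicate is guaranteed stable across versions, callers that need to distinguish causes should compare against the enum above rather than raw values; a possible caller-side mapping (illustrative; zstd_is_error() comes from linux/zstd.h, and the errno choices are an assumption, not part of the library):

    /*
     * Illustrative sketch only: translate a zstd size_t result into a
     * kernel errno.
     */
    static int zstd_result_to_errno(size_t ret)
    {
            if (!zstd_is_error(ret))
                    return 0;

            switch (ZSTD_getErrorCode(ret)) {
            case ZSTD_error_dstSize_tooSmall:
                    return -ENOSPC;
            case ZSTD_error_memory_allocation:
                    return -ENOMEM;
            default:
                    return -EINVAL;
            }
    }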
#include <linux/tracepoint.h>
+#define TRACE_BPOS_entries(name) \
+ __field(u64, name##_inode ) \
+ __field(u64, name##_offset ) \
+ __field(u32, name##_snapshot )
+
+#define TRACE_BPOS_assign(dst, src) \
+ __entry->dst##_inode = (src).inode; \
+ __entry->dst##_offset = (src).offset; \
+ __entry->dst##_snapshot = (src).snapshot
+
DECLARE_EVENT_CLASS(bpos,
- TP_PROTO(struct bpos *p),
+ TP_PROTO(const struct bpos *p),
TP_ARGS(p),
TP_STRUCT__entry(
- __field(u64, inode )
- __field(u64, offset )
+ TRACE_BPOS_entries(p)
),
TP_fast_assign(
- __entry->inode = p->inode;
- __entry->offset = p->offset;
+ TRACE_BPOS_assign(p, *p);
),
- TP_printk("%llu:%llu", __entry->inode, __entry->offset)
+ TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
);
DECLARE_EVENT_CLASS(bkey,
__entry->offset, __entry->size)
);
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->level,
+ bch2_btree_ids[__entry->btree_id],
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
DECLARE_EVENT_CLASS(bch_fs,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c),
(unsigned long long)__entry->sector, __entry->nr_sector)
);
+/* super-io.c: */
+TRACE_EVENT(write_super,
+ TP_PROTO(struct bch_fs *c, unsigned long ip),
+ TP_ARGS(c, ip),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(unsigned long, ip )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->ip = ip;
+ ),
+
+ TP_printk("%d,%d for %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (void *) __entry->ip)
+);
+
/* io.c: */
-DEFINE_EVENT(bio, read_split,
+DEFINE_EVENT(bio, read_promote,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
TP_ARGS(bio)
);
+DEFINE_EVENT(bio, read_split,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
DEFINE_EVENT(bio, read_retry,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-DEFINE_EVENT(bio, promote,
+DEFINE_EVENT(bio, read_reuse_race,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
);
TRACE_EVENT(journal_reclaim_start,
- TP_PROTO(struct bch_fs *c, u64 min_nr,
+ TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
+ u64 min_nr, u64 min_key_cache,
u64 prereserved, u64 prereserved_total,
u64 btree_cache_dirty, u64 btree_cache_total,
u64 btree_key_cache_dirty, u64 btree_key_cache_total),
- TP_ARGS(c, min_nr, prereserved, prereserved_total,
+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
btree_cache_dirty, btree_cache_total,
btree_key_cache_dirty, btree_key_cache_total),
TP_STRUCT__entry(
__field(dev_t, dev )
+ __field(bool, direct )
+ __field(bool, kicked )
__field(u64, min_nr )
+ __field(u64, min_key_cache )
__field(u64, prereserved )
__field(u64, prereserved_total )
__field(u64, btree_cache_dirty )
TP_fast_assign(
__entry->dev = c->dev;
+ __entry->direct = direct;
+ __entry->kicked = kicked;
__entry->min_nr = min_nr;
+ __entry->min_key_cache = min_key_cache;
__entry->prereserved = prereserved;
__entry->prereserved_total = prereserved_total;
__entry->btree_cache_dirty = btree_cache_dirty;
__entry->btree_key_cache_total = btree_key_cache_total;
),
- TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->direct,
+ __entry->kicked,
__entry->min_nr,
+ __entry->min_key_cache,
__entry->prereserved,
__entry->prereserved_total,
__entry->btree_cache_dirty,
__entry->nr_flushed = nr_flushed;
),
- TP_printk("%d%d flushed %llu",
+ TP_printk("%d,%d flushed %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->nr_flushed)
);
/* bset.c: */
DEFINE_EVENT(bpos, bkey_pack_pos_fail,
- TP_PROTO(struct bpos *p),
+ TP_PROTO(const struct bpos *p),
TP_ARGS(p)
);
-/* Btree */
+/* Btree cache: */
-DECLARE_EVENT_CLASS(btree_node,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b),
+TRACE_EVENT(btree_cache_scan,
+ TP_PROTO(long nr_to_scan, long can_free, long ret),
+ TP_ARGS(nr_to_scan, can_free, ret),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u8, level )
- __field(u8, id )
- __field(u64, inode )
- __field(u64, offset )
+ __field(long, nr_to_scan )
+ __field(long, can_free )
+ __field(long, ret )
),
TP_fast_assign(
- __entry->dev = c->dev;
- __entry->level = b->c.level;
- __entry->id = b->c.btree_id;
- __entry->inode = b->key.k.p.inode;
- __entry->offset = b->key.k.p.offset;
+ __entry->nr_to_scan = nr_to_scan;
+ __entry->can_free = can_free;
+ __entry->ret = ret;
),
- TP_printk("%d,%d %u id %u %llu:%llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->level, __entry->id,
- __entry->inode, __entry->offset)
+ TP_printk("scanned for %li nodes, can free %li, ret %li",
+ __entry->nr_to_scan, __entry->can_free, __entry->ret)
);
-DEFINE_EVENT(btree_node, btree_read,
+DEFINE_EVENT(btree_node, btree_cache_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-TRACE_EVENT(btree_write,
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+/* Btree */
+
+DEFINE_EVENT(btree_node, btree_node_read,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_node_write,
TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
TP_ARGS(b, bytes, sectors),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_node_reap,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
TRACE_EVENT(btree_reserve_get_fail,
- TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl),
- TP_ARGS(c, required, cl),
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ size_t required),
+ TP_ARGS(trans_fn, caller_ip, required),
TP_STRUCT__entry(
- __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
__field(size_t, required )
- __field(struct closure *, cl )
),
TP_fast_assign(
- __entry->dev = c->dev;
- __entry->required = required;
- __entry->cl = cl;
+ strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->required = required;
),
- TP_printk("%d,%d required %zu by %p",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->required, __entry->cl)
+ TP_printk("%s %pS required %zu",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->required)
);
-DEFINE_EVENT(btree_node, btree_split,
+DEFINE_EVENT(btree_node, btree_node_compact,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_compact,
+DEFINE_EVENT(btree_node, btree_node_merge,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_merge,
+DEFINE_EVENT(btree_node, btree_node_split,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_set_root,
+DEFINE_EVENT(btree_node, btree_node_rewrite,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-TRACE_EVENT(btree_cache_scan,
- TP_PROTO(unsigned long nr_to_scan_pages,
- unsigned long nr_to_scan_nodes,
- unsigned long can_free_nodes,
- long ret),
- TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret),
+DEFINE_EVENT(btree_node, btree_node_set_root,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_path_relock_fail,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
TP_STRUCT__entry(
- __field(unsigned long, nr_to_scan_pages )
- __field(unsigned long, nr_to_scan_nodes )
- __field(unsigned long, can_free_nodes )
- __field(long, ret )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __array(char, node, 24 )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
),
TP_fast_assign(
- __entry->nr_to_scan_pages = nr_to_scan_pages;
- __entry->nr_to_scan_nodes = nr_to_scan_nodes;
- __entry->can_free_nodes = can_free_nodes;
- __entry->ret = ret;
+ struct btree *b = btree_path_node(path, level);
+
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->level = path->level;
+ TRACE_BPOS_assign(pos, path->pos);
+ if (IS_ERR(b))
+ strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
+ else
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
),
- TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li",
- __entry->nr_to_scan_pages,
- __entry->nr_to_scan_nodes,
- __entry->can_free_nodes,
- __entry->ret)
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_ids[__entry->btree_id],
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->node,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
);
-TRACE_EVENT(btree_node_relock_fail,
- TP_PROTO(const char *trans_fn,
+TRACE_EVENT(btree_path_upgrade_fail,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos,
- unsigned long node,
- u32 iter_lock_seq,
- u32 node_lock_seq),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq),
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
- __field(u64, pos_inode )
- __field(u64, pos_offset )
- __field(u32, pos_snapshot )
- __field(unsigned long, node )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __field(u8, locked )
+ __field(u8, self_read_count )
+ __field(u8, self_intent_count)
+ __field(u8, read_count )
+ __field(u8, intent_count )
__field(u32, iter_lock_seq )
__field(u32, node_lock_seq )
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ struct six_lock_count c;
+
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
- __entry->btree_id = btree_id;
- __entry->pos_inode = pos->inode;
- __entry->pos_offset = pos->offset;
- __entry->pos_snapshot = pos->snapshot;
- __entry->node = node;
- __entry->iter_lock_seq = iter_lock_seq;
- __entry->node_lock_seq = node_lock_seq;
+ __entry->btree_id = path->btree_id;
+ __entry->level = level;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->locked = btree_node_locked(path, level);
+
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
+ c = six_lock_counts(&path->l[level].b->c.lock);
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
),
- TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- __entry->btree_id,
+ bch2_btree_ids[__entry->btree_id],
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
- __entry->node,
+ __entry->level,
+ __entry->locked,
+ __entry->self_read_count,
+ __entry->self_intent_count,
+ __entry->read_count,
+ __entry->intent_count,
__entry->iter_lock_seq,
__entry->node_lock_seq)
);
/* Garbage collection */
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(bch_fs, gc_start,
+DEFINE_EVENT(bch_fs, gc_gens_start,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
-DEFINE_EVENT(bch_fs, gc_end,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
+DEFINE_EVENT(bch_fs, gc_gens_end,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
/* Allocator */
-TRACE_EVENT(alloc_scan,
- TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped),
- TP_ARGS(ca, found, inc_gen, inc_gen_skipped),
+TRACE_EVENT(bucket_alloc,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ bool user, u64 bucket),
+ TP_ARGS(ca, alloc_reserve, user, bucket),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, found )
- __field(u64, inc_gen )
- __field(u64, inc_gen_skipped )
+ __field(dev_t, dev )
+ __array(char, reserve, 16 )
+ __field(bool, user )
+ __field(u64, bucket )
),
TP_fast_assign(
__entry->dev = ca->dev;
- __entry->found = found;
- __entry->inc_gen = inc_gen;
- __entry->inc_gen_skipped = inc_gen_skipped;
+ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+ __entry->user = user;
+ __entry->bucket = bucket;
),
- TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu",
+ TP_printk("%d,%d reserve %s user %u bucket %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->found, __entry->inc_gen, __entry->inc_gen_skipped)
-);
+ __entry->reserve,
+ __entry->user,
+ __entry->bucket)
+);
+
+TRACE_EVENT(bucket_alloc_fail,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ u64 seen,
+ u64 open,
+ u64 need_journal_commit,
+ u64 nouse,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for,
+ seen, open, need_journal_commit, nouse, nonblocking, err),
-TRACE_EVENT(invalidate,
- TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
- TP_ARGS(ca, offset, sectors),
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, reserve, 16 )
+ __field(u64, free )
+ __field(u64, avail )
+ __field(u64, copygc_wait_amount )
+ __field(s64, copygc_waiting_for )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, nouse )
+ __field(bool, nonblocking )
+ __array(char, err, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = ca->dev;
+ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+ __entry->free = free;
+ __entry->avail = avail;
+ __entry->copygc_wait_amount = copygc_wait_amount;
+ __entry->copygc_waiting_for = copygc_waiting_for;
+ __entry->seen = seen;
+ __entry->open = open;
+ __entry->need_journal_commit = need_journal_commit;
+ __entry->nouse = nouse;
+ __entry->nonblocking = nonblocking;
+ strlcpy(__entry->err, err, sizeof(__entry->err));
+ ),
+
+ TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->reserve,
+ __entry->free,
+ __entry->avail,
+ __entry->copygc_wait_amount,
+ __entry->copygc_waiting_for,
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->nouse,
+ __entry->nonblocking,
+ __entry->err)
+);
+
+TRACE_EVENT(discard_buckets,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
TP_STRUCT__entry(
- __field(unsigned, sectors )
__field(dev_t, dev )
- __field(__u64, offset )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, discarded )
+ __array(char, err, 16 )
),
TP_fast_assign(
- __entry->dev = ca->dev;
- __entry->offset = offset,
- __entry->sectors = sectors;
+ __entry->dev = c->dev;
+ __entry->seen = seen;
+ __entry->open = open;
+ __entry->need_journal_commit = need_journal_commit;
+ __entry->discarded = discarded;
+ strlcpy(__entry->err, err, sizeof(__entry->err));
),
- TP_printk("invalidated %u sectors at %d,%d sector=%llu",
- __entry->sectors,
- MAJOR(__entry->dev),
- MINOR(__entry->dev),
- __entry->offset)
+ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->discarded,
+ __entry->err)
);
-DECLARE_EVENT_CLASS(bucket_alloc,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve),
+TRACE_EVENT(bucket_invalidate,
+ TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
+ TP_ARGS(c, dev, bucket, sectors),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(enum alloc_reserve, reserve )
+ __field(dev_t, dev )
+ __field(u32, dev_idx )
+ __field(u32, sectors )
+ __field(u64, bucket )
),
TP_fast_assign(
- __entry->dev = ca->dev;
- __entry->reserve = reserve;
+ __entry->dev = c->dev;
+ __entry->dev_idx = dev;
+ __entry->sectors = sectors;
+ __entry->bucket = bucket;
),
- TP_printk("%d,%d reserve %d",
+ TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->reserve)
+ __entry->dev_idx, __entry->bucket,
+ __entry->sectors)
);
-DEFINE_EVENT(bucket_alloc, bucket_alloc,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve)
-);
+/* Moving IO */
-DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve)
+DEFINE_EVENT(bkey, move_extent_read,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
);
-DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve)
+DEFINE_EVENT(bkey, move_extent_write,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
);
-/* Moving IO */
-
-DEFINE_EVENT(bkey, move_extent,
+DEFINE_EVENT(bkey, move_extent_finish,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
-DEFINE_EVENT(bkey, move_alloc_fail,
+DEFINE_EVENT(bkey, move_extent_race,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
-DEFINE_EVENT(bkey, move_race,
+DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
__entry->wait_amount, __entry->until)
);
-DECLARE_EVENT_CLASS(transaction_restart,
- TP_PROTO(const char *trans_fn,
+/* btree transactions: */
+
+DECLARE_EVENT_CLASS(transaction_event,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip),
+ TP_ARGS(trans, caller_ip),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
),
TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
);
-DEFINE_EVENT(transaction_restart, transaction_restart_ip,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, transaction_commit,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_restart_injected,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_restart_journal_res_get,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim,
- TP_PROTO(const char *trans_fn,
+
+TRACE_EVENT(trans_restart_journal_preres_get,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ unsigned flags),
+ TP_ARGS(trans, caller_ip, flags),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned, flags )
+ ),
+
+ TP_fast_assign(
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("%s %pS %x", __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->flags)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_fault_inject,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_traverse_all,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_traverse_all,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_restart_mark_replicas,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_too_many_iters,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
DECLARE_EVENT_CLASS(transaction_restart_iter,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos),
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
- __field(u64, pos_inode )
- __field(u64, pos_offset )
- __field(u32, pos_snapshot )
+ TRACE_BPOS_entries(pos)
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
- __entry->btree_id = btree_id;
- __entry->pos_inode = pos->inode;
- __entry->pos_offset = pos->offset;
- __entry->pos_snapshot = pos->snapshot;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos)
),
- TP_printk("%s %pS btree %u pos %llu:%llu:%u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- __entry->btree_id,
+ bch2_btree_ids[__entry->btree_id],
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_mark,
- TP_PROTO(const char *trans_fn,
+TRACE_EVENT(trans_restart_upgrade,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
-);
+ struct btree_path *path,
+ unsigned old_locks_want,
+ unsigned new_locks_want),
+ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
-DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade,
- TP_PROTO(const char *trans_fn,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
-);
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, old_locks_want )
+ __field(u8, new_locks_want )
+ TRACE_BPOS_entries(pos)
+ ),
-DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade,
- TP_PROTO(const char *trans_fn,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ TP_fast_assign(
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = new_locks_want;
+ TRACE_BPOS_assign(pos, path->pos)
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_ids[__entry->btree_id],
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_locks_want,
+ __entry->new_locks_want)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
-TRACE_EVENT(trans_restart_would_deadlock,
- TP_PROTO(const char *trans_fn,
- unsigned long caller_ip,
- bool in_traverse_all,
- unsigned reason,
- enum btree_id have_btree_id,
- unsigned have_iter_type,
- struct bpos *have_pos,
- enum btree_id want_btree_id,
- unsigned want_iter_type,
- struct bpos *want_pos),
- TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason,
- have_btree_id, have_iter_type, have_pos,
- want_btree_id, want_iter_type, want_pos),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
- __field(unsigned long, caller_ip )
- __field(u8, in_traverse_all )
- __field(u8, reason )
- __field(u8, have_btree_id )
- __field(u8, have_iter_type )
- __field(u8, want_btree_id )
- __field(u8, want_iter_type )
-
- __field(u64, have_pos_inode )
- __field(u64, have_pos_offset )
- __field(u32, have_pos_snapshot)
- __field(u32, want_pos_snapshot)
- __field(u64, want_pos_inode )
- __field(u64, want_pos_offset )
- ),
-
- TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->in_traverse_all = in_traverse_all;
- __entry->reason = reason;
- __entry->have_btree_id = have_btree_id;
- __entry->have_iter_type = have_iter_type;
- __entry->want_btree_id = want_btree_id;
- __entry->want_iter_type = want_iter_type;
-
- __entry->have_pos_inode = have_pos->inode;
- __entry->have_pos_offset = have_pos->offset;
- __entry->have_pos_snapshot = have_pos->snapshot;
+DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
- __entry->want_pos_inode = want_pos->inode;
- __entry->want_pos_offset = want_pos->offset;
- __entry->want_pos_snapshot = want_pos->snapshot;
- ),
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
- TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->in_traverse_all,
- __entry->reason,
- __entry->have_btree_id,
- __entry->have_iter_type,
- __entry->have_pos_inode,
- __entry->have_pos_offset,
- __entry->have_pos_snapshot,
- __entry->want_btree_id,
- __entry->want_iter_type,
- __entry->want_pos_inode,
- __entry->want_pos_offset,
- __entry->want_pos_snapshot)
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
);
TRACE_EVENT(trans_restart_would_deadlock_write,
- TP_PROTO(const char *trans_fn),
- TP_ARGS(trans_fn),
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
),
TP_printk("%s", __entry->trans_fn)
);
TRACE_EVENT(trans_restart_mem_realloced,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
unsigned long bytes),
- TP_ARGS(trans_fn, caller_ip, bytes),
+ TP_ARGS(trans, caller_ip, bytes),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(unsigned long, bytes )
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->bytes = bytes;
),
__entry->bytes)
);
+TRACE_EVENT(trans_restart_key_cache_key_realloced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_u64s,
+ unsigned new_u64s),
+ TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(enum btree_id, btree_id )
+ TRACE_BPOS_entries(pos)
+ __field(u32, old_u64s )
+ __field(u32, new_u64s )
+ ),
+
+ TP_fast_assign(
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->old_u64s = old_u64s;
+ __entry->new_u64s = new_u64s;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_ids[__entry->btree_id],
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_u64s,
+ __entry->new_u64s)
+);
+
#endif /* _TRACE_BCACHE_H */
/* This part must be outside protection */
return ret;
}
-static unsigned get_dev_has_data(struct bch_sb *sb, unsigned dev)
-{
- struct bch_sb_field_replicas *replicas;
- struct bch_replicas_entry *r;
- unsigned i, data_has = 0;
-
- replicas = bch2_sb_get_replicas(sb);
-
- if (replicas)
- for_each_replicas_entry(replicas, r)
- for (i = 0; i < r->nr_devs; i++)
- if (r->devs[i] == dev)
- data_has |= 1 << r->data_type;
-
- return data_has;
-}
-
-static int bch2_sb_get_target(struct bch_sb *sb, char *buf, size_t len, u64 v)
-{
- struct target t = target_decode(v);
- int ret;
-
- switch (t.type) {
- case TARGET_NULL:
- return scnprintf(buf, len, "none");
- case TARGET_DEV: {
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- struct bch_member *m = mi->members + t.dev;
-
- if (bch2_dev_exists(sb, mi, t.dev)) {
- char uuid_str[40];
-
- uuid_unparse(m->uuid.b, uuid_str);
-
- ret = scnprintf(buf, len, "Device %u (%s)", t.dev,
- uuid_str);
- } else {
- ret = scnprintf(buf, len, "Bad device %u", t.dev);
- }
-
- break;
- }
- case TARGET_GROUP: {
- struct bch_sb_field_disk_groups *gi;
- gi = bch2_sb_get_disk_groups(sb);
-
- struct bch_disk_group *g = gi->entries + t.group;
-
- if (t.group < disk_groups_nr(gi) && !BCH_GROUP_DELETED(g)) {
- ret = scnprintf(buf, len, "Label %u (%.*s)", t.group,
- BCH_SB_LABEL_SIZE, g->label);
- } else {
- ret = scnprintf(buf, len, "Bad label %u", t.group);
- }
- break;
- }
- default:
- BUG();
- }
-
- return ret;
-}
-
-/* superblock printing: */
-
-static void bch2_sb_print_layout(struct bch_sb *sb, enum units units)
-{
- struct bch_sb_layout *l = &sb->layout;
- unsigned i;
-
- printf(" type: %u\n"
- " superblock max size: %s\n"
- " nr superblocks: %u\n"
- " Offsets: ",
- l->layout_type,
- pr_units(1 << l->sb_max_size_bits, units),
- l->nr_superblocks);
-
- for (i = 0; i < l->nr_superblocks; i++) {
- if (i)
- printf(", ");
- printf("%llu", le64_to_cpu(l->sb_offset[i]));
- }
- putchar('\n');
-}
-
-static void bch2_sb_print_journal(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- unsigned i, nr = bch2_nr_journal_buckets(journal);
-
- printf(" Buckets: ");
- for (i = 0; i < nr; i++) {
- if (i)
- putchar(' ');
- printf("%llu", le64_to_cpu(journal->buckets[i]));
- }
- putchar('\n');
-}
-
-static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_members *mi = field_to_type(f, members);
- struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
- unsigned i;
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member *m = mi->members + i;
- time_t last_mount = le64_to_cpu(m->last_mount);
- char member_uuid_str[40];
- char data_allowed_str[100];
- char data_has_str[100];
- char label [BCH_SB_LABEL_SIZE+10];
- char time_str[64];
-
- if (!bch2_member_exists(m))
- continue;
-
- uuid_unparse(m->uuid.b, member_uuid_str);
-
- if (BCH_MEMBER_GROUP(m)) {
- unsigned idx = BCH_MEMBER_GROUP(m) - 1;
-
- if (idx < disk_groups_nr(gi)) {
- scnprintf(label, sizeof(label), "%.*s (%u)",
- BCH_SB_LABEL_SIZE,
- gi->entries[idx].label, idx);
- } else {
- strcpy(label, "(bad disk labels section)");
- }
- } else {
- strcpy(label, "(none)");
- }
-
- bch2_flags_to_text(&PBUF(data_allowed_str),
- bch2_data_types,
- BCH_MEMBER_DATA_ALLOWED(m));
- if (!data_allowed_str[0])
- strcpy(data_allowed_str, "(none)");
-
- bch2_flags_to_text(&PBUF(data_has_str),
- bch2_data_types,
- get_dev_has_data(sb, i));
- if (!data_has_str[0])
- strcpy(data_has_str, "(none)");
-
- if (last_mount) {
- struct tm *tm = localtime(&last_mount);
- size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
- if (!err)
- strcpy(time_str, "(formatting error)");
- } else {
- strcpy(time_str, "(never)");
- }
-
- printf(" Device %u:\n"
- " UUID: %s\n"
- " Size: %s\n"
- " Bucket size: %s\n"
- " First bucket: %u\n"
- " Buckets: %llu\n"
- " Last mount: %s\n"
- " State: %s\n"
- " Group: %s\n"
- " Data allowed: %s\n"
-
- " Has data: %s\n"
-
- " Discard: %llu\n",
- i, member_uuid_str,
- pr_units(le16_to_cpu(m->bucket_size) *
- le64_to_cpu(m->nbuckets), units),
- pr_units(le16_to_cpu(m->bucket_size), units),
- le16_to_cpu(m->first_bucket),
- le64_to_cpu(m->nbuckets),
- time_str,
-
- BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
- ? bch2_member_states[BCH_MEMBER_STATE(m)]
- : "unknown",
-
- label,
- data_allowed_str,
- data_has_str,
-
- BCH_MEMBER_DISCARD(m));
- }
-}
-
-static void bch2_sb_print_crypt(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
- printf(" KFD: %llu\n"
- " scrypt n: %llu\n"
- " scrypt r: %llu\n"
- " scrypt p: %llu\n",
- BCH_CRYPT_KDF_TYPE(crypt),
- BCH_KDF_SCRYPT_N(crypt),
- BCH_KDF_SCRYPT_R(crypt),
- BCH_KDF_SCRYPT_P(crypt));
-}
-
-static void bch2_sb_print_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_replicas_v0 *replicas = field_to_type(f, replicas_v0);
- struct bch_replicas_entry_v0 *e;
- unsigned i;
-
- for_each_replicas_entry(replicas, e) {
- printf_pad(32, " %s:", bch2_data_types[e->data_type]);
-
- putchar('[');
- for (i = 0; i < e->nr_devs; i++) {
- if (i)
- putchar(' ');
- printf("%u", e->devs[i]);
- }
- printf("]\n");
- }
-}
-
-static void bch2_sb_print_replicas(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_replicas *replicas = field_to_type(f, replicas);
- struct bch_replicas_entry *e;
- unsigned i;
-
- for_each_replicas_entry(replicas, e) {
- printf_pad(32, " %s: %u/%u",
- bch2_data_types[e->data_type],
- e->nr_required,
- e->nr_devs);
-
- putchar('[');
- for (i = 0; i < e->nr_devs; i++) {
- if (i)
- putchar(' ');
- printf("%u", e->devs[i]);
- }
- printf("]\n");
- }
-}
-
-static void bch2_sb_print_quota(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
-}
-
-static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
-}
-
-static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
-
-
- printf(" flags: %x", le32_to_cpu(clean->flags));
- printf(" journal seq: %llx", le64_to_cpu(clean->journal_seq));
-}
-
-static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist);
- unsigned i, nr = blacklist_nr_entries(bl);
-
- for (i = 0; i < nr; i++) {
- struct journal_seq_blacklist_entry *e =
- bl->start + i;
-
- printf(" %llu-%llu\n",
- le64_to_cpu(e->start),
- le64_to_cpu(e->end));
- }
-}
-
-typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
-
-struct bch_sb_field_toolops {
- sb_field_print_fn print;
-};
-
-static const struct bch_sb_field_toolops bch2_sb_field_ops[] = {
-#define x(f, nr) \
- [BCH_SB_FIELD_##f] = { \
- .print = bch2_sb_print_##f, \
- },
- BCH_SB_FIELDS()
-#undef x
-};
-
-static inline void bch2_sb_field_print(struct bch_sb *sb,
- struct bch_sb_field *f,
- enum units units)
-{
- unsigned type = le32_to_cpu(f->type);
-
- if (type < BCH_SB_FIELD_NR)
- bch2_sb_field_ops[type].print(sb, f, units);
- else
- printf("(unknown field %u)\n", type);
-}
-
-void bch2_sb_print(struct bch_sb *sb, bool print_layout,
- unsigned fields, enum units units)
-{
- struct bch_sb_field_members *mi;
- char user_uuid_str[40], internal_uuid_str[40];
- char features_str[500];
- char compat_features_str[500];
- char fields_have_str[200];
- char label[BCH_SB_LABEL_SIZE + 1];
- char time_str[64];
- char foreground_str[64];
- char background_str[64];
- char promote_str[64];
- char metadata_str[64];
- struct bch_sb_field *f;
- u64 fields_have = 0;
- unsigned nr_devices = 0;
- time_t time_base = le64_to_cpu(sb->time_base_lo) / NSEC_PER_SEC;
-
- memcpy(label, sb->label, BCH_SB_LABEL_SIZE);
- label[BCH_SB_LABEL_SIZE] = '\0';
-
- uuid_unparse(sb->user_uuid.b, user_uuid_str);
- uuid_unparse(sb->uuid.b, internal_uuid_str);
-
- if (time_base) {
- struct tm *tm = localtime(&time_base);
- size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
- if (!err)
- strcpy(time_str, "(formatting error)");
- } else {
- strcpy(time_str, "(not set)");
- }
-
- mi = bch2_sb_get_members(sb);
- if (mi) {
- struct bch_member *m;
-
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++)
- nr_devices += bch2_member_exists(m);
- }
-
- bch2_sb_get_target(sb, foreground_str, sizeof(foreground_str),
- BCH_SB_FOREGROUND_TARGET(sb));
-
- bch2_sb_get_target(sb, background_str, sizeof(background_str),
- BCH_SB_BACKGROUND_TARGET(sb));
-
- bch2_sb_get_target(sb, promote_str, sizeof(promote_str),
- BCH_SB_PROMOTE_TARGET(sb));
-
- bch2_sb_get_target(sb, metadata_str, sizeof(metadata_str),
- BCH_SB_METADATA_TARGET(sb));
-
- bch2_flags_to_text(&PBUF(features_str),
- bch2_sb_features,
- le64_to_cpu(sb->features[0]));
-
- bch2_flags_to_text(&PBUF(compat_features_str),
- bch2_sb_compat,
- le64_to_cpu(sb->compat[0]));
-
- vstruct_for_each(sb, f)
- fields_have |= 1 << le32_to_cpu(f->type);
- bch2_flags_to_text(&PBUF(fields_have_str),
- bch2_sb_fields, fields_have);
-
- printf("External UUID: %s\n"
- "Internal UUID: %s\n"
- "Device index: %u\n"
- "Label: %s\n"
- "Version: %u\n"
- "Oldest version on disk: %u\n"
- "Created: %s\n"
- "Squence number: %llu\n"
- "Block_size: %s\n"
- "Btree node size: %s\n"
- "Error action: %s\n"
- "Clean: %llu\n"
- "Features: %s\n"
- "Compat features: %s\n"
-
- "Metadata replicas: %llu\n"
- "Data replicas: %llu\n"
-
- "Metadata checksum type: %s (%llu)\n"
- "Data checksum type: %s (%llu)\n"
- "Compression type: %s (%llu)\n"
-
- "Foreground write target: %s\n"
- "Background write target: %s\n"
- "Promote target: %s\n"
- "Metadata target: %s\n"
-
- "String hash type: %s (%llu)\n"
- "32 bit inodes: %llu\n"
- "GC reserve percentage: %llu%%\n"
- "Root reserve percentage: %llu%%\n"
-
- "Devices: %u live, %u total\n"
- "Sections: %s\n"
- "Superblock size: %llu\n",
- user_uuid_str,
- internal_uuid_str,
- sb->dev_idx,
- label,
- le16_to_cpu(sb->version),
- le16_to_cpu(sb->version_min),
- time_str,
- le64_to_cpu(sb->seq),
- pr_units(le16_to_cpu(sb->block_size), units),
- pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units),
-
- BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR
- ? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)]
- : "unknown",
-
- BCH_SB_CLEAN(sb),
- features_str,
- compat_features_str,
-
- BCH_SB_META_REPLICAS_WANT(sb),
- BCH_SB_DATA_REPLICAS_WANT(sb),
-
- BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR
- ? bch2_csum_opts[BCH_SB_META_CSUM_TYPE(sb)]
- : "unknown",
- BCH_SB_META_CSUM_TYPE(sb),
-
- BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR
- ? bch2_csum_opts[BCH_SB_DATA_CSUM_TYPE(sb)]
- : "unknown",
- BCH_SB_DATA_CSUM_TYPE(sb),
-
- BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_OPT_NR
- ? bch2_compression_opts[BCH_SB_COMPRESSION_TYPE(sb)]
- : "unknown",
- BCH_SB_COMPRESSION_TYPE(sb),
-
- foreground_str,
- background_str,
- promote_str,
- metadata_str,
-
- BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR
- ? bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)]
- : "unknown",
- BCH_SB_STR_HASH_TYPE(sb),
-
- BCH_SB_INODE_32BIT(sb),
- BCH_SB_GC_RESERVE(sb),
- BCH_SB_ROOT_RESERVE(sb),
-
- nr_devices, sb->nr_devices,
- fields_have_str,
- vstruct_bytes(sb));
-
- if (print_layout) {
- printf("\n"
- "Layout:\n");
- bch2_sb_print_layout(sb, units);
- }
-
- vstruct_for_each(sb, f) {
- unsigned type = le32_to_cpu(f->type);
- char name[60];
-
- if (!(fields & (1 << type)))
- continue;
-
- if (type < BCH_SB_FIELD_NR) {
- scnprintf(name, sizeof(name), "%s", bch2_sb_fields[type]);
- name[0] = toupper(name[0]);
- } else {
- scnprintf(name, sizeof(name), "(unknown field %u)", type);
- }
-
- printf("\n%s (size %llu):\n", name, vstruct_bytes(f));
- if (type < BCH_SB_FIELD_NR)
- bch2_sb_field_print(sb, f, units);
- }
-}
-
/* ioctl interface: */
/* Global control device: */
struct bch_opts bch2_parse_opts(struct bch_opt_strs strs)
{
struct bch_opts opts = bch2_opts_empty();
+ struct printbuf err = PRINTBUF;
unsigned i;
int ret;
u64 v;
bch2_opt_table[i].type == BCH_OPT_FN)
continue;
- ret = bch2_opt_parse(NULL, "option",
+ ret = bch2_opt_parse(NULL,
&bch2_opt_table[i],
- strs.by_id[i], &v);
+ strs.by_id[i], &v, &err);
if (ret < 0)
- die("Invalid %s: %s",
- bch2_opt_table[i].attr.name,
- strerror(-ret));
+ die("Invalid option %s", err.buf);
bch2_opt_set_by_id(&opts, i, v);
}
+ printbuf_exit(&err);
return opts;
}
struct dirent *d;
dev_names devs;
- darray_init(devs);
+ darray_init(&devs);
while ((errno = 0), (d = readdir(dir))) {
struct dev_name n = { 0, NULL, NULL };
n.label = read_file_str(fs.sysfs_fd, label_attr);
free(label_attr);
- darray_append(devs, n);
+ darray_push(&devs, n);
}
closedir(dir);
void bch2_super_write(int, struct bch_sb *);
struct bch_sb *__bch2_super_read(int, u64);
-void bch2_sb_print(struct bch_sb *, bool, unsigned, enum units);
-
/* ioctl interface: */
int bcachectl_open(void);
char *label;
uuid_le uuid;
};
-typedef darray(struct dev_name) dev_names;
+typedef DARRAY(struct dev_name) dev_names;
dev_names bchu_fs_get_devices(struct bchfs_handle);
bkey_xattr_init(&xattr->k_i);
xattr->k.u64s = u64s;
xattr->v.x_type = acl_to_xattr_type(type);
- xattr->v.x_name_len = 0,
+ xattr->v.x_name_len = 0;
xattr->v.x_val_len = cpu_to_le16(acl_len);
acl_header = xattr_val(&xattr->v);
&X_SEARCH(acl_to_xattr_type(type), "", 0),
0);
if (ret) {
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (ret != -ENOENT)
acl = ERR_PTR(ret);
btree_err:
bch2_trans_iter_exit(&trans, &inode_iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (unlikely(ret))
goto err;
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
+#include "lru.h"
#include "recovery.h"
#include "varint.h"
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-const char * const bch2_allocator_states[] = {
-#define x(n) #n,
- ALLOC_THREAD_STATES()
-#undef x
- NULL
-};
+/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
#undef x
};
-/* Persistent alloc info: */
+struct bkey_alloc_unpacked {
+ u64 journal_seq;
+ u8 gen;
+ u8 oldest_gen;
+ u8 data_type;
+ bool need_discard:1;
+ bool need_inc_gen:1;
+#define x(_name, _bits) u##_bits _name;
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+};
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
const void **p, unsigned field)
out->gen = a.v->gen;
out->oldest_gen = a.v->oldest_gen;
out->data_type = a.v->data_type;
+ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
+ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
out->journal_seq = le64_to_cpu(a.v->journal_seq);
#define x(_name, _bits) \
return 0;
}
-static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
- const struct bkey_alloc_unpacked src)
+static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
- struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
- unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
- u8 *out = a->v.data;
- u8 *end = (void *) &dst[1];
- u8 *last_nonzero_field = out;
- unsigned bytes;
-
- a->k.p = POS(src.dev, src.bucket);
- a->v.gen = src.gen;
- a->v.oldest_gen = src.oldest_gen;
- a->v.data_type = src.data_type;
- a->v.journal_seq = cpu_to_le64(src.journal_seq);
-
-#define x(_name, _bits) \
- nr_fields++; \
- \
- if (src._name) { \
- out += bch2_varint_encode_fast(out, src._name); \
- \
- last_nonzero_field = out; \
- last_nonzero_fieldnr = nr_fields; \
- } else { \
- *out++ = 0; \
- }
-
- BCH_ALLOC_FIELDS_V2()
-#undef x
- BUG_ON(out > end);
-
- out = last_nonzero_field;
- a->v.nr_fields = last_nonzero_fieldnr;
-
- bytes = (u8 *) out - (u8 *) &a->v;
- set_bkey_val_bytes(&a->k, bytes);
- memset_u64s_tail(&a->v, 0, bytes);
-}
-
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
-{
- struct bkey_alloc_unpacked ret = {
- .dev = k.k->p.inode,
- .bucket = k.k->p.offset,
- .gen = 0,
- };
+ struct bkey_alloc_unpacked ret = { .gen = 0 };
switch (k.k->type) {
case KEY_TYPE_alloc:
return ret;
}
-struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans,
- const struct bkey_alloc_unpacked src)
+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
- struct bkey_alloc_buf *dst;
+ unsigned i, bytes = offsetof(struct bch_alloc, data);
- dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
- if (!IS_ERR(dst))
- bch2_alloc_pack_v3(dst, src);
+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
+ if (a->fields & (1 << i))
+ bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
- return dst;
+ return DIV_ROUND_UP(bytes, sizeof(u64));
}
-int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_alloc_unpacked *u, unsigned trigger_flags)
+int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u);
+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
+ /* allow for unknown fields */
+ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
+ prt_printf(err, "incorrect value size (%zu < %u)",
+ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+ return -EINVAL;
+ }
- return PTR_ERR_OR_ZERO(a) ?:
- bch2_trans_update(trans, iter, &a->k, trigger_flags);
+ return 0;
}
-static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
+int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- unsigned i, bytes = offsetof(struct bch_alloc, data);
+ struct bkey_alloc_unpacked u;
- for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
- if (a->fields & (1 << i))
- bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
+ if (bch2_alloc_unpack_v2(&u, k)) {
+ prt_printf(err, "unpack error");
+ return -EINVAL;
+ }
- return DIV_ROUND_UP(bytes, sizeof(u64));
+ return 0;
}
-const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+ struct bkey_alloc_unpacked u;
- if (k.k->p.inode >= c->sb.nr_devices ||
- !c->devs[k.k->p.inode])
- return "invalid device";
+ if (bch2_alloc_unpack_v3(&u, k)) {
+ prt_printf(err, "unpack error");
+ return -EINVAL;
+ }
- /* allow for unknown fields */
- if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v))
- return "incorrect value size";
+ return 0;
+}
+
+int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
+{
+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+
+ if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) {
+ prt_printf(err, "bad val size (%lu != %u)",
+ bkey_val_u64s(k.k), alloc_v4_u64s(a.v));
+ return -EINVAL;
+ }
+
+ if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
+ prt_printf(err, "invalid backpointers_start");
+ return -EINVAL;
+ }
+
+ if (rw == WRITE) {
+ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
+ prt_printf(err, "invalid data type (got %u should be %u)",
+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+ return -EINVAL;
+ }
- return NULL;
+ switch (a.v->data_type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ if (a.v->dirty_sectors ||
+ a.v->cached_sectors ||
+ a.v->stripe) {
+			prt_printf(err, "empty data type but bucket has data");
+ return -EINVAL;
+ }
+ break;
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ if (!a.v->dirty_sectors) {
+ prt_printf(err, "data_type %s but dirty_sectors==0",
+ bch2_data_types[a.v->data_type]);
+ return -EINVAL;
+ }
+ break;
+ case BCH_DATA_cached:
+ if (!a.v->cached_sectors ||
+ a.v->dirty_sectors ||
+ a.v->stripe) {
+ prt_printf(err, "data type inconsistency");
+ return -EINVAL;
+ }
+
+ if (!a.v->io_time[READ] &&
+ test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) {
+ prt_printf(err, "cached bucket with read_time == 0");
+ return -EINVAL;
+ }
+ break;
+ case BCH_DATA_stripe:
+ if (!a.v->stripe) {
+ prt_printf(err, "data_type %s but stripe==0",
+ bch2_data_types[a.v->data_type]);
+ return -EINVAL;
+ }
+ break;
+ }
+ }
+
+ return 0;
}
-const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
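+/*
+ * Byte-swap the low 40 bits of a value; used below for the backpointer
+ * bucket_offset field:
+ */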
+static inline u64 swab40(u64 x)
{
- struct bkey_alloc_unpacked u;
+ return (((x & 0x00000000ffULL) << 32)|
+ ((x & 0x000000ff00ULL) << 16)|
+ ((x & 0x0000ff0000ULL) >> 0)|
+ ((x & 0x00ff000000ULL) >> 16)|
+ ((x & 0xff00000000ULL) >> 32));
+}
+
+void bch2_alloc_v4_swab(struct bkey_s k)
+{
+ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+ struct bch_backpointer *bp, *bps;
+
+ a->journal_seq = swab64(a->journal_seq);
+ a->flags = swab32(a->flags);
+ a->dirty_sectors = swab32(a->dirty_sectors);
+ a->cached_sectors = swab32(a->cached_sectors);
+ a->io_time[0] = swab64(a->io_time[0]);
+ a->io_time[1] = swab64(a->io_time[1]);
+ a->stripe = swab32(a->stripe);
+ a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+
+ bps = alloc_v4_backpointers(a);
+ for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
+ bp->bucket_offset = swab40(bp->bucket_offset);
+ bp->bucket_len = swab32(bp->bucket_len);
+ bch2_bpos_swab(&bp->pos);
+ }
+}
- if (k.k->p.inode >= c->sb.nr_devices ||
- !c->devs[k.k->p.inode])
- return "invalid device";
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = &_a;
+ const struct bch_backpointer *bps;
+ unsigned i;
- if (bch2_alloc_unpack_v2(&u, k))
- return "unpack error";
+ if (k.k->type == KEY_TYPE_alloc_v4)
+ a = bkey_s_c_to_alloc_v4(k).v;
+ else
+ bch2_alloc_to_v4(k, &_a);
+
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "gen %u oldest_gen %u data_type %s",
+ a->gen, a->oldest_gen, bch2_data_types[a->data_type]);
+ prt_newline(out);
+ prt_printf(out, "journal_seq %llu", a->journal_seq);
+ prt_newline(out);
+ prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_newline(out);
+ prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_newline(out);
+ prt_printf(out, "dirty_sectors %u", a->dirty_sectors);
+ prt_newline(out);
+ prt_printf(out, "cached_sectors %u", a->cached_sectors);
+ prt_newline(out);
+ prt_printf(out, "stripe %u", a->stripe);
+ prt_newline(out);
+ prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
+ prt_newline(out);
+ prt_printf(out, "io_time[READ] %llu", a->io_time[READ]);
+ prt_newline(out);
+ prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
+ prt_newline(out);
+ prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a));
+ printbuf_indent_add(out, 2);
+
+ bps = alloc_v4_backpointers_c(a);
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a); i++) {
+ prt_newline(out);
+ bch2_backpointer_to_text(out, &bps[i]);
+ }
- return NULL;
+ printbuf_indent_sub(out, 4);
}
-const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k)
+void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
- struct bkey_alloc_unpacked u;
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ int d;
+
+ *out = *bkey_s_c_to_alloc_v4(k).v;
+
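+		/*
+		 * If the key was written with an older, smaller alloc_v4
+		 * layout, zero out the fields it didn't have:
+		 */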
+ d = (int) BCH_ALLOC_V4_U64s -
+ (int) (BCH_ALLOC_V4_BACKPOINTERS_START(out) ?: BCH_ALLOC_V4_U64s_V0);
+ if (unlikely(d > 0)) {
+ memset((u64 *) out + BCH_ALLOC_V4_BACKPOINTERS_START(out),
+ 0,
+ d * sizeof(u64));
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ }
+ } else {
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+ *out = (struct bch_alloc_v4) {
+ .journal_seq = u.journal_seq,
+ .flags = u.need_discard,
+ .gen = u.gen,
+ .oldest_gen = u.oldest_gen,
+ .data_type = u.data_type,
+ .stripe_redundancy = u.stripe_redundancy,
+ .dirty_sectors = u.dirty_sectors,
+ .cached_sectors = u.cached_sectors,
+ .io_time[READ] = u.read_time,
+ .io_time[WRITE] = u.write_time,
+ .stripe = u.stripe,
+ };
+
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ }
+}
+
+static noinline struct bkey_i_alloc_v4 *
+__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_i_alloc_v4 *ret;
+ unsigned bytes = k.k->type == KEY_TYPE_alloc_v4
+ ? bkey_bytes(k.k)
+ : sizeof(struct bkey_i_alloc_v4);
+
+ /*
+ * Reserve space for one more backpointer here:
+ * Not sketchy at doing it this way, nope...
+ */
+ ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer));
+ if (IS_ERR(ret))
+ return ret;
- if (k.k->p.inode >= c->sb.nr_devices ||
- !c->devs[k.k->p.inode])
- return "invalid device";
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ struct bch_backpointer *src, *dst;
- if (bch2_alloc_unpack_v3(&u, k))
- return "unpack error";
+ bkey_reassemble(&ret->k_i, k);
- return NULL;
+ src = alloc_v4_backpointers(&ret->v);
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
+ dst = alloc_v4_backpointers(&ret->v);
+
+ memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) *
+ sizeof(struct bch_backpointer));
+		memset(src, 0, (u8 *) dst - (u8 *) src);
+ set_alloc_v4_u64s(ret);
+ } else {
+ bkey_alloc_v4_init(&ret->k_i);
+ ret->k.p = k.k->p;
+ bch2_alloc_to_v4(k, &ret->v);
+ }
+ return ret;
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
{
- struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+ if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
+ BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) {
+ /*
+ * Reserve space for one more backpointer here:
+ * Not sketchy at doing it this way, nope...
+ */
+ struct bkey_i_alloc_v4 *ret =
+ bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer));
+ if (!IS_ERR(ret))
+ bkey_reassemble(&ret->k_i, k);
+ return ret;
+ }
- pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
- u.gen, u.oldest_gen, bch2_data_types[u.data_type],
- u.journal_seq);
-#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name);
- BCH_ALLOC_FIELDS_V2()
-#undef x
+ return __bch2_alloc_to_v4_mut(trans, k);
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ return bch2_alloc_to_v4_mut_inlined(trans, k);
+}
+
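+/*
+ * Look up the alloc key for @pos and return a mutable alloc_v4 copy, leaving
+ * *iter pointing at it; on success the caller is responsible for exiting the
+ * iterator:
+ */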
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
+{
+ struct bkey_s_c k;
+ struct bkey_i_alloc_v4 *a;
+ int ret;
+
+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
+ }
+
+ a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ if (IS_ERR(a))
+ bch2_trans_iter_exit(trans, iter);
+ return a;
}
-int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
+int bch2_alloc_read(struct bch_fs *c)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
+ struct bch_alloc_v4 a;
struct bch_dev *ca;
- struct bucket *g;
- struct bkey_alloc_unpacked u;
int ret;
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_bucket_exists(c, k.k->p))
+ continue;
+
ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = __bucket(ca, k.k->p.offset, gc);
- u = bch2_alloc_unpack(k);
-
- if (!gc)
- *bucket_gen(ca, k.k->p.offset) = u.gen;
-
- g->_mark.gen = u.gen;
- g->io_time[READ] = u.read_time;
- g->io_time[WRITE] = u.write_time;
- g->oldest_gen = !gc ? u.oldest_gen : u.gen;
- g->gen_valid = 1;
-
- if (!gc ||
- (metadata_only &&
- (u.data_type == BCH_DATA_user ||
- u.data_type == BCH_DATA_cached ||
- u.data_type == BCH_DATA_parity))) {
- g->_mark.data_type = u.data_type;
- g->_mark.dirty_sectors = u.dirty_sectors;
- g->_mark.cached_sectors = u.cached_sectors;
- g->_mark.stripe = u.stripe != 0;
- g->stripe = u.stripe;
- g->stripe_redundancy = u.stripe_redundancy;
- }
+ bch2_alloc_to_v4(k, &a);
+ *bucket_gen(ca, k.k->p.offset) = a.gen;
}
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
if (ret)
- bch_err(c, "error reading alloc info: %i", ret);
+ bch_err(c, "error reading alloc info: %s", bch2_err_str(ret));
return ret;
}
-/* Bucket IO clocks: */
+/* Free space/discard btree: */
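+/*
+ * Add or remove the need_discard/freespace btree entry that corresponds to an
+ * alloc key, according to the bucket's data_type:
+ */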
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
+static int bch2_bucket_do_index(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ const struct bch_alloc_v4 *a,
+ bool set)
{
struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_alloc_unpacked u;
- u64 *time, now;
- int ret = 0;
+ struct bkey_s_c old;
+ struct bkey_i *k;
+ enum btree_id btree;
+ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ struct printbuf buf = PRINTBUF;
+ int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto out;
+ if (a->data_type != BCH_DATA_free &&
+ a->data_type != BCH_DATA_need_discard)
+ return 0;
- u = bch2_alloc_unpack(k);
+ k = bch2_trans_kmalloc(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
- time = rw == READ ? &u.read_time : &u.write_time;
- now = atomic64_read(&c->io_clock[rw].now);
- if (*time == now)
- goto out;
+ bkey_init(&k->k);
+ k->k.type = new_type;
- *time = now;
+ switch (a->data_type) {
+ case BCH_DATA_free:
+ btree = BTREE_ID_freespace;
+ k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
+ bch2_key_resize(&k->k, 1);
+ break;
+ case BCH_DATA_need_discard:
+ btree = BTREE_ID_need_discard;
+ k->k.p = alloc_k.k->p;
+ break;
+ default:
+ return 0;
+ }
- ret = bch2_alloc_write(trans, &iter, &u, 0) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-out:
+ bch2_trans_iter_init(trans, &iter, btree,
+ bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+ old = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(old);
+ if (ret)
+ goto err;
+
+ if (ca->mi.freespace_initialized &&
+ bch2_trans_inconsistent_on(old.k->type != old_type, trans,
+ "incorrect key when %s %s btree (got %s should be %s)\n"
+ " for %s",
+ set ? "setting" : "clearing",
+ bch2_btree_ids[btree],
+ bch2_bkey_types[old.k->type],
+ bch2_bkey_types[old_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, k, 0);
+err:
bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
-
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
- struct bucket_mark m)
+int bch2_trans_mark_alloc(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
{
- u8 gc_gen;
-
- if (!is_available_bucket(m))
- return false;
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 old_a, *new_a;
+ u64 old_lru, new_lru;
+ int ret = 0;
- if (m.owned_by_allocator)
- return false;
+ /*
+ * Deletion only happens in the device removal path, with
+ * BTREE_TRIGGER_NORUN:
+ */
+ BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
- if (ca->buckets_nouse &&
- test_bit(b, ca->buckets_nouse))
- return false;
+ bch2_alloc_to_v4(old, &old_a);
+ new_a = &bkey_i_to_alloc_v4(new)->v;
- if (ca->new_fs_bucket_idx) {
- /*
- * Device or filesystem is still being initialized, and we
- * haven't fully marked superblocks & journal:
- */
- if (is_superblock_bucket(ca, b))
- return false;
+ new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
- if (b < ca->new_fs_bucket_idx)
- return false;
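+	/*
+	 * The bucket gained data: refresh its IO clocks and set the
+	 * need_inc_gen/need_discard flags:
+	 */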
+ if (new_a->dirty_sectors > old_a.dirty_sectors ||
+ new_a->cached_sectors > old_a.cached_sectors) {
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+		new_a->io_time[WRITE] = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
- gc_gen = bucket_gc_gen(bucket(ca, b));
-
- ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
- ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
+ if (data_type_is_empty(new_a->data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
+ new_a->gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ }
- return gc_gen < BUCKET_GC_GEN_MAX;
-}
+ if (old_a.data_type != new_a->data_type ||
+ (new_a->data_type == BCH_DATA_free &&
+ alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) {
+ ret = bch2_bucket_do_index(trans, old, &old_a, false) ?:
+ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
+ if (ret)
+ return ret;
+ }
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
+ if (new_a->data_type == BCH_DATA_cached &&
+ !new_a->io_time[READ])
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
- u64 now, u64 last_seq_ondisk)
-{
- unsigned used = m.cached_sectors;
+ old_lru = alloc_lru_idx(old_a);
+ new_lru = alloc_lru_idx(*new_a);
- if (used) {
- /*
- * Prefer to keep buckets that have been read more recently, and
- * buckets that have more data in them:
- */
- u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
- u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset,
+ old_lru, &new_lru, old);
+ if (ret)
+ return ret;
- return -last_read_scaled;
- } else {
- /*
- * Prefer to use buckets with smaller gc_gen so that we don't
- * have to walk the btree and recalculate oldest_gen - but shift
- * off the low bits so that buckets will still have equal sort
- * keys when there's only a small difference, so that we can
- * keep sequential buckets together:
- */
- return bucket_gc_gen(g) >> 4;
+ if (new_a->data_type == BCH_DATA_cached)
+ new_a->io_time[READ] = new_lru;
}
-}
-static inline int bucket_alloc_cmp(alloc_heap *h,
- struct alloc_heap_entry l,
- struct alloc_heap_entry r)
-{
- return cmp_int(l.key, r.key) ?:
- cmp_int(r.nr, l.nr) ?:
- cmp_int(l.bucket, r.bucket);
+ return 0;
}
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
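+/*
+ * Check that an alloc key has matching entries in the need_discard and
+ * freespace btrees, repairing them if not:
+ */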
+static int bch2_check_alloc_key(struct btree_trans *trans,
+ struct btree_iter *alloc_iter,
+ struct btree_iter *discard_iter,
+ struct btree_iter *freespace_iter)
{
- const struct alloc_heap_entry *l = _l, *r = _r;
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca;
+ struct bch_alloc_v4 a;
+ unsigned discard_key_type, freespace_key_type;
+ struct bkey_s_c alloc_k, k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
- return cmp_int(l->bucket, r->bucket);
-}
+ alloc_k = bch2_dev_bucket_exists(c, alloc_iter->pos)
+ ? bch2_btree_iter_peek_slot(alloc_iter)
+ : bch2_btree_iter_peek(alloc_iter);
+ if (!alloc_k.k)
+ return 1;
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets;
- struct alloc_heap_entry e = { 0 };
- u64 now, last_seq_ondisk;
- size_t b, i, nr = 0;
+ ret = bkey_err(alloc_k);
+ if (ret)
+ return ret;
- down_read(&ca->bucket_lock);
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
+ "alloc key for invalid device:bucket %llu:%llu",
+ alloc_k.k->p.inode, alloc_k.k->p.offset))
+ return bch2_btree_delete_at(trans, alloc_iter, 0);
- buckets = bucket_array(ca);
- ca->alloc_heap.used = 0;
- now = atomic64_read(&c->io_clock[READ].now);
- last_seq_ondisk = c->journal.flushed_seq_ondisk;
+ ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+ if (!ca->mi.freespace_initialized)
+ return 0;
- /*
- * Find buckets with lowest read priority, by building a maxheap sorted
- * by read priority and repeatedly replacing the maximum element until
- * all buckets have been visited.
- */
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket *g = &buckets->b[b];
- struct bucket_mark m = READ_ONCE(g->mark);
- unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
+ bch2_alloc_to_v4(alloc_k, &a);
- cond_resched();
+ discard_key_type = a.data_type == BCH_DATA_need_discard
+ ? KEY_TYPE_set : 0;
+ freespace_key_type = a.data_type == BCH_DATA_free
+ ? KEY_TYPE_set : 0;
- if (!bch2_can_invalidate_bucket(ca, b, m))
- continue;
+ bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
+ bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, a));
- if (!m.data_type &&
- bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- last_seq_ondisk,
- ca->dev_idx, b)) {
- ca->buckets_waiting_on_journal++;
- continue;
- }
+ k = bch2_btree_iter_peek_slot(discard_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- if (e.nr && e.bucket + e.nr == b && e.key == key) {
- e.nr++;
- } else {
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .nr = 1,
- .key = key,
- };
- }
+ if (k.k->type != discard_key_type &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[discard_key_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = discard_key_type;
+ update->k.p = discard_iter->pos;
+
+ ret = bch2_trans_update(trans, discard_iter, update, 0);
+ if (ret)
+ goto err;
}
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
+ k = bch2_btree_iter_peek_slot(freespace_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != freespace_key_type &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[freespace_key_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
+ bkey_init(&update->k);
+ update->k.type = freespace_key_type;
+ update->k.p = freespace_iter->pos;
+ bch2_key_resize(&update->k, 1);
- while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
- nr -= ca->alloc_heap.data[0].nr;
- heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
+ ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ if (ret)
+ goto err;
}
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_check_discard_freespace_key(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter;
+ struct bkey_s_c alloc_k;
+ struct bch_alloc_v4 a;
+ u64 genbits;
+ struct bpos pos;
+ enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
+ ? BCH_DATA_need_discard
+ : BCH_DATA_free;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
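+	/*
+	 * The low 56 bits of the key's offset are the bucket; the high 8 bits
+	 * are freespace genbits (see alloc_freespace_pos()) and are zero for
+	 * need_discard keys:
+	 */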
+ pos = iter->pos;
+ pos.offset &= ~(~0ULL << 56);
+ genbits = iter->pos.offset & (~0ULL << 56);
+
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
- up_read(&ca->bucket_lock);
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+		"entry in %s btree for nonexistent dev:bucket %llu:%llu",
+ bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
+ goto delete;
+
+ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto err;
+
+ bch2_alloc_to_v4(alloc_k, &a);
+
+ if (fsck_err_on(a.data_type != state ||
+ (state == BCH_DATA_free &&
+ genbits != alloc_freespace_genbits(a)), c,
+ "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)",
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+ bch2_btree_ids[iter->btree_id],
+ a.data_type == state,
+ genbits >> 56, alloc_freespace_genbits(a) >> 56))
+ goto delete;
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+delete:
+ ret = bch2_btree_delete_extent_at(trans, iter,
+ iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0);
+ goto out;
}
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
+int bch2_check_alloc_info(struct bch_fs *c)
{
- size_t i, nr = 0;
+ struct btree_trans trans;
+ struct btree_iter iter, discard_iter, freespace_iter;
+ struct bkey_s_c k;
+ int ret = 0;
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
- ca->buckets_waiting_on_journal = 0;
+ bch2_trans_init(&trans, c, 0, 0);
- find_reclaimable_buckets_lru(c, ca);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ while (1) {
+ ret = commit_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_check_alloc_key(&trans, &iter,
+ &discard_iter,
+ &freespace_iter));
+ if (ret)
+ break;
- heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
+ bch2_btree_iter_advance(&iter);
+ }
+ bch2_trans_iter_exit(&trans, &freespace_iter);
+ bch2_trans_iter_exit(&trans, &discard_iter);
+ bch2_trans_iter_exit(&trans, &iter);
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
+ if (ret < 0)
+ goto err;
- return nr;
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_discard_freespace_key(&trans, &iter)) ?:
+ for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_discard_freespace_key(&trans, &iter));
+err:
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
}
-static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- struct bkey_alloc_unpacked *u)
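+/*
+ * Check that a cached bucket's alloc key has a matching lru btree entry,
+ * creating one (and updating the bucket's read time) if it's missing:
+ */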
+static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
+ struct btree_iter *alloc_iter)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct btree_iter lru_iter;
+ struct bch_alloc_v4 a;
+ struct bkey_s_c alloc_k, k;
+ struct printbuf buf = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, b),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
+ alloc_k = bch2_btree_iter_peek(alloc_iter);
+ if (!alloc_k.k)
+ return 0;
- k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ return ret;
+
+ bch2_alloc_to_v4(alloc_k, &a);
+
+ if (a.data_type != BCH_DATA_cached)
+ return 0;
+
+ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
+ POS(alloc_k.k->p.inode, a.io_time[READ]), 0);
+
+ k = bch2_btree_iter_peek_slot(&lru_iter);
ret = bkey_err(k);
if (ret)
goto err;
- *u = bch2_alloc_unpack(k);
- u->gen++;
- u->data_type = 0;
- u->dirty_sectors = 0;
- u->cached_sectors = 0;
- u->read_time = atomic64_read(&c->io_clock[READ].now);
- u->write_time = atomic64_read(&c->io_clock[WRITE].now);
+ if (fsck_err_on(!a.io_time[READ], c,
+ "cached bucket with read_time 0\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
+ fsck_err_on(k.k->type != KEY_TYPE_lru ||
+ le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c,
+ "incorrect/missing lru entry\n"
+ " %s\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+ u64 read_time = a.io_time[READ];
+
+ if (!a.io_time[READ])
+ a.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+
+ ret = bch2_lru_set(trans,
+ alloc_k.k->p.inode,
+ alloc_k.k->p.offset,
+ &a.io_time[READ]);
+ if (ret)
+ goto err;
+
+ if (a.io_time[READ] != read_time) {
+ struct bkey_i_alloc_v4 *a_mut =
+ bch2_alloc_to_v4_mut(trans, alloc_k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ goto err;
- ret = bch2_alloc_write(trans, &iter, u,
- BTREE_TRIGGER_BUCKET_INVALIDATE);
+ a_mut->v.io_time[READ] = a.io_time[READ];
+ ret = bch2_trans_update(trans, alloc_iter,
+ &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+ }
+ }
err:
- bch2_trans_iter_exit(trans, &iter);
+fsck_err:
+ bch2_trans_iter_exit(trans, &lru_iter);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq, unsigned flags)
+int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
- struct bkey_alloc_unpacked u;
- size_t b;
- u64 commit_seq = 0;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
int ret = 0;
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
- return 1;
+ bch2_trans_init(&trans, c, 0, 0);
- BUG_ON(!ca->alloc_heap.used ||
- !ca->alloc_heap.data[0].nr);
- b = ca->alloc_heap.data[0].bucket;
+	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_alloc_to_lru_ref(&trans, &iter));
- /* first, put on free_inc and mark as owned by allocator: */
- percpu_down_read(&c->mark_lock);
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
+}
+
+static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct btree_iter *need_discard_iter,
+ struct bpos *discard_pos_done,
+ u64 *seen,
+ u64 *open,
+ u64 *need_journal_commit,
+ u64 *discarded)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos pos = need_discard_iter->pos;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ struct bkey_i_alloc_v4 *a;
+ struct printbuf buf = PRINTBUF;
+ bool did_discard = false;
+ int ret = 0;
- bch2_mark_alloc_bucket(c, ca, b, true);
+ ca = bch_dev_bkey_exists(c, pos.inode);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
+ return 0;
+ }
- spin_lock(&c->freelist_lock);
- verify_not_on_freelist(c, ca, b);
- BUG_ON(!fifo_push(&ca->free_inc, b));
- spin_unlock(&c->freelist_lock);
+ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
+ (*open)++;
+ goto out;
+ }
- percpu_up_read(&c->mark_lock);
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ pos.inode, pos.offset)) {
+ (*need_journal_commit)++;
+ goto out;
+ }
- ret = bch2_trans_do(c, NULL, &commit_seq,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED|
- flags,
- bucket_invalidate_btree(&trans, ca, b, &u));
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ need_discard_iter->pos,
+ BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
- if (!ret) {
- /* remove from alloc_heap: */
- struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
- top->bucket++;
- top->nr--;
+ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
+ }
- if (!top->nr)
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+ if (bch2_trans_inconsistent_on(a->v.journal_seq > c->journal.flushed_seq_ondisk, trans,
+ "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
+ "%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ goto out;
+ }
- /*
- * If we invalidating cached data then we need to wait on the
- * journal commit:
- */
- if (u.data_type)
- *journal_seq = max(*journal_seq, commit_seq);
+ if (bch2_trans_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, trans,
+ "bucket incorrectly set in need_discard btree\n"
+ "%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ goto out;
+ }
+ if (bkey_cmp(*discard_pos_done, iter.pos) &&
+ ca->mi.discard && !c->opts.nochanges) {
/*
- * We already waiting on u.alloc_seq when we filtered out
- * buckets that need journal commit:
+ * This works without any other locks because this is the only
+ * thread that removes items from the need_discard tree
*/
- BUG_ON(*journal_seq > u.journal_seq);
- } else {
- size_t b2;
+ bch2_trans_unlock(trans);
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ k.k->p.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
- /* remove from free_inc: */
- percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto out;
+ }
- bch2_mark_alloc_bucket(c, ca, b, false);
+ *discard_pos_done = iter.pos;
+ did_discard = true;
- BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
- BUG_ON(b != b2);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+ a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+write:
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto out;
- spin_unlock(&c->freelist_lock);
- percpu_up_read(&c->mark_lock);
+ if (did_discard) {
+ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
+ (*discarded)++;
}
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
- return ret < 0 ? ret : 0;
+static void bch2_do_discards_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ struct bpos discard_pos_done = POS_MAX;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ /*
+ * We're doing the commit in bch2_discard_one_bucket instead of using
+ * for_each_btree_key_commit() so that we can increment counters after
+ * successful commit:
+ */
+ ret = for_each_btree_key2(&trans, iter,
+ BTREE_ID_need_discard, POS_MIN, 0, k,
+ bch2_discard_one_bucket(&trans, &iter, &discard_pos_done,
+ &seen,
+ &open,
+ &need_journal_commit,
+ &discarded));
+
+ bch2_trans_exit(&trans);
+
+ if (need_journal_commit * 2 > seen)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ percpu_ref_put(&c->writes);
+
+ trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+ bch2_err_str(ret));
+}
+
+void bch2_do_discards(struct bch_fs *c)
+{
+ if (percpu_ref_tryget_live(&c->writes) &&
+ !queue_work(system_long_wq, &c->discard_work))
+ percpu_ref_put(&c->writes);
}
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
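+/*
+ * Invalidate the cached bucket this lru entry points to: bump its gen and
+ * clear its counters so the bucket can be reallocated:
+ */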
+static int invalidate_one_bucket(struct btree_trans *trans,
+ struct btree_iter *lru_iter, struct bkey_s_c k,
+ unsigned dev_idx, s64 *nr_to_invalidate)
{
- u64 journal_seq = 0;
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bkey_i_alloc_v4 *a;
+ struct bpos bucket;
+ struct printbuf buf = PRINTBUF;
+ unsigned cached_sectors;
int ret = 0;
- /* Only use nowait if we've already invalidated at least one bucket: */
- while (!ret &&
- !fifo_full(&ca->free_inc) &&
- ca->alloc_heap.used) {
- if (kthread_should_stop()) {
- ret = 1;
- break;
+ if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx)
+ return 1;
+
+ if (k.k->type != KEY_TYPE_lru) {
+ prt_printf(&buf, "non lru key in lru btree:\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+ bch_err(c, "%s", buf.buf);
+ } else {
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ret = -EINVAL;
}
- ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
- (!fifo_empty(&ca->free_inc)
- ? BTREE_INSERT_NOWAIT : 0));
- /*
- * We only want to batch up invalidates when they're going to
- * require flushing the journal:
- */
- if (!journal_seq)
- break;
+ goto out;
}
- /* If we used NOWAIT, don't return the error: */
- if (!fifo_empty(&ca->free_inc))
- ret = 0;
- if (ret < 0)
- bch_err(ca, "error invalidating buckets: %i", ret);
+ bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx));
+
+ a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
+ ret = PTR_ERR_OR_ZERO(a);
if (ret)
- return ret;
+ goto out;
- if (journal_seq)
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- if (ret) {
- bch_err(ca, "journal error: %i", ret);
- return ret;
- }
+ if (k.k->p.offset != alloc_lru_idx(a->v)) {
+ prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
- return 0;
-}
+ if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+ bch_err(c, "%s", buf.buf);
+ } else {
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ret = -EINVAL;
+ }
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
- if (ca->allocator_state != new_state) {
- ca->allocator_state = new_state;
- closure_wake_up(&ca->fs->freelist_wait);
+ goto out;
}
+
+ if (!a->v.cached_sectors)
+ bch_err(c, "invalidating empty bucket, confused");
+
+ cached_sectors = a->v.cached_sectors;
+
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ a->v.gen++;
+ a->v.data_type = 0;
+ a->v.dirty_sectors = 0;
+ a->v.cached_sectors = 0;
+ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
+
+ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
+ BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto out;
+
+ trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
+ --*nr_to_invalidate;
+out:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
}
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+static void bch2_do_invalidates_work(struct work_struct *work)
{
+ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
unsigned i;
int ret = 0;
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- /*
- * Don't strand buckets on the copygc freelist until
- * after recovery is finished:
- */
- if (i == RESERVE_MOVINGGC &&
- !test_bit(BCH_FS_STARTED, &c->flags))
- continue;
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_member_device(ca, c, i) {
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru,
+ POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k,
+ invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate));
- if (fifo_push(&ca->free[i], b)) {
- fifo_pop(&ca->free_inc, b);
- ret = 1;
+ if (ret < 0) {
+ percpu_ref_put(&ca->ref);
break;
}
}
- spin_unlock(&c->freelist_lock);
- ca->allocator_state = ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked_full;
- closure_wake_up(&c->freelist_wait);
- return ret;
+ bch2_trans_exit(&trans);
+ percpu_ref_put(&c->writes);
}
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+void bch2_do_invalidates(struct bch_fs *c)
{
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
- ca->mi.bucket_size, GFP_NOFS, 0);
+ if (percpu_ref_tryget_live(&c->writes) &&
+ !queue_work(system_long_wq, &c->invalidate_work))
+ percpu_ref_put(&c->writes);
}
-static bool allocator_thread_running(struct bch_dev *ca)
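+/*
+ * Called during freespace initialization to create the freespace/need_discard
+ * entry for a single bucket:
+ */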
+static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k, struct bch_dev *ca)
{
- unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
- ? ALLOCATOR_running
- : ALLOCATOR_stopped;
- alloc_thread_set_state(ca, state);
- return state == ALLOCATOR_running;
-}
+ struct bch_alloc_v4 a;
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
- s64 available = dev_buckets_reclaimable(ca) -
- (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
- bool ret = available > 0;
+ if (iter->pos.offset >= ca->mi.nbuckets)
+ return 1;
- alloc_thread_set_state(ca, ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked);
- return ret;
+ bch2_alloc_to_v4(k, &a);
+ return bch2_bucket_do_index(trans, k, &a, true);
}
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
+static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- unsigned long gc_count = c->gc_count;
- size_t nr;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_member *m;
int ret;
- set_freezable();
+ bch2_trans_init(&trans, c, 0, 0);
- while (1) {
- ret = kthread_wait_freezable(allocator_thread_running(ca));
- if (ret)
- goto stop;
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bucket_freespace_init(&trans, &iter, k, ca));
- while (!ca->alloc_heap.used) {
- cond_resched();
+ bch2_trans_exit(&trans);
- ret = kthread_wait_freezable(buckets_available(ca, gc_count));
- if (ret)
- goto stop;
-
- gc_count = c->gc_count;
- nr = find_reclaimable_buckets(c, ca);
-
- if (!nr && ca->buckets_waiting_on_journal) {
- ret = bch2_journal_flush(&c->journal);
- if (ret)
- goto stop;
- } else if (nr < (ca->mi.nbuckets >> 6) &&
- ca->buckets_waiting_on_journal >= nr / 2) {
- bch2_journal_flush_async(&c->journal, NULL);
- }
+ if (ret < 0) {
+ bch_err(ca, "error initializing free space: %s", bch2_err_str(ret));
+ return ret;
+ }
- if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
- ca->inc_gen_really_needs_gc) &&
- c->gc_thread) {
- atomic_inc(&c->kick_gc);
- wake_up_process(c->gc_thread);
- }
+ mutex_lock(&c->sb_lock);
+ m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+ mutex_unlock(&c->sb_lock);
- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
- ca->inc_gen_really_needs_gc);
- }
+ return 0;
+}
- ret = bch2_invalidate_buckets(c, ca);
- if (ret)
- goto stop;
+int bch2_fs_freespace_init(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+ bool doing_init = false;
- while (!fifo_empty(&ca->free_inc)) {
- u64 b = fifo_peek(&ca->free_inc);
+ /*
+ * We can crash during the device add path, so we need to check this on
+ * every mount:
+ */
- discard_one_bucket(c, ca, b);
+ for_each_member_device(ca, c, i) {
+ if (ca->mi.freespace_initialized)
+ continue;
- ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
- if (ret)
- goto stop;
+ if (!doing_init) {
+ bch_info(c, "initializing freespace");
+ doing_init = true;
+ }
+
+ ret = bch2_dev_freespace_init(c, ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ return ret;
}
}
-stop:
- alloc_thread_set_state(ca, ALLOCATOR_stopped);
- return 0;
+
+ if (doing_init) {
+ mutex_lock(&c->sb_lock);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ bch_verbose(c, "done initializing freespace");
+ }
+
+ return ret;
+}
+
+/* Bucket IO clocks: */
+
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ u64 now;
+ int ret = 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (a->v.io_time[rw] == now)
+ goto out;
+
+ a->v.io_time[rw] = now;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
/* Startup/shutdown (ro/rw): */
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i, j;
+ unsigned i;
lockdep_assert_held(&c->state_lock);
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
- for (j = 0; j < RESERVE_NONE; j++)
- dev_reserve += ca->free[j].size;
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
dev_reserve += 1; /* btree write point */
dev_reserve += 1; /* copygc write point */
{
unsigned i;
- BUG_ON(ca->alloc_thread);
-
/* First, remove device from allocation groups: */
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
- if (ca->alloc_thread)
- closure_wait_event(&c->freelist_wait,
- ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- p = rcu_dereference_protected(ca->alloc_thread, 1);
- ca->alloc_thread = NULL;
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid bch2_wake_allocator() racing:
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
- */
- synchronize_rcu();
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- p = kthread_create(bch2_allocator_thread, ca,
- "bch-alloc/%s", ca->name);
- if (IS_ERR(p)) {
- bch_err(ca->fs, "error creating allocator thread: %li",
- PTR_ERR(p));
- return PTR_ERR(p);
- }
-
- get_task_struct(p);
- rcu_assign_pointer(ca->alloc_thread, p);
- wake_up_process(p);
- return 0;
-}
-
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
+ INIT_WORK(&c->discard_work, bch2_do_discards_work);
+ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
#include "debug.h"
#include "super.h"
-extern const char * const bch2_allocator_states[];
-
-struct bkey_alloc_unpacked {
- u64 journal_seq;
- u64 bucket;
- u8 dev;
- u8 gen;
- u8 oldest_gen;
- u8 data_type;
-#define x(_name, _bits) u##_bits _name;
- BCH_ALLOC_FIELDS_V2()
-#undef x
-};
-
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
-/* returns true if not equal */
-static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
- struct bkey_alloc_unpacked r)
+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
+{
+ struct bch_dev *ca;
+
+ if (!bch2_dev_exists2(c, pos.inode))
+ return false;
+
+ ca = bch_dev_bkey_exists(c, pos.inode);
+ return pos.offset >= ca->mi.first_bucket &&
+ pos.offset < ca->mi.nbuckets;
+}
+
+static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
+{
+ return a.gen - a.oldest_gen;
+}
+
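+/* Derive a bucket's effective data type from its sector counts and flags: */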
+static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors,
+ u32 cached_sectors,
+ u32 stripe,
+ struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ if (dirty_sectors)
+ return data_type;
+ if (stripe)
+ return BCH_DATA_stripe;
+ if (cached_sectors)
+ return BCH_DATA_cached;
+ if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+ return BCH_DATA_need_discard;
+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+ return BCH_DATA_need_gc_gens;
+ return BCH_DATA_free;
+}
+
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ return __alloc_data_type(a.dirty_sectors, a.cached_sectors,
+ a.stripe, a, data_type);
+}
+
+static inline u64 alloc_lru_idx(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
+}
+
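+/* Freespace keys carry (gc_gen >> 4) in the top 8 bits of their offset: */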
+static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
+{
+ return ((u64) alloc_gc_gen(a) >> 4) << 56;
+}
+
+static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
+{
+ pos.offset |= alloc_freespace_genbits(a);
+ return pos;
+}
+
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+ unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0) +
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
+ (sizeof(struct bch_backpointer) / sizeof(u64));
+
+ BUG_ON(ret > U8_MAX - BKEY_U64s);
+ return ret;
+}
+
+static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
{
- return l.gen != r.gen ||
- l.oldest_gen != r.oldest_gen ||
- l.data_type != r.data_type
-#define x(_name, ...) || l._name != r._name
- BCH_ALLOC_FIELDS_V2()
-#undef x
- ;
-}
-
-struct bkey_alloc_buf {
- struct bkey_i k;
- struct bch_alloc_v3 v;
-
-#define x(_name, _bits) + _bits / 8
- u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
-#undef x
-} __attribute__((packed, aligned(8)));
-
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *,
- const struct bkey_alloc_unpacked);
-int bch2_alloc_write(struct btree_trans *, struct btree_iter *,
- struct bkey_alloc_unpacked *, unsigned);
+ set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+
+void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
-const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
-const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
-const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
}
#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
}
#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v3_invalid, \
.val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+}
+
+#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v4_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .swab = bch2_alloc_v4_swab, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
}
static inline bool bkey_is_alloc(const struct bkey *k)
k->type == KEY_TYPE_alloc_v3;
}
-int bch2_alloc_read(struct bch_fs *, bool, bool);
+int bch2_alloc_read(struct bch_fs *);
+
+int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_check_alloc_info(struct bch_fs *);
+int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_do_discards(struct bch_fs *);
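+/*
+ * Target roughly 1/128th of the device's buckets free or awaiting discard;
+ * returns how many cached buckets to invalidate to get back to that:
+ */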
-static inline void bch2_wake_allocator(struct bch_dev *ca)
+static inline u64 should_invalidate_buckets(struct bch_dev *ca,
+ struct bch_dev_usage u)
{
- struct task_struct *p;
+ u64 want_free = ca->mi.nbuckets >> 7;
+ u64 free = max_t(s64, 0,
+ u.d[BCH_DATA_free].buckets
+ + u.d[BCH_DATA_need_discard].buckets
+ - bch2_dev_buckets_reserved(ca, RESERVE_none));
- rcu_read_lock();
- p = rcu_dereference(ca->alloc_thread);
- if (p)
- wake_up_process(p);
- rcu_read_unlock();
+ return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
}
-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
- size_t bucket)
+void bch2_do_invalidates(struct bch_fs *);
+
+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
{
- if (bch2_expensive_debug_checks) {
- size_t iter;
- long i;
- unsigned j;
+ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+}
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- BUG_ON(i == bucket);
- fifo_for_each_entry(i, &ca->free_inc, iter)
- BUG_ON(i == bucket);
- }
+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
}
+int bch2_fs_freespace_init(struct bch_fs *);
+
void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
+#include "error.h"
#include "io.h"
+#include "journal.h"
+#include "movinggc.h"
#include <linux/math64.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <trace/events/bcachefs.h>
+const char * const bch2_alloc_reserves[] = {
+#define x(t) #t,
+ BCH_ALLOC_RESERVES()
+#undef x
+ NULL
+};
+
/*
* Open buckets represent a bucket that's currently being allocated from. They
* serve two purposes:
percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
- bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
ob->valid = false;
ob->data_type = 0;
static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
{
switch (reserve) {
- case RESERVE_BTREE:
- case RESERVE_BTREE_MOVINGGC:
+ case RESERVE_btree:
+ case RESERVE_btree_movinggc:
return 0;
- case RESERVE_MOVINGGC:
+ case RESERVE_movinggc:
return OPEN_BUCKETS_COUNT / 4;
default:
return OPEN_BUCKETS_COUNT / 2;
}
}
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum alloc_reserve reserve,
- bool may_alloc_partial,
- struct closure *cl)
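+/*
+ * Try to allocate a specific bucket, skipping it if it's marked nouse, is
+ * already open, or is still waiting on a journal commit:
+ */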
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket,
+ enum alloc_reserve reserve,
+ struct bch_alloc_v4 *a,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
{
struct open_bucket *ob;
- long b = 0;
- spin_lock(&c->freelist_lock);
+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+ (*skipped_nouse)++;
+ return NULL;
+ }
- if (may_alloc_partial) {
- int i;
-
- for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
- ob = c->open_buckets + ca->open_buckets_partial[i];
-
- if (reserve <= ob->alloc_reserve) {
- array_remove_item(ca->open_buckets_partial,
- ca->open_buckets_partial_nr,
- i);
- ob->on_partial_list = false;
- ob->alloc_reserve = reserve;
- spin_unlock(&c->freelist_lock);
- return ob;
- }
- }
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ (*skipped_open)++;
+ return NULL;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+ (*skipped_need_journal_commit)++;
+ return NULL;
}
+ spin_lock(&c->freelist_lock);
+
if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
if (cl)
closure_wait(&c->open_buckets_wait, cl);
c->blocked_allocate_open_bucket = local_clock();
spin_unlock(&c->freelist_lock);
- trace_open_bucket_alloc_fail(ca, reserve);
- return ERR_PTR(-OPEN_BUCKETS_EMPTY);
+ return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
- if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
- goto out;
-
- switch (reserve) {
- case RESERVE_BTREE_MOVINGGC:
- case RESERVE_MOVINGGC:
- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
- goto out;
- break;
- default:
- break;
+ /* Recheck under lock: */
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ spin_unlock(&c->freelist_lock);
+ (*skipped_open)++;
+ return NULL;
}
- if (cl)
- closure_wait(&c->freelist_wait, cl);
-
- if (!c->blocked_allocate)
- c->blocked_allocate = local_clock();
-
- spin_unlock(&c->freelist_lock);
-
- trace_bucket_alloc_fail(ca, reserve);
- return ERR_PTR(-FREELIST_EMPTY);
-out:
- verify_not_on_freelist(c, ca, b);
-
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
ob->dev = ca->dev_idx;
- ob->gen = *bucket_gen(ca, b);
- ob->bucket = b;
+ ob->gen = a->gen;
+ ob->bucket = bucket;
spin_unlock(&ob->lock);
ca->nr_open_buckets++;
spin_unlock(&c->freelist_lock);
- bch2_wake_allocator(ca);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ enum alloc_reserve reserve, u64 free_entry,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct bkey_s_c freespace_k,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct open_bucket *ob;
+ struct bch_alloc_v4 a;
+ u64 b = free_entry & ~(~0ULL << 56);
+ unsigned genbits = free_entry >> 56;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+ prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+ " freespace key ",
+ ca->mi.first_bucket, ca->mi.nbuckets);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (genbits != (alloc_freespace_genbits(a) >> 56)) {
+ prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+ " freespace key ",
+ genbits, alloc_freespace_genbits(a) >> 56);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+	}
+
+ if (a.data_type != BCH_DATA_free) {
+ prt_printf(&buf, "non free bucket in freespace btree\n"
+ " freespace key ");
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ ob = __try_alloc_bucket(c, ca, b, reserve, &a,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl);
+ if (!ob)
+ iter.path->preserve = false;
+err:
+ set_btree_iter_dontneed(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve)
+{
+ struct open_bucket *ob;
+ int i;
+
+ spin_lock(&c->freelist_lock);
+
+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+ ob = c->open_buckets + ca->open_buckets_partial[i];
+
+ if (reserve <= ob->alloc_reserve) {
+ array_remove_item(ca->open_buckets_partial,
+ ca->open_buckets_partial_nr,
+ i);
+ ob->on_partial_list = false;
+ ob->alloc_reserve = reserve;
+ spin_unlock(&c->freelist_lock);
+ return ob;
+ }
+ }
+
+ spin_unlock(&c->freelist_lock);
+ return NULL;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ u64 *cur_bucket,
+ u64 *buckets_seen,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ int ret;
+
+ *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket);
+ *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx);
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket),
+ BTREE_ITER_SLOTS, k, ret) {
+ struct bch_alloc_v4 a;
+
+ if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ break;
+
+ if (ca->new_fs_bucket_idx &&
+ is_superblock_bucket(ca, k.k->p.offset))
+ continue;
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (a.data_type != BCH_DATA_free)
+ continue;
+
+ (*buckets_seen)++;
+
+ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl);
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ *cur_bucket = iter.pos.offset;
+
+ return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found);
+}
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ u64 *cur_bucket,
+ u64 *buckets_seen,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ int ret;
+
+ BUG_ON(ca->new_fs_bucket_idx);
+
+ /*
+ * XXX:
+ * On transaction restart, we'd like to restart from the bucket we were
+ * at previously
+ */
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, *cur_bucket), 0, k, ret) {
+ if (k.k->p.inode != ca->dev_idx)
+ break;
+
+ for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k));
+ *cur_bucket < k.k->p.offset;
+ (*cur_bucket)++) {
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
+
+ (*buckets_seen)++;
+
+ ob = try_alloc_bucket(trans, ca, reserve,
+ *cur_bucket,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ k, cl);
+ if (ob)
+ break;
+ }
+
+ if (ob || ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ob ?: ERR_PTR(ret);
+}
+
+/**
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ *
+ * Returns a pointer to the newly allocated open_bucket on success, or an
+ * ERR_PTR() on failure.
+ */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ bool may_alloc_partial,
+ struct closure *cl,
+ struct bch_dev_usage *usage)
+{
+ struct bch_fs *c = trans->c;
+ struct open_bucket *ob = NULL;
+ bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized);
+ u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor;
+ u64 avail;
+ u64 cur_bucket = start;
+ u64 buckets_seen = 0;
+ u64 skipped_open = 0;
+ u64 skipped_need_journal_commit = 0;
+ u64 skipped_nouse = 0;
+ bool waiting = false;
+again:
+ bch2_dev_usage_read_fast(ca, usage);
+ avail = dev_buckets_free(ca, *usage, reserve);
+
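+	/*
+	 * Kick off any background work the usage counters say is pending:
+	 * discarding buckets that need it, bumping bucket gens, and
+	 * invalidating cached buckets when free space is running low:
+	 */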
+ if (usage->d[BCH_DATA_need_discard].buckets > avail)
+ bch2_do_discards(c);
+
+ if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+ bch2_do_gc_gens(c);
+
+ if (should_invalidate_buckets(ca, *usage))
+ bch2_do_invalidates(c);
+
+ if (!avail) {
+ if (cl && !waiting) {
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ goto again;
+ }
+
+ if (!c->blocked_allocate)
+ c->blocked_allocate = local_clock();
+
+ ob = ERR_PTR(-BCH_ERR_freelist_empty);
+ goto err;
+ }
+
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+
+ if (may_alloc_partial) {
+ ob = try_alloc_partial_bucket(c, ca, reserve);
+ if (ob)
+ return ob;
+ }
+
+ ob = likely(ca->mi.freespace_initialized)
+ ? bch2_bucket_alloc_freelist(trans, ca, reserve,
+ &cur_bucket,
+ &buckets_seen,
+ &skipped_open,
+ &skipped_need_journal_commit,
+ &skipped_nouse,
+ cl)
+ : bch2_bucket_alloc_early(trans, ca, reserve,
+ &cur_bucket,
+ &buckets_seen,
+ &skipped_open,
+ &skipped_need_journal_commit,
+ &skipped_nouse,
+ cl);
+
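+	/*
+	 * If the buckets we skipped because they're waiting on a journal
+	 * commit outnumber half the available buckets, flush the journal so
+	 * they can be reused sooner:
+	 */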
+ if (skipped_need_journal_commit * 2 > avail)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (!ob && !freespace_initialized && start) {
+ start = cur_bucket = 0;
+ goto again;
+ }
+
+ if (!freespace_initialized)
+ ca->bucket_alloc_trans_early_cursor = cur_bucket;
+err:
+ if (!ob)
+ ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+
+ if (!IS_ERR(ob))
+ trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve],
+ may_alloc_partial, ob->bucket);
+ else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
+ trace_and_count(c, bucket_alloc_fail,
+ ca, bch2_alloc_reserves[reserve],
+ usage->d[BCH_DATA_free].buckets,
+ avail,
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+ buckets_seen,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl == NULL,
+ bch2_err_str(PTR_ERR(ob)));
- trace_bucket_alloc(ca, reserve);
+ return ob;
+}
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ bool may_alloc_partial,
+ struct closure *cl)
+{
+ struct bch_dev_usage usage;
+ struct open_bucket *ob;
+
+ bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
+ may_alloc_partial, cl, &usage)));
return ob;
}
return ret;
}
-void bch2_dev_stripe_increment(struct bch_dev *ca,
- struct dev_stripe_state *stripe)
+static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
+ struct dev_stripe_state *stripe,
+ struct bch_dev_usage *usage)
{
u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_available(ca);
+ u64 free_space = dev_buckets_available(ca, RESERVE_none);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
: 1ULL << 48;
*v = *v < scale ? 0 : *v - scale;
}
+void bch2_dev_stripe_increment(struct bch_dev *ca,
+ struct dev_stripe_state *stripe)
+{
+ struct bch_dev_usage usage;
+
+ bch2_dev_usage_read_fast(ca, &usage);
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+}
+
#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0)
#define BUCKET_ALLOC_USE_DURABILITY (1 << 1)
ob_push(c, ptrs, ob);
}
-int bch2_bucket_alloc_set(struct bch_fs *c,
+static int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
struct open_buckets *ptrs,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs_may_alloc,
unsigned flags,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ unsigned dev;
struct bch_dev *ca;
- int ret = -INSUFFICIENT_DEVICES;
+ int ret = -BCH_ERR_insufficient_devices;
unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
for (i = 0; i < devs_sorted.nr; i++) {
+ struct bch_dev_usage usage;
struct open_bucket *ob;
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+ dev = devs_sorted.devs[i];
+
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
if (!ca)
continue;
- if (!ca->mi.durability && *have_cache)
+ if (!ca->mi.durability && *have_cache) {
+ percpu_ref_put(&ca->ref);
continue;
+ }
+
+ ob = bch2_bucket_alloc_trans(trans, ca, reserve,
+ flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage);
+ if (!IS_ERR(ob))
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+ percpu_ref_put(&ca->ref);
- ob = bch2_bucket_alloc(c, ca, reserve,
- flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
-
- if (cl)
- return ret;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
+ break;
continue;
}
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache, flags, ob);
- bch2_dev_stripe_increment(ca, stripe);
-
- if (*nr_effective >= nr_replicas)
- return 0;
+ if (*nr_effective >= nr_replicas) {
+ ret = 0;
+ break;
+ }
}
return ret;
}
+int bch2_bucket_alloc_set(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *cl)
+{
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_bucket_alloc_set_trans(&trans, ptrs, stripe,
+ devs_may_alloc, nr_replicas,
+ nr_effective, have_cache, reserve,
+ flags, cl));
+}
+
/* Allocate from stripes: */
/*
wp->ptrs = ptrs_skip;
}
-static int open_bucket_add_buckets(struct bch_fs *c,
+static int open_bucket_add_buckets(struct btree_trans *trans,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_list *devs_have,
unsigned flags,
struct closure *_cl)
{
+ struct bch_fs *c = trans->c;
struct bch_devs_mask devs;
struct open_bucket *ob;
struct closure *cl = NULL;
target, erasure_code,
nr_replicas, nr_effective,
have_cache, flags, _cl);
- if (ret == -FREELIST_EMPTY ||
- ret == -OPEN_BUCKETS_EMPTY)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
+ bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
return ret;
if (*nr_effective >= nr_replicas)
return 0;
if (*nr_effective >= nr_replicas)
return 0;
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
-
retry_blocking:
/*
* Try nonblocking first, so that if one device is full we'll try from
* other devices:
*/
- ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
+ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
nr_replicas, nr_effective, have_cache,
reserve, flags, cl);
- if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
+ !cl && _cl) {
cl = _cl;
goto retry_blocking;
}
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
-
return ret;
}
return true;
}
-static struct write_point *writepoint_find(struct bch_fs *c,
+static void bch2_trans_mutex_lock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ if (!mutex_trylock(lock)) {
+ bch2_trans_unlock(trans);
+ mutex_lock(lock);
+ }
+}
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
unsigned long write_point)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp, *oldest;
struct hlist_head *head;
if (!(write_point & 1UL)) {
wp = (struct write_point *) write_point;
- mutex_lock(&wp->lock);
+ bch2_trans_mutex_lock(trans, &wp->lock);
return wp;
}
wp = __writepoint_find(head, write_point);
if (wp) {
lock_wp:
- mutex_lock(&wp->lock);
+ bch2_trans_mutex_lock(trans, &wp->lock);
if (wp->write_point == write_point)
goto out;
mutex_unlock(&wp->lock);
if (!oldest || time_before64(wp->last_used, oldest->last_used))
oldest = wp;
- mutex_lock(&oldest->lock);
- mutex_lock(&c->write_points_hash_lock);
+ bch2_trans_mutex_lock(trans, &oldest->lock);
+ bch2_trans_mutex_lock(trans, &c->write_points_hash_lock);
if (oldest >= c->write_points + c->write_points_nr ||
try_increase_writepoints(c)) {
mutex_unlock(&c->write_points_hash_lock);
hlist_add_head_rcu(&wp->node, head);
mutex_unlock(&c->write_points_hash_lock);
out:
- wp->last_used = sched_clock();
+ wp->last_used = local_clock();
return wp;
}
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
+struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned target,
unsigned erasure_code,
struct write_point_specifier write_point,
unsigned flags,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp;
struct open_bucket *ob;
struct open_buckets ptrs;
write_points_nr = c->write_points_nr;
have_cache = false;
- wp = writepoint_find(c, write_point.v);
+ wp = writepoint_find(trans, write_point.v);
if (wp->data_type == BCH_DATA_user)
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
have_cache = true;
if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
- ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
ob_flags, cl);
} else {
- ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
ob_flags, NULL);
- if (!ret)
+ if (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto alloc_done;
- ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
0, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
if (erasure_code && !ec_open_bucket(c, &ptrs))
pr_debug("failed to get ec bucket: ret %u", ret);
- if (ret == -INSUFFICIENT_DEVICES &&
+ if (ret == -BCH_ERR_insufficient_devices &&
nr_effective >= nr_replicas_required)
ret = 0;
mutex_unlock(&wp->lock);
- if (ret == -FREELIST_EMPTY &&
+ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
try_decrease_writepoints(c, write_points_nr))
goto retry;
- switch (ret) {
- case -OPEN_BUCKETS_EMPTY:
- case -FREELIST_EMPTY:
- return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
- case -INSUFFICIENT_DEVICES:
+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+ bch2_err_matches(ret, BCH_ERR_freelist_empty))
+ return cl
+ ? ERR_PTR(-EAGAIN)
+ : ERR_PTR(-BCH_ERR_ENOSPC_bucket_alloc);
+
+ if (bch2_err_matches(ret, BCH_ERR_insufficient_devices))
return ERR_PTR(-EROFS);
- default:
- BUG();
- }
+
+ return ERR_PTR(ret);
+}
+
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
+ unsigned target,
+ unsigned erasure_code,
+ struct write_point_specifier write_point,
+ struct bch_devs_list *devs_have,
+ unsigned nr_replicas,
+ unsigned nr_replicas_required,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *cl)
+{
+ struct write_point *wp;
+
+ bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target,
+ erasure_code,
+ write_point,
+ devs_have,
+ nr_replicas,
+ nr_replicas_required,
+ reserve,
+ flags, cl)));
+ return wp;
}
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
wp < c->write_points + c->write_points_nr; wp++) {
writepoint_init(wp, BCH_DATA_user);
- wp->last_used = sched_clock();
+ wp->last_used = local_clock();
wp->write_point = (unsigned long) wp;
hlist_add_head_rcu(&wp->node,
writepoint_hash(c, wp->write_point));
ob++) {
spin_lock(&ob->lock);
if (ob->valid && !ob->on_partial_list) {
- pr_buf(out, "%zu ref %u type %s\n",
+ prt_printf(out, "%zu ref %u type %s %u:%llu:%u\n",
ob - c->open_buckets,
atomic_read(&ob->pin),
- bch2_data_types[ob->data_type]);
+ bch2_data_types[ob->data_type],
+ ob->dev, ob->bucket, ob->gen);
}
spin_unlock(&ob->lock);
}
-
}
struct bch_fs;
struct bch_devs_List;
+extern const char * const bch2_alloc_reserves[];
+
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
return false;
}
+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ bool ret;
+
+ if (bch2_bucket_is_open(c, dev, bucket))
+ return true;
+
+ spin_lock(&c->freelist_lock);
+ ret = bch2_bucket_is_open(c, dev, bucket);
+ spin_unlock(&c->freelist_lock);
+
+ return ret;
+}
+
int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
unsigned, unsigned *, bool *, enum alloc_reserve,
unsigned, struct closure *);
+struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *,
+ unsigned, unsigned,
+ struct write_point_specifier,
+ struct bch_devs_list *,
+ unsigned, unsigned,
+ enum alloc_reserve,
+ unsigned,
+ struct closure *);
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
unsigned, unsigned,
struct write_point_specifier,
struct ec_bucket_buf;
-#define ALLOC_THREAD_STATES() \
- x(stopped) \
- x(running) \
- x(blocked) \
- x(blocked_full)
-
-enum allocator_states {
-#define x(n) ALLOCATOR_##n,
- ALLOC_THREAD_STATES()
-#undef x
-};
+#define BCH_ALLOC_RESERVES() \
+ x(btree_movinggc) \
+ x(btree) \
+ x(movinggc) \
+ x(none)
enum alloc_reserve {
- RESERVE_BTREE_MOVINGGC = -2,
- RESERVE_BTREE = -1,
- RESERVE_MOVINGGC = 0,
- RESERVE_NONE = 1,
- RESERVE_NR = 2,
+#define x(name) RESERVE_##name,
+ BCH_ALLOC_RESERVES()
+#undef x
};
-typedef FIFO(long) alloc_fifo;
-
#define OPEN_BUCKETS_COUNT 1024
#define WRITE_POINT_HASH_NR 32
* the block in the stripe this open_bucket corresponds to:
*/
u8 ec_idx;
- enum bch_data_type data_type:3;
+ enum bch_data_type data_type:8;
unsigned valid:1;
unsigned on_partial_list:1;
- int alloc_reserve:3;
+ unsigned alloc_reserve:3;
- unsigned sectors_free;
u8 dev;
u8 gen;
+ u32 sectors_free;
u64 bucket;
struct ec_stripe_new *ec;
};
unsigned long v;
};
-struct alloc_heap_entry {
- size_t bucket;
- size_t nr;
- unsigned long key;
-};
-
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
-
#endif /* _BCACHEFS_ALLOC_TYPES_H */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "error.h"
+
+#include <linux/mm.h>
+
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
+
+/*
+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
+ * btree:
+ */
+static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
+ struct bpos bp_pos)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
+}
+
+/*
+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
+ */
+static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+ struct bpos ret;
+
+ ret = POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+
+ BUG_ON(bkey_cmp(bucket, bp_pos_to_bucket(c, ret)));
+
+ return ret;
+}
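+/*
+ * Worked example of the mapping above, with illustrative numbers (a device
+ * with 512-sector buckets): bucket 100 with bucket_offset 7 maps to a
+ * backpointer pos offset of ((100 * 512) << 10) + 7 = 52428807; going back,
+ * bp_pos_to_bucket() shifts down again, 52428807 >> 10 = 51200 sectors, i.e.
+ * bucket 100 - the BUG_ON above verifies exactly this round trip.
+ */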
+
+void bch2_extent_ptr_to_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+ enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user;
+ s64 sectors = level ? btree_sectors(c) : k.k->size;
+ u32 bucket_offset;
+
+ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+ *bp = (struct bch_backpointer) {
+ .btree_id = btree_id,
+ .level = level,
+ .data_type = data_type,
+ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
+ p.crc.offset,
+ .bucket_len = ptr_disk_sectors(sectors, p),
+ .pos = k.k->p,
+ };
+}
+
+static bool extent_matches_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct bpos bucket,
+ struct bch_backpointer bp)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket2;
+ struct bch_backpointer bp2;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
+ &bucket2, &bp2);
+ if (!bpos_cmp(bucket, bucket2) &&
+ !memcmp(&bp, &bp2, sizeof(bp)))
+ return true;
+ }
+
+ return false;
+}
+
+int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
+{
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+ struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+
+ if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) {
+ prt_str(err, "incorrect value size");
+ return -EINVAL;
+ }
+
+ if (bpos_cmp(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) {
+ prt_str(err, "backpointer at wrong pos");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+{
+ prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
+ bch2_btree_ids[bp->btree_id],
+ bp->level,
+ (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ bp->bucket_len);
+ bch2_bpos_to_text(out, bp->pos);
+}
+
+void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+}
+
+void bch2_backpointer_swab(struct bkey_s k)
+{
+ struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
+
+ bp.v->bucket_offset = swab32(bp.v->bucket_offset);
+ bp.v->bucket_len = swab32(bp.v->bucket_len);
+ bch2_bpos_swab(&bp.v->pos);
+}
+
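+/*
+ * bp_offset values below BACKPOINTER_OFFSET_MAX refer to backpointers stored
+ * inline in the bucket's alloc key (keyed by bucket_offset); values at or
+ * above it refer to keys in the backpointers btree, with the real offset being
+ * bp_offset - BACKPOINTER_OFFSET_MAX. See bch2_backpointer_del_by_offset() and
+ * bch2_get_next_backpointer() below.
+ */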
+#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1)
+
+static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r)
+{
+ return cmp_int(l.bucket_offset, r.bucket_offset);
+}
+
+static int bch2_backpointer_del_by_offset(struct btree_trans *trans,
+ struct bpos bucket,
+ u64 bp_offset,
+ struct bch_backpointer bp)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ if (bp_offset < BACKPOINTER_OFFSET_MAX) {
+ struct bch_backpointer *bps;
+ struct bkey_i_alloc_v4 *a;
+ unsigned i, nr;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ bucket,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_alloc_v4) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto err;
+ bps = alloc_v4_backpointers(&a->v);
+ nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+
+ for (i = 0; i < nr; i++) {
+ if (bps[i].bucket_offset == bp_offset)
+ goto found;
+ if (bps[i].bucket_offset > bp_offset)
+ break;
+ }
+
+ ret = -ENOENT;
+ goto err;
+found:
+ if (memcmp(&bps[i], &bp, sizeof(bp))) {
+ ret = -ENOENT;
+ goto err;
+ }
+ array_remove_item(bps, nr, i);
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+ set_alloc_v4_u64s(a);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ } else {
+ bp_offset -= BACKPOINTER_OFFSET_MAX;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(c, bucket, bp_offset),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_bucket_backpointer_del(struct btree_trans *trans,
+ struct bkey_i_alloc_v4 *a,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_backpointer *bps = alloc_v4_backpointers(&a->v);
+ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+ struct btree_iter bp_iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for (i = 0; i < nr; i++) {
+ int cmp = backpointer_cmp(bps[i], bp) ?:
+ memcmp(&bps[i], &bp, sizeof(bp));
+ if (!cmp)
+ goto found;
+ if (cmp >= 0)
+ break;
+ }
+
+ goto btree;
+found:
+ array_remove_item(bps, nr, i);
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+ set_alloc_v4_u64s(a);
+ return 0;
+btree:
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(c, a->k.p, bp.bucket_offset),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&bp_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "backpointer not found when deleting");
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "searching for ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "got ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ prt_str(&buf, "alloc ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ bch_err(c, "%s", buf.buf);
+ } else {
+ ret = -EIO;
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ }
+ printbuf_exit(&buf);
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &bp_iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ return ret;
+}
+
+int bch2_bucket_backpointer_add(struct btree_trans *trans,
+ struct bkey_i_alloc_v4 *a,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca;
+ struct bch_backpointer *bps = alloc_v4_backpointers(&a->v);
+ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+ struct bkey_i_backpointer *bp_k;
+ struct btree_iter bp_iter;
+ struct bkey_s_c k;
+ int ret;
+
+ /* Check for duplicates: */
+ for (i = 0; i < nr; i++) {
+ int cmp = backpointer_cmp(bps[i], bp);
+ if (cmp >= 0)
+ break;
+ }
+
+ if ((i &&
+ (bps[i - 1].bucket_offset +
+ bps[i - 1].bucket_len > bp.bucket_offset)) ||
+ (i < nr &&
+ (bp.bucket_offset + bp.bucket_len > bps[i].bucket_offset))) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "overlapping backpointer found when inserting ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "into ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags))
+ bch_err(c, "%s", buf.buf);
+ else {
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -EIO;
+ }
+ }
+
+ if (nr < BCH_ALLOC_V4_NR_BACKPOINTERS_MAX) {
+ array_insert_item(bps, nr, i, bp);
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+ set_alloc_v4_u64s(a);
+ return 0;
+ }
+
+ /* Overflow: use backpointer btree */
+ bp_k = bch2_trans_kmalloc(trans, sizeof(*bp_k));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ ca = bch_dev_bkey_exists(c, a->k.p.inode);
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset);
+ bp_k->v = bp;
+
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&bp_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "existing btree backpointer key found when inserting ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "found ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags))
+ bch_err(c, "%s", buf.buf);
+ else {
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -EIO;
+ goto err;
+ }
+ }
+
+ ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ return ret;
+}
+
+/*
+ * Find the next backpointer >= *bp_offset:
+ */
+int bch2_get_next_backpointer(struct btree_trans *trans,
+ struct bpos bucket, int gen,
+ u64 *bp_offset,
+ struct bch_backpointer *dst,
+ unsigned iter_flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos bp_pos, bp_end_pos;
+ struct btree_iter alloc_iter, bp_iter = { NULL };
+ struct bkey_s_c k;
+ struct bkey_s_c_alloc_v4 a;
+ size_t i;
+ int ret;
+
+ if (*bp_offset == U64_MAX)
+ return 0;
+
+ bp_pos = bucket_pos_to_bp(c, bucket,
+ max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX);
+ bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ if (k.k->type != KEY_TYPE_alloc_v4)
+ goto done;
+
+ a = bkey_s_c_to_alloc_v4(k);
+ if (gen >= 0 && a.v->gen != gen)
+ goto done;
+
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) {
+ if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset)
+ continue;
+
+ *dst = alloc_v4_backpointers_c(a.v)[i];
+ *bp_offset = dst->bucket_offset;
+ goto out;
+ }
+
+ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
+ bp_pos, 0, k, ret) {
+ if (bpos_cmp(k.k->p, bp_end_pos) >= 0)
+ break;
+
+ if (k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ *dst = *bkey_s_c_to_backpointer(k).v;
+ *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX;
+ goto out;
+ }
+done:
+ *bp_offset = U64_MAX;
+out:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+static void backpointer_not_found(struct btree_trans *trans,
+ struct bpos bucket,
+ u64 bp_offset,
+ struct bch_backpointer bp,
+ struct bkey_s_c k,
+ const char *thing_it_points_to)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
+ thing_it_points_to);
+ prt_printf(&buf, "bucket: ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_printf(&buf, "\n ");
+
+ if (bp_offset >= BACKPOINTER_OFFSET_MAX) {
+ struct bpos bp_pos =
+ bucket_pos_to_bp(c, bucket,
+ bp_offset - BACKPOINTER_OFFSET_MAX);
+ prt_printf(&buf, "backpointer pos: ");
+ bch2_bpos_to_text(&buf, bp_pos);
+ prt_printf(&buf, "\n ");
+ }
+
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags))
+ bch_err_ratelimited(c, "%s", buf.buf);
+ else
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+
+ printbuf_exit(&buf);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bucket,
+ u64 bp_offset,
+ struct bch_backpointer bp)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0,
+ min(bp.level, c->btree_roots[bp.btree_id].level),
+ 0);
+ k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
+
+ if (bp.level == c->btree_roots[bp.btree_id].level + 1)
+ k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key);
+
+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+
+ if (bp.level) {
+ struct btree *b;
+
+ /*
+ * If a backpointer for a btree node wasn't found, it may be
+ * because it was overwritten by a new btree node that hasn't
+ * been written out yet - backpointer_get_node() checks for
+ * this:
+ */
+ b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp);
+ if (!IS_ERR_OR_NULL(b))
+ return bkey_i_to_s_c(&b->key);
+
+ bch2_trans_iter_exit(trans, iter);
+
+ if (IS_ERR(b))
+ return bkey_s_c_err(PTR_ERR(b));
+ return bkey_s_c_null;
+ }
+
+ backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent");
+ return bkey_s_c_null;
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bucket,
+ u64 bp_offset,
+ struct bch_backpointer bp)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+
+ BUG_ON(!bp.level);
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0,
+ bp.level - 1,
+ 0);
+ b = bch2_btree_iter_peek_node(iter);
+ if (IS_ERR(b))
+ goto err;
+
+ if (b && extent_matches_bp(c, bp.btree_id, bp.level,
+ bkey_i_to_s_c(&b->key),
+ bucket, bp))
+ return b;
+
+ if (b && btree_node_will_make_reachable(b)) {
+ b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ } else {
+ backpointer_not_found(trans, bucket, bp_offset, bp,
+ bkey_i_to_s_c(&b->key), "btree node");
+ b = NULL;
+ }
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return b;
+}
+
+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bch_dev *ca;
+ struct bkey_s_c alloc_k;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+			"backpointer for missing device:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ bp_pos_to_bucket(c, k.k->p), 0);
+
+ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto out;
+
+ if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+ alloc_iter.pos.inode, alloc_iter.pos.offset,
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* verify that every backpointer has a corresponding alloc key */
+int bch2_check_btree_backpointers(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_backpointers, POS_MIN, 0, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ bch2_check_btree_backpointer(&trans, &iter, k)));
+}
+
+static int check_bp_exists(struct btree_trans *trans,
+ struct bpos bucket_pos,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ struct bpos bucket_start,
+ struct bpos bucket_end)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter, bp_iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c alloc_k, bp_k;
+ int ret;
+
+ if (bpos_cmp(bucket_pos, bucket_start) < 0 ||
+ bpos_cmp(bucket_pos, bucket_end) > 0)
+ return 0;
+
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0);
+ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto err;
+
+ if (alloc_k.k->type == KEY_TYPE_alloc_v4) {
+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k);
+ const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v);
+ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v);
+
+ for (i = 0; i < nr; i++) {
+ int cmp = backpointer_cmp(bps[i], bp) ?:
+ memcmp(&bps[i], &bp, sizeof(bp));
+ if (!cmp)
+ goto out;
+ if (cmp >= 0)
+ break;
+ }
+ } else {
+ goto missing;
+ }
+
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset),
+ 0);
+ bp_k = bch2_btree_iter_peek_slot(&bp_iter);
+ ret = bkey_err(bp_k);
+ if (ret)
+ goto err;
+
+ if (bp_k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp)))
+ goto missing;
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+missing:
+ prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
+ bch2_btree_ids[bp.btree_id], bp.level);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_printf(&buf, "\nin alloc key ");
+ bch2_bkey_val_to_text(&buf, c, alloc_k);
+
+ if (c->sb.version < bcachefs_metadata_version_backpointers ||
+ c->opts.reconstruct_alloc ||
+ fsck_err(c, "%s", buf.buf)) {
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k);
+
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ bch2_bucket_backpointer_add(trans, a, bp, orig_k) ?:
+ bch2_trans_update(trans, &alloc_iter, &a->k_i, 0);
+ }
+
+ goto out;
+}
+
+static int check_extent_to_backpointers(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bucket_start,
+ struct bpos bucket_end)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_btree_iter_peek_all_levels(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k)
+ return 0;
+
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos bucket_start,
+ struct bpos bucket_end)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct btree *b;
+ struct bkey_s_c k;
+ struct bkey_ptrs_c ptrs;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ c->btree_roots[btree_id].level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ BUG_ON(b != btree_node_root(c, b));
+
+ k = bkey_i_to_s_c(&b->key);
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end);
+ if (ret)
+ goto err;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
+{
+ return (struct bbpos) {
+ .btree = bp.btree_id,
+ .pos = bp.pos,
+ };
+}
+
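+/*
+ * How many btree nodes fit in half of RAM - used to bound the fsck passes
+ * below. E.g. (illustrative numbers) with 16GiB of RAM and 256KiB btree
+ * nodes: (16GiB / 2) / 256KiB = 32768 nodes per pass.
+ */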
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+ struct sysinfo i;
+ u64 mem_bytes;
+
+ si_meminfo(&i);
+ mem_bytes = i.totalram * i.mem_unit;
+ return (mem_bytes >> 1) / btree_bytes(c);
+}
+
+int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
+ unsigned btree_leaf_mask,
+ unsigned btree_interior_mask,
+ struct bbpos start, struct bbpos *end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+ enum btree_id btree;
+ int ret = 0;
+
+ for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
+ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+
+ if (!((1U << btree) & btree_leaf_mask) &&
+ !((1U << btree) & btree_interior_mask))
+ continue;
+
+ bch2_trans_node_iter_init(trans, &iter, btree,
+ btree == start.btree ? start.pos : POS_MIN,
+ 0, depth, 0);
+ /*
+		 * for_each_btree_key_continue() doesn't check the return value
+ * from bch2_btree_iter_advance(), which is needed when
+ * iterating over interior nodes where we'll see keys at
+ * SPOS_MAX:
+ */
+ do {
+ k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
+ ret = bkey_err(k);
+ if (!k.k || ret)
+ break;
+
+ --btree_nodes;
+ if (!btree_nodes) {
+ *end = BBPOS(btree, k.k->p);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+ }
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ *end = BBPOS_MAX;
+ return ret;
+}
+
+static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
+ struct bpos bucket_start,
+ struct bpos bucket_end)
+{
+ struct btree_iter iter;
+ enum btree_id btree_id;
+ int ret = 0;
+
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ depth,
+ BTREE_ITER_ALL_LEVELS|
+ BTREE_ITER_PREFETCH);
+
+ do {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_extent_to_backpointers(trans, &iter,
+ bucket_start, bucket_end));
+ if (ret)
+ break;
+ } while (!bch2_btree_iter_advance(&iter));
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ break;
+
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_btree_root_to_backpointers(trans, btree_id,
+ bucket_start, bucket_end));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
+ struct bpos start, struct bpos *end)
+{
+ struct btree_iter alloc_iter;
+ struct btree_iter bp_iter;
+ struct bkey_s_c alloc_k, bp_k;
+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+ bool alloc_end = false, bp_end = false;
+ int ret = 0;
+
+ bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ start, 0, 1, 0);
+ bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(trans->c, start, 0), 0, 1, 0);
+ while (1) {
+ alloc_k = !alloc_end
+ ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
+ : bkey_s_c_null;
+ bp_k = !bp_end
+ ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
+ : bkey_s_c_null;
+
+ ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
+ if ((!alloc_k.k && !bp_k.k) || ret) {
+ *end = SPOS_MAX;
+ break;
+ }
+
+ --btree_nodes;
+ if (!btree_nodes) {
+ *end = alloc_k.k->p;
+ break;
+ }
+
+ if (bpos_cmp(alloc_iter.pos, SPOS_MAX) &&
+ bpos_cmp(bucket_pos_to_bp(trans->c, alloc_iter.pos, 0), bp_iter.pos) < 0) {
+ if (!bch2_btree_iter_advance(&alloc_iter))
+ alloc_end = true;
+ } else {
+ if (!bch2_btree_iter_advance(&bp_iter))
+ bp_end = true;
+ }
+ }
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+int bch2_check_extents_to_backpointers(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct bpos start = POS_MIN, end;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ while (1) {
+ ret = bch2_get_alloc_in_memory_pos(&trans, start, &end);
+ if (ret)
+ break;
+
+ if (!bpos_cmp(start, POS_MIN) && bpos_cmp(end, SPOS_MAX))
+			bch_verbose(c, "check_extents_to_backpointers(): alloc info does not fit in ram, "
+ "running in multiple passes with %zu nodes per pass",
+ btree_nodes_fit_in_ram(c));
+
+ if (bpos_cmp(start, POS_MIN) || bpos_cmp(end, SPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_extents_to_backpointers(): ");
+ bch2_bpos_to_text(&buf, start);
+ prt_str(&buf, "-");
+ bch2_bpos_to_text(&buf, end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_extents_to_backpointers_pass(&trans, start, end);
+ if (ret || !bpos_cmp(end, SPOS_MAX))
+ break;
+
+ start = bpos_successor(end);
+ }
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+static int check_one_backpointer(struct btree_trans *trans,
+ struct bpos bucket,
+ u64 *bp_offset,
+ struct bbpos start,
+ struct bbpos end)
+{
+ struct btree_iter iter;
+ struct bch_backpointer bp;
+ struct bbpos pos;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp, 0);
+ if (ret || *bp_offset == U64_MAX)
+ return ret;
+
+ pos = bp_to_bbpos(bp);
+ if (bbpos_cmp(pos, start) < 0 ||
+ bbpos_cmp(pos, end) > 0)
+ return 0;
+
+ k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp);
+ ret = bkey_err(k);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ return 0;
+ if (ret)
+ return ret;
+
+ if (fsck_err_on(!k.k, trans->c,
+ "%s backpointer points to missing extent\n%s",
+ *bp_offset < BACKPOINTER_OFFSET_MAX ? "alloc" : "btree",
+ (bch2_backpointer_to_text(&buf, &bp), buf.buf))) {
+ ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp);
+ if (ret == -ENOENT)
+ bch_err(trans->c, "backpointer at %llu not found", *bp_offset);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
+ struct bbpos start,
+ struct bbpos end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ u64 bp_offset = 0;
+
+ while (!(ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) &&
+ bp_offset < U64_MAX)
+ bp_offset++;
+
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ return ret < 0 ? ret : 0;
+}
+
+int bch2_check_backpointers_to_extents(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ while (1) {
+ ret = bch2_get_btree_in_memory_pos(&trans,
+ (1U << BTREE_ID_extents)|
+ (1U << BTREE_ID_reflink),
+ ~0,
+ start, &end);
+ if (ret)
+ break;
+
+ if (!bbpos_cmp(start, BBPOS_MIN) &&
+ bbpos_cmp(end, BBPOS_MAX))
+			bch_verbose(c, "check_backpointers_to_extents(): extents do not fit in ram, "
+ "running in multiple passes with %zu nodes per pass",
+ btree_nodes_fit_in_ram(c));
+
+ if (bbpos_cmp(start, BBPOS_MIN) ||
+ bbpos_cmp(end, BBPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_backpointers_to_extents(): ");
+ bch2_bbpos_to_text(&buf, start);
+ prt_str(&buf, "-");
+ bch2_bbpos_to_text(&buf, end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_backpointers_to_extents_pass(&trans, start, end);
+ if (ret || !bbpos_cmp(end, BBPOS_MAX))
+ break;
+
+ start = bbpos_successor(end);
+ }
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+
+#include "super.h"
+
+int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
+ int, struct printbuf *);
+void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
+void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_backpointer_swab(struct bkey_s);
+
+#define bch2_bkey_ops_backpointer (struct bkey_ops) { \
+ .key_invalid = bch2_backpointer_invalid, \
+ .val_to_text = bch2_backpointer_k_to_text, \
+ .swab = bch2_backpointer_swab, \
+}
+
+void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned,
+ struct bkey_s_c, struct extent_ptr_decoded,
+ struct bpos *, struct bch_backpointer *);
+
+int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *,
+ struct bch_backpointer, struct bkey_s_c);
+int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *,
+ struct bch_backpointer, struct bkey_s_c);
+int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+ u64 *, struct bch_backpointer *, unsigned);
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
+ struct bpos, u64, struct bch_backpointer);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
+ struct bpos, u64, struct bch_backpointer);
+
+int bch2_check_btree_backpointers(struct bch_fs *);
+int bch2_check_extents_to_backpointers(struct bch_fs *);
+int bch2_check_backpointers_to_extents(struct bch_fs *);
+
+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_H
+#define _BCACHEFS_BBPOS_H
+
+#include "bkey_methods.h"
+
+struct bbpos {
+ enum btree_id btree;
+ struct bpos pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+ return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN BBPOS(0, POS_MIN)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
+{
+ return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
+}
+
+static inline struct bbpos bbpos_successor(struct bbpos pos)
+{
+ if (bpos_cmp(pos.pos, SPOS_MAX)) {
+ pos.pos = bpos_successor(pos.pos);
+ return pos;
+ }
+
+ if (pos.btree != BTREE_ID_NR) {
+ pos.btree++;
+ pos.pos = POS_MIN;
+ return pos;
+ }
+
+ BUG();
+}
+
+static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
+{
+ prt_str(out, bch2_btree_ids[pos.btree]);
+ prt_char(out, ':');
+ bch2_bpos_to_text(out, pos.pos);
+}
+
+#endif /* _BCACHEFS_BBPOS_H */
*
* BTREE NODES:
*
- * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
* free smaller than a bucket - so, that's how big our btree nodes are.
*
* (If buckets are really big we'll only use part of the bucket for a btree node
#define dynamic_fault(...) 0
#define race_fault(...) 0
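+/*
+ * trace_and_count() bumps the filesystem's per-cpu event counter and emits the
+ * matching tracepoint in one go; _name must correspond to both a BCH_COUNTER_*
+ * enum entry and a trace_* tracepoint.
+ */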
+#define trace_and_count(_c, _name, ...) \
+do { \
+ this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \
+ trace_##_name(__VA_ARGS__); \
+} while (0)
+
#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
#define bch2_meta_read_fault(name) \
dynamic_fault("bcachefs:meta:write:" name)
#ifdef __KERNEL__
-#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
+#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
#else
+#define bch2_log_msg(_c, fmt) fmt
#define bch2_fmt(_c, fmt) fmt "\n"
#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
#endif
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
- x(btree_lock_contended_read) \
- x(btree_lock_contended_intent) \
- x(btree_lock_contended_write) \
x(data_write) \
x(data_read) \
x(data_promote) \
GC_PHASE_BTREE_reflink,
GC_PHASE_BTREE_subvolumes,
GC_PHASE_BTREE_snapshots,
+ GC_PHASE_BTREE_lru,
+ GC_PHASE_BTREE_freespace,
+ GC_PHASE_BTREE_need_discard,
+ GC_PHASE_BTREE_backpointers,
GC_PHASE_PENDING_DELETE,
};
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for ptr_stale():
*/
- struct bucket_array __rcu *buckets[2];
+ struct bucket_array __rcu *buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
/* Allocator: */
u64 new_fs_bucket_idx;
- struct task_struct __rcu *alloc_thread;
+ u64 bucket_alloc_trans_early_cursor;
- /*
- * free: Buckets that are ready to be used
- *
- * free_inc: Incoming buckets - these are buckets that currently have
- * cached data in them, and we can't reuse them until after we write
- * their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
- */
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
unsigned nr_open_buckets;
+ unsigned nr_btree_reserve;
open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
open_bucket_idx_t open_buckets_partial_nr;
- size_t fifo_last_bucket;
-
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
- enum allocator_states allocator_state;
-
- alloc_heap alloc_heap;
-
atomic64_t rebalance_work;
struct journal_device journal;
enum {
/* startup: */
- BCH_FS_INITIALIZED,
- BCH_FS_ALLOC_READ_DONE,
- BCH_FS_ALLOC_CLEAN,
- BCH_FS_ALLOCATOR_RUNNING,
- BCH_FS_ALLOCATOR_STOPPING,
- BCH_FS_INITIAL_GC_DONE,
- BCH_FS_INITIAL_GC_UNFIXED,
- BCH_FS_TOPOLOGY_REPAIR_DONE,
- BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
+ BCH_FS_MAY_GO_RW,
BCH_FS_RW,
BCH_FS_WAS_RW,
BCH_FS_STOPPING,
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
+ BCH_FS_CLEAN_SHUTDOWN,
+
+ /* fsck passes: */
+ BCH_FS_TOPOLOGY_REPAIR_DONE,
+ BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */
+ BCH_FS_CHECK_LRUS_DONE,
+ BCH_FS_CHECK_BACKPOINTERS_DONE,
+ BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE,
+ BCH_FS_FSCK_DONE,
+ BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
+ BCH_FS_NEED_ANOTHER_GC,
+
+ BCH_FS_HAVE_DELETED_SNAPSHOTS,
/* errors: */
BCH_FS_ERROR,
BCH_FS_TOPOLOGY_ERROR,
BCH_FS_ERRORS_FIXED,
BCH_FS_ERRORS_NOT_FIXED,
-
- /* misc: */
- BCH_FS_NEED_ANOTHER_GC,
- BCH_FS_DELETED_NODES,
- BCH_FS_REBUILD_REPLICAS,
- BCH_FS_HOLD_BTREE_WRITES,
};
struct btree_debug {
unsigned id;
- struct dentry *btree;
- struct dentry *btree_format;
- struct dentry *failed;
+};
+
+#define BCH_TRANSACTIONS_NR 128
+
+struct btree_transaction_stats {
+ struct mutex lock;
+ struct time_stats lock_hold_times;
+ unsigned nr_max_paths;
+ unsigned max_mem;
+ char *max_paths_text;
};
struct bch_fs_pcpu {
struct journal_keys {
struct journal_key {
+ u64 journal_seq;
+ u32 journal_offset;
enum btree_id btree_id:8;
unsigned level:8;
bool allocated;
bool overwritten;
struct bkey_i *k;
- u32 journal_seq;
- u32 journal_offset;
} *d;
+ /*
+ * Gap buffer: instead of all the empty space in the array being at the
+ * end of the buffer - from @nr to @size - the empty space is at @gap.
+ * This means that sequential insertions are O(n) instead of O(n^2).
+ */
+ size_t gap;
size_t nr;
size_t size;
- u64 journal_seq_base;
};
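To illustrate the gap buffer layout described in the comment above, here is a
standalone sketch with made-up names and an int element type (not the bcachefs
implementation): insertion first slides the empty region to the insertion
point, so a run of nearly sequential insertions moves almost nothing.

#include <stddef.h>
#include <string.h>

struct gap_buf {
	int	*d;	/* backing array with @size slots */
	size_t	gap;	/* start of the empty region */
	size_t	nr;	/* number of live elements */
	size_t	size;	/* total capacity */
};

/* Slide the empty region so it starts at logical position @pos: */
static void gap_buf_move_gap(struct gap_buf *b, size_t pos)
{
	size_t gap_size = b->size - b->nr;

	if (pos < b->gap)
		memmove(b->d + pos + gap_size, b->d + pos,
			(b->gap - pos) * sizeof(*b->d));
	else if (pos > b->gap)
		memmove(b->d + b->gap, b->d + b->gap + gap_size,
			(pos - b->gap) * sizeof(*b->d));
	b->gap = pos;
}

/* Insert @v at logical position @pos; caller ensures b->nr < b->size: */
static void gap_buf_insert(struct gap_buf *b, size_t pos, int v)
{
	gap_buf_move_gap(b, pos);
	b->d[b->gap++] = v;
	b->nr++;
}
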
struct btree_path_buf {
struct list_head list;
struct kobject kobj;
+ struct kobject counters_kobj;
struct kobject internal;
struct kobject opts_dir;
struct kobject time_stats;
struct mutex snapshot_table_lock;
struct work_struct snapshot_delete_work;
struct work_struct snapshot_wait_for_pagecache_and_delete_work;
- struct snapshot_id_list snapshots_unlinked;
+ snapshot_id_list snapshots_unlinked;
struct mutex snapshots_unlinked_lock;
/* BTREE CACHE */
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
+ struct work_struct discard_work;
+ struct work_struct invalidate_work;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
struct mutex gc_gens_lock;
/* IO PATH */
- struct semaphore io_in_flight;
struct bio_set bio_read;
struct bio_set bio_read_split;
struct bio_set bio_write;
copygc_heap copygc_heap;
struct write_point copygc_write_point;
s64 copygc_wait;
+ bool copygc_running;
+ wait_queue_head_t copygc_running_wq;
/* DATA PROGRESS STATS */
struct list_head data_progress_list;
struct bch_memquota_type quotas[QTYP_NR];
/* DEBUG JUNK */
- struct dentry *debug;
+ struct dentry *fs_debug_dir;
+ struct dentry *btree_debug_dir;
struct btree_debug btree_debug[BTREE_ID_NR];
struct btree *verify_data;
struct btree_node *verify_ondisk;
mempool_t btree_bounce_pool;
struct journal journal;
- struct list_head journal_entries;
+ GENRADIX(struct journal_replay *) journal_entries;
+ u64 journal_entries_base_seq;
struct journal_keys journal_keys;
struct list_head journal_iters;
u64 last_bucket_seq_cleanup;
- /* The rest of this all shows up in sysfs */
- atomic_long_t read_realloc_races;
- atomic_long_t extent_migrate_done;
- atomic_long_t extent_migrate_raced;
+ u64 counters_on_mount[BCH_COUNTER_NR];
+ u64 __percpu *counters;
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
struct time_stats times[BCH_TIME_STAT_NR];
+
+ struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
};
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
#include <linux/uuid.h>
#include "vstructs.h"
+#define BITMASK(name, type, field, offset, end) \
+static const unsigned name##_OFFSET = offset; \
+static const unsigned name##_BITS = (end - offset); \
+ \
+static inline __u64 name(const type *k) \
+{ \
+ return (k->field >> offset) & ~(~0ULL << (end - offset)); \
+} \
+ \
+static inline void SET_##name(type *k, __u64 v) \
+{ \
+ k->field &= ~(~(~0ULL << (end - offset)) << offset); \
+ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
+}
+
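For instance, BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1),
used further down in this patch, expands to roughly the following accessors over
bits [0, 1) of the flags field (host byte order, unlike LE_BITMASK below, which
handles little-endian on-disk fields):

static const unsigned BCH_ALLOC_V4_NEED_DISCARD_OFFSET	= 0;
static const unsigned BCH_ALLOC_V4_NEED_DISCARD_BITS	= 1;

static inline __u64 BCH_ALLOC_V4_NEED_DISCARD(const struct bch_alloc_v4 *k)
{
	return (k->flags >> 0) & ~(~0ULL << 1);
}

static inline void SET_BCH_ALLOC_V4_NEED_DISCARD(struct bch_alloc_v4 *k, __u64 v)
{
	k->flags &= ~(~(~0ULL << 1) << 0);
	k->flags |= (v & ~(~0ULL << 1)) << 0;
}
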
#define LE_BITMASK(_bits, name, type, field, offset, end) \
static const unsigned name##_OFFSET = offset; \
static const unsigned name##_BITS = (end - offset); \
* number.
*
* - WHITEOUT: for hash table btrees
-*/
+ */
#define BCH_BKEY_TYPES() \
x(deleted, 0) \
x(whiteout, 1) \
x(subvolume, 21) \
x(snapshot, 22) \
x(inode_v2, 23) \
- x(alloc_v3, 24)
+ x(alloc_v3, 24) \
+ x(set, 25) \
+ x(lru, 26) \
+ x(alloc_v4, 27) \
+ x(backpointer, 28) \
+ x(inode_v3, 29)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
struct bch_val v;
};
+struct bch_set {
+ struct bch_val v;
+};
+
/* Extents */
/*
struct bch_btree_ptr {
struct bch_val v;
- struct bch_extent_ptr start[0];
__u64 _data[0];
+ struct bch_extent_ptr start[];
} __attribute__((packed, aligned(8)));
struct bch_btree_ptr_v2 {
__le16 sectors_written;
__le16 flags;
struct bpos min_key;
- struct bch_extent_ptr start[0];
__u64 _data[0];
+ struct bch_extent_ptr start[];
} __attribute__((packed, aligned(8)));
LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
struct bch_extent {
struct bch_val v;
- union bch_extent_entry start[0];
__u64 _data[0];
+ union bch_extent_entry start[];
} __attribute__((packed, aligned(8)));
struct bch_reservation {
__u8 fields[0];
} __attribute__((packed, aligned(8)));
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[0];
+} __attribute__((packed, aligned(8)));
+
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(u64))
+
struct bch_inode_generation {
struct bch_val v;
* bi_subvol and bi_parent_subvol are only set for subvolume roots:
*/
-#define BCH_INODE_FIELDS() \
+#define BCH_INODE_FIELDS_v2() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
x(bi_mtime, 96) \
x(bi_subvol, 32) \
x(bi_parent_subvol, 32)
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
+
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
x(data_checksum, 8) \
* User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
* flags)
*/
- __BCH_INODE_SYNC = 0,
- __BCH_INODE_IMMUTABLE = 1,
- __BCH_INODE_APPEND = 2,
- __BCH_INODE_NODUMP = 3,
- __BCH_INODE_NOATIME = 4,
+ __BCH_INODE_SYNC = 0,
+ __BCH_INODE_IMMUTABLE = 1,
+ __BCH_INODE_APPEND = 2,
+ __BCH_INODE_NODUMP = 3,
+ __BCH_INODE_NOATIME = 4,
- __BCH_INODE_I_SIZE_DIRTY= 5,
- __BCH_INODE_I_SECTORS_DIRTY= 6,
- __BCH_INODE_UNLINKED = 7,
- __BCH_INODE_BACKPTR_UNTRUSTED = 8,
+ __BCH_INODE_I_SIZE_DIRTY = 5,
+ __BCH_INODE_I_SECTORS_DIRTY = 6,
+ __BCH_INODE_UNLINKED = 7,
+ __BCH_INODE_BACKPTR_UNTRUSTED = 8,
/* bits 20+ reserved for packed fields below: */
};
LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
/* Dirents */
/*
#define DT_SUBVOL 16
#define BCH_DT_MAX 17
-#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
+#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(u64) - \
sizeof(struct bkey) - \
- offsetof(struct bch_dirent, d_name))
-
+ offsetof(struct bch_dirent, d_name)))
/* Xattrs */
x(stripe, 32) \
x(stripe_redundancy, 8)
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
struct bch_alloc_v2 {
struct bch_val v;
__u8 nr_fields;
#define BCH_ALLOC_FIELDS_V2() \
x(read_time, 64) \
x(write_time, 64) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
x(stripe, 32) \
x(stripe_redundancy, 8)
__u8 data[];
} __attribute__((packed, aligned(8)));
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
- BCH_ALLOC_FIELD_NR
-};
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+} __attribute__((packed, aligned(8)));
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
+
+struct bch_backpointer {
+ struct bch_val v;
+ __u8 btree_id;
+ __u8 level;
+ __u8 data_type;
+ __u64 bucket_offset:40;
+ __u32 bucket_len;
+ struct bpos pos;
+} __attribute__((packed, aligned(8)));
/* Quotas: */
__u8 csum_type;
__u8 pad;
- struct bch_extent_ptr ptrs[0];
+ struct bch_extent_ptr ptrs[];
} __attribute__((packed, aligned(8)));
/* Reflink: */
/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+/* LRU btree: */
+
+struct bch_lru {
+ struct bch_val v;
+ __le64 idx;
+} __attribute__((packed, aligned(8)));
+
+#define LRU_ID_STRIPES (1U << 16)
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
__le32 type;
};
-#define BCH_SB_FIELDS() \
- x(journal, 0) \
- x(members, 1) \
- x(crypt, 2) \
- x(replicas_v0, 3) \
- x(quota, 4) \
- x(disk_groups, 5) \
- x(clean, 6) \
- x(replicas, 7) \
- x(journal_seq_blacklist, 8)
+#define BCH_SB_FIELDS() \
+ x(journal, 0) \
+ x(members, 1) \
+ x(crypt, 2) \
+ x(replicas_v0, 3) \
+ x(quota, 4) \
+ x(disk_groups, 5) \
+ x(clean, 6) \
+ x(replicas, 7) \
+ x(journal_seq_blacklist, 8) \
+ x(journal_v2, 9) \
+ x(counters, 10)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
BCH_SB_FIELD_NR
};
+/*
+ * Most superblock fields are replicated in all device's superblocks - a few are
+ * not:
+ */
+#define BCH_SINGLE_DEVICE_SB_FIELDS \
+ ((1U << BCH_SB_FIELD_journal)| \
+ (1U << BCH_SB_FIELD_journal_v2))
+
/* BCH_SB_FIELD_journal: */
struct bch_sb_field_journal {
__le64 buckets[0];
};
+struct bch_sb_field_journal_v2 {
+ struct bch_sb_field field;
+
+ struct bch_sb_field_journal_v2_entry {
+ __le64 start;
+ __le64 nr;
+ } d[0];
+};
+
/* BCH_SB_FIELD_members: */
#define BCH_MIN_NR_NBUCKETS (1 << 6)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+ struct bch_member, flags[0], 30, 31)
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
/* BCH_SB_FIELD_replicas: */
#define BCH_DATA_TYPES() \
- x(none, 0) \
+ x(free, 0) \
x(sb, 1) \
x(journal, 2) \
x(btree, 3) \
x(user, 4) \
x(cached, 5) \
- x(parity, 6)
+ x(parity, 6) \
+ x(stripe, 7) \
+ x(need_gc_gens, 8) \
+ x(need_discard, 9)
enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
BCH_DATA_NR
};
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ return true;
+ default:
+ return false;
+ }
+}
+
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
- __u8 devs[0];
+ __u8 devs[];
} __attribute__((packed));
struct bch_sb_field_replicas_v0 {
struct bch_sb_field field;
- struct bch_replicas_entry_v0 entries[0];
+ struct bch_replicas_entry_v0 entries[];
} __attribute__((packed, aligned(8)));
struct bch_replicas_entry {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
- __u8 devs[0];
+ __u8 devs[];
} __attribute__((packed));
#define replicas_entry_bytes(_i) \
struct bch_disk_group entries[0];
} __attribute__((packed, aligned(8)));
+/* BCH_SB_FIELD_counters */
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_race, 38) \
+ x(move_extent_alloc_mem_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
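Note that this expansion ignores the numeric argument - enumerator values are
assigned by list position - and the same x-macro list can be reused to generate
other tables. A hypothetical example (not part of this patch) producing
printable counter names:

static const char * const example_counter_names[] = {
#define x(t, n, ...)	#t,
	BCH_PERSISTENT_COUNTERS()
#undef x
	NULL
};
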
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[0];
+};
+
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
#define BCH_JSET_VERSION_OLD 2
#define BCH_BSET_VERSION_OLD 3
+#define BCH_METADATA_VERSIONS() \
+ x(bkey_renumber, 10) \
+ x(inode_btree_change, 11) \
+ x(snapshot, 12) \
+ x(inode_backpointers, 13) \
+ x(btree_ptr_sectors_written, 14) \
+ x(snapshot_2, 15) \
+ x(reflink_p_fix, 16) \
+ x(subvol_dirent, 17) \
+ x(inode_v2, 18) \
+ x(freespace, 19) \
+ x(alloc_v4, 20) \
+ x(new_data_types, 21) \
+ x(backpointers, 22) \
+ x(inode_v3, 23)
+
enum bcachefs_metadata_version {
- bcachefs_metadata_version_min = 9,
- bcachefs_metadata_version_new_versioning = 10,
- bcachefs_metadata_version_bkey_renumber = 10,
- bcachefs_metadata_version_inode_btree_change = 11,
- bcachefs_metadata_version_snapshot = 12,
- bcachefs_metadata_version_inode_backpointers = 13,
- bcachefs_metadata_version_btree_ptr_sectors_written = 14,
- bcachefs_metadata_version_snapshot_2 = 15,
- bcachefs_metadata_version_reflink_p_fix = 16,
- bcachefs_metadata_version_subvol_dirent = 17,
- bcachefs_metadata_version_inode_v2 = 18,
- bcachefs_metadata_version_max = 19,
+ bcachefs_metadata_version_min = 9,
+#define x(t, n) bcachefs_metadata_version_##t = n,
+ BCH_METADATA_VERSIONS()
+#undef x
+ bcachefs_metadata_version_max
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+/* Obsolete, always enabled: */
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
/*
x(data_usage, 6) \
x(clock, 7) \
x(dev_usage, 8) \
- x(log, 9)
+ x(log, 9) \
+ x(overwrite, 10)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
__u32 pad;
__le64 buckets_ec;
- __le64 buckets_unavailable;
+ __le64 _buckets_unavailable; /* No longer used */
struct jset_entry_dev_usage_type d[];
} __attribute__((packed));
x(stripes, 6) \
x(reflink, 7) \
x(subvolumes, 8) \
- x(snapshots, 9)
+ x(snapshots, 9) \
+ x(lru, 10) \
+ x(freespace, 11) \
+ x(need_discard, 12) \
+ x(backpointers, 13)
enum btree_id {
#define x(kwd, val) BTREE_ID_##kwd = val,
__u32 bucket_size;
__u64 nr_buckets;
- __u64 available_buckets;
- __u64 buckets[BCH_DATA_NR];
- __u64 sectors[BCH_DATA_NR];
+ __u64 buckets_ec;
- __u64 ec_buckets;
- __u64 ec_sectors;
+ struct bch_ioctl_dev_usage_type {
+ __u64 buckets;
+ __u64 sectors;
+ __u64 fragmented;
+ } d[BCH_DATA_NR];
};
/*
#include "bcachefs.h"
#include "bkey.h"
+#include "bkey_cmp.h"
#include "bkey_methods.h"
#include "bset.h"
#include "util.h"
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
const struct bkey_packed *);
-void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits)
+void bch2_bkey_packed_to_binary_text(struct printbuf *out,
+ const struct bkey_format *f,
+ const struct bkey_packed *k)
{
- unsigned bit = high_bit_offset, done = 0;
+ const u64 *p = high_word(f, k);
+ unsigned word_bits = 64 - high_bit_offset;
+ unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
+ u64 v = *p & (~0ULL >> high_bit_offset);
+
+ if (!nr_key_bits) {
+ prt_str(out, "(empty)");
+ return;
+ }
while (1) {
- while (bit < 64) {
- if (done && !(done % 8))
- *out++ = ' ';
- *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
- bit++;
- done++;
- if (done == nr_bits) {
- *out++ = '\0';
- return;
- }
+ unsigned next_key_bits = nr_key_bits;
+
+ if (nr_key_bits < 64) {
+ v >>= 64 - nr_key_bits;
+ next_key_bits = 0;
+ } else {
+ next_key_bits -= 64;
}
+ bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+
+ if (!next_key_bits)
+ break;
+
+ prt_char(out, ' ');
+
p = next_word(p);
- bit = 0;
+ v = *p;
+ word_bits = 64;
+ nr_key_bits = next_key_bits;
}
}
#ifdef CONFIG_BCACHEFS_DEBUG
static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
{
struct bkey tmp;
tmp = __bch2_bkey_unpack_key(format, packed);
if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
- char buf1[160], buf2[160];
- char buf3[160], buf4[160];
-
- bch2_bkey_to_text(&PBUF(buf1), unpacked);
- bch2_bkey_to_text(&PBUF(buf2), &tmp);
- bch2_to_binary(buf3, (void *) unpacked, 80);
- bch2_to_binary(buf4, high_word(format, packed), 80);
+ struct printbuf buf = PRINTBUF;
- panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
+ prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
format->key_u64s,
format->bits_per_field[0],
format->bits_per_field[1],
format->bits_per_field[2],
format->bits_per_field[3],
- format->bits_per_field[4],
- buf1, buf2, buf3, buf4);
+ format->bits_per_field[4]);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_to_text(&buf, unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_to_text(&buf, &tmp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) &tmp);
+ prt_newline(&buf);
+
+ panic("%s", buf.buf);
}
}
{
struct pack_state out_s = pack_state_init(out_f, out);
struct unpack_state in_s = unpack_state_init(in_f, in);
+ u64 *w = out->_data;
unsigned i;
- out->_data[0] = 0;
+ *w = 0;
for (i = 0; i < BKEY_NR_FIELDS; i++)
if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
const struct bkey_format *format)
{
struct pack_state state = pack_state_init(format, out);
+ u64 *w = out->_data;
EBUG_ON((void *) in == (void *) out);
EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
EBUG_ON(in->format != KEY_FORMAT_CURRENT);
- out->_data[0] = 0;
+ *w = 0;
#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
bkey_fields()
{
const struct bkey_format *f = &b->format;
struct pack_state state = pack_state_init(f, out);
+ u64 *w = out->_data;
#ifdef CONFIG_BCACHEFS_DEBUG
struct bpos orig = in;
#endif
* enough - we need to make sure to zero them out:
*/
for (i = 0; i < f->key_u64s; i++)
- out->_data[i] = 0;
+ w[i] = 0;
if (unlikely(in.snapshot <
le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
#ifdef CONFIG_X86_64
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- long d0, d1, d2, d3;
- int cmp;
-
- /* we shouldn't need asm for this, but gcc is being retarded: */
-
- asm(".intel_syntax noprefix;"
- "xor eax, eax;"
- "xor edx, edx;"
- "1:;"
- "mov r8, [rdi];"
- "mov r9, [rsi];"
- "sub ecx, 64;"
- "jl 2f;"
-
- "cmp r8, r9;"
- "jnz 3f;"
-
- "lea rdi, [rdi - 8];"
- "lea rsi, [rsi - 8];"
- "jmp 1b;"
-
- "2:;"
- "not ecx;"
- "shr r8, 1;"
- "shr r9, 1;"
- "shr r8, cl;"
- "shr r9, cl;"
- "cmp r8, r9;"
-
- "3:\n"
- "seta al;"
- "setb dl;"
- "sub eax, edx;"
- ".att_syntax prefix;"
- : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
- : "0" (l), "1" (r), "3" (nr_key_bits)
- : "r8", "r9", "cc", "memory");
-
- return cmp;
-}
-
#define I(_x) (*(out)++ = (_x))
#define I1(i0) I(i0)
#define I2(i0, i1) (I1(i0), I(i1))
}
#else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- u64 l_v, r_v;
-
- if (!nr_key_bits)
- return 0;
-
- /* for big endian, skip past header */
- nr_key_bits += high_bit_offset;
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (1) {
- if (nr_key_bits < 64) {
- l_v >>= 64 - nr_key_bits;
- r_v >>= 64 - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= 64;
- }
-
- if (!nr_key_bits || l_v != r_v)
- break;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- }
-
- return cmp_int(l_v, r_v);
-}
#endif
__pure
const struct bkey_packed *r,
const struct btree *b)
{
- const struct bkey_format *f = &b->format;
- int ret;
-
- EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- ret = __bkey_cmp_bits(high_word(f, l),
- high_word(f, r),
- b->nr_key_bits);
-
- EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
- bkey_unpack_pos(b, r)));
- return ret;
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
}
__pure __flatten
const struct bkey_packed *l,
const struct bkey_packed *r)
{
- struct bkey unpacked;
-
- if (likely(bkey_packed(l) && bkey_packed(r)))
- return __bch2_bkey_cmp_packed_format_checked(l, r, b);
-
- if (bkey_packed(l)) {
- __bkey_unpack_key_format_checked(b, &unpacked, l);
- l = (void*) &unpacked;
- } else if (bkey_packed(r)) {
- __bkey_unpack_key_format_checked(b, &unpacked, r);
- r = (void*) &unpacked;
- }
-
- return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+ return bch2_bkey_cmp_packed_inlined(b, l, r);
}
__pure __flatten
#include <linux/bug.h>
#include "bcachefs_format.h"
+#include "btree_types.h"
#include "util.h"
#include "vstructs.h"
#define HAVE_BCACHEFS_COMPILED_UNPACK 1
#endif
-void bch2_to_binary(char *, const u64 *, unsigned);
+void bch2_bkey_packed_to_binary_text(struct printbuf *,
+ const struct bkey_format *,
+ const struct bkey_packed *);
/* bkey with split value, const */
struct bkey_s_c {
static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
{
- k->u64s = BKEY_U64s + val_u64s;
+ unsigned u64s = BKEY_U64s + val_u64s;
+
+ BUG_ON(u64s > U8_MAX);
+ k->u64s = u64s;
}
static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
{
- k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
}
#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
}
/*
- * we prefer to pass bpos by ref, but it's often enough terribly convenient to
- * pass it by by val... as much as I hate c++, const ref would be nice here:
+ * The compiler generates better code when we pass bpos by ref, but it's often
+ * enough terribly convenient to pass it by val... as much as I hate c++, const
+ * ref would be nice here:
*/
__pure __flatten
static inline int bkey_cmp_left_packed_byval(const struct btree *b,
bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
const struct bkey_format *);
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
+ compiled_unpack_fn unpack_fn = b->aux_data;
+ unpack_fn(dst, src);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ bch2_expensive_debug_checks) {
+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+
+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
+ }
+ } else {
+ *dst = __bch2_bkey_unpack_key(&b->format, src);
+ }
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ struct bkey dst;
+
+ __bkey_unpack_key_format_checked(b, &dst, src);
+ return dst;
+}
+
+static inline void __bkey_unpack_key(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (likely(bkey_packed(src)))
+ __bkey_unpack_key_format_checked(b, dst, src);
+ else
+ *dst = *packed_to_bkey_c(src);
+}
+
+/**
+ * bkey_unpack_key -- unpack just the key, not the value
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_key_format_checked(b, src)
+ : *packed_to_bkey_c(src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+ return bkey_unpack_key_format_checked(b, src).p;
+#else
+ return __bkey_unpack_pos(&b->format, src);
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_pos_format_checked(b, src)
+ : packed_to_bkey_c(src)->p;
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(struct btree *b,
+ const struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(struct btree *b,
+ struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
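A note on lifetimes, since it is easy to get wrong: the returned bkey_s_c's .k
points at the caller-supplied struct bkey while .v points into the packed key
in the node, so both must outlive the result. A hypothetical caller
(illustrative only):

static struct bpos example_key_pos(struct btree *b, const struct bkey_packed *_k)
{
	struct bkey uk;					/* backs k.k */
	struct bkey_s_c k = bkey_disassemble(b, _k, &uk);

	return k.k->p;	/* fine: uk is still live here */
}
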
static inline u64 bkey_field_max(const struct bkey_format *f,
enum bch_bkey_fields nr)
{
#define _BCACHEFS_BKEY_BUF_H
#include "bcachefs.h"
+#include "bkey.h"
struct bkey_buf {
struct bkey_i *k;
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_CMP_H
+#define _BCACHEFS_BKEY_CMP_H
+
+#include "bkey.h"
+
+#ifdef CONFIG_X86_64
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ long d0, d1, d2, d3;
+ int cmp;
+
+ /* we shouldn't need asm for this, but gcc is being retarded: */
+
+ asm(".intel_syntax noprefix;"
+ "xor eax, eax;"
+ "xor edx, edx;"
+ "1:;"
+ "mov r8, [rdi];"
+ "mov r9, [rsi];"
+ "sub ecx, 64;"
+ "jl 2f;"
+
+ "cmp r8, r9;"
+ "jnz 3f;"
+
+ "lea rdi, [rdi - 8];"
+ "lea rsi, [rsi - 8];"
+ "jmp 1b;"
+
+ "2:;"
+ "not ecx;"
+ "shr r8, 1;"
+ "shr r9, 1;"
+ "shr r8, cl;"
+ "shr r9, cl;"
+ "cmp r8, r9;"
+
+ "3:\n"
+ "seta al;"
+ "setb dl;"
+ "sub eax, edx;"
+ ".att_syntax prefix;"
+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+ : "0" (l), "1" (r), "3" (nr_key_bits)
+ : "r8", "r9", "cc", "memory");
+
+ return cmp;
+}
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ u64 l_v, r_v;
+
+ if (!nr_key_bits)
+ return 0;
+
+ /* for big endian, skip past header */
+ nr_key_bits += high_bit_offset;
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (1) {
+ if (nr_key_bits < 64) {
+ l_v >>= 64 - nr_key_bits;
+ r_v >>= 64 - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= 64;
+ }
+
+ if (!nr_key_bits || l_v != r_v)
+ break;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ }
+
+ return cmp_int(l_v, r_v);
+}
+#endif
+
+static inline __pure __flatten
+int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ int ret;
+
+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ ret = __bkey_cmp_bits(high_word(f, l),
+ high_word(f, r),
+ b->nr_key_bits);
+
+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
+ bkey_unpack_pos(b, r)));
+ return ret;
+}
+
+static inline __pure __flatten
+int bch2_bkey_cmp_packed_inlined(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ struct bkey unpacked;
+
+ if (likely(bkey_packed(l) && bkey_packed(r)))
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
+
+ if (bkey_packed(l)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, l);
+ l = (void *) &unpacked;
+ } else if (bkey_packed(r)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, r);
+ r = (void *) &unpacked;
+ }
+
+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+}
+
+#endif /* _BCACHEFS_BKEY_CMP_H */
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "backpointers.h"
#include "bkey_methods.h"
#include "btree_types.h"
#include "alloc_background.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
+#include "lru.h"
#include "quota.h"
#include "reflink.h"
#include "subvolume.h"
NULL
};
-static const char *deleted_key_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- return NULL;
+ return 0;
}
#define bch2_bkey_ops_deleted (struct bkey_ops) { \
.key_invalid = deleted_key_invalid, \
}
-static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- if (bkey_val_bytes(k.k))
- return "value size should be zero";
+ if (bkey_val_bytes(k.k)) {
+ prt_printf(err, "incorrect value size (%zu != 0)",
+ bkey_val_bytes(k.k));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
#define bch2_bkey_ops_error (struct bkey_ops) { \
.key_invalid = empty_val_key_invalid, \
}
-static const char *key_type_cookie_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_cookie));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
#define bch2_bkey_ops_cookie (struct bkey_ops) { \
.key_invalid = empty_val_key_invalid, \
}
-static const char *key_type_inline_data_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- return NULL;
+ return 0;
}
static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
unsigned datalen = bkey_inline_data_bytes(k.k);
- pr_buf(out, "datalen %u: %*phN",
+ prt_printf(out, "datalen %u: %*phN",
datalen, min(datalen, 32U), d.v->data);
}
.val_to_text = key_type_inline_data_to_text, \
}
+static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
+{
+ if (bkey_val_bytes(k.k)) {
+		prt_printf(err, "incorrect value size (%zu != 0)",
+		       bkey_val_bytes(k.k));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
+
+#define bch2_bkey_ops_set (struct bkey_ops) { \
+ .key_invalid = key_type_set_invalid, \
+ .key_merge = key_type_set_merge, \
+}
+
const struct bkey_ops bch2_bkey_ops[] = {
#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
BCH_BKEY_TYPES()
#undef x
};
-const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
+int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- if (k.k->type >= KEY_TYPE_MAX)
- return "invalid type";
+ if (k.k->type >= KEY_TYPE_MAX) {
+ prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX);
+ return -EINVAL;
+ }
- return bch2_bkey_ops[k.k->type].key_invalid(c, k);
+ return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err);
}
static unsigned bch2_key_types_allowed[] = {
(1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_inode)|
(1U << KEY_TYPE_inode_v2)|
+ (1U << KEY_TYPE_inode_v3)|
(1U << KEY_TYPE_inode_generation),
[BKEY_TYPE_dirents] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_alloc)|
(1U << KEY_TYPE_alloc_v2)|
- (1U << KEY_TYPE_alloc_v3),
+ (1U << KEY_TYPE_alloc_v3)|
+ (1U << KEY_TYPE_alloc_v4),
[BKEY_TYPE_quotas] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_quota),
[BKEY_TYPE_snapshots] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_snapshot),
+ [BKEY_TYPE_lru] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_lru),
+ [BKEY_TYPE_freespace] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_set),
+ [BKEY_TYPE_need_discard] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_set),
+ [BKEY_TYPE_backpointers] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_backpointer),
[BKEY_TYPE_btree] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_btree_ptr)|
(1U << KEY_TYPE_btree_ptr_v2),
};
-const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum btree_node_type type)
+int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum btree_node_type type,
+ int rw, struct printbuf *err)
{
- if (k.k->u64s < BKEY_U64s)
- return "u64s too small";
-
- if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
- return "invalid key type for this btree";
+ if (k.k->u64s < BKEY_U64s) {
+ prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
+ return -EINVAL;
+ }
- if (type == BKEY_TYPE_btree &&
- bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
- return "value too big";
+ if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) {
+ prt_printf(err, "invalid key type for btree %s (%s)",
+			   bch2_btree_ids[type], bch2_bkey_types[k.k->type]);
+ return -EINVAL;
+ }
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
- if (k.k->size == 0)
- return "bad size field";
+ if (k.k->size == 0) {
+ prt_printf(err, "size == 0");
+ return -EINVAL;
+ }
- if (k.k->size > k.k->p.offset)
- return "size greater than offset";
+ if (k.k->size > k.k->p.offset) {
+ prt_printf(err, "size greater than offset (%u > %llu)",
+ k.k->size, k.k->p.offset);
+ return -EINVAL;
+ }
} else {
- if (k.k->size)
- return "nonzero size field";
+ if (k.k->size) {
+ prt_printf(err, "size != 0");
+ return -EINVAL;
+ }
}
if (type != BKEY_TYPE_btree &&
!btree_type_has_snapshots(type) &&
- k.k->p.snapshot)
- return "nonzero snapshot";
+ k.k->p.snapshot) {
+ prt_printf(err, "nonzero snapshot");
+ return -EINVAL;
+ }
if (type != BKEY_TYPE_btree &&
btree_type_has_snapshots(type) &&
- !k.k->p.snapshot)
- return "invalid snapshot field";
+ !k.k->p.snapshot) {
+ prt_printf(err, "snapshot == 0");
+ return -EINVAL;
+ }
if (type != BKEY_TYPE_btree &&
- !bkey_cmp(k.k->p, POS_MAX))
- return "POS_MAX key";
+ !bkey_cmp(k.k->p, POS_MAX)) {
+ prt_printf(err, "key at POS_MAX");
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
-const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum btree_node_type type)
+int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum btree_node_type type,
+ int rw, struct printbuf *err)
{
- return __bch2_bkey_invalid(c, k, type) ?:
- bch2_bkey_val_invalid(c, k);
+ return __bch2_bkey_invalid(c, k, type, rw, err) ?:
+ bch2_bkey_val_invalid(c, k, rw, err);
}
-const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
+int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k,
+ struct printbuf *err)
{
- if (bpos_cmp(k.k->p, b->data->min_key) < 0)
- return "key before start of btree node";
+ if (bpos_cmp(k.k->p, b->data->min_key) < 0) {
+ prt_printf(err, "key before start of btree node");
+ return -EINVAL;
+ }
- if (bpos_cmp(k.k->p, b->data->max_key) > 0)
- return "key past end of btree node";
+ if (bpos_cmp(k.k->p, b->data->max_key) > 0) {
+ prt_printf(err, "key past end of btree node");
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
{
if (!bpos_cmp(pos, POS_MIN))
- pr_buf(out, "POS_MIN");
+ prt_printf(out, "POS_MIN");
else if (!bpos_cmp(pos, POS_MAX))
- pr_buf(out, "POS_MAX");
+ prt_printf(out, "POS_MAX");
else if (!bpos_cmp(pos, SPOS_MAX))
- pr_buf(out, "SPOS_MAX");
+ prt_printf(out, "SPOS_MAX");
else {
if (pos.inode == U64_MAX)
- pr_buf(out, "U64_MAX");
+ prt_printf(out, "U64_MAX");
else
- pr_buf(out, "%llu", pos.inode);
- pr_buf(out, ":");
+ prt_printf(out, "%llu", pos.inode);
+ prt_printf(out, ":");
if (pos.offset == U64_MAX)
- pr_buf(out, "U64_MAX");
+ prt_printf(out, "U64_MAX");
else
- pr_buf(out, "%llu", pos.offset);
- pr_buf(out, ":");
+ prt_printf(out, "%llu", pos.offset);
+ prt_printf(out, ":");
if (pos.snapshot == U32_MAX)
- pr_buf(out, "U32_MAX");
+ prt_printf(out, "U32_MAX");
else
- pr_buf(out, "%u", pos.snapshot);
+ prt_printf(out, "%u", pos.snapshot);
}
}
void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
{
if (k) {
- pr_buf(out, "u64s %u type ", k->u64s);
+ prt_printf(out, "u64s %u type ", k->u64s);
if (k->type < KEY_TYPE_MAX)
- pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+ prt_printf(out, "%s ", bch2_bkey_types[k->type]);
else
- pr_buf(out, "%u ", k->type);
+ prt_printf(out, "%u ", k->type);
bch2_bpos_to_text(out, k->p);
- pr_buf(out, " len %u ver %llu", k->size, k->version.lo);
+ prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
} else {
- pr_buf(out, "(null)");
+ prt_printf(out, "(null)");
}
}
if (likely(ops->val_to_text))
ops->val_to_text(out, c, k);
} else {
- pr_buf(out, "(invalid type %u)", k.k->type);
+ prt_printf(out, "(invalid type %u)", k.k->type);
}
}
bch2_bkey_to_text(out, k.k);
if (bkey_val_bytes(k.k)) {
- pr_buf(out, ": ");
+ prt_printf(out, ": ");
bch2_val_to_text(out, c, k);
}
}
struct bch_fs;
struct btree;
+struct btree_trans;
struct bkey;
enum btree_node_type;
extern const char * const bch2_bkey_types[];
+/*
+ * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
+ * invalid, entire key will be deleted.
+ *
+ * When invalid, error string is returned via @err. @rw indicates whether key is
+ * being read or written; more aggressive checks can be enabled when rw == WRITE.
+ */
struct bkey_ops {
- /* Returns reason for being invalid if invalid, else NULL: */
- const char * (*key_invalid)(const struct bch_fs *,
- struct bkey_s_c);
+ int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+ int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+ int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c,
+ struct bkey_s_c, unsigned);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
extern const struct bkey_ops bch2_bkey_ops[];
-const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c);
-const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
- enum btree_node_type);
-const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
- enum btree_node_type);
-const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+ enum btree_node_type, int, struct printbuf *);
+int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+ enum btree_node_type, int, struct printbuf *);
+int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *);
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+static inline int bch2_mark_key(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
+ unsigned flags)
+{
+ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type];
+
+ return ops->atomic_trigger
+ ? ops->atomic_trigger(trans, old, new, flags)
+ : 0;
+}
+
+enum btree_update_flags {
+ __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ __BTREE_UPDATE_KEY_CACHE_RECLAIM,
+
+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
+
+ __BTREE_TRIGGER_INSERT,
+ __BTREE_TRIGGER_OVERWRITE,
+
+ __BTREE_TRIGGER_GC,
+ __BTREE_TRIGGER_BUCKET_INVALIDATE,
+ __BTREE_TRIGGER_NOATOMIC,
+};
+
+#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+
+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
+
+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
+
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
+
+#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
+ ((1U << KEY_TYPE_alloc)| \
+ (1U << KEY_TYPE_alloc_v2)| \
+ (1U << KEY_TYPE_alloc_v3)| \
+ (1U << KEY_TYPE_alloc_v4)| \
+ (1U << KEY_TYPE_stripe)| \
+ (1U << KEY_TYPE_inode)| \
+ (1U << KEY_TYPE_inode_v2)| \
+ (1U << KEY_TYPE_snapshot))
+
+static inline int bch2_trans_mark_key(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type];
+
+ return ops->trans_trigger
+ ? ops->trans_trigger(trans, btree_id, level, old, new, flags)
+ : 0;
+}
+
+static inline int bch2_trans_mark_old(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = old.k->p;
+
+ return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
+}
+
+static inline int bch2_trans_mark_new(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_i *new, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = new->k.p;
+
+ return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+ BTREE_TRIGGER_INSERT|flags);
+}
+
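A hypothetical call site (illustrative only - the actual update path lives
elsewhere) showing how the two helpers pair up: the key being removed gets the
OVERWRITE trigger and the key being inserted gets the INSERT trigger:

static int example_run_trans_triggers(struct btree_trans *trans,
				      enum btree_id btree_id, unsigned level,
				      struct bkey_s_c old, struct bkey_i *new)
{
	return  bch2_trans_mark_old(trans, btree_id, level, old, 0) ?:
		bch2_trans_mark_new(trans, btree_id, level, new, 0);
}
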
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_buf.h"
+#include "bkey_cmp.h"
#include "bkey_sort.h"
#include "bset.h"
#include "extents.h"
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bch2_bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
(int) l->needs_whiteout - (int) r->needs_whiteout;
}
struct bkey_packed *_k, *_n;
struct bkey uk, n;
struct bkey_s_c k;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
if (!i->u64s)
return;
_n = bkey_next(_k);
k = bkey_disassemble(b, _k, &uk);
+
+ printbuf_reset(&buf);
if (c)
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ bch2_bkey_val_to_text(&buf, c, k);
else
- bch2_bkey_to_text(&PBUF(buf), k.k);
+ bch2_bkey_to_text(&buf, k.k);
printk(KERN_ERR "block %u key %5zu: %s\n", set,
- _k->_data - i->_data, buf);
+ _k->_data - i->_data, buf.buf);
if (_n == vstruct_last(i))
continue;
!bpos_cmp(n.p, k.k->p))
printk(KERN_ERR "Duplicate keys\n");
}
+
+ printbuf_exit(&buf);
}
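The conversions above all follow the same printbuf pattern: start from the
empty PRINTBUF initializer, print into the heap-backed buffer, read the result
through buf.buf, and free it with printbuf_exit() (printbuf_reset() allows
reuse inside a loop). A minimal sketch, assuming the surrounding bcachefs
helpers:

static void example_log_key(struct bch_fs *c, struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;		/* empty, grows on demand */

	bch2_bkey_val_to_text(&buf, c, k);
	printk(KERN_ERR "example key: %s\n", buf.buf);

	printbuf_exit(&buf);			/* frees the allocation */
}
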
void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
struct btree_node_iter *iter)
{
struct btree_node_iter_set *set;
+ struct printbuf buf = PRINTBUF;
printk(KERN_ERR "btree node iter with %u/%u sets:\n",
__btree_node_iter_used(iter), b->nsets);
struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
struct bset_tree *t = bch2_bkey_to_bset(b, k);
struct bkey uk = bkey_unpack_key(b, k);
- char buf[100];
- bch2_bkey_to_text(&PBUF(buf), &uk);
+ printbuf_reset(&buf);
+ bch2_bkey_to_text(&buf, &uk);
printk(KERN_ERR "set %zu key %u: %s\n",
- t - b->set, set->k, buf);
+ t - b->set, set->k, buf.buf);
}
+
+ printbuf_exit(&buf);
}
#ifdef CONFIG_BCACHEFS_DEBUG
struct btree_node_iter_set *set;
struct bkey ku = bkey_unpack_key(b, k);
struct bkey nu = bkey_unpack_key(b, n);
- char buf1[80], buf2[80];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&PBUF(buf1), &ku);
- bch2_bkey_to_text(&PBUF(buf2), &nu);
+ bch2_bkey_to_text(&buf1, &ku);
+ bch2_bkey_to_text(&buf2, &nu);
printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
- buf1, buf2);
+ buf1.buf, buf2.buf);
printk(KERN_ERR "iter was:");
btree_node_iter_for_each(_iter, set) {
struct bset_tree *t = bch2_bkey_to_bset(b, where);
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
#if 0
BUG_ON(prev &&
bkey_iter_cmp(b, prev, insert) > 0);
bkey_iter_cmp(b, prev, insert) > 0) {
struct bkey k1 = bkey_unpack_key(b, prev);
struct bkey k2 = bkey_unpack_key(b, insert);
- char buf1[100];
- char buf2[100];
bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&PBUF(buf1), &k1);
- bch2_bkey_to_text(&PBUF(buf2), &k2);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
panic("prev > insert:\n"
"prev key %s\n"
"insert key %s\n",
- buf1, buf2);
+ buf1.buf, buf2.buf);
}
#endif
#if 0
bkey_iter_cmp(b, insert, next) > 0) {
struct bkey k1 = bkey_unpack_key(b, insert);
struct bkey k2 = bkey_unpack_key(b, next);
- char buf1[100];
- char buf2[100];
bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&PBUF(buf1), &k1);
- bch2_bkey_to_text(&PBUF(buf2), &k2);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
panic("insert > next:\n"
"insert key %s\n"
"next key %s\n",
- buf1, buf2);
+ buf1.buf, buf2.buf);
}
#endif
}
t->size -= j - l;
for (j = l; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
+ rw_aux_tree(b, t)[j].offset += shift;
EBUG_ON(l < t->size &&
rw_aux_tree(b, t)[l].offset ==
bch2_btree_node_iter_sort(iter, b);
}
-noinline __flatten __attribute__((cold))
+noinline __flatten __cold
static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
struct btree *b, struct bpos *search)
{
EBUG_ON(iter->data->k > iter->data->end);
if (unlikely(__btree_node_iter_set_end(iter, 0))) {
- bch2_btree_node_iter_set_drop(iter, iter->data);
+ /* avoid an expensive memmove call: */
+ iter->data[0] = iter->data[1];
+ iter->data[1] = iter->data[2];
+ iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
return;
}
struct bkey uk;
unsigned j, inorder;
- if (out->pos != out->end)
- *out->pos = '\0';
-
if (!bset_has_ro_aux_tree(t))
return;
switch (bkey_float(b, t, j)->exponent) {
case BFLOAT_FAILED:
uk = bkey_unpack_key(b, k);
- pr_buf(out,
+ prt_printf(out,
" failed unpacked at depth %u\n"
"\t",
ilog2(j));
bch2_bpos_to_text(out, uk.p);
- pr_buf(out, "\n");
+ prt_printf(out, "\n");
break;
}
}
return btree_aux_data_bytes(b) / sizeof(u64);
}
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline void
-__bkey_unpack_key_format_checked(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
- {
- compiled_unpack_fn unpack_fn = b->aux_data;
- unpack_fn(dst, src);
-
- if (bch2_expensive_debug_checks) {
- struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
-
- BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
- }
- }
-#else
- *dst = __bch2_bkey_unpack_key(&b->format, src);
-#endif
-}
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
- struct bkey dst;
-
- __bkey_unpack_key_format_checked(b, &dst, src);
- return dst;
-}
-
-static inline void __bkey_unpack_key(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
- if (likely(bkey_packed(src)))
- __bkey_unpack_key_format_checked(b, dst, src);
- else
- *dst = *packed_to_bkey_c(src);
-}
-
-/**
- * bkey_unpack_key -- unpack just the key, not the value
- */
-static inline struct bkey bkey_unpack_key(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_key_format_checked(b, src)
- : *packed_to_bkey_c(src);
-}
-
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
- return bkey_unpack_key_format_checked(b, src).p;
-#else
- return __bkey_unpack_pos(&b->format, src);
-#endif
-}
-
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_pos_format_checked(b, src)
- : packed_to_bkey_c(src)->p;
-}
-
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(struct btree *b,
- const struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
-}
-
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(struct btree *b,
- struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
-
#define for_each_bset(_b, _t) \
for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
+#include "errcode.h"
#include "error.h"
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
-struct lock_class_key bch2_btree_node_lock_key;
+#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
+do { \
+ if (shrinker_counter) \
+ bc->not_freed_##counter++; \
+} while (0)
+
+const char * const bch2_btree_node_flags[] = {
+#define x(f) #f,
+ BTREE_FLAGS()
+#undef x
+ NULL
+};
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
return max_t(int, 0, bc->used - bc->reserve);
}
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+ if (b->c.lock.readers)
+ list_move(&b->list, &bc->freed_pcpu);
+ else
+ list_move(&b->list, &bc->freed_nonpcpu);
+}
+
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
b->aux_data = NULL;
bc->used--;
- list_move(&b->list, &bc->freed);
+
+ btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
return 0;
}
-static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
+static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
- struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
+ struct btree *b = kzalloc(sizeof(struct btree), gfp);
if (!b)
return NULL;
bkey_btree_ptr_init(&b->key);
__six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_set_no_check_recursion(&b->c.lock.dep_map);
+#endif
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
b->byte_order = ilog2(btree_bytes(c));
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
- struct btree *b = __btree_node_mem_alloc(c);
+ struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
return NULL;
/* Cause future lookups for this node to fail: */
b->hash_val = 0;
-
- six_lock_wakeup_all(&b->c.lock);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
b->c.level = level;
b->c.btree_id = id;
- if (level)
- six_lock_pcpu_alloc(&b->c.lock);
- else
- six_lock_pcpu_free_rcu(&b->c.lock);
-
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
- list_add(&b->list, &bc->live);
+ list_add_tail(&b->list, &bc->live);
mutex_unlock(&bc->lock);
return ret;
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
struct btree_cache *bc = &c->btree_cache;
int ret = 0;
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_dirty(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
+ else if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
return -ENOMEM;
+ }
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
- if (!six_trylock_intent(&b->c.lock))
+ if (!six_trylock_intent(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
return -ENOMEM;
+ }
- if (!six_trylock_write(&b->c.lock))
+ if (!six_trylock_write(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
goto out_unlock_intent;
+ }
/* recheck under lock */
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
goto out_unlock;
+ }
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
}
- if (btree_node_noevict(b))
+ if (btree_node_noevict(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
goto out_unlock;
-
- if (!btree_node_may_write(b))
+ }
+ if (btree_node_write_blocked(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
+ goto out_unlock;
+ }
+ if (btree_node_will_make_reachable(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
goto out_unlock;
+ }
if (btree_node_dirty(b)) {
- if (!flush ||
- test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+ if (!flush) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
goto out_unlock;
+ }
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
* the post write cleanup:
*/
if (bch2_verify_btree_ondisk)
- bch2_btree_node_write(c, b, SIX_LOCK_intent);
+ bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
else
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, 0);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
out:
if (b->hash_val && !ret)
- trace_btree_node_reap(c, b);
+ trace_and_count(c, btree_cache_reap, c, b);
return ret;
out_unlock:
six_unlock_write(&b->c.lock);
goto out;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
- return __btree_node_reclaim(c, b, false);
+ return __btree_node_reclaim(c, b, false, shrinker_counter);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true);
+ return __btree_node_reclaim(c, b, true, false);
}
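The BTREE_CACHE_NOT_FREED_INCREMENT() calls added above are not defined in this hunk; presumably they are backed by a small helper macro along the following lines (a sketch only, assuming the real definition lives elsewhere in btree_cache.c). It bumps the matching not_freed_* counter in struct btree_cache, and only when the caller is the shrinker, which is what the new shrinker_counter argument gates:

/* sketch: assumed definition; relies on 'bc' and 'shrinker_counter' being in scope */
#define BTREE_CACHE_NOT_FREED_INCREMENT(counter)	\
do {							\
	if (shrinker_counter)				\
		bc->not_freed_##counter++;		\
} while (0)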
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
- unsigned long can_free;
- unsigned long touched = 0;
+ unsigned long can_free = 0;
unsigned long freed = 0;
+ unsigned long touched = 0;
unsigned i, flags;
unsigned long ret = SHRINK_STOP;
+ bool trigger_writes = atomic_read(&bc->dirty) + nr >=
+ bc->used * 3 / 4;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
- /* Return -1 if we can't do anything right now */
- if (sc->gfp_mask & __GFP_FS)
- mutex_lock(&bc->lock);
- else if (!mutex_trylock(&bc->lock))
- goto out_norestore;
-
+ mutex_lock(&bc->lock);
flags = memalloc_nofs_save();
/*
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
- nr /= btree_pages(c);
can_free = btree_cache_can_free(bc);
nr = min_t(unsigned long, nr, can_free);
touched++;
if (touched >= nr)
- break;
+ goto out;
- if (!btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b, true)) {
btree_node_data_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
+ bc->freed++;
}
}
restart:
list_for_each_entry_safe(b, t, &bc->live, list) {
touched++;
- if (touched >= nr) {
- /* Save position */
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
- break;
- }
-
- if (!btree_node_accessed(b) &&
- !btree_node_reclaim(c, b)) {
- /* can't call bch2_btree_node_hash_remove under lock */
+ if (btree_node_accessed(b)) {
+ clear_btree_node_accessed(b);
+ bc->not_freed_access_bit++;
+ } else if (!btree_node_reclaim(c, b, true)) {
freed++;
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
-
btree_node_data_free(c, b);
- mutex_unlock(&bc->lock);
+ bc->freed++;
bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
- if (freed >= nr)
- goto out;
-
- if (sc->gfp_mask & __GFP_FS)
- mutex_lock(&bc->lock);
- else if (!mutex_trylock(&bc->lock))
- goto out;
+ if (freed == nr)
+ goto out_rotate;
+ } else if (trigger_writes &&
+ btree_node_dirty(b) &&
+ !btree_node_will_make_reachable(b) &&
+ !btree_node_write_blocked(b) &&
+ six_trylock_read(&b->c.lock)) {
+ list_move(&bc->live, &b->list);
+ mutex_unlock(&bc->lock);
+ __bch2_btree_node_write(c, b, 0);
+ six_unlock_read(&b->c.lock);
+ if (touched >= nr)
+ goto out_nounlock;
+ mutex_lock(&bc->lock);
goto restart;
- } else
- clear_btree_node_accessed(b);
- }
+ }
- mutex_unlock(&bc->lock);
+ if (touched >= nr)
+ break;
+ }
+out_rotate:
+ if (&t->list != &bc->live)
+ list_move_tail(&bc->live, &t->list);
out:
- ret = (unsigned long) freed * btree_pages(c);
+ mutex_unlock(&bc->lock);
+out_nounlock:
+ ret = freed;
memalloc_nofs_restore(flags);
-out_norestore:
- trace_btree_cache_scan(sc->nr_to_scan,
- sc->nr_to_scan / btree_pages(c),
- btree_cache_can_free(bc),
- ret);
+ trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
return ret;
}
if (bch2_btree_shrinker_disabled)
return 0;
- return btree_cache_can_free(bc) * btree_pages(c);
+ return btree_cache_can_free(bc);
+}
+
+static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
+{
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
+ btree_cache.shrink);
+
+ bch2_btree_cache_to_text(out, &c->btree_cache);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(c, b);
+ clear_btree_node_dirty_acct(c, b);
btree_node_data_free(c, b);
}
BUG_ON(atomic_read(&c->btree_cache.dirty));
- while (!list_empty(&bc->freed)) {
- b = list_first_entry(&bc->freed, struct btree, list);
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ while (!list_empty(&bc->freed_nonpcpu)) {
+ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
list_del(&b->list);
six_lock_pcpu_free(&b->c.lock);
kfree(b);
bc->shrink.count_objects = bch2_btree_cache_count;
bc->shrink.scan_objects = bch2_btree_cache_scan;
+ bc->shrink.to_text = bch2_btree_cache_shrinker_to_text;
bc->shrink.seeks = 4;
- bc->shrink.batch = btree_pages(c) * 2;
- ret = register_shrinker(&bc->shrink);
+ ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
mutex_init(&bc->lock);
INIT_LIST_HEAD(&bc->live);
INIT_LIST_HEAD(&bc->freeable);
- INIT_LIST_HEAD(&bc->freed);
+ INIT_LIST_HEAD(&bc->freed_pcpu);
+ INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
/*
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
- trace_btree_node_cannibalize_unlock(c);
+ trace_and_count(c, btree_cache_cannibalize_unlock, c);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
goto success;
if (!cl) {
- trace_btree_node_cannibalize_lock_fail(c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
return -ENOMEM;
}
goto success;
}
- trace_btree_node_cannibalize_lock_fail(c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
return -EAGAIN;
success:
- trace_btree_node_cannibalize_lock(c);
+ trace_and_count(c, btree_cache_cannibalize_lock, c);
return 0;
}
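The trace_btree_node_* calls are being converted to trace_and_count() throughout this patch. As an assumption (not the actual definition), that wrapper is expected to fire the tracepoint and also bump a persistent per-filesystem event counter, so the event remains visible even when tracing is disabled:

/* sketch: assumed shape; BCH_COUNTER_##_name is assumed to index c->counters */
#define trace_and_count(_c, _name, ...)				\
do {								\
	this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]);	\
	trace_##_name(__VA_ARGS__);				\
} while (0)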
struct btree *b;
list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b))
+ if (!btree_node_reclaim(c, b, false))
return b;
while (1) {
}
}
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
{
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
+ struct list_head *freed = pcpu_read_locks
+ ? &bc->freed_pcpu
+ : &bc->freed_nonpcpu;
+ struct btree *b, *b2;
u64 start_time = local_clock();
unsigned flags;
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
- /*
- * btree_free() doesn't free memory; it sticks the node on the end of
- * the list. Check if there's any freed nodes there:
- */
- list_for_each_entry(b, &bc->freeable, list)
- if (!btree_node_reclaim(c, b))
- goto got_node;
-
/*
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
- list_for_each_entry(b, &bc->freed, list)
- if (!btree_node_reclaim(c, b))
+ list_for_each_entry(b, freed, list)
+ if (!btree_node_reclaim(c, b, false)) {
+ list_del_init(&b->list);
goto got_node;
+ }
- b = NULL;
-got_node:
- if (b)
- list_del_init(&b->list);
- mutex_unlock(&bc->lock);
-
+ b = __btree_node_mem_alloc(c, __GFP_NOWARN);
if (!b) {
- b = __btree_node_mem_alloc(c);
+ mutex_unlock(&bc->lock);
+ b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
goto err;
-
- BUG_ON(!six_trylock_intent(&b->c.lock));
- BUG_ON(!six_trylock_write(&b->c.lock));
+ mutex_lock(&bc->lock);
}
- if (!b->data) {
- if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
- goto err;
+ if (pcpu_read_locks)
+ six_lock_pcpu_alloc(&b->c.lock);
- mutex_lock(&bc->lock);
- bc->used++;
- mutex_unlock(&bc->lock);
- }
+ BUG_ON(!six_trylock_intent(&b->c.lock));
+ BUG_ON(!six_trylock_write(&b->c.lock));
+got_node:
+
+ /*
+ * btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there's any freed nodes there:
+ */
+ list_for_each_entry(b2, &bc->freeable, list)
+ if (!btree_node_reclaim(c, b2, false)) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ goto got_mem;
+ }
+
+ mutex_unlock(&bc->lock);
+
+ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
+ goto err;
+
+ mutex_lock(&bc->lock);
+ bc->used++;
+got_mem:
+ mutex_unlock(&bc->lock);
BUG_ON(btree_node_hashed(b));
BUG_ON(btree_node_dirty(b));
err:
mutex_lock(&bc->lock);
- if (b) {
- list_add(&b->list, &bc->freed);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- }
-
/* Try to cannibalize another cached btree node: */
if (bc->alloc_lock == current) {
- b = btree_node_cannibalize(c);
- list_del_init(&b->list);
- mutex_unlock(&bc->lock);
+ b2 = btree_node_cannibalize(c);
+ bch2_btree_node_hash_remove(bc, b2);
+
+ if (b) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ } else {
+ b = b2;
+ list_del_init(&b->list);
+ }
- bch2_btree_node_hash_remove(bc, b);
+ mutex_unlock(&bc->lock);
- trace_btree_node_cannibalize(c);
+ trace_and_count(c, btree_cache_cannibalize, c);
goto out;
}
* been freed:
*/
if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
- trace_trans_restart_relock_parent_for_fill(trans->fn,
- _THIS_IP_, btree_id, &path->pos);
- btree_trans_restart(trans);
- return ERR_PTR(-EINTR);
+ trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
+ }
+
+ b = bch2_btree_node_mem_alloc(c, level != 0);
+
+ if (trans && b == ERR_PTR(-ENOMEM)) {
+ trans->memory_allocation_failure = true;
+ trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
}
- b = bch2_btree_node_mem_alloc(c);
if (IS_ERR(b))
return b;
if (!sync)
return NULL;
- if (trans &&
- (!bch2_trans_relock(trans) ||
- !bch2_btree_path_relock_intent(trans, path))) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(-EINTR);
+ if (trans) {
+ int ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ BUG_ON(!trans->restarted);
+ return ERR_PTR(ret);
+ }
}
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
- trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_,
- btree_id, &path->pos);
- btree_trans_restart(trans);
- return ERR_PTR(-EINTR);
+ if (trans)
+ trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
}
return b;
}
-static int lock_node_check_fn(struct six_lock *lock, void *p)
-{
- struct btree *b = container_of(lock, struct btree, c.lock);
- const struct bkey_i *k = p;
-
- return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
-}
-
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
- char buf1[200], buf2[100], buf3[100];
+ struct printbuf buf = PRINTBUF;
if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
return;
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key));
- bch2_bpos_to_text(&PBUF(buf2), b->data->min_key);
- bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
-
- bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n"
- "btree %s level %u\n"
- "ptr: %s\n"
- "header: btree %s level %llu\n"
- "min %s max %s\n",
- bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1,
- bch2_btree_ids[BTREE_NODE_ID(b->data)],
- BTREE_NODE_LEVEL(b->data),
- buf2, buf3);
+ prt_printf(&buf,
+ "btree node header doesn't match ptr\n"
+ "btree %s level %u\n"
+ "ptr: ",
+ bch2_btree_ids[b->c.btree_id], b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_printf(&buf, "\nheader: btree %s level %llu\n"
+ "min ",
+ bch2_btree_ids[BTREE_NODE_ID(b->data)],
+ BTREE_NODE_LEVEL(b->data));
+ bch2_bpos_to_text(&buf, b->data->min_key);
+
+ prt_printf(&buf, "\nmax ");
+ bch2_bpos_to_text(&buf, b->data->max_key);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
+ int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
if (likely(c->opts.btree_node_mem_ptr_optimization &&
b &&
b->hash_val == btree_ptr_hash_val(k)))
- goto lock_node;
+ goto lock_node;
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
* was removed - and we'll bail out:
*/
if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(path, level + 1);
+ btree_node_unlock(trans, path, level + 1);
- if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type,
- lock_node_check_fn, (void *) k, trace_ip)) {
- if (!trans->restarted)
- goto retry;
- return ERR_PTR(-EINTR);
- }
+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->c.level != level ||
if (bch2_btree_node_relock(trans, path, level + 1))
goto retry;
- trace_trans_restart_btree_node_reused(trans->fn,
- trace_ip,
- path->btree_id,
- &path->pos);
- btree_trans_restart(trans);
- return ERR_PTR(-EINTR);
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
}
}
* should_be_locked is not set on this path yet, so we need to
* relock it specifically:
*/
- if (trans &&
- (!bch2_trans_relock(trans) ||
- !bch2_btree_path_relock_intent(trans, path))) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(-EINTR);
+ if (trans) {
+ int ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ BUG_ON(!trans->restarted);
+ return ERR_PTR(ret);
+ }
}
if (!six_relock_type(&b->c.lock, lock_type, seq))
return b;
}
-struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
const struct bkey_i *k,
enum btree_id btree_id,
unsigned level,
bool nofill)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
goto out;
} else {
lock_node:
- ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
- if (ret)
- goto retry;
+ ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->c.btree_id != btree_id ||
return PTR_ERR_OR_ZERO(b);
}
-void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
+void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
/* XXX we're called from btree_gc which will be holding other btree
* nodes locked
- * */
+ */
__bch2_btree_node_wait_on_read(b);
__bch2_btree_node_wait_on_write(b);
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
if (btree_node_dirty(b)) {
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, 0);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
bch2_btree_keys_stats(b, &stats);
- pr_buf(out, "l %u ", b->c.level);
+ prt_printf(out, "l %u ", b->c.level);
bch2_bpos_to_text(out, b->data->min_key);
- pr_buf(out, " - ");
+ prt_printf(out, " - ");
bch2_bpos_to_text(out, b->data->max_key);
- pr_buf(out, ":\n"
+ prt_printf(out, ":\n"
" ptrs: ");
bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
- pr_buf(out, "\n"
+ prt_printf(out, "\n"
" format: u64s %u fields %u %u %u %u %u\n"
" unpack fn len: %u\n"
" bytes used %zu/%zu (%zu%% full)\n"
stats.failed);
}
-void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc)
{
- pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
- pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
- pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+ prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
+ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+
+ prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
+ prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
+ prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
+ prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
+ prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
+ prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
+ prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
+ prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
+ prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
+ prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
+
}
#include "bcachefs.h"
#include "btree_types.h"
+#include "bkey_methods.h"
-extern struct lock_class_key bch2_btree_node_lock_key;
+extern const char * const bch2_btree_node_flags[];
struct btree_iter;
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool);
struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
enum six_lock_type, unsigned long);
-struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
enum btree_id, unsigned, bool);
int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
const struct bkey_i *, enum btree_id, unsigned);
-void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
+void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *);
+void bch2_btree_cache_to_text(struct printbuf *, struct btree_cache *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
struct bpos expected_start = bkey_deleted(&prev->k->k)
? node_start
: bpos_successor(prev->k->k.p);
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
- if (bkey_deleted(&prev->k->k)) {
- struct printbuf out = PBUF(buf1);
- pr_buf(&out, "start of node: ");
- bch2_bpos_to_text(&out, node_start);
- } else {
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
- }
-
if (bpos_cmp(expected_start, bp->v.min_key)) {
bch2_topology_error(c);
+ if (bkey_deleted(&prev->k->k)) {
+ prt_printf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, node_start);
+ } else {
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
+ }
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
+
if (__fsck_err(c,
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
" prev %s\n"
" cur %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1,
- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) &&
+ buf1.buf, buf2.buf) &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
- return FSCK_ERR_START_TOPOLOGY_REPAIR;
+ ret = -BCH_ERR_need_topology_repair;
+ goto err;
} else {
set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
}
if (is_last && bpos_cmp(cur.k->k.p, node_end)) {
bch2_topology_error(c);
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
+ bch2_bpos_to_text(&buf2, node_end);
+
if (__fsck_err(c,
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
" %s\n"
" expected %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) &&
+ buf1.buf, buf2.buf) &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
- return FSCK_ERR_START_TOPOLOGY_REPAIR;
+ ret = -BCH_ERR_need_topology_repair;
+ goto err;
} else {
set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
}
}
bch2_bkey_buf_copy(prev, c, cur.k);
+err:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
}
}
-static void bch2_btree_node_update_key_early(struct bch_fs *c,
+static void bch2_btree_node_update_key_early(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_i *new)
{
+ struct bch_fs *c = trans->c;
struct btree *b;
struct bkey_buf tmp;
int ret;
bch2_bkey_buf_init(&tmp);
bch2_bkey_buf_reassemble(&tmp, c, old);
- b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true);
+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
if (!IS_ERR_OR_NULL(b)) {
mutex_lock(&c->btree_cache.lock);
}
bch2_btree_node_drop_keys_outside_node(b);
-
+ bkey_copy(&b->key, &new->k_i);
return 0;
}
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
if (!prev) {
- struct printbuf out = PBUF(buf1);
- pr_buf(&out, "start of node: ");
- bch2_bpos_to_text(&out, b->data->min_key);
+ prt_printf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, b->data->min_key);
} else {
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
}
- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key));
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
if (prev &&
bpos_cmp(expected_start, cur->data->min_key) > 0 &&
" node %s\n"
" next %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
- return DROP_PREV_NODE;
+ buf1.buf, buf2.buf)) {
+ ret = DROP_PREV_NODE;
+ goto out;
+ }
if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p,
bpos_predecessor(cur->data->min_key)), c,
" node %s\n"
" next %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
+ buf1.buf, buf2.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
} else {
" prev %s\n"
" node %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
- return DROP_THIS_NODE;
+ buf1.buf, buf2.buf)) {
+ ret = DROP_THIS_NODE;
+ goto out;
+ }
if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
- ret = set_node_min(c, cur, expected_start);
+ buf1.buf, buf2.buf))
+ ret = set_node_min(c, cur, expected_start);
}
+out:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
struct btree *child)
{
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
+ bch2_bpos_to_text(&buf2, b->key.k.p);
+
if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) {
+ buf1.buf, buf2.buf)) {
ret = set_node_max(c, child, b->key.k.p);
if (ret)
- return ret;
+ goto err;
}
+err:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
-static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
{
+ struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
bool have_child, dropped_children = false;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
if (!b->c.level)
bch2_btree_and_journal_iter_advance(&iter);
bch2_bkey_buf_reassemble(&cur_k, c, k);
- cur = bch2_btree_node_get_noiter(c, cur_k.k,
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
+
if (mustfix_fsck_err_on(ret == -EIO, c,
- "Unreadable btree node at btree %s level %u:\n"
+ "Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
bch2_btree_ids[b->c.btree_id],
b->c.level - 1,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) {
- bch2_btree_node_evict(c, cur_k.k);
+ buf.buf)) {
+ bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
+ cur = NULL;
if (ret)
break;
continue;
}
if (ret) {
- bch_err(c, "%s: error %i getting btree node",
- __func__, ret);
+ bch_err(c, "%s: error getting btree node: %s",
+ __func__, bch2_err_str(ret));
break;
}
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
- bch2_btree_node_evict(c, cur_k.k);
+ bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
+ cur = NULL;
if (ret)
break;
continue;
prev = NULL;
if (ret == DROP_PREV_NODE) {
- bch2_btree_node_evict(c, prev_k.k);
+ bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
if (ret)
bch2_bkey_buf_reassemble(&cur_k, c, k);
bch2_btree_and_journal_iter_advance(&iter);
- cur = bch2_btree_node_get_noiter(c, cur_k.k,
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
if (ret) {
- bch_err(c, "%s: error %i getting btree node",
- __func__, ret);
+ bch_err(c, "%s: error getting btree node: %s",
+ __func__, bch2_err_str(ret));
goto err;
}
- ret = bch2_btree_repair_topology_recurse(c, cur);
+ ret = bch2_btree_repair_topology_recurse(trans, cur);
six_unlock_read(&cur->c.lock);
cur = NULL;
if (ret == DROP_THIS_NODE) {
- bch2_btree_node_evict(c, cur_k.k);
+ bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
dropped_children = true;
have_child = true;
}
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
if (mustfix_fsck_err_on(!have_child, c,
"empty interior btree node at btree %s level %u\n"
" %s",
bch2_btree_ids[b->c.btree_id],
- b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf)))
+ b->c.level, buf.buf))
ret = DROP_THIS_NODE;
err:
fsck_err:
if (!ret && dropped_children)
goto again;
+ printbuf_exit(&buf);
return ret;
}
static int bch2_repair_topology(struct bch_fs *c)
{
+ struct btree_trans trans;
struct btree *b;
unsigned i;
int ret = 0;
+ bch2_trans_init(&trans, c, 0, 0);
+
for (i = 0; i < BTREE_ID_NR && !ret; i++) {
b = c->btree_roots[i].b;
if (btree_node_fake(b))
continue;
- six_lock_read(&b->c.lock, NULL, NULL);
- ret = bch2_btree_repair_topology_recurse(c, b);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ ret = bch2_btree_repair_topology_recurse(&trans, b);
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
bch_err(c, "empty btree root - repair unimplemented");
- ret = FSCK_ERR_EXIT;
+ ret = -BCH_ERR_fsck_repair_unimplemented;
}
}
+ bch2_trans_exit(&trans);
+
return ret;
}
-static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
unsigned level, bool is_root,
struct bkey_s_c *k)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p = { 0 };
bool do_update = false;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
/*
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
- if (fsck_err_on(!g->gen_valid, c,
+ if (c->opts.reconstruct_alloc ||
+ fsck_err_on(!g->gen_valid, c,
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
p.ptr.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
- g->_mark.gen = p.ptr.gen;
g->gen_valid = true;
+ g->gen = p.ptr.gen;
} else {
do_update = true;
}
}
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
- p.ptr.gen, g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
- g->_mark.gen = p.ptr.gen;
g->gen_valid = true;
- g->_mark.data_type = 0;
- g->_mark.dirty_sectors = 0;
- g->_mark.cached_sectors = 0;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
} else {
do_update = true;
}
}
- if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
p.ptr.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (fsck_err_on(!p.ptr.cached &&
- gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
+ gen_cmp(p.ptr.gen, g->gen) < 0, c,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
- p.ptr.gen, g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen)
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
continue;
- if (fsck_err_on(g->mark.data_type &&
- g->mark.data_type != data_type, c,
+ if (fsck_err_on(g->data_type &&
+ g->data_type != data_type, c,
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->mark.data_type],
+ bch2_data_types[g->data_type],
bch2_data_types[data_type],
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (data_type == BCH_DATA_btree) {
- g->_mark.data_type = data_type;
+ g->data_type = data_type;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
} else {
do_update = true;
"pointer to nonexistent stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c,
"pointer does not match stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
}
}
if (is_root) {
bch_err(c, "cannot update btree roots yet");
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
if (!new) {
bch_err(c, "%s: error allocating new key", __func__);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err;
}
bkey_reassemble(new, *k);
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_GC_BUCKET(ca, ptr);
- ptr->gen = g->mark.gen;
+ ptr->gen = g->gen;
}
} else {
bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
(ptr->cached &&
- (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
(!ptr->cached &&
- gen_cmp(ptr->gen, g->mark.gen) < 0) ||
- gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
- (g->mark.data_type &&
- g->mark.data_type != data_type);
+ gen_cmp(ptr->gen, g->gen) < 0) ||
+ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type);
}));
again:
ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
ret = bch2_journal_key_insert_take(c, btree_id, level, new);
if (ret) {
kfree(new);
- return ret;
+ goto err;
}
if (level)
- bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new);
+ bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
+
+ if (c->opts.verbose) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, *k);
+ bch_info(c, "updated %s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf.buf);
+ }
- bch2_bkey_val_to_text(&PBUF(buf), c, *k);
- bch_info(c, "updated %s", buf);
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new));
- bch_info(c, "new key %s", buf);
*k = bkey_i_to_s_c(new);
}
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
- k->k->version.lo > journal_cur_seq(&c->journal));
+ k->k->version.lo > atomic64_read(&c->journal.seq));
- ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
+ ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
if (ret)
goto err;
atomic64_set(&c->key_version, k->k->version.lo);
}
- ret = bch2_mark_key(trans, old, *k, flags);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_mark_key(trans, old, *k, flags));
fsck_err:
err:
if (ret)
- bch_err(c, "%s: ret %i", __func__, ret);
+ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
return ret;
}
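The commit_do() used above is assumed to be a thin wrapper (sketch only, not the real definition) that evaluates the given expression, commits the transaction, and retries both on transaction restart:

/* sketch: assumed behaviour of commit_do(); retries _do and the commit on restart */
#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do)		\
({									\
	int _ret;							\
									\
	do {								\
		bch2_trans_begin(_trans);				\
		_ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res),	\
						  (_journal_seq), (_flags)); \
	} while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));	\
									\
	_ret;								\
})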
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct btree *b;
- unsigned depth = metadata_only ? 1
- : bch2_expensive_debug_checks ? 0
- : !btree_node_type_needs_gc(btree_id) ? 1
- : 0;
+ unsigned depth = metadata_only ? 1 : 0;
int ret = 0;
gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
false, &k, true);
if (ret) {
- bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
+ bch_err(c, "%s: error from bch2_gc_mark_key: %s",
+ __func__, bch2_err_str(ret));
goto fsck_err;
}
bch2_bkey_buf_reassemble(&cur, c, k);
bch2_btree_and_journal_iter_advance(&iter);
- child = bch2_btree_node_get_noiter(c, cur.k,
+ child = bch2_btree_node_get_noiter(trans, cur.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(child);
" %s",
bch2_btree_ids[b->c.btree_id],
b->c.level - 1,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) &&
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
- ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
+ ret = -BCH_ERR_need_topology_repair;
bch_info(c, "Halting mark and sweep to start topology repair pass");
goto fsck_err;
} else {
continue;
}
} else if (ret) {
- bch_err(c, "%s: error %i getting btree node",
- __func__, ret);
+ bch_err(c, "%s: error getting btree node: %s",
+ __func__, bch2_err_str(ret));
break;
}
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
bch2_btree_and_journal_iter_exit(&iter);
+ printbuf_exit(&buf);
return ret;
}
{
struct bch_fs *c = trans->c;
struct btree *b;
- unsigned target_depth = metadata_only ? 1
- : bch2_expensive_debug_checks ? 0
- : !btree_node_type_needs_gc(btree_id) ? 1
- : 0;
- char buf[100];
+ unsigned target_depth = metadata_only ? 1 : 0;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
b = c->btree_roots[btree_id].b;
return 0;
six_lock_read(&b->c.lock, NULL, NULL);
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->min_key);
if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
- "btree root with incorrect min_key: %s",
- (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
+ "btree root with incorrect min_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
- ret = FSCK_ERR_EXIT;
+ ret = -BCH_ERR_fsck_repair_unimplemented;
goto fsck_err;
}
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->max_key);
if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c,
- "btree root with incorrect max_key: %s",
- (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
+ "btree root with incorrect max_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
- ret = FSCK_ERR_EXIT;
+ ret = -BCH_ERR_fsck_repair_unimplemented;
goto fsck_err;
}
six_unlock_read(&b->c.lock);
if (ret < 0)
- bch_err(c, "%s: ret %i", __func__, ret);
+ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
+ printbuf_exit(&buf);
return ret;
}
bch2_trans_init(&trans, c, 0, 0);
+ if (initial)
+ trans.is_initial_gc = true;
+
for (i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
: bch2_gc_btree(&trans, ids[i], initial, metadata_only);
if (ret < 0)
- bch_err(c, "%s: ret %i", __func__, ret);
+ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
bch2_trans_exit(&trans);
return ret;
genradix_free(&c->gc_stripes);
for_each_member_device(ca, c, i) {
- kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
+ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
- ca->buckets[1] = NULL;
+ ca->buckets_gc = NULL;
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
bool initial, bool metadata_only)
{
struct bch_dev *ca = NULL;
- bool verify = !metadata_only && (!initial ||
- (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+ struct printbuf buf = PRINTBUF;
+ bool verify = !metadata_only &&
+ !c->opts.reconstruct_alloc &&
+ (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
unsigned i, dev;
int ret = 0;
percpu_down_write(&c->mark_lock);
#define copy_field(_f, _msg, ...) \
- if (dst->_f != src->_f) { \
- if (verify) \
- fsck_err(c, _msg ": got %llu, should be %llu" \
- , ##__VA_ARGS__, dst->_f, src->_f); \
- dst->_f = src->_f; \
- }
+ if (dst->_f != src->_f && \
+ (!verify || \
+ fsck_err(c, _msg ": got %llu, should be %llu" \
+ , ##__VA_ARGS__, dst->_f, src->_f))) \
+ dst->_f = src->_f
#define copy_stripe_field(_f, _msg, ...) \
- if (dst->_f != src->_f) { \
- if (verify) \
- fsck_err(c, "stripe %zu has wrong "_msg \
- ": got %u, should be %u", \
- iter.pos, ##__VA_ARGS__, \
- dst->_f, src->_f); \
- dst->_f = src->_f; \
- }
+ if (dst->_f != src->_f && \
+ (!verify || \
+ fsck_err(c, "stripe %zu has wrong "_msg \
+ ": got %u, should be %u", \
+ iter.pos, ##__VA_ARGS__, \
+ dst->_f, src->_f))) \
+ dst->_f = src->_f
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
dev_usage_u64s());
copy_dev_field(buckets_ec, "buckets_ec");
- copy_dev_field(buckets_unavailable, "buckets_unavailable");
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- char buf[80];
if (metadata_only &&
(e->data_type == BCH_DATA_user ||
e->data_type == BCH_DATA_cached))
continue;
- bch2_replicas_entry_to_text(&PBUF(buf), e);
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, e);
- copy_fs_field(replicas[i], "%s", buf);
+ copy_fs_field(replicas[i], "%s", buf.buf);
}
}
if (ca)
percpu_ref_put(&ca->ref);
if (ret)
- bch_err(c, "%s: ret %i", __func__, ret);
+ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
percpu_up_write(&c->mark_lock);
+ printbuf_exit(&buf);
return ret;
}
}
for_each_member_device(ca, c, i) {
- BUG_ON(ca->buckets[1]);
+ BUG_ON(ca->buckets_gc);
BUG_ON(ca->usage_gc);
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
+
+ this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
+ ca->mi.nbuckets - ca->mi.first_bucket);
}
return 0;
}
+/* returns true if not equal */
+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
+ struct bch_alloc_v4 r)
+{
+ return l.gen != r.gen ||
+ l.oldest_gen != r.oldest_gen ||
+ l.data_type != r.data_type ||
+ l.dirty_sectors != r.dirty_sectors ||
+ l.cached_sectors != r.cached_sectors ||
+ l.stripe_redundancy != r.stripe_redundancy ||
+ l.stripe != r.stripe;
+}
+
static int bch2_alloc_write_key(struct btree_trans *trans,
struct btree_iter *iter,
- bool initial, bool metadata_only)
+ struct bkey_s_c k,
+ bool metadata_only)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
- struct bucket *g;
- struct bkey_s_c k;
- struct bkey_alloc_unpacked old_u, new_u, gc_u;
- struct bkey_alloc_buf *a;
+ struct bucket gc, *b;
+ struct bkey_i_alloc_v4 *a;
+ struct bch_alloc_v4 old, new;
+ enum bch_data_type type;
int ret;
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
+ if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ return 1;
- old_u = new_u = bch2_alloc_unpack(k);
+ bch2_alloc_to_v4(k, &old);
+ new = old;
percpu_down_read(&c->mark_lock);
- g = gc_bucket(ca, iter->pos.offset);
- gc_u = (struct bkey_alloc_unpacked) {
- .dev = iter->pos.inode,
- .bucket = iter->pos.offset,
- .gen = g->mark.gen,
- .data_type = g->mark.data_type,
- .dirty_sectors = g->mark.dirty_sectors,
- .cached_sectors = g->mark.cached_sectors,
- .read_time = g->io_time[READ],
- .write_time = g->io_time[WRITE],
- .stripe = g->stripe,
- .stripe_redundancy = g->stripe_redundancy,
- };
+ b = gc_bucket(ca, iter->pos.offset);
+
+ /*
+ * b->data_type doesn't yet include need_discard & need_gc_gen states -
+ * fix that here:
+ */
+ type = __alloc_data_type(b->dirty_sectors,
+ b->cached_sectors,
+ b->stripe,
+ old,
+ b->data_type);
+ if (b->data_type != type) {
+ struct bch_dev_usage *u;
+
+ preempt_disable();
+ u = this_cpu_ptr(ca->usage_gc);
+ u->d[b->data_type].buckets--;
+ b->data_type = type;
+ u->d[b->data_type].buckets++;
+ preempt_enable();
+ }
+
+ gc = *b;
percpu_up_read(&c->mark_lock);
if (metadata_only &&
- gc_u.data_type != BCH_DATA_sb &&
- gc_u.data_type != BCH_DATA_journal &&
- gc_u.data_type != BCH_DATA_btree)
+ gc.data_type != BCH_DATA_sb &&
+ gc.data_type != BCH_DATA_journal &&
+ gc.data_type != BCH_DATA_btree)
return 0;
- if (gen_after(old_u.gen, gc_u.gen))
+ if (gen_after(old.gen, gc.gen))
return 0;
#define copy_bucket_field(_f) \
- if (fsck_err_on(new_u._f != gc_u._f, c, \
+ if (c->opts.reconstruct_alloc || \
+ fsck_err_on(new._f != gc._f, c, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
- new_u.gen, \
- bch2_data_types[new_u.data_type], \
- new_u._f, gc_u._f)) \
- new_u._f = gc_u._f; \
+ gc.gen, \
+ bch2_data_types[gc.data_type], \
+ new._f, gc._f)) \
+ new._f = gc._f; \
copy_bucket_field(gen);
copy_bucket_field(data_type);
- copy_bucket_field(stripe);
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
copy_bucket_field(stripe_redundancy);
copy_bucket_field(stripe);
#undef copy_bucket_field
- if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+ if (!bch2_alloc_v4_cmp(old, new))
return 0;
- a = bch2_alloc_pack(trans, new_u);
- if (IS_ERR(a))
- return PTR_ERR(a);
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ a->v = new;
- ret = initial
- ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k)
- : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN);
+ /*
+ * The trigger normally makes sure this is set, but we're not running
+ * triggers:
+ */
+ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
+ a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+
+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
fsck_err:
return ret;
}
-static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only)
+static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
{
struct btree_trans trans;
struct btree_iter iter;
bch2_trans_init(&trans, c, 0, 0);
for_each_member_device(ca, c, i) {
- for_each_btree_key(&trans, iter, BTREE_ID_alloc,
- POS(ca->dev_idx, ca->mi.first_bucket),
- BTREE_ITER_SLOTS|
- BTREE_ITER_PREFETCH, k, ret) {
- if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
- break;
-
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW,
- bch2_alloc_write_key(&trans, &iter,
- initial, metadata_only));
- if (ret)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
-
- if (ret) {
- bch_err(c, "error writing alloc info: %i", ret);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_alloc_write_key(&trans, &iter, k, metadata_only));
+
+ if (ret < 0) {
+ bch_err(c, "error writing alloc info: %s", bch2_err_str(ret));
percpu_ref_put(&ca->ref);
break;
}
}
bch2_trans_exit(&trans);
- return ret;
+ return ret < 0 ? ret : 0;
}
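for_each_btree_key_commit() replaces the open-coded iterate-and-commit loops in these gc functions. As an illustration only (assumed behaviour, not the macro's real expansion), it acts roughly like the helper below: walk a btree, run a callback per key, commit after each key, and retry on transaction restart. The helper name and signature are hypothetical; the calls it makes all appear elsewhere in this patch.

/* illustrative sketch only: assumed equivalent of for_each_btree_key_commit() */
static int gc_btree_walk_and_commit(struct btree_trans *trans,
				    enum btree_id btree_id, struct bpos start,
				    unsigned iter_flags, unsigned commit_flags,
				    int (*fn)(struct btree_trans *,
					      struct btree_iter *,
					      struct bkey_s_c))
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, btree_id, start, iter_flags);

	while (1) {
		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k) ?:
			fn(trans, &iter, k) ?:
			bch2_trans_commit(trans, NULL, NULL, commit_flags);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			ret = 0;
			continue;
		}
		if (ret)
			break;

		bch2_btree_iter_advance(&iter);
	}
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}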
-static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only)
+static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
struct bch_dev *ca;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bucket *g;
+ struct bch_alloc_v4 a;
unsigned i;
+ int ret;
for_each_member_device(ca, c, i) {
struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
percpu_ref_put(&ca->ref);
- percpu_up_write(&c->mark_lock);
bch_err(c, "error allocating ca->buckets[gc]");
return -ENOMEM;
}
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = ca->mi.nbuckets;
- rcu_assign_pointer(ca->buckets[1], buckets);
+ rcu_assign_pointer(ca->buckets_gc, buckets);
};
- return bch2_alloc_read(c, true, metadata_only);
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = gc_bucket(ca, k.k->p.offset);
+
+ bch2_alloc_to_v4(k, &a);
+
+ g->gen_valid = 1;
+ g->gen = a.gen;
+
+ if (metadata_only &&
+ (a.data_type == BCH_DATA_user ||
+ a.data_type == BCH_DATA_cached ||
+ a.data_type == BCH_DATA_parity)) {
+ g->data_type = a.data_type;
+ g->dirty_sectors = a.dirty_sectors;
+ g->cached_sectors = a.cached_sectors;
+ g->stripe = a.stripe;
+ g->stripe_redundancy = a.stripe_redundancy;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+
+ if (ret)
+ bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret));
+
+ return ret;
}
-static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
{
struct bch_dev *ca;
unsigned i;
for_each_member_device(ca, c, i) {
- struct bucket_array *buckets = __bucket_array(ca, true);
+ struct bucket_array *buckets = gc_bucket_array(ca);
struct bucket *g;
for_each_bucket(g, buckets) {
if (metadata_only &&
- (g->mark.data_type == BCH_DATA_user ||
- g->mark.data_type == BCH_DATA_cached ||
- g->mark.data_type == BCH_DATA_parity))
+ (g->data_type == BCH_DATA_user ||
+ g->data_type == BCH_DATA_cached ||
+ g->data_type == BCH_DATA_parity))
continue;
- g->_mark.dirty_sectors = 0;
- g->_mark.cached_sectors = 0;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
}
};
}
-static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
- bool metadata_only)
+static int bch2_gc_write_reflink_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ size_t *idx)
{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct bch_fs *c = trans->c;
+ const __le64 *refcount = bkey_refcount_c(k);
+ struct printbuf buf = PRINTBUF;
struct reflink_gc *r;
- size_t idx = 0;
- char buf[200];
int ret = 0;
- if (metadata_only)
+ if (!refcount)
return 0;
- bch2_trans_init(&trans, c, 0, 0);
+ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
+ r->offset < k.k->p.offset)
+ ++*idx;
- for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- continue;
+ if (!r ||
+ r->offset != k.k->p.offset ||
+ r->size != k.k->size) {
+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+ return -EINVAL;
+ }
- r = genradix_ptr(&c->reflink_gc_table, idx++);
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- ret = -EINVAL;
- break;
- }
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+ "reflink key has wrong refcount:\n"
+ " %s\n"
+ " should be %u",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ r->refcount)) {
+ struct bkey_i *new;
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
- "reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- r->refcount)) {
- struct bkey_i *new;
-
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- break;
- }
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
- bkey_reassemble(new, k);
-
- if (!r->refcount) {
- new->k.type = KEY_TYPE_deleted;
- /*
- * XXX ugly: bch2_journal_key_insert() queues up
- * the key for the journal replay code, which
- * doesn't run the extent overwrite pass
- */
- if (initial)
- new->k.size = 0;
- } else {
- *bkey_refcount(new) = cpu_to_le64(r->refcount);
- }
+ bkey_reassemble(new, k);
- ret = initial
- ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new)
- : __bch2_trans_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
- kfree(new);
+ if (!r->refcount)
+ new->k.type = KEY_TYPE_deleted;
+ else
+ *bkey_refcount(new) = cpu_to_le64(r->refcount);
- if (ret)
- break;
- }
+ ret = bch2_trans_update(trans, iter, new, 0);
}
fsck_err:
- bch2_trans_iter_exit(&trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t idx = 0;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_gc_write_reflink_key(&trans, &iter, k, &idx));
+
c->reflink_gc_nr = 0;
bch2_trans_exit(&trans);
return ret;
}
-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+static int bch2_gc_reflink_start(struct bch_fs *c,
bool metadata_only)
{
struct btree_trans trans;
return ret;
}
-static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
- bool metadata_only)
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
{
struct genradix_iter iter;
struct reflink_gc *r;
r->refcount = 0;
}
-static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
- bool metadata_only)
+static int bch2_gc_write_stripes_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct gc_stripe *m;
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
const struct bch_stripe *s;
- char buf[200];
+ struct gc_stripe *m;
unsigned i;
int ret = 0;
- if (metadata_only)
+ if (k.k->type != KEY_TYPE_stripe)
return 0;
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_stripe)
- continue;
+ s = bkey_s_c_to_stripe(k).v;
+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
- s = bkey_s_c_to_stripe(k).v;
- m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
-
- for (i = 0; i < s->nr_blocks; i++)
- if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
- goto inconsistent;
- continue;
+ for (i = 0; i < s->nr_blocks; i++)
+ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
+ goto inconsistent;
+ return 0;
inconsistent:
- if (fsck_err_on(true, c,
- "stripe has wrong block sector count %u:\n"
- " %s\n"
- " should be %u", i,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- m ? m->block_sectors[i] : 0)) {
- struct bkey_i_stripe *new;
-
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- break;
- }
+ if (fsck_err_on(true, c,
+ "stripe has wrong block sector count %u:\n"
+ " %s\n"
+ " should be %u", i,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ m ? m->block_sectors[i] : 0)) {
+ struct bkey_i_stripe *new;
+
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
- bkey_reassemble(&new->k_i, k);
+ bkey_reassemble(&new->k_i, k);
- for (i = 0; i < new->v.nr_blocks; i++)
- stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
- ret = initial
- ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
- : __bch2_trans_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i));
- kfree(new);
- }
+ ret = bch2_trans_update(trans, iter, &new->k_i, 0);
}
fsck_err:
- bch2_trans_iter_exit(&trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_gc_write_stripes_key(&trans, &iter, k));
bch2_trans_exit(&trans);
return ret;
}
-static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
- bool metadata_only)
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
{
genradix_free(&c->gc_stripes);
}
*/
int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
- struct bch_dev *ca;
- u64 start_time = local_clock();
- unsigned i, iter = 0;
+ unsigned iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
- trace_gc_start(c);
down_write(&c->gc_lock);
- /* flush interior btree updates: */
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+ bch2_btree_interior_updates_flush(c);
ret = bch2_gc_start(c, metadata_only) ?:
- bch2_gc_alloc_start(c, initial, metadata_only) ?:
- bch2_gc_reflink_start(c, initial, metadata_only);
+ bch2_gc_alloc_start(c, metadata_only) ?:
+ bch2_gc_reflink_start(c, metadata_only);
if (ret)
goto out;
again:
if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
c->opts.fix_errors != FSCK_OPT_NO) {
- bch_info(c, "starting topology repair pass");
+ bch_info(c, "Starting topology repair pass");
ret = bch2_repair_topology(c);
if (ret)
goto out;
- bch_info(c, "topology repair pass done");
+ bch_info(c, "Topology repair pass done");
set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags);
}
ret = bch2_gc_btrees(c, initial, metadata_only);
- if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR &&
+ if (ret == -BCH_ERR_need_topology_repair &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) &&
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true);
ret = 0;
}
- if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR)
- ret = FSCK_ERR_EXIT;
+ if (ret == -BCH_ERR_need_topology_repair)
+ ret = -BCH_ERR_fsck_errors_not_fixed;
if (ret)
goto out;
clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
- bch2_gc_stripes_reset(c, initial, metadata_only);
- bch2_gc_alloc_reset(c, initial, metadata_only);
- bch2_gc_reflink_reset(c, initial, metadata_only);
+ bch2_gc_stripes_reset(c, metadata_only);
+ bch2_gc_alloc_reset(c, metadata_only);
+ bch2_gc_reflink_reset(c, metadata_only);
/* flush fsck errors, reset counters */
bch2_flush_fsck_errs(c);
if (!ret) {
bch2_journal_block(&c->journal);
- ret = bch2_gc_stripes_done(c, initial, metadata_only) ?:
- bch2_gc_reflink_done(c, initial, metadata_only) ?:
- bch2_gc_alloc_done(c, initial, metadata_only) ?:
+ ret = bch2_gc_stripes_done(c, metadata_only) ?:
+ bch2_gc_reflink_done(c, metadata_only) ?:
+ bch2_gc_alloc_done(c, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only);
bch2_journal_unblock(&c->journal);
up_write(&c->gc_lock);
- trace_gc_end(c);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
-
- /*
- * Wake up allocator in case it was waiting for buckets
- * because of not being able to inc gens
- */
- for_each_member_device(ca, c, i)
- bch2_wake_allocator(ca);
-
/*
* At startup, allocations can happen directly instead of via the
* allocator thread - issue wakeup in case they blocked on gc_lock:
return ret;
}
-static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
+static int gc_btree_gens_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
+ struct bkey_i *u;
+ int ret;
percpu_down_read(&c->mark_lock);
bkey_for_each_ptr(ptrs, ptr) {
if (ptr_stale(ca, ptr) > 16) {
percpu_up_read(&c->mark_lock);
- return true;
+ goto update;
}
}
*gen = ptr->gen;
}
percpu_up_read(&c->mark_lock);
+ return 0;
+update:
+ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
- return false;
-}
-
-/*
- * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
- * node pointers currently never have cached pointers that can become stale:
- */
-static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_buf sk;
- int ret = 0, commit_err = 0;
-
- bch2_bkey_buf_init(&sk);
-
- bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- while ((bch2_trans_begin(trans),
- k = bch2_btree_iter_peek(&iter)).k) {
- ret = bkey_err(k);
-
- if (ret == -EINTR)
- continue;
- if (ret)
- break;
-
- c->gc_gens_pos = iter.pos;
-
- if (gc_btree_gens_key(c, k) && !commit_err) {
- bch2_bkey_buf_reassemble(&sk, c, k);
- bch2_extent_normalize(c, bkey_i_to_s(sk.k));
-
- commit_err =
- bch2_trans_update(trans, &iter, sk.k, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_NOFAIL);
- if (commit_err == -EINTR) {
- commit_err = 0;
- continue;
- }
- }
-
- bch2_btree_iter_advance(&iter);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- bch2_bkey_buf_exit(&sk, c);
+ bkey_reassemble(u, k);
- return ret;
+ bch2_extent_normalize(c, bkey_i_to_s(u));
+ return bch2_trans_update(trans, iter, u, 0);
}
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter)
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
{
struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
- struct bkey_s_c k;
- struct bkey_alloc_unpacked u;
+ struct bch_alloc_v4 a;
+ struct bkey_i_alloc_v4 *a_mut;
int ret;
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- u = bch2_alloc_unpack(k);
+ bch2_alloc_to_v4(k, &a);
- if (u.oldest_gen == ca->oldest_gen[iter->pos.offset])
+ if (a.oldest_gen == ca->oldest_gen[iter->pos.offset])
return 0;
- u.oldest_gen = ca->oldest_gen[iter->pos.offset];
+ a_mut = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ return ret;
+
+ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
+ a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
- return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN);
+ return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
int bch2_gc_gens(struct bch_fs *c)
if (!mutex_trylock(&c->gc_gens_lock))
return 0;
+ trace_and_count(c, gc_gens_start, c);
down_read(&c->gc_lock);
bch2_trans_init(&trans, c, 0, 0);
}
for (i = 0; i < BTREE_ID_NR; i++)
- if ((1 << i) & BTREE_ID_HAS_PTRS) {
+ if (btree_type_has_ptrs(i)) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
- ret = bch2_gc_btree_gens(&trans, i);
- if (ret) {
- bch_err(c, "error recalculating oldest_gen: %i", ret);
+ ret = for_each_btree_key_commit(&trans, iter, i,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ gc_btree_gens_key(&trans, &iter, k));
+ if (ret && ret != -EROFS)
+ bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret));
+ if (ret)
goto err;
- }
}
- for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL,
- bch2_alloc_write_oldest_gen(&trans, &iter));
- if (ret) {
- bch_err(c, "error writing oldest_gen: %i", ret);
- break;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ POS_MIN,
+ BTREE_ITER_PREFETCH,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_alloc_write_oldest_gen(&trans, &iter, k));
+ if (ret && ret != -EROFS)
+ bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret));
+ if (ret)
+ goto err;
c->gc_gens_btree = 0;
c->gc_gens_pos = POS_MIN;
c->gc_count++;
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+ trace_and_count(c, gc_gens_end, c);
err:
for_each_member_device(ca, c, i) {
kvfree(ca->oldest_gen);
ret = bch2_gc_gens(c);
#endif
if (ret < 0)
- bch_err(c, "btree gc failed: %i", ret);
+ bch_err(c, "btree gc failed: %s", bch2_err_str(ret));
debug_check_no_locks_held();
}
p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
if (IS_ERR(p)) {
- bch_err(c, "error creating gc thread: %li", PTR_ERR(p));
+ bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p)));
return PTR_ERR(p);
}
return ret;
}
+static inline void bch2_do_gc_gens(struct bch_fs *c)
+{
+ atomic_inc(&c->kick_gc);
+ if (c->gc_thread)
+ wake_up_process(c->gc_thread);
+}
+
#endif /* _BCACHEFS_BTREE_GC_H */
};
if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
- bch2_btree_node_write(c, b, SIX_LOCK_write);
+ bch2_btree_node_write(c, b, SIX_LOCK_write, 0);
reinit_iter = true;
}
}
static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
struct btree *b)
{
- pr_buf(out, "%s level %u/%u\n ",
+ prt_printf(out, "%s level %u/%u\n ",
bch2_btree_ids[b->c.btree_id],
b->c.level,
c->btree_roots[b->c.btree_id].level);
struct btree *b, struct bset *i,
unsigned offset, int write)
{
- pr_buf(out, "error validating btree node ");
- if (write)
- pr_buf(out, "before write ");
+ prt_printf(out, bch2_log_msg(c, ""));
+ if (!write)
+ prt_str(out, "error validating btree node ");
+ else
+ prt_str(out, "corrupt btree node before write ");
if (ca)
- pr_buf(out, "on %s ", ca->name);
- pr_buf(out, "at btree ");
+ prt_printf(out, "on %s ", ca->name);
+ prt_printf(out, "at btree ");
btree_pos_to_text(out, c, b);
- pr_buf(out, "\n node offset %u", b->written);
+ prt_printf(out, "\n node offset %u", b->written);
if (i)
- pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ prt_str(out, ": ");
}
enum btree_err_type {
#define btree_err(type, c, ca, b, i, msg, ...) \
({ \
__label__ out; \
- char _buf[300]; \
- char *_buf2 = _buf; \
- struct printbuf out = PBUF(_buf); \
- \
- _buf2 = kmalloc(4096, GFP_ATOMIC); \
- if (_buf2) \
- out = _PBUF(_buf2, 4986); \
+ struct printbuf out = PRINTBUF; \
\
btree_err_msg(&out, c, ca, b, i, b->written, write); \
- pr_buf(&out, ": " msg, ##__VA_ARGS__); \
+ prt_printf(&out, msg, ##__VA_ARGS__); \
\
if (type == BTREE_ERR_FIXABLE && \
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
- mustfix_fsck_err(c, "%s", _buf2); \
+ mustfix_fsck_err(c, "%s", out.buf); \
goto out; \
} \
\
+ bch2_print_string_as_lines(KERN_ERR, out.buf); \
+ \
switch (write) { \
case READ: \
- if (_buf2) \
- bch_err(c, "%s", _buf2); \
- \
switch (type) { \
case BTREE_ERR_FIXABLE: \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ ret = -BCH_ERR_fsck_errors_not_fixed; \
goto fsck_err; \
case BTREE_ERR_WANT_RETRY: \
if (have_retry) { \
ret = BTREE_RETRY_READ; \
goto fsck_err; \
case BTREE_ERR_FATAL: \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ ret = -BCH_ERR_fsck_errors_not_fixed; \
goto fsck_err; \
} \
break; \
case WRITE: \
- bch_err(c, "corrupt metadata before write: %s", _buf2); \
- \
if (bch2_fs_inconsistent(c)) { \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ ret = -BCH_ERR_fsck_errors_not_fixed; \
goto fsck_err; \
} \
break; \
} \
out: \
- if (_buf2 != _buf) \
- kfree(_buf2); \
+ printbuf_exit(&out); \
true; \
})
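
btree_err() now formats its message into a heap-backed printbuf and releases it with printbuf_exit(), instead of juggling a fixed stack buffer with a kmalloc'd fallback. A rough, self-contained sketch of that growable-buffer pattern (not the actual bcachefs printbuf implementation):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal sketch of the printbuf pattern used by btree_err() above. */
struct printbuf {
	char	*buf;
	size_t	size;
	size_t	pos;
};

#define PRINTBUF ((struct printbuf) { NULL, 0, 0 })

static void prt_printf(struct printbuf *out, const char *fmt, ...)
{
	va_list args;
	int len;

	va_start(args, fmt);
	len = vsnprintf(NULL, 0, fmt, args);	/* measure first */
	va_end(args);
	if (len < 0)
		return;

	if (out->pos + len + 1 > out->size) {
		size_t new_size = (out->pos + len + 1) * 2;
		char *p = realloc(out->buf, new_size);

		if (!p)
			return;		/* message truncated on allocation failure */
		out->buf = p;
		out->size = new_size;
	}

	va_start(args, fmt);
	vsnprintf(out->buf + out->pos, out->size - out->pos, fmt, args);
	va_end(args);
	out->pos += len;
}

static void printbuf_exit(struct printbuf *out)
{
	free(out->buf);
	*out = PRINTBUF;
}

int main(void)
{
	struct printbuf out = PRINTBUF;

	prt_printf(&out, "error validating btree node ");
	prt_printf(&out, "at offset %u", 42);
	fprintf(stderr, "%s\n", out.buf);
	printbuf_exit(&out);
	return 0;
}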
(u64 *) vstruct_end(i) - (u64 *) k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
}
for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
if (k != vstruct_last(i)) {
i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
}
}
+ /*
+ * Always rebuild search trees: eytzinger search tree nodes directly
+ * depend on the values of min/max key:
+ */
+ bch2_bset_set_no_aux_tree(b, b->set);
bch2_btree_build_aux_trees(b);
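
For reference, the auxiliary search trees mentioned in the comment above use an eytzinger (BFS-order array) layout; a small standalone sketch of a lower-bound lookup over such an array, illustrative only:

#include <stdio.h>

/*
 * Lower-bound search over an eytzinger-ordered array: with 1-based
 * indexing, the children of node i live at 2*i and 2*i + 1, so the
 * "tree" is just an array walked by index arithmetic.  (The real aux
 * search trees also encode keys relative to the node's min/max key,
 * which is why the patch rebuilds them whenever min/max change.)
 */
static unsigned eytzinger1_search_ge(const int *tree, unsigned nr, int search)
{
	unsigned i = 1;

	while (i <= nr)
		i = 2 * i + (tree[i] < search);

	/* undo the trailing right turns to recover the answer index */
	i >>= __builtin_ffs(~i);
	return i;	/* 0 means "no element >= search" */
}

int main(void)
{
	/* values 10..70 stored in eytzinger order; index 0 is unused: */
	const int tree[8] = { 0, 40, 20, 60, 10, 30, 50, 70 };

	printf("%u\n", eytzinger1_search_ge(tree, 7, 35)); /* prints 1: tree[1] == 40 */
	return 0;
}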
for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
{
unsigned version = le16_to_cpu(i->version);
const char *err;
- char buf1[100];
- char buf2[100];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
int ret = 0;
btree_err_on((version != BCH_BSET_VERSION_OLD &&
BTREE_ERR_FIXABLE, c, ca, b, i,
"bset past end of btree node")) {
i->u64s = 0;
- return 0;
+ ret = 0;
+ goto out;
}
btree_err_on(offset && !i->u64s,
btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"incorrect min_key: got %s should be %s",
- (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
+ (printbuf_reset(&buf2),
+ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
}
btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect max key %s",
- (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
if (write)
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
&bn->format);
}
+out:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
+static int bset_key_invalid(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ bool updated_range, int rw,
+ struct printbuf *err)
+{
+ return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?:
+ (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
+}
+
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned *whiteout_u64s,
- int write, bool have_retry)
+ struct bset *i, int write, bool have_retry)
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
+ struct printbuf buf = PRINTBUF;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
int ret = 0;
k != vstruct_last(i);) {
struct bkey_s u;
struct bkey tmp;
- const char *invalid;
if (btree_err_on(bkey_next(k) > vstruct_last(i),
BTREE_ERR_FIXABLE, c, NULL, b, i,
u = __bkey_disassemble(b, k, &tmp);
- invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?:
- (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?:
- (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL);
- if (invalid) {
- char buf[160];
+ printbuf_reset(&buf);
+ if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "invalid bkey: ");
+ bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, u.s_c);
- bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
- "invalid bkey: %s\n%s", invalid, buf);
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
&b->format, k);
if (prev && bkey_iter_cmp(b, prev, k) > 0) {
- char buf1[80];
- char buf2[80];
struct bkey up = bkey_unpack_key(b, prev);
- bch2_bkey_to_text(&PBUF(buf1), &up);
- bch2_bkey_to_text(&PBUF(buf2), u.k);
+ printbuf_reset(&buf);
+ prt_printf(&buf, "keys out of order: ");
+ bch2_bkey_to_text(&buf, &up);
+ prt_printf(&buf, " > ");
+ bch2_bkey_to_text(&buf, u.k);
bch2_dump_bset(c, b, i, 0);
- if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
- "keys out of order: %s > %s",
- buf1, buf2)) {
+ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
k = bkey_next(k);
}
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
unsigned u64s;
unsigned blacklisted_written, nonblacklisted_written = 0;
unsigned ptr_written = btree_ptr_sectors_written(&b->key);
+ struct printbuf buf = PRINTBUF;
int ret, retry_read = 0, write = READ;
b->version_ondisk = U16_MAX;
+ /* We might get called multiple times on read retry: */
+ b->written = 0;
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
sort_iter_init(iter, b);
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
- "bad magic");
+ "bad magic: want %llx, got %llx",
+ bset_magic(c), le64_to_cpu(b->data->magic));
btree_err_on(!b->data->keys.seq,
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
- "bad btree header");
+ "bad btree header: seq 0");
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
}
while (b->written < (ptr_written ?: btree_sectors(c))) {
- unsigned sectors, whiteout_u64s = 0;
+ unsigned sectors;
struct nonce nonce;
struct bch_csum csum;
bool first = !b->written;
BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i", ret))
+ goto fsck_err;
- btree_err_on(btree_node_is_extents(b) &&
+ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
BTREE_ERR_FATAL, c, NULL, b, NULL,
"btree node does not have NEW_EXTENT_OVERWRITE set");
BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i\n", ret))
+ goto fsck_err;
sectors = vstruct_sectors(bne, c->block_bits);
}
if (!b->written)
btree_node_set_format(b, b->data->format);
- ret = validate_bset_keys(c, b, i, &whiteout_u64s,
- READ, have_retry);
+ ret = validate_bset_keys(c, b, i, READ, have_retry);
if (ret)
goto fsck_err;
if (blacklisted && !first)
continue;
- sort_iter_add(iter, i->start,
- vstruct_idx(i, whiteout_u64s));
-
sort_iter_add(iter,
- vstruct_idx(i, whiteout_u64s),
+ vstruct_idx(i, 0),
vstruct_last(i));
nonblacklisted_written = b->written;
for (k = i->start; k != vstruct_last(i);) {
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
- if (invalid ||
+ printbuf_reset(&buf);
+
+ if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
(bch2_inject_invalid_keys &&
!bversion_cmp(u.k->version, MAX_VERSION))) {
- char buf[160];
+ printbuf_reset(&buf);
+
+ prt_printf(&buf, "invalid bkey: ");
+ bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, u.s_c);
- bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
- "invalid bkey %s: %s", buf, invalid);
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
btree_keys_account_key_drop(&b->nr, 0, k);
set_btree_node_need_rewrite(b);
out:
mempool_free(iter, &c->fill_iter);
+ printbuf_exit(&buf);
return retry_read;
fsck_err:
if (ret == BTREE_RETRY_READ) {
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
- char buf[200];
- struct printbuf out;
+ struct printbuf buf = PRINTBUF;
bool saw_error = false;
+ bool retry = false;
bool can_retry;
goto start;
while (1) {
+ retry = true;
bch_info(c, "retrying read");
ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
- bio_reset(bio);
- bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
+ bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
bio->bi_status = BLK_STS_REMOVED;
}
start:
- out = PBUF(buf);
- btree_pos_to_text(&out, c, b);
+ printbuf_reset(&buf);
+ btree_pos_to_text(&buf, c, b);
bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
- bch2_blk_status_to_str(bio->bi_status), buf);
+ bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
&failed, &rb->pick) > 0;
if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, ca, b, can_retry))
+ !bch2_btree_node_read_done(c, ca, b, can_retry)) {
+ if (retry)
+ bch_info(c, "retry success");
break;
+ }
saw_error = true;
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);
+ printbuf_exit(&buf);
if (saw_error && !btree_node_read_error(b))
bch2_btree_node_rewrite_async(c, b);
container_of(cl, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
+ struct printbuf buf = PRINTBUF;
bool dump_bset_maps = false;
bool have_retry = false;
int ret = 0, best = -1, write = READ;
fsck_err:
if (dump_bset_maps) {
for (i = 0; i < ra->nr; i++) {
- char buf[200];
- struct printbuf out = PBUF(buf);
struct btree_node *bn = ra->buf[i];
struct btree_node_entry *bne = NULL;
unsigned offset = 0, sectors;
if (ra->err[i])
continue;
+ printbuf_reset(&buf);
+
while (offset < btree_sectors(c)) {
if (!offset) {
sectors = vstruct_sectors(bn, c->block_bits);
sectors = vstruct_sectors(bne, c->block_bits);
}
- pr_buf(&out, " %u-%u", offset, offset + sectors);
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
if (bne && bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq), false))
- pr_buf(&out, "*");
+ prt_printf(&buf, "*");
offset += sectors;
}
bne = ra->buf[i] + (offset << 9);
if (bne->keys.seq == bn->keys.seq) {
if (!gap)
- pr_buf(&out, " GAP");
+ prt_printf(&buf, " GAP");
gap = true;
sectors = vstruct_sectors(bne, c->block_bits);
- pr_buf(&out, " %u-%u", offset, offset + sectors);
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
if (bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq), false))
- pr_buf(&out, "*");
+ prt_printf(&buf, "*");
}
offset++;
}
- bch_err(c, "replica %u:%s", i, buf);
+ bch_err(c, "replica %u:%s", i, buf.buf);
}
}
closure_debug_destroy(&ra->cl);
kfree(ra);
+ printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
for (i = 0; i < ra->nr; i++) {
ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
- ra->bio[i] = bio_alloc_bioset(GFP_NOFS, buf_pages(ra->buf[i],
- btree_bytes(c)),
+ ra->bio[i] = bio_alloc_bioset(NULL,
+ buf_pages(ra->buf[i], btree_bytes(c)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOFS,
&c->btree_bio);
}
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->idx = i;
rb->pick = pick;
- rb->bio.bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
- char buf[200];
int ret;
- btree_pos_to_text(&PBUF(buf), c, b);
- trace_btree_read(c, b);
+ trace_and_count(c, btree_node_read, c, b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick);
- if (bch2_fs_fatal_err_on(ret <= 0, c,
- "btree node read error: no device to read from\n"
- " at %s", buf)) {
+
+ if (ret <= 0) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "btree node read error: no device to read from\n at ");
+ btree_pos_to_text(&buf, c, b);
+ bch_err(c, "%s", buf.buf);
+
+ if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags))
+ bch2_fatal_error(c);
+
set_btree_node_read_error(b);
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+ printbuf_exit(&buf);
return;
}
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
- bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data,
- btree_bytes(c)),
+ bio = bio_alloc_bioset(NULL,
+ buf_pages(b->data, btree_bytes(c)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOIO,
&c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
INIT_WORK(&rb->work, btree_node_read_work);
- bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
bch2_bio_map(bio, b->data, btree_bytes(c));
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, level != 0);
bch2_btree_cache_cannibalize_unlock(c);
BUG_ON(IS_ERR(b));
bch2_journal_pin_drop(&c->journal, &w->journal);
}
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new, v;
bch2_btree_complete_write(c, b, w);
- v = READ_ONCE(b->flags);
- do {
- old = new = v;
-
- if (old & (1U << BTREE_NODE_need_write))
- goto do_write;
-
- new &= ~(1U << BTREE_NODE_write_in_flight);
- new &= ~(1U << BTREE_NODE_write_in_flight_inner);
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
-
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
- return;
-
-do_write:
- six_lock_read(&b->c.lock, NULL, NULL);
v = READ_ONCE(b->flags);
do {
old = new = v;
if ((old & (1U << BTREE_NODE_dirty)) &&
(old & (1U << BTREE_NODE_need_write)) &&
!(old & (1U << BTREE_NODE_never_write)) &&
- btree_node_may_write(b)) {
+ !(old & (1U << BTREE_NODE_write_blocked)) &&
+ !(old & (1U << BTREE_NODE_will_make_reachable))) {
new &= ~(1U << BTREE_NODE_dirty);
new &= ~(1U << BTREE_NODE_need_write);
new |= (1U << BTREE_NODE_write_in_flight);
} while ((v = cmpxchg(&b->flags, old, new)) != old);
if (new & (1U << BTREE_NODE_write_in_flight))
- __bch2_btree_node_write(c, b, true);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED);
+ else
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+ struct btree_trans trans;
+ bch2_trans_init(&trans, c, 0, 0);
+
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ __btree_node_write_done(c, b);
six_unlock_read(&b->c.lock);
+
+ bch2_trans_exit(&trans);
}
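
__btree_node_write_done() and __bch2_btree_node_write() both negotiate b->flags with the same read-modify-write cmpxchg loop. A standalone sketch of that pattern using C11 atomics (the flag names here are invented for the example):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum {
	NODE_dirty,
	NODE_need_write,
	NODE_write_in_flight,
};

/*
 * Sketch of the cmpxchg loop above: recompute "new" from the latest
 * observed value until the compare-and-swap succeeds, so concurrent
 * updates to other bits in the flags word are never lost.
 */
static bool try_start_write(_Atomic unsigned long *flags)
{
	unsigned long old = atomic_load(flags);
	unsigned long new;

	do {
		if (!(old & (1UL << NODE_dirty)))
			return false;		/* nothing to write */
		if (old & (1UL << NODE_write_in_flight))
			return false;		/* someone else won the race */

		new = old;
		new &= ~(1UL << NODE_dirty);
		new &= ~(1UL << NODE_need_write);
		new |= 1UL << NODE_write_in_flight;
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return true;
}

int main(void)
{
	_Atomic unsigned long flags =
		(1UL << NODE_dirty) | (1UL << NODE_need_write);

	printf("started write: %d\n", try_start_write(&flags));
	printf("started again: %d\n", try_start_write(&flags));
	return 0;
}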
static void btree_node_write_work(struct work_struct *work)
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors)
{
- unsigned whiteout_u64s = 0;
+ struct printbuf buf = PRINTBUF;
int ret;
- if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
- return -1;
+ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
+ BKEY_TYPE_btree, WRITE, &buf);
+
+ if (ret)
+ bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
+ printbuf_exit(&buf);
+ if (ret)
+ return ret;
- ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
+ ret = validate_bset_keys(c, b, i, WRITE, false) ?:
validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false);
if (ret) {
bch2_inconsistent_error(c);
bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k);
}
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
struct btree_write_bio *wbio;
struct bset_tree *t;
unsigned long old, new;
bool validate_before_checksum = false;
void *data;
+ int ret;
- if (already_started)
+ if (flags & BTREE_WRITE_ALREADY_STARTED)
goto do_write;
- if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
- return;
-
/*
* We may only have a read lock on the btree node - the dirty bit is our
* "lock" against racing with other threads that may be trying to start
if (!(old & (1 << BTREE_NODE_dirty)))
return;
- if (!btree_node_may_write(b))
+ if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+ !(old & (1 << BTREE_NODE_need_write)))
+ return;
+
+ if (old &
+ ((1 << BTREE_NODE_never_write)|
+ (1 << BTREE_NODE_write_blocked)))
return;
- if (old & (1 << BTREE_NODE_never_write))
+ if (b->written &&
+ (old & (1 << BTREE_NODE_will_make_reachable)))
return;
- BUG_ON(old & (1 << BTREE_NODE_write_in_flight));
+ if (old & (1 << BTREE_NODE_write_in_flight))
+ return;
new &= ~(1 << BTREE_NODE_dirty);
new &= ~(1 << BTREE_NODE_need_write);
u64s = bch2_sort_keys(i->start, &sort_iter, false);
le16_add_cpu(&i->u64s, u64s);
+ BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
+
set_needs_whiteout(i, false);
/* do we have data to write? */
bytes_to_write = vstruct_end(i) - data;
sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+ if (!b->written &&
+ b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
+
memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);
BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
BUG_ON(i->seq != b->data->keys.seq);
- i->version = c->sb.version < bcachefs_metadata_version_new_versioning
+ i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber
? cpu_to_le16(BCH_BSET_VERSION_OLD)
: cpu_to_le16(c->sb.version);
SET_BSET_OFFSET(i, b->written);
validate_bset_for_write(c, b, i, sectors_to_write))
goto err;
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error encrypting btree node: %i\n", ret))
+ goto err;
nonce = btree_nonce(i, b->written << 9);
c->opts.nochanges)
goto err;
- trace_btree_write(b, bytes_to_write, sectors_to_write);
+ trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
- wbio = container_of(bio_alloc_bioset(GFP_NOIO,
+ wbio = container_of(bio_alloc_bioset(NULL,
buf_pages(data, sectors_to_write << 9),
+ REQ_OP_WRITE|REQ_META,
+ GFP_NOIO,
&c->btree_bio),
struct btree_write_bio, wbio.bio);
wbio_init(&wbio->wbio.bio);
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.first_btree_write = !b->written;
- wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
b->written += sectors_to_write;
- if (wbio->wbio.first_btree_write &&
- b->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
- cpu_to_le16(b->written);
-
if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
cpu_to_le16(b->written);
return;
err:
set_btree_node_noevict(b);
- if (!b->written &&
- b->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
- cpu_to_le16(sectors_to_write);
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
- btree_node_write_done(c, b);
+ __btree_node_write_done(c, b);
}
/*
* Use this one if the node is intent locked:
*/
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- enum six_lock_type lock_type_held)
+ enum six_lock_type lock_type_held,
+ unsigned flags)
{
if (lock_type_held == SIX_LOCK_intent ||
(lock_type_held == SIX_LOCK_read &&
six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, flags);
/* don't cycle lock unnecessarily: */
if (btree_node_just_written(b) &&
if (lock_type_held == SIX_LOCK_read)
six_lock_downgrade(&b->c.lock);
} else {
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, flags);
if (lock_type_held == SIX_LOCK_write &&
btree_node_just_written(b))
bch2_btree_post_write_cleanup(c, b);
}
}
-static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
unsigned i;
+ bool ret = false;
restart:
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (test_bit(flag, &b->flags)) {
rcu_read_unlock();
wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+ ret = true;
goto restart;
-
}
rcu_read_unlock();
-}
-void bch2_btree_flush_all_reads(struct bch_fs *c)
-{
- __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+ return ret;
}
-void bch2_btree_flush_all_writes(struct bch_fs *c)
+bool bch2_btree_flush_all_reads(struct bch_fs *c)
{
- __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+ return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}
-void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
+bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos) {
- unsigned long flags = READ_ONCE(b->flags);
-
- if (!(flags & (1 << BTREE_NODE_dirty)))
- continue;
-
- pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
- b,
- (flags & (1 << BTREE_NODE_dirty)) != 0,
- (flags & (1 << BTREE_NODE_need_write)) != 0,
- b->c.level,
- b->written,
- !list_empty_careful(&b->write_blocked),
- b->will_make_reachable != 0,
- b->will_make_reachable & 1);
- }
- rcu_read_unlock();
+ return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
struct btree_iter;
struct btree_node_read_all;
-static inline bool btree_node_dirty(struct btree *b)
-{
- return test_bit(BTREE_NODE_dirty, &b->flags);
-}
-
-static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
atomic_inc(&c->btree_cache.dirty);
}
-static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
atomic_dec(&c->btree_cache.dirty);
void bch2_btree_node_wait_on_read(struct btree *);
void bch2_btree_node_wait_on_write(struct btree *);
-static inline bool btree_node_may_write(struct btree *b)
-{
- return list_empty_careful(&b->write_blocked) &&
- (!b->written || !b->will_make_reachable);
-}
-
enum compact_mode {
COMPACT_LAZY,
COMPACT_ALL,
}};
}
-static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
{
struct nonce nonce = btree_nonce(i, offset);
+ int ret;
if (!offset) {
struct btree_node *bn = container_of(i, struct btree_node, keys);
unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
- bytes);
+ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+ &bn->flags, bytes);
+ if (ret)
+ return ret;
nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
+ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+ vstruct_end(i) - (void *) i->_data);
}
void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
struct btree_write *);
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
+#define BTREE_WRITE_ONLY_IF_NEED (1U << 0)
+#define BTREE_WRITE_ALREADY_STARTED (1U << 1)
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
- enum six_lock_type);
+ enum six_lock_type, unsigned);
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
enum six_lock_type lock_held)
{
- if (b->written &&
- btree_node_need_write(b) &&
- btree_node_may_write(b) &&
- !btree_node_write_in_flight(b))
- bch2_btree_node_write(c, b, lock_held);
+ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
}
-#define bch2_btree_node_write_cond(_c, _b, cond) \
-do { \
- unsigned long old, new, v = READ_ONCE((_b)->flags); \
- \
- do { \
- old = new = v; \
- \
- if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \
- break; \
- \
- new |= (1 << BTREE_NODE_need_write); \
- } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
- \
- btree_node_write_if_need(_c, _b, SIX_LOCK_read); \
-} while (0)
-
-void bch2_btree_flush_all_reads(struct bch_fs *);
-void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
+bool bch2_btree_flush_all_reads(struct bch_fs *);
+bool bch2_btree_flush_all_writes(struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,
unsigned version, unsigned big_endian,
#include "replicas.h"
#include "subvolume.h"
+#include <linux/prandom.h>
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
static void btree_trans_verify_sorted(struct btree_trans *);
-static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
+inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
+static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *,
+ struct btree_path *, int);
static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
if (need_resched() || race_fault()) {
bch2_trans_unlock(trans);
schedule();
- return bch2_trans_relock(trans) ? 0 : -EINTR;
+ return bch2_trans_relock(trans);
} else {
return 0;
}
return p;
}
-static inline bool is_btree_node(struct btree_path *path, unsigned l)
-{
- return l < BTREE_MAX_DEPTH &&
- (unsigned long) path->l[l].b >= 128;
-}
-
static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
struct bpos pos = iter->pos;
!btree_path_pos_after_node(path, b);
}
-/* Btree node locking: */
-
-void bch2_btree_node_unlock_write(struct btree_trans *trans,
- struct btree_path *path, struct btree *b)
-{
- bch2_btree_node_unlock_write_inlined(trans, path, b);
-}
-
-void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
-{
- struct btree_path *linked;
- unsigned readers = 0;
-
- trans_for_each_path(trans, linked)
- if (linked->l[b->c.level].b == b &&
- btree_node_read_locked(linked, b->c.level))
- readers++;
-
- /*
- * Must drop our read locks before calling six_lock_write() -
- * six_unlock() won't do wakeups until the reader count
- * goes to 0, and it's safe because we have the node intent
- * locked:
- */
- if (!b->c.lock.readers)
- atomic64_sub(__SIX_VAL(read_lock, readers),
- &b->c.lock.state.counter);
- else
- this_cpu_sub(*b->c.lock.readers, readers);
-
- six_lock_write(&b->c.lock, NULL, NULL);
-
- if (!b->c.lock.readers)
- atomic64_add(__SIX_VAL(read_lock, readers),
- &b->c.lock.state.counter);
- else
- this_cpu_add(*b->c.lock.readers, readers);
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree *b = btree_path_node(path, level);
- int want = __btree_lock_want(path, level);
-
- if (!is_btree_node(path, level))
- goto fail;
-
- if (race_fault())
- goto fail;
-
- if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
- (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, b, level, want))) {
- mark_btree_node_locked(path, level, want);
- return true;
- }
-fail:
- trace_btree_node_relock_fail(trans->fn, _RET_IP_,
- path->btree_id,
- &path->pos,
- (unsigned long) b,
- path->l[level].lock_seq,
- is_btree_node(path, level) ? b->c.lock.state.seq : 0);
- return false;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree *b = path->l[level].b;
-
- if (!is_btree_node(path, level))
- return false;
-
- switch (btree_lock_want(path, level)) {
- case BTREE_NODE_UNLOCKED:
- BUG_ON(btree_node_locked(path, level));
- return true;
- case BTREE_NODE_READ_LOCKED:
- BUG_ON(btree_node_intent_locked(path, level));
- return bch2_btree_node_relock(trans, path, level);
- case BTREE_NODE_INTENT_LOCKED:
- break;
- }
-
- if (btree_node_intent_locked(path, level))
- return true;
-
- if (race_fault())
- return false;
-
- if (btree_node_locked(path, level)
- ? six_lock_tryupgrade(&b->c.lock)
- : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
- goto success;
-
- if (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
- btree_node_unlock(path, level);
- goto success;
- }
-
- return false;
-success:
- mark_btree_node_intent_locked(path, level);
- return true;
-}
-
-static inline bool btree_path_get_locks(struct btree_trans *trans,
- struct btree_path *path,
- bool upgrade)
-{
- unsigned l = path->level;
- int fail_idx = -1;
-
- do {
- if (!btree_path_node(path, l))
- break;
-
- if (!(upgrade
- ? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l)))
- fail_idx = l;
-
- l++;
- } while (l < path->locks_want);
-
- /*
- * When we fail to get a lock, we have to ensure that any child nodes
- * can't be relocked so bch2_btree_path_traverse has to walk back up to
- * the node that we failed to relock:
- */
- if (fail_idx >= 0) {
- __bch2_btree_path_unlock(path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-
- do {
- path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
- --fail_idx;
- } while (fail_idx >= 0);
- }
-
- if (path->uptodate == BTREE_ITER_NEED_RELOCK)
- path->uptodate = BTREE_ITER_UPTODATE;
-
- bch2_trans_verify_locks(trans);
-
- return path->uptodate < BTREE_ITER_NEED_RELOCK;
-}
-
-static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
- bool cached)
-{
- return !cached
- ? container_of(_b, struct btree, c)->key.k.p
- : container_of(_b, struct bkey_cached, c)->key.pos;
-}
-
-/* Slowpath: */
-bool __bch2_btree_node_lock(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct bpos pos, unsigned level,
- enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- struct btree_path *linked;
- unsigned reason;
-
- /* Check if it's safe to block: */
- trans_for_each_path(trans, linked) {
- if (!linked->nodes_locked)
- continue;
-
- /*
- * Can't block taking an intent lock if we have _any_ nodes read
- * locked:
- *
- * - Our read lock blocks another thread with an intent lock on
- * the same node from getting a write lock, and thus from
- * dropping its intent lock
- *
- * - And the other thread may have multiple nodes intent locked:
- * both the node we want to intent lock, and the node we
- * already have read locked - deadlock:
- */
- if (type == SIX_LOCK_intent &&
- linked->nodes_locked != linked->nodes_intent_locked) {
- reason = 1;
- goto deadlock;
- }
-
- if (linked->btree_id != path->btree_id) {
- if (linked->btree_id < path->btree_id)
- continue;
-
- reason = 3;
- goto deadlock;
- }
-
- /*
- * Within the same btree, non-cached paths come before cached
- * paths:
- */
- if (linked->cached != path->cached) {
- if (!linked->cached)
- continue;
-
- reason = 4;
- goto deadlock;
- }
-
- /*
- * Interior nodes must be locked before their descendants: if
- * another path has possible descendants locked of the node
- * we're about to lock, it must have the ancestors locked too:
- */
- if (level > __fls(linked->nodes_locked)) {
- reason = 5;
- goto deadlock;
- }
-
- /* Must lock btree nodes in key order: */
- if (btree_node_locked(linked, level) &&
- bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
- linked->cached)) <= 0) {
- BUG_ON(trans->in_traverse_all);
- reason = 7;
- goto deadlock;
- }
- }
-
- return btree_node_lock_type(trans, path, b, pos, level,
- type, should_sleep_fn, p);
-deadlock:
- trace_trans_restart_would_deadlock(trans->fn, ip,
- trans->in_traverse_all, reason,
- linked->btree_id,
- linked->cached,
- &linked->pos,
- path->btree_id,
- path->cached,
- &pos);
- btree_trans_restart(trans);
- return false;
-}
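
The removed slowpath refused to block whenever taking the lock would violate the ordering rules spelled out in its comments (intent vs. read locks, btree ID order, interior nodes before descendants, key order) and restarted the transaction instead. A toy sketch of that back-off-and-retry idea with plain mutexes and an address-based ordering rule (entirely hypothetical, not the bcachefs locking code):

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Toy deadlock avoidance: locks may only be taken in increasing order.
 * If the next lock would violate the order relative to locks already
 * held, refuse, drop everything, and retry from the top (the analogue
 * of a transaction restart).
 */
struct lock_ctx {
	pthread_mutex_t	*held[8];
	unsigned	nr_held;
};

static bool ctx_lock(struct lock_ctx *ctx, pthread_mutex_t *m)
{
	unsigned i;

	for (i = 0; i < ctx->nr_held; i++)
		if ((uintptr_t)ctx->held[i] >= (uintptr_t)m)
			return false;	/* out of order: caller must unlock and retry */

	pthread_mutex_lock(m);
	ctx->held[ctx->nr_held++] = m;
	return true;
}

static void ctx_unlock_all(struct lock_ctx *ctx)
{
	while (ctx->nr_held)
		pthread_mutex_unlock(ctx->held[--ctx->nr_held]);
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
	struct lock_ctx ctx = { .nr_held = 0 };
	pthread_mutex_t *lo = (uintptr_t)&a < (uintptr_t)&b ? &a : &b;
	pthread_mutex_t *hi = (uintptr_t)&a < (uintptr_t)&b ? &b : &a;

	/* taking them out of order is refused... */
	if (!ctx_lock(&ctx, hi) || !ctx_lock(&ctx, lo)) {
		ctx_unlock_all(&ctx);
		/* ...so drop everything and retry in the agreed order */
		ctx_lock(&ctx, lo);
		ctx_lock(&ctx, hi);
	}
	ctx_unlock_all(&ctx);
	return 0;
}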
-
-/* Btree iterator locking: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_locks(struct btree_path *path)
-{
- unsigned l;
-
- if (!path->nodes_locked) {
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
- btree_path_node(path, path->level));
- return;
- }
-
- for (l = 0; btree_path_node(path, l); l++)
- BUG_ON(btree_lock_want(path, l) !=
- btree_node_locked_type(path, l));
-}
-
-void bch2_trans_verify_locks(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- bch2_btree_path_verify_locks(path);
-}
-#else
-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
-#endif
-
-/* Btree path locking: */
-
-/*
- * Only for btree_cache.c - only relocks intent locks
- */
-bool bch2_btree_path_relock_intent(struct btree_trans *trans,
- struct btree_path *path)
-{
- unsigned l;
-
- for (l = path->level;
- l < path->locks_want && btree_path_node(path, l);
- l++) {
- if (!bch2_btree_node_relock(trans, path, l)) {
- __bch2_btree_path_unlock(path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_,
- path->btree_id, &path->pos);
- btree_trans_restart(trans);
- return false;
- }
- }
-
- return true;
-}
-
-__flatten
-static bool bch2_btree_path_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- bool ret = btree_path_get_locks(trans, path, false);
-
- if (!ret) {
- trace_trans_restart_relock_path(trans->fn, trace_ip,
- path->btree_id, &path->pos);
- btree_trans_restart(trans);
- }
- return ret;
-}
-
-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- struct btree_path *linked;
-
- EBUG_ON(path->locks_want >= new_locks_want);
-
- path->locks_want = new_locks_want;
-
- if (btree_path_get_locks(trans, path, true))
- return true;
-
- /*
- * XXX: this is ugly - we'd prefer to not be mucking with other
- * iterators in the btree_trans here.
- *
- * On failure to upgrade the iterator, setting iter->locks_want and
- * calling get_locks() is sufficient to make bch2_btree_path_traverse()
- * get the locks we want on transaction restart.
- *
- * But if this iterator was a clone, on transaction restart what we did
- * to this iterator isn't going to be preserved.
- *
- * Possibly we could add an iterator field for the parent iterator when
- * an iterator is a copy - for now, we'll just upgrade any other
- * iterators with the same btree id.
- *
- * The code below used to be needed to ensure ancestor nodes get locked
- * before interior nodes - now that's handled by
- * bch2_btree_path_traverse_all().
- */
- trans_for_each_path(trans, linked)
- if (linked != path &&
- linked->cached == path->cached &&
- linked->btree_id == path->btree_id &&
- linked->locks_want < new_locks_want) {
- linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true);
- }
-
- return false;
-}
-
-void __bch2_btree_path_downgrade(struct btree_path *path,
- unsigned new_locks_want)
-{
- unsigned l;
-
- EBUG_ON(path->locks_want < new_locks_want);
-
- path->locks_want = new_locks_want;
-
- while (path->nodes_locked &&
- (l = __fls(path->nodes_locked)) >= path->locks_want) {
- if (l > path->level) {
- btree_node_unlock(path, l);
- } else {
- if (btree_node_intent_locked(path, l)) {
- six_lock_downgrade(&path->l[l].b->c.lock);
- path->nodes_intent_locked ^= 1 << l;
- }
- break;
- }
- }
-
- bch2_btree_path_verify_locks(path);
-}
-
-void bch2_trans_downgrade(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- bch2_btree_path_downgrade(path);
-}
-
-/* Btree transaction locking: */
-
-bool bch2_trans_relock(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- if (unlikely(trans->restarted))
- return false;
-
- trans_for_each_path(trans, path)
- if (path->should_be_locked &&
- !bch2_btree_path_relock(trans, path, _RET_IP_)) {
- trace_trans_restart_relock(trans->fn, _RET_IP_,
- path->btree_id, &path->pos);
- BUG_ON(!trans->restarted);
- return false;
- }
- return true;
-}
-
-void bch2_trans_unlock(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- __bch2_btree_path_unlock(path);
-
- BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
-}
-
/* Btree iterator: */
#ifdef CONFIG_BCACHEFS_DEBUG
bkey_cmp(ck->key.pos, path->pos));
if (!locked)
- btree_node_unlock(path, 0);
+ btree_node_unlock(trans, path, 0);
}
static void bch2_btree_path_verify_level(struct btree_trans *trans,
struct btree_node_iter tmp;
bool locked;
struct bkey_packed *p, *k;
- char buf1[100], buf2[100], buf3[100];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ struct printbuf buf3 = PRINTBUF;
const char *msg;
if (!bch2_debug_check_iterators)
if (!btree_path_node(path, level))
return;
- if (!bch2_btree_node_relock(trans, path, level))
+ if (!bch2_btree_node_relock_notrace(trans, path, level))
return;
BUG_ON(!btree_path_pos_in_node(path, l->b));
}
if (!locked)
- btree_node_unlock(path, level);
+ btree_node_unlock(trans, path, level);
return;
err:
- strcpy(buf2, "(none)");
- strcpy(buf3, "(none)");
-
- bch2_bpos_to_text(&PBUF(buf1), path->pos);
+ bch2_bpos_to_text(&buf1, path->pos);
if (p) {
struct bkey uk = bkey_unpack_key(l->b, p);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
+ bch2_bkey_to_text(&buf2, &uk);
+ } else {
+ prt_printf(&buf2, "(none)");
}
if (k) {
struct bkey uk = bkey_unpack_key(l->b, k);
- bch2_bkey_to_text(&PBUF(buf3), &uk);
+ bch2_bkey_to_text(&buf3, &uk);
+ } else {
+ prt_printf(&buf3, "(none)");
}
panic("path should be %s key at level %u:\n"
"path pos %s\n"
"prev key %s\n"
"cur key %s\n",
- msg, level, buf1, buf2, buf3);
+ msg, level, buf1.buf, buf2.buf, buf3.buf);
}
static void bch2_btree_path_verify(struct btree_trans *trans,
if (!bkey_cmp(prev.k->p, k.k->p) &&
bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
prev.k->p.snapshot) > 0) {
- char buf1[100], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- bch2_bkey_to_text(&PBUF(buf1), k.k);
- bch2_bkey_to_text(&PBUF(buf2), prev.k);
+ bch2_bkey_to_text(&buf1, k.k);
+ bch2_bkey_to_text(&buf2, prev.k);
panic("iter snap %u\n"
"k %s\n"
"prev %s\n",
iter->snapshot,
- buf1, buf2);
+ buf1.buf, buf2.buf);
}
out:
	bch2_trans_iter_exit(trans, &copy);
{
struct btree_path *path;
unsigned idx;
- char buf[100];
+ struct printbuf buf = PRINTBUF;
trans_for_each_path_inorder(trans, path, idx) {
int cmp = cmp_int(path->btree_id, id) ?:
if (cmp < 0)
continue;
- if (!(path->nodes_locked & 1) ||
+ if (!btree_node_locked(path, 0) ||
!path->should_be_locked)
continue;
}
bch2_dump_trans_paths_updates(trans);
+ bch2_bpos_to_text(&buf, pos);
+
panic("not locked: %s %s%s\n",
- bch2_btree_ids[id],
- (bch2_bpos_to_text(&PBUF(buf), pos), buf),
+ bch2_btree_ids[id], buf.buf,
key_cache ? " cached" : "");
}
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c,
+static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
struct bkey *u)
{
- struct bkey_s_c k = __btree_iter_unpack(c, l, u,
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
bch2_btree_node_iter_peek(&l->iter, l->b));
path->pos = k.k ? k.k->p : l->b->key.k.p;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
return k;
}
-static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c,
+static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
struct bkey *u)
{
- struct bkey_s_c k = __btree_iter_unpack(c, l, u,
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
bch2_btree_node_iter_prev(&l->iter, l->b));
path->pos = k.k ? k.k->p : l->b->data->min_key;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
return k;
}
return true;
}
-/*
- * Verify that iterator for parent node points to child node:
- */
-static void btree_path_verify_new_node(struct btree_trans *trans,
- struct btree_path *path, struct btree *b)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l;
- unsigned plevel;
- bool parent_locked;
- struct bkey_packed *k;
-
- if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- return;
-
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- return;
-
- plevel = b->c.level + 1;
- if (!btree_path_node(path, plevel))
- return;
-
- parent_locked = btree_node_locked(path, plevel);
-
- if (!bch2_btree_node_relock(trans, path, plevel))
- return;
-
- l = &path->l[plevel];
- k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- if (!k ||
- bkey_deleted(k) ||
- bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
- char buf1[100];
- char buf2[100];
- char buf3[100];
- char buf4[100];
- struct bkey uk = bkey_unpack_key(b, k);
-
- bch2_dump_btree_node(c, l->b);
- bch2_bpos_to_text(&PBUF(buf1), path->pos);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
- bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
- bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
- panic("parent iter doesn't point to new node:\n"
- "iter pos %s %s\n"
- "iter key %s\n"
- "new node %s-%s\n",
- bch2_btree_ids[path->btree_id], buf1,
- buf2, buf3, buf4);
- }
-
- if (!parent_locked)
- btree_node_unlock(path, plevel);
-}
-
static inline void __btree_path_level_init(struct btree_path *path,
unsigned level)
{
bch2_btree_node_iter_peek(&l->iter, l->b);
}
-static inline void btree_path_level_init(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+inline void bch2_btree_path_level_init(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
BUG_ON(path->cached);
- btree_path_verify_new_node(trans, path, b);
-
EBUG_ON(!btree_path_pos_in_node(path, b));
EBUG_ON(b->c.lock.state.seq & 1);
struct btree_path *path;
trans_for_each_path(trans, path)
- if (!path->cached &&
+ if (path->uptodate == BTREE_ITER_UPTODATE &&
+ !path->cached &&
btree_path_pos_in_node(path, b)) {
enum btree_node_locked_type t =
btree_lock_want(path, b->c.level);
- if (path->nodes_locked &&
- t != BTREE_NODE_UNLOCKED) {
- btree_node_unlock(path, b->c.level);
+ if (t != BTREE_NODE_UNLOCKED) {
+ btree_node_unlock(trans, path, b->c.level);
six_lock_increment(&b->c.lock, t);
- mark_btree_node_locked(path, b->c.level, t);
+ mark_btree_node_locked(trans, path, b->c.level, t);
}
- btree_path_level_init(trans, path, b);
+ bch2_btree_path_level_init(trans, path, b);
}
}
/* Btree path: traverse, set_pos: */
-static int lock_root_check_fn(struct six_lock *lock, void *p)
-{
- struct btree *b = container_of(lock, struct btree, c.lock);
- struct btree **rootp = p;
-
- return b == *rootp ? 0 : -1;
-}
-
static inline int btree_path_lock_root(struct btree_trans *trans,
struct btree_path *path,
unsigned depth_want,
struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
enum six_lock_type lock_type;
unsigned i;
+ int ret;
EBUG_ON(path->nodes_locked);
}
lock_type = __btree_lock_want(path, path->level);
- if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX,
- path->level, lock_type,
- lock_root_check_fn, rootp,
- trace_ip))) {
- if (trans->restarted)
- return -EINTR;
- continue;
+ ret = btree_node_lock(trans, path, &b->c,
+ path->level, lock_type, trace_ip);
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
+ continue;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+ BUG();
}
if (likely(b == READ_ONCE(*rootp) &&
b->c.level == path->level &&
!race_fault())) {
for (i = 0; i < path->level; i++)
- path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
path->l[path->level].b = b;
for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
path->l[i].b = NULL;
- mark_btree_node_locked(path, path->level, lock_type);
- btree_path_level_init(trans, path, b);
+ mark_btree_node_locked(trans, path, path->level, lock_type);
+ bch2_btree_path_level_init(trans, path, b);
return 0;
}
bch2_bkey_buf_init(&tmp);
- while (nr && !ret) {
+ while (nr-- && !ret) {
if (!bch2_btree_node_relock(trans, path, path->level))
break;
}
if (!was_locked)
- btree_node_unlock(path, path->level);
+ btree_node_unlock(trans, path, path->level);
bch2_bkey_buf_exit(&tmp, c);
return ret;
bch2_bkey_buf_init(&tmp);
- while (nr && !ret) {
+ while (nr-- && !ret) {
if (!bch2_btree_node_relock(trans, path, path->level))
break;
}
if (!was_locked)
- btree_node_unlock(path, path->level);
+ btree_node_unlock(trans, path, path->level);
bch2_bkey_buf_exit(&tmp, c);
return ret;
bp->mem_ptr = (unsigned long)b;
if (!locked)
- btree_node_unlock(path, plevel);
+ btree_node_unlock(trans, path, plevel);
}
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
if (unlikely(ret))
goto err;
- mark_btree_node_locked(path, level, lock_type);
- btree_path_level_init(trans, path, b);
-
if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
unlikely(b != btree_node_mem_ptr(tmp.k)))
btree_node_mem_ptr_set(trans, path, level + 1, b);
if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(path, level + 1);
+ btree_node_unlock(trans, path, level + 1);
+
+ mark_btree_node_locked(trans, path, level, lock_type);
path->level = level;
+ bch2_btree_path_level_init(trans, path, b);
bch2_btree_path_verify_locks(path);
err:
static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
unsigned, unsigned long);
-static int __btree_path_traverse_all(struct btree_trans *trans, int ret,
- unsigned long trace_ip)
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_path *path;
- int i;
+ unsigned long trace_ip = _RET_IP_;
+ int ret = 0;
if (trans->in_traverse_all)
- return -EINTR;
+ return -BCH_ERR_transaction_restart_in_traverse_all;
trans->in_traverse_all = true;
retry_all:
- trans->restarted = false;
+ trans->restarted = 0;
+ trans->traverse_all_idx = U8_MAX;
trans_for_each_path(trans, path)
path->should_be_locked = false;
btree_trans_verify_sorted(trans);
- for (i = trans->nr_sorted - 2; i >= 0; --i) {
- struct btree_path *path1 = trans->paths + trans->sorted[i];
- struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
-
- if (path1->btree_id == path2->btree_id &&
- path1->locks_want < path2->locks_want)
- __bch2_btree_path_upgrade(trans, path1, path2->locks_want);
- else if (!path1->locks_want && path2->locks_want)
- __bch2_btree_path_upgrade(trans, path1, 1);
- }
-
bch2_trans_unlock(trans);
cond_resched();
- if (unlikely(ret == -ENOMEM)) {
+ if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
closure_init_stack(&cl);
} while (ret);
}
- if (unlikely(ret == -EIO))
- goto out;
-
- BUG_ON(ret && ret != -EINTR);
-
/* Now, redo traversals in correct order: */
- i = 0;
- while (i < trans->nr_sorted) {
- path = trans->paths + trans->sorted[i];
+ trans->traverse_all_idx = 0;
+ while (trans->traverse_all_idx < trans->nr_sorted) {
+ path = trans->paths + trans->sorted[trans->traverse_all_idx];
/*
* Traversing a path can cause another path to be added at about
*/
if (path->uptodate) {
ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
- if (ret)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ ret == -ENOMEM)
goto retry_all;
+ if (ret)
+ goto err;
+ BUG_ON(path->uptodate);
} else {
- i++;
+ trans->traverse_all_idx++;
}
}
*/
trans_for_each_path(trans, path)
BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
-out:
+err:
bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false;
- trace_trans_traverse_all(trans->fn, trace_ip);
+ trace_and_count(c, trans_traverse_all, trans, trace_ip);
return ret;
}
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
+static inline bool btree_path_check_pos_in_node(struct btree_path *path,
+ unsigned l, int check_pos)
{
- return __btree_path_traverse_all(trans, 0, _RET_IP_);
+ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
+ return false;
+ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
+ return false;
+ return true;
}
static inline bool btree_path_good_node(struct btree_trans *trans,
struct btree_path *path,
unsigned l, int check_pos)
{
- if (!is_btree_node(path, l) ||
- !bch2_btree_node_relock(trans, path, l))
- return false;
+ return is_btree_node(path, l) &&
+ bch2_btree_node_relock(trans, path, l) &&
+ btree_path_check_pos_in_node(path, l, check_pos);
+}
- if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
- return false;
- if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
- return false;
- return true;
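+/*
+ * Drop a path down to @new_level: release locks on the levels above it that
+ * are no longer wanted, and mark the path as needing traversal.
+ */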
+static void btree_path_set_level_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_level)
+{
+ unsigned l;
+
+ path->level = new_level;
+
+ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(trans, path, l);
+
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ bch2_btree_path_verify(trans, path);
}
-static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
- struct btree_path *path,
- int check_pos)
+static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
{
unsigned i, l = path->level;
-
+again:
while (btree_path_node(path, l) &&
- !btree_path_good_node(trans, path, l, check_pos)) {
- btree_node_unlock(path, l);
- path->l[l].b = BTREE_ITER_NO_NODE_UP;
- l++;
- }
+ !btree_path_good_node(trans, path, l, check_pos))
+ __btree_path_set_level_up(trans, path, l++);
/* If we need intent locks, take them too: */
for (i = l + 1;
i < path->locks_want && btree_path_node(path, i);
i++)
- if (!bch2_btree_node_relock(trans, path, i))
- while (l <= i) {
- btree_node_unlock(path, l);
- path->l[l].b = BTREE_ITER_NO_NODE_UP;
- l++;
- }
+ if (!bch2_btree_node_relock(trans, path, i)) {
+ while (l <= i)
+ __btree_path_set_level_up(trans, path, l++);
+ goto again;
+ }
return l;
}
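+/*
+ * Fast path: if the node at the current level is still locked and covers the
+ * search position, we're done; otherwise fall back to the slow path that
+ * walks up the path.
+ */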
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
+{
+ return likely(btree_node_locked(path, path->level) &&
+ btree_path_check_pos_in_node(path, path->level, check_pos))
+ ? path->level
+ : __btree_path_up_until_good_node(trans, path, check_pos);
+}
+
/*
* This is the main state machine for walking down the btree - walks down to a
* specified depth
unsigned long trace_ip)
{
unsigned depth_want = path->level;
- int ret = 0;
+ int ret = trans->restarted;
- if (unlikely(trans->restarted)) {
- ret = -EINTR;
+ if (unlikely(ret))
goto out;
- }
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
*/
if (path->should_be_locked) {
- ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR;
+ ret = bch2_btree_path_relock(trans, path, trace_ip);
goto out;
}
path->level = btree_path_up_until_good_node(trans, path, 0);
+ EBUG_ON(btree_path_node(path, path->level) &&
+ !btree_node_locked(path, path->level));
+
/*
* Note: path->nodes[path->level] may be temporarily NULL here - that
* would indicate to other code that we got to the end of the btree,
goto out;
}
- __bch2_btree_path_unlock(path);
+ __bch2_btree_path_unlock(trans, path);
path->level = depth_want;
-
- if (ret == -EIO)
- path->l[path->level].b =
- BTREE_ITER_NO_NODE_ERROR;
- else
- path->l[path->level].b =
- BTREE_ITER_NO_NODE_DOWN;
+ path->l[path->level].b = ERR_PTR(ret);
goto out;
}
}
path->uptodate = BTREE_ITER_UPTODATE;
out:
- BUG_ON((ret == -EINTR) != !!trans->restarted);
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
bch2_btree_path_verify(trans, path);
return ret;
}
-static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long);
-
int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
struct btree_path *path, unsigned flags)
{
+ if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+ unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U);
+ u64 mask = ~(~0ULL << restart_probability_bits);
+
+ if ((prandom_u32() & mask) == mask) {
+ trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
+ }
+ }
+
if (path->uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
struct btree_path *src)
{
- unsigned i;
+ unsigned i, offset = offsetof(struct btree_path, pos);
+ int cmp = btree_path_cmp(dst, src);
- memcpy(&dst->pos, &src->pos,
- sizeof(struct btree_path) - offsetof(struct btree_path, pos));
+ memcpy((void *) dst + offset,
+ (void *) src + offset,
+ sizeof(struct btree_path) - offset);
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_locked(dst, i))
- six_lock_increment(&dst->l[i].b->c.lock,
- __btree_lock_want(dst, i));
+ for (i = 0; i < BTREE_MAX_DEPTH; i++) {
+ unsigned t = btree_node_locked_type(dst, i);
- btree_path_check_sort(trans, dst, 0);
+ if (t != BTREE_NODE_UNLOCKED)
+ six_lock_increment(&dst->l[i].b->c.lock, t);
+ }
+
+ if (cmp)
+ bch2_btree_path_check_sort_fast(trans, dst, cmp);
}
static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
return new;
}
-inline struct btree_path * __must_check
-bch2_btree_path_make_mut(struct btree_trans *trans,
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
struct btree_path *path, bool intent,
unsigned long ip)
{
btree_trans_verify_sorted(trans);
}
+ path->should_be_locked = false;
return path;
}
path = bch2_btree_path_make_mut(trans, path, intent, ip);
- path->pos = new_pos;
- path->should_be_locked = false;
+ path->pos = new_pos;
- btree_path_check_sort(trans, path, cmp);
+ bch2_btree_path_check_sort_fast(trans, path, cmp);
if (unlikely(path->cached)) {
- btree_node_unlock(path, 0);
- path->l[0].b = BTREE_ITER_NO_NODE_CACHED;
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
goto out;
}
l = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, l)) {
+ BUG_ON(!btree_node_locked(path, l));
/*
* We might have to skip over many keys, or just a few: try
* advancing the node iterator, and if we have to skip over too
__btree_path_level_init(path, l);
}
- if (l != path->level) {
+ if (unlikely(l != path->level)) {
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- __bch2_btree_path_unlock(path);
+ __bch2_btree_path_unlock(trans, path);
}
out:
bch2_btree_path_verify(trans, path);
static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
{
- struct btree_path *next;
+ struct btree_path *sib;
- next = prev_btree_path(trans, path);
- if (next && !btree_path_cmp(next, path))
- return next;
+ sib = prev_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
- next = next_btree_path(trans, path);
- if (next && !btree_path_cmp(next, path))
- return next;
+ sib = next_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
return NULL;
}
static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
{
- struct btree_path *next;
+ struct btree_path *sib;
- next = prev_btree_path(trans, path);
- if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
- return next;
+ sib = prev_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
- next = next_btree_path(trans, path);
- if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
- return next;
+ sib = next_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
return NULL;
}
static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
{
- __bch2_btree_path_unlock(path);
+ __bch2_btree_path_unlock(trans, path);
btree_path_list_remove(trans, path);
trans->paths_allocated &= ~(1ULL << path->idx);
}
if (!__btree_path_put(path, intent))
return;
- /*
- * Perhaps instead we should check for duplicate paths in traverse_all:
- */
- if (path->preserve &&
- (dup = have_path_at_pos(trans, path))) {
- dup->preserve = true;
- path->preserve = false;
- goto free;
- }
+ dup = path->preserve
+ ? have_path_at_pos(trans, path)
+ : have_node_at_pos(trans, path);
+
+ if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+ return;
- if (!path->preserve &&
- (dup = have_node_at_pos(trans, path)))
- goto free;
- return;
-free:
if (path->should_be_locked &&
- !btree_node_locked(dup, path->level))
+ !trans->restarted &&
+ (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
return;
- dup->should_be_locked |= path->should_be_locked;
- __bch2_path_free(trans, path);
+ if (dup) {
+ dup->preserve |= path->preserve;
+ dup->should_be_locked |= path->should_be_locked;
+ }
+
+ __bch2_path_free(trans, path);
+}
+
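+/*
+ * Free a path without trying to preserve it or hand its state off to a
+ * duplicate path at the same position - used for internal paths such as
+ * iter->update_path that we never want to keep around.
+ */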
+static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
+ bool intent)
+{
+ EBUG_ON(trans->paths + path->idx != path);
+ EBUG_ON(!path->ref);
+
+ if (!__btree_path_put(path, intent))
+ return;
+
+ __bch2_path_free(trans, path);
+}
+
+void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ prt_printf(buf, "transaction updates for %s journal seq %llu",
+ trans->fn, trans->journal_res.seq);
+ prt_newline(buf);
+ printbuf_indent_add(buf, 2);
+
+ trans_for_each_update(trans, i) {
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+
+ prt_printf(buf, "update: btree=%s cached=%u %pS",
+ bch2_btree_ids[i->btree_id],
+ i->cached,
+ (void *) i->ip_allocated);
+ prt_newline(buf);
+
+ prt_printf(buf, " old ");
+ bch2_bkey_val_to_text(buf, trans->c, old);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
+ prt_newline(buf);
+ }
+
+ printbuf_indent_sub(buf, 2);
+}
+
+noinline __cold
+void bch2_dump_trans_updates(struct btree_trans *trans)
+{
+ struct printbuf buf = PRINTBUF;
+
+ bch2_trans_updates_to_text(&buf, trans);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
+{
+ prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
+ path->idx, path->ref, path->intent_ref,
+ path->preserve ? 'P' : ' ',
+ path->should_be_locked ? 'S' : ' ',
+ bch2_btree_ids[path->btree_id],
+ path->level);
+ bch2_bpos_to_text(out, path->pos);
+
+ prt_printf(out, " locks %u", path->nodes_locked);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ prt_printf(out, " %pS", (void *) path->ip_allocated);
+#endif
+ prt_newline(out);
+}
+
+void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned idx;
+
+ trans_for_each_path_inorder(trans, path, idx)
+ bch2_btree_path_to_text(out, path);
}
noinline __cold
void bch2_dump_trans_paths_updates(struct btree_trans *trans)
{
- struct btree_path *path;
- struct btree_insert_entry *i;
- unsigned idx;
- char buf1[300], buf2[300];
+ struct printbuf buf = PRINTBUF;
- btree_trans_verify_sorted(trans);
+ bch2_trans_paths_to_text(&buf, trans);
+ bch2_trans_updates_to_text(&buf, trans);
- trans_for_each_path_inorder(trans, path, idx)
- printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n",
- path->idx, path->ref, path->intent_ref,
- path->should_be_locked ? " S" : "",
- path->preserve ? " P" : "",
- bch2_btree_ids[path->btree_id],
- (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1),
- path->nodes_locked,
-#ifdef CONFIG_BCACHEFS_DEBUG
- (void *) path->ip_allocated
-#else
- NULL
-#endif
- );
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
- trans_for_each_update(trans, i) {
- struct bkey u;
- struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u);
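+/*
+ * Record a new maximum for the number of paths used by this transaction,
+ * saving a textual dump of the paths in the per-transaction-fn stats.
+ */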
+noinline
+static void bch2_trans_update_max_paths(struct btree_trans *trans)
+{
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ struct printbuf buf = PRINTBUF;
- printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s",
- bch2_btree_ids[i->btree_id],
- (void *) i->ip_allocated,
- (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1),
- (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2));
+ bch2_trans_paths_to_text(&buf, trans);
+
+ if (!buf.allocation_failure) {
+ mutex_lock(&s->lock);
+ if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
+ s->nr_max_paths = trans->nr_max_paths =
+ hweight64(trans->paths_allocated);
+ swap(s->max_paths_text, buf.buf);
+ }
+ mutex_unlock(&s->lock);
}
+
+ printbuf_exit(&buf);
+}
+
+static noinline void btree_path_overflow(struct btree_trans *trans)
+{
+ bch2_dump_trans_paths_updates(trans);
+ panic("trans path oveflow\n");
}
-static struct btree_path *btree_path_alloc(struct btree_trans *trans,
- struct btree_path *pos)
+static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
+ struct btree_path *pos)
{
struct btree_path *path;
unsigned idx;
if (unlikely(trans->paths_allocated ==
- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) {
- bch2_dump_trans_paths_updates(trans);
- panic("trans path oveflow\n");
- }
+ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+ btree_path_overflow(trans);
idx = __ffs64(~trans->paths_allocated);
trans->paths_allocated |= 1ULL << idx;
+ if (unlikely(idx > trans->nr_max_paths))
+ bch2_trans_update_max_paths(trans);
+
path = &trans->paths[idx];
path->idx = idx;
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
- path->nodes_intent_locked = 0;
btree_path_list_add(trans, pos, path);
return path;
int i;
BUG_ON(trans->restarted);
+ btree_trans_verify_sorted(trans);
+ bch2_trans_verify_locks(trans);
trans_for_each_path_inorder(trans, path, i) {
if (__btree_path_cmp(path,
path->level = level;
path->locks_want = locks_want;
path->nodes_locked = 0;
- path->nodes_intent_locked = 0;
for (i = 0; i < ARRAY_SIZE(path->l); i++)
- path->l[i].b = BTREE_ITER_NO_NODE_INIT;
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef CONFIG_BCACHEFS_DEBUG
path->ip_allocated = ip;
#endif
*/
locks_want = min(locks_want, BTREE_MAX_DEPTH);
- if (locks_want > path->locks_want) {
- path->locks_want = locks_want;
- btree_path_get_locks(trans, path, true);
- }
+ if (locks_want > path->locks_want)
+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
return path;
}
inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
{
+ struct btree_path_level *l = path_l(path);
+ struct bkey_packed *_k;
struct bkey_s_c k;
- if (!path->cached) {
- struct btree_path_level *l = path_l(path);
- struct bkey_packed *_k;
+ if (unlikely(!l->b))
+ return bkey_s_c_null;
- EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+ EBUG_ON(!btree_node_locked(path, path->level));
+ if (!path->cached) {
_k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
EBUG_ON(ck &&
(path->btree_id != ck->key.btree_id ||
bkey_cmp(path->pos, ck->key.pos)));
+ EBUG_ON(!ck || !ck->valid);
- /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */
- if (unlikely(!ck || !ck->valid))
- return bkey_s_c_null;
-
- EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
-
+ *u = ck->k->k;
k = bkey_i_to_s_c(ck->k);
}
if (ret)
return ret;
- iter->path->should_be_locked = true;
+ btree_path_set_should_be_locked(iter->path);
return 0;
}
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- iter->path->should_be_locked = true;
- BUG_ON(iter->path->uptodate);
+ btree_path_set_should_be_locked(iter->path);
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
struct btree_trans *trans = iter->trans;
struct btree_path *path = iter->path;
struct btree *b = NULL;
- unsigned l;
int ret;
BUG_ON(trans->restarted);
/* got to end? */
if (!btree_path_node(path, path->level + 1)) {
- btree_node_unlock(path, path->level);
- path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
- path->level++;
+ btree_path_set_level_up(trans, path);
return NULL;
}
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
- __bch2_btree_path_unlock(path);
- path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS;
- path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS;
- trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_,
- path->btree_id, &path->pos);
- btree_trans_restart(trans);
- ret = -EINTR;
+ __bch2_btree_path_unlock(trans, path);
+ path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
goto err;
}
b = btree_path_node(path, path->level + 1);
if (!bpos_cmp(iter->pos, b->key.k.p)) {
- btree_node_unlock(path, path->level);
- path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
- path->level++;
+ __btree_path_set_level_up(trans, path, path->level++);
} else {
/*
* Haven't gotten to the end of the parent node: go back down to
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- path->level = iter->min_depth;
-
- for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
- if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(path, l);
-
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- bch2_btree_iter_verify(iter);
+ btree_path_set_level_down(trans, path, iter->min_depth);
ret = bch2_btree_path_traverse(trans, path, iter->flags);
if (ret)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- iter->path->should_be_locked = true;
+ btree_path_set_should_be_locked(iter->path);
BUG_ON(iter->path->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
- struct bpos pos = iter->k.p;
- bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
- ? bpos_cmp(pos, SPOS_MAX)
- : bkey_cmp(pos, SPOS_MAX)) != 0;
+ if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
+ struct bpos pos = iter->k.p;
+ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_cmp(pos, SPOS_MAX)
+ : bkey_cmp(pos, SPOS_MAX)) != 0;
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
- return ret;
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
+ } else {
+ if (!btree_path_node(iter->path, iter->path->level))
+ return true;
+
+ iter->advanced = true;
+ return false;
+ }
}
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
struct bpos pos)
{
struct btree_insert_entry *i;
+ struct bkey_i *ret = NULL;
- trans_for_each_update(trans, i)
- if ((cmp_int(btree_id, i->btree_id) ?:
- bpos_cmp(pos, i->k->k.p)) <= 0) {
- if (btree_id == i->btree_id)
- return i->k;
+ trans_for_each_update(trans, i) {
+ if (i->btree_id < btree_id)
+ continue;
+ if (i->btree_id > btree_id)
break;
- }
+ if (bpos_cmp(i->k->k.p, pos) < 0)
+ continue;
+ if (i->key_cache_already_flushed)
+ continue;
+ if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
+ ret = i->k;
+ }
- return NULL;
+ return ret;
}
-static noinline
-struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
- struct btree_path *path)
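+/*
+ * Peek the next journal key (not yet replayed into the btree) for this
+ * iterator's btree within [start_pos, end_pos], caching the search index in
+ * the iterator.
+ */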
+struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos start_pos,
+ struct bpos end_pos)
{
- struct journal_keys *keys = &trans->c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, path->btree_id,
- path->level, path->pos);
+ struct bkey_i *k;
+
+ if (bpos_cmp(start_pos, iter->journal_pos) < 0)
+ iter->journal_idx = 0;
+
+ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0,
+ start_pos, end_pos,
+ &iter->journal_idx);
- while (idx < keys->nr && keys->d[idx].overwritten)
- idx++;
+ iter->journal_pos = k ? k->k.p : end_pos;
+ return k;
+}
- return (idx < keys->nr &&
- keys->d[idx].btree_id == path->btree_id &&
- keys->d[idx].level == path->level)
- ? keys->d[idx].k
- : NULL;
+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
+{
+ return bch2_btree_journal_peek(trans, iter, pos, pos);
}
static noinline
struct bkey_s_c k)
{
struct bkey_i *next_journal =
- __btree_trans_peek_journal(trans, iter->path);
+ bch2_btree_journal_peek(trans, iter, iter->path->pos,
+ k.k ? k.k->p : iter->path->l[0].b->key.k.p);
- if (next_journal &&
- bpos_cmp(next_journal->k.p,
- k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ if (next_journal) {
iter->k = next_journal->k;
k = bkey_i_to_s_c(next_journal);
}
* bkey_s_c_null:
*/
static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
{
struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
if (unlikely(ret))
return bkey_s_c_err(ret);
- iter->key_cache_path->should_be_locked = true;
+ btree_path_set_should_be_locked(iter->key_cache_path);
return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
}
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+ struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos);
+ int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_);
+
+ return err ? bkey_s_c_err(err) : ret;
+}
+
static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
struct bkey_s_c k, k2;
int ret;
- EBUG_ON(iter->path->cached || iter->path->level);
+ EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
while (1) {
+ struct btree_path_level *l;
+
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
goto out;
}
- iter->path->should_be_locked = true;
+ l = path_l(iter->path);
+
+ if (unlikely(!l->b)) {
+ /* No btree nodes at requested level: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
+ }
+
+ btree_path_set_should_be_locked(iter->path);
- k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
+ k = btree_path_level_peek_all(trans->c, l, &iter->k);
if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
k.k &&
(k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
- ret = bkey_err(k2);
+ k = k2;
+ ret = bkey_err(k);
if (ret) {
- k = k2;
bch2_btree_iter_set_pos(iter, iter->pos);
goto out;
}
-
- k = k2;
- iter->k = *k.k;
}
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
: NULL;
if (next_update &&
bpos_cmp(next_update->k.p,
- k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ k.k ? k.k->p : l->b->key.k.p) <= 0) {
iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
}
if (likely(k.k)) {
break;
- } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
+ } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) {
/* Advance to next leaf node: */
- search_key = bpos_successor(iter->path->l[0].b->key.k.p);
+ search_key = bpos_successor(l->b->key.k.p);
} else {
/* End of btree: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
* bch2_btree_iter_peek: returns first key greater than or equal to iterator's
* current position
*/
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
+ struct bpos iter_pos;
int ret;
+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+
if (iter->update_path) {
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
iter->update_path = NULL;
}
while (1) {
k = __bch2_btree_iter_peek(iter, search_key);
if (!k.k || bkey_err(k))
- goto out;
+ goto out_no_locked;
+
+ /*
+ * iter->pos should be monotonically increasing, and always be
+ * equal to the key we just returned - except extents can
+ * straddle iter->pos:
+ */
+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+ iter_pos = k.k->p;
+ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ iter_pos = bkey_start_pos(k.k);
+ else
+ iter_pos = iter->pos;
+
+ if (bkey_cmp(iter_pos, end) > 0) {
+ bch2_btree_iter_set_pos(iter, end);
+ k = bkey_s_c_null;
+ goto out_no_locked;
+ }
if (iter->update_path &&
bkey_cmp(iter->update_path->pos, k.k->p)) {
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
iter->update_path = NULL;
}
iter->update_path = bch2_btree_path_set_pos(trans,
iter->update_path, pos,
iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
-
- BUG_ON(!(iter->update_path->nodes_locked & 1));
- iter->update_path->should_be_locked = true;
+ _THIS_IP_);
}
/*
break;
}
- /*
- * iter->pos should be mononotically increasing, and always be equal to
- * the key we just returned - except extents can straddle iter->pos:
- */
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
- iter->pos = k.k->p;
- else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
- iter->pos = bkey_start_pos(k.k);
+ iter->pos = iter_pos;
iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- BUG_ON(!iter->path->nodes_locked);
-out:
+
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
if (iter->update_path) {
- BUG_ON(!(iter->update_path->nodes_locked & 1));
- iter->update_path->should_be_locked = true;
+ if (iter->update_path->uptodate &&
+ (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)))
+ k = bkey_s_c_err(ret);
+ else
+ btree_path_set_should_be_locked(iter->update_path);
}
- iter->path->should_be_locked = true;
if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
iter->pos.snapshot = iter->snapshot;
return k;
}
+/**
+ * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
+ * to iterator's current position, returning keys from every level of the btree.
+ * For keys at different levels of the btree that compare equal, the key from
+ * the lower level (leaf) is returned first.
+ */
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bkey_s_c k;
+ int ret;
+
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+ BUG_ON(iter->path->level < iter->min_depth);
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+ EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
+
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ /* Already at end? */
+ if (!btree_path_node(iter->path, iter->path->level)) {
+ k = bkey_s_c_null;
+ goto out_no_locked;
+ }
+
+ k = btree_path_level_peek_all(trans->c,
+ &iter->path->l[iter->path->level], &iter->k);
+
+ /* Check if we should go up to the parent node: */
+ if (!k.k ||
+ (iter->advanced &&
+ !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) {
+ iter->pos = path_l(iter->path)->b->key.k.p;
+ btree_path_set_level_up(trans, iter->path);
+ iter->advanced = false;
+ continue;
+ }
+
+ /*
+ * Check if we should go back down to a leaf:
+ * If we're not in a leaf node, we only return the current key
+ * if it exactly matches iter->pos - otherwise we first have to
+ * go back to the leaf:
+ */
+ if (iter->path->level != iter->min_depth &&
+ (iter->advanced ||
+ !k.k ||
+ bpos_cmp(iter->pos, k.k->p))) {
+ btree_path_set_level_down(trans, iter->path, iter->min_depth);
+ iter->pos = bpos_successor(iter->pos);
+ iter->advanced = false;
+ continue;
+ }
+
+ /* Check if we should go to the next key: */
+ if (iter->path->level == iter->min_depth &&
+ iter->advanced &&
+ k.k &&
+ !bpos_cmp(iter->pos, k.k->p)) {
+ iter->pos = bpos_successor(iter->pos);
+ iter->advanced = false;
+ continue;
+ }
+
+ if (iter->advanced &&
+ iter->path->level == iter->min_depth &&
+ bpos_cmp(k.k->p, iter->pos))
+ iter->advanced = false;
+
+ BUG_ON(iter->advanced);
+ BUG_ON(!k.k);
+ break;
+ }
+
+ iter->pos = k.k->p;
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ bch2_btree_iter_verify(iter);
+
+ return k;
+}
+
/**
* bch2_btree_iter_next: returns first key greater than iterator's current
* position
/* ensure that iter->k is consistent with iter->pos: */
bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
- goto out;
+ goto out_no_locked;
}
- k = btree_path_level_peek(trans->c, iter->path,
+ k = btree_path_level_peek(trans, iter->path,
&iter->path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_IS_EXTENTS)
? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
: bpos_cmp(k.k->p, search_key) > 0))
- k = btree_path_level_prev(trans->c, iter->path,
+ k = btree_path_level_prev(trans, iter->path,
&iter->path->l[0], &iter->k);
- btree_path_check_sort(trans, iter->path, 0);
+ bch2_btree_path_check_sort(trans, iter->path, 0);
if (likely(k.k)) {
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
* that candidate
*/
if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
- bch2_path_put(trans, iter->path,
+ bch2_path_put_nokeep(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
iter->path = saved_path;
saved_path = NULL;
iter->snapshot,
k.k->p.snapshot)) {
if (saved_path)
- bch2_path_put(trans, saved_path,
+ bch2_path_put_nokeep(trans, saved_path,
iter->flags & BTREE_ITER_INTENT);
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
/* Start of btree: */
bch2_btree_iter_set_pos(iter, POS_MIN);
k = bkey_s_c_null;
- goto out;
+ goto out_no_locked;
}
}
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
iter->pos.snapshot = iter->snapshot;
-out:
+
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
if (saved_path)
- bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
- iter->path->should_be_locked = true;
+ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
struct bkey_s_c k;
int ret;
- EBUG_ON(iter->path->level);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+ EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
if ((iter->flags & BTREE_ITER_CACHED) ||
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
}
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
- (next_update = __btree_trans_peek_journal(trans, iter->path)) &&
- !bpos_cmp(next_update->k.p, iter->pos)) {
+ (next_update = bch2_btree_journal_peek_slot(trans,
+ iter, iter->pos))) {
iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
goto out;
}
if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
- (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) {
if (!bkey_err(k))
iter->k = *k.k;
- goto out;
+ /* We're not returning a key from iter->path: */
+ goto out_no_locked;
}
k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ if (unlikely(!k.k))
+ goto out_no_locked;
} else {
struct bpos next;
+ EBUG_ON(iter->path->level);
+
if (iter->flags & BTREE_ITER_INTENT) {
struct btree_iter iter2;
+ struct bpos end = iter->pos;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ end.offset = U64_MAX;
bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek(&iter2);
+ k = bch2_btree_iter_peek_upto(&iter2, end);
if (k.k && !bkey_err(k)) {
iter->k = iter2.k;
struct bpos pos = iter->pos;
k = bch2_btree_iter_peek(iter);
- iter->pos = pos;
+ if (unlikely(bkey_err(k)))
+ bch2_btree_iter_set_pos(iter, pos);
+ else
+ iter->pos = pos;
}
if (unlikely(bkey_err(k)))
- return k;
+ goto out_no_locked;
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
}
}
out:
- iter->path->should_be_locked = true;
-
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
ret = bch2_btree_iter_verify_ret(iter, k);
struct btree_path *path, *prev = NULL;
unsigned i;
+ if (!bch2_debug_check_iterators)
+ return;
+
trans_for_each_path_inorder(trans, path, i) {
- BUG_ON(prev && btree_path_cmp(prev, path) > 0);
+ if (prev && btree_path_cmp(prev, path) > 0) {
+ bch2_dump_trans_paths_updates(trans);
+ panic("trans paths out of order!\n");
+ }
prev = path;
}
#endif
btree_path_verify_sorted_ref(trans, r);
}
-static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
- int cmp)
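+/*
+ * Fast version of bch2_btree_path_check_sort(): the path's position moved in
+ * a known direction (@cmp), so we only have to bubble it towards that end of
+ * the sorted list.
+ */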
+static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans,
+ struct btree_path *path,
+ int cmp)
+{
+ struct btree_path *n;
+ int cmp2;
+
+ EBUG_ON(!cmp);
+
+ while ((n = cmp < 0
+ ? prev_btree_path(trans, path)
+ : next_btree_path(trans, path)) &&
+ (cmp2 = btree_path_cmp(n, path)) &&
+ cmp2 != cmp)
+ btree_path_swap(trans, n, path);
+
+ btree_trans_verify_sorted(trans);
+}
+
+inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
+ int cmp)
{
struct btree_path *n;
path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
+ if (trans->in_traverse_all &&
+ trans->traverse_all_idx != U8_MAX &&
+ trans->traverse_all_idx >= path->sorted_idx)
+ trans->traverse_all_idx++;
+
array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
for (i = path->sorted_idx; i < trans->nr_sorted; i++)
bch2_path_put(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
if (iter->update_path)
- bch2_path_put(trans, iter->update_path,
+ bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
if (iter->key_cache_path)
bch2_path_put(trans, iter->key_cache_path,
iter->key_cache_path = NULL;
}
-static void __bch2_trans_iter_init(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags,
- unsigned long ip)
+static inline void __bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags,
+ unsigned long ip)
{
- EBUG_ON(trans->restarted);
+ if (unlikely(trans->restarted))
+ panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n",
+ bch2_err_str(trans->restarted),
+ (void *) trans->last_restarted_ip);
+
+ if (flags & BTREE_ITER_ALL_LEVELS)
+ flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
btree_node_type_is_extents(btree_id))
btree_type_has_snapshots(btree_id))
flags |= BTREE_ITER_FILTER_SNAPSHOTS;
- if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
+ if (trans->journal_replay_not_finished)
flags |= BTREE_ITER_WITH_JOURNAL;
- if (!btree_id_cached(trans->c, btree_id)) {
- flags &= ~BTREE_ITER_CACHED;
- flags &= ~BTREE_ITER_WITH_KEY_CACHE;
- } else if (!(flags & BTREE_ITER_CACHED))
- flags |= BTREE_ITER_WITH_KEY_CACHE;
-
iter->trans = trans;
iter->path = NULL;
iter->update_path = NULL;
iter->k.type = KEY_TYPE_deleted;
iter->k.p = pos;
iter->k.size = 0;
+ iter->journal_idx = 0;
+ iter->journal_pos = POS_MIN;
#ifdef CONFIG_BCACHEFS_DEBUG
iter->ip_allocated = ip;
#endif
unsigned btree_id, struct bpos pos,
unsigned flags)
{
+ if (!btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_CACHED;
+ flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ } else if (!(flags & BTREE_ITER_CACHED))
+ flags |= BTREE_ITER_WITH_KEY_CACHE;
+
__bch2_trans_iter_init(trans, iter, btree_id, pos,
0, 0, flags, _RET_IP_);
}
dst->key_cache_path = NULL;
}
-void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
- size_t new_top = trans->mem_top + size;
+ unsigned new_top = trans->mem_top + size;
+ size_t old_bytes = trans->mem_bytes;
+ size_t new_bytes = roundup_pow_of_two(new_top);
+ void *new_mem;
void *p;
- if (new_top > trans->mem_bytes) {
- size_t old_bytes = trans->mem_bytes;
- size_t new_bytes = roundup_pow_of_two(new_top);
- void *new_mem;
+ trans->mem_max = max(trans->mem_max, new_top);
- WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
- new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
- if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
- new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
- new_bytes = BTREE_TRANS_MEM_MAX;
- kfree(trans->mem);
- }
+ new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+ if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_bytes = BTREE_TRANS_MEM_MAX;
+ kfree(trans->mem);
+ }
- if (!new_mem)
- return ERR_PTR(-ENOMEM);
+ if (!new_mem)
+ return ERR_PTR(-ENOMEM);
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
- if (old_bytes) {
- trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes);
- btree_trans_restart(trans);
- return ERR_PTR(-EINTR);
- }
+ if (old_bytes) {
+ trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
p = trans->mem + trans->mem_top;
* bch2_trans_begin() - reset a transaction after a interrupted attempt
* @trans: transaction to reset
*
- * While iterating over nodes or updating nodes a attempt to lock a btree
- * node may return EINTR when the trylock fails. When this occurs
- * bch2_trans_begin() should be called and the transaction retried.
+ * While iterating over nodes or updating nodes an attempt to lock a btree node
+ * may return BCH_ERR_transaction_restart when the trylock fails. When this
+ * occurs bch2_trans_begin() should be called and the transaction retried.
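+ *
+ * Typical usage (sketch; some_btree_operation() stands in for any btree
+ * transaction body):
+ *
+ *	do {
+ *		bch2_trans_begin(trans);
+ *		ret = some_btree_operation(trans);
+ *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));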
*/
-void bch2_trans_begin(struct btree_trans *trans)
+u32 bch2_trans_begin(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
struct btree_path *path;
- trans_for_each_update(trans, i)
- __btree_path_put(i->path, true);
+ bch2_trans_reset_updates(trans);
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- trans->extra_journal_res = 0;
- trans->nr_updates = 0;
+ trans->restart_count++;
trans->mem_top = 0;
- trans->hooks = NULL;
- trans->extra_journal_entries = NULL;
- trans->extra_journal_entry_u64s = 0;
-
if (trans->fs_usage_deltas) {
trans->fs_usage_deltas->used = 0;
- memset(&trans->fs_usage_deltas->memset_start, 0,
+ memset((void *) trans->fs_usage_deltas +
+ offsetof(struct replicas_delta_list, memset_start), 0,
(void *) &trans->fs_usage_deltas->memset_end -
(void *) &trans->fs_usage_deltas->memset_start);
}
trans_for_each_path(trans, path) {
path->should_be_locked = false;
+ /*
+ * If the transaction wasn't restarted, we're presuming to be
+ * doing something new: don't keep iterators except the ones that
+ * are in use - except for the subvolumes btree:
+ */
+ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
+ path->preserve = false;
+
/*
* XXX: we probably shouldn't be doing this if the transaction
* was restarted, but currently we still overflow transaction
*/
if (!path->ref && !path->preserve)
__bch2_path_free(trans, path);
- else if (!path->ref)
+ else
path->preserve = false;
}
- bch2_trans_cond_resched(trans);
+ if (!trans->restarted &&
+ (need_resched() ||
+ local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+ bch2_trans_unlock(trans);
+ cond_resched();
+ bch2_trans_relock(trans);
+ }
+ trans->last_restarted_ip = _RET_IP_;
if (trans->restarted)
bch2_btree_path_traverse_all(trans);
- trans->restarted = false;
+ trans->last_begin_time = local_clock();
+ return trans->restart_count;
+}
+
+void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
+{
+ if (trans_was_restarted(trans, restart_count))
+ panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
+ trans->restart_count, restart_count,
+ (void *) trans->last_restarted_ip);
}
static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
BUG_ON(trans->used_mempool);
#ifdef __KERNEL__
- p = this_cpu_xchg(c->btree_paths_bufs->path , NULL);
+ p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
#endif
if (!p)
p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
trans->updates = p; p += updates_bytes;
}
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
- unsigned expected_nr_iters,
- size_t expected_mem_bytes,
- const char *fn)
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+
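+/*
+ * Map a transaction function name to a stable index into
+ * bch2_btree_transaction_fns[], registering it on first use; the index is
+ * used to key per-transaction-fn statistics.
+ */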
+unsigned bch2_trans_get_fn_idx(const char *fn)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+ if (!bch2_btree_transaction_fns[i] ||
+ bch2_btree_transaction_fns[i] == fn) {
+ bch2_btree_transaction_fns[i] = fn;
+ return i;
+ }
+
+ pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
+ return i;
+}
+
+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
__acquires(&c->btree_trans_barrier)
{
+ struct btree_transaction_stats *s;
+ struct btree_trans *pos;
+
BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
memset(trans, 0, sizeof(*trans));
trans->c = c;
- trans->fn = fn;
+ trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
+ ? bch2_btree_transaction_fns[fn_idx] : NULL;
+ trans->last_begin_time = local_clock();
+ trans->fn_idx = fn_idx;
+ trans->locking_wait.task = current;
+ trans->journal_replay_not_finished =
+ !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+ closure_init_stack(&trans->ref);
bch2_trans_alloc_paths(trans, c);
- if (expected_mem_bytes) {
- trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
- trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+ s = btree_trans_stats(trans);
+ if (s) {
+ unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
if (!unlikely(trans->mem)) {
trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
trans->mem_bytes = BTREE_TRANS_MEM_MAX;
+ } else {
+ trans->mem_bytes = expected_mem_bytes;
}
+
+ trans->nr_max_paths = s->nr_max_paths;
}
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->pid = current->pid;
mutex_lock(&c->btree_trans_lock);
- list_add(&trans->list, &c->btree_trans_list);
+ list_for_each_entry(pos, &c->btree_trans_list, list) {
+ if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+ list_add_tail(&trans->list, &pos->list);
+ goto list_add_done;
+ }
+ }
+ list_add_tail(&trans->list, &c->btree_trans_list);
+list_add_done:
mutex_unlock(&c->btree_trans_lock);
}
{
struct btree_insert_entry *i;
struct bch_fs *c = trans->c;
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
bch2_trans_unlock(trans);
+ closure_sync(&trans->ref);
+
+ if (s)
+ s->max_mem = max(s->max_mem, trans->mem_max);
+
trans_for_each_update(trans, i)
__btree_path_put(i->path, true);
trans->nr_updates = 0;
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+ kfree(trans->extra_journal_entries.data);
+
if (trans->fs_usage_deltas) {
if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
REPLICAS_DELTA_LIST_MAX)
}
static void __maybe_unused
-bch2_btree_path_node_to_text(struct printbuf *out,
- struct btree_bkey_cached_common *_b,
- bool cached)
+bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
+ struct btree_bkey_cached_common *b)
{
- pr_buf(out, " l=%u %s:",
- _b->level, bch2_btree_ids[_b->btree_id]);
- bch2_bpos_to_text(out, btree_node_pos(_b, cached));
-}
+ struct six_lock_count c = six_lock_counts(&b->lock);
+ struct task_struct *owner;
+ pid_t pid;
-static bool trans_has_locks(struct btree_trans *trans)
-{
- struct btree_path *path;
+ rcu_read_lock();
+ owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ rcu_read_unlock();
- trans_for_each_path(trans, path)
- if (path->nodes_locked)
- return true;
- return false;
+ prt_tab(out);
+ prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
+ b->level, bch2_btree_ids[b->btree_id]);
+ bch2_bpos_to_text(out, btree_node_pos(b));
+
+ prt_tab(out);
+ prt_printf(out, " locks %u:%u:%u held by pid %u",
+ c.n[0], c.n[1], c.n[2], pid);
}
-void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
{
- struct btree_trans *trans;
struct btree_path *path;
- struct btree *b;
+ struct btree_bkey_cached_common *b;
static char lock_types[] = { 'r', 'i', 'w' };
unsigned l;
- mutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (!trans_has_locks(trans))
- continue;
+ if (!out->nr_tabstops) {
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 32);
+ }
- pr_buf(out, "%i %s\n", trans->pid, trans->fn);
+ prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
- trans_for_each_path(trans, path) {
- if (!path->nodes_locked)
- continue;
+ trans_for_each_path(trans, path) {
+ if (!path->nodes_locked)
+ continue;
- pr_buf(out, " path %u %c l=%u %s:",
- path->idx,
- path->cached ? 'c' : 'b',
- path->level,
- bch2_btree_ids[path->btree_id]);
- bch2_bpos_to_text(out, path->pos);
- pr_buf(out, "\n");
-
- for (l = 0; l < BTREE_MAX_DEPTH; l++) {
- if (btree_node_locked(path, l)) {
- pr_buf(out, " %s l=%u ",
- btree_node_intent_locked(path, l) ? "i" : "r", l);
- bch2_btree_path_node_to_text(out,
- (void *) path->l[l].b,
- path->cached);
- pr_buf(out, "\n");
- }
+ prt_printf(out, " path %u %c l=%u %s:",
+ path->idx,
+ path->cached ? 'c' : 'b',
+ path->level,
+ bch2_btree_ids[path->btree_id]);
+ bch2_bpos_to_text(out, path->pos);
+ prt_newline(out);
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ if (btree_node_locked(path, l) &&
+ !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
+ prt_printf(out, " %c l=%u ",
+ lock_types[btree_node_locked_type(path, l)], l);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
}
}
+ }
- b = READ_ONCE(trans->locking);
- if (b) {
- path = &trans->paths[trans->locking_path_idx];
- pr_buf(out, " locking path %u %c l=%u %c %s:",
- trans->locking_path_idx,
- path->cached ? 'c' : 'b',
- trans->locking_level,
- lock_types[trans->locking_lock_type],
- bch2_btree_ids[trans->locking_btree_id]);
- bch2_bpos_to_text(out, trans->locking_pos);
-
- pr_buf(out, " node ");
- bch2_btree_path_node_to_text(out,
- (void *) b, path->cached);
- pr_buf(out, "\n");
- }
+ b = READ_ONCE(trans->locking);
+ if (b) {
+ prt_str(out, " want");
+ prt_newline(out);
+ prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
}
- mutex_unlock(&c->btree_trans_lock);
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
+ struct btree_transaction_stats *s;
+
+ for (s = c->btree_transaction_stats;
+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+ s++)
+ kfree(s->max_paths_text);
+
if (c->btree_trans_barrier_initialized)
cleanup_srcu_struct(&c->btree_trans_barrier);
mempool_exit(&c->btree_trans_mem_pool);
int bch2_fs_btree_iter_init(struct bch_fs *c)
{
- unsigned nr = BTREE_ITER_MAX;
+ unsigned i, nr = BTREE_ITER_MAX;
int ret;
+ for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
+ mutex_init(&c->btree_transaction_stats[i].lock);
+
INIT_LIST_HEAD(&c->btree_trans_list);
mutex_init(&c->btree_trans_lock);
#include "bset.h"
#include "btree_types.h"
+#include <trace/events/bcachefs.h>
+
static inline void __btree_path_get(struct btree_path *path, bool intent)
{
path->ref++;
return &trans->paths[idx];
}
-#define trans_for_each_path(_trans, _path) \
- for (_path = __trans_next_path((_trans), 0); \
+void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
+
+#define trans_for_each_path_from(_trans, _path, _start) \
+ for (_path = __trans_next_path((_trans), _start); \
(_path); \
_path = __trans_next_path((_trans), (_path)->idx + 1))
+#define trans_for_each_path(_trans, _path) \
+ trans_for_each_path_from(_trans, _path, 0)
+
static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{
unsigned idx = path ? path->sorted_idx + 1 : 0;
_path = __trans_next_path_with_node((_trans), (_b), \
(_path)->idx + 1))
-struct btree_path * __must_check
-bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
bool, unsigned long);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+ struct btree_path *path, bool intent,
+ unsigned long ip)
+{
+ if (path->ref > 1 || path->preserve)
+ path = __bch2_btree_path_make_mut(trans, path, intent, ip);
+ path->should_be_locked = false;
+ return path;
+}
+
struct btree_path * __must_check
bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
struct bpos, bool, unsigned long);
unsigned, unsigned, unsigned, unsigned long);
inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
+ struct btree_iter *, struct bpos);
+
+inline void bch2_btree_path_level_init(struct btree_trans *,
+ struct btree_path *, struct btree *);
+
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_trans_verify_locks(struct btree_trans *);
void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
struct bpos, bool);
#else
static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos, bool key_cache) {}
#endif
struct btree *, struct btree_node_iter *,
struct bkey_packed *, unsigned, unsigned);
-bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
+int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
-bool bch2_trans_relock(struct btree_trans *);
+int bch2_trans_relock(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
+bool bch2_trans_locked(struct btree_trans *);
-__always_inline
-static inline int btree_trans_restart(struct btree_trans *trans)
+static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count)
{
- trans->restarted = true;
- bch2_trans_unlock(trans);
- return -EINTR;
+ return restart_count != trans->restart_count;
}
-bool bch2_btree_node_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
-
-bool __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
+void bch2_trans_verify_not_restarted(struct btree_trans *, u32);
-static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
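+/*
+ * Mark the transaction as restarted with @err (a BCH_ERR_transaction_restart_*
+ * code) and return its negation for the caller to propagate.
+ */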
+__always_inline
+static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
{
- new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+ BUG_ON(err <= 0);
+ BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart));
- return path->locks_want < new_locks_want
- ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
- : path->uptodate == BTREE_ITER_UPTODATE;
+ trans->restarted = err;
+ return -err;
}
-void __bch2_btree_path_downgrade(struct btree_path *, unsigned);
+__always_inline
+static inline int btree_trans_restart(struct btree_trans *trans, int err)
+{
+ btree_trans_restart_nounlock(trans, err);
+ return -err;
+}
+
+bool bch2_btree_node_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
+
+void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
-static inline void bch2_btree_path_downgrade(struct btree_path *path)
+static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path)
{
unsigned new_locks_want = path->level + !!path->intent_ref;
if (path->locks_want > new_locks_want)
- __bch2_btree_path_downgrade(path, new_locks_want);
+ __bch2_btree_path_downgrade(trans, path, new_locks_want);
}
void bch2_trans_downgrade(struct btree_trans *);
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
+
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+ return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
+}
+
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
- iter->path->preserve = false;
+ if (!iter->trans->restarted)
+ iter->path->preserve = false;
}
-void *bch2_trans_kmalloc(struct btree_trans *, size_t);
-void bch2_trans_begin(struct btree_trans *);
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+
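+/*
+ * Fast path for transaction-scoped allocations: carve @size bytes out of the
+ * preallocated trans->mem buffer, falling back to __bch2_trans_kmalloc(),
+ * which reallocates the buffer and may restart the transaction.
+ */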
+static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+ unsigned new_top = trans->mem_top + size;
+ void *p = trans->mem + trans->mem_top;
+
+ if (likely(new_top <= trans->mem_bytes)) {
+ trans->mem_top += size;
+ memset(p, 0, size);
+ return p;
+ } else {
+ return __bch2_trans_kmalloc(trans, size);
+ }
+}
+
+u32 bch2_trans_begin(struct btree_trans *);
static inline struct btree *
__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter)
struct btree *b;
while (b = bch2_btree_iter_peek_node(iter),
- PTR_ERR_OR_ZERO(b) == -EINTR)
+ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
bch2_trans_begin(trans);
return b;
return PTR_ERR_OR_ZERO(k.k);
}
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+ unsigned flags)
+{
+ BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
+
+ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek_prev(iter);
+}
+
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
- unsigned flags)
+ unsigned flags)
{
- return flags & BTREE_ITER_SLOTS
- ? bch2_btree_iter_peek_slot(iter)
- : bch2_btree_iter_peek(iter);
+ return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
+ flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
+{
+ if (!(flags & BTREE_ITER_SLOTS))
+ return bch2_btree_iter_peek_upto(iter, end);
+
+ if (bkey_cmp(iter->pos, end) > 0)
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
}
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2
- ? -EINTR : 0;
+ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
+ trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
+ }
+
+ return 0;
}
static inline struct bkey_s_c
while (btree_trans_too_many_iters(trans) ||
(k = bch2_btree_iter_peek_type(iter, flags),
- bkey_err(k) == -EINTR))
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
bch2_trans_begin(trans);
return k;
}
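+/*
+ * lockrestart_do(): run @_do inside a transaction restart retry loop, calling
+ * bch2_trans_begin() before each attempt; evaluates to the final return value
+ * of @_do.
+ */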
+#define lockrestart_do(_trans, _do) \
+({ \
+ u32 _restart_count; \
+ int _ret; \
+ \
+ do { \
+ _restart_count = bch2_trans_begin(_trans); \
+ _ret = (_do); \
+ } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \
+ \
+ if (!_ret) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ \
+ _ret; \
+})
+
+/*
+ * nested_lockrestart_do(), nested_commit_do():
+ *
+ * These are like lockrestart_do() and commit_do(), with two differences:
+ *
+ * - We don't call bch2_trans_begin() unless we had a transaction restart
+ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
+ * transaction restart
+ */
+#define nested_lockrestart_do(_trans, _do) \
+({ \
+ u32 _restart_count, _orig_restart_count; \
+ int _ret; \
+ \
+ _restart_count = _orig_restart_count = (_trans)->restart_count; \
+ \
+ while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\
+ _restart_count = bch2_trans_begin(_trans); \
+ \
+ if (!_ret) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ \
+ if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \
+ _ret = -BCH_ERR_transaction_restart_nested; \
+ \
+ _ret; \
+})
+
+#define for_each_btree_key2(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+({ \
+ int _ret = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \
+ if (!(_k).k) { \
+ _ret = 0; \
+ break; \
+ } \
+ \
+ _ret = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_advance(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret; \
+})
+
+#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+({ \
+ int _ret = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
+ if (!(_k).k) { \
+ _ret = 0; \
+ break; \
+ } \
+ \
+ _ret = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_rewind(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret; \
+})
+
+#define for_each_btree_key_commit(_trans, _iter, _btree_id, \
+ _start, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
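+
+/*
+ * Usage sketch (update_key() is a hypothetical per-key operation): iterate
+ * over a btree, running _do and then committing once per key, restarting
+ * transparently on transaction restart:
+ *
+ *	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
+ *			BTREE_ITER_PREFETCH, k,
+ *			NULL, NULL, BTREE_INSERT_NOFAIL,
+ *		update_key(trans, &iter, k));
+ */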
+
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
+#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
for (; \
(_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
/* new multiple iterator interface: */
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
-void __bch2_trans_init(struct btree_trans *, struct bch_fs *,
- unsigned, size_t, const char *);
+void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned);
void bch2_trans_exit(struct btree_trans *);
-#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__)
+extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+unsigned bch2_trans_get_fn_idx(const char *);
+
+#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \
+do { \
+ static unsigned trans_fn_idx; \
+ \
+ if (unlikely(!trans_fn_idx)) \
+ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \
+ \
+ __bch2_trans_init(_trans, _c, trans_fn_idx); \
+} while (0)
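+
+/*
+ * Usage sketch (mirrors the pattern used by callers elsewhere in this patch):
+ *
+ *	struct btree_trans trans;
+ *
+ *	bch2_trans_init(&trans, c, 0, 0);
+ *	...
+ *	bch2_trans_exit(&trans);
+ */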
-void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *);
+void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
void bch2_fs_btree_iter_exit(struct bch_fs *);
int bch2_fs_btree_iter_init(struct bch_fs *);
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
+#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
+static inline bool btree_uses_pcpu_readers(enum btree_id id)
+{
+ return id == BTREE_ID_subvolumes;
+}
+
static struct kmem_cache *bch2_key_cache;
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
- list_move_tail(&ck->list, &bc->freed);
- bc->nr_freed++;
+ if (ck->c.lock.readers)
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ else
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ atomic_long_inc(&bc->nr_freed);
+
+ kfree(ck->k);
+ ck->k = NULL;
+ ck->u64s = 0;
+
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+}
+
+static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bkey_cached *pos;
+
+ list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
+ if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
+ pos->btree_trans_barrier_seq)) {
+ list_move(&ck->list, &pos->list);
+ return;
+ }
+ }
+
+ list_move(&ck->list, &bc->freed_nonpcpu);
+}
+
+static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct btree_key_cache_freelist *f;
+ bool freed = false;
+
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+ if (!ck->c.lock.readers) {
+#ifdef __KERNEL__
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ if (f->nr < ARRAY_SIZE(f->objs)) {
+ f->objs[f->nr++] = ck;
+ freed = true;
+ }
+ preempt_enable();
+
+ if (!freed) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (f->nr > ARRAY_SIZE(f->objs) / 2) {
+ struct bkey_cached *ck2 = f->objs[--f->nr];
+
+ __bkey_cached_move_to_freelist_ordered(bc, ck2);
+ }
+ preempt_enable();
+
+ __bkey_cached_move_to_freelist_ordered(bc, ck);
+ mutex_unlock(&bc->lock);
+ }
+#else
+ mutex_lock(&bc->lock);
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ mutex_unlock(&bc->lock);
+#endif
+ } else {
+ mutex_lock(&bc->lock);
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ mutex_unlock(&bc->lock);
+ }
+}
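+
+/*
+ * Design note (summarizing the code above): freed keys first go onto a small
+ * per-cpu array; when it fills up, half of it is spilled to the shared
+ * freed_nonpcpu list under the lock, inserted in order of
+ * btree_trans_barrier_seq so the shrinker can stop scanning at the first
+ * entry that is still too new to free. Keys whose lock has per-cpu readers
+ * always go on the freed_pcpu list.
+ */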
+
+static void bkey_cached_free_fast(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ ck->btree_trans_barrier_seq =
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+ list_del_init(&ck->list);
+ atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
+ bkey_cached_move_to_freelist(bc, ck);
+
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
static struct bkey_cached *
-bkey_cached_alloc(struct btree_key_cache *c)
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path)
{
- struct bkey_cached *ck;
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bkey_cached *ck = NULL;
+ struct btree_key_cache_freelist *f;
+ bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
+
+ if (!pcpu_readers) {
+#ifdef __KERNEL__
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+ if (f->nr)
+ ck = f->objs[--f->nr];
+ preempt_enable();
+
+ if (!ck) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (!list_empty(&bc->freed_nonpcpu) &&
+ f->nr < ARRAY_SIZE(f->objs) / 2) {
+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ f->objs[f->nr++] = ck;
+ }
+ ck = f->nr ? f->objs[--f->nr] : NULL;
+ preempt_enable();
+ mutex_unlock(&bc->lock);
+ }
+#else
+ mutex_lock(&bc->lock);
+ if (!list_empty(&bc->freed_nonpcpu)) {
+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ }
+ mutex_unlock(&bc->lock);
+#endif
+ } else {
+ mutex_lock(&bc->lock);
+ if (!list_empty(&bc->freed_pcpu)) {
+ ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ }
+ mutex_unlock(&bc->lock);
+ }
+
+ if (ck) {
+ int ret;
+
+ ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent);
+ if (unlikely(ret)) {
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
+
+ path->l[0].b = (void *) ck;
+ path->l[0].lock_seq = ck->c.lock.state.seq;
+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+
+ ret = bch2_btree_node_lock_write(trans, path, &ck->c);
+ if (unlikely(ret)) {
+ btree_node_unlock(trans, path, 0);
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
+
+ return ck;
+ }
+
+ /* GFP_NOFS because we're holding btree locks: */
ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
if (likely(ck)) {
INIT_LIST_HEAD(&ck->list);
- six_lock_init(&ck->c.lock);
+ __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
+ if (pcpu_readers)
+ six_lock_pcpu_alloc(&ck->c.lock);
+
+ ck->c.cached = true;
BUG_ON(!six_trylock_intent(&ck->c.lock));
BUG_ON(!six_trylock_write(&ck->c.lock));
return ck;
unsigned i;
mutex_lock(&c->lock);
- list_for_each_entry_reverse(ck, &c->freed, list)
- if (bkey_cached_lock_for_evict(ck)) {
- c->nr_freed--;
- list_del(&ck->list);
- mutex_unlock(&c->lock);
- return ck;
- }
- mutex_unlock(&c->lock);
-
rcu_read_lock();
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
for (i = 0; i < tbl->size; i++)
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(c, ck);
- rcu_read_unlock();
- return ck;
+ goto out;
}
}
+ ck = NULL;
+out:
rcu_read_unlock();
-
- return NULL;
+ mutex_unlock(&c->lock);
+ return ck;
}
static struct bkey_cached *
-btree_key_cache_create(struct bch_fs *c,
- enum btree_id btree_id,
- struct bpos pos)
+btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
+ struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck;
bool was_new = true;
- ck = bkey_cached_alloc(bc);
+ ck = bkey_cached_alloc(trans, path);
+ if (IS_ERR(ck))
+ return ck;
if (unlikely(!ck)) {
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_ids[btree_id]);
+ bch2_btree_ids[path->btree_id]);
return ERR_PTR(-ENOMEM);
}
+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
was_new = false;
+ } else {
+ if (path->btree_id == BTREE_ID_subvolumes)
+ six_lock_pcpu_alloc(&ck->c.lock);
}
- if (btree_id == BTREE_ID_subvolumes)
- six_lock_pcpu_alloc(&ck->c.lock);
- else
- six_lock_pcpu_free(&ck->c.lock);
-
ck->c.level = 0;
- ck->c.btree_id = btree_id;
- ck->key.btree_id = btree_id;
- ck->key.pos = pos;
+ ck->c.btree_id = path->btree_id;
+ ck->key.btree_id = path->btree_id;
+ ck->key.pos = path->pos;
ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
if (likely(was_new)) {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
kfree(ck);
} else {
- mutex_lock(&bc->lock);
- bkey_cached_free(bc, ck);
- mutex_unlock(&bc->lock);
+ bkey_cached_free_fast(bc, ck);
}
return NULL;
k = bch2_btree_path_peek_slot(path, &u);
if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- trace_trans_restart_relock_key_cache_fill(trans->fn,
- _THIS_IP_, ck_path->btree_id, &ck_path->pos);
- ret = btree_trans_restart(trans);
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
goto err;
}
*/
new_u64s = k.k->u64s + 1;
+ /*
+ * Allocate some extra space so that the transaction commit path is less
+ * likely to have to reallocate, since that requires a transaction
+ * restart:
+ */
+ new_u64s = min(256U, (new_u64s * 3) / 2);
+
if (new_u64s > ck->u64s) {
new_u64s = roundup_pow_of_two(new_u64s);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
}
}
- /*
- * XXX: not allowed to be holding read locks when we take a write lock,
- * currently
- */
- bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
+ ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
+ if (ret) {
+ kfree(new_k);
+ goto err;
+ }
+
if (new_k) {
kfree(ck->k);
ck->u64s = new_u64s;
return ret;
}
-static int bkey_cached_check_fn(struct six_lock *lock, void *p)
-{
- struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
- const struct btree_path *path = p;
-
- return ck->key.btree_id == path->btree_id &&
- !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1;
-}
-
-__flatten
-int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
- unsigned flags)
+static noinline int
+bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
retry:
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
if (!ck) {
- if (flags & BTREE_ITER_CACHED_NOCREATE) {
- path->l[0].b = NULL;
- return 0;
- }
-
- ck = btree_key_cache_create(c, path->btree_id, path->pos);
+ ck = btree_key_cache_create(trans, path);
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
if (!ck)
goto retry;
- mark_btree_node_locked(path, 0, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
path->locks_want = 1;
} else {
enum six_lock_type lock_want = __btree_lock_want(path, 0);
- if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0,
- lock_want,
- bkey_cached_check_fn, path, _THIS_IP_)) {
- if (!trans->restarted)
- goto retry;
-
- ret = -EINTR;
+ ret = btree_node_lock(trans, path, (void *) ck, 0,
+ lock_want, _THIS_IP_);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto err;
- }
+
+ BUG_ON(ret);
if (ck->key.btree_id != path->btree_id ||
bpos_cmp(ck->key.pos, path->pos)) {
goto retry;
}
- mark_btree_node_locked(path, 0, lock_want);
+ mark_btree_node_locked(trans, path, 0, lock_want);
}
path->l[0].lock_seq = ck->c.lock.state.seq;
path->l[0].b = (void *) ck;
fill:
- if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+ if (!ck->valid) {
+ /*
+ * Using the underscore version because we haven't set
+ * path->uptodate yet:
+ */
if (!path->locks_want &&
!__bch2_btree_path_upgrade(trans, path, 1)) {
- trace_transaction_restart_ip(trans->fn, _THIS_IP_);
- ret = btree_trans_restart(trans);
+ trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
goto err;
}
set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
path->uptodate = BTREE_ITER_UPTODATE;
+ BUG_ON(!ck->valid);
BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
return ret;
err:
- if (ret != -EINTR) {
- btree_node_unlock(path, 0);
- path->l[0].b = BTREE_ITER_NO_NODE_ERROR;
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(ret);
}
return ret;
}
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck;
+ int ret = 0;
+
+ EBUG_ON(path->level);
+
+ path->l[1].b = NULL;
+
+ if (bch2_btree_node_relock(trans, path, 0)) {
+ ck = (void *) path->l[0].b;
+ goto fill;
+ }
+retry:
+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+ if (!ck) {
+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+ } else {
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+ ret = btree_node_lock(trans, path, (void *) ck, 0,
+ lock_want, _THIS_IP_);
+ EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ if (ret)
+ return ret;
+
+ if (ck->key.btree_id != path->btree_id ||
+ bpos_cmp(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
+ }
+
+ mark_btree_node_locked(trans, path, 0, lock_want);
+ }
+
+ path->l[0].lock_seq = ck->c.lock.state.seq;
+ path->l[0].b = (void *) ck;
+fill:
+ if (!ck->valid)
+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+ path->uptodate = BTREE_ITER_UPTODATE;
+ EBUG_ON(!ck->valid);
+ EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+
+ return ret;
+}
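+
+/*
+ * Summary of the split above: the common case - the key is already present in
+ * the cache, can be (re)locked, and is already valid - is handled here;
+ * anything else (no cached key yet, or a key that still needs to be filled
+ * from the btree) falls back to bch2_btree_path_traverse_cached_slowpath(),
+ * which can allocate and fill new cache entries.
+ */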
+
static int btree_key_cache_flush_pos(struct btree_trans *trans,
struct bkey_cached_key key,
u64 journal_seq,
BTREE_ITER_ALL_SNAPSHOTS);
bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_CACHED_NOCREATE|
BTREE_ITER_INTENT);
b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
* Since journal reclaim depends on us making progress here, and the
* allocator/copygc depend on journal reclaim making progress, we need
* to be using alloc reserves:
- * */
+ */
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
BTREE_UPDATE_KEY_CACHE_RECLAIM|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
(ck->journal.seq == journal_last_seq(j)
- ? BTREE_INSERT_JOURNAL_RESERVED
+ ? JOURNAL_WATERMARK_reserved
: 0)|
commit_flags);
- if (ret) {
- bch2_fs_fatal_err_on(ret != -EINTR &&
- ret != -EAGAIN &&
- !bch2_journal_error(j), c,
- "error flushing key cache: %i", ret);
+
+ bch2_fs_fatal_err_on(ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
+ !bch2_journal_error(j), c,
+ "error flushing key cache: %s", bch2_err_str(ret));
+ if (ret)
goto out;
- }
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
} else {
+ struct btree_path *path2;
evict:
- BUG_ON(!btree_node_intent_locked(c_iter.path, 0));
+ trans_for_each_path(trans, path2)
+ if (path2 != c_iter.path)
+ __bch2_btree_path_unlock(trans, path2);
- mark_btree_node_unlocked(c_iter.path, 0);
- c_iter.path->l[0].b = NULL;
-
- six_lock_write(&ck->c.lock, NULL, NULL);
+ bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
+ mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
-
- mutex_lock(&c->btree_key_cache.lock);
- bkey_cached_free(&c->btree_key_cache, ck);
- mutex_unlock(&c->btree_key_cache.lock);
+ bkey_cached_free_fast(&c->btree_key_cache, ck);
}
out:
bch2_trans_iter_exit(trans, &b_iter);
struct bkey_cached *ck =
container_of(pin, struct bkey_cached, journal);
struct bkey_cached_key key;
+ struct btree_trans trans;
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
int ret = 0;
- int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ bch2_trans_init(&trans, c, 0, 0);
- six_lock_read(&ck->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
key = ck->key;
if (ck->journal.seq != seq ||
}
six_unlock_read(&ck->c.lock);
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
btree_key_cache_flush_pos(&trans, key, seq,
BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ bch2_trans_exit(&trans);
return ret;
}
return true;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
- enum btree_id id, struct bpos pos)
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+ struct btree_path *path)
{
- BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ BUG_ON(!ck->valid);
+
+ /*
+ * We just did an update to the btree, bypassing the key cache: the key
+ * cache key is now stale and must be dropped, even if dirty:
+ */
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ bch2_journal_pin_drop(&c->journal, &ck->journal);
+ }
+
+ ck->valid = false;
}
-#endif
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
unsigned start, flags;
int srcu_idx;
- /* Return -1 if we can't do anything right now */
- if (sc->gfp_mask & __GFP_FS)
- mutex_lock(&bc->lock);
- else if (!mutex_trylock(&bc->lock))
- return -1;
-
+ mutex_lock(&bc->lock);
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
flags = memalloc_nofs_save();
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
- list_for_each_entry_safe(ck, t, &bc->freed, list) {
+ list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
+ six_lock_pcpu_free(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
- bc->nr_freed--;
+ atomic_long_dec(&bc->nr_freed);
+ scanned++;
+ freed++;
+ }
+
+ if (scanned >= nr)
+ goto out;
+
+ list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
+
+ list_del(&ck->list);
+ six_lock_pcpu_free(&ck->c.lock);
+ kmem_cache_free(bch2_key_cache, ck);
+ atomic_long_dec(&bc->nr_freed);
scanned++;
freed++;
}
struct bkey_cached *ck, *n;
struct rhash_head *pos;
unsigned i;
+#ifdef __KERNEL__
+ int cpu;
+#endif
if (bc->shrink.list.next)
unregister_shrinker(&bc->shrink);
mutex_lock(&bc->lock);
- rcu_read_lock();
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (tbl)
- for (i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- bkey_cached_evict(bc, ck);
- list_add(&ck->list, &bc->freed);
- }
- rcu_read_unlock();
+ /*
+ * The loop is needed to guard against racing with rehash:
+ */
+ while (atomic_long_read(&bc->nr_keys)) {
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (tbl)
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ bkey_cached_evict(bc, ck);
+ list_add(&ck->list, &bc->freed_nonpcpu);
+ }
+ rcu_read_unlock();
+ }
+
+#ifdef __KERNEL__
+ for_each_possible_cpu(cpu) {
+ struct btree_key_cache_freelist *f =
+ per_cpu_ptr(bc->pcpu_freed, cpu);
- list_for_each_entry_safe(ck, n, &bc->freed, list) {
+ for (i = 0; i < f->nr; i++) {
+ ck = f->objs[i];
+ list_add(&ck->list, &bc->freed_nonpcpu);
+ }
+ }
+#endif
+
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) {
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
list_del(&ck->list);
kfree(ck->k);
+ six_lock_pcpu_free(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
}
- BUG_ON(atomic_long_read(&bc->nr_dirty) &&
- !bch2_journal_error(&c->journal) &&
- test_bit(BCH_FS_WAS_RW, &c->flags));
- BUG_ON(atomic_long_read(&bc->nr_keys));
+ if (atomic_long_read(&bc->nr_dirty) &&
+ !bch2_journal_error(&c->journal) &&
+ test_bit(BCH_FS_WAS_RW, &c->flags))
+ panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
+ atomic_long_read(&bc->nr_dirty));
+
+ if (atomic_long_read(&bc->nr_keys))
+ panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
+ atomic_long_read(&bc->nr_keys));
mutex_unlock(&bc->lock);
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
+
+ free_percpu(bc->pcpu_freed);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
- INIT_LIST_HEAD(&c->freed);
+ INIT_LIST_HEAD(&c->freed_pcpu);
+ INIT_LIST_HEAD(&c->freed_nonpcpu);
}
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
+static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
{
+ struct btree_key_cache *bc =
+ container_of(shrink, struct btree_key_cache, shrink);
+
+ bch2_btree_key_cache_to_text(out, bc);
+}
+
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
int ret;
- ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+#ifdef __KERNEL__
+ bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
+ if (!bc->pcpu_freed)
+ return -ENOMEM;
+#endif
+
+ ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params);
if (ret)
return ret;
- c->table_init_done = true;
+ bc->table_init_done = true;
- c->shrink.seeks = 1;
- c->shrink.count_objects = bch2_btree_key_cache_count;
- c->shrink.scan_objects = bch2_btree_key_cache_scan;
- return register_shrinker(&c->shrink);
+ bc->shrink.seeks = 0;
+ bc->shrink.count_objects = bch2_btree_key_cache_count;
+ bc->shrink.scan_objects = bch2_btree_key_cache_scan;
+ bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text;
+ return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name);
}
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
- pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
- pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys));
- pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty));
+	prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
+ prt_newline(out);
+ prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
+ prt_newline(out);
+ prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
+ prt_newline(out);
}
void bch2_btree_key_cache_exit(void)
{
- if (bch2_key_cache)
- kmem_cache_destroy(bch2_key_cache);
+ kmem_cache_destroy(bch2_key_cache);
}
int __init bch2_btree_key_cache_init(void)
struct btree_path *, struct bkey_i *);
int bch2_btree_key_cache_flush(struct btree_trans *,
enum btree_id, struct bpos);
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *,
- enum btree_id, struct bpos);
-#else
-static inline void
-bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
- enum btree_id id, struct bpos pos) {}
-#endif
+void bch2_btree_key_cache_drop(struct btree_trans *,
+ struct btree_path *);
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_types.h"
+
+struct lock_class_key bch2_btree_node_lock_key;
+
+/* Btree node locking: */
+
+static inline void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+ if (lock->readers)
+ this_cpu_add(*lock->readers, nr);
+ else if (nr > 0)
+ atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
+ else
+ atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter);
+}
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
+ struct btree_path *skip,
+ struct btree_bkey_cached_common *b,
+ unsigned level)
+{
+ struct btree_path *path;
+ struct six_lock_count ret;
+
+ memset(&ret, 0, sizeof(ret));
+
+ if (IS_ERR_OR_NULL(b))
+ return ret;
+
+ trans_for_each_path(trans, path)
+ if (path != skip && &path->l[level].b->c == b) {
+ int t = btree_node_locked_type(path, level);
+
+ if (t != BTREE_NODE_UNLOCKED)
+ ret.n[t]++;
+ }
+
+ return ret;
+}
+
+/* unlock */
+
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
+{
+ bch2_btree_node_unlock_write_inlined(trans, path, b);
+}
+
+/* lock */
+
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+ struct btree_trans *trans;
+ struct btree_bkey_cached_common *node_want;
+ enum six_lock_type lock_want;
+
+	/* for iterating over held locks: */
+ u8 path_idx;
+ u8 level;
+ u64 lock_start_time;
+};
+
+struct lock_graph {
+ struct trans_waiting_for_lock g[8];
+ unsigned nr;
+};
+
+static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ prt_printf(out, "Found lock cycle (%u entries):", g->nr);
+ prt_newline(out);
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ bch2_btree_trans_to_text(out, i->trans);
+}
+
+static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i != g->g + g->nr; i++) {
+ if (i != g->g)
+ prt_str(out, "<- ");
+ prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+ }
+ prt_newline(out);
+}
+
+static void lock_graph_up(struct lock_graph *g)
+{
+ closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+ closure_get(&trans->ref);
+
+ g->g[g->nr++] = (struct trans_waiting_for_lock) {
+ .trans = trans,
+ .node_want = trans->locking,
+ .lock_want = trans->locking_wait.lock_want,
+ };
+}
+
+static bool lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g + 1; i < g->g + g->nr; i++)
+ if (i->trans->locking != i->node_want ||
+ i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+ while (g->g + g->nr > i)
+ lock_graph_up(g);
+ return true;
+ }
+
+ return false;
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+ if (i == g->g) {
+ trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
+ return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ } else {
+ i->trans->lock_must_abort = true;
+ wake_up_process(i->trans->locking_wait.task);
+ return 0;
+ }
+}
+
+static int btree_trans_abort_preference(struct btree_trans *trans)
+{
+ if (trans->lock_may_not_fail)
+ return 0;
+ if (trans->locking_wait.lock_want == SIX_LOCK_write)
+ return 1;
+ if (!trans->in_traverse_all)
+ return 2;
+ return 3;
+}
+
+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
+{
+ struct trans_waiting_for_lock *i, *abort = NULL;
+ unsigned best = 0, pref;
+ int ret;
+
+ if (lock_graph_remove_non_waiters(g))
+ return 0;
+
+ /* Only checking, for debugfs: */
+ if (cycle) {
+ print_cycle(cycle, g);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ pref = btree_trans_abort_preference(i->trans);
+ if (pref > best) {
+ abort = i;
+ best = pref;
+ }
+ }
+
+ if (unlikely(!best)) {
+ struct bch_fs *c = g->g->trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch_err(c, "cycle of nofail locks");
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ struct btree_trans *trans = i->trans;
+
+ bch2_btree_trans_to_text(&buf, trans);
+
+ prt_printf(&buf, "backtrace:");
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+ bch2_prt_backtrace(&buf, trans->locking_wait.task);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ BUG();
+ }
+
+ ret = abort_lock(g, abort);
+out:
+ if (ret)
+ while (g->nr)
+ lock_graph_up(g);
+ return ret;
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
+ struct printbuf *cycle)
+{
+ struct btree_trans *orig_trans = g->g->trans;
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ if (i->trans == trans)
+ return break_cycle(g, cycle);
+
+ if (g->nr == ARRAY_SIZE(g->g)) {
+ if (orig_trans->lock_may_not_fail)
+ return 0;
+
+ while (g->nr)
+ lock_graph_up(g);
+ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+ return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+ }
+
+ lock_graph_down(g, trans);
+ return 0;
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+ return t1 + t2 > 1;
+}
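+
+/*
+ * Worked example for the check above (read = 0, intent = 1, write = 2):
+ * read+read sums to 0 and read+intent to 1, so neither pair conflicts; any
+ * pair involving a write lock, or two intent locks, sums to more than 1 and
+ * does conflict.
+ */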
+
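+/*
+ * Overview (summarizing the logic below): starting from @trans, walk the
+ * graph of lock waiters - for every lock a transaction in the graph holds,
+ * visit the transactions sitting on that lock's wait list with a conflicting
+ * lock type. Reaching a transaction that is already in the graph means we
+ * have found a deadlock cycle, and break_cycle() either reports it (when
+ * called from debugfs with a printbuf) or picks a victim to abort/restart.
+ */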
+int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
+{
+ struct lock_graph g;
+ struct trans_waiting_for_lock *top;
+ struct btree_bkey_cached_common *b;
+ struct btree_path *path;
+ int ret;
+
+ if (trans->lock_must_abort) {
+ trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+ }
+
+ g.nr = 0;
+ lock_graph_down(&g, trans);
+next:
+ if (!g.nr)
+ return 0;
+
+ top = &g.g[g.nr - 1];
+
+ trans_for_each_path_from(top->trans, path, top->path_idx) {
+ if (!path->nodes_locked)
+ continue;
+
+ if (top->path_idx != path->idx) {
+ top->path_idx = path->idx;
+ top->level = 0;
+ top->lock_start_time = 0;
+ }
+
+ for (;
+ top->level < BTREE_MAX_DEPTH;
+ top->level++, top->lock_start_time = 0) {
+ int lock_held = btree_node_locked_type(path, top->level);
+
+ if (lock_held == BTREE_NODE_UNLOCKED)
+ continue;
+
+ b = &READ_ONCE(path->l[top->level].b)->c;
+
+ if (IS_ERR_OR_NULL(b)) {
+ BUG_ON(!lock_graph_remove_non_waiters(&g));
+ goto next;
+ }
+
+ if (list_empty_careful(&b->lock.wait_list))
+ continue;
+
+ raw_spin_lock(&b->lock.wait_lock);
+ list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+ BUG_ON(b != trans->locking);
+
+ if (top->lock_start_time &&
+ time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+ continue;
+
+ top->lock_start_time = trans->locking_wait.start_time;
+
+ /* Don't check for self deadlock: */
+ if (trans == top->trans ||
+ !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+ continue;
+
+ ret = lock_graph_descend(&g, trans, cycle);
+ raw_spin_unlock(&b->lock.wait_lock);
+
+ if (ret)
+ return ret;
+			goto next;
+		}
+ raw_spin_unlock(&b->lock.wait_lock);
+ }
+ }
+
+ if (g.nr > 1 && cycle)
+ print_chain(cycle, &g);
+ lock_graph_up(&g);
+ goto next;
+}
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+ struct btree_trans *trans = p;
+
+ return bch2_check_for_deadlock(trans, NULL);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
+{
+ int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+ int ret;
+
+ /*
+ * Must drop our read locks before calling six_lock_write() -
+ * six_unlock() won't do wakeups until the reader count
+ * goes to 0, and it's safe because we have the node intent
+ * locked:
+ */
+ six_lock_readers_add(&b->lock, -readers);
+ ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
+ six_lock_readers_add(&b->lock, readers);
+
+ if (ret)
+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
+
+ return ret;
+}
+
+/* relock */
+
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade)
+{
+ unsigned l = path->level;
+ int fail_idx = -1;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!(upgrade
+ ? bch2_btree_node_upgrade(trans, path, l)
+ : bch2_btree_node_relock(trans, path, l)))
+ fail_idx = l;
+
+ l++;
+ } while (l < path->locks_want);
+
+ /*
+ * When we fail to get a lock, we have to ensure that any child nodes
+ * can't be relocked so bch2_btree_path_traverse has to walk back up to
+ * the node that we failed to relock:
+ */
+ if (fail_idx >= 0) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+ do {
+ path->l[fail_idx].b = upgrade
+ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+ : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ --fail_idx;
+ } while (fail_idx >= 0);
+ }
+
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ bch2_trans_verify_locks(trans);
+
+ return path->uptodate < BTREE_ITER_NEED_RELOCK;
+}
+
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level,
+ bool trace)
+{
+ struct btree *b = btree_path_node(path, level);
+ int want = __btree_lock_want(path, level);
+
+ if (race_fault())
+ goto fail;
+
+ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+ (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, want))) {
+ mark_btree_node_locked(trans, path, level, want);
+ return true;
+ }
+fail:
+ if (trace)
+ trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
+ return false;
+}
+
+/* upgrade */
+
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ struct btree *b = path->l[level].b;
+ struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
+
+ if (!is_btree_node(path, level))
+ return false;
+
+ switch (btree_lock_want(path, level)) {
+ case BTREE_NODE_UNLOCKED:
+ BUG_ON(btree_node_locked(path, level));
+ return true;
+ case BTREE_NODE_READ_LOCKED:
+ BUG_ON(btree_node_intent_locked(path, level));
+ return bch2_btree_node_relock(trans, path, level);
+ case BTREE_NODE_INTENT_LOCKED:
+ break;
+ case BTREE_NODE_WRITE_LOCKED:
+ BUG();
+ }
+
+ if (btree_node_intent_locked(path, level))
+ return true;
+
+ if (race_fault())
+ return false;
+
+ if (btree_node_locked(path, level)) {
+ bool ret;
+
+ six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
+ ret = six_lock_tryupgrade(&b->c.lock);
+ six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
+
+ if (ret)
+ goto success;
+ } else {
+ if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
+ goto success;
+ }
+
+ /*
+ * Do we already have an intent lock via another path? If so, just bump
+ * lock count:
+ */
+ if (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
+ btree_node_unlock(trans, path, level);
+ goto success;
+ }
+
+ trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
+ return false;
+success:
+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
+ return true;
+}
+
+/* Btree path locking: */
+
+/*
+ * Only for btree_cache.c - only relocks intent locks
+ */
+int bch2_btree_path_relock_intent(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned l;
+
+ for (l = path->level;
+ l < path->locks_want && btree_path_node(path, l);
+ l++) {
+ if (!bch2_btree_node_relock(trans, path, l)) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
+ }
+ }
+
+ return 0;
+}
+
+__flatten
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_path_get_locks(trans, path, false);
+}
+
+__flatten
+bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_path_get_locks(trans, path, true);
+}
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ EBUG_ON(path->locks_want >= new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ return btree_path_get_locks(trans, path, true);
+}
+
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ struct btree_path *linked;
+
+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+ return true;
+
+ /*
+ * XXX: this is ugly - we'd prefer to not be mucking with other
+ * iterators in the btree_trans here.
+ *
+ * On failure to upgrade the iterator, setting iter->locks_want and
+ * calling get_locks() is sufficient to make bch2_btree_path_traverse()
+ * get the locks we want on transaction restart.
+ *
+ * But if this iterator was a clone, on transaction restart what we did
+ * to this iterator isn't going to be preserved.
+ *
+ * Possibly we could add an iterator field for the parent iterator when
+ * an iterator is a copy - for now, we'll just upgrade any other
+ * iterators with the same btree id.
+ *
+ * The code below used to be needed to ensure ancestor nodes get locked
+ * before interior nodes - now that's handled by
+ * bch2_btree_path_traverse_all().
+ */
+ if (!path->cached && !trans->in_traverse_all)
+ trans_for_each_path(trans, linked)
+ if (linked != path &&
+ linked->cached == path->cached &&
+ linked->btree_id == path->btree_id &&
+ linked->locks_want < new_locks_want) {
+ linked->locks_want = new_locks_want;
+ btree_path_get_locks(trans, linked, true);
+ }
+
+ return false;
+}
+
+void __bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ unsigned l;
+
+ EBUG_ON(path->locks_want < new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ while (path->nodes_locked &&
+ (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
+ if (l > path->level) {
+ btree_node_unlock(trans, path, l);
+ } else {
+ if (btree_node_intent_locked(path, l)) {
+ six_lock_downgrade(&path->l[l].b->c.lock);
+ mark_btree_node_locked_noreset(path, l, SIX_LOCK_read);
+ }
+ break;
+ }
+ }
+
+ bch2_btree_path_verify_locks(path);
+}
+
+/* Btree transaction locking: */
+
+void bch2_trans_downgrade(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_downgrade(trans, path);
+}
+
+int bch2_trans_relock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (unlikely(trans->restarted))
+ return -((int) trans->restarted);
+
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ }
+ return 0;
+}
+
+void bch2_trans_unlock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ __bch2_btree_path_unlock(trans, path);
+
+ /*
+ * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
+ * btree nodes, it implements its own walking:
+ */
+ EBUG_ON(!trans->is_initial_gc &&
+ lock_class_is_held(&bch2_btree_node_lock_key));
+}
+
+bool bch2_trans_locked(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->nodes_locked)
+ return true;
+ return false;
+}
+
+/* Debug */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void bch2_btree_path_verify_locks(struct btree_path *path)
+{
+ unsigned l;
+
+ if (!path->nodes_locked) {
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level));
+ return;
+ }
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ int want = btree_lock_want(path, l);
+ int have = btree_node_locked_type(path, l);
+
+ BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
+
+ BUG_ON(is_btree_node(path, l) &&
+ (want == BTREE_NODE_UNLOCKED ||
+ have != BTREE_NODE_WRITE_LOCKED) &&
+ want != have);
+ }
+}
+
+void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify_locks(path);
+}
+
+#endif
#include "btree_iter.h"
+extern struct lock_class_key bch2_btree_node_lock_key;
+
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
+{
+ return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
+}
+
+static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
+{
+ return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
+ ? &trans->c->btree_transaction_stats[trans->fn_idx]
+ : NULL;
+}
+
/* matches six lock types */
enum btree_node_locked_type {
BTREE_NODE_UNLOCKED = -1,
BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
+ BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
};
static inline int btree_node_locked_type(struct btree_path *path,
unsigned level)
{
- /*
- * We're relying on the fact that if nodes_intent_locked is set
- * nodes_locked must be set as well, so that we can compute without
- * branches:
- */
- return BTREE_NODE_UNLOCKED +
- ((path->nodes_locked >> level) & 1) +
- ((path->nodes_intent_locked >> level) & 1);
+ return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
}
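+
+/*
+ * Example of the encoding used above: nodes_locked stores two bits per level,
+ * written by mark_btree_node_locked_noreset() as (lock type + 1), so the
+ * per-level values are 0 = unlocked, 1 = read, 2 = intent, 3 = write; adding
+ * BTREE_NODE_UNLOCKED (-1) maps them back onto the enum. E.g. a path holding
+ * an intent lock at level 0 and a read lock at level 1 has nodes_locked ==
+ * 0x6 (0b0110).
+ */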
-static inline bool btree_node_intent_locked(struct btree_path *path,
- unsigned level)
+static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
{
- return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED;
+ return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
}
-static inline bool btree_node_read_locked(struct btree_path *path,
- unsigned level)
+static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
{
- return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED;
+ return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
}
-static inline bool btree_node_locked(struct btree_path *path, unsigned level)
+static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
{
- return path->nodes_locked & (1 << level);
+ return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
}
-static inline void mark_btree_node_unlocked(struct btree_path *path,
- unsigned level)
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
{
- path->nodes_locked &= ~(1 << level);
- path->nodes_intent_locked &= ~(1 << level);
+ return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
}
-static inline void mark_btree_node_locked(struct btree_path *path,
- unsigned level,
- enum six_lock_type type)
+static inline void mark_btree_node_locked_noreset(struct btree_path *path,
+ unsigned level,
+ enum btree_node_locked_type type)
{
/* relying on this to avoid a branch */
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
- path->nodes_locked |= 1 << level;
- path->nodes_intent_locked |= type << level;
+ path->nodes_locked &= ~(3U << (level << 1));
+ path->nodes_locked |= (type + 1) << (level << 1);
}
-static inline void mark_btree_node_intent_locked(struct btree_path *path,
- unsigned level)
+static inline void mark_btree_node_unlocked(struct btree_path *path,
+ unsigned level)
+{
+ EBUG_ON(btree_node_write_locked(path, level));
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
+}
+
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ enum six_lock_type type)
{
- mark_btree_node_locked(path, level, SIX_LOCK_intent);
+ mark_btree_node_locked_noreset(path, level, type);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[level].lock_taken_time = local_clock();
+#endif
}
static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
return BTREE_NODE_UNLOCKED;
}
-static inline void btree_node_unlock(struct btree_path *path, unsigned level)
+static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ if (s)
+ __bch2_time_stats_update(&s->lock_hold_times,
+ path->l[level].lock_taken_time,
+ local_clock());
+#endif
+}
+
+/* unlock: */
+
+static inline void btree_node_unlock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
int lock_type = btree_node_locked_type(path, level);
EBUG_ON(level >= BTREE_MAX_DEPTH);
- if (lock_type != BTREE_NODE_UNLOCKED)
+ if (lock_type != BTREE_NODE_UNLOCKED) {
six_unlock_type(&path->l[level].b->c.lock, lock_type);
+ btree_trans_lock_hold_time_update(trans, path, level);
+ }
mark_btree_node_unlocked(path, level);
}
-static inline void __bch2_btree_path_unlock(struct btree_path *path)
+static inline int btree_path_lowest_level_locked(struct btree_path *path)
+{
+ return __ffs(path->nodes_locked) >> 1;
+}
+
+static inline int btree_path_highest_level_locked(struct btree_path *path)
+{
+ return __fls(path->nodes_locked) >> 1;
+}
+
+static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
+ struct btree_path *path)
{
btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
while (path->nodes_locked)
- btree_node_unlock(path, __ffs(path->nodes_locked));
+ btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
}
-static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
+/*
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
+ */
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+ struct btree *b)
{
- switch (type) {
- case SIX_LOCK_read:
- return BCH_TIME_btree_lock_contended_read;
- case SIX_LOCK_intent:
- return BCH_TIME_btree_lock_contended_intent;
- case SIX_LOCK_write:
- return BCH_TIME_btree_lock_contended_write;
- default:
- BUG();
- }
+ struct btree_path *linked;
+
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+ EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
+
+ mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
+
+ trans_for_each_path_with_node(trans, b, linked)
+ linked->l[b->c.level].lock_seq += 2;
+
+ six_unlock_write(&b->c.lock);
}
-static inline bool btree_node_lock_type(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct bpos pos, unsigned level,
- enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- struct bch_fs *c = trans->c;
- u64 start_time;
- bool ret;
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
- if (six_trylock_type(&b->c.lock, type))
- return true;
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
- start_time = local_clock();
+/* lock: */
- trans->locking_path_idx = path->idx;
- trans->locking_pos = pos;
- trans->locking_btree_id = path->btree_id;
- trans->locking_level = level;
- trans->locking_lock_type = type;
- trans->locking = b;
- ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
- trans->locking = NULL;
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ bool lock_may_not_fail)
+{
+ int ret;
- if (ret)
- bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+ trans->lock_may_not_fail = lock_may_not_fail;
+ trans->lock_must_abort = false;
+ trans->locking = b;
+ ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans);
+ WRITE_ONCE(trans->locking, NULL);
+ WRITE_ONCE(trans->locking_wait.start_time, 0);
return ret;
}
+static inline int __must_check
+btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type)
+{
+ return __btree_node_lock_nopath(trans, b, type, false);
+}
+
+static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type)
+{
+ int ret = __btree_node_lock_nopath(trans, b, type, true);
+
+ BUG_ON(ret);
+}
+
/*
* Lock a btree node if we already have it locked on one of our linked
* iterators:
*/
static inline bool btree_node_lock_increment(struct btree_trans *trans,
- struct btree *b, unsigned level,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
enum btree_node_locked_type want)
{
struct btree_path *path;
trans_for_each_path(trans, path)
- if (path->l[level].b == b &&
+ if (&path->l[level].b->c == b &&
btree_node_locked_type(path, level) >= want) {
- six_lock_increment(&b->c.lock, want);
+ six_lock_increment(&b->lock, want);
return true;
}
return false;
}
-bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
- struct btree *, struct bpos, unsigned,
- enum six_lock_type,
- six_lock_should_sleep_fn, void *,
- unsigned long);
-
-static inline bool btree_node_lock(struct btree_trans *trans,
+static inline int btree_node_lock(struct btree_trans *trans,
struct btree_path *path,
- struct btree *b, struct bpos pos, unsigned level,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
+ int ret = 0;
+
EBUG_ON(level >= BTREE_MAX_DEPTH);
EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
- return likely(six_trylock_type(&b->c.lock, type)) ||
- btree_node_lock_increment(trans, b, level, type) ||
- __bch2_btree_node_lock(trans, path, b, pos, level, type,
- should_sleep_fn, p, ip);
+ if (likely(six_trylock_type(&b->lock, type)) ||
+ btree_node_lock_increment(trans, b, level, type) ||
+ !(ret = btree_node_lock_nopath(trans, b, type))) {
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[b->level].lock_taken_time = local_clock();
+#endif
+ }
+
+ return ret;
}
-bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
+ struct btree_bkey_cached_common *b, bool);
+
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
+{
+ EBUG_ON(&path->l[b->level].b->c != b);
+ EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
+ EBUG_ON(!btree_node_intent_locked(path, b->level));
+
+ /*
+ * six locks are unfair, and read locks block while a thread wants a
+ * write lock: thus, we need to tell the cycle detector we have a write
+ * lock _before_ taking the lock:
+ */
+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
+
+ return likely(six_trylock_write(&b->lock))
+ ? 0
+ : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
+}
+
+static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ int ret = __btree_node_lock_write(trans, path, b, true);
+ BUG_ON(ret);
+}
+
+static inline int __must_check
+bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ return __btree_node_lock_write(trans, path, b, false);
+}
+
+/* relock: */
+
+bool bch2_btree_path_relock_norestart(struct btree_trans *,
+ struct btree_path *, unsigned long);
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
static inline bool bch2_btree_node_relock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
EBUG_ON(btree_node_locked(path, level) &&
- btree_node_locked_type(path, level) !=
- __btree_lock_want(path, level));
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
return likely(btree_node_locked(path, level)) ||
- __bch2_btree_node_relock(trans, path, level);
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, true));
}
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-static inline void
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
- struct btree *b)
+static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- struct btree_path *linked;
+ EBUG_ON(btree_node_locked(path, level) &&
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
- EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+ return likely(btree_node_locked(path, level)) ||
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, false));
+}
- trans_for_each_path_with_node(trans, b, linked)
- linked->l[b->c.level].lock_seq += 2;
+static inline int bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
+ }
- six_unlock_write(&b->c.lock);
+ return 0;
}
-void bch2_btree_node_unlock_write(struct btree_trans *,
- struct btree_path *, struct btree *);
+/* upgrade */
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *);
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
+ struct btree_path *, unsigned);
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
-static inline void bch2_btree_node_lock_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
{
- EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq);
- EBUG_ON(!btree_node_intent_locked(path, b->c.level));
+ unsigned old_locks_want = path->locks_want;
+
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+ if (path->locks_want < new_locks_want
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+ : path->uptodate == BTREE_ITER_UPTODATE)
+ return 0;
- if (unlikely(!six_trylock_write(&b->c.lock)))
- __bch2_btree_node_lock_write(trans, b);
+ trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
+ old_locks_want, new_locks_want);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
}
-#endif /* _BCACHEFS_BTREE_LOCKING_H */
+/* misc: */
+
+static inline void btree_path_set_should_be_locked(struct btree_path *path)
+{
+ EBUG_ON(!btree_node_locked(path, path->level));
+ EBUG_ON(path->uptodate);
+
+ path->should_be_locked = true;
+}
+static inline void __btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned l)
+{
+ btree_node_unlock(trans, path, l);
+ path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+}
+static inline void btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ __btree_path_set_level_up(trans, path, path->level++);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+}
+
+/* debug */
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *b,
+ unsigned);
+
+int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_path_verify_locks(struct btree_path *);
+void bch2_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+#endif
+
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
#include <linux/rhashtable.h>
#include <linux/six.h>
-#include "bkey_methods.h"
+//#include "bkey_methods.h"
#include "buckets_types.h"
+#include "darray.h"
#include "journal_types.h"
struct open_bucket;
struct six_lock lock;
u8 level;
u8 btree_id;
+ bool cached;
};
struct btree {
struct mutex lock;
struct list_head live;
struct list_head freeable;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
+ unsigned freed;
+ unsigned not_freed_lock_intent;
+ unsigned not_freed_lock_write;
+ unsigned not_freed_dirty;
+ unsigned not_freed_read_in_flight;
+ unsigned not_freed_write_in_flight;
+ unsigned not_freed_noevict;
+ unsigned not_freed_write_blocked;
+ unsigned not_freed_will_make_reachable;
+ unsigned not_freed_access_bit;
atomic_t dirty;
struct shrinker shrink;
* Iterate over all possible positions, synthesizing deleted keys for holes:
*/
#define BTREE_ITER_SLOTS (1 << 0)
+#define BTREE_ITER_ALL_LEVELS (1 << 1)
/*
* Indicates that intent locks should be taken on leaf nodes, because we expect
* to be doing updates:
*/
-#define BTREE_ITER_INTENT (1 << 1)
+#define BTREE_ITER_INTENT (1 << 2)
/*
* Causes the btree iterator code to prefetch additional btree nodes from disk:
*/
-#define BTREE_ITER_PREFETCH (1 << 2)
-/*
- * Indicates that this iterator should not be reused until transaction commit,
- * either because a pending update references it or because the update depends
- * on that particular key being locked (e.g. by the str_hash code, for hash
- * table consistency)
- */
-#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3)
+#define BTREE_ITER_PREFETCH (1 << 3)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
#define BTREE_ITER_IS_EXTENTS (1 << 4)
#define BTREE_ITER_NOT_EXTENTS (1 << 5)
#define BTREE_ITER_CACHED (1 << 6)
-#define BTREE_ITER_CACHED_NOFILL (1 << 7)
-#define BTREE_ITER_CACHED_NOCREATE (1 << 8)
-#define BTREE_ITER_WITH_KEY_CACHE (1 << 9)
-#define BTREE_ITER_WITH_UPDATES (1 << 10)
-#define BTREE_ITER_WITH_JOURNAL (1 << 11)
-#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
-#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13)
-#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14)
-#define BTREE_ITER_NOPRESERVE (1 << 15)
+#define BTREE_ITER_WITH_KEY_CACHE (1 << 7)
+#define BTREE_ITER_WITH_UPDATES (1 << 8)
+#define BTREE_ITER_WITH_JOURNAL (1 << 9)
+#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 10)
+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 11)
+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 12)
+#define BTREE_ITER_NOPRESERVE (1 << 13)
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
BTREE_ITER_NEED_TRAVERSE = 2,
};
-#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
-#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
-#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
-#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
-#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
-#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
-#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
-#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8)
-
struct btree_path {
u8 idx;
u8 sorted_idx;
*/
bool should_be_locked:1;
unsigned level:3,
- locks_want:4,
- nodes_locked:4,
- nodes_intent_locked:4;
+ locks_want:4;
+ u8 nodes_locked;
struct btree_path_level {
struct btree *b;
struct btree_node_iter iter;
u32 lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ u64 lock_taken_time;
+#endif
} l[BTREE_MAX_DEPTH];
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned long ip_allocated;
struct btree_path *key_cache_path;
enum btree_id btree_id:4;
- unsigned min_depth:4;
+ unsigned min_depth:3;
+ unsigned advanced:1;
/* btree_iter_copy starts here: */
u16 flags;
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
+
+ /* BTREE_ITER_WITH_JOURNAL: */
+ size_t journal_idx;
+ struct bpos journal_pos;
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned long ip_allocated;
#endif
};
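+/* Small per-cpu freelist of bkey_cached objects (see pcpu_freed below): */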
+struct btree_key_cache_freelist {
+ struct bkey_cached *objs[16];
+ unsigned nr;
+};
+
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
struct shrinker shrink;
unsigned shrink_iter;
+ struct btree_key_cache_freelist __percpu *pcpu_freed;
- size_t nr_freed;
+ atomic_long_t nr_freed;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
};
struct btree_bkey_cached_common c;
unsigned long flags;
- u8 u64s;
+ u16 u64s;
bool valid;
u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct bkey_i *k;
};
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+ return !b->cached
+ ? container_of(b, struct btree, c)->key.k.p
+ : container_of(b, struct bkey_cached, c)->key.pos;
+}
+
struct btree_insert_entry {
unsigned flags;
u8 bkey_type;
enum btree_id btree_id:8;
- u8 level;
+ u8 level:4;
bool cached:1;
bool insert_trigger_run:1;
bool overwrite_trigger_run:1;
+ bool key_cache_already_flushed:1;
+ /*
+ * @old_k may be a key from the journal; @old_btree_u64s always refers
+ * to the size of the key being overwritten in the btree:
+ */
+ u8 old_btree_u64s;
struct bkey_i *k;
struct btree_path *path;
+ /* key being overwritten: */
+ struct bkey old_k;
+ const struct bch_val *old_v;
unsigned long ip_allocated;
};
struct btree_trans_commit_hook *next;
};
-#define BTREE_TRANS_MEM_MAX (1U << 14)
+#define BTREE_TRANS_MEM_MAX (1U << 16)
+
+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
struct btree_trans {
struct bch_fs *c;
const char *fn;
+ struct closure ref;
struct list_head list;
- struct btree *locking;
- unsigned locking_path_idx;
- struct bpos locking_pos;
- u8 locking_btree_id;
- u8 locking_level;
- u8 locking_lock_type;
- pid_t pid;
+ u64 last_begin_time;
+
+ u8 lock_may_not_fail;
+ u8 lock_must_abort;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+
int srcu_idx;
+ u8 fn_idx;
u8 nr_sorted;
u8 nr_updates;
+ u8 traverse_all_idx;
bool used_mempool:1;
bool in_traverse_all:1;
- bool restarted:1;
- bool journal_transaction_names:1;
+ bool memory_allocation_failure:1;
+ bool is_initial_gc:1;
+ bool journal_replay_not_finished:1;
+ enum bch_errcode restarted:16;
+ u32 restart_count;
+ unsigned long last_restarted_ip;
+
/*
* For when bch2_trans_update notices we'll be splitting a compressed
* extent:
*/
unsigned extra_journal_res;
+ unsigned nr_max_paths;
u64 paths_allocated;
unsigned mem_top;
+ unsigned mem_max;
unsigned mem_bytes;
void *mem;
/* update path: */
struct btree_trans_commit_hook *hooks;
- struct jset_entry *extra_journal_entries;
- unsigned extra_journal_entry_u64s;
+ DARRAY(u64) extra_journal_entries;
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
struct replicas_delta_list *fs_usage_deltas;
};
-#define BTREE_FLAG(flag) \
+#define BTREE_FLAGS() \
+ x(read_in_flight) \
+ x(read_error) \
+ x(dirty) \
+ x(need_write) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(noevict) \
+ x(write_idx) \
+ x(accessed) \
+ x(write_in_flight) \
+ x(write_in_flight_inner) \
+ x(just_written) \
+ x(dying) \
+ x(fake) \
+ x(need_rewrite) \
+ x(never_write)
+
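+/*
+ * x-macro: one list of btree node flags, used to generate both
+ * enum btree_flags and the btree_node_<flag>() accessors below:
+ */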
+enum btree_flags {
+#define x(flag) BTREE_NODE_##flag,
+ BTREE_FLAGS()
+#undef x
+};
+
+#define x(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
static inline void clear_btree_node_ ## flag(struct btree *b) \
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-enum btree_flags {
- BTREE_NODE_read_in_flight,
- BTREE_NODE_read_error,
- BTREE_NODE_dirty,
- BTREE_NODE_need_write,
- BTREE_NODE_noevict,
- BTREE_NODE_write_idx,
- BTREE_NODE_accessed,
- BTREE_NODE_write_in_flight,
- BTREE_NODE_write_in_flight_inner,
- BTREE_NODE_just_written,
- BTREE_NODE_dying,
- BTREE_NODE_fake,
- BTREE_NODE_need_rewrite,
- BTREE_NODE_never_write,
-};
-
-BTREE_FLAG(read_in_flight);
-BTREE_FLAG(read_error);
-BTREE_FLAG(need_write);
-BTREE_FLAG(noevict);
-BTREE_FLAG(write_idx);
-BTREE_FLAG(accessed);
-BTREE_FLAG(write_in_flight);
-BTREE_FLAG(write_in_flight_inner);
-BTREE_FLAG(just_written);
-BTREE_FLAG(dying);
-BTREE_FLAG(fake);
-BTREE_FLAG(need_rewrite);
-BTREE_FLAG(never_write);
+BTREE_FLAGS()
+#undef x
static inline struct btree_write *btree_current_write(struct btree *b)
{
return __btree_node_type(b->c.level, b->c.btree_id);
}
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
- switch (type) {
- case BKEY_TYPE_extents:
- case BKEY_TYPE_reflink:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool btree_node_is_extents(struct btree *b)
-{
- return btree_node_type_is_extents(btree_node_type(b));
-}
-
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
((1U << BKEY_TYPE_extents)| \
+ (1U << BKEY_TYPE_alloc)| \
(1U << BKEY_TYPE_inodes)| \
(1U << BKEY_TYPE_stripes)| \
(1U << BKEY_TYPE_reflink)| \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+#define BTREE_ID_IS_EXTENTS \
+ ((1U << BTREE_ID_extents)| \
+ (1U << BTREE_ID_reflink)| \
+ (1U << BTREE_ID_freespace))
+
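+/*
+ * Note: also called with enum btree_node_type; leaf node types share their
+ * values with the corresponding btree IDs, so the same mask works:
+ */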
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
+ return (1U << type) & BTREE_ID_IS_EXTENTS;
+}
+
#define BTREE_ID_HAS_SNAPSHOTS \
((1U << BTREE_ID_extents)| \
(1U << BTREE_ID_inodes)| \
return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
}
-enum btree_update_flags {
- __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
- __BTREE_UPDATE_KEY_CACHE_RECLAIM,
-
- __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
-
- __BTREE_TRIGGER_INSERT,
- __BTREE_TRIGGER_OVERWRITE,
-
- __BTREE_TRIGGER_GC,
- __BTREE_TRIGGER_BUCKET_INVALIDATE,
- __BTREE_TRIGGER_NOATOMIC,
-};
-
-#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
-#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
-
-#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
-
-#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
-#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
-#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
-
-#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
- ((1U << KEY_TYPE_alloc)| \
- (1U << KEY_TYPE_alloc_v2)| \
- (1U << KEY_TYPE_alloc_v3)| \
- (1U << KEY_TYPE_stripe)| \
- (1U << KEY_TYPE_inode)| \
- (1U << KEY_TYPE_inode_v2)| \
- (1U << KEY_TYPE_snapshot))
+static inline bool btree_type_has_ptrs(enum btree_id id)
+{
+ return (1 << id) & BTREE_ID_HAS_PTRS;
+}
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
struct bch_fs;
struct btree;
-void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *,
- struct btree *);
+void bch2_btree_node_prep_for_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
struct btree *, struct btree_node_iter *,
struct bkey_i *);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
enum btree_insert_flags {
- __BTREE_INSERT_NOFAIL,
+ /* First two bits for journal watermark: */
+ __BTREE_INSERT_NOFAIL = 2,
__BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_LAZY_RW,
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
- __BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_JOURNAL_RECLAIM,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
-/* Indicates that we have pre-reserved space in the journal: */
-#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
-
/* Insert is being called from journal reclaim path: */
#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM)
#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE)
#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE)
+int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
+ unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *);
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *);
+int bch2_trans_log_msg(struct btree_trans *, const char *);
+
/**
* bch2_trans_commit - insert keys at given iterator positions
*
* This is the main entry point for btree updates.
*
* Return values:
- * -EINTR: locking changed, this function should be called again.
* -EROFS: filesystem read only
* -EIO: journal or btree node IO error
*/
return __bch2_trans_commit(trans);
}
-#define lockrestart_do(_trans, _do) \
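+/*
+ * commit_do()/nested_commit_do(): run _do, then commit if it returned 0,
+ * all inside a lockrestart_do() loop so transaction restarts are retried
+ * automatically:
+ */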
+#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
+
+#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
+
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
({ \
+ struct btree_trans trans; \
int _ret; \
\
- do { \
- bch2_trans_begin(_trans); \
- _ret = (_do); \
- } while (_ret == -EINTR); \
+ bch2_trans_init(&trans, (_c), 0, 0); \
+ _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \
+ bch2_trans_exit(&trans); \
\
_ret; \
})
-#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \
- lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_flags)))
-
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
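+/* bch2_trans_run(): like bch2_trans_do(), but _do is run as-is, with no implicit commit or restart loop */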
+#define bch2_trans_run(_c, _do) \
({ \
struct btree_trans trans; \
int _ret; \
\
bch2_trans_init(&trans, (_c), 0, 0); \
- _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \
- _do); \
+ _ret = (_do); \
bch2_trans_exit(&trans); \
\
_ret; \
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
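+/* Drop paths held by pending updates and clear per-commit transaction state: */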
+static inline void bch2_trans_reset_updates(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ bch2_path_put(trans, i->path, true);
+
+ trans->extra_journal_res = 0;
+ trans->nr_updates = 0;
+ trans->hooks = NULL;
+ trans->extra_journal_entries.nr = 0;
+}
+
#endif /* _BCACHEFS_BTREE_UPDATE_H */
#include <linux/random.h>
#include <trace/events/bcachefs.h>
-static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- struct btree_path *, struct btree *,
- struct keylist *, unsigned);
+static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
+ struct btree_path *, struct btree *,
+ struct keylist *, unsigned);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
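+/*
+ * Interior update helper: get a btree_path for the given position and level
+ * with all node locks dropped, so the caller can point it at a newly
+ * allocated node and have the locks it holds on that node tracked:
+ */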
+static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
+{
+ struct btree_path *path;
+
+ path = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ BTREE_ITER_NOPRESERVE|
+ BTREE_ITER_INTENT, _RET_IP_);
+ path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
+ bch2_btree_path_downgrade(trans, path);
+ __bch2_btree_path_unlock(trans, path);
+ return path;
+}
+
/* Debug code: */
/*
struct bkey_s_c k;
struct bkey_s_c_btree_ptr_v2 bp;
struct bkey unpacked;
- char buf1[100], buf2[100];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
BUG_ON(!b->c.level);
if (bpos_cmp(next_node, bp.v->min_key)) {
bch2_dump_btree_node(c, b);
- panic("expected next min_key %s got %s\n",
- (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2));
+ bch2_bpos_to_text(&buf1, next_node);
+ bch2_bpos_to_text(&buf2, bp.v->min_key);
+ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
}
bch2_btree_node_iter_advance(&iter, b);
if (bch2_btree_node_iter_end(&iter)) {
if (bpos_cmp(k.k->p, b->key.k.p)) {
bch2_dump_btree_node(c, b);
- panic("expected end %s got %s\n",
- (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2));
+ bch2_bpos_to_text(&buf1, b->key.k.p);
+ bch2_bpos_to_text(&buf2, k.k->p);
+ panic("expected end %s got %s\n", buf1.buf, buf2.buf);
}
break;
}
static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
- trace_btree_node_free(c, b);
+ trace_and_count(c, btree_node_free, c, b);
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_need_write(b));
}
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+ struct btree_path *path,
struct btree *b)
{
struct bch_fs *c = trans->c;
- struct btree_path *path;
+ unsigned level = b->c.level;
+
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
trans_for_each_path(trans, path)
- BUG_ON(path->l[b->c.level].b == b &&
- path->l[b->c.level].lock_seq == b->c.lock.state.seq);
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
- six_lock_write(&b->c.lock, NULL, NULL);
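+/*
+ * Free a node that was allocated for this update but never made reachable:
+ * it goes back on the update's prealloc list instead of the btree cache:
+ */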
+static void bch2_btree_node_free_never_used(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b)
+{
+ struct bch_fs *c = as->c;
+ struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
+ struct btree_path *path;
+ unsigned level = b->c.level;
+
+ BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
+
+ b->will_make_reachable = 0;
+ closure_put(&as->cl);
+ clear_btree_node_will_make_reachable(b);
+ clear_btree_node_accessed(b);
+ clear_btree_node_dirty_acct(c, b);
+ clear_btree_node_need_write(b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_del_init(&b->list);
bch2_btree_node_hash_remove(&c->btree_cache, b);
- __btree_node_free(c, b);
+ mutex_unlock(&c->btree_cache.lock);
+
+ BUG_ON(p->nr >= ARRAY_SIZE(p->b));
+ p->b[p->nr++] = b;
- six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
+
+ trans_for_each_path(trans, path)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
}
-static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
+ bool interior_node,
unsigned flags)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp;
struct btree *b;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
if (flags & BTREE_INSERT_USE_RESERVE) {
nr_reserve = 0;
- alloc_reserve = RESERVE_BTREE_MOVINGGC;
+ alloc_reserve = RESERVE_btree_movinggc;
} else {
nr_reserve = BTREE_NODE_RESERVE;
- alloc_reserve = RESERVE_BTREE;
+ alloc_reserve = RESERVE_btree;
}
mutex_lock(&c->btree_reserve_cache_lock);
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
- wp = bch2_alloc_sectors_start(c,
+ wp = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
bch2_open_bucket_get(c, wp, &ob);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, interior_node);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
return b;
}
-static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
+static struct btree *bch2_btree_node_alloc(struct btree_update *as,
+ struct btree_trans *trans,
+ unsigned level)
{
struct bch_fs *c = as->c;
struct btree *b;
+ struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!as->nr_prealloc_nodes);
+ BUG_ON(!p->nr);
- b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ b = p->b[--p->nr];
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
set_btree_node_accessed(b);
- set_btree_node_dirty(c, b);
+ set_btree_node_dirty_acct(c, b);
set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
BUG_ON(ret);
- trace_btree_node_alloc(c, b);
+ trace_and_count(c, btree_node_alloc, c, b);
return b;
}
}
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
+ struct btree_trans *trans,
struct btree *b,
struct bkey_format format)
{
struct btree *n;
- n = bch2_btree_node_alloc(as, b->c.level);
+ n = bch2_btree_node_alloc(as, trans, b->c.level);
SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
}
static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+ struct btree_trans *trans,
struct btree *b)
{
struct bkey_format new_f = bch2_btree_calc_format(b);
if (!bch2_btree_node_format_fits(as->c, b, &new_f))
new_f = b->format;
- return __bch2_btree_node_alloc_replacement(as, b, new_f);
+ return __bch2_btree_node_alloc_replacement(as, trans, b, new_f);
}
-static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
+static struct btree *__btree_root_alloc(struct btree_update *as,
+ struct btree_trans *trans, unsigned level)
{
- struct btree *b = bch2_btree_node_alloc(as, level);
+ struct btree *b = bch2_btree_node_alloc(as, trans, level);
btree_set_min(b, POS_MIN);
btree_set_max(b, SPOS_MAX);
btree_node_set_format(b, b->data->format);
bch2_btree_build_aux_trees(b);
- bch2_btree_update_add_new_node(as, b);
- six_unlock_write(&b->c.lock);
-
return b;
}
-static void bch2_btree_reserve_put(struct btree_update *as)
+static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
+ struct prealloc_nodes *p;
- mutex_lock(&c->btree_reserve_cache_lock);
+ for (p = as->prealloc_nodes;
+ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+ p++) {
+ while (p->nr) {
+ struct btree *b = p->b[--p->nr];
- while (as->nr_prealloc_nodes) {
- struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ mutex_lock(&c->btree_reserve_cache_lock);
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+ a->ob = b->ob;
+ b->ob.nr = 0;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch2_open_buckets_put(c, &b->ob);
+ }
- a->ob = b->ob;
- b->ob.nr = 0;
- bkey_copy(&a->k, &b->key);
- } else {
- bch2_open_buckets_put(c, &b->ob);
- }
+ mutex_unlock(&c->btree_reserve_cache_lock);
- __btree_node_free(c, b);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ }
}
-
- mutex_unlock(&c->btree_reserve_cache_lock);
}
-static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
- unsigned flags)
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+ struct btree_update *as,
+ unsigned nr_nodes[2],
+ unsigned flags,
+ struct closure *cl)
{
struct bch_fs *c = as->c;
- struct closure cl;
struct btree *b;
- int ret;
-
- closure_init_stack(&cl);
-retry:
+ unsigned interior;
+ int ret = 0;
- BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
*
* BTREE_INSERT_NOWAIT only applies to btree node allocation, not
* blocking on this lock:
*/
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(c, cl);
if (ret)
- goto err;
+ return ret;
+
+ for (interior = 0; interior < 2; interior++) {
+ struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+ while (p->nr < nr_nodes[interior]) {
+ b = __bch2_btree_node_alloc(trans, &as->disk_res,
+ flags & BTREE_INSERT_NOWAIT ? NULL : cl,
+ interior, flags);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
- while (as->nr_prealloc_nodes < nr_nodes) {
- b = __bch2_btree_node_alloc(c, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT
- ? NULL : &cl, flags);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err;
+ p->b[p->nr++] = b;
}
-
- as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
}
-
- bch2_btree_cache_cannibalize_unlock(c);
- closure_sync(&cl);
- return 0;
err:
bch2_btree_cache_cannibalize_unlock(c);
- closure_sync(&cl);
-
- if (ret == -EAGAIN)
- goto retry;
-
- trace_btree_reserve_get_fail(c, nr_nodes, &cl);
return ret;
}
/* Asynchronous interior node update machinery */
-static void bch2_btree_update_free(struct btree_update *as)
+static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
- bch2_btree_reserve_put(as);
+ bch2_btree_reserve_put(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
as->start_time);
mutex_unlock(&c->btree_interior_update_lock);
}
-static void btree_update_will_delete_key(struct btree_update *as,
- struct bkey_i *k)
+static void btree_update_add_key(struct btree_update *as,
+ struct keylist *keys, struct btree *b)
{
- BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
+ struct bkey_i *k = &b->key;
+
+ BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
ARRAY_SIZE(as->_old_keys));
- bch2_keylist_add(&as->old_keys, k);
-}
-static void btree_update_will_add_key(struct btree_update *as,
- struct bkey_i *k)
-{
- BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
- ARRAY_SIZE(as->_new_keys));
- bch2_keylist_add(&as->new_keys, k);
+ bkey_copy(keys->top, k);
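+ /* stash the level in mem_ptr; read back by the trans triggers when these keys are marked */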
+ bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
+
+ bch2_keylist_push(keys);
}
/*
struct bkey_i *k;
int ret;
- trans->extra_journal_entries = (void *) &as->journal_entries[0];
- trans->extra_journal_entry_u64s = as->journal_u64s;
+ ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
+ if (ret)
+ return ret;
+
+ memcpy(&darray_top(trans->extra_journal_entries),
+ as->journal_entries,
+ as->journal_u64s * sizeof(u64));
+ trans->extra_journal_entries.nr += as->journal_u64s;
+
trans->journal_pin = &as->journal;
- for_each_keylist_key(&as->new_keys, k) {
- ret = bch2_trans_mark_key(trans,
- bkey_s_c_null,
- bkey_i_to_s_c(k),
- BTREE_TRIGGER_INSERT);
+ for_each_keylist_key(&as->old_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+ ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
if (ret)
return ret;
}
- for_each_keylist_key(&as->old_keys, k) {
- ret = bch2_trans_mark_key(trans,
- bkey_i_to_s_c(k),
- bkey_s_c_null,
- BTREE_TRIGGER_OVERWRITE);
+ for_each_keylist_key(&as->new_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+ ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
if (ret)
return ret;
}
static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
- struct btree *b = as->b;
+ struct btree *b;
struct btree_trans trans;
u64 journal_seq = 0;
unsigned i;
int ret;
+ bch2_trans_init(&trans, c, 0, 512);
/*
* If we're already in an error state, it might be because a btree node
* was never written, and we might be trying to free that same btree
if (ret)
goto err;
- BUG_ON(!journal_pin_active(&as->journal));
-
/*
* Wait for any in flight writes to finish before we free the old nodes
* on disk:
*/
for (i = 0; i < as->nr_old_nodes; i++) {
- struct btree *old = as->old_nodes[i];
__le64 seq;
- six_lock_read(&old->c.lock, NULL, NULL);
- seq = old->data ? old->data->keys.seq : 0;
- six_unlock_read(&old->c.lock);
+ b = as->old_nodes[i];
+
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ seq = b->data ? b->data->keys.seq : 0;
+ six_unlock_read(&b->c.lock);
if (seq == as->old_nodes_seq[i])
- wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
- bch2_trans_init(&trans, c, 0, 512);
- ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_JOURNAL_RESERVED,
- btree_update_nodes_written_trans(&trans, as));
- bch2_trans_exit(&trans);
+ ret = commit_do(&trans, &as->disk_res, &journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ JOURNAL_WATERMARK_reserved,
+ btree_update_nodes_written_trans(&trans, as));
+ bch2_trans_unlock(&trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"error %i in btree_update_nodes_written()", ret);
err:
- if (b) {
+ if (as->b) {
+ struct btree_path *path;
+
+ b = as->b;
+ path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p);
/*
* @b is the node we did the final insert into:
*
* we're in journal error state:
*/
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ /*
+ * Ensure transaction is unlocked before using
+ * btree_node_lock_nopath() (the use of which is always suspect,
+ * we need to work on removing this in the future)
+ *
+ * It should be, but get_unlocked_mut_path() -> bch2_path_get()
+ * calls bch2_path_upgrade(), before we call path_make_mut(), so
+ * we may rarely end up with a locked path besides the one we
+ * have here:
+ */
+ bch2_trans_unlock(&trans);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
+ mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(&trans, path, b);
+
+ bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
+
mutex_lock(&c->btree_interior_update_lock);
list_del(&as->write_blocked_list);
+ if (list_empty(&b->write_blocked))
+ clear_btree_node_write_blocked(b);
/*
* Node might have been freed, recheck under
if (!ret) {
i->journal_seq = cpu_to_le64(
- max(journal_seq,
- le64_to_cpu(i->journal_seq)));
+ max(journal_seq,
+ le64_to_cpu(i->journal_seq)));
bch2_btree_add_journal_pin(c, b, journal_seq);
} else {
}
mutex_unlock(&c->btree_interior_update_lock);
+
+ mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
six_unlock_write(&b->c.lock);
btree_node_write_if_need(c, b, SIX_LOCK_intent);
- six_unlock_intent(&b->c.lock);
+ btree_node_unlock(&trans, path, b->c.level);
+ bch2_path_put(&trans, path, true);
}
bch2_journal_pin_drop(&c->journal, &as->journal);
BUG_ON(b->will_make_reachable != (unsigned long) as);
b->will_make_reachable = 0;
+ clear_btree_node_will_make_reachable(b);
}
mutex_unlock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
- six_lock_read(&b->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
for (i = 0; i < as->nr_open_buckets; i++)
bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
- bch2_btree_update_free(as);
+ bch2_btree_update_free(as, &trans);
+ bch2_trans_exit(&trans);
}
static void btree_interior_update_work(struct work_struct *work)
as->mode = BTREE_INTERIOR_UPDATING_NODE;
as->b = b;
+
+ set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
mutex_unlock(&c->btree_interior_update_lock);
as->new_nodes[as->nr_new_nodes++] = b;
b->will_make_reachable = 1UL|(unsigned long) as;
+ set_btree_node_will_make_reachable(b);
mutex_unlock(&c->btree_interior_update_lock);
- btree_update_will_add_key(as, &b->key);
+ btree_update_add_key(as, &as->new_keys, b);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
+ unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
+
+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+ cpu_to_le16(sectors);
+ }
}
/*
* xchg() is for synchronization with bch2_btree_complete_write:
*/
v = xchg(&b->will_make_reachable, 0);
+ clear_btree_node_will_make_reachable(b);
as = (struct btree_update *) (v & ~1UL);
if (!as) {
* btree_updates to point to this btree_update:
*/
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
- struct btree *b)
+ struct btree *b)
{
struct bch_fs *c = as->c;
struct btree_update *p, *n;
closure_wake_up(&c->btree_interior_update_wait);
}
- clear_btree_node_dirty(c, b);
+ clear_btree_node_dirty_acct(c, b);
clear_btree_node_need_write(b);
/*
*/
btree_update_drop_new_node(c, b);
- btree_update_will_delete_key(as, &b->key);
+ btree_update_add_key(as, &as->old_keys, b);
as->old_nodes[as->nr_old_nodes] = b;
as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
as->nr_old_nodes++;
}
-static void bch2_btree_update_done(struct btree_update *as)
+static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
up_read(&as->c->gc_lock);
as->took_gc_lock = false;
- bch2_btree_reserve_put(as);
+ bch2_btree_reserve_put(as, trans);
continue_at(&as->cl, btree_update_set_nodes_written,
as->c->btree_interior_update_worker);
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level, unsigned nr_nodes, unsigned flags)
+ unsigned level, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
- int journal_flags = 0;
+ unsigned nr_nodes[2] = { 0, 0 };
+ unsigned update_level = level;
+ int journal_flags = flags & JOURNAL_WATERMARK_MASK;
int ret = 0;
+ u32 restart_count = trans->restart_count;
BUG_ON(!path->should_be_locked);
- if (flags & BTREE_INSERT_JOURNAL_RESERVED)
- journal_flags |= JOURNAL_RES_GET_RESERVED;
if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
journal_flags |= JOURNAL_RES_GET_NONBLOCK;
- /*
- * XXX: figure out how far we might need to split,
- * instead of locking/reserving all the way to the root:
- */
- if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
- trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
- path->btree_id, &path->pos);
- ret = btree_trans_restart(trans);
- return ERR_PTR(ret);
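+ /*
+ * Walk up the tree: at each level, reserve the nodes we might need and
+ * upgrade to intent locks, stopping at the first level where the insert
+ * fits without splitting (or where we'd be allocating a new root):
+ */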
+ while (1) {
+ nr_nodes[!!update_level] += 1 + split;
+ update_level++;
+
+ ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!btree_path_node(path, update_level)) {
+ /* Allocating new root? */
+ nr_nodes[1] += split;
+ update_level = BTREE_MAX_DEPTH;
+ break;
+ }
+
+ if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+ BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
+ break;
+
+ split = true;
}
if (flags & BTREE_INSERT_GC_LOCK_HELD)
else if (!down_read_trylock(&c->gc_lock)) {
bch2_trans_unlock(trans);
down_read(&c->gc_lock);
- if (!bch2_trans_relock(trans)) {
+ ret = bch2_trans_relock(trans);
+ if (ret) {
up_read(&c->gc_lock);
- return ERR_PTR(-EINTR);
+ return ERR_PTR(ret);
}
}
as->mode = BTREE_INTERIOR_NO_UPDATE;
as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
as->btree_id = path->btree_id;
+ as->update_level = update_level;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
if (ret)
goto err;
- bch2_trans_unlock(trans);
-
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
- journal_flags);
+ journal_flags|JOURNAL_RES_GET_NONBLOCK);
if (ret) {
- bch2_btree_update_free(as);
- trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
- btree_trans_restart(trans);
- return ERR_PTR(ret);
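+ /*
+ * The nonblocking journal preres get failed: drop btree locks before
+ * blocking on the reservation (not allowed from journal reclaim), then
+ * relock below:
+ */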
+ bch2_trans_unlock(trans);
+
+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ goto err;
+ }
+
+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
+ BTREE_UPDATE_JOURNAL_RES,
+ journal_flags);
+ if (ret) {
+ trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
+ goto err;
+ }
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
- nr_nodes * btree_sectors(c),
+ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
disk_res_flags);
if (ret)
goto err;
- ret = bch2_btree_reserve_get(as, nr_nodes, flags);
- if (ret)
- goto err;
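+ /*
+ * Try allocating nodes without blocking first; on -ENOSPC or -ENOMEM,
+ * retry with a closure we can block on, dropping btree locks while we wait:
+ */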
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+ if (bch2_err_matches(ret, ENOSPC) ||
+ bch2_err_matches(ret, ENOMEM)) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
- if (!bch2_trans_relock(trans)) {
- ret = -EINTR;
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ } while (ret == -EAGAIN);
+ }
+
+ if (ret) {
+ trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]);
goto err;
}
- bch2_journal_pin_add(&c->journal,
- atomic64_read(&c->journal.seq),
- &as->journal, NULL);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
+ bch2_trans_verify_not_restarted(trans, restart_count);
return as;
err:
- bch2_btree_update_free(as);
+ bch2_btree_update_free(as, trans);
return ERR_PTR(ret);
}
list_del_init(&b->list);
mutex_unlock(&c->btree_cache.lock);
- if (b->c.level)
- six_lock_pcpu_alloc(&b->c.lock);
- else
- six_lock_pcpu_free(&b->c.lock);
-
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->c.level < btree_node_root(c, b)->c.level ||
struct bch_fs *c = as->c;
struct btree *old;
- trace_btree_set_root(c, b);
- BUG_ON(!b->written &&
- !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
+ trace_and_count(c, btree_node_set_root, c, b);
old = btree_node_root(c, b);
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write(trans, path, old);
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
bch2_btree_set_root_inmem(c, b);
{
struct bch_fs *c = as->c;
struct bkey_packed *k;
- const char *invalid;
+ struct printbuf buf = PRINTBUF;
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
- bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
- if (invalid) {
- char buf[160];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
- bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+ btree_node_type(b), WRITE, &buf) ?:
+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "inserting invalid bkey\n ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ prt_printf(&buf, "\n ");
+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+ btree_node_type(b), WRITE, &buf);
+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
dump_stack();
}
bch2_btree_node_iter_advance(node_iter, b);
bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
- set_btree_node_dirty(c, b);
+ set_btree_node_dirty_acct(c, b);
set_btree_node_need_write(b);
+
+ printbuf_exit(&buf);
}
static void
* node)
*/
static struct btree *__btree_split_node(struct btree_update *as,
+ struct btree_trans *trans,
struct btree *n1)
{
struct bkey_format_state s;
struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
struct bpos n1_pos;
- n2 = bch2_btree_node_alloc(as, n1->c.level);
- bch2_btree_update_add_new_node(as, n2);
+ n2 = bch2_btree_node_alloc(as, trans, n1->c.level);
n2->data->max_key = n1->data->max_key;
n2->data->format = n1->format;
btree_node_interior_verify(as->c, b);
}
-static void btree_split(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
- struct keylist *keys, unsigned flags)
+static int btree_split(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(path, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
+ struct btree_path *path1 = NULL, *path2 = NULL;
u64 start_time = local_clock();
+ int ret = 0;
BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
+ BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
bch2_btree_interior_update_will_free_node(as, b);
- n1 = bch2_btree_node_alloc_replacement(as, b);
- bch2_btree_update_add_new_node(as, n1);
+ n1 = bch2_btree_node_alloc_replacement(as, trans, b);
if (keys)
btree_split_insert_keys(as, trans, path, n1, keys);
if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
- trace_btree_split(c, b);
+ trace_and_count(c, btree_node_split, c, b);
- n2 = __btree_split_node(as, n1);
+ n2 = __btree_split_node(as, trans, n1);
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
+
+ bch2_btree_update_add_new_node(as, n1);
+ bch2_btree_update_add_new_node(as, n2);
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent);
- bch2_btree_node_write(c, n2, SIX_LOCK_intent);
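+ /*
+ * Allocate btree_paths pointing at the new nodes, so the intent locks we
+ * hold on them are tracked by the btree_trans:
+ */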
+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, path1, n1);
+
+ path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
+ six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, path2, n2);
/*
* Note that on recursive parent_keys == keys, so we
if (!parent) {
/* Depth increases, make a new root */
- n3 = __btree_root_alloc(as, b->c.level + 1);
+ n3 = __btree_root_alloc(as, trans, b->c.level + 1);
+
+ bch2_btree_update_add_new_node(as, n3);
+ six_unlock_write(&n3->c.lock);
+
+ path2->locks_want++;
+ BUG_ON(btree_node_locked(path2, n3->c.level));
+ six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, path2, n3);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
-
- bch2_btree_node_write(c, n3, SIX_LOCK_intent);
}
} else {
- trace_btree_compact(c, b);
+ trace_and_count(c, btree_node_compact, c, b);
bch2_btree_build_aux_trees(n1);
+ bch2_btree_update_add_new_node(as, n1);
six_unlock_write(&n1->c.lock);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, path1, n1);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
if (parent) {
/* Split a non root node */
- bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ if (ret)
+ goto err;
} else if (n3) {
bch2_btree_set_root(as, trans, path, n3);
} else {
bch2_btree_set_root(as, trans, path, n1);
}
- bch2_btree_update_get_open_buckets(as, n1);
- if (n2)
- bch2_btree_update_get_open_buckets(as, n2);
- if (n3)
+ if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
-
- /* Successful split, update the path to point to the new nodes: */
-
- six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- if (n3)
- bch2_trans_node_add(trans, n3);
- if (n2)
- bch2_trans_node_add(trans, n2);
- bch2_trans_node_add(trans, n1);
+ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+ }
+ if (n2) {
+ bch2_btree_update_get_open_buckets(as, n2);
+ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+ }
+ bch2_btree_update_get_open_buckets(as, n1);
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
/*
* The old node must be freed (in memory) _before_ unlocking the new
* node after another thread has locked and updated the new node, thus
* seeing stale data:
*/
- bch2_btree_node_free_inmem(trans, b);
+ bch2_btree_node_free_inmem(trans, path, b);
+
+ if (n3)
+ bch2_trans_node_add(trans, n3);
+ if (n2)
+ bch2_trans_node_add(trans, n2);
+ bch2_trans_node_add(trans, n1);
if (n3)
six_unlock_intent(&n3->c.lock);
if (n2)
six_unlock_intent(&n2->c.lock);
six_unlock_intent(&n1->c.lock);
+out:
+ if (path2) {
+ __bch2_btree_path_unlock(trans, path2);
+ bch2_path_put(trans, path2, true);
+ }
+ if (path1) {
+ __bch2_btree_path_unlock(trans, path1);
+ bch2_path_put(trans, path1, true);
+ }
bch2_trans_verify_locks(trans);
? BCH_TIME_btree_node_split
: BCH_TIME_btree_node_compact],
start_time);
+ return ret;
+err:
+ if (n3)
+ bch2_btree_node_free_never_used(as, trans, n3);
+ if (n2)
+ bch2_btree_node_free_never_used(as, trans, n2);
+ bch2_btree_node_free_never_used(as, trans, n1);
+ goto out;
}
static void
* If a split occurred, this function will return early. This can only happen
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
-static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
- struct keylist *keys, unsigned flags)
+static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
+ int ret;
lockdep_assert_held(&c->gc_lock);
- BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
+ BUG_ON(!btree_node_intent_locked(path, b->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
- bch2_btree_node_lock_for_insert(trans, path, b);
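+ /* Randomly (about 1 in 64 calls) force a transaction restart, to exercise the split race path: */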
+ if (!(local_clock() & 63))
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+
+ ret = bch2_btree_node_lock_write(trans, path, &b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, b);
if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(trans, path, b);
bch2_btree_node_unlock_write(trans, path, b);
btree_node_interior_verify(c, b);
- return;
+ return 0;
split:
- btree_split(as, trans, path, b, keys, flags);
+ /*
+ * We could attempt to avoid the transaction restart, by calling
+ * bch2_btree_path_upgrade() and allocating more nodes:
+ */
+ if (b->c.level >= as->update_level)
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+
+ return btree_split(as, trans, path, b, keys, flags);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
struct btree_path *path,
unsigned flags)
{
- struct bch_fs *c = trans->c;
struct btree *b = path_l(path)->b;
struct btree_update *as;
unsigned l;
int ret = 0;
as = bch2_btree_update_start(trans, path, path->level,
- btree_update_reserve_required(c, b), flags);
+ true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
- btree_split(as, trans, path, b, NULL, flags);
- bch2_btree_update_done(as);
+ ret = btree_split(as, trans, path, b, NULL, flags);
+ if (ret) {
+ bch2_btree_update_free(as, trans);
+ return ret;
+ }
+
+ bch2_btree_update_done(as, trans);
- for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
+ for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
ret = bch2_foreground_maybe_merge(trans, path, l, flags);
return ret;
enum btree_node_sibling sib)
{
struct bch_fs *c = trans->c;
- struct btree_path *sib_path = NULL;
+ struct btree_path *sib_path = NULL, *new_path = NULL;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
if (ret)
goto err;
- sib_path->should_be_locked = true;
+ btree_path_set_should_be_locked(sib_path);
m = sib_path->l[level].b;
}
if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) {
- char buf1[100], buf2[100];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key);
- bch2_bpos_to_text(&PBUF(buf2), next->data->min_key);
+ bch2_bpos_to_text(&buf1, prev->data->max_key);
+ bch2_bpos_to_text(&buf2, next->data->min_key);
bch_err(c,
"btree topology error in btree merge:\n"
" prev ends at %s\n"
" next starts at %s",
- buf1, buf2);
+ buf1.buf, buf2.buf);
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
bch2_topology_error(c);
ret = -EIO;
goto err;
goto out;
parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, level,
- btree_update_reserve_required(c, parent) + 1,
- flags|
+ as = bch2_btree_update_start(trans, path, level, false,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
+ BTREE_INSERT_USE_RESERVE|
+ flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
- trace_btree_merge(c, b);
+ trace_and_count(c, btree_node_merge, c, b);
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
- n = bch2_btree_node_alloc(as, b->c.level);
- bch2_btree_update_add_new_node(as, n);
+ n = bch2_btree_node_alloc(as, trans, b->c.level);
+
+ SET_BTREE_NODE_SEQ(n->data,
+ max(BTREE_NODE_SEQ(b->data),
+ BTREE_NODE_SEQ(m->data)) + 1);
btree_set_min(n, prev->data->min_key);
btree_set_max(n, next->data->max_key);
- n->data->format = new_f;
+ n->data->format = new_f;
btree_node_set_format(n, new_f);
bch2_btree_sort_into(c, n, prev);
bch2_btree_sort_into(c, n, next);
bch2_btree_build_aux_trees(n);
+ bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
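+ /* allocate a path pointing at the merged node, so the intent lock we hold on it is tracked: */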
+ new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, new_path, n);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
bch2_trans_verify_paths(trans);
- bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ if (ret)
+ goto err_free_update;
bch2_trans_verify_paths(trans);
bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- six_lock_increment(&m->c.lock, SIX_LOCK_intent);
+ bch2_btree_node_free_inmem(trans, path, b);
+ bch2_btree_node_free_inmem(trans, sib_path, m);
bch2_trans_node_add(trans, n);
bch2_trans_verify_paths(trans);
- bch2_btree_node_free_inmem(trans, b);
- bch2_btree_node_free_inmem(trans, m);
-
six_unlock_intent(&n->c.lock);
- bch2_btree_update_done(as);
+ bch2_btree_update_done(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
bch2_path_put(trans, sib_path, true);
bch2_trans_verify_locks(trans);
return ret;
+err_free_update:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
+ goto out;
}
/**
unsigned flags)
{
struct bch_fs *c = trans->c;
+ struct btree_path *new_path = NULL;
struct btree *n, *parent;
struct btree_update *as;
int ret;
parent = btree_node_parent(iter->path, b);
as = bch2_btree_update_start(trans, iter->path, b->c.level,
- (parent
- ? btree_update_reserve_required(c, parent)
- : 0) + 1,
- flags);
+ false, flags);
ret = PTR_ERR_OR_ZERO(as);
- if (ret) {
- trace_btree_gc_rewrite_node_fail(c, b);
+ if (ret)
goto out;
- }
bch2_btree_interior_update_will_free_node(as, b);
- n = bch2_btree_node_alloc_replacement(as, b);
- bch2_btree_update_add_new_node(as, n);
+ n = bch2_btree_node_alloc_replacement(as, trans, b);
bch2_btree_build_aux_trees(n);
+ bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- trace_btree_gc_rewrite_node(c, b);
+ new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, new_path, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
+ trace_and_count(c, btree_node_rewrite, c, b);
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_insert_node(as, trans, iter->path, parent,
- &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, iter->path, parent,
+ &as->parent_keys, flags);
+ if (ret)
+ goto err;
} else {
bch2_btree_set_root(as, trans, iter->path, n);
}
bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+
+ bch2_btree_node_free_inmem(trans, iter->path, b);
- six_lock_increment(&b->c.lock, SIX_LOCK_intent);
bch2_trans_node_add(trans, n);
- bch2_btree_node_free_inmem(trans, b);
six_unlock_intent(&n->c.lock);
- bch2_btree_update_done(as);
+ bch2_btree_update_done(as, trans);
out:
- bch2_btree_path_downgrade(iter->path);
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
+ bch2_btree_path_downgrade(trans, iter->path);
return ret;
+err:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
+ goto out;
}
struct async_btree_rewrite {
goto out;
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
-out :
+out:
bch2_trans_iter_exit(trans, &iter);
return ret;
{
struct async_btree_rewrite *a;
- if (!percpu_ref_tryget(&c->writes))
+ if (!percpu_ref_tryget_live(&c->writes))
return;
a = kmalloc(sizeof(*a), GFP_NOFS);
struct bch_fs *c = trans->c;
struct btree_iter iter2 = { NULL };
struct btree *parent;
- u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
int ret;
if (!skip_triggers) {
- ret = bch2_trans_mark_key(trans,
- bkey_s_c_null,
- bkey_i_to_s_c(new_key),
- BTREE_TRIGGER_INSERT);
+ ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s_c(&b->key), 0);
if (ret)
return ret;
- ret = bch2_trans_mark_key(trans,
- bkey_i_to_s_c(&b->key),
- bkey_s_c_null,
- BTREE_TRIGGER_OVERWRITE);
+ ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
+ new_key, 0);
if (ret)
return ret;
}
BUG_ON(iter2.path->level != b->c.level);
BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));
- btree_node_unlock(iter2.path, iter2.path->level);
- path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
- iter2.path->level++;
+ btree_path_set_level_up(trans, iter2.path);
+
+ bch2_btree_path_check_sort(trans, iter2.path, 0);
ret = bch2_btree_iter_traverse(&iter2) ?:
bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
} else {
BUG_ON(btree_node_root(c, b) != b);
- trans->extra_journal_entries = (void *) &journal_entries[0];
- trans->extra_journal_entry_u64s =
- journal_entry_set((void *) &journal_entries[0],
- BCH_JSET_ENTRY_btree_root,
- b->c.btree_id, b->c.level,
- new_key, new_key->k.u64s);
+ ret = darray_make_room(&trans->extra_journal_entries,
+ jset_u64s(new_key->k.u64s));
+ if (ret)
+ return ret;
+
+ journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ new_key, new_key->k.u64s);
+ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
}
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_JOURNAL_RESERVED);
+ JOURNAL_WATERMARK_reserved);
if (ret)
goto err;
- bch2_btree_node_lock_write(trans, iter->path, b);
+ bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
struct closure cl;
int ret = 0;
- if (!btree_node_intent_locked(path, b->c.level) &&
- !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) {
- btree_trans_restart(trans);
- return -EINTR;
- }
+ ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
+ if (ret)
+ return ret;
closure_init_stack(&cl);
if (ret) {
bch2_trans_unlock(trans);
closure_sync(&cl);
- if (!bch2_trans_relock(trans))
- return -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ return ret;
}
- new_hash = bch2_btree_node_mem_alloc(c);
+ new_hash = bch2_btree_node_mem_alloc(c, false);
}
path->intent_ref++;
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, false);
bch2_btree_cache_cannibalize_unlock(c);
set_btree_node_fake(b);
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- pr_buf(out, "%p m %u w %u r %u j %llu\n",
+ prt_printf(out, "%p m %u w %u r %u j %llu\n",
as,
as->mode,
as->nodes_written,
mutex_unlock(&c->btree_interior_update_lock);
}
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
{
- size_t ret = 0;
- struct list_head *i;
+ bool ret;
mutex_lock(&c->btree_interior_update_lock);
- list_for_each(i, &c->btree_interior_update_list)
- ret++;
+ ret = !list_empty(&c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
return ret;
}
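+/*
+ * Wait for any in-flight interior btree updates to complete; returns true if
+ * there were any to wait on:
+ */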
+bool bch2_btree_interior_updates_flush(struct bch_fs *c)
+{
+ bool ret = bch2_btree_interior_updates_pending(c);
+
+ if (ret)
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_pending(c));
+ return ret;
+}
+
void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
{
struct btree_root *r;
unsigned took_gc_lock:1;
enum btree_id btree_id;
+ unsigned update_level;
struct disk_reservation disk_res;
struct journal_preres journal_preres;
struct journal_entry_pin journal;
/* Preallocated nodes we reserve when we start the update: */
- struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
- unsigned nr_prealloc_nodes;
+ struct prealloc_nodes {
+ struct btree *b[BTREE_UPDATE_NODES_MAX];
+ unsigned nr;
+ } prealloc_nodes[2];
/* Nodes being freed: */
struct keylist old_keys;
};
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+ struct btree_trans *,
struct btree *,
struct bkey_format);
void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+bool bch2_btree_interior_updates_flush(struct bch_fs *);
void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
#include "btree_locking.h"
#include "buckets.h"
#include "debug.h"
+#include "errcode.h"
#include "error.h"
#include "extent_update.h"
#include "journal.h"
const struct btree_insert_entry *r)
{
return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->cached, r->cached) ?:
-cmp_int(l->level, r->level) ?:
bpos_cmp(l->k->k.p, r->k->k.p);
}
insert_l(&i[0])->b == insert_l(&i[1])->b;
}
-static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
struct bch_fs *c = trans->c;
bch2_btree_init_next(trans, b);
}
-void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- bch2_btree_node_lock_write(trans, path, b);
- bch2_btree_node_prep_for_write(trans, path, b);
-}
-
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
+ struct btree_trans trans;
+ unsigned long old, new, v;
+ unsigned idx = w - b->writes;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ v = READ_ONCE(b->flags);
+
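+ /*
+ * Atomically set BTREE_NODE_need_write, but only if the node is still dirty,
+ * this is still the current write index, and the journal pin's sequence
+ * number still matches:
+ */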
+ do {
+ old = new = v;
- six_lock_read(&b->c.lock, NULL, NULL);
- bch2_btree_node_write_cond(c, b,
- (btree_current_write(b) == w && w->journal.seq == seq));
+ if (!(old & (1 << BTREE_NODE_dirty)) ||
+ !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+ w->journal.seq != seq)
+ break;
+
+ new |= 1 << BTREE_NODE_need_write;
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
+
+ bch2_trans_exit(&trans);
return 0;
}
/**
* btree_insert_key - insert one key into a leaf node
*/
-static bool btree_insert_key_leaf(struct btree_trans *trans,
+static void btree_insert_key_leaf(struct btree_trans *trans,
struct btree_insert_entry *insert)
{
struct bch_fs *c = trans->c;
if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
&insert_l(insert)->iter, insert->k)))
- return false;
+ return;
i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
le64_to_cpu(i->journal_seq)));
bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
if (unlikely(!btree_node_dirty(b)))
- set_btree_node_dirty(c, b);
+ set_btree_node_dirty_acct(c, b);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
-
- return true;
}
/* Cached btree updates: */
if (ret)
return ret;
- if (!bch2_trans_relock(trans)) {
- trace_trans_restart_journal_preres_get(trans->fn, trace_ip);
- return -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0);
+ return ret;
}
return 0;
struct bch_fs *c = trans->c;
int ret;
- if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
- flags |= JOURNAL_RES_GET_RESERVED;
-
ret = bch2_journal_res_get(&c->journal, &trans->journal_res,
- trans->journal_u64s, flags);
+ trans->journal_u64s,
+ flags|
+ (trans->flags & JOURNAL_WATERMARK_MASK));
return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
}
#define JSET_ENTRY_LOG_U64s 4
-static noinline void journal_transaction_name(struct btree_trans *trans)
+static void journal_transaction_name(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res);
- struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
- unsigned u64s = JSET_ENTRY_LOG_U64s - 1;
- unsigned b, buflen = u64s * sizeof(u64);
-
- l->entry.u64s = cpu_to_le16(u64s);
- l->entry.btree_id = 0;
- l->entry.level = 0;
- l->entry.type = BCH_JSET_ENTRY_log;
- l->entry.pad[0] = 0;
- l->entry.pad[1] = 0;
- l->entry.pad[2] = 0;
- b = min_t(unsigned, strlen(trans->fn), buflen);
- memcpy(l->d, trans->fn, b);
- while (b < buflen)
- l->d[b++] = '\0';
-
- trans->journal_res.offset += JSET_ENTRY_LOG_U64s;
- trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s;
+ struct journal *j = &c->journal;
+ struct jset_entry *entry =
+ bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_log, 0, 0,
+ JSET_ENTRY_LOG_U64s);
+ struct jset_entry_log *l =
+ container_of(entry, struct jset_entry_log, entry);
+
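+ /* the name is truncated to the log entry size; strncpy() zero-pads any remainder */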
+ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
}
static inline enum btree_insert_ret
ck->u64s = new_u64s;
ck->k = new_k;
- return BTREE_INSERT_OK;
+ return 0;
}
-static inline void do_btree_insert_one(struct btree_trans *trans,
- struct btree_insert_entry *i)
+/* Triggers: */
+
+static int run_one_mem_trigger(struct btree_trans *trans,
+ struct btree_insert_entry *i,
+ unsigned flags)
{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- bool did_work;
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+ struct bkey_i *new = i->k;
+ int ret;
- EBUG_ON(trans->journal_res.ref !=
- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
+ if (unlikely(flags & BTREE_TRIGGER_NORUN))
+ return 0;
- i->k->k.needs_whiteout = false;
+ if (!btree_node_type_needs_gc(i->btree_id))
+ return 0;
- did_work = !i->cached
- ? btree_insert_key_leaf(trans, i)
- : bch2_btree_insert_key_cached(trans, i->path, i->k);
- if (!did_work)
- return;
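+ /*
+ * If the old and new key types share an atomic trigger and the type wants
+ * both old and new values, mark them in a single call; otherwise run the
+ * insert and overwrite marking separately, using a synthetic deleted key as
+ * the other side:
+ */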
+ if (bch2_bkey_ops[old.k->type].atomic_trigger ==
+ bch2_bkey_ops[i->k->k.type].atomic_trigger &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+ } else {
+ struct bkey _deleted = KEY(0, 0, 0);
+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- bch2_journal_add_keys(j, &trans->journal_res,
- i->btree_id,
- i->level,
- i->k);
+ _deleted.p = i->path->pos;
- if (trans->journal_seq)
- *trans->journal_seq = trans->journal_res.seq;
+ ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|flags) ?:
+ bch2_mark_key(trans, old, deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
+ }
+
+ return ret;
+}
+
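+/*
+ * Returns 1 if a trigger was run, 0 if there was nothing to do, or a negative
+ * error code; the caller loops until no more triggers run:
+ */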
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+ bool overwrite)
+{
+ /*
+ * Transactional triggers create new btree_insert_entries, so we can't
+ * pass them a pointer to a btree_insert_entry, that memory is going to
+ * move:
+ */
+ struct bkey old_k = i->old_k;
+ struct bkey_s_c old = { &old_k, i->old_v };
+
+ if ((i->flags & BTREE_TRIGGER_NORUN) ||
+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+ return 0;
+
+ if (!i->insert_trigger_run &&
+ !i->overwrite_trigger_run &&
+ bch2_bkey_ops[old.k->type].trans_trigger ==
+ bch2_bkey_ops[i->k->k.type].trans_trigger &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ i->overwrite_trigger_run = true;
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_OVERWRITE|
+ i->flags) ?: 1;
+ } else if (overwrite && !i->overwrite_trigger_run) {
+ i->overwrite_trigger_run = true;
+ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+ } else if (!overwrite && !i->insert_trigger_run) {
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+ } else {
+ return 0;
+ }
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+ struct btree_insert_entry *btree_id_start)
+{
+ struct btree_insert_entry *i;
+ bool trans_trigger_run;
+ int ret, overwrite;
+
+ for (overwrite = 1; overwrite >= 0; --overwrite) {
+
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (i = btree_id_start;
+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ i++) {
+ if (i->btree_id != btree_id)
+ continue;
+
+ ret = run_one_trans_trigger(trans, i, overwrite);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
+ }
+ } while (trans_trigger_run);
+ }
+
+ return 0;
+}
+
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ unsigned btree_id = 0;
+ int ret = 0;
+
+ /*
+ *
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being moved
+ * (e.g. by FALLOC_FL_INSERT_RANGE), we don't drop references before
+ * they are re-added.
+ */
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ if (btree_id == BTREE_ID_alloc)
+ continue;
+
+ while (btree_id_start < trans->updates + trans->nr_updates &&
+ btree_id_start->btree_id < btree_id)
+ btree_id_start++;
+
+ ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ if (ret)
+ return ret;
}
+
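+ /* Triggers for the alloc btree were skipped above; run them last: */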
+ trans_for_each_update(trans, i) {
+ if (i->btree_id > BTREE_ID_alloc)
+ break;
+ if (i->btree_id == BTREE_ID_alloc) {
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+ if (ret)
+ return ret;
+ break;
+ }
+ }
+
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
+
+ return 0;
}
-static noinline int bch2_trans_mark_gc(struct btree_trans *trans)
+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
BUG_ON(i->cached || i->level);
if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
- ret = bch2_mark_update(trans, i->path, i->k,
- i->flags|BTREE_TRIGGER_GC);
+ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
if (ret)
break;
}
int ret;
if (race_fault()) {
- trace_trans_restart_fault_inject(trans->fn, trace_ip);
- trans->restarted = true;
- return -EINTR;
+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
+ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
}
/*
if (btree_node_type_needs_gc(i->bkey_type))
marking = true;
+
+ /*
+ * Revalidate before calling mem triggers - XXX, ugly:
+ *
+ * - successful btree node splits don't cause transaction
+ * restarts and will have invalidated the pointer to the bkey
+ * value
+ * - btree_node_lock_for_insert() -> btree_node_prep_for_write()
+ * when it has to resort
+ * - btree_key_can_insert_cached() when it has to reallocate
+ *
+ * Ugly because we currently have no way to tell if the
+ * pointer's been invalidated, which means it's debatable
+ * whether we should be stashing the old key at all.
+ */
+ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
+ i->k->k.p);
+
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
}
/*
if (ret)
return ret;
- if (unlikely(trans->journal_transaction_names))
- journal_transaction_name(trans);
+ journal_transaction_name(trans);
} else {
trans->journal_res.seq = c->journal.replay_journal_seq;
}
- if (unlikely(trans->extra_journal_entry_u64s)) {
+ if (unlikely(trans->extra_journal_entries.nr)) {
memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->extra_journal_entries,
- trans->extra_journal_entry_u64s);
+ trans->extra_journal_entries.data,
+ trans->extra_journal_entries.nr);
- trans->journal_res.offset += trans->extra_journal_entry_u64s;
- trans->journal_res.u64s -= trans->extra_journal_entry_u64s;
+ trans->journal_res.offset += trans->extra_journal_entries.nr;
+ trans->journal_res.u64s -= trans->extra_journal_entries.nr;
}
/*
trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
- ret = bch2_mark_update(trans, i->path, i->k, i->flags);
+ ret = run_one_mem_trigger(trans, i, i->flags);
if (ret)
return ret;
}
if (unlikely(c->gc_pos.phase)) {
- ret = bch2_trans_mark_gc(trans);
+ ret = bch2_trans_commit_run_gc_triggers(trans);
if (ret)
return ret;
}
- trans_for_each_update(trans, i)
- do_btree_insert_one(trans, i);
-
- return ret;
-}
-
-static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned l;
-
- for (l = 0; l < BTREE_MAX_DEPTH; l++)
- if (btree_node_read_locked(path, l))
- BUG_ON(!bch2_btree_node_upgrade(trans, path, l));
-}
-
-static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path)
-{
- struct btree *b = path_l(path)->b;
-
- do {
- if (path->nodes_locked &&
- path->nodes_locked != path->nodes_intent_locked)
- path_upgrade_readers(trans, path);
- } while ((path = prev_btree_path(trans, path)) &&
- path_l(path)->b == b);
-}
-
-/*
- * Check for nodes that we have both read and intent locks on, and upgrade the
- * readers to intent:
- */
-static inline void normalize_read_intent_locks(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i, nr_read = 0, nr_intent = 0;
-
- trans_for_each_path_inorder(trans, path, i) {
- struct btree_path *next = i + 1 < trans->nr_sorted
- ? trans->paths + trans->sorted[i + 1]
- : NULL;
-
- if (path->nodes_locked) {
- if (path->nodes_intent_locked)
- nr_intent++;
- else
- nr_read++;
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
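+ /*
+ * Journal the key being overwritten (BCH_JSET_ENTRY_overwrite) as well as the
+ * new key (BCH_JSET_ENTRY_btree_keys) for each update, skipping updates whose
+ * key cache entry was already flushed:
+ */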
+ trans_for_each_update(trans, i) {
+ struct journal *j = &c->journal;
+ struct jset_entry *entry;
+
+ if (i->key_cache_already_flushed)
+ continue;
+
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_overwrite,
+ i->btree_id, i->level,
+ i->old_k.u64s);
+ bkey_reassemble(&entry->start[0],
+ (struct bkey_s_c) { &i->old_k, i->old_v });
+
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ i->btree_id, i->level,
+ i->k->k.u64s);
+ bkey_copy(&entry->start[0], i->k);
}
- if (!next || path_l(path)->b != path_l(next)->b) {
- if (nr_read && nr_intent)
- upgrade_readers(trans, path);
-
- nr_read = nr_intent = 0;
- }
+ if (trans->journal_seq)
+ *trans->journal_seq = trans->journal_res.seq;
}
- bch2_trans_verify_locks(trans);
-}
-
-static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path_inorder(trans, path, i) {
- //if (path == pos)
- // break;
-
- if (path->nodes_locked != path->nodes_intent_locked &&
- !bch2_btree_path_upgrade(trans, path, path->level + 1))
- return true;
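+ /*
+ * Do the actual inserts: regular updates go straight into the leaf node,
+ * cached updates go into the key cache unless they were already flushed, in
+ * which case the key cache entry is just dropped:
+ */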
+ trans_for_each_update(trans, i) {
+ i->k->k.needs_whiteout = false;
+
+ if (!i->cached)
+ btree_insert_key_leaf(trans, i);
+ else if (!i->key_cache_already_flushed)
+ bch2_btree_insert_key_cached(trans, i->path, i->k);
+ else {
+ bch2_btree_key_cache_drop(trans, i->path);
+ btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
+ }
}
- return false;
+ return ret;
}
static inline int trans_lock_write(struct btree_trans *trans)
{
struct btree_insert_entry *i;
+ int ret;
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
- if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
- if (have_conflicting_read_lock(trans, i->path))
- goto fail;
-
- btree_node_lock_type(trans, i->path,
- insert_l(i)->b,
- i->path->pos, i->level,
- SIX_LOCK_write, NULL, NULL);
- }
+ ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c);
+ if (ret)
+ goto fail;
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
}
- trace_trans_restart_would_deadlock_write(trans->fn);
- return btree_trans_restart(trans);
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- struct bkey_s_c old;
+ struct printbuf buf = PRINTBUF;
int ret, u64s_delta = 0;
+ int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
trans_for_each_update(trans, i) {
- const char *invalid = bch2_bkey_invalid(c,
- bkey_i_to_s_c(i->k), i->bkey_type);
- if (invalid) {
- char buf[200];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
- buf, trans->fn, (void *) i->ip_allocated, invalid);
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, rw, &buf)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "invalid bkey on insert from %s -> %ps",
+ trans->fn, (void *) i->ip_allocated);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ prt_newline(&buf);
+
+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, rw, &buf);
+
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ printbuf_exit(&buf);
return -EINVAL;
}
btree_insert_entry_checks(trans, i);
}
- trans_for_each_update(trans, i) {
- struct bkey u;
+ printbuf_exit(&buf);
- /*
- * peek_slot() doesn't yet work on iterators that point to
- * interior nodes:
- */
- if (i->cached || i->level)
+ trans_for_each_update(trans, i) {
+ if (i->cached)
continue;
- old = bch2_btree_path_peek_slot(i->path, &u);
- ret = bkey_err(old);
- if (unlikely(ret))
- return ret;
-
u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
- u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+ u64s_delta -= i->old_btree_u64s;
if (!same_leaf_as_next(trans, i)) {
if (u64s_delta <= 0) {
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
JOURNAL_RES_GET_NONBLOCK|
- ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
- ? JOURNAL_RES_GET_RESERVED : 0));
+ (trans->flags & JOURNAL_WATERMARK_MASK));
if (unlikely(ret == -EAGAIN))
ret = bch2_trans_journal_preres_get_cold(trans,
trans->journal_preres_u64s, trace_ip);
if (unlikely(ret))
return ret;
- normalize_read_intent_locks(trans);
-
ret = trans_lock_write(trans);
if (unlikely(ret))
return ret;
switch (ret) {
case BTREE_INSERT_BTREE_NODE_FULL:
ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
- if (!ret)
- return 0;
-
- if (ret == -EINTR)
- trace_trans_restart_btree_node_split(trans->fn, trace_ip,
- i->btree_id, &i->path->pos);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
break;
case BTREE_INSERT_NEED_MARK_REPLICAS:
bch2_trans_unlock(trans);
if (ret)
break;
- if (bch2_trans_relock(trans))
- return 0;
-
- trace_trans_restart_mark_replicas(trans->fn, trace_ip);
- ret = -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip);
break;
case BTREE_INSERT_NEED_JOURNAL_RES:
bch2_trans_unlock(trans);
if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
- !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) {
- trans->restarted = true;
- ret = -EAGAIN;
+ !(trans->flags & JOURNAL_WATERMARK_reserved)) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
if (ret)
break;
- if (bch2_trans_relock(trans))
- return 0;
-
- trace_trans_restart_journal_res_get(trans->fn, trace_ip);
- ret = -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip);
break;
case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
bch2_trans_unlock(trans);
- trace_trans_blocked_journal_reclaim(trans->fn, trace_ip);
+ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c)));
if (ret < 0)
break;
- if (bch2_trans_relock(trans))
- return 0;
-
- trace_trans_restart_journal_reclaim(trans->fn, trace_ip);
- ret = -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
break;
default:
BUG_ON(ret >= 0);
break;
}
- BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted);
- BUG_ON(ret == -ENOSPC &&
- !(trans->flags & BTREE_INSERT_NOWAIT) &&
- (trans->flags & BTREE_INSERT_NOFAIL));
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
+ !(trans->flags & BTREE_INSERT_NOWAIT) &&
+ (trans->flags & BTREE_INSERT_NOFAIL), c,
+ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
return ret;
}
bch2_trans_unlock(trans);
- ret = bch2_fs_read_write_early(c);
+ ret = bch2_fs_read_write_early(c) ?:
+ bch2_trans_relock(trans);
if (ret)
return ret;
- if (!bch2_trans_relock(trans))
- return -EINTR;
-
percpu_ref_get(&c->writes);
return 0;
}
-static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
- bool overwrite)
-{
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
- struct bkey_s_c old;
- struct bkey unpacked;
- int ret = 0;
-
- if ((i->flags & BTREE_TRIGGER_NORUN) ||
- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
- return 0;
-
- if (!overwrite) {
- if (i->insert_trigger_run)
- return 0;
-
- BUG_ON(i->overwrite_trigger_run);
- i->insert_trigger_run = true;
- } else {
- if (i->overwrite_trigger_run)
- return 0;
-
- BUG_ON(!i->insert_trigger_run);
- i->overwrite_trigger_run = true;
- }
-
- old = bch2_btree_path_peek_slot(i->path, &unpacked);
- _deleted.p = i->path->pos;
-
- if (overwrite) {
- ret = bch2_trans_mark_key(trans, old, deleted,
- BTREE_TRIGGER_OVERWRITE|i->flags);
- } else if (old.k->type == i->k->k.type &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- i->overwrite_trigger_run = true;
- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
- } else {
- ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
- BTREE_TRIGGER_INSERT|i->flags);
- }
-
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->fn, _RET_IP_,
- i->btree_id, &i->path->pos);
- return ret ?: 1;
-}
-
-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- struct btree_insert_entry *btree_id_start)
+/*
+ * This is for updates done in the early part of fsck - btree_gc - before we've
+ * gone RW. We only add the new key to the list of keys for journal replay to
+ * do.
+ */
+static noinline int
+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- bool trans_trigger_run;
- int ret, overwrite;
-
- for (overwrite = 0; overwrite < 2; overwrite++) {
-
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
- i++) {
- ret = run_one_trigger(trans, i, overwrite);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
- }
-
- return 0;
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
- unsigned btree_id = 0;
int ret = 0;
- /*
- *
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being moved
- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
- * they are re-added.
- */
- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
- btree_id_start++;
-
- ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ trans_for_each_update(trans, i) {
+ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
if (ret)
- return ret;
+ break;
}
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-
- return 0;
+ return ret;
}
int __bch2_trans_commit(struct btree_trans *trans)
int ret = 0;
if (!trans->nr_updates &&
- !trans->extra_journal_entry_u64s)
+ !trans->extra_journal_entries.nr)
goto out_reset;
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&c->gc_lock);
- memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
- trans->journal_u64s = trans->extra_journal_entry_u64s;
- trans->journal_preres_u64s = 0;
-
- trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+ ret = bch2_trans_commit_run_triggers(trans);
+ if (ret)
+ goto out_reset;
- if (trans->journal_transaction_names)
- trans->journal_u64s += JSET_ENTRY_LOG_U64s;
+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ ret = do_bch2_trans_commit_to_journal_replay(trans);
+ goto out_reset;
+ }
if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
- unlikely(!percpu_ref_tryget(&c->writes))) {
+ unlikely(!percpu_ref_tryget_live(&c->writes))) {
ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
goto out_reset;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
- /*
- * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
- * from the key cache flush code:
- */
- trans_for_each_update(trans, i)
- if (!i->cached &&
- !(i->flags & BTREE_TRIGGER_NORUN))
- bch2_btree_key_cache_verify_clean(trans,
- i->btree_id, i->k->k.p);
-#endif
+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
- ret = bch2_trans_commit_run_triggers(trans);
- if (ret)
- goto out;
+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
+
+ trans->journal_u64s = trans->extra_journal_entries.nr;
+ trans->journal_preres_u64s = 0;
+
+ /* For journalling transaction name: */
+ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
trans_for_each_update(trans, i) {
BUG_ON(!i->path->should_be_locked);
- if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
- trace_trans_restart_upgrade(trans->fn, _RET_IP_,
- i->btree_id, &i->path->pos);
- ret = btree_trans_restart(trans);
+ ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+ if (unlikely(ret))
goto out;
- }
BUG_ON(!btree_node_intent_locked(i->path, i->level));
+ if (i->key_cache_already_flushed)
+ continue;
+
+ /* we're going to journal the key being updated: */
u64s = jset_u64s(i->k->k.u64s);
if (i->cached &&
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
trans->journal_preres_u64s += u64s;
trans->journal_u64s += u64s;
+
+ /* and we're also going to log the overwrite: */
+ trans->journal_u64s += jset_u64s(i->old_k.u64s);
}
if (trans->extra_journal_res) {
if (ret)
goto err;
+
+ trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&c->writes);
out_reset:
- trans_for_each_update(trans, i)
- bch2_path_put(trans, i->path, true);
-
- trans->extra_journal_res = 0;
- trans->nr_updates = 0;
- trans->hooks = NULL;
- trans->extra_journal_entries = NULL;
- trans->extra_journal_entry_u64s = 0;
+ bch2_trans_reset_updates(trans);
if (trans->fs_usage_deltas) {
trans->fs_usage_deltas->used = 0;
- memset(&trans->fs_usage_deltas->memset_start, 0,
+ memset((void *) trans->fs_usage_deltas +
+ offsetof(struct replicas_delta_list, memset_start), 0,
(void *) &trans->fs_usage_deltas->memset_end -
(void *) &trans->fs_usage_deltas->memset_start);
}
goto retry;
}
-static int check_pos_snapshot_overwritten(struct btree_trans *trans,
+static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans,
enum btree_id id,
struct bpos pos)
{
struct bkey_s_c k;
int ret;
- if (!btree_type_has_snapshots(id))
- return 0;
-
- if (!snapshot_t(c, pos.snapshot)->children[0])
- return 0;
-
bch2_trans_iter_init(trans, &iter, id, pos,
BTREE_ITER_NOT_EXTENTS|
BTREE_ITER_ALL_SNAPSHOTS);
return ret;
}
+static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
+{
+ if (!btree_type_has_snapshots(id) ||
+ pos.snapshot == U32_MAX ||
+ !snapshot_t(trans->c, pos.snapshot)->children[0])
+ return 0;
+
+ return __check_pos_snapshot_overwritten(trans, id, pos);
+}
+
int bch2_trans_update_extent(struct btree_trans *trans,
struct btree_iter *orig_iter,
struct bkey_i *insert,
BTREE_ITER_INTENT|
BTREE_ITER_WITH_UPDATES|
BTREE_ITER_NOT_EXTENTS);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
goto out;
}
next:
- k = bch2_btree_iter_next(&iter);
+ bch2_btree_iter_advance(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
}
static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
- struct bkey_i *k, enum btree_update_flags flags)
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags,
+ unsigned long ip);
+
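+/*
+ * A key cache update for a key that doesn't yet exist in the btree: redirect
+ * the update into the btree itself, so the key cache stays coherent with the
+ * btree (see the comment in bch2_trans_update_by_path_trace()):
+ */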
+static noinline int flush_new_cached_update(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_insert_entry *i,
+ enum btree_update_flags flags,
+ unsigned long ip)
{
+ struct btree_path *btree_path;
+ int ret;
+
+ i->key_cache_already_flushed = true;
+ i->flags |= BTREE_TRIGGER_NORUN;
+
+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT, _THIS_IP_);
+
+ ret = bch2_btree_path_traverse(trans, btree_path, 0);
+ if (ret)
+ goto err;
+
+ btree_path_set_should_be_locked(btree_path);
+ ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip);
+err:
+ bch2_path_put(trans, btree_path, true);
+ return ret;
+}
+
+static int __must_check
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags,
+ unsigned long ip)
+{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i, n;
BUG_ON(!path->should_be_locked);
.cached = path->cached,
.path = path,
.k = k,
- .ip_allocated = _RET_IP_,
+ .ip_allocated = ip,
};
#ifdef CONFIG_BCACHEFS_DEBUG
BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
bch2_path_put(trans, i->path, true);
- *i = n;
- } else
+ i->flags = n.flags;
+ i->cached = n.cached;
+ i->k = n.k;
+ i->path = n.path;
+ i->ip_allocated = n.ip_allocated;
+ } else {
array_insert_item(trans->updates, trans->nr_updates,
i - trans->updates, n);
- __btree_path_get(n.path, true);
+ i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v;
+ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
+
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
+
+ __btree_path_get(i->path, true);
+
+ /*
+ * If a key is present in the key cache, it must also exist in the
+ * btree - this is necessary for cache coherency. When iterating over
+ * a btree that's cached in the key cache, the btree iter code checks
+ * the key cache - but the key has to exist in the btree for that to
+ * work:
+ */
+ if (unlikely(path->cached && bkey_deleted(&i->old_k)))
+ return flush_new_cached_update(trans, path, i, flags, ip);
+
return 0;
}
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_);
+}
+
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_update_flags flags)
{
k->k.type = KEY_TYPE_whiteout;
}
+ /*
+ * Ensure that updates to cached btrees go to the key cache:
+ */
if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
!path->cached &&
!path->level &&
_THIS_IP_);
ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL);
+ BTREE_ITER_CACHED);
if (unlikely(ret))
return ret;
ck = (void *) iter->key_cache_path->l[0].b;
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_);
- btree_trans_restart(trans);
- return -EINTR;
+ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}
- iter->key_cache_path->should_be_locked = true;
+ btree_path_set_should_be_locked(iter->key_cache_path);
}
path = iter->key_cache_path;
__bch2_btree_insert(&trans, id, k));
}
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned update_flags)
+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
+ unsigned len, unsigned update_flags)
{
struct bkey_i *k;
bkey_init(&k->k);
k->k.p = iter->pos;
+ bch2_key_resize(&k->k, len);
return bch2_trans_update(trans, iter, k, update_flags);
}
+int bch2_btree_delete_at(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned update_flags)
+{
+ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
+}
+
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end,
- unsigned iter_flags,
+ unsigned update_flags,
u64 *journal_seq)
{
+ u32 restart_count = trans->restart_count;
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
-retry:
- while ((bch2_trans_begin(trans),
- (k = bch2_btree_iter_peek(&iter)).k) &&
- !(ret = bkey_err(k)) &&
- bkey_cmp(iter.pos, end) < 0) {
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+ while ((k = bch2_btree_iter_peek(&iter)).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
struct bkey_i delete;
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (bkey_cmp(iter.pos, end) >= 0)
+ break;
+
bkey_init(&delete.k);
/*
ret = bch2_extent_trim_atomic(trans, &iter, &delete);
if (ret)
- break;
+ goto err;
}
- ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
bch2_trans_commit(trans, &disk_res, journal_seq,
- BTREE_INSERT_NOFAIL);
+ BTREE_INSERT_NOFAIL);
bch2_disk_reservation_put(trans->c, &disk_res);
+err:
+ /*
+ * the bch2_trans_begin() call is in a weird place because we
+ * need to call it after every transaction commit, to avoid path
+ * overflow, but don't want to call it if the delete operation
+ * is a no-op and we have no work to do:
+ */
+ bch2_trans_begin(trans);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
if (ret)
break;
}
-
- if (ret == -EINTR) {
- ret = 0;
- goto retry;
- }
-
bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && trans_was_restarted(trans, restart_count))
+ ret = -BCH_ERR_transaction_restart_nested;
return ret;
}
*/
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start, struct bpos end,
- unsigned iter_flags,
+ unsigned update_flags,
u64 *journal_seq)
{
- return bch2_trans_do(c, NULL, journal_seq, 0,
- bch2_btree_delete_range_trans(&trans, id, start, end,
- iter_flags, journal_seq));
+ int ret = bch2_trans_run(c,
+ bch2_btree_delete_range_trans(&trans, id, start, end,
+ update_flags, journal_seq));
+ if (ret == -BCH_ERR_transaction_restart_nested)
+ ret = 0;
+ return ret;
+}
+
+int bch2_trans_log_msg(struct btree_trans *trans, const char *msg)
+{
+ unsigned len = strlen(msg);
+ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64));
+ struct jset_entry_log *l;
+ int ret;
+
+ ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s));
+ if (ret)
+ return ret;
+
+ l = (void *) &darray_top(trans->extra_journal_entries);
+ l->entry.u64s = cpu_to_le16(u64s);
+ l->entry.btree_id = 0;
+ l->entry.level = 1;
+ l->entry.type = BCH_JSET_ENTRY_log;
+ l->entry.pad[0] = 0;
+ l->entry.pad[1] = 0;
+ l->entry.pad[2] = 0;
+ memcpy(l->d, msg, len);
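+ /* zero-pad the message out to a multiple of 8 bytes (one u64): */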
+ while (len & 7)
+ l->d[len++] = '\0';
+
+ trans->extra_journal_entries.nr += jset_u64s(u64s);
+ return 0;
}
#include "bcachefs.h"
#include "alloc_background.h"
+#include "backpointers.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
: ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}
-struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
+void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
struct bch_fs *c = ca->fs;
- struct bch_dev_usage ret;
unsigned seq, i, u64s = dev_usage_u64s();
do {
seq = read_seqcount_begin(&c->usage_lock);
- memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+ memcpy(usage, ca->usage_base, u64s * sizeof(u64));
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
- acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
-
- return ret;
}
static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
{
unsigned i;
- pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
+ prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
- pr_buf(out, "hidden:\t\t\t\t%llu\n",
+ prt_printf(out, "hidden:\t\t\t\t%llu\n",
fs_usage->u.hidden);
- pr_buf(out, "data:\t\t\t\t%llu\n",
+ prt_printf(out, "data:\t\t\t\t%llu\n",
fs_usage->u.data);
- pr_buf(out, "cached:\t\t\t\t%llu\n",
+ prt_printf(out, "cached:\t\t\t\t%llu\n",
fs_usage->u.cached);
- pr_buf(out, "reserved:\t\t\t%llu\n",
+ prt_printf(out, "reserved:\t\t\t%llu\n",
fs_usage->u.reserved);
- pr_buf(out, "nr_inodes:\t\t\t%llu\n",
+ prt_printf(out, "nr_inodes:\t\t\t%llu\n",
fs_usage->u.nr_inodes);
- pr_buf(out, "online reserved:\t\t%llu\n",
+ prt_printf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
for (i = 0;
i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
i++) {
- pr_buf(out, "%u replicas:\n", i + 1);
- pr_buf(out, "\treserved:\t\t%llu\n",
+ prt_printf(out, "%u replicas:\n", i + 1);
+ prt_printf(out, "\treserved:\t\t%llu\n",
fs_usage->u.persistent_reserved[i]);
}
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- pr_buf(out, "\t");
+ prt_printf(out, "\t");
bch2_replicas_entry_to_text(out, e);
- pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
+ prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
}
}
return ret;
}
-static inline int is_unavailable_bucket(struct bucket_mark m)
+void bch2_dev_usage_init(struct bch_dev *ca)
{
- return !is_available_bucket(m);
+ ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
}
static inline int bucket_sectors_fragmented(struct bch_dev *ca,
- struct bucket_mark m)
+ struct bch_alloc_v4 a)
{
- return m.dirty_sectors
- ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors)
+ return a.dirty_sectors
+ ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
: 0;
}
-static inline int is_stripe_data_bucket(struct bucket_mark m)
-{
- return m.stripe && m.data_type != BCH_DATA_parity;
-}
-
-static inline enum bch_data_type bucket_type(struct bucket_mark m)
-{
- return m.cached_sectors && !m.dirty_sectors
- ? BCH_DATA_cached
- : m.data_type;
-}
-
-static inline void account_bucket(struct bch_fs_usage *fs_usage,
- struct bch_dev_usage *dev_usage,
- enum bch_data_type type,
- int nr, s64 size)
-{
- if (type == BCH_DATA_sb || type == BCH_DATA_journal)
- fs_usage->hidden += size;
-
- dev_usage->d[type].buckets += nr;
-}
-
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- struct bucket_mark old, struct bucket_mark new,
+ struct bch_alloc_v4 old,
+ struct bch_alloc_v4 new,
u64 journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- u = dev_usage_ptr(ca, journal_seq, gc);
- if (bucket_type(old))
- account_bucket(fs_usage, u, bucket_type(old),
- -1, -ca->mi.bucket_size);
+ if (data_type_is_hidden(old.data_type))
+ fs_usage->hidden -= ca->mi.bucket_size;
+ if (data_type_is_hidden(new.data_type))
+ fs_usage->hidden += ca->mi.bucket_size;
- if (bucket_type(new))
- account_bucket(fs_usage, u, bucket_type(new),
- 1, ca->mi.bucket_size);
+ u = dev_usage_ptr(ca, journal_seq, gc);
- u->buckets_ec += (int) new.stripe - (int) old.stripe;
- u->buckets_unavailable +=
- is_unavailable_bucket(new) - is_unavailable_bucket(old);
+ u->d[old.data_type].buckets--;
+ u->d[new.data_type].buckets++;
+
+ u->buckets_ec -= (int) !!old.stripe;
+ u->buckets_ec += (int) !!new.stripe;
u->d[old.data_type].sectors -= old.dirty_sectors;
u->d[new.data_type].sectors += new.dirty_sectors;
- u->d[BCH_DATA_cached].sectors +=
- (int) new.cached_sectors - (int) old.cached_sectors;
+
+ u->d[BCH_DATA_cached].sectors += new.cached_sectors;
+ u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
preempt_enable();
+}
+
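+/*
+ * Version of bch2_dev_usage_update() that takes the in-memory struct bucket
+ * (used by gc), converting it to a struct bch_alloc_v4 so both paths share
+ * the same accounting code:
+ */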
+static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket old, struct bucket new,
+ u64 journal_seq, bool gc)
+{
+ struct bch_alloc_v4 old_a = {
+ .gen = old.gen,
+ .data_type = old.data_type,
+ .dirty_sectors = old.dirty_sectors,
+ .cached_sectors = old.cached_sectors,
+ .stripe = old.stripe,
+ };
+ struct bch_alloc_v4 new_a = {
+ .gen = new.gen,
+ .data_type = new.data_type,
+ .dirty_sectors = new.dirty_sectors,
+ .cached_sectors = new.cached_sectors,
+ .stripe = new.stripe,
+ };
- if (!is_available_bucket(old) && is_available_bucket(new))
- bch2_wake_allocator(ca);
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
}
static inline int __update_replicas(struct bch_fs *c,
{
struct bch_fs_usage __percpu *fs_usage;
int idx, ret = 0;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
percpu_down_read(&c->mark_lock);
+ buf.atomic++;
idx = bch2_replicas_entry_idx(c, r);
if (idx < 0 &&
- (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err(c, "no replicas entry\n"
- " while marking %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) {
+ fsck_err(c, "no replicas entry\n"
+ " while marking %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
percpu_up_read(&c->mark_lock);
ret = bch2_mark_replicas(c, r);
- if (ret)
- return ret;
-
percpu_down_read(&c->mark_lock);
+
+ if (ret)
+ goto err;
idx = bch2_replicas_entry_idx(c, r);
}
if (idx < 0) {
err:
fsck_err:
percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
return ret;
}
n = (void *) d->d + d->used;
n->delta = sectors;
- memcpy(&n->r, r, replicas_entry_bytes(r));
+ memcpy((void *) n + offsetof(struct replicas_delta, r),
+ r, replicas_entry_bytes(r));
bch2_replicas_entry_sort(&n->r);
d->used += b;
}
update_replicas_list(trans, &r.e, sectors);
}
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator)
-{
- struct bucket *g = bucket(ca, b);
- struct bucket_mark old, new;
-
- old = bucket_cmpxchg(g, new, ({
- new.owned_by_allocator = owned_by_allocator;
- }));
-
- BUG_ON(owned_by_allocator == old.owned_by_allocator);
-}
-
-static int bch2_mark_alloc(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_alloc(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old);
- struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new);
+ struct bch_alloc_v4 old_a, new_a;
struct bch_dev *ca;
- struct bucket *g;
- struct bucket_mark old_m, m;
int ret = 0;
/*
!(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
return 0;
+ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
+ "alloc key for invalid device or bucket"))
+ return -EIO;
+
+ ca = bch_dev_bkey_exists(c, new.k->p.inode);
+
+ bch2_alloc_to_v4(old, &old_a);
+ bch2_alloc_to_v4(new, &new_a);
+
if ((flags & BTREE_TRIGGER_INSERT) &&
- !old_u.data_type != !new_u.data_type &&
- new.k->type == KEY_TYPE_alloc_v3) {
- struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
- u64 old_journal_seq = le64_to_cpu(v->journal_seq);
+ data_type_is_empty(old_a.data_type) !=
+ data_type_is_empty(new_a.data_type) &&
+ new.k->type == KEY_TYPE_alloc_v4) {
+ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
BUG_ON(!journal_seq);
* before the bucket became empty again, then we don't have
* to wait on a journal flush before we can reuse the bucket:
*/
- new_u.journal_seq = !new_u.data_type &&
- (journal_seq == old_journal_seq ||
- bch2_journal_noflush_seq(&c->journal, old_journal_seq))
+ new_a.journal_seq = data_type_is_empty(new_a.data_type) &&
+ (journal_seq == v->journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, v->journal_seq))
? 0 : journal_seq;
- v->journal_seq = cpu_to_le64(new_u.journal_seq);
+ v->journal_seq = new_a.journal_seq;
}
- if (old_u.data_type && !new_u.data_type && new_u.journal_seq) {
+ if (!data_type_is_empty(old_a.data_type) &&
+ data_type_is_empty(new_a.data_type) &&
+ new_a.journal_seq) {
ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk,
- new_u.dev, new_u.bucket,
- new_u.journal_seq);
+ new.k->p.inode, new.k->p.offset,
+ new_a.journal_seq);
if (ret) {
bch2_fs_fatal_error(c,
"error setting bucket_needs_journal_commit: %i", ret);
}
}
- ca = bch_dev_bkey_exists(c, new_u.dev);
+ percpu_down_read(&c->mark_lock);
+ if (!gc && new_a.gen != old_a.gen)
+ *bucket_gen(ca, new.k->p.offset) = new_a.gen;
- if (new_u.bucket >= ca->mi.nbuckets)
- return 0;
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
- percpu_down_read(&c->mark_lock);
- if (!gc && new_u.gen != old_u.gen)
- *bucket_gen(ca, new_u.bucket) = new_u.gen;
-
- g = __bucket(ca, new_u.bucket, gc);
-
- old_m = bucket_cmpxchg(g, m, ({
- m.gen = new_u.gen;
- m.data_type = new_u.data_type;
- m.dirty_sectors = new_u.dirty_sectors;
- m.cached_sectors = new_u.cached_sectors;
- m.stripe = new_u.stripe != 0;
- }));
-
- bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
-
- g->io_time[READ] = new_u.read_time;
- g->io_time[WRITE] = new_u.write_time;
- g->oldest_gen = new_u.oldest_gen;
- g->gen_valid = 1;
- g->stripe = new_u.stripe;
- g->stripe_redundancy = new_u.stripe_redundancy;
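+ /* if called from gc, also update the in-memory gc bucket to match: */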
+ if (gc) {
+ struct bucket *g = gc_bucket(ca, new.k->p.offset);
+
+ bucket_lock(g);
+
+ g->gen_valid = 1;
+ g->gen = new_a.gen;
+ g->data_type = new_a.data_type;
+ g->stripe = new_a.stripe;
+ g->stripe_redundancy = new_a.stripe_redundancy;
+ g->dirty_sectors = new_a.dirty_sectors;
+ g->cached_sectors = new_a.cached_sectors;
+
+ bucket_unlock(g);
+ }
percpu_up_read(&c->mark_lock);
/*
*/
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
- old_m.cached_sectors) {
+ old_a.cached_sectors) {
ret = update_cached_sectors(c, new, ca->dev_idx,
- -old_m.cached_sectors,
+ -((s64) old_a.cached_sectors),
journal_seq, gc);
if (ret) {
bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
return ret;
}
-
- trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
- old_m.cached_sectors);
}
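+ /*
+ * Kick off any background work this state change may have unblocked: wake
+ * allocator freelist waiters, and schedule discards, invalidates, or gc of
+ * bucket gens as needed:
+ */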
+ if (new_a.data_type == BCH_DATA_free &&
+ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
+
+ if (new_a.data_type == BCH_DATA_need_discard &&
+ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk))
+ bch2_do_discards(c);
+
+ if (old_a.data_type != BCH_DATA_cached &&
+ new_a.data_type == BCH_DATA_cached &&
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+ bch2_do_invalidates(c);
+
+ if (new_a.data_type == BCH_DATA_need_gc_gens)
+ bch2_do_gc_gens(c);
+
return 0;
}
-#define checked_add(a, b) \
-({ \
- unsigned _res = (unsigned) (a) + (b); \
- bool overflow = _res > U16_MAX; \
- if (overflow) \
- _res = U16_MAX; \
- (a) = _res; \
- overflow; \
-})
-
-void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type data_type,
- unsigned sectors, struct gc_pos pos,
- unsigned flags)
+int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, enum bch_data_type data_type,
+ unsigned sectors, struct gc_pos pos,
+ unsigned flags)
{
- struct bucket *g;
- struct bucket_mark old, new;
- bool overflow;
+ struct bucket old, new, *g;
+ int ret = 0;
BUG_ON(!(flags & BTREE_TRIGGER_GC));
BUG_ON(data_type != BCH_DATA_sb &&
* Backup superblock might be past the end of our normal usable space:
*/
if (b >= ca->mi.nbuckets)
- return;
+ return 0;
percpu_down_read(&c->mark_lock);
g = gc_bucket(ca, b);
- old = bucket_cmpxchg(g, new, ({
- new.data_type = data_type;
- overflow = checked_add(new.dirty_sectors, sectors);
- }));
-
- bch2_fs_inconsistent_on(old.data_type &&
- old.data_type != data_type, c,
- "different types of data in same bucket: %s, %s",
- bch2_data_types[old.data_type],
- bch2_data_types[data_type]);
-
- bch2_fs_inconsistent_on(overflow, c,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX",
- ca->dev_idx, b, new.gen,
- bch2_data_types[old.data_type ?: data_type],
- old.dirty_sectors, sectors);
-
- bch2_dev_usage_update(c, ca, old, new, 0, true);
- percpu_up_read(&c->mark_lock);
-}
-static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
- EBUG_ON(sectors < 0);
+ bucket_lock(g);
+ old = *g;
+
+ if (bch2_fs_inconsistent_on(g->data_type &&
+ g->data_type != data_type, c,
+ "different types of data in same bucket: %s, %s",
+ bch2_data_types[g->data_type],
+ bch2_data_types[data_type])) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
+ ca->dev_idx, b, g->gen,
+ bch2_data_types[g->data_type ?: data_type],
+ g->dirty_sectors, sectors)) {
+ ret = -EIO;
+ goto err;
+ }
+
- return p.crc.compression_type &&
- p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible
- ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
- p.crc.uncompressed_size)
- : sectors;
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+ new = *g;
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, 0, true);
+ percpu_up_read(&c->mark_lock);
+ return ret;
}
static int check_bucket_ref(struct bch_fs *c,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
u8 b_gen, u8 bucket_data_type,
- u16 dirty_sectors, u16 cached_sectors)
+ u32 dirty_sectors, u32 cached_sectors)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
u16 bucket_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (bucket_data_type == BCH_DATA_cached)
+ bucket_data_type = BCH_DATA_user;
+
+ if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
+ (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe))
+ bucket_data_type = ptr_data_type = BCH_DATA_stripe;
if (gen_after(ptr->gen, b_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
if (b_gen != ptr->gen && !ptr->cached) {
*bucket_gen(ca, bucket_nr),
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if (b_gen != ptr->gen)
- return 1;
+ if (b_gen != ptr->gen) {
+ ret = 1;
+ goto err;
+ }
- if (bucket_data_type && ptr_data_type &&
+ if (!data_type_is_empty(bucket_data_type) &&
+ ptr_data_type &&
bucket_data_type != ptr_data_type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type],
bch2_data_types[ptr_data_type],
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
+ if ((unsigned) (bucket_sectors + sectors) > U32_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
bucket_sectors, sectors,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
-
- return 0;
+err:
+ printbuf_exit(&buf);
+ return ret;
}
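
The stack `char buf[200]` / `PBUF()` buffers are replaced throughout by printbufs, which allocate on demand and must be freed explicitly. A minimal sketch of the new lifecycle as used above (the key and filesystem variables are placeholders); the GC paths additionally bump buf.atomic before appending (as in mark_stripe_bucket below), presumably so printbuf allocations don't block while mark_lock is held:

        struct printbuf buf = PRINTBUF;         /* empty; allocates on first append */

        bch2_bkey_val_to_text(&buf, c, k);      /* append a formatted bkey */
        pr_err("bad key %s", buf.buf);          /* buf.buf is the NUL-terminated string */

        printbuf_reset(&buf);                   /* reuse the allocation for the next message */
        bch2_bkey_val_to_text(&buf, c, k2);
        pr_err("bad key %s", buf.buf);

        printbuf_exit(&buf);                    /* mandatory: frees the heap buffer */
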
static int mark_stripe_bucket(struct btree_trans *trans,
s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g;
- struct bucket_mark new, old;
- char buf[200];
+ struct bucket old, new, *g;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
BUG_ON(!(flags & BTREE_TRIGGER_GC));
/* XXX doesn't handle deletion */
percpu_down_read(&c->mark_lock);
+ buf.atomic++;
g = PTR_GC_BUCKET(ca, ptr);
- if (g->mark.dirty_sectors ||
+ if (g->dirty_sectors ||
(g->stripe && g->stripe != k.k->p.offset)) {
bch2_fs_inconsistent(c,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EINVAL;
goto err;
}
- old = bucket_cmpxchg(g, new, ({
- ret = check_bucket_ref(c, k, ptr, sectors, data_type,
- new.gen, new.data_type,
- new.dirty_sectors, new.cached_sectors);
- if (ret)
- goto err;
+ bucket_lock(g);
+ old = *g;
- new.dirty_sectors += sectors;
- if (data_type)
- new.data_type = data_type;
+ ret = check_bucket_ref(c, k, ptr, sectors, data_type,
+ g->gen, g->data_type,
+ g->dirty_sectors, g->cached_sectors);
+ if (ret)
+ goto err;
- new.stripe = true;
- }));
+ if (data_type)
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
-
- bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+ new = *g;
err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
percpu_up_read(&c->mark_lock);
-
- return 0;
+ printbuf_exit(&buf);
+ return ret;
}
static int __mark_pointer(struct btree_trans *trans,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
u8 bucket_gen, u8 *bucket_data_type,
- u16 *dirty_sectors, u16 *cached_sectors)
+ u32 *dirty_sectors, u32 *cached_sectors)
{
- u16 *dst_sectors = !ptr->cached
+ u32 *dst_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type,
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g;
+ struct bucket old, new, *g;
u8 bucket_data_type;
- u64 v;
int ret = 0;
BUG_ON(!(flags & BTREE_TRIGGER_GC));
percpu_down_read(&c->mark_lock);
g = PTR_GC_BUCKET(ca, &p.ptr);
-
- v = atomic64_read(&g->_mark.v);
- do {
- new.v.counter = old.v.counter = v;
- bucket_data_type = new.data_type;
-
- ret = __mark_pointer(trans, k, &p.ptr, sectors,
- data_type, new.gen,
- &bucket_data_type,
- &new.dirty_sectors,
- &new.cached_sectors);
- if (ret)
- goto err;
-
- new.data_type = bucket_data_type;
-
- if (flags & BTREE_TRIGGER_NOATOMIC) {
- g->_mark = new;
- break;
- }
- } while ((v = atomic64_cmpxchg(&g->_mark.v,
- old.v.counter,
- new.v.counter)) != old.v.counter);
-
- bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
-err:
+ bucket_lock(g);
+ old = *g;
+
+ bucket_data_type = g->data_type;
+ ret = __mark_pointer(trans, k, &p.ptr, sectors,
+ data_type, g->gen,
+ &bucket_data_type,
+ &g->dirty_sectors,
+ &g->cached_sectors);
+ if (!ret)
+ g->data_type = bucket_data_type;
+
+ new = *g;
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
percpu_up_read(&c->mark_lock);
return ret;
return 0;
}
-static int bch2_mark_extent(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_extent(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
if (r.e.nr_devs) {
ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
if (ret) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
return ret;
}
}
return 0;
}
-static int bch2_mark_stripe(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_stripe(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
struct stripe *m = genradix_ptr(&c->stripes, idx);
if (!m || (old_s && !m->alive)) {
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf1), c, old);
- bch2_bkey_val_to_text(&PBUF(buf2), c, new);
+ bch2_bkey_val_to_text(&buf1, c, old);
+ bch2_bkey_val_to_text(&buf2, c, new);
bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
"old %s\n"
- "new %s", idx, buf1, buf2);
+ "new %s", idx, buf1.buf, buf2.buf);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
bch2_inconsistent_error(c);
return -1;
}
((s64) m->sectors * m->nr_redundant),
journal_seq, gc);
if (ret) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+ bch2_bkey_val_to_text(&buf, c, new);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
return ret;
}
}
return 0;
}
-static int bch2_mark_inode(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_inode(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct bch_fs_usage __percpu *fs_usage;
u64 journal_seq = trans->journal_res.seq;
if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v;
+ struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
BUG_ON(!journal_seq);
- BUG_ON(new.k->type != KEY_TYPE_inode_v2);
+ BUG_ON(new.k->type != KEY_TYPE_inode_v3);
v->bi_journal_seq = cpu_to_le64(journal_seq);
}
return 0;
}
-static int bch2_mark_reservation(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_reservation(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bch_fs_usage __percpu *fs_usage;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
return 0;
}
-static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
+static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 start, u64 end,
u64 *idx, unsigned flags, size_t r_idx)
{
+ struct bch_fs *c = trans->c;
struct reflink_gc *r;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ u64 next_idx = end;
s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
if (r_idx >= c->reflink_gc_nr)
goto not_found;
r = genradix_ptr(&c->reflink_gc_table, r_idx);
- if (*idx < r->offset - r->size)
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
goto not_found;
BUG_ON((s64) r->refcount + add < 0);
*idx = r->offset;
return 0;
not_found:
- *idx = U64_MAX;
- ret = -EIO;
-
- /*
- * XXX: we're replacing the entire reflink pointer with an error
- * key, we should just be replacing the part that was missing:
- */
- if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
- p.k->p.inode, p.k->p.offset, p.k->size, *idx)) {
+ if (fsck_err(c, "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ *idx, next_idx)) {
struct bkey_i_error new;
bkey_init(&new.k);
new.k.type = KEY_TYPE_error;
- new.k.p = p.k->p;
- new.k.size = p.k->size;
- ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i);
+ new.k.p = bkey_start_pos(p.k);
+ new.k.p.offset += *idx - start;
+ bch2_key_resize(&new.k, next_idx - *idx);
+ ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i);
}
+
+ *idx = next_idx;
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
- u64 idx = le64_to_cpu(p.v->idx);
+ u64 idx = le64_to_cpu(p.v->idx), start = idx;
u64 end = le64_to_cpu(p.v->idx) + p.k->size;
int ret = 0;
}
while (idx < end && !ret)
- ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++);
-
- return ret;
-}
-
-int bch2_mark_key(struct btree_trans *trans,
- struct bkey_s_c old,
- struct bkey_s_c new,
- unsigned flags)
-{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
-
- switch (k.k->type) {
- case KEY_TYPE_alloc:
- case KEY_TYPE_alloc_v2:
- case KEY_TYPE_alloc_v3:
- return bch2_mark_alloc(trans, old, new, flags);
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return bch2_mark_extent(trans, old, new, flags);
- case KEY_TYPE_stripe:
- return bch2_mark_stripe(trans, old, new, flags);
- case KEY_TYPE_inode:
- case KEY_TYPE_inode_v2:
- return bch2_mark_inode(trans, old, new, flags);
- case KEY_TYPE_reservation:
- return bch2_mark_reservation(trans, old, new, flags);
- case KEY_TYPE_reflink_p:
- return bch2_mark_reflink_p(trans, old, new, flags);
- case KEY_TYPE_snapshot:
- return bch2_mark_snapshot(trans, old, new, flags);
- default:
- return 0;
- }
-}
-
-int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
- struct bkey_i *new, unsigned flags)
-{
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
- struct bkey_s_c old;
- struct bkey unpacked;
- int ret;
-
- _deleted.p = path->pos;
-
- if (unlikely(flags & BTREE_TRIGGER_NORUN))
- return 0;
-
- if (!btree_node_type_needs_gc(path->btree_id))
- return 0;
-
- old = bch2_btree_path_peek_slot(path, &unpacked);
-
- if (old.k->type == new->k.type &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
- } else {
- ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|flags) ?:
- bch2_mark_key(trans, old, deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
- }
+ ret = __bch2_mark_reflink_p(trans, p, start, end,
+ &idx, flags, l++);
return ret;
}
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
bch_err(c, "disk usage increased %lli more than %u sectors reserved",
should_not_have_added, disk_res_sectors);
trans_for_each_update(trans, i) {
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+
pr_err("while inserting");
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- pr_err("%s", buf);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ pr_err(" %s", buf.buf);
pr_err("overlapping with");
-
- if (!i->cached) {
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
-
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- pr_err("%s", buf);
- } else {
- struct bkey_cached *ck = (void *) i->path->l[0].b;
-
- if (ck->valid) {
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
- pr_err("%s", buf);
- }
- }
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, old);
+ pr_err(" %s", buf.buf);
}
+
__WARN();
+ printbuf_exit(&buf);
}
int bch2_trans_fs_usage_apply(struct btree_trans *trans,
/* trans_mark: */
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
- const struct bch_extent_ptr *ptr,
- struct bkey_alloc_unpacked *u)
-{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_iter_init(trans, iter, BTREE_ID_alloc,
- POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)),
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ret;
- }
-
- *u = bch2_alloc_unpack(k);
- return 0;
-}
-
static int bch2_trans_mark_pointer(struct btree_trans *trans,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- s64 sectors, enum bch_data_type data_type)
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ unsigned flags)
{
+ bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
struct btree_iter iter;
- struct bkey_alloc_unpacked u;
+ struct bkey_i_alloc_v4 *a;
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+ s64 sectors;
int ret;
- ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
- if (ret)
- return ret;
+ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp);
+ sectors = bp.bucket_len;
+ if (!insert)
+ sectors = -sectors;
- ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type,
- u.gen, &u.data_type,
- &u.dirty_sectors, &u.cached_sectors);
- if (ret)
- goto out;
+ a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
- ret = bch2_alloc_write(trans, &iter, &u, 0);
+ ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
+ a->v.gen, &a->v.data_type,
+ &a->v.dirty_sectors, &a->v.cached_sectors);
if (ret)
- goto out;
-out:
+ goto err;
+
+ if (!p.ptr.cached) {
+ ret = insert
+ ? bch2_bucket_backpointer_add(trans, a, bp, k)
+ : bch2_bucket_backpointer_del(trans, a, bp, k);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_stripe *s;
goto err;
if (k.k->type != KEY_TYPE_stripe) {
- bch2_fs_inconsistent(c,
+ bch2_trans_inconsistent(trans,
"pointer to nonexistent stripe %llu",
(u64) p.ec.idx);
- bch2_inconsistent_error(c);
ret = -EIO;
goto err;
}
if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
- bch2_fs_inconsistent(c,
+ bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
ret = -EIO;
return ret;
}
-static int bch2_trans_mark_extent(struct btree_trans *trans,
- struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+ ? old
+ : bkey_i_to_s_c(new);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
if (flags & BTREE_TRIGGER_OVERWRITE)
disk_sectors = -disk_sectors;
- ret = bch2_trans_mark_pointer(trans, k, p,
- disk_sectors, data_type);
+ ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
if (ret < 0)
return ret;
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
struct btree_iter iter;
- struct bkey_alloc_unpacked u;
+ struct bkey_i_alloc_v4 *a;
enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
? BCH_DATA_parity : 0;
s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
if (deleting)
sectors = -sectors;
- ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type,
- u.gen, u.data_type,
- u.dirty_sectors, u.cached_sectors);
+ a->v.gen, a->v.data_type,
+ a->v.dirty_sectors, a->v.cached_sectors);
if (ret)
goto err;
if (!deleting) {
- if (bch2_fs_inconsistent_on(u.stripe ||
- u.stripe_redundancy, c,
+ if (bch2_trans_inconsistent_on(a->v.stripe ||
+ a->v.stripe_redundancy, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
- iter.pos.inode, iter.pos.offset, u.gen,
- bch2_data_types[u.data_type],
- u.dirty_sectors,
- u.stripe, s.k->p.offset)) {
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
+ a->v.stripe, s.k->p.offset)) {
ret = -EIO;
goto err;
}
- if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c,
+ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
- iter.pos.inode, iter.pos.offset, u.gen,
- bch2_data_types[u.data_type],
- u.dirty_sectors,
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
s.k->p.offset)) {
ret = -EIO;
goto err;
}
- u.stripe = s.k->p.offset;
- u.stripe_redundancy = s.v->nr_redundant;
+ a->v.stripe = s.k->p.offset;
+ a->v.stripe_redundancy = s.v->nr_redundant;
} else {
- if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset ||
- u.stripe_redundancy != s.v->nr_redundant, c,
+ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+ a->v.stripe_redundancy != s.v->nr_redundant, trans,
"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
- iter.pos.inode, iter.pos.offset, u.gen,
- s.k->p.offset, u.stripe)) {
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ s.k->p.offset, a->v.stripe)) {
ret = -EIO;
goto err;
}
- u.stripe = 0;
- u.stripe_redundancy = 0;
+ a->v.stripe = 0;
+ a->v.stripe_redundancy = 0;
}
- u.dirty_sectors += sectors;
+ a->v.dirty_sectors += sectors;
if (data_type)
- u.data_type = !deleting ? data_type : 0;
+ a->v.data_type = !deleting ? data_type : 0;
- ret = bch2_alloc_write(trans, &iter, &u, 0);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto err;
err:
return ret;
}
-static int bch2_trans_mark_stripe(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_trans_mark_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
{
- struct bkey_s_c_stripe old_s = { .k = NULL };
- struct bkey_s_c_stripe new_s = { .k = NULL };
+ const struct bch_stripe *old_s = NULL;
+ struct bch_stripe *new_s = NULL;
struct bch_replicas_padded r;
unsigned i, nr_blocks;
int ret = 0;
if (old.k->type == KEY_TYPE_stripe)
- old_s = bkey_s_c_to_stripe(old);
- if (new.k->type == KEY_TYPE_stripe)
- new_s = bkey_s_c_to_stripe(new);
+ old_s = bkey_s_c_to_stripe(old).v;
+ if (new->k.type == KEY_TYPE_stripe)
+ new_s = &bkey_i_to_stripe(new)->v;
/*
* If the pointers aren't changing, we don't need to do anything:
*/
- if (new_s.k && old_s.k &&
- new_s.v->nr_blocks == old_s.v->nr_blocks &&
- new_s.v->nr_redundant == old_s.v->nr_redundant &&
- !memcmp(old_s.v->ptrs, new_s.v->ptrs,
- new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
+ if (new_s && old_s &&
+ new_s->nr_blocks == old_s->nr_blocks &&
+ new_s->nr_redundant == old_s->nr_redundant &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
- BUG_ON(new_s.k && old_s.k &&
- (new_s.v->nr_blocks != old_s.v->nr_blocks ||
- new_s.v->nr_redundant != old_s.v->nr_redundant));
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
- nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks;
+ nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
- if (new_s.k) {
- s64 sectors = le16_to_cpu(new_s.v->sectors);
+ if (new_s) {
+ s64 sectors = le16_to_cpu(new_s->sectors);
- bch2_bkey_to_replicas(&r.e, new);
- update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
+ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
}
- if (old_s.k) {
- s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
+ if (old_s) {
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
bch2_bkey_to_replicas(&r.e, old);
- update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
}
for (i = 0; i < nr_blocks; i++) {
- if (new_s.k && old_s.k &&
- !memcmp(&new_s.v->ptrs[i],
- &old_s.v->ptrs[i],
- sizeof(new_s.v->ptrs[i])))
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
continue;
- if (new_s.k) {
- ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false);
+ if (new_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_i_to_s_c_stripe(new), i, false);
if (ret)
break;
}
- if (old_s.k) {
- ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true);
+ if (old_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true);
if (ret)
break;
}
return ret;
}
-static int bch2_trans_mark_inode(struct btree_trans *trans,
- struct bkey_s_c old,
- struct bkey_s_c new,
- unsigned flags)
+int bch2_trans_mark_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
{
- int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
if (nr) {
struct replicas_delta_list *d =
return 0;
}
-static int bch2_trans_mark_reservation(struct btree_trans *trans,
- struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
{
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+ ? old
+ : bkey_i_to_s_c(new);
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
struct bkey_i *n;
__le64 *refcount;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx),
refcount = bkey_refcount(n);
if (!refcount) {
- bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
- bch2_fs_inconsistent(c,
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
"nonexistent indirect extent at %llu while marking\n %s",
- *idx, buf);
+ *idx, buf.buf);
ret = -EIO;
goto err;
}
if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
- bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
- bch2_fs_inconsistent(c,
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
"indirect extent refcount underflow at %llu while marking\n %s",
- *idx, buf);
+ *idx, buf.buf);
ret = -EIO;
goto err;
}
le64_add_cpu(refcount, add);
- if (!*refcount) {
- n->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&n->k, 0);
- }
-
bch2_btree_iter_set_pos_to_extent_start(&iter);
ret = bch2_trans_update(trans, &iter, n, 0);
if (ret)
*idx = k.k->p.offset;
err:
bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
{
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+ ? old
+ : bkey_i_to_s_c(new);
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
u64 idx, end_idx;
int ret = 0;
return ret;
}
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
- struct bkey_s_c new, unsigned flags)
-{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
-
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return bch2_trans_mark_extent(trans, k, flags);
- case KEY_TYPE_stripe:
- return bch2_trans_mark_stripe(trans, old, new, flags);
- case KEY_TYPE_inode:
- case KEY_TYPE_inode_v2:
- return bch2_trans_mark_inode(trans, old, new, flags);
- case KEY_TYPE_reservation:
- return bch2_trans_mark_reservation(trans, k, flags);
- case KEY_TYPE_reflink_p:
- return bch2_trans_mark_reflink_p(trans, k, flags);
- default:
- return 0;
- }
-}
-
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_alloc_unpacked u;
- struct bch_extent_ptr ptr = {
- .dev = ca->dev_idx,
- .offset = bucket_to_sector(ca, b),
- };
+ struct bkey_i_alloc_v4 *a;
int ret = 0;
/*
if (b >= ca->mi.nbuckets)
return 0;
- ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
- if (u.data_type && u.data_type != type) {
+ if (a->v.data_type && a->v.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
- iter.pos.inode, iter.pos.offset, u.gen,
- bch2_data_types[u.data_type],
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
bch2_data_types[type],
bch2_data_types[type]);
ret = -EIO;
goto out;
}
- u.data_type = type;
- u.dirty_sectors = sectors;
+ a->v.data_type = type;
+ a->v.dirty_sectors = sectors;
- ret = bch2_alloc_write(trans, &iter, &u, 0);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto out;
out:
enum bch_data_type type,
unsigned sectors)
{
- return __bch2_trans_do(trans, NULL, NULL, 0,
+ return commit_do(trans, NULL, NULL, 0,
__bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
}
int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
- return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
- __bch2_trans_mark_dev_sb(&trans, ca));
+ return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca));
}
/* Disk reservations: */
ret = 0;
} else {
atomic64_set(&c->sectors_available, sectors_available);
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_disk_reservation;
}
mutex_unlock(&c->sectors_available_lock);
/* Startup/shutdown: */
-static void buckets_free_rcu(struct rcu_head *rcu)
-{
- struct bucket_array *buckets =
- container_of(rcu, struct bucket_array, rcu);
-
- kvpfree(buckets,
- sizeof(*buckets) +
- buckets->nbuckets * sizeof(struct bucket));
-}
-
static void bucket_gens_free_rcu(struct rcu_head *rcu)
{
struct bucket_gens *buckets =
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
- struct bucket_array *buckets = NULL, *old_buckets = NULL;
struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
unsigned long *buckets_nouse = NULL;
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
- alloc_heap alloc_heap;
-
- size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / btree_sectors(c));
- /* XXX: these should be tunable */
- size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
- size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
- size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
- btree_reserve * 2);
- bool resize = ca->buckets[0] != NULL;
+ bool resize = ca->bucket_gens != NULL;
int ret = -ENOMEM;
- unsigned i;
-
- memset(&free, 0, sizeof(free));
- memset(&free_inc, 0, sizeof(free_inc));
- memset(&alloc_heap, 0, sizeof(alloc_heap));
- if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
- nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO)) ||
- !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
+ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
GFP_KERNEL|__GFP_ZERO)) ||
(c->opts.buckets_nouse &&
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO))) ||
- !init_fifo(&free[RESERVE_MOVINGGC],
- copygc_reserve, GFP_KERNEL) ||
- !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
- !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
- !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+ GFP_KERNEL|__GFP_ZERO))))
goto err;
- buckets->first_bucket = ca->mi.first_bucket;
- buckets->nbuckets = nbuckets;
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
percpu_down_write(&c->mark_lock);
}
- old_buckets = bucket_array(ca);
old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
- size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
+ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
- memcpy(buckets->b,
- old_buckets->b,
- n * sizeof(struct bucket));
memcpy(bucket_gens->b,
old_bucket_gens->b,
n);
BITS_TO_LONGS(n) * sizeof(unsigned long));
}
- rcu_assign_pointer(ca->buckets[0], buckets);
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
- buckets = old_buckets;
bucket_gens = old_bucket_gens;
swap(ca->buckets_nouse, buckets_nouse);
+ nbuckets = ca->mi.nbuckets;
+
if (resize) {
percpu_up_write(&c->mark_lock);
+ up_write(&ca->bucket_lock);
up_write(&c->gc_lock);
}
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- fifo_move(&free[i], &ca->free[i]);
- swap(ca->free[i], free[i]);
- }
- fifo_move(&free_inc, &ca->free_inc);
- swap(ca->free_inc, free_inc);
- spin_unlock(&c->freelist_lock);
-
- /* with gc lock held, alloc_heap can't be in use: */
- swap(ca->alloc_heap, alloc_heap);
-
- nbuckets = ca->mi.nbuckets;
-
- if (resize)
- up_write(&ca->bucket_lock);
-
ret = 0;
err:
- free_heap(&alloc_heap);
- free_fifo(&free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&free[i]);
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
if (bucket_gens)
call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
- if (buckets)
- call_rcu(&buckets->rcu, buckets_free_rcu);
return ret;
}
{
unsigned i;
- free_heap(&ca->alloc_heap);
- free_fifo(&ca->free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&ca->free[i]);
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
sizeof(struct bucket_gens) + ca->mi.nbuckets);
- kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
- sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket));
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
return -ENOMEM;
}
- return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
#define _BUCKETS_H
#include "buckets_types.h"
+#include "extents.h"
#include "super.h"
#define for_each_bucket(_b, _buckets) \
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-#define bucket_cmpxchg(g, new, expr) \
-({ \
- struct bucket *_g = g; \
- u64 _v = atomic64_read(&(g)->_mark.v); \
- struct bucket_mark _old; \
- \
- do { \
- (new).v.counter = _old.v.counter = _v; \
- expr; \
- } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \
- _old.v.counter, \
- (new).v.counter)) != _old.v.counter);\
- _old; \
-})
-
-static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
- bool gc)
+static inline void bucket_unlock(struct bucket *b)
{
- return rcu_dereference_check(ca->buckets[gc],
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
- lockdep_is_held(&ca->bucket_lock));
+ smp_store_release(&b->lock, 0);
}
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline void bucket_lock(struct bucket *b)
{
- return __bucket_array(ca, false);
+ while (xchg(&b->lock, 1))
+ cpu_relax();
}
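
These two helpers replace the old lockless bucket_cmpxchg() loop: GC bucket updates are now done under a tiny per-bucket lock byte, with before/after snapshots fed to the usage accounting. The marking paths above all follow the same shape (a sketch only; the field updates are illustrative, gc_bucket() is defined just below):

        struct bucket old, new, *g = gc_bucket(ca, b);

        bucket_lock(g);
        old = *g;                               /* snapshot before the change */
        g->data_type = data_type;               /* mutate under the per-bucket lock */
        g->dirty_sectors += sectors;
        new = *g;                               /* snapshot after the change */
        bucket_unlock(g);

        bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
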
-static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
+static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
{
- struct bucket_array *buckets = __bucket_array(ca, gc);
-
- BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
- return buckets->b + b;
+ return rcu_dereference_check(ca->buckets_gc,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
- return __bucket(ca, b, true);
-}
+ struct bucket_array *buckets = gc_bucket_array(ca);
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
-{
- return __bucket(ca, b, false);
+ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+ return buckets->b + b;
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
-
}
static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
return gens->b + b;
}
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bucket *g)
+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
{
- return g->mark.gen - g->oldest_gen;
+ return sector_to_bucket(ca, ptr->offset);
}
-static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
const struct bch_extent_ptr *ptr)
{
- return sector_to_bucket(ca, ptr->offset);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+ const struct bch_extent_ptr *ptr,
+ u32 *bucket_offset)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
}
static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
static inline enum bch_data_type ptr_data_type(const struct bkey *k,
const struct bch_extent_ptr *ptr)
{
- if (k->type == KEY_TYPE_btree_ptr ||
- k->type == KEY_TYPE_btree_ptr_v2)
+ if (bkey_is_btree_ptr(k))
return BCH_DATA_btree;
return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
}
+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
+{
+ EBUG_ON(sectors < 0);
+
+ return crc_is_compressed(p.crc)
+ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+ p.crc.uncompressed_size)
+ : sectors;
+}
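
For a compressed pointer the sector count is prorated by the compression ratio: e.g. crediting 8 sectors of an extent whose crc has compressed_size 32 and uncompressed_size 128 charges DIV_ROUND_UP_ULL(8 * 32, 128) = 2 disk sectors (illustrative numbers); uncompressed and incompressible pointers are charged 1:1.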
+
static inline int gen_cmp(u8 a, u8 b)
{
return (s8) (a - b);
return ret;
}
-/* bucket gc marks */
+/* Device usage: */
-static inline bool is_available_bucket(struct bucket_mark mark)
+void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
+static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
- return !mark.dirty_sectors && !mark.stripe;
-}
+ struct bch_dev_usage ret;
-/* Device usage: */
+ bch2_dev_usage_read_fast(ca, &ret);
+ return ret;
+}
-struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
+void bch2_dev_usage_init(struct bch_dev *);
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage stats)
+static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve)
{
- u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
-
- if (WARN_ONCE(stats.buckets_unavailable > total,
- "buckets_unavailable overflow (%llu > %llu)\n",
- stats.buckets_unavailable, total))
- return 0;
-
- return total - stats.buckets_unavailable;
+ s64 reserved = 0;
+
+ switch (reserve) {
+ case RESERVE_none:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case RESERVE_movinggc:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_btree:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_btree_movinggc:
+ break;
+ }
+
+ return reserved;
}
-static inline u64 dev_buckets_available(struct bch_dev *ca)
+static inline u64 dev_buckets_free(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum alloc_reserve reserve)
{
- return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets -
+ ca->nr_open_buckets -
+ bch2_dev_buckets_reserved(ca, reserve));
}
-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
- struct bch_dev_usage stats)
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum alloc_reserve reserve)
{
- struct bch_fs *c = ca->fs;
- s64 available = __dev_buckets_available(ca, stats);
- unsigned i;
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++)
- available -= fifo_used(&ca->free[i]);
- available -= fifo_used(&ca->free_inc);
- available -= ca->nr_open_buckets;
- spin_unlock(&c->freelist_lock);
-
- return max(available, 0LL);
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets
+ + usage.d[BCH_DATA_cached].buckets
+ + usage.d[BCH_DATA_need_gc_gens].buckets
+ + usage.d[BCH_DATA_need_discard].buckets
+ - ca->nr_open_buckets
+ - bch2_dev_buckets_reserved(ca, reserve));
}
-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+ enum alloc_reserve reserve)
{
- return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
}
/* Filesystem usage: */
static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
-
return sizeof(struct bch_fs_usage) / sizeof(u64) +
READ_ONCE(c->replicas.nr);
}
void bch2_fs_usage_initialize(struct bch_fs *);
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
-void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned,
- struct gc_pos, unsigned);
+int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned,
+ struct gc_pos, unsigned);
-int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_update(struct btree_trans *, struct btree_path *,
- struct bkey_i *, unsigned);
+int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
- struct bkey_s_c, unsigned);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
#define BUCKET_JOURNAL_SEQ_BITS 16
-struct bucket_mark {
- union {
- atomic64_t v;
-
- struct {
- u8 gen;
- u8 data_type:3,
- owned_by_allocator:1,
- stripe:1;
- u16 dirty_sectors;
- u16 cached_sectors;
- };
- };
-};
-
struct bucket {
- union {
- struct bucket_mark _mark;
- const struct bucket_mark mark;
- };
-
- u64 io_time[2];
- u8 oldest_gen;
- unsigned gen_valid:1;
- u8 stripe_redundancy;
- u32 stripe;
+ u8 lock;
+ u8 gen_valid:1;
+ u8 data_type:7;
+ u8 gen;
+ u8 stripe_redundancy;
+ u32 stripe;
+ u32 dirty_sectors;
+ u32 cached_sectors;
};
struct bucket_array {
struct bch_dev_usage {
u64 buckets_ec;
- u64 buckets_unavailable;
struct {
u64 buckets;
u8 dev;
u8 gen;
u8 replicas;
- u16 fragmentation;
+ u32 fragmentation;
u32 sectors;
- u64 offset;
+ u64 bucket;
};
typedef HEAP(struct copygc_heap_entry) copygc_heap;
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
- arg.available_buckets = arg.nr_buckets - src.buckets_unavailable;
- arg.ec_buckets = src.buckets_ec;
- arg.ec_sectors = 0;
+ arg.buckets_ec = src.buckets_ec;
for (i = 0; i < BCH_DATA_NR; i++) {
- arg.buckets[i] = src.d[i].buckets;
- arg.sectors[i] = src.d[i].sectors;
+ arg.d[i].buckets = src.d[i].buckets;
+ arg.d[i].sectors = src.d[i].sectors;
+ arg.d[i].fragmented = src.d[i].fragmented;
}
percpu_ref_put(&ca->ref);
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "checksum.h"
+#include "errcode.h"
#include "super.h"
#include "super-io.h"
}
}
-static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- struct scatterlist *sg, size_t len)
+static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+ struct nonce nonce,
+ struct scatterlist *sg, size_t len)
{
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
- BUG_ON(ret);
+ if (ret)
+ pr_err("got error %i from crypto_skcipher_encrypt()", ret);
+
+ return ret;
}
-static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
+static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
- struct scatterlist sg;
+ if (!is_vmalloc_addr(buf)) {
+ struct scatterlist sg;
+
+ sg_init_table(&sg, 1);
+ sg_set_page(&sg,
+ is_vmalloc_addr(buf)
+ ? vmalloc_to_page(buf)
+ : virt_to_page(buf),
+ len, offset_in_page(buf));
+ return do_encrypt_sg(tfm, nonce, &sg, len);
+ } else {
+ unsigned pages = buf_pages(buf, len);
+ struct scatterlist *sg;
+ size_t orig_len = len;
+ int ret, i;
+
+ sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
+ if (!sg)
+ return -ENOMEM;
+
+ sg_init_table(sg, pages);
+
+ for (i = 0; i < pages; i++) {
+ unsigned offset = offset_in_page(buf);
+ unsigned pg_len = min(len, PAGE_SIZE - offset);
+
+ sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
+ buf += pg_len;
+ len -= pg_len;
+ }
- sg_init_one(&sg, buf, len);
- do_encrypt_sg(tfm, nonce, &sg, len);
+ ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
+ kfree(sg);
+ return ret;
+ }
}
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
goto err;
}
- do_encrypt(chacha20, nonce, buf, len);
+ ret = do_encrypt(chacha20, nonce, buf, len);
err:
crypto_free_sync_skcipher(chacha20);
return ret;
}
-static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
- struct nonce nonce)
+static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+ struct nonce nonce)
{
u8 key[POLY1305_KEY_SIZE];
+ int ret;
nonce.d[3] ^= BCH_NONCE_POLY;
memset(key, 0, sizeof(key));
- do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ if (ret)
+ return ret;
desc->tfm = c->poly1305;
crypto_shash_init(desc);
crypto_shash_update(desc, key, sizeof(key));
+ return 0;
}
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
}
}
-void bch2_encrypt(struct bch_fs *c, unsigned type,
+int bch2_encrypt(struct bch_fs *c, unsigned type,
struct nonce nonce, void *data, size_t len)
{
if (!bch2_csum_type_is_encryption(type))
- return;
+ return 0;
- do_encrypt(c->chacha20, nonce, data, len);
+ return do_encrypt(c->chacha20, nonce, data, len);
}
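
With encryption made fallible, callers can no longer rely on the old BUG_ON() and are expected to propagate the error. A hypothetical call site after this change:

        ret = bch2_encrypt(c, csum_type, nonce, data, len);
        if (ret)
                return ret;     /* previously a failure here would have triggered BUG_ON() */
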
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
}
-void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
+int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
{
struct bio_vec bv;
struct bvec_iter iter;
struct scatterlist sgl[16], *sg = sgl;
size_t bytes = 0;
+ int ret = 0;
if (!bch2_csum_type_is_encryption(type))
- return;
+ return 0;
sg_init_table(sgl, ARRAY_SIZE(sgl));
bio_for_each_segment(bv, bio, iter) {
if (sg == sgl + ARRAY_SIZE(sgl)) {
sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ if (ret)
+ return ret;
nonce = nonce_add(nonce, bytes);
bytes = 0;
}
sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
}
struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
merged = bch2_checksum_bio(c, crc_old.csum_type,
extent_nonce(version, crc_old), bio);
- if (bch2_crc_cmp(merged, crc_old.csum))
+ if (bch2_crc_cmp(merged, crc_old.csum)) {
+ bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n"
+ "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo,
+ bch2_csum_types[crc_old.csum_type],
+ bch2_csum_types[new_csum_type]);
return -EIO;
+ }
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
if (i->crc)
const struct user_key_payload *ukp;
int ret;
- keyring_key = request_key(&key_type_logon, key_description, NULL);
+ keyring_key = request_key(&key_type_user, key_description, NULL);
if (IS_ERR(keyring_key))
return PTR_ERR(keyring_key);
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
{
- char key_description[60];
- char uuid[40];
+ struct printbuf key_description = PRINTBUF;
+ int ret;
- uuid_unparse_lower(sb->user_uuid.b, uuid);
- sprintf(key_description, "bcachefs:%s", uuid);
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
- return __bch2_request_key(key_description, key);
+ ret = __bch2_request_key(key_description.buf, key);
+ printbuf_exit(&key_description);
+ return ret;
}
int bch2_decrypt_sb_key(struct bch_fs *c,
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key: %i", ret);
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
goto err;
}
static int bch2_alloc_ciphers(struct bch_fs *c)
{
+ int ret;
+
if (!c->chacha20)
c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
- if (IS_ERR(c->chacha20)) {
- bch_err(c, "error requesting chacha20 module: %li",
- PTR_ERR(c->chacha20));
- return PTR_ERR(c->chacha20);
+ ret = PTR_ERR_OR_ZERO(c->chacha20);
+
+ if (ret) {
+ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
+ return ret;
}
if (!c->poly1305)
c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- if (IS_ERR(c->poly1305)) {
- bch_err(c, "error requesting poly1305 module: %li",
- PTR_ERR(c->poly1305));
- return PTR_ERR(c->poly1305);
+ ret = PTR_ERR_OR_ZERO(c->poly1305);
+
+ if (ret) {
+ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
+ return ret;
}
return 0;
if (keyed) {
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key: %i", ret);
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
goto err;
}
pr_verbose_init(c->opts, "");
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(c->sha256)) {
- bch_err(c, "error requesting sha256 module");
- ret = PTR_ERR(c->sha256);
+ ret = PTR_ERR_OR_ZERO(c->sha256);
+ if (ret) {
+ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
goto out;
}
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
-void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
void *data, size_t);
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
struct bch_extent_crc_unpacked *,
unsigned, unsigned, unsigned);
-void bch2_encrypt_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
+int bch2_encrypt_bio(struct bch_fs *, unsigned,
+ struct nonce, struct bio *);
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
- pr_buf(out, "%ps:\t%li\n",
+ prt_printf(out, "%ps:\t%li\n",
clock->timers.data[i]->fn,
clock->timers.data[i]->expire - now);
spin_unlock(&clock->timer_lock);
goto err;
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
- ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
- ret = ZSTD_decompressDCtx(ctx,
+ ret = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, real_src_len);
return strm.total_out;
}
case BCH_COMPRESSION_TYPE_zstd: {
- ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
- ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+ ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
+ zstd_cctx_workspace_bound(&c->zstd_params.cParams));
/*
* ZSTD requires that when we decompress we pass in the exact
* factor (7 bytes) from the dst buffer size to account for
* that.
*/
- size_t len = ZSTD_compressCCtx(ctx,
+ size_t len = zstd_compress_cctx(ctx,
dst + 4, dst_len - 4 - 7,
src, src_len,
- c->zstd_params);
- if (ZSTD_isError(len))
+ &c->zstd_params);
+ if (zstd_is_error(len))
return 0;
*((__le32 *) dst) = cpu_to_le32(len);
/* If it's only one block, don't bother trying to compress: */
if (src->bi_iter.bi_size <= c->opts.block_size)
- return 0;
+ return BCH_COMPRESSION_TYPE_incompressible;
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
{
size_t decompress_workspace_size = 0;
bool decompress_workspace_needed;
- ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0);
+ ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max);
struct {
unsigned feature;
unsigned type;
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
zlib_inflate_workspacesize(), },
{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
- ZSTD_CCtxWorkspaceBound(params.cParams),
- ZSTD_DCtxWorkspaceBound() },
+ zstd_cctx_workspace_bound(&params.cParams),
+ zstd_dctx_workspace_bound() },
}, *i;
int ret = 0;
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "super-io.h"
+#include "counters.h"
+
+/* BCH_SB_FIELD_counters */
+
+const char * const bch2_counter_names[] = {
+#define x(t, n, ...) (#t),
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
+
+static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
+{
+ if (!ctrs)
+ return 0;
+
+ return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
+};
+
+static int bch2_sb_counters_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ return 0;
+};
+
+void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ for (i = 0; i < nr; i++) {
+ if (i < BCH_COUNTER_NR)
+ prt_printf(out, "%s ", bch2_counter_names[i]);
+ else
+ prt_printf(out, "(unknown)");
+
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
+ prt_newline(out);
+ };
+};
+
+int bch2_sb_counters_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+ u64 val = 0;
+
+ for (i = 0; i < BCH_COUNTER_NR; i++)
+ c->counters_on_mount[i] = 0;
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
+ val = le64_to_cpu(ctrs->d[i]);
+ percpu_u64_set(&c->counters[i], val);
+ c->counters_on_mount[i] = val;
+ }
+ return 0;
+};
+
+int bch2_sb_counters_from_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+ struct bch_sb_field_counters *ret;
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ if (nr < BCH_COUNTER_NR) {
+ ret = bch2_sb_resize_counters(&c->disk_sb,
+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
+
+ if (ret) {
+ ctrs = ret;
+ nr = bch2_sb_counter_nr_entries(ctrs);
+ }
+ }
+
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
+ ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+ return 0;
+}
+
+void bch2_fs_counters_exit(struct bch_fs *c)
+{
+ free_percpu(c->counters);
+}
+
+int bch2_fs_counters_init(struct bch_fs *c)
+{
+ c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
+ if (!c->counters)
+ return -ENOMEM;
+
+ return bch2_sb_counters_to_cpu(c);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_counters = {
+ .validate = bch2_sb_counters_validate,
+ .to_text = bch2_sb_counters_to_text,
+};
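
Since counters_on_mount[] captures the superblock values at mount time, a since-mount delta can be derived from the live percpu counters. A sketch using only names defined above (the function name and printbuf parameter are hypothetical):

        static void counters_since_mount_to_text(struct printbuf *out, struct bch_fs *c)
        {
                unsigned i;

                for (i = 0; i < BCH_COUNTER_NR; i++) {
                        prt_printf(out, "%s", bch2_counter_names[i]);
                        prt_tab(out);
                        prt_printf(out, "%llu",
                                   percpu_u64_get(&c->counters[i]) - c->counters_on_mount[i]);
                        prt_newline(out);
                }
        }
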
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COUNTERS_H
+#define _BCACHEFS_COUNTERS_H
+
+#include "bcachefs.h"
+#include "super-io.h"
+
+
+int bch2_sb_counters_to_cpu(struct bch_fs *);
+int bch2_sb_counters_from_cpu(struct bch_fs *);
+
+void bch2_fs_counters_exit(struct bch_fs *);
+int bch2_fs_counters_init(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+
+#endif // _BCACHEFS_COUNTERS_H
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DARRAY_H
+#define _BCACHEFS_DARRAY_H
+
+/*
+ * Dynamic arrays:
+ *
+ * Inspired by CCAN's darray
+ */
+
+#include "util.h"
+#include <linux/slab.h>
+
+#define DARRAY(type) \
+struct { \
+ size_t nr, size; \
+ type *data; \
+}
+
+typedef DARRAY(void) darray_void;
+
+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more)
+{
+ if (d->nr + more > d->size) {
+ size_t new_size = roundup_pow_of_two(d->nr + more);
+ void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL);
+
+ if (!data)
+ return -ENOMEM;
+
+ d->data = data;
+ d->size = new_size;
+ }
+
+ return 0;
+}
+
+#define darray_make_room(_d, _more) \
+ __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more))
+
+#define darray_top(_d) ((_d).data[(_d).nr])
+
+#define darray_push(_d, _item) \
+({ \
+ int _ret = darray_make_room((_d), 1); \
+ \
+ if (!_ret) \
+ (_d)->data[(_d)->nr++] = (_item); \
+ _ret; \
+})
+
+#define darray_insert_item(_d, _pos, _item) \
+({ \
+ size_t pos = (_pos); \
+ int _ret = darray_make_room((_d), 1); \
+ \
+ if (!_ret) \
+ array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \
+ _ret; \
+})
+
+#define darray_for_each(_d, _i) \
+ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
+#define darray_init(_d) \
+do { \
+ (_d)->data = NULL; \
+ (_d)->nr = (_d)->size = 0; \
+} while (0)
+
+#define darray_exit(_d) \
+do { \
+ kfree((_d)->data); \
+ darray_init(_d); \
+} while (0)
+
+#endif /* _BCACHEFS_DARRAY_H */
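
A minimal usage sketch of the macros above (the element type and values are arbitrary):

        DARRAY(u64) nums;
        u64 *i;
        int ret;

        darray_init(&nums);

        ret =   darray_push(&nums, 1) ?:
                darray_push(&nums, 2);
        if (!ret)
                darray_for_each(nums, i)
                        pr_info("%llu\n", *i);

        darray_exit(&nums);     /* frees nums.data */
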
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "ec.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "subvolume.h"
+
+#include <trace/events/bcachefs.h>
+
+static int insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, update_iter;
+ struct bkey_s_c k;
+ snapshot_id_list s;
+ int ret;
+
+ if (!btree_type_has_snapshots(id))
+ return 0;
+
+ darray_init(&s);
+
+ if (!bkey_cmp(old_pos, new_pos))
+ return 0;
+
+ if (!snapshot_t(c, old_pos.snapshot)->children[0])
+ return 0;
+
+ bch2_trans_iter_init(trans, &iter, id, old_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ if (bkey_cmp(old_pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
+ struct bkey_i *update;
+
+ if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot))
+ continue;
+
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ break;
+
+ bkey_init(&update->k);
+ update->k.p = new_pos;
+ update->k.p.snapshot = k.k->p.snapshot;
+
+ bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &update_iter);
+ if (ret)
+ break;
+
+ ret = snapshot_list_add(c, &s, k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ darray_exit(&s);
+
+ return ret;
+}
+
+static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == dev)
+ ptr->cached = true;
+}
+
+static int bch2_data_update_index_update(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct data_update *m =
+ container_of(op, struct data_update, op);
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_buf _new, _insert;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&_new);
+ bch2_bkey_buf_init(&_insert);
+ bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+
+ bch2_trans_iter_init(&trans, &iter, m->btree_id,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ while (1) {
+ struct bkey_s_c k;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+ struct bkey_i *insert;
+ struct bkey_i_extent *new;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bpos next_pos;
+ bool did_work = false;
+ bool should_check_enospc;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+ unsigned i;
+
+ bch2_trans_begin(&trans);
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ new = bkey_i_to_extent(bch2_keylist_front(keys));
+
+ if (!bch2_extents_match(k, old))
+ goto nomatch;
+
+ bkey_reassemble(_insert.k, k);
+ insert = _insert.k;
+
+ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+ new = bkey_i_to_extent(_new.k);
+ bch2_cut_front(iter.pos, &new->k_i);
+
+ bch2_cut_front(iter.pos, insert);
+ bch2_cut_back(new->k.p, insert);
+ bch2_cut_back(insert->k.p, &new->k_i);
+
+ /*
+ * @old: extent that we read from
+ * @insert: key that we're going to update, initialized from
+ * extent currently in btree - same as @old unless we raced with
+ * other updates
+ * @new: extent with new pointers that we'll be adding to @insert
+ *
+	 * First, drop rewrite_ptrs from @new:
+ */
+ i = 0;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) {
+ /*
+ * If we're going to be adding a pointer to the
+ * same device, we have to drop the old one -
+ * otherwise, we can just mark it cached:
+ */
+ if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev))
+ bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev);
+ else
+ bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev);
+ }
+ i++;
+ }
+
+ /* Add new ptrs: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
+ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
+ /*
+ * raced with another move op? extent already
+ * has a pointer to the device we just wrote
+ * data to
+ */
+ continue;
+ }
+
+ bch2_extent_ptr_decoded_append(insert, &p);
+ did_work = true;
+ }
+
+ if (!did_work)
+ goto nomatch;
+
+ bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
+ bch2_extent_normalize(c, bkey_i_to_s(insert));
+
+ ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
+ &should_check_enospc,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ goto err;
+
+ if (disk_sectors_delta > (s64) op->res.sectors) {
+ ret = bch2_disk_reservation_add(c, &op->res,
+ disk_sectors_delta - op->res.sectors,
+ !should_check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ goto out;
+ }
+
+ next_pos = insert->k.p;
+
+ ret = insert_snapshot_whiteouts(&trans, m->btree_id,
+ k.k->p, insert->k.p) ?:
+ bch2_trans_update(&trans, &iter, insert,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(&trans, &op->res,
+ op_journal_seq(op),
+ BTREE_INSERT_NOFAIL|
+ m->data_opts.btree_insert_flags);
+ if (!ret) {
+ bch2_btree_iter_set_pos(&iter, next_pos);
+
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
+ trace_move_extent_finish(&new->k);
+ }
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ if (ret)
+ break;
+next:
+ while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
+ bch2_keylist_pop_front(keys);
+ if (bch2_keylist_empty(keys))
+ goto out;
+ }
+ continue;
+nomatch:
+ if (m->ctxt) {
+ BUG_ON(k.k->p.offset <= iter.pos.offset);
+ atomic64_inc(&m->ctxt->stats->keys_raced);
+ atomic64_add(k.k->p.offset - iter.pos.offset,
+ &m->ctxt->stats->sectors_raced);
+ }
+
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_race], new->k.size);
+ trace_move_extent_race(&new->k);
+
+ bch2_btree_iter_advance(&iter);
+ goto next;
+ }
+out:
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
+ bch2_bkey_buf_exit(&_insert, c);
+ bch2_bkey_buf_exit(&_new, c);
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ return ret;
+}
+
+void bch2_data_update_read_done(struct data_update *m,
+ struct bch_extent_crc_unpacked crc,
+ struct closure *cl)
+{
+ /* write bio must own pages: */
+ BUG_ON(!m->op.wbio.bio.bi_vcnt);
+
+ m->op.crc = crc;
+ m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+
+ closure_call(&m->op.cl, bch2_write, NULL, cl);
+}
+
+void bch2_data_update_exit(struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+
+ bch2_bkey_buf_exit(&update->k, c);
+ bch2_disk_reservation_put(c, &update->op.res);
+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+}
+
+int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
+ struct write_point_specifier wp,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts,
+ enum btree_id btree_id,
+ struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+ int ret;
+
+ bch2_bkey_buf_init(&m->k);
+ bch2_bkey_buf_reassemble(&m->k, c, k);
+ m->btree_id = btree_id;
+ m->data_opts = data_opts;
+
+ bch2_write_op_init(&m->op, c, io_opts);
+ m->op.pos = bkey_start_pos(k.k);
+ m->op.version = k.k->version;
+ m->op.target = data_opts.target;
+ m->op.write_point = wp;
+ m->op.flags |= BCH_WRITE_PAGES_STABLE|
+ BCH_WRITE_PAGES_OWNED|
+ BCH_WRITE_DATA_ENCODED|
+ BCH_WRITE_FROM_INTERNAL|
+ m->data_opts.write_flags;
+ m->op.compression_type =
+ bch2_compression_opt_to_type[io_opts.background_compression ?:
+ io_opts.compression];
+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+ m->op.alloc_reserve = RESERVE_movinggc;
+ m->op.index_update_fn = bch2_data_update_index_update;
+
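+	/*
+	 * data_opts.rewrite_ptrs is a bitmask over the extent's pointers, in
+	 * the order they're decoded here: pointers with their bit set are
+	 * being rewritten; the rest have their devices added to devs_have so
+	 * the new copies go to different devices.
+	 */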
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ p.ptr.cached)
+ BUG();
+
+ if (!((1U << i) & m->data_opts.rewrite_ptrs))
+ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ crc_is_compressed(p.crc))
+ reserve_sectors += k.k->size;
+
+ /*
+ * op->csum_type is normally initialized from the fs/file's
+ * current options - but if an extent is encrypted, we require
+ * that it stays encrypted:
+ */
+ if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
+ m->op.nonce = p.crc.nonce + p.crc.offset;
+ m->op.csum_type = p.crc.csum_type;
+ }
+
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ m->op.incompressible = true;
+
+ i++;
+ }
+
+ if (reserve_sectors) {
+ ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+ m->data_opts.extra_replicas
+ ? 0
+ : BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ return ret;
+ }
+
+ m->op.nr_replicas = m->op.nr_replicas_required =
+ hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas;
+
+ BUG_ON(!m->op.nr_replicas);
+ return 0;
+}
+
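+/*
+ * Cached pointers don't need to be rewritten, only dropped: move any cached
+ * pointers from @opts->rewrite_ptrs to @opts->kill_ptrs.
+ */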
+void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
+ opts->kill_ptrs |= 1U << i;
+ opts->rewrite_ptrs ^= 1U << i;
+ }
+
+ i++;
+ }
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BCACHEFS_DATA_UPDATE_H
+#define _BCACHEFS_DATA_UPDATE_H
+
+#include "bkey_buf.h"
+#include "io_types.h"
+
+struct moving_context;
+
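+/*
+ * rewrite_ptrs and kill_ptrs are bitmasks over an extent's pointers: the
+ * pointers to be rewritten onto new devices, and the pointers to simply be
+ * dropped.
+ */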
+struct data_update_opts {
+ unsigned rewrite_ptrs;
+ unsigned kill_ptrs;
+ u16 target;
+ u8 extra_replicas;
+ unsigned btree_insert_flags;
+ unsigned write_flags;
+};
+
+struct data_update {
+ /* extent being updated: */
+ enum btree_id btree_id;
+ struct bkey_buf k;
+ struct data_update_opts data_opts;
+ struct moving_context *ctxt;
+ struct bch_write_op op;
+};
+
+void bch2_data_update_read_done(struct data_update *,
+ struct bch_extent_crc_unpacked,
+ struct closure *);
+
+void bch2_data_update_exit(struct data_update *);
+int bch2_data_update_init(struct bch_fs *, struct data_update *,
+ struct write_point_specifier,
+ struct bch_io_opts, struct data_update_opts,
+ enum btree_id, struct bkey_s_c);
+void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
+
+#endif /* _BCACHEFS_DATA_UPDATE_H */
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
+#include "btree_locking.h"
#include "btree_update.h"
#include "buckets.h"
#include "debug.h"
#include <linux/console.h>
#include <linux/debugfs.h>
#include <linux/module.h>
+#include <linux/pretty-printers.h>
#include <linux/random.h>
#include <linux/seq_file.h>
if (!bch2_dev_get_ioref(ca, READ))
return false;
- bio = bio_alloc_bioset(GFP_NOIO,
- buf_pages(n_sorted, btree_bytes(c)),
- &c->btree_bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_READ|REQ_META;
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_sorted, btree_bytes(c)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOIO,
+ &c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
bch2_bio_map(bio, n_sorted, btree_bytes(c));
failed |= bch2_btree_verify_replica(c, b, p);
if (failed) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+ printbuf_exit(&buf);
}
out:
mutex_unlock(&c->verify_lock);
/* XXX: bch_fs refcounting */
struct dump_iter {
- struct bpos from;
- struct bch_fs *c;
+ struct bch_fs *c;
enum btree_id id;
+ struct bpos from;
+ struct bpos prev_node;
+ u64 iter;
- char buf[1 << 12];
- size_t bytes; /* what's currently in buf */
+ struct printbuf buf;
char __user *ubuf; /* destination user buffer */
size_t size; /* size of requested read */
ssize_t ret; /* bytes read so far */
};
-static int flush_buf(struct dump_iter *i)
+static ssize_t flush_buf(struct dump_iter *i)
{
- if (i->bytes) {
- size_t bytes = min(i->bytes, i->size);
- int err = copy_to_user(i->ubuf, i->buf, bytes);
+ if (i->buf.pos) {
+ size_t bytes = min_t(size_t, i->buf.pos, i->size);
+ int err = copy_to_user(i->ubuf, i->buf.buf, bytes);
if (err)
return err;
i->ret += bytes;
i->ubuf += bytes;
i->size -= bytes;
- i->bytes -= bytes;
- memmove(i->buf, i->buf + bytes, i->bytes);
+ i->buf.pos -= bytes;
+ memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos);
}
- return 0;
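+	/* returning 0 means "keep going"; non-zero tells the caller to stop */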
+ return i->size ? 0 : i->ret;
}
static int bch2_dump_open(struct inode *inode, struct file *file)
file->private_data = i;
i->from = POS_MIN;
+ i->iter = 0;
i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
i->id = bd->id;
+ i->buf = PRINTBUF;
return 0;
}
static int bch2_dump_release(struct inode *inode, struct file *file)
{
- kfree(file->private_data);
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
return 0;
}
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- int err;
+ ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- err = flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- return i->ret;
-
bch2_trans_init(&trans, i->c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(&iter);
-
- while (k.k && !(err = bkey_err(k))) {
- bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
- i->bytes = strlen(i->buf);
- BUG_ON(i->bytes >= sizeof(i->buf));
- i->buf[i->bytes] = '\n';
- i->bytes++;
-
- k = bch2_btree_iter_next(&iter);
- i->from = iter.pos;
-
- err = flush_buf(i);
- if (err)
+ ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ ret = flush_buf(i);
+ if (ret)
break;
- if (!i->size)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_bkey_val_to_text(&i->buf, i->c, k);
+ prt_newline(&i->buf);
+ 0;
+ }));
+ i->from = iter.pos;
+
+ if (!ret)
+ ret = flush_buf(i);
bch2_trans_exit(&trans);
- return err < 0 ? err : i->ret;
+ return ret ?: i->ret;
}
static const struct file_operations btree_debug_ops = {
struct btree_trans trans;
struct btree_iter iter;
struct btree *b;
- int err;
+ ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- err = flush_buf(i);
- if (err)
- return err;
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
- if (!i->size || !bpos_cmp(SPOS_MAX, i->from))
+ if (!bpos_cmp(SPOS_MAX, i->from))
return i->ret;
bch2_trans_init(&trans, i->c, 0, 0);
- for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) {
- bch2_btree_node_to_text(&PBUF(i->buf), i->c, b);
- i->bytes = strlen(i->buf);
- err = flush_buf(i);
- if (err)
+ for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) {
+ ret = flush_buf(i);
+ if (ret)
break;
- /*
- * can't easily correctly restart a btree node traversal across
- * all nodes, meh
- */
+ bch2_btree_node_to_text(&i->buf, i->c, b);
i->from = bpos_cmp(SPOS_MAX, b->key.k.p)
? bpos_successor(b->key.k.p)
: b->key.k.p;
-
- if (!i->size)
- break;
}
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- return err < 0 ? err : i->ret;
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
}
static const struct file_operations btree_format_debug_ops = {
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct btree *prev_node = NULL;
- int err;
+ ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- err = flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- return i->ret;
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
bch2_trans_init(&trans, i->c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(err = bkey_err(k))) {
+ ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
struct btree_path_level *l = &iter.path->l[0];
struct bkey_packed *_k =
bch2_btree_node_iter_peek(&l->iter, l->b);
- if (l->b != prev_node) {
- bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b);
- i->bytes = strlen(i->buf);
- err = flush_buf(i);
- if (err)
- break;
+ ret = flush_buf(i);
+ if (ret)
+ break;
+
+ if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) {
+ bch2_btree_node_to_text(&i->buf, i->c, l->b);
+ i->prev_node = l->b->key.k.p;
}
- prev_node = l->b;
- bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k);
- i->bytes = strlen(i->buf);
+ bch2_bfloat_to_text(&i->buf, l->b, _k);
+ 0;
+ }));
+ i->from = iter.pos;
+
+ bch2_trans_exit(&trans);
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations bfloat_failed_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_bfloat_failed,
+};
+
+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+ struct btree *b)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "%px btree=%s l=%u ",
+ b,
+ bch2_btree_ids[b->c.btree_id],
+ b->c.level);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out, "flags: ");
+ prt_tab(out);
+ prt_bitflags(out, bch2_btree_node_flags, b->flags);
+ prt_newline(out);
+
+ prt_printf(out, "pcpu read locks: ");
+ prt_tab(out);
+ prt_printf(out, "%u", b->c.lock.readers != NULL);
+ prt_newline(out);
+
+ prt_printf(out, "written:");
+ prt_tab(out);
+ prt_printf(out, "%u", b->written);
+ prt_newline(out);
+
+ prt_printf(out, "writes blocked:");
+ prt_tab(out);
+ prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
+ prt_newline(out);
+
+ prt_printf(out, "will make reachable:");
+ prt_tab(out);
+ prt_printf(out, "%lx", b->will_make_reachable);
+ prt_newline(out);
+
+ prt_printf(out, "journal pin %px:", &b->writes[0].journal);
+ prt_tab(out);
+ prt_printf(out, "%llu", b->writes[0].journal.seq);
+ prt_newline(out);
+
+ prt_printf(out, "journal pin %px:", &b->writes[1].journal);
+ prt_tab(out);
+ prt_printf(out, "%llu", b->writes[1].journal.seq);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ rcu_read_lock();
+ i->buf.atomic++;
+ tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
+ if (i->iter < tbl->size) {
+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+ bch2_cached_btree_node_to_text(&i->buf, c, b);
+ i->iter++;
+ } else {
+ done = true;
+ }
+ --i->buf.atomic;
+ rcu_read_unlock();
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations cached_btree_nodes_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_cached_btree_nodes_read,
+};
+
+static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ struct btree_trans *trans;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ mutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if (trans->locking_wait.task->pid <= i->iter)
+ continue;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ bch2_btree_trans_to_text(&i->buf, trans);
+
+ prt_printf(&i->buf, "backtrace:");
+ prt_newline(&i->buf);
+ printbuf_indent_add(&i->buf, 2);
+ bch2_prt_backtrace(&i->buf, trans->locking_wait.task);
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+
+ i->iter = trans->locking_wait.task->pid;
+ }
+ mutex_unlock(&c->btree_trans_lock);
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_transactions_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_transactions_read,
+};
+
+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
err = flush_buf(i);
if (err)
+ return err;
+
+ if (!i->size)
break;
- bch2_btree_iter_advance(&iter);
- i->from = iter.pos;
+ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
+ i->iter++;
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations journal_pins_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_journal_pins_read,
+};
+
+static int lock_held_stats_open(struct inode *inode, struct file *file)
+{
+ struct bch_fs *c = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+
+ if (!i)
+ return -ENOMEM;
+
+ i->iter = 0;
+ i->c = c;
+ i->buf = PRINTBUF;
+ file->private_data = i;
+
+ return 0;
+}
+
+static int lock_held_stats_release(struct inode *inode, struct file *file)
+{
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
+
+ return 0;
+}
+
+static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ while (1) {
+ struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
err = flush_buf(i);
if (err)
- break;
+ return err;
if (!i->size)
break;
+
+ if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
+ !bch2_btree_transaction_fns[i->iter])
+ break;
+
+ prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
+ prt_newline(&i->buf);
+ printbuf_indent_add(&i->buf, 2);
+
+ mutex_lock(&s->lock);
+
+ prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
+ prt_newline(&i->buf);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
+ prt_printf(&i->buf, "Lock hold times:");
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ if (s->max_paths_text) {
+ prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ prt_str_indented(&i->buf, s->max_paths_text);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ mutex_unlock(&s->lock);
+
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+ i->iter++;
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
- return err < 0 ? err : i->ret;
+ return i->ret;
}
-static const struct file_operations bfloat_failed_debug_ops = {
+static const struct file_operations lock_held_stats_op = {
+ .owner = THIS_MODULE,
+ .open = lock_held_stats_open,
+ .release = lock_held_stats_release,
+ .read = lock_held_stats_read,
+};
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ struct btree_trans *trans;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (i->iter)
+ goto out;
+
+ mutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if (trans->locking_wait.task->pid <= i->iter)
+ continue;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ bch2_check_for_deadlock(trans, &i->buf);
+
+ i->iter = trans->locking_wait.task->pid;
+ }
+ mutex_unlock(&c->btree_trans_lock);
+out:
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_deadlock_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
.release = bch2_dump_release,
- .read = bch2_read_bfloat_failed,
+ .read = bch2_btree_deadlock_read,
};
void bch2_fs_debug_exit(struct bch_fs *c)
{
- if (!IS_ERR_OR_NULL(c->debug))
- debugfs_remove_recursive(c->debug);
+ if (!IS_ERR_OR_NULL(c->fs_debug_dir))
+ debugfs_remove_recursive(c->fs_debug_dir);
}
void bch2_fs_debug_init(struct bch_fs *c)
return;
snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
- c->debug = debugfs_create_dir(name, bch_debug);
- if (IS_ERR_OR_NULL(c->debug))
+ c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
+ if (IS_ERR_OR_NULL(c->fs_debug_dir))
+ return;
+
+ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
+ c->btree_debug, &cached_btree_nodes_ops);
+
+ debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_transactions_ops);
+
+ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
+ c->btree_debug, &journal_pins_ops);
+
+ debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
+ c, &lock_held_stats_op);
+
+ debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_deadlock_ops);
+
+ c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
+ if (IS_ERR_OR_NULL(c->btree_debug_dir))
return;
for (bd = c->btree_debug;
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
- 0400, c->debug, bd,
- &btree_debug_ops);
+ debugfs_create_file(bch2_btree_ids[bd->id],
+ 0400, c->btree_debug_dir, bd,
+ &btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
bch2_btree_ids[bd->id]);
- bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
- &btree_format_debug_ops);
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
bch2_btree_ids[bd->id]);
- bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
- &bfloat_failed_debug_ops);
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &bfloat_failed_debug_ops);
}
}
.is_visible = dirent_is_visible,
};
-const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
unsigned len;
- if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
- return "value too small";
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*d.v));
+ return -EINVAL;
+ }
len = bch2_dirent_name_bytes(d);
- if (!len)
- return "empty name";
+ if (!len) {
+ prt_printf(err, "empty name");
+ return -EINVAL;
+ }
- if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
- return "value too big";
+ if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) {
+ prt_printf(err, "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), dirent_val_u64s(len));
+ return -EINVAL;
+ }
- if (len > BCH_NAME_MAX)
- return "dirent name too big";
+ if (len > BCH_NAME_MAX) {
+ prt_printf(err, "dirent name too big (%u > %u)",
+ len, BCH_NAME_MAX);
+ return -EINVAL;
+ }
- if (len == 1 && !memcmp(d.v->d_name, ".", 1))
- return "invalid name";
+ if (len == 1 && !memcmp(d.v->d_name, ".", 1)) {
+ prt_printf(err, "invalid name");
+ return -EINVAL;
+ }
- if (len == 2 && !memcmp(d.v->d_name, "..", 2))
- return "invalid name";
+ if (len == 2 && !memcmp(d.v->d_name, "..", 2)) {
+ prt_printf(err, "invalid name");
+ return -EINVAL;
+ }
- if (memchr(d.v->d_name, '/', len))
- return "invalid name";
+ if (memchr(d.v->d_name, '/', len)) {
+ prt_printf(err, "invalid name");
+ return -EINVAL;
+ }
if (d.v->d_type != DT_SUBVOL &&
- le64_to_cpu(d.v->d_inum) == d.k->p.inode)
- return "dirent points to own directory";
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode) {
+ prt_printf(err, "dirent points to own directory");
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- bch_scnmemcpy(out, d.v->d_name,
- bch2_dirent_name_bytes(d));
- pr_buf(out, " -> %llu type %s",
+ prt_printf(out, "%.*s -> %llu type %s",
+ bch2_dirent_name_bytes(d),
+ d.v->d_name,
d.v->d_type != DT_SUBVOL
? le64_to_cpu(d.v->d_inum)
: le32_to_cpu(d.v->d_child_subvol),
ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
name, inum, 0);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (!ret)
bch2_trans_iter_exit(&trans, &iter);
if (ret)
return ret;
- for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(dir.inum, 0, snapshot), 0, k, ret) {
- if (k.k->p.inode > dir.inum)
- break;
-
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(dir.inum, 0, snapshot),
+ POS(dir.inum, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
ret = -ENOTEMPTY;
break;
}
- }
bch2_trans_iter_exit(trans, &iter);
return ret;
if (ret)
goto err;
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents,
- SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
- if (k.k->p.inode > inum.inum)
- break;
-
+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents,
+ SPOS(inum.inum, ctx->pos, snapshot),
+ POS(inum.inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_dirent)
continue;
}
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
extern const struct bch_hash_desc bch2_dirent_hash_desc;
-const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_dirent (struct bkey_ops) { \
g = BCH_MEMBER_GROUP(m) - 1;
if (g >= nr_groups) {
- pr_buf(err, "disk %u has invalid label %u (have %u)",
+ prt_printf(err, "disk %u has invalid label %u (have %u)",
i, g, nr_groups);
return -EINVAL;
}
if (BCH_GROUP_DELETED(&groups->entries[g])) {
- pr_buf(err, "disk %u has deleted label %u", i, g);
+ prt_printf(err, "disk %u has deleted label %u", i, g);
return -EINVAL;
}
}
len = strnlen(g->label, sizeof(g->label));
if (!len) {
- pr_buf(err, "label %u empty", i);
+ prt_printf(err, "label %u empty", i);
return -EINVAL;
}
}
for (g = sorted; g + 1 < sorted + nr_groups; g++)
if (!BCH_GROUP_DELETED(g) &&
!group_cmp(&g[0], &g[1])) {
- pr_buf(err, "duplicate label %llu.", BCH_GROUP_PARENT(g));
- bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label)));
+ prt_printf(err, "duplicate label %llu.%.*s",
+ BCH_GROUP_PARENT(g),
+ (int) sizeof(g->label), g->label);
goto err;
}
g < groups->entries + nr_groups;
g++) {
if (g != groups->entries)
- pr_buf(out, " ");
+ prt_printf(out, " ");
if (BCH_GROUP_DELETED(g))
- pr_buf(out, "[deleted]");
+ prt_printf(out, "[deleted]");
else
- pr_buf(out, "[parent %llu name %s]",
+ prt_printf(out, "[parent %llu name %s]",
BCH_GROUP_PARENT(g), g->label);
}
}
groups = bch2_sb_resize_disk_groups(sb, u64s);
if (!groups)
- return -ENOSPC;
+ return -BCH_ERR_ENOSPC_disk_label_add;
nr_groups = disk_groups_nr(groups);
}
return v;
}
-void bch2_disk_path_to_text(struct printbuf *out,
- struct bch_sb_handle *sb,
- unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct bch_sb_field_disk_groups *groups =
- bch2_sb_get_disk_groups(sb->sb);
+ bch2_sb_get_disk_groups(sb);
struct bch_disk_group *g;
unsigned nr = 0;
u16 path[32];
v = path[--nr];
g = groups->entries + v;
- bch_scnmemcpy(out, g->label,
- strnlen(g->label, sizeof(g->label)));
-
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
if (nr)
- pr_buf(out, ".");
+ prt_printf(out, ".");
}
return;
inval:
- pr_buf(out, "invalid group %u", v);
+ prt_printf(out, "invalid label %u", v);
}
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
- int v = -1;
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
+ int ret, v = -1;
if (!strlen(name) || !strcmp(name, "none"))
- goto write_sb;
+ return 0;
v = bch2_disk_path_find_or_create(&c->disk_sb, name);
- if (v < 0) {
- mutex_unlock(&c->sb_lock);
+ if (v < 0)
return v;
- }
ret = bch2_sb_disk_groups_to_cpu(c);
if (ret)
- goto unlock;
-write_sb:
+ return ret;
+
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v + 1);
+ return 0;
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+ int ret;
- bch2_write_super(c);
-unlock:
+ mutex_lock(&c->sb_lock);
+ ret = __bch2_dev_group_set(c, ca, name) ?:
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return ret;
return -EINVAL;
}
-void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v)
+void bch2_opt_target_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
{
struct target t = target_decode(v);
switch (t.type) {
case TARGET_NULL:
- pr_buf(out, "none");
+ prt_printf(out, "none");
break;
- case TARGET_DEV: {
- struct bch_dev *ca;
-
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
- char b[BDEVNAME_SIZE];
-
- pr_buf(out, "/dev/%s",
- bdevname(ca->disk_sb.bdev, b));
- percpu_ref_put(&ca->io_ref);
- } else if (ca) {
- pr_buf(out, "offline device %u", t.dev);
+ case TARGET_DEV:
+ if (c) {
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+
+ if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ percpu_ref_put(&ca->io_ref);
+ } else if (ca) {
+ prt_printf(out, "offline device %u", t.dev);
+ } else {
+ prt_printf(out, "invalid device %u", t.dev);
+ }
+
+ rcu_read_unlock();
} else {
- pr_buf(out, "invalid device %u", t.dev);
+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+ struct bch_member *m = mi->members + t.dev;
+
+ if (bch2_dev_exists(sb, mi, t.dev)) {
+ prt_printf(out, "Device ");
+ pr_uuid(out, m->uuid.b);
+ prt_printf(out, " (%u)", t.dev);
+ } else {
+ prt_printf(out, "Bad device %u", t.dev);
+ }
}
-
- rcu_read_unlock();
break;
- }
case TARGET_GROUP:
- mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(out, &c->disk_sb, t.group);
- mutex_unlock(&c->sb_lock);
+ if (c) {
+ mutex_lock(&c->sb_lock);
+ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
+ mutex_unlock(&c->sb_lock);
+ } else {
+ bch2_disk_path_to_text(out, sb, t.group);
+ }
break;
default:
BUG();
/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *,
- unsigned);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64);
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
const char *bch2_sb_validate_disk_groups(struct bch_sb *,
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
/* Stripes btree keys: */
-const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- if (!bkey_cmp(k.k->p, POS_MIN))
- return "stripe at pos 0";
+ if (!bkey_cmp(k.k->p, POS_MIN)) {
+ prt_printf(err, "stripe at POS_MIN");
+ return -EINVAL;
+ }
- if (k.k->p.inode)
- return "invalid stripe key";
+ if (k.k->p.inode) {
+ prt_printf(err, "nonzero inode field");
+ return -EINVAL;
+ }
- if (bkey_val_bytes(k.k) < sizeof(*s))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) < sizeof(*s)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*s));
+ return -EINVAL;
+ }
- if (bkey_val_bytes(k.k) < sizeof(*s) ||
- bkey_val_u64s(k.k) < stripe_val_u64s(s))
- return "incorrect value size";
+ if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) {
+ prt_printf(err, "incorrect value size (%zu < %u)",
+ bkey_val_u64s(k.k), stripe_val_u64s(s));
+ return -EINVAL;
+ }
- return bch2_bkey_ptrs_invalid(c, k);
+ return bch2_bkey_ptrs_invalid(c, k, rw, err);
}
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned i;
- pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
s->algorithm,
le16_to_cpu(s->sectors),
s->nr_blocks - s->nr_redundant,
1U << s->csum_granularity_bits);
for (i = 0; i < s->nr_blocks; i++)
- pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
+ prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev,
(u64) s->ptrs[i].offset,
stripe_blockcount_get(s, i));
}
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
- char buf2[200];
+ struct printbuf buf2 = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i));
bch_err_ratelimited(c,
"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
(void *) _RET_IP_, i, j, v->csum_type,
- want.lo, got.lo, buf2);
+ want.lo, got.lo, buf2.buf);
+ printbuf_exit(&buf2);
clear_bit(i, buf->valid);
break;
}
nr_iovecs << PAGE_SHIFT);
struct ec_bio *ec_bio;
- ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs,
+ ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
+ nr_iovecs,
+ rw,
+ GFP_KERNEL,
&c->ec_bioset),
struct ec_bio, bio);
ec_bio->buf = buf;
ec_bio->idx = idx;
- bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
- bio_set_op_attrs(&ec_bio->bio, rw, 0);
-
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
ec_bio->bio.bi_end_io = ec_block_endio;
ec_bio->bio.bi_private = cl;
struct btree_iter *iter)
{
size_t idx = iter->pos.offset;
- int ret = 0;
if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN))
- return ret;
+ return 0;
bch2_trans_unlock(trans);
- ret = -EINTR;
- if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL))
- return ret;
-
- return -ENOMEM;
+ return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?:
+ bch2_trans_relock(trans);
}
static ssize_t stripe_idx_to_delete(struct bch_fs *c)
struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos,
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
if (start_pos.offset) {
continue;
}
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_stripe_create;
break;
}
if (bkey_deleted(k.k))
- goto found_slot;
+ break;
}
- goto err;
-found_slot:
- start_pos = iter.pos;
+ c->ec_stripe_hint = iter.pos.offset;
+
+ if (ret)
+ goto err;
ret = ec_stripe_mem_alloc(trans, &iter);
if (ret)
stripe->k.p = iter.pos;
ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0);
-
- c->ec_stripe_hint = start_pos.offset;
err:
bch2_trans_iter_exit(trans, &iter);
};
}
-static int ec_stripe_update_ptrs(struct bch_fs *c,
- struct ec_stripe_buf *s,
- struct bkey *pos)
+static int ec_stripe_update_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct ec_stripe_buf *s)
{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_extent e;
- struct bkey_buf sk;
- struct bpos next_pos;
- int ret = 0, dev, block;
+ const struct bch_extent_ptr *ptr_c;
+ struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+ struct bkey_i *n;
+ int ret, dev, block;
+
+ if (extent_has_stripe_ptr(k, s->key.k.p.offset))
+ return 0;
+
+ ptr_c = bkey_matches_stripe(&s->key.v, k, &block);
+ /*
+ * It doesn't generally make sense to erasure code cached ptrs:
+ * XXX: should we be incrementing a counter?
+ */
+ if (!ptr_c || ptr_c->cached)
+ return 0;
+
+ dev = s->key.v.ptrs[block].dev;
+
+ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(n, k);
- bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+ ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev);
+ BUG_ON(!ec_ptr);
- /* XXX this doesn't support the reflink btree */
+ extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- bkey_start_pos(pos),
- BTREE_ITER_INTENT);
+ return bch2_trans_update(trans, iter, n, 0);
+}
+
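+/*
+ * Walk the bucket backing stripe block @block via its backpointers, updating
+ * each extent found there to carry a pointer into the new stripe.
+ */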
+static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
+ unsigned block)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_extent_ptr bucket = s->key.v.ptrs[block];
+ struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
+ struct bch_backpointer bp;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 bp_offset = 0;
+ int ret = 0;
retry:
- while (bch2_trans_begin(&trans),
- (k = bch2_btree_iter_peek(&iter)).k &&
- !(ret = bkey_err(k)) &&
- bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
- const struct bch_extent_ptr *ptr_c;
- struct bch_extent_ptr *ptr, *ec_ptr = NULL;
-
- if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
- bch2_btree_iter_advance(&iter);
- continue;
+ while (1) {
+ bch2_trans_begin(trans);
+
+ ret = bch2_get_next_backpointer(trans, bucket_pos, bucket.gen,
+ &bp_offset, &bp,
+ BTREE_ITER_CACHED);
+ if (ret)
+ break;
+ if (bp_offset == U64_MAX)
+ break;
+
+ if (bch2_fs_inconsistent_on(bp.level, c, "found btree node in erasure coded bucket!?")) {
+ ret = -EIO;
+ break;
}
- ptr_c = bkey_matches_stripe(&s->key.v, k, &block);
- /*
- * It doesn't generally make sense to erasure code cached ptrs:
- * XXX: should we be incrementing a counter?
- */
- if (!ptr_c || ptr_c->cached) {
- bch2_btree_iter_advance(&iter);
+ k = bch2_backpointer_get_key(trans, &iter, bucket_pos, bp_offset, bp);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+ if (!k.k)
continue;
- }
- dev = s->key.v.ptrs[block].dev;
+ ret = ec_stripe_update_extent(trans, &iter, k, s);
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ break;
- bch2_bkey_buf_reassemble(&sk, c, k);
- e = bkey_i_to_s_extent(sk.k);
+ bp_offset++;
+ }
- bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
- ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
- BUG_ON(!ec_ptr);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- extent_stripe_ptr_add(e, s, ec_ptr, block);
+ return ret;
+}
- bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
- next_pos = sk.k->k.p;
+static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
+{
+ struct btree_trans trans;
+ struct bch_stripe *v = &s->key.v;
+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+ int ret = 0;
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, sk.k, 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
- if (!ret)
- bch2_btree_iter_set_pos(&iter, next_pos);
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for (i = 0; i < nr_data; i++) {
+ ret = ec_stripe_update_bucket(&trans, s, i);
if (ret)
break;
}
- if (ret == -EINTR)
- goto retry;
- bch2_trans_iter_exit(&trans, &iter);
+
bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&sk, c);
return ret;
}
{
struct bch_fs *c = s->c;
struct open_bucket *ob;
- struct bkey_i *k;
struct stripe *m;
struct bch_stripe *v = &s->new_stripe.key.v;
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
BUG_ON(!s->allocated);
- if (!percpu_ref_tryget(&c->writes))
+ if (!percpu_ref_tryget_live(&c->writes))
goto err;
ec_generate_ec(&s->new_stripe);
goto err_put_writes;
}
- for_each_keylist_key(&s->keys, k) {
- ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k);
- if (ret) {
- bch_err(c, "error creating stripe: error %i updating pointers", ret);
- break;
- }
- }
+ ret = ec_stripe_update_extents(c, &s->new_stripe);
+ if (ret)
+ bch_err(c, "error creating stripe: error updating pointers: %s",
+ bch2_err_str(ret));
spin_lock(&c->ec_stripes_heap_lock);
m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset);
}
}
- bch2_keylist_free(&s->keys, s->inline_keys);
-
ec_stripe_buf_exit(&s->existing_stripe);
ec_stripe_buf_exit(&s->new_stripe);
closure_debug_destroy(&s->iodone);
return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
-void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob,
- struct bkey *k)
-{
- struct ec_stripe_new *ec = ob->ec;
-
- if (!ec)
- return;
-
- mutex_lock(&ec->lock);
-
- if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
- ARRAY_SIZE(ec->inline_keys),
- BKEY_U64s)) {
- BUG();
- }
-
- bkey_init(&ec->keys.top->k);
- ec->keys.top->k.p = k->p;
- ec->keys.top->k.size = k->size;
- bch2_keylist_push(&ec->keys);
-
- mutex_unlock(&ec->lock);
-}
-
static int unsigned_cmp(const void *_l, const void *_r)
{
unsigned l = *((const unsigned *) _l);
BCH_BKEY_PTRS_MAX) - h->redundancy;
s->nr_parity = h->redundancy;
- bch2_keylist_init(&s->keys, s->inline_keys);
-
ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data,
s->nr_parity, h->blocksize);
BUG_ON(nr_have_data > h->s->nr_data);
BUG_ON(nr_have_parity > h->s->nr_parity);
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
-
buckets.nr = 0;
if (nr_have_parity < h->s->nr_parity) {
ret = bch2_bucket_alloc_set(c, &buckets,
&nr_have_parity,
&have_cache,
h->copygc
- ? RESERVE_MOVINGGC
- : RESERVE_NONE,
+ ? RESERVE_movinggc
+ : RESERVE_none,
0,
cl);
}
if (ret)
- goto err;
+ return ret;
}
buckets.nr = 0;
&nr_have_data,
&have_cache,
h->copygc
- ? RESERVE_MOVINGGC
- : RESERVE_NONE,
+ ? RESERVE_movinggc
+ : RESERVE_none,
0,
cl);
}
if (ret)
- goto err;
+ return ret;
}
-err:
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
- return ret;
+
+ return 0;
}
/* XXX: doesn't obey target: */
int ret;
idx = get_existing_stripe(c, h);
- if (idx < 0) {
- bch_err(c, "failed to find an existing stripe");
- return -ENOSPC;
- }
+ if (idx < 0)
+ return -BCH_ERR_ENOSPC_stripe_reuse;
h->s->have_existing_stripe = true;
ret = get_stripe_key(c, idx, &h->s->existing_stripe);
static int __bch2_ec_stripe_head_reserve(struct bch_fs *c,
struct ec_stripe_head *h)
{
- int ret;
-
- ret = bch2_disk_reservation_get(c, &h->s->res,
- h->blocksize,
- h->s->nr_parity, 0);
-
- if (ret) {
- /*
- * This means we need to wait for copygc to
- * empty out buckets from existing stripes:
- */
- bch_err(c, "failed to reserve stripe");
- }
-
- return ret;
+ return bch2_disk_reservation_get(c, &h->s->res,
+ h->blocksize,
+ h->s->nr_parity, 0);
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
ret = __bch2_ec_stripe_head_reserve(c, h);
if (ret && needs_stripe_new)
ret = __bch2_ec_stripe_head_reuse(c, h);
- if (ret)
+ if (ret) {
+ bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret));
goto err;
+ }
if (!h->s->allocated) {
ret = new_stripe_alloc_buckets(c, h, cl);
for (i = 0; i < min_t(size_t, h->used, 20); i++) {
m = genradix_ptr(&c->stripes, h->data[i].idx);
- pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
+ prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx,
h->data[i].blocks_nonempty,
m->nr_blocks - m->nr_redundant,
m->nr_redundant);
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- pr_buf(out, "target %u algo %u redundancy %u:\n",
+ prt_printf(out, "target %u algo %u redundancy %u:\n",
h->target, h->algo, h->redundancy);
if (h->s)
- pr_buf(out, "\tpending: blocks %u+%u allocated %u\n",
+ prt_printf(out, "\tpending: blocks %u+%u allocated %u\n",
h->s->nr_data, h->s->nr_parity,
bitmap_weight(h->s->blocks_allocated,
h->s->nr_data));
mutex_lock(&c->ec_stripe_new_lock);
list_for_each_entry(s, &c->ec_stripe_new_list, list) {
- pr_buf(out, "\tin flight: blocks %u+%u pin %u\n",
+ prt_printf(out, "\tin flight: blocks %u+%u pin %u\n",
s->nr_data, s->nr_parity,
atomic_read(&s->pin));
}
bioset_exit(&c->ec_bioset);
}
-int bch2_fs_ec_init(struct bch_fs *c)
+void bch2_fs_ec_init_early(struct bch_fs *c)
{
INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+}
+int bch2_fs_ec_init(struct bch_fs *c)
+{
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
BIOSET_NEED_BVECS);
}
#include "ec_types.h"
#include "buckets_types.h"
-#include "keylist_types.h"
-const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c,
+ int rw, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
.key_invalid = bch2_stripe_invalid, \
.val_to_text = bch2_stripe_to_text, \
.swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_stripe, \
+ .atomic_trigger = bch2_mark_stripe, \
}
static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res;
- struct keylist keys;
- u64 inline_keys[BKEY_U64s * 8];
-
struct ec_stripe_buf new_stripe;
struct ec_stripe_buf existing_stripe;
};
int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *,
- struct bkey *);
void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_ec_exit(struct bch_fs *);
+void bch2_fs_ec_init_early(struct bch_fs *);
int bch2_fs_ec_init(struct bch_fs *);
#endif /* _BCACHEFS_EC_H */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "errcode.h"
+
+#include <linux/errname.h>
+
+static const char * const bch2_errcode_strs[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
+ BCH_ERRCODES()
+#undef x
+ NULL
+};
+
+#define BCH_ERR_0 0
+
+static unsigned bch2_errcode_parents[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
+ BCH_ERRCODES()
+#undef x
+};
+
+const char *bch2_err_str(int err)
+{
+ const char *errstr;
+ err = abs(err);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+
+ if (err >= BCH_ERR_START)
+ errstr = bch2_errcode_strs[err - BCH_ERR_START];
+ else if (err)
+ errstr = errname(err);
+ else
+ errstr = "(No error)";
+ return errstr ?: "(Invalid error)";
+}
+
+bool __bch2_err_matches(int err, int class)
+{
+ err = abs(err);
+ class = abs(class);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+ BUG_ON(class >= BCH_ERR_MAX);
+
+ while (err >= BCH_ERR_START && err != class)
+ err = bch2_errcode_parents[err - BCH_ERR_START];
+
+ return err == class;
+}
+
+int __bch2_err_class(int err)
+{
+ err = -err;
+ BUG_ON((unsigned) err >= BCH_ERR_MAX);
+
+ while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
+ err = bch2_errcode_parents[err - BCH_ERR_START];
+
+ return -err;
+}
#ifndef _BCACHEFS_ERRCODE_H
#define _BCACHEFS_ERRCODE_H
-enum {
- /* Bucket allocator: */
- OPEN_BUCKETS_EMPTY = 2048,
- FREELIST_EMPTY, /* Allocator thread not keeping up */
- INSUFFICIENT_DEVICES,
+#define BCH_ERRCODES() \
+ x(ENOSPC, ENOSPC_disk_reservation) \
+ x(ENOSPC, ENOSPC_bucket_alloc) \
+ x(ENOSPC, ENOSPC_disk_label_add) \
+ x(ENOSPC, ENOSPC_stripe_create) \
+ x(ENOSPC, ENOSPC_stripe_reuse) \
+ x(ENOSPC, ENOSPC_inode_create) \
+ x(ENOSPC, ENOSPC_str_hash_create) \
+ x(ENOSPC, ENOSPC_snapshot_create) \
+ x(ENOSPC, ENOSPC_subvolume_create) \
+ x(ENOSPC, ENOSPC_sb) \
+ x(ENOSPC, ENOSPC_sb_journal) \
+ x(ENOSPC, ENOSPC_sb_quota) \
+ x(ENOSPC, ENOSPC_sb_replicas) \
+ x(ENOSPC, ENOSPC_sb_members) \
+ x(0, open_buckets_empty) \
+ x(0, freelist_empty) \
+ x(BCH_ERR_freelist_empty, no_buckets_found) \
+ x(0, insufficient_devices) \
+ x(0, transaction_restart) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
+ x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\
+ x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
+ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
+ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
+ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
+ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
+ x(BCH_ERR_transaction_restart, transaction_restart_nested) \
+ x(0, no_btree_node) \
+ x(BCH_ERR_no_btree_node, no_btree_node_relock) \
+ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
+ x(BCH_ERR_no_btree_node, no_btree_node_drop) \
+ x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \
+ x(BCH_ERR_no_btree_node, no_btree_node_up) \
+ x(BCH_ERR_no_btree_node, no_btree_node_down) \
+ x(BCH_ERR_no_btree_node, no_btree_node_init) \
+ x(BCH_ERR_no_btree_node, no_btree_node_cached) \
+ x(0, backpointer_to_overwritten_btree_node) \
+ x(0, lock_fail_root_changed) \
+ x(0, journal_reclaim_would_deadlock) \
+ x(0, fsck) \
+ x(BCH_ERR_fsck, fsck_fix) \
+ x(BCH_ERR_fsck, fsck_ignore) \
+ x(BCH_ERR_fsck, fsck_errors_not_fixed) \
+ x(BCH_ERR_fsck, fsck_repair_unimplemented) \
+ x(BCH_ERR_fsck, fsck_repair_impossible) \
+ x(0, need_snapshot_cleanup) \
+ x(0, need_topology_repair)
+
+enum bch_errcode {
+ BCH_ERR_START = 2048,
+#define x(class, err) BCH_ERR_##err,
+ BCH_ERRCODES()
+#undef x
+ BCH_ERR_MAX
};
+const char *bch2_err_str(int);
+bool __bch2_err_matches(int, int);
+
+static inline bool _bch2_err_matches(int err, int class)
+{
+ return err && __bch2_err_matches(err, class);
+}
+
+#define bch2_err_matches(_err, _class) \
+({ \
+ BUILD_BUG_ON(!__builtin_constant_p(_class)); \
+ _bch2_err_matches(_err, _class); \
+})
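+
+/*
+ * Example (illustrative): a private error code matches itself and every
+ * ancestor class in the table above, so
+ *
+ *	bch2_err_matches(-BCH_ERR_no_btree_node_relock, BCH_ERR_no_btree_node)
+ *
+ * is true; standard errnos only match themselves, e.g.
+ * bch2_err_matches(-EIO, EIO).
+ */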
+
+int __bch2_err_class(int);
+
+static inline long bch2_err_class(long err)
+{
+ return err < 0 ? __bch2_err_class(err) : err;
+}
+
#endif /* _BCACHFES_ERRCODE_H */
#include "tools-util.h"
#endif
-enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
- const char *fmt, ...)
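+/*
+ * Find (or allocate) the ratelimit state tracking this error format string;
+ * returns NULL once fsck is done or if allocation fails, in which case the
+ * error won't be ratelimited.
+ */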
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
{
- struct fsck_err_state *s = NULL;
- va_list args;
- bool fix = false, print = true, suppressing = false;
- char _buf[sizeof(s->buf)], *buf = _buf;
-
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
- va_start(args, fmt);
- vprintk(fmt, args);
- va_end(args);
-
- if (c->opts.errors == BCH_ON_ERROR_continue) {
- bch_err(c, "fixing");
- return FSCK_ERR_FIX;
- } else {
- bch2_inconsistent_error(c);
- return FSCK_ERR_EXIT;
- }
- }
+ struct fsck_err_state *s;
- mutex_lock(&c->fsck_error_lock);
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ return NULL;
list_for_each_entry(s, &c->fsck_errors, list)
- if (s->fmt == fmt)
- goto found;
+ if (s->fmt == fmt) {
+ /*
+ * move it to the head of the list: repeated fsck errors
+ * are common
+ */
+ list_move(&s->list, &c->fsck_errors);
+ return s;
+ }
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s) {
if (!c->fsck_alloc_err)
bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
c->fsck_alloc_err = true;
- buf = _buf;
- goto print;
+ return NULL;
}
INIT_LIST_HEAD(&s->list);
s->fmt = fmt;
-found:
- list_move(&s->list, &c->fsck_errors);
- s->nr++;
- if (c->opts.ratelimit_errors &&
- !(flags & FSCK_NO_RATELIMIT) &&
- s->nr >= FSCK_ERR_RATELIMIT_NR) {
- if (s->nr == FSCK_ERR_RATELIMIT_NR)
- suppressing = true;
- else
- print = false;
+ s->buf = PRINTBUF;
+ list_add(&s->list, &c->fsck_errors);
+ return s;
+}
+
+int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
+{
+ struct fsck_err_state *s = NULL;
+ va_list args;
+ bool print = true, suppressing = false, inconsistent = false;
+ struct printbuf buf = PRINTBUF, *out = &buf;
+ int ret = -BCH_ERR_fsck_ignore;
+
+ mutex_lock(&c->fsck_error_lock);
+ s = fsck_err_get(c, fmt);
+ if (s) {
+ if (c->opts.ratelimit_errors &&
+ !(flags & FSCK_NO_RATELIMIT) &&
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
+ suppressing = true;
+ else
+ print = false;
+ }
+
+ printbuf_reset(&s->buf);
+ out = &s->buf;
+ s->nr++;
}
- buf = s->buf;
-print:
+
+ if (!strncmp(fmt, "bcachefs:", 9))
+ prt_printf(out, bch2_log_msg(c, ""));
+
va_start(args, fmt);
- vscnprintf(buf, sizeof(_buf), fmt, args);
+ prt_vprintf(out, fmt, args);
va_end(args);
- if (c->opts.fix_errors == FSCK_OPT_EXIT) {
- bch_err(c, "%s, exiting", buf);
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+ if (c->opts.errors != BCH_ON_ERROR_continue ||
+ !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
+ prt_str(out, ", shutting down");
+ inconsistent = true;
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ } else if (flags & FSCK_CAN_FIX) {
+ prt_str(out, ", fixing");
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", continuing");
+ ret = -BCH_ERR_fsck_ignore;
+ }
+ } else if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+ prt_str(out, ", exiting");
+ ret = -BCH_ERR_fsck_errors_not_fixed;
} else if (flags & FSCK_CAN_FIX) {
if (c->opts.fix_errors == FSCK_OPT_ASK) {
- printk(KERN_ERR "%s: fix?", buf);
- fix = ask_yn();
+ prt_str(out, ": fix?");
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+ print = false;
+ ret = ask_yn()
+ ? -BCH_ERR_fsck_fix
+ : -BCH_ERR_fsck_ignore;
} else if (c->opts.fix_errors == FSCK_OPT_YES ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
- if (print)
- bch_err(c, "%s, fixing", buf);
- fix = true;
+ prt_str(out, ", fixing");
+ ret = -BCH_ERR_fsck_fix;
} else {
- if (print)
- bch_err(c, "%s, not fixing", buf);
- fix = false;
+ prt_str(out, ", not fixing");
}
} else if (flags & FSCK_NEED_FSCK) {
- if (print)
- bch_err(c, "%s (run fsck to correct)", buf);
+ prt_str(out, " (run fsck to correct)");
} else {
- if (print)
- bch_err(c, "%s (repair unimplemented)", buf);
+ prt_str(out, " (repair unimplemented)");
}
- if (suppressing)
+ if (ret == -BCH_ERR_fsck_ignore &&
+ (c->opts.fix_errors == FSCK_OPT_EXIT ||
+ !(flags & FSCK_CAN_IGNORE)))
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+
+ if (print)
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+
+ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) &&
+ (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore))
+ bch_err(c, "Unable to continue, halting");
+ else if (suppressing)
bch_err(c, "Ratelimiting new instances of previous error");
mutex_unlock(&c->fsck_error_lock);
- if (fix) {
+ printbuf_exit(&buf);
+
+ if (inconsistent)
+ bch2_inconsistent_error(c);
+
+ if (ret == -BCH_ERR_fsck_fix) {
set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
- return FSCK_ERR_FIX;
} else {
set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
set_bit(BCH_FS_ERROR, &c->flags);
- return c->opts.fix_errors == FSCK_OPT_EXIT ||
- !(flags & FSCK_CAN_IGNORE)
- ? FSCK_ERR_EXIT
- : FSCK_ERR_IGNORE;
}
+
+ return ret;
}
void bch2_flush_fsck_errs(struct bch_fs *c)
list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
if (s->ratelimited)
- bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf.buf);
list_del(&s->list);
+ printbuf_exit(&s->buf);
kfree(s);
}
#define bch2_fs_inconsistent_on(cond, c, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_fs_inconsistent(c, __VA_ARGS__); \
#define bch2_dev_inconsistent_on(cond, ca, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_dev_inconsistent(ca, __VA_ARGS__); \
_ret; \
})
+/*
+ * When a transaction update discovers or is causing a fs inconsistency, it's
+ * helpful to also dump the pending updates:
+ */
+#define bch2_trans_inconsistent(trans, ...) \
+({ \
+ bch_err(trans->c, __VA_ARGS__); \
+ bch2_inconsistent_error(trans->c); \
+ bch2_dump_trans_updates(trans); \
+})
+
+#define bch2_trans_inconsistent_on(cond, trans, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_trans_inconsistent(trans, __VA_ARGS__); \
+ _ret; \
+})
+
/*
* Fsck errors: inconsistency errors we detect at mount time, and should ideally
* be able to repair:
*/
-enum {
- BCH_FSCK_OK = 0,
- BCH_FSCK_ERRORS_NOT_FIXED = 1,
- BCH_FSCK_REPAIR_UNIMPLEMENTED = 2,
- BCH_FSCK_REPAIR_IMPOSSIBLE = 3,
- BCH_FSCK_UNKNOWN_VERSION = 4,
-};
-
enum fsck_err_opts {
FSCK_OPT_EXIT,
FSCK_OPT_YES,
FSCK_OPT_ASK,
};
-enum fsck_err_ret {
- FSCK_ERR_IGNORE = 0,
- FSCK_ERR_FIX = 1,
- FSCK_ERR_EXIT = 2,
- FSCK_ERR_START_TOPOLOGY_REPAIR = 3,
-};
-
struct fsck_err_state {
struct list_head list;
const char *fmt;
u64 nr;
bool ratelimited;
- char buf[512];
+ struct printbuf buf;
};
#define FSCK_CAN_FIX (1 << 0)
#define FSCK_NO_RATELIMIT (1 << 3)
__printf(3, 4) __cold
-enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
- unsigned, const char *, ...);
+int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...);
void bch2_flush_fsck_errs(struct bch_fs *);
#define __fsck_err(c, _flags, msg, ...) \
({ \
- int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
+ int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \
\
- if (_fix == FSCK_ERR_EXIT) { \
- bch_err(c, "Unable to continue, halting"); \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ if (_ret != -BCH_ERR_fsck_fix && \
+ _ret != -BCH_ERR_fsck_ignore) { \
+ ret = _ret; \
goto fsck_err; \
} \
\
- _fix; \
+ _ret == -BCH_ERR_fsck_fix; \
})
/* These macros return true if error should be fixed: */
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
#define __fsck_err_on(cond, c, _flags, ...) \
- ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
+ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
#define need_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
#define bch2_fs_fatal_err_on(cond, c, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_fs_fatal_error(c, __VA_ARGS__); \
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- unsigned ret = 0;
+ unsigned ret = 0, lru = 0;
bkey_extent_entry_for_each(ptrs, entry) {
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
+ /* Might also be updating LRU btree */
+ if (entry->ptr.cached)
+ lru++;
+
+ fallthrough;
case BCH_EXTENT_ENTRY_stripe_ptr:
ret++;
}
}
- return ret;
+ /*
+ * Updating keys in the alloc btree may also update keys in the
+ * freespace or discard btrees:
+ */
+ return lru + ret * 2;
}
static int count_iters_for_insert(struct btree_trans *trans,
#include <trace/events/bcachefs.h>
+static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
+
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
/* KEY_TYPE_btree_ptr: */
-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX)
- return "value too big";
+ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
+ prt_printf(err, "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
+ return -EINVAL;
+ }
- return bch2_bkey_ptrs_invalid(c, k);
+ return bch2_bkey_ptrs_invalid(c, k, rw, err);
}
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- if (bkey_val_bytes(k.k) <= sizeof(*bp.v))
- return "value too small";
+ if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) {
+ prt_printf(err, "value too small (%zu <= %zu)",
+ bkey_val_bytes(k.k), sizeof(*bp.v));
+ return -EINVAL;
+ }
- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
- return "value too big";
+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {
+ prt_printf(err, "value too big (%zu > %zu)",
+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+ return -EINVAL;
+ }
if (c->sb.version < bcachefs_metadata_version_snapshot &&
- bp.v->min_key.snapshot)
- return "invalid min_key.snapshot";
+ bp.v->min_key.snapshot) {
+ prt_printf(err, "invalid min_key.snapshot (%u != 0)",
+ bp.v->min_key.snapshot);
+ return -EINVAL;
+ }
- return bch2_bkey_ptrs_invalid(c, k);
+ return bch2_bkey_ptrs_invalid(c, k, rw, err);
}
void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+ struct bkey_s_c k)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- pr_buf(out, "seq %llx written %u min_key %s",
+ prt_printf(out, "seq %llx written %u min_key %s",
le64_to_cpu(bp.v->seq),
le16_to_cpu(bp.v->sectors_written),
BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
bch2_bpos_to_text(out, bp.v->min_key);
- pr_buf(out, " ");
+ prt_printf(out, " ");
bch2_bkey_ptrs_to_text(out, c, k);
}
/* KEY_TYPE_extent: */
-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
- return bch2_bkey_ptrs_invalid(c, k);
-}
-
-void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
lp.crc.uncompressed_size) {
/* can use left extent's crc entry */
- } else if (lp.crc.live_size <= rp.crc.offset ) {
+ } else if (lp.crc.live_size <= rp.crc.offset) {
/* can use right extent's crc entry */
} else {
/* check if checksums can be merged: */
lp.crc.uncompressed_size +
rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
return false;
+ }
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
- if (lp.crc.uncompressed_size + rp.crc.uncompressed_size >
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+ if (crc_l.uncompressed_size + crc_r.uncompressed_size >
bch2_crc_field_size_max[extent_entry_type(en_l)])
return false;
}
if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
crc_l.uncompressed_size) {
/* can use left extent's crc entry */
- } else if (crc_l.live_size <= crc_r.offset ) {
+ } else if (crc_l.live_size <= crc_r.offset) {
/* can use right extent's crc entry */
crc_r.offset -= crc_l.live_size;
bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
/* KEY_TYPE_reservation: */
-const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(*r.v));
+ return -EINVAL;
+ }
- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
- return "invalid nr_replicas";
+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {
+ prt_printf(err, "invalid nr_replicas (%u)",
+ r.v->nr_replicas);
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- pr_buf(out, "generation %u replicas %u",
+ prt_printf(out, "generation %u replicas %u",
le32_to_cpu(r.v->generation),
r.v->nr_replicas);
}
return durability;
}
-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
- unsigned target,
- unsigned nr_desired_replicas)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
-
- if (target && extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
-
- if (n && n <= extra &&
- !bch2_dev_in_target(c, p.ptr.dev, target)) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
-
- if (extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
-
- if (n && n <= extra) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
-}
-
void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
{
union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
/*
* Returns pointer to the next entry after the one being dropped:
*/
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry = to_entry(ptr), *next;
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
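+/* Drop the pointer to the given device from @k, if one is present: */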
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
+{
+ struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev);
+
+ if (ptr)
+ __bch2_bkey_drop_ptr(k, ptr);
+}
+
const struct bch_extent_ptr *
bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
{
return false;
}
+/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+ const union bch_extent_entry *entry1, *entry2;
+ struct extent_ptr_decoded p1, p2;
+
+ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return true;
+
+ return false;
+}
+
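+/*
+ * Returns true if @k2 has a pointer that refers to the same data as @p1, a
+ * pointer previously decoded from @k1:
+ */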
+bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
+ struct bkey_s_c k2)
+{
+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+ const union bch_extent_entry *entry2;
+ struct extent_ptr_decoded p2;
+
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return true;
+
+ return false;
+}
+
/*
* bch_extent_normalize - clean up an extent, dropping stale pointers etc.
*
bkey_extent_entry_for_each(ptrs, entry) {
if (!first)
- pr_buf(out, " ");
+ prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
- ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
- pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "",
- ca && ptr_stale(ca, ptr)
- ? " stale" : "");
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev,
+ b, offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+
+ if (ca && ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
case BCH_EXTENT_ENTRY_stripe_ptr:
ec = &entry->stripe_ptr;
- pr_buf(out, "ec: idx %llu block %u",
+ prt_printf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
default:
- pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+ prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
}
}
}
-static const char *extent_ptr_invalid(const struct bch_fs *c,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata)
+static int extent_ptr_invalid(const struct bch_fs *c,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk,
+ bool metadata,
+ struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr2;
+ u64 bucket;
+ u32 bucket_offset;
struct bch_dev *ca;
- if (!bch2_dev_exists2(c, ptr->dev))
- return "pointer to invalid device";
+ if (!bch2_dev_exists2(c, ptr->dev)) {
+ prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
+ return -EINVAL;
+ }
ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!ca)
- return "pointer to invalid device";
-
bkey_for_each_ptr(ptrs, ptr2)
- if (ptr != ptr2 && ptr->dev == ptr2->dev)
- return "multiple pointers to same device";
+ if (ptr != ptr2 && ptr->dev == ptr2->dev) {
+ prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
+ return -EINVAL;
+ }
- if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
- return "offset past end of device";
+ bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
- if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
- return "offset before first bucket";
+ if (bucket >= ca->mi.nbuckets) {
+ prt_printf(err, "pointer past last bucket (%llu > %llu)",
+ bucket, ca->mi.nbuckets);
+ return -EINVAL;
+ }
- if (bucket_remainder(ca, ptr->offset) +
- size_ondisk > ca->mi.bucket_size)
- return "spans multiple buckets";
+ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
+ prt_printf(err, "pointer before first bucket (%llu < %u)",
+ bucket, ca->mi.first_bucket);
+ return -EINVAL;
+ }
- return NULL;
+ if (bucket_offset + size_ondisk > ca->mi.bucket_size) {
+ prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
+ bucket_offset, size_ondisk, ca->mi.bucket_size);
+ return -EINVAL;
+ }
+
+ return 0;
}
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_devs_list devs;
const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
unsigned size_ondisk = k.k->size;
- const char *reason;
unsigned nonce = UINT_MAX;
- unsigned i;
+ unsigned nr_ptrs = 0;
+ int ret;
- if (k.k->type == KEY_TYPE_btree_ptr ||
- k.k->type == KEY_TYPE_btree_ptr_v2)
+ if (bkey_is_btree_ptr(k.k))
size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
- return "invalid extent entry type";
+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
+ prt_printf(err, "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
+ return -EINVAL;
+ }
- if (k.k->type == KEY_TYPE_btree_ptr &&
- !extent_entry_is_ptr(entry))
- return "has non ptr field";
+ if (bkey_is_btree_ptr(k.k) &&
+ !extent_entry_is_ptr(entry)) {
+ prt_printf(err, "has non ptr field");
+ return -EINVAL;
+ }
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
- reason = extent_ptr_invalid(c, k, &entry->ptr,
- size_ondisk, false);
- if (reason)
- return reason;
+ ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk,
+ false, err);
+ if (ret)
+ return ret;
+ nr_ptrs++;
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
if (crc.offset + crc.live_size >
- crc.uncompressed_size)
- return "checksum offset + key size > uncompressed size";
+ crc.uncompressed_size) {
+ prt_printf(err, "checksum offset + key size > uncompressed size");
+ return -EINVAL;
+ }
size_ondisk = crc.compressed_size;
- if (!bch2_checksum_type_valid(c, crc.csum_type))
- return "invalid checksum type";
+ if (!bch2_checksum_type_valid(c, crc.csum_type)) {
+ prt_printf(err, "invalid checksum type");
+ return -EINVAL;
+ }
- if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR)
- return "invalid compression type";
+ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
+ prt_printf(err, "invalid compression type");
+ return -EINVAL;
+ }
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
- else if (nonce != crc.offset + crc.nonce)
- return "incorrect nonce";
+ else if (nonce != crc.offset + crc.nonce) {
+ prt_printf(err, "incorrect nonce");
+ return -EINVAL;
+ }
}
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
}
}
- devs = bch2_bkey_devs(k);
- bubble_sort(devs.devs, devs.nr, u8_cmp);
- for (i = 0; i + 1 < devs.nr; i++)
- if (devs.devs[i] == devs.devs[i + 1])
- return "multiple ptrs to same device";
+ if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
+ prt_str(err, "too many ptrs");
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
void bch2_ptr_swab(struct bkey_s k)
/* KEY_TYPE_btree_ptr: */
-const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
int, struct bkey_s);
.key_invalid = bch2_btree_ptr_invalid, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
}
#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
}
/* KEY_TYPE_extent: */
-const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_extent (struct bkey_ops) { \
- .key_invalid = bch2_extent_invalid, \
- .val_to_text = bch2_extent_to_text, \
+ .key_invalid = bch2_bkey_ptrs_invalid, \
+ .val_to_text = bch2_bkey_ptrs_to_text, \
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
.key_merge = bch2_extent_merge, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
}
/* KEY_TYPE_reservation: */
-const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c,
+ int, struct printbuf *);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.key_invalid = bch2_reservation_invalid, \
.val_to_text = bch2_reservation_to_text, \
.key_merge = bch2_reservation_merge, \
+ .trans_trigger = bch2_trans_mark_reservation, \
+ .atomic_trigger = bch2_mark_reservation, \
}
/* Extent checksum entries: */
unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
- unsigned, unsigned);
-
void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
- struct bch_extent_ptr *);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
} while (0)
void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
struct bch_extent_ptr, u64);
+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
+bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
+ int, struct printbuf *);
void bch2_ptr_swab(struct bkey_s);
goto err;
inode_u->bi_ctime = now;
- bch2_inode_nlink_inc(inode_u);
+ ret = bch2_inode_nlink_inc(inode_u);
+ if (ret)
+ return ret;
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
+ if (bch2_reinherit_attrs(inode_u, dir_u)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
dir_u->bi_mtime = dir_u->bi_ctime = now;
dir_hash = bch2_hash_info_init(c, dir_u);
if (ret)
goto err;
} else {
- bch2_inode_nlink_dec(inode_u);
+ bch2_inode_nlink_dec(trans, inode_u);
}
if (inode_u->bi_dir == dirent_iter.pos.inode &&
}
if (mode == BCH_RENAME_OVERWRITE)
- bch2_inode_nlink_dec(dst_inode_u);
+ bch2_inode_nlink_dec(trans, dst_inode_u);
src_dir_u->bi_mtime = now;
src_dir_u->bi_ctime = now;
ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
(src_dir.inum != dst_dir.inum
? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
- : 0 ) ?:
+ : 0) ?:
bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
(dst_inum.inum
? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
- : 0 );
+ : 0);
err:
bch2_trans_iter_exit(trans, &dst_inode_iter);
bch2_trans_iter_exit(trans, &src_inode_iter);
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
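+/*
+ * Returns true if @bio has no free bvecs, or if adding @len more bytes would
+ * overflow bi_size:
+ */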
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return true;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return true;
+ return false;
+}
+
static inline struct address_space *faults_disabled_mapping(void)
{
return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
static int bch2_quota_reservation_add(struct bch_fs *c,
struct bch_inode_info *inode,
struct quota_res *res,
- unsigned sectors,
+ u64 sectors,
bool check_enospc)
{
int ret;
return;
mutex_lock(&inode->ei_quota_lock);
- BUG_ON((s64) inode->v.i_blocks + sectors < 0);
+ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+ inode->ei_inode.bi_sectors);
inode->v.i_blocks += sectors;
#ifdef CONFIG_BCACHEFS_QUOTA
offset = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
{
pgoff_t index = start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct pagevec pvec;
+ struct folio_batch fbatch;
+ unsigned i, j;
if (end <= start)
return;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
- do {
- unsigned nr_pages, i, j;
-
- nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
- &index, end_index);
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
- u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ u64 pg_start = folio->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT;
unsigned pg_offset = max(start, pg_start) - pg_start;
unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
struct bch_page_state *s;
BUG_ON(pg_offset >= PAGE_SECTORS);
BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
- lock_page(page);
- s = bch2_page_state(page);
+ folio_lock(folio);
+ s = bch2_page_state(&folio->page);
if (s) {
spin_lock(&s->lock);
spin_unlock(&s->lock);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
- } while (index <= end_index);
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
}
static void mark_pagecache_reserved(struct bch_inode_info *inode,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
pgoff_t index = start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct pagevec pvec;
+ struct folio_batch fbatch;
s64 i_sectors_delta = 0;
+ unsigned i, j;
if (end <= start)
return;
- pagevec_init(&pvec);
-
- do {
- unsigned nr_pages, i, j;
+ folio_batch_init(&fbatch);
- nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
- &index, end_index);
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
- u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ u64 pg_start = folio->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT;
unsigned pg_offset = max(start, pg_start) - pg_start;
unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
struct bch_page_state *s;
BUG_ON(pg_offset >= PAGE_SECTORS);
BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
- lock_page(page);
- s = bch2_page_state(page);
+ folio_lock(folio);
+ s = bch2_page_state(&folio->page);
if (s) {
spin_lock(&s->lock);
spin_unlock(&s->lock);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
- } while (index <= end_index);
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
i_sectors_acct(c, inode, NULL, i_sectors_delta);
}
static int bch2_page_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode, struct page *page,
struct bch2_page_reservation *res,
- unsigned offset, unsigned len, bool check_enospc)
+ unsigned offset, unsigned len)
{
struct bch_page_state *s = bch2_page_state_create(page, 0);
unsigned i, disk_sectors = 0, quota_sectors = 0;
}
if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &res->disk,
- disk_sectors,
- !check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL
- : 0);
+ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
if (unlikely(ret))
return ret;
}
if (quota_sectors) {
ret = bch2_quota_reservation_add(c, inode, &res->quota,
- quota_sectors,
- check_enospc);
+ quota_sectors, true);
if (unlikely(ret)) {
struct disk_reservation tmp = {
.sectors = disk_sectors
}
}
- if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
+ if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
goto out;
return ret;
}
-void bch2_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
- if (offset || length < PAGE_SIZE)
+ if (offset || length < folio_size(folio))
return;
- bch2_clear_page_bits(page);
+ bch2_clear_page_bits(&folio->page);
}
-int bch2_releasepage(struct page *page, gfp_t gfp_mask)
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
{
- if (PageDirty(page))
- return 0;
-
- bch2_clear_page_bits(page);
- return 1;
-}
-
-#ifdef CONFIG_MIGRATION
-int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
-{
- int ret;
-
- EBUG_ON(!PageLocked(page));
- EBUG_ON(!PageLocked(newpage));
-
- ret = migrate_page_move_mapping(mapping, newpage, page, 0);
- if (ret != MIGRATEPAGE_SUCCESS)
- return ret;
+ if (folio_test_dirty(folio) || folio_test_writeback(folio))
+ return false;
- if (PagePrivate(page))
- attach_page_private(newpage, detach_page_private(page));
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
- else
- migrate_page_states(newpage, page);
- return MIGRATEPAGE_SUCCESS;
+ bch2_clear_page_bits(&folio->page);
+ return true;
}
-#endif
/* readpage(s): */
* read_extent -> io_time_reset may cause a transaction restart
* without returning an error, we need to check for that here:
*/
- if (!bch2_trans_relock(trans)) {
- ret = -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
break;
- }
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
err:
bch2_trans_iter_exit(trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (ret) {
readpages_iter.idx,
BIO_MAX_VECS);
struct bch_read_bio *rbio =
- rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
+ rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+ GFP_NOFS, &c->bio_read),
opts);
readpages_iter.idx++;
- bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT;
rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bch2_trans_exit(&trans);
}
-int bch2_readpage(struct file *file, struct page *page)
-{
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
- struct bch_read_bio *rbio;
-
- rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
- rbio->bio.bi_end_io = bch2_readpages_end_io;
-
- __bchfs_readpage(c, rbio, inode_inum(inode), page);
- return 0;
-}
-
static void bch2_read_single_page_end_io(struct bio *bio)
{
complete(bio->bi_private);
int ret;
DECLARE_COMPLETION_ONSTACK(done);
- rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read),
+ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read),
io_opts(c, &inode->ei_inode));
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io;
return 0;
}
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+ struct page *page = &folio->page;
+ int ret;
+
+ ret = bch2_read_single_page(page, page->mapping);
+ folio_unlock(folio);
+ return bch2_err_class(ret);
+}
+
/* writepages: */
struct bch_writepage_state {
struct bio_vec *bvec;
unsigned i;
- up(&io->op.c->io_in_flight);
-
if (io->op.error) {
set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
* racing with fallocate can cause us to add fewer sectors than
* expected - but we shouldn't add more sectors than expected:
*/
- WARN_ON(io->op.i_sectors_delta > 0);
+ WARN_ON_ONCE(io->op.i_sectors_delta > 0);
/*
* (error (due to going RO) halfway through a page can screw that up
{
struct bch_writepage_io *io = w->io;
- down(&io->op.c->io_in_flight);
-
w->io = NULL;
closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
continue_at(&io->cl, bch2_writepage_io_done, NULL);
{
struct bch_write_op *op;
- w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS,
+ w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+ REQ_OP_WRITE,
+ GFP_NOFS,
&c->writepage_bioset),
struct bch_writepage_io, op.wbio.bio);
sectors << 9, offset << 9));
/* Check for writing past i_size: */
- WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)));
+ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+ round_up(i_size, block_bytes(c)));
w->io->op.res.sectors += reserved_sectors;
w->io->op.i_sectors_delta -= dirty_sectors;
if (w.io)
bch2_writepage_do_io(&w);
blk_finish_plug(&plug);
- return ret;
-}
-
-int bch2_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
- int ret;
-
- ret = __bch2_writepage(page, wbc, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
-
- return ret;
+ return bch2_err_class(ret);
}
/* buffered writes: */
int bch2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
goto err_unlock;
if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
if (ret)
- goto out;
+ goto err;
}
- ret = bch2_page_reservation_get(c, inode, page, res,
- offset, len, true);
+ ret = bch2_page_reservation_get(c, inode, page, res, offset, len);
if (ret) {
if (!PageUptodate(page)) {
/*
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
kfree(res);
*fsdata = NULL;
- return ret;
+ return bch2_err_class(ret);
}
int bch2_write_end(struct file *file, struct address_space *mapping,
bch2_page_reservation_init(c, inode, &res);
for (i = 0; i < nr_pages; i++) {
- pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
+ pages[i] = grab_cache_page_write_begin(mapping, index + i);
if (!pages[i]) {
nr_pages = i;
if (!i) {
}
ret = bch2_page_reservation_get(c, inode, page, &res,
- pg_offset, pg_len, true);
+ pg_offset, pg_len);
if (ret)
goto out;
unsigned pg_len = min_t(unsigned, len - copied,
PAGE_SIZE - pg_offset);
unsigned pg_copied = copy_page_from_iter_atomic(page,
- pg_offset, pg_len,iter);
+ pg_offset, pg_len, iter);
if (!pg_copied)
break;
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
- if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
bytes = min_t(unsigned long, iov_iter_count(iter),
PAGE_SIZE - offset);
- if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
ret = -EFAULT;
break;
}
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
- dio->req->ki_complete(dio->req, dio->ret, 0);
+ dio->req->ki_complete(dio->req, dio->ret);
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}
shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
iter->count -= shorten;
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_VECS),
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
&c->dio_read_bioset);
bio->bi_end_io = bch2_direct_IO_read_endio;
goto start;
while (iter->count) {
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_VECS),
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
&c->bio_read);
bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
iocb->ki_pos,
iocb->ki_pos + count - 1);
if (ret < 0)
- return ret;
+ goto out;
file_accessed(file);
ret = generic_file_read_iter(iocb, iter);
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
}
-
- return ret;
+out:
+ return bch2_err_class(ret);
}
/* O_DIRECT writes */
offset = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (err == -EINTR)
+ if (bch2_err_matches(err, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
if (dio->loop)
goto loop;
- down(&c->io_in_flight);
-
while (1) {
iter_count = dio->iter.count;
- if (kthread)
+ if (kthread && dio->mm)
kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
dropped_locks = fdm_dropped_locks();
current->faults_disabled_mapping = NULL;
- if (kthread)
+ if (kthread && dio->mm)
kthread_unuse_mm(dio->mm);
/*
struct iovec *iov = dio->inline_vecs;
if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
- GFP_KERNEL);
+ iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+ GFP_KERNEL);
if (unlikely(!iov)) {
dio->sync = sync = true;
goto do_io;
if (!dio->iter.count)
break;
- bio_reset(bio);
+ bio_reset(bio, NULL, REQ_OP_WRITE);
reinit_completion(&dio->done);
}
ret = dio->op.error ?: ((long) dio->written << 9);
err:
- up(&c->io_in_flight);
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+
if (!sync) {
- req->ki_complete(req, ret, 0);
+ req->ki_complete(req, ret);
ret = -EIOCBQUEUED;
}
return ret;
locked = false;
}
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_is_bvec(iter)
- ? 0
- : iov_iter_npages(iter, BIO_MAX_VECS),
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_WRITE,
+ GFP_KERNEL,
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
init_completion(&dio->done);
struct bch_inode_info *inode = file_bch_inode(file);
ssize_t ret;
- if (iocb->ki_flags & IOCB_DIRECT)
- return bch2_direct_write(iocb, from);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = bch2_direct_write(iocb, from);
+ goto out;
+ }
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(&inode->v);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
-
- return ret;
+out:
+ return bch2_err_class(ret);
}
/* fsync: */
ret2 = sync_inode_metadata(&inode->v, 1);
ret3 = bch2_flush_inode(c, inode_inum(inode));
- return ret ?: ret2 ?: ret3;
+ return bch2_err_class(ret ?: ret2 ?: ret3);
}
/* truncate: */
start = iter.pos;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
U64_MAX, &i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
- WARN_ON(!inode->v.i_size && inode->v.i_blocks &&
- !bch2_journal_error(&c->journal));
-
+ bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
+ !bch2_journal_error(&c->journal), c,
+ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks,
+ inode->ei_inode.bi_sectors);
if (unlikely(ret))
goto err;
ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
err:
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- return ret;
+ return bch2_err_class(ret);
}
/* fallocate: */
truncate_pagecache_range(&inode->v, offset, end - 1);
- if (block_start < block_end ) {
+ if (block_start < block_end) {
s64 i_sectors_delta = 0;
ret = bch2_fpunch(c, inode_inum(inode),
bch2_trans_copy_iter(&dst, &src);
bch2_trans_copy_iter(&del, &src);
- while (ret == 0 || ret == -EINTR) {
+ while (ret == 0 ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
- if (copy.k->k.size == k.k->size) {
- /*
- * If we're moving the entire extent, we can skip
- * running triggers:
- */
- trigger_flags |= BTREE_TRIGGER_NORUN;
- } else {
+ if (copy.k->k.size != k.k->size) {
/* We might end up splitting compressed extents: */
unsigned nr_ptrs =
bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
bkey_err:
	bch2_quota_reservation_put(c, inode, &quota_res);
bch2_disk_reservation_put(c, &disk_res);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
}
bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
- if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) {
+ if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
struct quota_res quota_res = { 0 };
s64 i_sectors_delta = 0;
* so that the VFS cache i_size is consistent with the btree i_size:
*/
if (ret &&
- !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)))
+ !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
return ret;
if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
long ret;
- if (!percpu_ref_tryget(&c->writes))
+ if (!percpu_ref_tryget_live(&c->writes))
return -EROFS;
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+ ret = file_modified(file);
+ if (ret)
+ goto err;
+
if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
ret = bchfs_fallocate(inode, mode, offset, len);
else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
ret = bchfs_fcollapse_finsert(inode, offset, len, false);
else
ret = -EOPNOTSUPP;
-
-
+err:
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
percpu_ref_put(&c->writes);
- return ret;
+ return bch2_err_class(ret);
+}
+
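+/*
+ * Reserve quota for the part of [start, end) that isn't already allocated on
+ * disk: walk the extents btree and subtract existing allocations from the
+ * reservation.
+ */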
+static int quota_reserve_range(struct bch_inode_info *inode,
+ struct quota_res *res,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot;
+ u64 sectors = end - start;
+ u64 pos = start;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, pos, snapshot), 0);
+
+ while (!(ret = btree_trans_too_many_iters(&trans)) &&
+ (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
+ !(ret = bkey_err(k))) {
+ if (bkey_extent_is_allocation(k.k)) {
+ u64 s = min(end, k.k->p.offset) -
+ max(start, bkey_start_offset(k.k));
+ BUG_ON(s > sectors);
+ sectors -= s;
+ }
+ bch2_btree_iter_advance(&iter);
+ }
+ pos = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_exit(&trans);
+
+ if (ret)
+ return ret;
+
+ return bch2_quota_reservation_add(c, inode, res, sectors, true);
}
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct bch_inode_info *src = file_bch_inode(file_src);
struct bch_inode_info *dst = file_bch_inode(file_dst);
struct bch_fs *c = src->v.i_sb->s_fs_info;
+ struct quota_res quota_res = { 0 };
s64 i_sectors_delta = 0;
u64 aligned_len;
loff_t ret = 0;
bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
- file_update_time(file_dst);
-
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
if (ret)
goto err;
+	ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
+ (pos_dst + aligned_len) >> 9);
+ if (ret)
+ goto err;
+
+ file_update_time(file_dst);
+
mark_pagecache_unallocated(src, pos_src >> 9,
(pos_src + aligned_len) >> 9);
*/
ret = min((u64) ret << 9, (u64) len);
- /* XXX get a quota reservation */
- i_sectors_acct(c, dst, NULL, i_sectors_delta);
+	i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
spin_lock(&dst->v.i_lock);
if (pos_dst + ret > dst->v.i_size)
IS_SYNC(file_inode(file_dst)))
ret = bch2_flush_inode(c, inode_inum(dst));
err:
+	bch2_quota_reservation_put(c, dst, &quota_res);
bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
- return ret;
+ return bch2_err_class(ret);
}
/* fseek: */
loff_t start_offset,
loff_t end_offset)
{
- struct address_space *mapping = vinode->i_mapping;
- struct page *page;
+ struct folio_batch fbatch;
pgoff_t start_index = start_offset >> PAGE_SHIFT;
pgoff_t end_index = end_offset >> PAGE_SHIFT;
pgoff_t index = start_index;
+ unsigned i;
loff_t ret;
int offset;
- while (index <= end_index) {
- if (find_get_pages_range(mapping, &index, end_index, 1, &page)) {
- lock_page(page);
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(vinode->i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
- offset = page_data_offset(page,
- page->index == start_index
+ folio_lock(folio);
+
+ offset = page_data_offset(&folio->page,
+ folio->index == start_index
? start_offset & (PAGE_SIZE - 1)
: 0);
if (offset >= 0) {
- ret = clamp(((loff_t) page->index << PAGE_SHIFT) +
+ ret = clamp(((loff_t) folio->index << PAGE_SHIFT) +
offset,
start_offset, end_offset);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_batch_release(&fbatch);
return ret;
}
- unlock_page(page);
- put_page(page);
- } else {
- break;
+ folio_unlock(folio);
}
+ folio_batch_release(&fbatch);
+ cond_resched();
}
return end_offset;
}
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
}
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
{
+ loff_t ret;
+
switch (whence) {
case SEEK_SET:
case SEEK_CUR:
case SEEK_END:
- return generic_file_llseek(file, offset, whence);
+ ret = generic_file_llseek(file, offset, whence);
+ break;
case SEEK_DATA:
- return bch2_seek_data(file, offset);
+ ret = bch2_seek_data(file, offset);
+ break;
case SEEK_HOLE:
- return bch2_seek_hole(file, offset);
+ ret = bch2_seek_hole(file, offset);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
}
- return -EINVAL;
+ return bch2_err_class(ret);
}
void bch2_fs_fsio_exit(struct bch_fs *c)
struct bch_inode_info *,
loff_t, unsigned);
-int bch2_writepage(struct page *, struct writeback_control *);
-int bch2_readpage(struct file *, struct page *);
+int bch2_read_folio(struct file *, struct folio *);
int bch2_writepages(struct address_space *, struct writeback_control *);
void bch2_readahead(struct readahead_control *);
int bch2_write_begin(struct file *, struct address_space *, loff_t,
- unsigned, unsigned, struct page **, void **);
+ unsigned, struct page **, void **);
int bch2_write_end(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page *, void *);
vm_fault_t bch2_page_fault(struct vm_fault *);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
-int bch2_releasepage(struct page *, gfp_t);
-int bch2_migrate_page(struct address_space *, struct page *,
- struct page *, enum migrate_mode);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
void bch2_fs_fsio_exit(struct bch_fs *);
int bch2_fs_fsio_init(struct bch_fs *);
unsigned flags;
unsigned projid;
+
+ bool set_projinherit;
+ bool projinherit;
};
static int bch2_inode_flags_set(struct bch_inode_info *inode,
(newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
return -EINVAL;
+ if (s->set_projinherit) {
+ bi->bi_fields_set &= ~(1 << Inode_opt_project);
+ bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
+ }
+
bi->bi_flags &= ~s->mask;
bi->bi_flags |= newflags;
struct fsxattr fa = { 0 };
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+
+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
+ fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
+
fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
return copy_to_user(arg, &fa, sizeof(fa));
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
+ s.set_projinherit = true;
+ s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
+ fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
if (fa.fsx_xflags)
return -EOPNOTSUPP;
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
switch (cmd) {
case FS_IOC_GETFLAGS:
- return bch2_ioc_getflags(inode, (int __user *) arg);
+ ret = bch2_ioc_getflags(inode, (int __user *) arg);
+ break;
case FS_IOC_SETFLAGS:
- return bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+ ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+ break;
case FS_IOC_FSGETXATTR:
- return bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+ ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+ break;
+
case FS_IOC_FSSETXATTR:
- return bch2_ioc_fssetxattr(c, file, inode,
- (void __user *) arg);
+ ret = bch2_ioc_fssetxattr(c, file, inode,
+ (void __user *) arg);
+ break;
case BCHFS_IOC_REINHERIT_ATTRS:
- return bch2_ioc_reinherit_attrs(c, file, inode,
- (void __user *) arg);
+ ret = bch2_ioc_reinherit_attrs(c, file, inode,
+ (void __user *) arg);
+ break;
case FS_IOC_GETVERSION:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
+
case FS_IOC_SETVERSION:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
case FS_IOC_GOINGDOWN:
- return bch2_ioc_goingdown(c, (u32 __user *) arg);
+ ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
+ break;
case BCH_IOCTL_SUBVOLUME_CREATE: {
struct bch_ioctl_subvolume i;
- if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
- return -EFAULT;
- return bch2_ioctl_subvolume_create(c, file, i);
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_create(c, file, i);
+ break;
}
case BCH_IOCTL_SUBVOLUME_DESTROY: {
struct bch_ioctl_subvolume i;
- if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
- return -EFAULT;
- return bch2_ioctl_subvolume_destroy(c, file, i);
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_destroy(c, file, i);
+ break;
}
default:
- return bch2_fs_ioctl(c, cmd, (void __user *) arg);
+ ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
+ break;
}
+
+ return bch2_err_class(ret);
}
#ifdef CONFIG_COMPAT
#include "buckets.h"
#include "chardev.h"
#include "dirent.h"
+#include "errcode.h"
#include "extents.h"
#include "fs.h"
#include "fs-common.h"
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
+#include <linux/seq_file.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>
bch2_trans_iter_exit(&trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
err_before_quota:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
goto err_trans;
}
(subvol_inum) { 0 }, 0);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return bch2_err_class(PTR_ERR(inode));
d_instantiate(dentry, &inode->v);
return 0;
mutex_lock(&inode->ei_update_lock);
bch2_trans_init(&trans, c, 4, 1024);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_link_trans(&trans,
inode_inum(dir), &dir_u,
inode_inum(inode), &inode_u,
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
bch2_trans_init(&trans, c, 4, 1024);
- ret = __bch2_trans_do(&trans, NULL, NULL,
+ ret = commit_do(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL,
bch2_unlink_trans(&trans,
inode_inum(dir), &dir_u,
inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
(subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
- if (unlikely(IS_ERR(inode)))
- return PTR_ERR(inode);
+ if (IS_ERR(inode))
+ return bch2_err_class(PTR_ERR(inode));
inode_lock(&inode->v);
ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
goto err;
}
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_rename_trans(&trans,
inode_inum(src_dir), &src_dir_u,
inode_inum(dst_dir), &dst_dir_u,
btree_err:
bch2_trans_iter_exit(&trans, &inode_iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (unlikely(ret))
goto err_trans;
err:
mutex_unlock(&inode->ei_update_lock);
- return ret;
+ return bch2_err_class(ret);
}
static int bch2_getattr(struct user_namespace *mnt_userns,
(subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return bch2_err_class(PTR_ERR(inode));
d_mark_tmpfile(dentry, &inode->v);
d_instantiate(dentry, &inode->v);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
SPOS(ei->v.i_ino, start, snapshot), 0);
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(ret = bkey_err(k)) &&
- bkey_cmp(iter.pos, end) < 0) {
+ while (!(ret = btree_trans_too_many_iters(&trans)) &&
+ (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
+ !(ret = bkey_err(k))) {
enum btree_id data_btree = BTREE_ID_extents;
if (!bkey_extent_is_data(k.k) &&
start = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (!ret && have_extent)
};
static const struct address_space_operations bch_address_space_operations = {
- .writepage = bch2_writepage,
- .readpage = bch2_readpage,
+ .read_folio = bch2_read_folio,
.writepages = bch2_writepages,
.readahead = bch2_readahead,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
- .invalidatepage = bch2_invalidatepage,
- .releasepage = bch2_releasepage,
+ .invalidate_folio = bch2_invalidate_folio,
+ .release_folio = bch2_release_folio,
.direct_IO = noop_direct_IO,
#ifdef CONFIG_MIGRATION
- .migratepage = bch2_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
#endif
.error_remove_page = generic_error_remove_page,
};
memcpy(name, d.v->d_name, name_len);
name[name_len] = '\0';
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_iter_exit(&trans, &iter1);
ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
- return ret;
+ return bch2_err_class(ret);
}
static void bch2_evict_inode(struct inode *vinode)
}
void bch2_evict_subvolume_inodes(struct bch_fs *c,
- struct snapshot_id_list *s)
+ snapshot_id_list *s)
{
struct super_block *sb = c->vfs_sb;
struct inode *inode;
static int bch2_sync_fs(struct super_block *sb, int wait)
{
struct bch_fs *c = sb->s_fs_info;
+ int ret;
if (c->opts.journal_flush_disabled)
return 0;
return 0;
}
- return bch2_journal_flush(&c->journal);
+ ret = bch2_journal_flush(&c->journal);
+ return bch2_err_class(ret);
}
static struct bch_fs *bch2_path_to_fs(const char *path)
ret = bch2_parse_mount_opts(c, &opts, data);
if (ret)
- return ret;
+ goto err;
if (opts.read_only != c->opts.read_only) {
down_write(&c->state_lock);
if (ret) {
bch_err(c, "error going rw: %i", ret);
up_write(&c->state_lock);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
sb->s_flags &= ~SB_RDONLY;
if (opts.errors >= 0)
c->opts.errors = opts.errors;
-
- return ret;
+err:
+ return bch2_err_class(ret);
}
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
enum bch_opt_id i;
- char buf[512];
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
for (i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
continue;
- bch2_opt_to_text(&PBUF(buf), c, opt, v,
+ printbuf_reset(&buf);
+ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
OPT_SHOW_MOUNT_STYLE);
seq_putc(seq, ',');
- seq_puts(seq, buf);
+ seq_puts(seq, buf.buf);
}
- return 0;
+ if (buf.allocation_failure)
+ ret = -ENOMEM;
+ printbuf_exit(&buf);
+ return ret;
}
static void bch2_put_super(struct super_block *sb)
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
c->vfs_sb = sb;
- strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+ strscpy(sb->s_id, c->name, sizeof(sb->s_id));
ret = super_setup_bdi(sb);
if (ret)
sb->s_shrink.seeks = 0;
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
- if (IS_ERR(vinode)) {
- bch_err(c, "error mounting: error getting root inode %i",
- (int) PTR_ERR(vinode));
- ret = PTR_ERR(vinode);
+ ret = PTR_ERR_OR_ZERO(vinode);
+ if (ret) {
+ bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret));
goto err_put_super;
}
void bch2_vfs_exit(void)
{
unregister_filesystem(&bcache_fs_type);
- if (bch2_inode_cache)
- kmem_cache_destroy(bch2_inode_cache);
+ kmem_cache_destroy(bch2_inode_cache);
}
int __init bch2_vfs_init(void)
struct iattr *);
int __bch2_unlink(struct inode *, struct dentry *, bool);
-void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *);
+void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
#else
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
- struct snapshot_id_list *s) {}
+ snapshot_id_list *s) {}
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_update.h"
+#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs-common.h"
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+/*
+ * XXX: this is handling transaction restarts without returning
+ * -BCH_ERR_transaction_restart_nested; this is not how we do things anymore:
+ */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
u32 snapshot)
{
ret = bch2_inode_unpack(k, inode);
err:
- if (ret && ret != -EINTR)
- bch_err(trans->c, "error %i fetching inode %llu",
- ret, inode_nr);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(trans->c, "error fetching inode %llu: %s",
+ inode_nr, bch2_err_str(ret));
bch2_trans_iter_exit(trans, &iter);
return ret;
}
if (!ret)
*snapshot = iter.pos.snapshot;
err:
- if (ret && ret != -EINTR)
- bch_err(trans->c, "error %i fetching inode %llu:%u",
- ret, inode_nr, *snapshot);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(trans->c, "error fetching inode %llu:%u: %s",
+ inode_nr, *snapshot, bch2_err_str(ret));
bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct bch_inode_unpacked *inode,
u32 snapshot)
{
- int ret = __bch2_trans_do(trans, NULL, NULL,
+ int ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
__write_inode(trans, inode, snapshot));
if (ret)
- bch_err(trans->c, "error in fsck: error %i updating inode", ret);
+ bch_err(trans->c, "error in fsck: error updating inode: %s",
+ bch2_err_str(ret));
return ret;
}
static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
int ret;
- ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL);
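+ /*
+ * Repeat until the range deletes complete without a nested transaction
+ * restart:
+ */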
+ do {
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL);
+ } while (ret == -BCH_ERR_transaction_restart_nested);
if (ret)
goto err;
retry:
goto err;
if (!bkey_is_inode(k.k)) {
- bch2_fs_inconsistent(trans->c,
+ bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum, snapshot);
ret = -EIO;
bch2_inode_unpack(k, &inode_u);
/* Subvolume root? */
- if (inode_u.bi_subvol) {
- ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
- if (ret)
- goto err;
- }
+ if (inode_u.bi_subvol)
+ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
bkey_inode_generation_init(&delete.k_i);
delete.k.p = iter.pos;
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_exit(trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- return ret;
+ return ret ?: -BCH_ERR_transaction_restart_nested;
}
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
ret = lookup_first_inode(trans, pos.inode, &dir_inode);
if (ret)
- return ret;
+ goto err;
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter, 0);
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
+err:
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret));
return ret;
}
goto create_lostfound;
}
- if (ret && ret != -EINTR)
- bch_err(c, "error looking up lost+found: %i", ret);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret));
if (ret)
return ret;
lostfound, &lostfound_str,
0, 0, S_IFDIR|0700, 0, NULL, NULL,
(subvol_inum) { }, 0);
- if (ret && ret != -EINTR)
- bch_err(c, "error creating lost+found: %i", ret);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error creating lost+found: %s", bch2_err_str(ret));
return ret;
}
struct bch_inode_unpacked *inode,
u32 inode_snapshot)
{
- int ret = __bch2_trans_do(trans, NULL, NULL,
+ int ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL,
__reattach_inode(trans, inode, inode_snapshot));
if (ret) {
- bch_err(trans->c, "error %i reattaching inode %llu",
- ret, inode->bi_inum);
+ bch_err(trans->c, "error reattaching inode %llu: %s",
+ inode->bi_inum, bch2_err_str(ret));
return ret;
}
return ret;
}
-static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
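+/*
+ * Track the snapshot IDs (and their equivalence classes) of keys already seen
+ * at the current position, so key_visible_in_snapshot() can tell whether a key
+ * from an ancestor snapshot has since been overwritten:
+ */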
+struct snapshots_seen_entry {
+ u32 id;
+ u32 equiv;
+};
+
+struct snapshots_seen {
+ struct bpos pos;
+ DARRAY(struct snapshots_seen_entry) ids;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+ darray_exit(&s->ids);
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+ memset(s, 0, sizeof(*s));
+}
+
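+/*
+ * The ids list is kept sorted by equivalence class; adding the same
+ * equivalence class twice indicates a bug in the caller:
+ */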
+static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+ struct snapshots_seen_entry *i, n = { id, id };
+ int ret;
+
+ darray_for_each(s->ids, i) {
+ if (n.equiv < i->equiv)
+ break;
+
+ if (i->equiv == n.equiv) {
+ bch_err(c, "adding duplicate snapshot in snapshots_seen_add()");
+ return -EINVAL;
+ }
+ }
+
+ ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+ return ret;
+}
+
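+/*
+ * If two keys in the same snapshot equivalence class show up at one position,
+ * snapshot deletion has left stale keys behind; flag it so the full check can
+ * be re-run after cleanup:
+ */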
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
+ enum btree_id btree_id, struct bpos pos)
{
- pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+ struct snapshots_seen_entry *i, n = {
+ .id = pos.snapshot,
+ .equiv = bch2_snapshot_equiv(c, pos.snapshot),
+ };
+ int ret = 0;
if (bkey_cmp(s->pos, pos))
- s->nr = 0;
+ s->ids.nr = 0;
+
+ pos.snapshot = n.equiv;
s->pos = pos;
- /* Might get called multiple times due to lock restarts */
- if (s->nr && s->d[s->nr - 1] == pos.snapshot)
- return 0;
+ darray_for_each(s->ids, i)
+ if (i->equiv == n.equiv) {
+ if (fsck_err_on(i->id != n.id, c,
+ "snapshot deletion did not run correctly:\n"
+ " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
+ bch2_btree_ids[btree_id],
+ pos.inode, pos.offset,
+ i->id, n.id, n.equiv))
+ return -BCH_ERR_need_snapshot_cleanup;
+
+ return 0;
+ }
- return snapshots_seen_add(c, s, pos.snapshot);
+ ret = darray_push(&s->ids, n);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+fsck_err:
+ return ret;
}
/**
u32 id, u32 ancestor)
{
ssize_t i;
+ u32 top = seen->ids.nr ? seen->ids.data[seen->ids.nr - 1].equiv : 0;
BUG_ON(id > ancestor);
-
- id = snapshot_t(c, id)->equiv;
- ancestor = snapshot_t(c, ancestor)->equiv;
+ BUG_ON(!bch2_snapshot_is_equiv(c, id));
+ BUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
/* @ancestor should be the snapshot most recently added to @seen */
- BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
- BUG_ON(seen->pos.snapshot != ancestor);
+ BUG_ON(ancestor != seen->pos.snapshot);
+ BUG_ON(ancestor != top);
if (id == ancestor)
return true;
if (!bch2_snapshot_is_ancestor(c, id, ancestor))
return false;
- for (i = seen->nr - 2;
- i >= 0 && seen->d[i] >= id;
+ for (i = seen->ids.nr - 2;
+ i >= 0 && seen->ids.data[i].equiv >= id;
--i)
- if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
- bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) &&
+ bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor))
return false;
return true;
: bch2_snapshot_is_ancestor(c, src, dst);
}
-#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
- for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
+ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
+ (_i)->snapshot <= (_snapshot); _i++) \
if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+struct inode_walker_entry {
+ struct bch_inode_unpacked inode;
+ u32 snapshot;
+ u64 count;
+};
+
struct inode_walker {
bool first_this_inode;
u64 cur_inum;
- size_t nr;
- size_t size;
- struct inode_walker_entry {
- struct bch_inode_unpacked inode;
- u32 snapshot;
- u64 count;
- } *d;
+ DARRAY(struct inode_walker_entry) inodes;
};
static void inode_walker_exit(struct inode_walker *w)
{
- kfree(w->d);
- w->d = NULL;
+ darray_exit(&w->inodes);
}
static struct inode_walker inode_walker_init(void)
return (struct inode_walker) { 0, };
}
-static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w)
-{
- if (w->nr == w->size) {
- size_t new_size = max_t(size_t, 8UL, w->size * 2);
- void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
- GFP_KERNEL);
- if (!d) {
- bch_err(c, "fsck: error allocating memory for inode_walker, size %zu",
- new_size);
- return -ENOMEM;
- }
-
- w->d = d;
- w->size = new_size;
- }
-
- return 0;
-}
-
static int add_inode(struct bch_fs *c, struct inode_walker *w,
struct bkey_s_c inode)
{
struct bch_inode_unpacked u;
- int ret;
-
- ret = inode_walker_realloc(c, w);
- if (ret)
- return ret;
BUG_ON(bch2_inode_unpack(inode, &u));
- w->d[w->nr++] = (struct inode_walker_entry) {
+ return darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
- .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv,
- };
-
- return 0;
+ .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot),
+ }));
}
static int __walk_inode(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- unsigned i, ancestor_pos;
+ u32 restart_count = trans->restart_count;
+ unsigned i;
int ret;
- pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+ pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot);
if (pos.inode == w->cur_inum) {
w->first_this_inode = false;
goto lookup_snapshot;
}
- w->nr = 0;
+ w->inodes.nr = 0;
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
w->cur_inum = pos.inode;
w->first_this_inode = true;
+
+ if (trans_was_restarted(trans, restart_count))
+ return -BCH_ERR_transaction_restart_nested;
+
lookup_snapshot:
- for (i = 0; i < w->nr; i++)
- if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
+ for (i = 0; i < w->inodes.nr; i++)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot))
goto found;
return INT_MAX;
found:
- BUG_ON(pos.snapshot > w->d[i].snapshot);
+ BUG_ON(pos.snapshot > w->inodes.data[i].snapshot);
+
+ if (pos.snapshot != w->inodes.data[i].snapshot) {
+ struct inode_walker_entry e = w->inodes.data[i];
+
+ e.snapshot = pos.snapshot;
+ e.count = 0;
- if (pos.snapshot != w->d[i].snapshot) {
- ancestor_pos = i;
+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
+ pos.inode, pos.snapshot, w->inodes.data[i].snapshot);
- while (i && w->d[i - 1].snapshot > pos.snapshot)
+ while (i && w->inodes.data[i - 1].snapshot > pos.snapshot)
--i;
- ret = inode_walker_realloc(c, w);
+ ret = darray_insert_item(&w->inodes, i, e);
if (ret)
return ret;
-
- array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
- w->d[i].snapshot = pos.snapshot;
- w->d[i].count = 0;
}
return i;
struct bkey_s_c k;
int ret;
- w->nr = 0;
+ w->inodes.nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
if (k.k->p.offset != inum)
break;
- if (!bkey_is_inode(k.k))
+ if (!ref_visible(c, s, s->pos.snapshot, equiv))
continue;
- if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
+ if (bkey_is_inode(k.k))
add_inode(c, w, k);
- if (k.k->p.snapshot >= s->pos.snapshot)
- break;
- }
+
+ if (equiv >= s->pos.snapshot)
+ break;
}
bch2_trans_iter_exit(trans, &iter);
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
+ if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
"key in missing snapshot: %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c k)
{
- bch_err(trans->c, "hash_redo_key() not implemented yet");
- return -EINVAL;
-#if 0
struct bkey_i *delete;
struct bkey_i *tmp;
delete->k.p = k_iter->pos;
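+ /*
+ * Delete the key at the wrong offset, then re-add it through the str_hash
+ * code so it lands in the correct slot:
+ */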
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
-#endif
+ bch2_hash_set_snapshot(trans, desc, hash_info,
+ (subvol_inum) { 0, k.k->p.inode },
+ k.k->p.snapshot, tmp,
+ BCH_HASH_SET_MUST_CREATE,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
}
static int hash_check_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter iter = { NULL };
- char buf[200];
+ struct printbuf buf = PRINTBUF;
struct bkey_s_c k;
u64 hash;
int ret = 0;
if (hash_k.k->p.offset < hash)
goto bad_hash;
- for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
- BTREE_ITER_SLOTS, k, ret) {
+ for_each_btree_key_norestart(trans, iter, desc.btree_id,
+ POS(hash_k.k->p.inode, hash),
+ BTREE_ITER_SLOTS, k, ret) {
if (!bkey_cmp(k.k->p, hash_k.k->p))
break;
if (fsck_err_on(k.k->type == desc.key_type &&
!desc.cmp_bkey(k, hash_k), c,
"duplicate hash table keys:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- hash_k), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ buf.buf))) {
ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
break;
}
bch2_trans_iter_exit(trans, &iter);
goto bad_hash;
}
-
}
+out:
bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
bad_hash:
- if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
+ if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, "
"hashed to %llu\n%s",
- desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
- (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
- return 0;
-
- ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
- if (ret) {
- bch_err(c, "hash_redo_key err %i", ret);
- return ret;
+ bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
+ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
+ if (ret) {
+ bch_err(c, "hash_redo_key err %s", bch2_err_str(ret));
+ return ret;
+ }
+ ret = -BCH_ERR_transaction_restart_nested;
}
- return -EINTR;
fsck_err:
- return ret;
+ goto out;
}
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bkey_s_c k,
struct bch_inode_unpacked *prev,
+ struct snapshots_seen *s,
bool full)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
struct bch_inode_unpacked u;
bool do_update = false;
int ret;
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret < 0)
+ goto err;
if (ret)
- return ret;
+ return 0;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
- return ret < 0 ? ret : 0;
+ goto err;
/*
* if snapshot id isn't a leaf node, skip it - deletion in
bch2_fs_lazy_rw(c);
ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
- if (ret)
- bch_err(c, "error in fsck: error %i while deleting inode", ret);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error in fsck: error while deleting inode: %s",
+ bch2_err_str(ret));
return ret;
}
POS(u.bi_inum, U64_MAX),
0, NULL);
if (ret) {
- bch_err(c, "error in fsck: error %i truncating inode", ret);
+ bch_err(c, "error in fsck: error truncating inode: %s",
+ bch2_err_str(ret));
return ret;
}
sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
if (sectors < 0) {
- bch_err(c, "error in fsck: error %i recounting inode sectors",
- (int) sectors);
+ bch_err(c, "error in fsck: error recounting inode sectors: %s",
+ bch2_err_str(sectors));
return sectors;
}
}
if (do_update) {
- ret = write_inode(trans, &u, iter->pos.snapshot);
+ ret = __write_inode(trans, &u, iter->pos.snapshot);
if (ret)
- bch_err(c, "error in fsck: error %i "
- "updating inode", ret);
+ bch_err(c, "error in fsck: error updating inode: %s",
+ bch2_err_str(ret));
}
+err:
fsck_err:
+ if (ret)
+ bch_err(c, "error from check_inode(): %s", bch2_err_str(ret));
return ret;
}
struct btree_trans trans;
struct btree_iter iter;
struct bch_inode_unpacked prev = { 0 };
- int ret;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_inode(&trans, &iter, &prev, full));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
-
- bch2_trans_exit(&trans);
- return ret;
-}
-
-static int check_subvol(struct btree_trans *trans,
- struct btree_iter *iter)
-{
+ struct snapshots_seen s;
struct bkey_s_c k;
- struct bkey_s_c_subvolume subvol;
- int ret;
-
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- subvol = bkey_s_c_to_subvolume(k);
-
- if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- if (ret && ret != -EINTR)
- bch_err(trans->c, "error deleting subvolume %llu: %i",
- iter->pos.offset, ret);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-noinline_for_stack
-static int check_subvols(struct bch_fs *c)
-{
- struct btree_trans trans;
- struct btree_iter iter;
int ret;
+ snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
- POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
-
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_subvol(&trans, &iter));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_inode(&trans, &iter, k, &prev, &s, full));
bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
+ if (ret)
+ bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret));
return ret;
}
SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
ret = bkey_err(d.s_c);
if (ret)
- return ret;
+ return ret == -ENOENT ? 0 : ret;
ret = dirent_points_to_inode(d, inode);
bch2_trans_iter_exit(trans, &iter);
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
- int ret = 0, ret2 = 0;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
s64 count2;
- for (i = w->d; i < w->d + w->nr; i++) {
+ darray_for_each(w->inodes, i) {
if (i->inode.bi_sectors == i->count)
continue;
- count2 = lockrestart_do(trans,
- bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
+ count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot);
if (i->count != count2) {
bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
w->cur_inum, i->snapshot,
- i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
- continue;
-
- i->inode.bi_sectors = i->count;
- ret = write_inode(trans, &i->inode, i->snapshot);
- if (ret)
- break;
- ret2 = -EINTR;
+ i->inode.bi_sectors, i->count)) {
+ i->inode.bi_sectors = i->count;
+ ret = write_inode(trans, &i->inode, i->snapshot);
+ if (ret)
+ break;
+ }
}
fsck_err:
- return ret ?: ret2;
+ if (ret)
+ bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret));
+ if (!ret && trans_was_restarted(trans, restart_count))
+ ret = -BCH_ERR_transaction_restart_nested;
+ return ret;
}
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
struct inode_walker *inode,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
struct inode_walker_entry *i;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
+ struct bpos equiv;
int ret = 0;
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
-
ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
- return ret < 0 ? ret : 0;
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
+
+ equiv = k.k->p;
+ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
- ret = snapshots_seen_update(c, s, k.k->p);
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
- return ret;
+ goto err;
if (k.k->type == KEY_TYPE_whiteout)
- return 0;
+ goto out;
if (inode->cur_inum != k.k->p.inode) {
ret = check_i_sectors(trans, inode);
if (ret)
- return ret;
+ goto err;
}
+
+ BUG_ON(!iter->path->should_be_locked);
#if 0
if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
char buf1[200];
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
bch2_bkey_val_to_text(&PBUF(buf2), c, k);
- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
- return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
+ ret = fix_overlapping_extent(trans, k, prev.k->k.p)
+ ?: -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
}
#endif
- ret = __walk_inode(trans, inode, k.k->p);
+ ret = __walk_inode(trans, inode, equiv);
if (ret < 0)
- return ret;
+ goto err;
if (fsck_err_on(ret == INT_MAX, c,
"extent in missing inode:\n %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
- if (ret == INT_MAX)
- return 0;
+ if (ret == INT_MAX) {
+ ret = 0;
+ goto out;
+ }
- i = inode->d + ret;
+ i = inode->inodes.data + ret;
ret = 0;
if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
!S_ISLNK(i->inode.bi_mode), c,
"extent in non regular inode mode %o:\n %s",
i->inode.bi_mode,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
+
+ /*
+ * Check inodes in reverse order, from oldest snapshots to newest, so
+ * that we emit the fewest number of whiteouts necessary:
+ */
+ for (i = inode->inodes.data + inode->inodes.nr - 1;
+ i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > equiv.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
+ continue;
+
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ k.k->type != KEY_TYPE_reservation &&
+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
+ "extent type past end of inode %llu:%u, i_size %llu\n %s",
+ i->inode.bi_inum, i->snapshot, i->inode.bi_size,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct btree_iter iter2;
+
+ bch2_trans_copy_iter(&iter2, iter);
+ bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_btree_delete_at(trans, &iter2,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &iter2);
+ if (ret)
+ goto err;
- if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
- for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- k.k->type != KEY_TYPE_reservation &&
- k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
- "extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
- bch2_fs_lazy_rw(c);
- return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
- k.k->p.snapshot),
- POS(k.k->p.inode, U64_MAX),
- 0, NULL) ?: -EINTR;
+ if (i->snapshot != equiv.snapshot) {
+ ret = snapshots_seen_add(c, s, i->snapshot);
+ if (ret)
+ goto err;
}
}
}
if (bkey_extent_is_allocation(k.k))
- for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
+ for_each_visible_inode(c, s, inode, equiv.snapshot, i)
i->count += k.k->size;
#if 0
bch2_bkey_buf_reassemble(&prev, c, k);
#endif
+out:
+err:
fsck_err:
+ printbuf_exit(&buf);
+
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error from check_extent(): %s", bch2_err_str(ret));
return ret;
}
struct snapshots_seen s;
struct btree_trans trans;
struct btree_iter iter;
+ struct bkey_s_c k;
int ret = 0;
#if 0
bch_verbose(c, "checking extents");
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_extent(&trans, &iter, &w, &s));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_extent(&trans, &iter, k, &w, &s));
#if 0
bch2_bkey_buf_exit(&prev, c);
#endif
bch2_trans_exit(&trans);
snapshots_seen_exit(&s);
+ if (ret)
+ bch_err(c, "error from check_extents(): %s", bch2_err_str(ret));
return ret;
}
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
- int ret = 0, ret2 = 0;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
s64 count2;
- for (i = w->d; i < w->d + w->nr; i++) {
+ darray_for_each(w->inodes, i) {
if (i->inode.bi_nlink == i->count)
continue;
ret = write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
- ret2 = -EINTR;
}
}
fsck_err:
- return ret ?: ret2;
+ if (ret)
+ bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret));
+ if (!ret && trans_was_restarted(trans, restart_count))
+ ret = -BCH_ERR_transaction_restart_nested;
+ return ret;
}
static int check_dirent_target(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
bool backpointer_exists = true;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
if (!target->bi_dir &&
"directory %llu with multiple links",
target->bi_inum)) {
ret = __remove_dirent(trans, d.k->p);
- if (ret)
- goto err;
- return 0;
+ goto out;
}
if (fsck_err_on(backpointer_exists &&
!target->bi_nlink, c,
- "inode %llu has multiple links but i_nlink 0",
- target->bi_inum)) {
+ "inode %llu type %s has multiple links but i_nlink 0",
+ target->bi_inum, bch2_d_types[d.v->d_type])) {
target->bi_nlink++;
target->bi_flags &= ~BCH_INODE_UNLINKED;
"incorrect d_type: got %s, should be %s:\n%s",
bch2_d_type_str(d.v->d_type),
bch2_d_type_str(inode_d_type(target)),
- (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
- return ret;
+ goto err;
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = inode_d_type(target);
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
- return ret;
+ goto err;
d = dirent_i_to_s_c(n);
}
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
- return ret;
+ goto err;
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
- return ret;
+ goto err;
d = dirent_i_to_s_c(n);
}
+out:
err:
fsck_err:
+ printbuf_exit(&buf);
+
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error from check_target(): %s", bch2_err_str(ret));
return ret;
}
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
struct bch_hash_info *hash_info,
struct inode_walker *dir,
struct inode_walker *target,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
- char buf[200];
- int ret;
-
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
+ struct printbuf buf = PRINTBUF;
+ struct bpos equiv;
+ int ret = 0;
ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
- return ret < 0 ? ret : 0;
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
- ret = snapshots_seen_update(c, s, k.k->p);
+ equiv = k.k->p;
+ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
- return ret;
+ goto err;
if (k.k->type == KEY_TYPE_whiteout)
- return 0;
+ goto out;
if (dir->cur_inum != k.k->p.inode) {
ret = check_subdir_count(trans, dir);
if (ret)
- return ret;
+ goto err;
}
- ret = __walk_inode(trans, dir, k.k->p);
+ BUG_ON(!iter->path->should_be_locked);
+
+ ret = __walk_inode(trans, dir, equiv);
if (ret < 0)
- return ret;
+ goto err;
if (fsck_err_on(ret == INT_MAX, c,
"dirent in nonexisting directory:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
- if (ret == INT_MAX)
- return 0;
+ if (ret == INT_MAX) {
+ ret = 0;
+ goto out;
+ }
- i = dir->d + ret;
+ i = dir->inodes.data + ret;
ret = 0;
if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
"dirent in non directory inode type %s:\n%s",
bch2_d_type_str(inode_d_type(&i->inode)),
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter, 0);
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
if (dir->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
+ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
ret = hash_check_key(trans, bch2_dirent_hash_desc,
hash_info, iter, k);
if (ret < 0)
- return ret;
- if (ret) /* dirent has been deleted */
- return 0;
+ goto err;
+ if (ret) {
+ /* dirent has been deleted */
+ ret = 0;
+ goto out;
+ }
if (k.k->type != KEY_TYPE_dirent)
- return 0;
+ goto out;
d = bkey_s_c_to_dirent(k);
ret = __subvol_lookup(trans, target_subvol,
&target_snapshot, &target_inum);
if (ret && ret != -ENOENT)
- return ret;
+ goto err;
if (fsck_err_on(ret, c,
"dirent points to missing subvolume %llu",
- le64_to_cpu(d.v->d_child_subvol)))
- return __remove_dirent(trans, d.k->p);
+ le64_to_cpu(d.v->d_child_subvol))) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto err;
+ }
ret = __lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
if (ret && ret != -ENOENT)
- return ret;
+ goto err;
if (fsck_err_on(ret, c,
"subvolume %u points to missing subvolume root %llu",
target_subvol,
target_inum)) {
bch_err(c, "repair not implemented yet");
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
subvol_root.bi_subvol = target_subvol;
ret = __write_inode(trans, &subvol_root, target_snapshot);
if (ret)
- return ret;
+ goto err;
}
ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot);
if (ret)
- return ret;
+ goto err;
} else {
ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
if (ret)
- return ret;
+ goto err;
- if (fsck_err_on(!target->nr, c,
- "dirent points to missing inode:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf))) {
+ if (fsck_err_on(!target->inodes.nr, c,
+ "dirent points to missing inode: (equiv %u)\n%s",
+ equiv.snapshot,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
ret = __remove_dirent(trans, d.k->p);
if (ret)
- return ret;
+ goto err;
}
- for (i = target->d; i < target->d + target->nr; i++) {
+ darray_for_each(target->inodes, i) {
ret = check_dirent_target(trans, iter, d,
&i->inode, i->snapshot);
if (ret)
- return ret;
+ goto err;
}
}
if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ for_each_visible_inode(c, s, dir, equiv.snapshot, i)
i->count++;
+out:
+err:
fsck_err:
+ printbuf_exit(&buf);
+
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret));
return ret;
}
struct bch_hash_info hash_info;
struct btree_trans trans;
struct btree_iter iter;
+ struct bkey_s_c k;
int ret = 0;
bch_verbose(c, "checking dirents");
snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_dirent(&trans, &iter, &hash_info,
- &dir, &target, &s));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s));
bch2_trans_exit(&trans);
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
+
+ if (ret)
+ bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret));
return ret;
}
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
struct bch_hash_info *hash_info,
struct inode_walker *inode)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
int ret;
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
-
ret = check_key_has_snapshot(trans, iter, k);
if (ret)
return ret;
ret = 0;
if (inode->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &inode->d[0].inode);
+ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
fsck_err:
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret));
return ret;
}
struct bch_hash_info hash_info;
struct btree_trans trans;
struct btree_iter iter;
+ struct bkey_s_c k;
int ret = 0;
bch_verbose(c, "checking xattrs");
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_xattr(&trans, &iter, &hash_info,
- &inode));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_xattr(&trans, &iter, k, &hash_info, &inode));
bch2_trans_exit(&trans);
+
+ if (ret)
+ bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret));
return ret;
}
root_subvol.v.flags = 0;
root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum);
- ret = __bch2_trans_do(trans, NULL, NULL,
+ ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
__bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
if (ret) {
- bch_err(c, "error writing root subvol: %i", ret);
+ bch_err(c, "error writing root subvol: %s", bch2_err_str(ret));
goto err;
}
ret = __write_inode(trans, &root_inode, snapshot);
if (ret)
- bch_err(c, "error writing root inode: %i", ret);
+ bch_err(c, "error writing root inode: %s", bch2_err_str(ret));
}
err:
fsck_err:
check_root_trans(&trans));
}
-struct pathbuf {
- size_t nr;
- size_t size;
-
- struct pathbuf_entry {
- u64 inum;
- u32 snapshot;
- } *entries;
+struct pathbuf_entry {
+ u64 inum;
+ u32 snapshot;
};
-static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
+typedef DARRAY(struct pathbuf_entry) pathbuf;
+
+static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
struct pathbuf_entry *i;
- for (i = p->entries; i < p->entries + p->nr; i++)
+ darray_for_each(*p, i)
if (i->inum == inum &&
i->snapshot == snapshot)
return true;
return false;
}
-static int path_down(struct bch_fs *c, struct pathbuf *p,
+static int path_down(struct bch_fs *c, pathbuf *p,
u64 inum, u32 snapshot)
{
- if (p->nr == p->size) {
- size_t new_size = max_t(size_t, 256UL, p->size * 2);
- void *n = krealloc(p->entries,
- new_size * sizeof(p->entries[0]),
- GFP_KERNEL);
- if (!n) {
- bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
- new_size);
- return -ENOMEM;
- }
-
- p->entries = n;
- p->size = new_size;
- };
-
- p->entries[p->nr++] = (struct pathbuf_entry) {
+ int ret = darray_push(p, ((struct pathbuf_entry) {
.inum = inum,
.snapshot = snapshot,
- };
- return 0;
+ }));
+
+ if (ret)
+ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
+ p->size);
+ return ret;
}
/*
* XXX: we should also be verifying that inodes are in the right subvolumes
*/
static int check_path(struct btree_trans *trans,
- struct pathbuf *p,
+ pathbuf *p,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
struct bch_fs *c = trans->c;
int ret = 0;
- snapshot = snapshot_t(c, snapshot)->equiv;
+ snapshot = bch2_snapshot_equiv(c, snapshot);
p->nr = 0;
while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
/* XXX print path */
bch_err(c, "directory structure loop");
- for (i = p->entries; i < p->entries + p->nr; i++)
+ darray_for_each(*p, i)
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode->bi_inum, snapshot);
if (!fsck_err(c, "directory structure loop"))
return 0;
- ret = __bch2_trans_do(trans, NULL, NULL,
+ ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
remove_backpointer(trans, inode));
}
fsck_err:
if (ret)
- bch_err(c, "%s: err %i", __func__, ret);
+ bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
return ret;
}
struct btree_iter iter;
struct bkey_s_c k;
struct bch_inode_unpacked u;
- struct pathbuf path = { 0, 0, NULL };
+ pathbuf path = { 0, };
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
}
bch2_trans_iter_exit(&trans, &iter);
- BUG_ON(ret == -EINTR);
-
- kfree(path.entries);
+ darray_exit(&path);
bch2_trans_exit(&trans);
return ret;
{
if (t->nr == t->size) {
size_t new_size = max_t(size_t, 128UL, t->size * 2);
- void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
+ void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
+
if (!d) {
bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
new_size);
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = snapshots_seen_update(c, &s, k.k->p);
+ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
if (ret)
break;
d.v->d_type != DT_SUBVOL)
inc_link(c, &s, links, range_start, range_end,
le64_to_cpu(d.v->d_inum),
- d.k->p.snapshot);
+ bch2_snapshot_equiv(c, d.k->p.snapshot));
break;
}
}
return ret;
}
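+/*
+ * links->d is sorted by (inum, snapshot); advance *idx to this inode's entry
+ * and correct i_nlink if the counted links disagree:
+ */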
+static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct nlink_table *links,
+ size_t *idx, u64 range_end)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ struct nlink *link = &links->d[*idx];
+ int ret = 0;
+
+ if (k.k->p.offset >= range_end)
+ return 1;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ if (S_ISDIR(le16_to_cpu(u.bi_mode)))
+ return 0;
+
+ if (!u.bi_nlink)
+ return 0;
+
+ while ((cmp_int(link->inum, k.k->p.offset) ?:
+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
+ BUG_ON(*idx == links->nr);
+ link = &links->d[++*idx];
+ }
+
+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
+ "inode %llu type %s has wrong i_nlink (%u, should be %u)",
+ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
+ bch2_inode_nlink_get(&u), link->count)) {
+ bch2_inode_nlink_set(&u, link->count);
+ ret = __write_inode(trans, &u, k.k->p.snapshot);
+ }
+fsck_err:
+ return ret;
+}
+
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
struct nlink_table *links,
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bch_inode_unpacked u;
- struct nlink *link = links->d;
+ size_t idx = 0;
int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_inodes,
- POS(0, range_start),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (k.k->p.offset >= range_end)
- break;
-
- if (!bkey_is_inode(k.k))
- continue;
-
- BUG_ON(bch2_inode_unpack(k, &u));
-
- if (S_ISDIR(le16_to_cpu(u.bi_mode)))
- continue;
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+ POS(0, range_start),
+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end));
- if (!u.bi_nlink)
- continue;
-
- while ((cmp_int(link->inum, k.k->p.offset) ?:
- cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
- link++;
- BUG_ON(link >= links->d + links->nr);
- }
-
- if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
- "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
- u.bi_inum, mode_to_type(u.bi_mode),
- bch2_inode_nlink_get(&u), link->count)) {
- bch2_inode_nlink_set(&u, link->count);
-
- ret = write_inode(&trans, &u, k.k->p.snapshot);
- if (ret)
- bch_err(c, "error in fsck: error %i updating inode", ret);
- }
- }
-fsck_err:
- bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- if (ret)
+ if (ret < 0) {
bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ return ret;
+ }
- return ret;
+ return 0;
}
noinline_for_stack
return ret;
}
-static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct bkey_s_c k;
struct bkey_s_c_reflink_p p;
struct bkey_i_reflink_p *u;
int ret;
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
-
if (k.k->type != KEY_TYPE_reflink_p)
return 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (k.k->type == KEY_TYPE_reflink_p) {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- fix_reflink_p_key(&trans, &iter));
- if (ret)
- break;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_extents, POS_MIN,
+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ fix_reflink_p_key(&trans, &iter, k));
bch2_trans_exit(&trans);
return ret;
*/
int bch2_fsck_full(struct bch_fs *c)
{
- return bch2_fs_snapshots_check(c) ?:
+ int ret;
+again:
+ ret = bch2_fs_check_snapshots(c) ?:
+ bch2_fs_check_subvols(c) ?:
+ bch2_delete_dead_snapshots(c) ?:
check_inodes(c, true) ?:
- check_subvols(c) ?:
check_extents(c) ?:
check_dirents(c) ?:
check_xattrs(c) ?:
check_directory_structure(c) ?:
check_nlinks(c) ?:
fix_reflink_p(c);
+
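+ /*
+ * Re-run the passes if any of them flagged keys left behind by an
+ * incomplete snapshot deletion:
+ */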
+ if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) {
+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+ goto again;
+ }
+
+ return ret;
}
int bch2_fsck_walk_inodes_only(struct bch_fs *c)
{
- return check_inodes(c, false);
+ return bch2_fs_check_snapshots(c) ?:
+ bch2_fs_check_subvols(c) ?:
+ bch2_delete_dead_snapshots(c) ?:
+ check_inodes(c, false);
}
return bytes;
}
-void bch2_inode_pack(struct bch_fs *c,
- struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
+static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
{
- struct bkey_i_inode_v2 *k = &packed->inode;
+ struct bkey_i_inode_v3 *k = &packed->inode;
u8 *out = k->v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
unsigned bytes;
int ret;
- bkey_inode_v2_init(&packed->inode.k_i);
+ bkey_inode_v3_init(&packed->inode.k_i);
packed->inode.k.p.offset = inode->bi_inum;
packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
- packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
+ packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors);
+ packed->inode.v.bi_size = cpu_to_le64(inode->bi_size);
+ packed->inode.v.bi_version = cpu_to_le64(inode->bi_version);
+ SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
+ SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
+
#define x(_name, _bits) \
nr_fields++; \
*out++ = 0; \
}
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v3()
#undef x
BUG_ON(out > end);
set_bkey_val_bytes(&packed->inode.k, bytes);
memset_u64s_tail(&packed->inode.v, 0, bytes);
- SET_INODEv2_NR_FIELDS(&k->v, nr_fields);
+ SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
BUG_ON(ret);
BUG_ON(unpacked.bi_inum != inode->bi_inum);
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
+ BUG_ON(unpacked.bi_sectors != inode->bi_sectors);
+ BUG_ON(unpacked.bi_size != inode->bi_size);
+ BUG_ON(unpacked.bi_version != inode->bi_version);
BUG_ON(unpacked.bi_mode != inode->bi_mode);
#define x(_name, _bits) if (unpacked._name != inode->_name) \
panic("unpacked %llu should be %llu", \
(u64) unpacked._name, (u64) inode->_name);
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v3()
#undef x
}
}
+void bch2_inode_pack(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ bch2_inode_pack_inlined(packed, inode);
+}
+
static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
struct bch_inode_unpacked *unpacked)
{
#define x(_name, _bits) \
if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
- memset(&unpacked->_name, 0, \
- sizeof(*unpacked) - \
- offsetof(struct bch_inode_unpacked, _name)); \
+ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
+ memset((void *) unpacked + offset, 0, \
+ sizeof(*unpacked) - offset); \
return 0; \
} \
\
unpacked->_name = field[1]; \
in += ret;
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v2()
#undef x
/* XXX: signal if there were more fields than expected? */
return -1; \
fieldnr++;
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v2()
#undef x
/* XXX: signal if there were more fields than expected? */
return 0;
}
-int bch2_inode_unpack(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
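+/*
+ * v3 inodes keep size, sectors, version and mode in fixed fields; the
+ * remaining fields are varint-encoded, using two varints for fields wider
+ * than 64 bits:
+ */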
+static int bch2_inode_unpack_v3(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq = le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors);
+ unpacked->bi_size = le64_to_cpu(inode.v->bi_size);
+ unpacked->bi_version = le64_to_cpu(inode.v->bi_version);
+ unpacked->bi_mode = INODEv3_MODE(inode.v);
+
+#define x(_name, _bits) \
+ if (fieldnr < nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode_fast(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS_v3()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
{
switch (k.k->type) {
case KEY_TYPE_inode: {
}
}
+int bch2_inode_unpack(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ if (likely(k.k->type == KEY_TYPE_inode_v3))
+ return bch2_inode_unpack_v3(k, unpacked);
+ return bch2_inode_unpack_slowpath(k, unpacked);
+}
+
int bch2_inode_peek(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
- bch2_inode_pack(trans->c, inode_p, inode);
+ bch2_inode_pack_inlined(inode_p, inode);
inode_p->inode.k.p.snapshot = iter->snapshot;
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
}
-const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
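+/*
+ * Repack an inode of any older version as a v3 inode, in transaction memory:
+ */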
+struct bkey_s_c bch2_inode_to_v3(struct btree_trans *trans, struct bkey_s_c k)
{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- struct bch_inode_unpacked unpacked;
+ struct bch_inode_unpacked u;
+ struct bkey_inode_buf *inode_p;
+ int ret;
+
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return bkey_s_c_err(PTR_ERR(inode_p));
+
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ return bkey_s_c_err(ret);
- if (k.k->p.inode)
- return "nonzero k.p.inode";
+ bch2_inode_pack(inode_p, &u);
+ return bkey_i_to_s_c(&inode_p->inode.k_i);
+}
- if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
- return "incorrect value size";
+static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
+{
+ struct bch_inode_unpacked unpacked;
- if (k.k->p.offset < BLOCKDEV_INODE_MAX)
- return "fs inode in blockdev range";
+ if (k.k->p.inode) {
+ prt_printf(err, "nonzero k.p.inode");
+ return -EINVAL;
+ }
- if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
- return "invalid str hash type";
+ if (k.k->p.offset < BLOCKDEV_INODE_MAX) {
+ prt_printf(err, "fs inode in blockdev range");
+ return -EINVAL;
+ }
- if (bch2_inode_unpack(k, &unpacked))
- return "invalid variable length fields";
+ if (bch2_inode_unpack(k, &unpacked)) {
+ prt_printf(err, "invalid variable length fields");
+ return -EINVAL;
+ }
- if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
- return "invalid data checksum type";
+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) {
+ prt_printf(err, "invalid data checksum type (%u >= %u",
+ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
+ return -EINVAL;
+ }
- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
- return "invalid data checksum type";
+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
+ prt_printf(err, "invalid data checksum type (%u >= %u)",
+ unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
+ return -EINVAL;
+ }
if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
- unpacked.bi_nlink != 0)
- return "flagged as unlinked but bi_nlink != 0";
+ unpacked.bi_nlink != 0) {
+ prt_printf(err, "flagged as unlinked but bi_nlink != 0");
+ return -EINVAL;
+ }
- if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
- return "subvolume root but not a directory";
+ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) {
+ prt_printf(err, "subvolume root but not a directory");
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
-const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
- struct bch_inode_unpacked unpacked;
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- if (k.k->p.inode)
- return "nonzero k.p.inode";
+ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*inode.v));
+ return -EINVAL;
+ }
- if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
- return "incorrect value size";
+ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+ prt_printf(err, "invalid str hash type (%llu >= %u)",
+ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ return -EINVAL;
+ }
- if (k.k->p.offset < BLOCKDEV_INODE_MAX)
- return "fs inode in blockdev range";
+ return __bch2_inode_invalid(k, err);
+}
- if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
- return "invalid str hash type";
+int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
+{
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
- if (bch2_inode_unpack(k, &unpacked))
- return "invalid variable length fields";
+ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*inode.v));
+ return -EINVAL;
+ }
- if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
- return "invalid data checksum type";
+ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+ prt_printf(err, "invalid str hash type (%llu >= %u)",
+ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ return -EINVAL;
+ }
- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
- return "invalid data checksum type";
+ return __bch2_inode_invalid(k, err);
+}
- if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
- unpacked.bi_nlink != 0)
- return "flagged as unlinked but bi_nlink != 0";
+int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
+{
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+
+ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*inode.v));
+ return -EINVAL;
+ }
+
+ if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) {
+ prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)",
+ INODEv3_FIELDS_START(inode.v),
+ INODEv3_FIELDS_START_INITIAL,
+ bkey_val_u64s(inode.k));
+ return -EINVAL;
+ }
- if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
- return "subvolume root but not a directory";
+ if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+ prt_printf(err, "invalid str hash type (%llu >= %u)",
+ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ return -EINVAL;
+ }
- return NULL;
+ return __bch2_inode_invalid(k, err);
}
-static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
+static void __bch2_inode_unpacked_to_text(struct printbuf *out,
+ struct bch_inode_unpacked *inode)
{
- pr_buf(out, "mode %o flags %x journal_seq %llu",
+ prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu",
inode->bi_mode, inode->bi_flags,
- inode->bi_journal_seq);
+ inode->bi_journal_seq,
+ inode->bi_size,
+ inode->bi_sectors,
+ inode->bi_version);
#define x(_name, _bits) \
- pr_buf(out, " "#_name " %llu", (u64) inode->_name);
- BCH_INODE_FIELDS()
+ prt_printf(out, " "#_name " %llu", (u64) inode->_name);
+ BCH_INODE_FIELDS_v3()
#undef x
}
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
{
- pr_buf(out, "inum: %llu ", inode->bi_inum);
+ prt_printf(out, "inum: %llu ", inode->bi_inum);
__bch2_inode_unpacked_to_text(out, inode);
}
-void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bch_inode_unpacked inode;
if (bch2_inode_unpack(k, &inode)) {
- pr_buf(out, "(unpack error)");
+ prt_printf(out, "(unpack error)");
return;
}
__bch2_inode_unpacked_to_text(out, &inode);
}
-const char *bch2_inode_generation_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- if (k.k->p.inode)
- return "nonzero k.p.inode";
+ if (k.k->p.inode) {
+ prt_printf(err, "nonzero k.p.inode");
+ return -EINVAL;
+ }
- if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_inode_generation));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
- pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
+ prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
}
void bch2_inode_init_early(struct bch_fs *c,
}
if (!ret && start == min)
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_inode_create;
if (ret) {
bch2_trans_iter_exit(trans, iter);
bch2_btree_iter_set_snapshot(&iter, snapshot);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
- if (!k.k || iter.pos.inode != inum.inum)
+ if (!k.k)
break;
bkey_init(&delete.k);
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
- if (ret && ret != -EINTR)
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_exit(&trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
return bch2_trans_do(c, NULL, NULL, 0,
bch2_inode_find_by_inum_trans(&trans, inum, inode));
}
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_flags & BCH_INODE_UNLINKED)
+ bi->bi_flags &= ~BCH_INODE_UNLINKED;
+ else {
+ if (bi->bi_nlink == U32_MAX)
+ return -EINVAL;
+
+ bi->bi_nlink++;
+ }
+
+ return 0;
+}
+
+void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) {
+ bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
+ bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_flags & BCH_INODE_UNLINKED) {
+ bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_nlink)
+ bi->bi_nlink--;
+ else
+ bi->bi_flags |= BCH_INODE_UNLINKED;
+}
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
+#include "bkey.h"
#include "opts.h"
extern const char * const bch2_inode_opts[];
-const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
-const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode (struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
}
#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \
.key_invalid = bch2_inode_v2_invalid, \
.val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+}
+
+#define bch2_bkey_ops_inode_v3 (struct bkey_ops) { \
+ .key_invalid = bch2_inode_v3_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
}
static inline bool bkey_is_inode(const struct bkey *k)
{
return k->type == KEY_TYPE_inode ||
- k->type == KEY_TYPE_inode_v2;
+ k->type == KEY_TYPE_inode_v2 ||
+ k->type == KEY_TYPE_inode_v3;
}
-const char *bch2_inode_generation_invalid(const struct bch_fs *,
- struct bkey_s_c);
-void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
+int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c,
+ int, struct printbuf *);
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \
.key_invalid = bch2_inode_generation_invalid, \
u64 bi_inum;
u64 bi_journal_seq;
__le64 bi_hash_seed;
+ u64 bi_size;
+ u64 bi_sectors;
+ u64 bi_version;
u32 bi_flags;
u16 bi_mode;
#define x(_name, _bits) u##_bits _name;
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v3()
#undef x
};
struct bkey_inode_buf {
- struct bkey_i_inode_v2 inode;
+ struct bkey_i_inode_v3 inode;
#define x(_name, _bits) + 8 + _bits / 8
- u8 _pad[0 + BCH_INODE_FIELDS()];
+ u8 _pad[0 + BCH_INODE_FIELDS_v3()];
#undef x
} __attribute__((packed, aligned(8)));
-void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
- const struct bch_inode_unpacked *);
+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
+struct bkey_s_c bch2_inode_to_v3(struct btree_trans *, struct bkey_s_c);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
return S_ISDIR(mode) ? 2 : 1;
}
-static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
-{
- if (bi->bi_flags & BCH_INODE_UNLINKED)
- bi->bi_flags &= ~BCH_INODE_UNLINKED;
- else
- bi->bi_nlink++;
-}
-
-static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi)
-{
- BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED);
- if (bi->bi_nlink)
- bi->bi_nlink--;
- else
- bi->bi_flags |= BCH_INODE_UNLINKED;
-}
-
static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
{
return bi->bi_flags & BCH_INODE_UNLINKED
}
}
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
+void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+
#endif /* _BCACHEFS_INODE_H */
s64 *i_sectors_delta_total,
bool check_enospc)
{
- struct btree_iter inode_iter;
- struct bch_inode_unpacked inode_u;
+ struct btree_iter inode_iter = { NULL };
struct bpos next_pos;
bool usage_increasing;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
return ret;
}
- ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
- BTREE_ITER_INTENT);
- if (ret)
- return ret;
+ if (new_i_size || i_sectors_delta) {
+ struct bkey_s_c k;
+ struct bkey_s_c_inode_v3 inode;
+ struct bkey_i_inode_v3 *new_inode;
+ bool i_size_update;
+
+ bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, iter->snapshot),
+ BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&inode_iter);
+ ret = bkey_err(k);
+ if (unlikely(ret))
+ goto err;
+
+ ret = bkey_is_inode(k.k) ? 0 : -ENOENT;
+ if (unlikely(ret))
+ goto err;
+
+ if (unlikely(k.k->type != KEY_TYPE_inode_v3)) {
+ k = bch2_inode_to_v3(trans, k);
+ ret = bkey_err(k);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ inode = bkey_s_c_to_inode_v3(k);
+ i_size_update = !(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+ new_i_size > le64_to_cpu(inode.v->bi_size);
- if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- new_i_size > inode_u.bi_size)
- inode_u.bi_size = new_i_size;
+ if (!i_sectors_delta && !i_size_update)
+ goto no_inode_update;
- inode_u.bi_sectors += i_sectors_delta;
+ new_inode = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(new_inode);
+ if (unlikely(ret))
+ goto err;
+ bkey_reassemble(&new_inode->k_i, k);
+
+ if (i_size_update)
+ new_inode->v.bi_size = cpu_to_le64(new_i_size);
+
+ le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
+ ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0);
+ if (unlikely(ret))
+ goto err;
+ }
+no_inode_update:
ret = bch2_trans_update(trans, iter, k, 0) ?:
- bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, disk_res, journal_seq,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL);
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (ret)
- return ret;
+ if (unlikely(ret))
+ goto err;
if (i_sectors_delta_total)
*i_sectors_delta_total += i_sectors_delta;
bch2_btree_iter_set_pos(iter, next_pos);
-
- return 0;
+err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ return ret;
}
/*
- * Returns -EINTR if we had to drop locks:
+ * Returns -BCH_ERR_transaction_restart if we had to drop locks:
*/
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
subvol_inum inum, u64 end,
int ret = 0, ret2 = 0;
u32 snapshot;
- while (!ret || ret == -EINTR) {
+ while (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- return ret == -EINTR ? 0 : ret;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+
+ return ret;
}
int bch2_write_index_default(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct bkey_buf sk;
- struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
struct keylist *keys = &op->insert_keys;
struct bkey_i *k = bch2_keylist_front(keys);
struct btree_trans trans;
ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
&sk.k->k.p.snapshot);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
op->flags & BCH_WRITE_CHECK_ENOSPC);
bch2_trans_iter_exit(&trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
- if (ec_ob)
- bch2_ob_add_backpointer(c, ec_ob, &sk.k->k);
-
if (bkey_cmp(iter.pos, k->k.p) >= 0)
bch2_keylist_pop_front(&op->insert_keys);
else
ca = bch_dev_bkey_exists(c, ptr->dev);
if (to_entry(ptr + 1) < ptrs.end) {
- n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
- &ca->replica_set));
+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
+ GFP_NOIO, &ca->replica_set));
n->bio.bi_end_io = wbio->bio.bi_end_io;
n->bio.bi_private = wbio->bio.bi_private;
}
}
-/**
- * bch_write_index - after a write, update index to point to new data
- */
-static void __bch2_write_index(struct bch_write_op *op)
+static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
- struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bch_extent_ptr *ptr;
- struct bkey_i *src, *dst = keys->keys, *n, *k;
- unsigned dev;
- int ret;
+ struct bkey_i *src, *dst = keys->keys, *n;
for (src = keys->keys; src != keys->top; src = n) {
n = bkey_next(src);
bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
test_bit(ptr->dev, op->failed.d));
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
- ret = -EIO;
- goto err;
- }
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
+ return -EIO;
}
if (dst != src)
}
keys->top = dst;
+ return 0;
+}
+
+/**
+ * __bch2_write_index - after a write, update index to point to new data
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_i *k;
+ unsigned dev;
+ int ret;
+
+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ ret = bch2_write_drop_io_error_ptrs(op);
+ if (ret)
+ goto err;
+ }
/*
* probably not the ideal place to hook this in, but I don't
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
- BUG_ON(ret == -EINTR);
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
BUG_ON(keylist_sectors(keys) && !ret);
op->written += sectors_start - keylist_sectors(keys);
if (ret) {
bch_err_inum_ratelimited(c, op->pos.inode,
- "write error %i from btree update", ret);
+ "write error while doing btree update: %s", bch2_err_str(ret));
op->error = ret;
}
}
op->pos.inode,
op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
"data write error: %s",
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status))) {
set_bit(wbio->dev, op->failed.d);
+ op->flags |= BCH_WRITE_IO_ERROR;
+ }
if (wbio->have_ioref) {
bch2_latency_acct(ca, wbio->submit_time, WRITE);
pages = min(pages, BIO_MAX_VECS);
- bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
+ bio = bio_alloc_bioset(NULL, pages, 0,
+ GFP_NOIO, &c->bio_write);
wbio = wbio_init(bio);
wbio->put_bio = true;
/* copy WRITE_SYNC flag */
struct bch_fs *c = op->c;
struct nonce nonce = extent_nonce(op->version, op->crc);
struct bch_csum csum;
+ int ret;
if (!bch2_csum_type_is_encryption(op->crc.csum_type))
return 0;
if (bch2_crc_cmp(op->crc.csum, csum))
return -EIO;
- bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
op->crc.csum_type = 0;
op->crc.csum = (struct bch_csum) { 0, 0 };
- return 0;
+ return ret;
}
static enum prep_encoded_ret {
saved_iter = dst->bi_iter;
do {
- struct bch_extent_crc_unpacked crc =
- (struct bch_extent_crc_unpacked) { 0 };
+ struct bch_extent_crc_unpacked crc = { 0 };
struct bversion version = op->version;
size_t dst_len, src_len;
!crc_is_compressed(crc) &&
bch2_csum_type_is_encryption(op->crc.csum_type) ==
bch2_csum_type_is_encryption(op->csum_type)) {
+ u8 compression_type = crc.compression_type;
+ u16 nonce = crc.nonce;
/*
* Note: when we're using rechecksum(), we need to be
* checksumming @src because it has all the data our
bio_sectors(src) - (src_len >> 9),
op->csum_type))
goto csum_err;
+ /*
+ * rchecksum_bio sets compression_type on crc from op->crc,
+ * this isn't always correct as sometimes we're changing
+ * an extent from uncompressed to incompressible.
+ */
+ crc.compression_type = compression_type;
+ crc.nonce = nonce;
} else {
if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
bch2_rechecksum_bio(c, src, version, op->crc,
crc.live_size = src_len >> 9;
swap(dst->bi_iter.bi_size, dst_len);
- bch2_encrypt_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
+ ret = bch2_encrypt_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ if (ret)
+ goto err;
+
crc.csum = bch2_checksum_bio(c, op->csum_type,
extent_nonce(version, crc), dst);
crc.csum_type = op->csum_type;
*_dst = dst;
return more;
csum_err:
- bch_err(c, "error verifying existing checksum while "
- "rewriting existing data (memory corruption?)");
+ bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
ret = -EIO;
err:
if (to_wbio(dst)->bounce)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct write_point *wp;
- struct bio *bio;
+ struct bio *bio = NULL;
bool skip_put = true;
unsigned nofs_flags;
int ret;
BKEY_EXTENT_U64s_MAX))
goto flush_io;
- if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
- percpu_ref_is_dying(&c->writes)) {
- ret = -EROFS;
- goto err;
- }
-
/*
* The copygc thread is now global, which means it's no longer
* freeing up space on specific disks, which means that
BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
EBUG_ON(!wp);
- if (unlikely(IS_ERR(wp))) {
- if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
+ if (IS_ERR(wp)) {
+ if (unlikely(wp != ERR_PTR(-EAGAIN))) {
ret = PTR_ERR(wp);
goto err;
}
}
if (c->opts.nochanges ||
- !percpu_ref_tryget(&c->writes)) {
+ !percpu_ref_tryget_live(&c->writes)) {
op->error = -EROFS;
goto err;
}
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
bch2_increment_clock(c, bio_sectors(bio), WRITE);
data_len = min_t(u64, bio->bi_iter.bi_size,
struct rhash_head hash;
struct bpos pos;
- struct migrate_write write;
+ struct data_update write;
struct bio_vec bi_inline_vecs[0]; /* must be last */
};
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
- bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
+ bch2_data_update_exit(&op->write);
promote_free(c, op);
}
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
- struct bch_fs *c = rbio->c;
struct closure *cl = &op->cl;
struct bio *bio = &op->write.op.wbio.bio;
- trace_promote(&rbio->bio);
+ trace_and_count(op->write.op.c, read_promote, &rbio->bio);
/* we now own pages: */
BUG_ON(!rbio->bounce);
sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
- bch2_migrate_read_done(&op->write, rbio);
-
closure_init(cl, NULL);
- closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl);
+ bch2_data_update_read_done(&op->write, rbio->pick.crc, cl);
closure_return_with_destructor(cl, promote_done);
}
unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
int ret;
- if (!percpu_ref_tryget(&c->writes))
+ if (!percpu_ref_tryget_live(&c->writes))
return NULL;
op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
goto err;
rbio_init(&(*rbio)->bio, opts);
- bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
+ bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
GFP_NOIO))
goto err;
bio = &op->write.op.wbio.bio;
- bio_init(bio, bio->bi_inline_vecs, pages);
+ bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
- ret = bch2_migrate_write_init(c, &op->write,
+ ret = bch2_data_update_init(c, &op->write,
writepoint_hashed((unsigned long) current),
opts,
- DATA_PROMOTE,
- (struct data_opts) {
+ (struct data_update_opts) {
.target = opts.promote_target,
- .nr_replicas = 1,
+ .extra_replicas = 1,
+ .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
},
btree_id, k);
BUG_ON(ret);
};
struct bch_io_failures failed = { .nr = 0 };
- trace_read_retry(&rbio->bio);
+ trace_and_count(c, read_retry, &rbio->bio);
if (rbio->retry == READ_RETRY_AVOID)
bch2_mark_io_failure(&failed, &rbio->pick);
struct nonce nonce = extent_nonce(rbio->version, crc);
unsigned nofs_flags;
struct bch_csum csum;
+ int ret;
nofs_flags = memalloc_nofs_save();
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
if (crc_is_compressed(crc)) {
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
goto decompression_err;
} else {
BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
src->bi_iter.bi_size = dst_iter.bi_size;
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
if (rbio->bounce) {
struct bvec_iter src_iter = src->bi_iter;
* Re encrypt data we decrypted, so it's consistent with
* rbio->crc:
*/
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
promote_start(rbio->promote, rbio);
rbio->promote = NULL;
}
}
bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
- "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
+ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
- csum.hi, csum.lo, crc.csum_type);
+ csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
decompression_err:
"decompression error");
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
goto out;
+decrypt_err:
+ bch_err_inum_ratelimited(c, rbio->read_pos.inode,
+ "decrypt error");
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ goto out;
}
static void bch2_read_endio(struct bio *bio)
if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
ptr_stale(ca, &rbio->pick.ptr)) {
- atomic_long_inc(&c->read_realloc_races);
+ trace_and_count(c, read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
}
if (rbio->narrow_crcs ||
+ rbio->promote ||
crc_is_compressed(rbio->pick.crc) ||
bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
struct btree_iter iter;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret;
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf);
-
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)),
+ PTR_BUCKET_POS(c, &ptr),
BTREE_ITER_CACHED);
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+ printbuf_indent_add(&buf, 2);
+ prt_newline(&buf);
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- if (ret)
- return;
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch_err(c, "%s", buf);
- bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
}
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
- if (!pick.ptr.cached &&
+ /*
+ * Stale dirty pointers are treated as IO errors, but @failed isn't
+ * allocated unless we're in the retry path - so if we're not in the
+ * retry path, don't check here, it'll be caught in bch2_read_endio()
+ * and we'll end up in the retry path:
+ */
+ if ((flags & BCH_READ_IN_RETRY) &&
+ !pick.ptr.cached &&
unlikely(ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, k, pick.ptr);
bch2_mark_io_failure(failed, &pick);
} else if (bounce) {
unsigned sectors = pick.crc.compressed_size;
- rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
+ rbio = rbio_init(bio_alloc_bioset(NULL,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ 0,
+ GFP_NOIO,
&c->bio_read_split),
orig->opts);
* from the whole bio, in which case we don't want to retry and
* lose the error)
*/
- rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
- &c->bio_read_split),
+ rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
+ &c->bio_read_split),
orig->opts);
rbio->bio.bi_iter = iter;
rbio->split = true;
rbio->bio.bi_end_io = bch2_read_endio;
if (rbio->bounce)
- trace_read_bounce(&rbio->bio);
+ trace_and_count(c, read_bounce, &rbio->bio);
+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
/*
if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
bio_inc_remaining(&orig->bio);
- trace_read_split(&orig->bio);
+ trace_and_count(c, read_split, &orig->bio);
}
if (!rbio->pick.idx) {
* read_extent -> io_time_reset may cause a transaction restart
* without returning an error, we need to check for that here:
*/
- if (!bch2_trans_relock(&trans)) {
- ret = -EINTR;
+ ret = bch2_trans_relock(&trans);
+ if (ret)
break;
- }
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, bvec_iter.bi_sector));
err:
bch2_trans_iter_exit(&trans, &iter);
- if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ ret == READ_RETRY ||
+ ret == READ_RETRY_AVOID)
goto retry;
bch2_trans_exit(&trans);
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10),
BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11),
BCH_WRITE_DONE = (1 << 12),
+ BCH_WRITE_IO_ERROR = (1 << 13),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
- return op->alloc_reserve == RESERVE_MOVINGGC
+ return op->alloc_reserve == RESERVE_movinggc
? op->c->copygc_wq
: op->c->btree_update_wq;
}
op->compression_type = bch2_compression_opt_to_type[opts.compression];
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
- op->alloc_reserve = RESERVE_NONE;
+ op->alloc_reserve = RESERVE_none;
op->incompressible = 0;
op->open_buckets.nr = 0;
op->devs_have.nr = 0;
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
+#include "journal_sb.h"
#include "journal_seq_blacklist.h"
-#include "super-io.h"
#include <trace/events/bcachefs.h>
-static u64 last_unwritten_seq(struct journal *j)
-{
- union journal_res_state s = READ_ONCE(j->reservations);
+#define x(n) #n,
+static const char * const bch2_journal_watermarks[] = {
+ JOURNAL_WATERMARKS()
+ NULL
+};
- lockdep_assert_held(&j->lock);
-
- return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
-}
+static const char * const bch2_journal_errors[] = {
+ JOURNAL_ERRORS()
+ NULL
+};
+#undef x
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
- return seq >= last_unwritten_seq(j);
+ return seq > j->seq_ondisk;
}
static bool __journal_entry_is_open(union journal_res_state state)
return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
static bool journal_entry_is_open(struct journal *j)
{
return __journal_entry_is_open(j->reservations);
struct journal_buf *buf = NULL;
EBUG_ON(seq > journal_cur_seq(j));
- EBUG_ON(seq == journal_cur_seq(j) &&
- j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
if (journal_seq_unwritten(j, seq)) {
buf = j->buf + (seq & JOURNAL_BUF_MASK);
p->devs.nr = 0;
}
-static void journal_pin_new_entry(struct journal *j)
-{
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for journal_last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- journal_pin_list_init(fifo_push_ref(&j->pin), 1);
-}
-
-static void bch2_journal_buf_init(struct journal *j)
-{
- struct journal_buf *buf = journal_cur_buf(j);
-
- bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
-
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(journal_cur_seq(j));
- buf->data->u64s = 0;
-}
-
-void bch2_journal_halt(struct journal *j)
-{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return;
-
- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- /*
- * XXX: we're not using j->lock here because this can be called from
- * interrupt context, this can race with journal_write_done()
- */
- if (!j->err_seq)
- j->err_seq = journal_cur_seq(j);
- journal_wake(j);
- closure_wake_up(&journal_cur_buf(j)->wait);
-}
-
/* journal entry close/open: */
void __bch2_journal_buf_put(struct journal *j)
* We don't close a journal_buf until the next journal_buf is finished writing,
* and can be opened again - this also initializes the next journal_buf:
*/
-static bool __journal_entry_close(struct journal *j)
+static void __journal_entry_close(struct journal *j, unsigned closed_val)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
u64 v = atomic64_read(&j->reservations.counter);
unsigned sectors;
+ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
+ closed_val != JOURNAL_ENTRY_ERROR_VAL);
+
lockdep_assert_held(&j->lock);
do {
old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
- return true;
+ new.cur_entry_offset = closed_val;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
- /* this entry will never be written: */
- closure_wake_up(&buf->wait);
- return true;
- }
-
- if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
- j->need_write_time = local_clock();
- }
-
- new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
- new.idx++;
-
- if (new.idx == new.unwritten_idx)
- return false;
-
- BUG_ON(journal_state_count(new, new.idx));
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
+ old.cur_entry_offset == new.cur_entry_offset)
+ return;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
+ if (!__journal_entry_is_open(old))
+ return;
+
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
*/
buf->last_seq = journal_last_seq(j);
buf->data->last_seq = cpu_to_le64(buf->last_seq);
+ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
- /* Initialize new buffer: */
- journal_pin_new_entry(j);
-
- bch2_journal_buf_init(j);
-
cancel_delayed_work(&j->write_work);
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
bch2_journal_space_available(j);
bch2_journal_buf_put(j, old.idx);
- return true;
+}
+
+void bch2_journal_halt(struct journal *j)
+{
+ spin_lock(&j->lock);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
+ spin_unlock(&j->lock);
}
static bool journal_entry_want_write(struct journal *j)
{
- union journal_res_state s = READ_ONCE(j->reservations);
- bool ret = false;
+ bool ret = !journal_entry_is_open(j) ||
+ journal_cur_seq(j) == journal_last_unwritten_seq(j);
- /*
- * Don't close it yet if we already have a write in flight, but do set
- * NEED_WRITE:
- */
- if (s.idx != s.unwritten_idx)
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
- else
- ret = __journal_entry_close(j);
+ /* Don't close it yet if we already have a write in flight: */
+ if (ret)
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ else if (nr_unwritten_journal_entries(j)) {
+ struct journal_buf *buf = journal_cur_buf(j);
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+ }
return ret;
}
static int journal_entry_open(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf = journal_cur_buf(j);
+ struct journal_buf *buf = j->buf +
+ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
union journal_res_state old, new;
int u64s;
u64 v;
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
if (j->blocked)
- return cur_entry_blocked;
+ return JOURNAL_ERR_blocked;
if (j->cur_entry_error)
return j->cur_entry_error;
+ if (bch2_journal_error(j))
+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+
+ if (!fifo_free(&j->pin))
+ return JOURNAL_ERR_journal_pin_full;
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1)
+ return JOURNAL_ERR_max_in_flight;
+
BUG_ON(!j->cur_entry_sectors);
+ buf->expires =
+ (journal_cur_seq(j) == j->flushed_seq_ondisk
+ ? jiffies
+ : j->last_flush_write) +
+ msecs_to_jiffies(c->opts.journal_flush_delay);
+
buf->u64s_reserved = j->entry_u64s_reserved;
buf->disk_sectors = j->cur_entry_sectors;
buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
u64s = (int) (buf->sectors << 9) / sizeof(u64) -
journal_entry_overhead(j);
- u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
- if (u64s <= le32_to_cpu(buf->data->u64s))
- return cur_entry_journal_full;
+ if (u64s <= 0)
+ return JOURNAL_ERR_journal_full;
+
+ if (fifo_empty(&j->pin) && j->reclaim_thread)
+ wake_up_process(j->reclaim_thread);
+
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for journal_last_seq() to be calculated correctly
+ */
+ atomic64_inc(&j->seq);
+ journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+
+ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
+
+ bkey_extent_init(&buf->key);
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
+
+ memset(buf->data, 0, sizeof(*buf->data));
+ buf->data->seq = cpu_to_le64(journal_cur_seq(j));
+ buf->data->u64s = 0;
/*
* Must be set before marking the journal entry as open:
do {
old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return cur_entry_insufficient_devices;
+ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ new.idx++;
+ BUG_ON(journal_state_count(new, new.idx));
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
- EBUG_ON(journal_state_count(new, new.idx));
journal_state_inc(&new);
+ new.cur_entry_offset = 0;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
static bool journal_quiesced(struct journal *j)
{
- union journal_res_state s = READ_ONCE(j->reservations);
- bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
+ bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
if (!ret)
journal_entry_close(j);
static void journal_write_work(struct work_struct *work)
{
struct journal *j = container_of(work, struct journal, write_work.work);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ long delta;
+
+ spin_lock(&j->lock);
+ if (!__journal_entry_is_open(j->reservations))
+ goto unlock;
+
+ delta = journal_cur_buf(j)->expires - jiffies;
- journal_entry_close(j);
+ if (delta > 0)
+ mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+unlock:
+ spin_unlock(&j->lock);
}
static int __journal_res_get(struct journal *j, struct journal_res *res,
return 0;
}
- if (!(flags & JOURNAL_RES_GET_RESERVED) &&
- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
+ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) {
/*
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
- ret = cur_entry_journal_full;
+ ret = JOURNAL_ERR_journal_full;
goto unlock;
}
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- if (journal_entry_is_open(j) &&
- !__journal_entry_close(j)) {
- /*
- * We failed to get a reservation on the current open journal
- * entry because it's full, and we can't close it because
- * there's still a previous one in flight:
- */
- trace_journal_entry_full(c);
- ret = cur_entry_blocked;
- } else {
- ret = journal_entry_open(j);
- }
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ ret = journal_entry_open(j);
+
+ if (ret == JOURNAL_ERR_max_in_flight)
+ trace_and_count(c, journal_entry_full, c);
unlock:
- if ((ret && ret != cur_entry_insufficient_devices) &&
+ if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
!j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
- trace_journal_full(c);
+ trace_and_count(c, journal_full, c);
}
can_discard = j->can_discard;
if (!ret)
goto retry;
- if ((ret == cur_entry_journal_full ||
- ret == cur_entry_journal_pin_full) &&
+ if ((ret == JOURNAL_ERR_journal_full ||
+ ret == JOURNAL_ERR_journal_pin_full) &&
!can_discard &&
- j->reservations.idx == j->reservations.unwritten_idx &&
- (flags & JOURNAL_RES_GET_RESERVED)) {
- char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
-
- bch_err(c, "Journal stuck!");
- if (journal_debug_buf) {
- bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
- bch_err(c, "%s", journal_debug_buf);
-
- bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
- bch_err(c, "Journal pins:\n%s", journal_debug_buf);
- kfree(journal_debug_buf);
- }
+ !nr_unwritten_journal_entries(j) &&
+ (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) {
+ struct printbuf buf = PRINTBUF;
+
+ bch_err(c, "Journal stuck! Have a pre-reservation but journal full (ret %s)",
+ bch2_journal_errors[ret]);
+
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "%s", buf.buf);
+ printbuf_reset(&buf);
+ bch2_journal_pins_to_text(&buf, j);
+ bch_err(c, "Journal pins:\n%s", buf.buf);
+
+ printbuf_exit(&buf);
bch2_fatal_error(c);
dump_stack();
}
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
- if ((ret == cur_entry_journal_full ||
- ret == cur_entry_journal_pin_full) &&
+ if ((ret == JOURNAL_ERR_journal_full ||
+ ret == JOURNAL_ERR_journal_pin_full) &&
!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
}
}
- return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
+ return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN;
}
/*
/*
* Not enough room in current journal entry, have to flush it:
*/
- __journal_entry_close(j);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
}
/* if seq was written, but not flushed - flush a newer one instead */
- seq = max(seq, last_unwritten_seq(j));
+ seq = max(seq, journal_last_unwritten_seq(j));
recheck_need_open:
- if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+ if (seq > journal_cur_seq(j)) {
struct journal_res res = { 0 };
+ if (journal_entry_is_open(j))
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+
spin_unlock(&j->lock);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
seq = res.seq;
buf = j->buf + (seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
if (parent && !closure_wait(&buf->wait, parent))
BUG();
return ret ?: ret2 < 0 ? ret2 : 0;
}
-int bch2_journal_meta(struct journal *j)
-{
- struct journal_buf *buf;
- struct journal_res res;
- int ret;
-
- memset(&res, 0, sizeof(res));
-
- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
- if (ret)
- return ret;
-
- buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
- buf->must_flush = true;
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
-
- bch2_journal_res_put(j, &res);
-
- return bch2_journal_flush_seq(j, res.seq);
-}
-
/*
* bch2_journal_flush_async - if there is an open journal entry, or a journal
* still being written, write it and wait for the write to complete
*/
void bch2_journal_flush_async(struct journal *j, struct closure *parent)
{
- u64 seq, journal_seq;
-
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
-
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return;
- }
- spin_unlock(&j->lock);
-
- bch2_journal_flush_seq_async(j, seq, parent);
+ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
}
int bch2_journal_flush(struct journal *j)
{
- u64 seq, journal_seq;
-
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
-
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return 0;
- }
- spin_unlock(&j->lock);
-
- return bch2_journal_flush_seq(j, seq);
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
}
/*
if (seq <= c->journal.flushed_seq_ondisk)
goto out;
- for (unwritten_seq = last_unwritten_seq(j);
+ for (unwritten_seq = journal_last_unwritten_seq(j);
unwritten_seq < seq;
unwritten_seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
/* journal write is already in flight, and was a flush write: */
- if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush)
+ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
goto out;
buf->noflush = true;
return ret;
}
+int bch2_journal_meta(struct journal *j)
+{
+ struct journal_buf *buf;
+ struct journal_res res;
+ int ret;
+
+ memset(&res, 0, sizeof(res));
+
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
+
+ buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+
+ bch2_journal_res_put(j, &res);
+
+ return bch2_journal_flush_seq(j, res.seq);
+}
+
+int bch2_journal_log_msg(struct journal *j, const char *fmt, ...)
+{
+ struct jset_entry_log *entry;
+ struct journal_res res = { 0 };
+ unsigned msglen, u64s;
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ msglen = vsnprintf(NULL, 0, fmt, args) + 1;
+ va_end(args);
+
+ u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64)));
+
+ ret = bch2_journal_res_get(j, &res, u64s, 0);
+ if (ret)
+ return ret;
+
+ entry = container_of(journal_res_entry(j, &res),
+ struct jset_entry_log, entry);
+ memset(entry, 0, u64s * sizeof(u64));
+ entry->entry.type = BCH_JSET_ENTRY_log;
+ entry->entry.u64s = u64s - 1;
+
+ va_start(args, fmt);
+ vsnprintf(entry->d, INT_MAX, fmt, args);
+ va_end(args);
+
+ bch2_journal_res_put(j, &res);
+
+ return bch2_journal_flush_seq(j, res.seq);
+}
+
/* block/unlock the journal: */
void bch2_journal_unblock(struct journal *j)
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+ struct open_bucket **ob = NULL;
+ long *bu = NULL;
+ unsigned i, nr_got = 0, nr_want = nr - ja->nr;
+ unsigned old_nr = ja->nr;
+ unsigned old_discard_idx = ja->discard_idx;
+ unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk;
+ unsigned old_dirty_idx = ja->dirty_idx;
+ unsigned old_cur_idx = ja->cur_idx;
int ret = 0;
- /* don't handle reducing nr of buckets yet: */
- if (nr <= ja->nr)
- return 0;
+ if (c) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_block(&c->journal);
+ }
- new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq) {
+ bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
+ ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
+ new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ if (!bu || !ob || !new_buckets || !new_bucket_seq) {
ret = -ENOMEM;
- goto err;
+ goto err_unblock;
}
- journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets) {
- ret = -ENOSPC;
- goto err;
+ for (nr_got = 0; nr_got < nr_want; nr_got++) {
+ if (new_fs) {
+ bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
+ if (bu[nr_got] < 0) {
+ ret = -BCH_ERR_ENOSPC_bucket_alloc;
+ break;
+ }
+ } else {
+ ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none,
+ false, cl);
+ if (IS_ERR(ob[nr_got])) {
+ ret = cl
+ ? -EAGAIN
+ : -BCH_ERR_ENOSPC_bucket_alloc;
+ break;
+ }
+
+ bu[nr_got] = ob[nr_got]->bucket;
+ }
}
+ if (!nr_got)
+ goto err_unblock;
+
/*
* We may be called from the device add path, before the new device has
* actually been added to the running filesystem:
swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq);
- if (!new_fs)
- spin_unlock(&c->journal.lock);
-
- while (ja->nr < nr) {
- struct open_bucket *ob = NULL;
- unsigned pos;
- long b;
-
- if (new_fs) {
- b = bch2_bucket_alloc_new_fs(ca);
- if (b < 0) {
- ret = -ENOSPC;
- goto err;
- }
- } else {
- rcu_read_lock();
- ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
- false, cl);
- rcu_read_unlock();
- if (IS_ERR(ob)) {
- ret = cl ? -EAGAIN : -ENOSPC;
- goto err;
- }
-
- b = ob->bucket;
- }
-
- if (c)
- spin_lock(&c->journal.lock);
-
- /*
- * XXX
- * For resize at runtime, we should be writing the new
- * superblock before inserting into the journal array
- */
+ for (i = 0; i < nr_got; i++) {
+ unsigned pos = ja->discard_idx ?: ja->nr;
+ long b = bu[i];
- pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
__array_insert_item(ja->buckets, ja->nr, pos);
__array_insert_item(ja->bucket_seq, ja->nr, pos);
- __array_insert_item(journal_buckets->buckets, ja->nr, pos);
ja->nr++;
ja->buckets[pos] = b;
ja->bucket_seq[pos] = 0;
- journal_buckets->buckets[pos] = cpu_to_le64(b);
if (pos <= ja->discard_idx)
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ }
- if (c)
- spin_unlock(&c->journal.lock);
+ ret = bch2_journal_buckets_to_sb(c, ca);
+ if (ret) {
+ /* Revert: */
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+ ja->nr = old_nr;
+ ja->discard_idx = old_discard_idx;
+ ja->dirty_idx_ondisk = old_dirty_idx_ondisk;
+ ja->dirty_idx = old_dirty_idx;
+ ja->cur_idx = old_cur_idx;
+ }
- if (!new_fs) {
- ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_trans_mark_metadata_bucket(&trans, ca,
- b, BCH_DATA_journal,
- ca->mi.bucket_size));
+ if (!new_fs)
+ spin_unlock(&c->journal.lock);
- bch2_open_bucket_put(c, ob);
+ if (c)
+ bch2_journal_unblock(&c->journal);
+
+ if (ret)
+ goto err;
- if (ret)
+ if (!new_fs) {
+ for (i = 0; i < nr_got; i++) {
+ ret = bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(&trans, ca,
+ bu[i], BCH_DATA_journal,
+ ca->mi.bucket_size));
+ if (ret) {
+ bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret);
goto err;
+ }
}
}
err:
- bch2_sb_resize_journal(&ca->disk_sb,
- ja->nr + sizeof(*journal_buckets) / sizeof(u64));
+ if (ob && !new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_open_bucket_put(c, ob[i]);
+
kfree(new_bucket_seq);
kfree(new_buckets);
+ kfree(ob);
+ kfree(bu);
return ret;
+err_unblock:
+ if (c)
+ bch2_journal_unblock(&c->journal);
+ goto err;
}
/*
struct journal_device *ja = &ca->journal;
struct closure cl;
unsigned current_nr;
- int ret;
+ int ret = 0;
+
+ /* don't handle reducing nr of buckets yet: */
+ if (nr < ja->nr)
+ return 0;
closure_init_stack(&cl);
- do {
+ while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) {
struct disk_reservation disk_res = { 0, 0 };
closure_sync(&cl);
* reservation to ensure we'll actually be able to allocate:
*/
- if (bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0);
+ if (ret) {
mutex_unlock(&c->sb_lock);
- return -ENOSPC;
+ return ret;
}
ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
if (ja->nr != current_nr)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- } while (ret == -EAGAIN);
+ }
return ret;
}
int bch2_dev_journal_alloc(struct bch_dev *ca)
{
unsigned nr;
+ int ret;
if (dynamic_fault("bcachefs:add:journal_alloc"))
return -ENOMEM;
min(1 << 13,
(1 << 24) / ca->mi.bucket_size));
- return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+ if (ca->fs)
+ mutex_lock(&ca->fs->sb_lock);
+
+ ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+
+ if (ca->fs)
+ mutex_unlock(&ca->fs->sb_lock);
+
+ return ret;
}
/* startup/shutdown: */
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
- union journal_res_state state;
bool ret = false;
- unsigned i;
+ u64 seq;
spin_lock(&j->lock);
- state = READ_ONCE(j->reservations);
- i = state.idx;
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j) && !ret;
+ seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, seq);
- while (i != state.unwritten_idx) {
- i = (i - 1) & JOURNAL_BUF_MASK;
- if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+ if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx))
ret = true;
}
spin_unlock(&j->lock);
void bch2_fs_journal_stop(struct journal *j)
{
+ bch2_journal_reclaim_stop(j);
bch2_journal_flush_all_pins(j);
wait_event(j->wait, journal_entry_close(j));
BUG_ON(!bch2_journal_error(j) &&
test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
- (journal_entry_is_open(j) ||
- j->last_empty_seq + 1 != journal_cur_seq(j)));
+ j->last_empty_seq != journal_cur_seq(j));
cancel_delayed_work_sync(&j->write_work);
- bch2_journal_reclaim_stop(j);
}
-int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
- struct list_head *journal_entries)
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
- struct journal_replay *i;
+ struct journal_replay *i, **_i;
+ struct genradix_iter iter;
+ bool had_entries = false;
+ unsigned ptr;
u64 last_seq = cur_seq, nr, seq;
- if (!list_empty(journal_entries))
- last_seq = le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay, list)->j.last_seq);
+ genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
+ }
nr = cur_seq - last_seq;
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
j->flushed_seq_ondisk = cur_seq - 1;
+ j->seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
- if (list_empty(journal_entries))
- j->last_empty_seq = cur_seq - 1;
-
fifo_for_each_entry_ptr(p, &j->pin, seq)
journal_pin_list_init(p, 1);
- list_for_each_entry(i, journal_entries, list) {
- unsigned ptr;
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
seq = le64_to_cpu(i->j.seq);
BUG_ON(seq >= cur_seq);
p->devs.nr = 0;
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+
+ had_entries = true;
}
- if (list_empty(journal_entries))
+ if (!had_entries)
j->last_empty_seq = cur_seq;
spin_lock(&j->lock);
set_bit(JOURNAL_STARTED, &j->flags);
j->last_flush_write = jiffies;
- journal_pin_new_entry(j);
-
j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
-
- bch2_journal_buf_init(j);
+ j->reservations.unwritten_idx++;
c->last_bucket_seq_cleanup = journal_cur_seq(j);
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
bch2_sb_get_journal(sb);
- unsigned i;
+ struct bch_sb_field_journal_v2 *journal_buckets_v2 =
+ bch2_sb_get_journal_v2(sb);
+ unsigned i, nr_bvecs;
+
+ ja->nr = 0;
- ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+
+ for (i = 0; i < nr; i++)
+ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
+ } else if (journal_buckets) {
+ ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ }
ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->bucket_seq)
return -ENOMEM;
- ca->journal.bio = bio_kmalloc(GFP_KERNEL,
- DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
+ nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+
+ ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!ca->journal.bio)
return -ENOMEM;
+ bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
return -ENOMEM;
- for (i = 0; i < ja->nr; i++)
- ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+ unsigned j, dst = 0;
+
+ for (i = 0; i < nr; i++)
+ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ ja->buckets[dst++] =
+ le64_to_cpu(journal_buckets_v2->d[i].start) + j;
+ } else if (journal_buckets) {
+ for (i = 0; i < ja->nr; i++)
+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ }
return 0;
}
union journal_res_state s;
struct bch_dev *ca;
unsigned long now = jiffies;
+ u64 seq;
unsigned i;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+ out->atomic++;
+
rcu_read_lock();
s = READ_ONCE(j->reservations);
- pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin));
- pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
- pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
- pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
- pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
- pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- pr_buf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
- pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
- pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
- pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
+ prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
+ prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
+ prt_printf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
+ prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error);
- pr_buf(out, "current entry:\t\t");
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry:\t\t");
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
- pr_buf(out, "error\n");
+ prt_printf(out, "error");
break;
case JOURNAL_ENTRY_CLOSED_VAL:
- pr_buf(out, "closed\n");
+ prt_printf(out, "closed");
break;
default:
- pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
+ prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
break;
}
- pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx));
+ prt_newline(out);
+
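+ /* dump each journal entry that hasn't been written out yet, newest first: */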
+ for (seq = journal_cur_seq(j);
+ seq >= journal_last_unwritten_seq(j);
+ --seq) {
+ i = seq & JOURNAL_BUF_MASK;
+
+ prt_printf(out, "unwritten entry:");
+ prt_tab(out);
+ prt_printf(out, "%llu", seq);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
- i = s.idx;
- while (i != s.unwritten_idx) {
- i = (i - 1) & JOURNAL_BUF_MASK;
+ prt_printf(out, "refcount:");
+ prt_tab(out);
+ prt_printf(out, "%u", journal_state_count(s, i));
+ prt_newline(out);
- pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
- i, journal_state_count(s, i), j->buf[i].sectors);
+ prt_printf(out, "sectors:");
+ prt_tab(out);
+ prt_printf(out, "%u", j->buf[i].sectors);
+ prt_newline(out);
+
+ prt_printf(out, "expires");
+ prt_tab(out);
+ prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
}
- pr_buf(out,
- "need write:\t\t%i\n"
+ prt_printf(out,
"replay done:\t\t%i\n",
- test_bit(JOURNAL_NEED_WRITE, &j->flags),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
- pr_buf(out, "space:\n");
- pr_buf(out, "\tdiscarded\t%u:%u\n",
+ prt_printf(out, "space:\n");
+ prt_printf(out, "\tdiscarded\t%u:%u\n",
j->space[journal_space_discarded].next_entry,
j->space[journal_space_discarded].total);
- pr_buf(out, "\tclean ondisk\t%u:%u\n",
+ prt_printf(out, "\tclean ondisk\t%u:%u\n",
j->space[journal_space_clean_ondisk].next_entry,
j->space[journal_space_clean_ondisk].total);
- pr_buf(out, "\tclean\t\t%u:%u\n",
+ prt_printf(out, "\tclean\t\t%u:%u\n",
j->space[journal_space_clean].next_entry,
j->space[journal_space_clean].total);
- pr_buf(out, "\ttotal\t\t%u:%u\n",
+ prt_printf(out, "\ttotal\t\t%u:%u\n",
j->space[journal_space_total].next_entry,
j->space[journal_space_total].total);
if (!ja->nr)
continue;
- pr_buf(out, "dev %u:\n", i);
- pr_buf(out, "\tnr\t\t%u\n", ja->nr);
- pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
- pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
- pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
- pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
- pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
- pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ prt_printf(out, "dev %u:\n", i);
+ prt_printf(out, "\tnr\t\t%u\n", ja->nr);
+ prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
+ prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+ prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
+ prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
+ prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
+ prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
}
rcu_read_unlock();
+
+ --out->atomic;
}
void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
spin_unlock(&j->lock);
}
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
- u64 i;
spin_lock(&j->lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
- pr_buf(out, "%llu: count %u\n",
- i, atomic_read(&pin_list->count));
+ *seq = max(*seq, j->pin.front);
- list_for_each_entry(pin, &pin_list->list, list)
- pr_buf(out, "\t%px %ps\n",
- pin, pin->flush);
+ if (*seq >= j->pin.back) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ out->atomic++;
+
+ pin_list = journal_seq_pin(j, *seq);
+
+ prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
- if (!list_empty(&pin_list->flushed))
- pr_buf(out, "flushed:\n");
+ list_for_each_entry(pin, &pin_list->list, list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
+ }
+
+ list_for_each_entry(pin, &pin_list->key_cache_list, list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
+ }
+
+ if (!list_empty(&pin_list->flushed)) {
+ prt_printf(out, "flushed:");
+ prt_newline(out);
+ }
- list_for_each_entry(pin, &pin_list->flushed, list)
- pr_buf(out, "\t%px %ps\n",
- pin, pin->flush);
+ list_for_each_entry(pin, &pin_list->flushed, list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
}
+
+ printbuf_indent_sub(out, 2);
+
+ --out->atomic;
spin_unlock(&j->lock);
+
+ return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+ u64 seq = 0;
+
+ while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+ seq++;
}
*/
#include <linux/hash.h>
+#include <linux/prefetch.h>
#include "journal_types.h"
return j->pin.back - 1;
}
-void bch2_journal_set_has_inum(struct journal *, u64, u64);
+static inline u64 journal_last_unwritten_seq(struct journal *j)
+{
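+ /* seq_ondisk is the most recent entry fully written; the next sequence number is the oldest unwritten one */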
+ return j->seq_ondisk + 1;
+}
static inline int journal_state_count(union journal_res_state s, int idx)
{
return vstruct_idx(j->buf[res->idx].data, res->offset);
}
-static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
enum btree_id id, unsigned level,
- const void *data, unsigned u64s)
+ unsigned u64s)
{
entry->u64s = cpu_to_le16(u64s);
entry->btree_id = id;
entry->pad[0] = 0;
entry->pad[1] = 0;
entry->pad[2] = 0;
- memcpy_u64s_small(entry->_data, data, u64s);
-
return jset_u64s(u64s);
}
-static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
- unsigned type, enum btree_id id,
- unsigned level,
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
const void *data, unsigned u64s)
{
- unsigned actual = journal_entry_set(journal_res_entry(j, res),
- type, id, level, data, u64s);
+ unsigned ret = journal_entry_init(entry, type, id, level, u64s);
+
+ memcpy_u64s_small(entry->_data, data, u64s);
+ return ret;
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+ unsigned type, enum btree_id id,
+ unsigned level, unsigned u64s)
+{
+ struct jset_entry *entry = journal_res_entry(j, res);
+ unsigned actual = journal_entry_init(entry, type, id, level, u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
res->offset += actual;
res->u64s -= actual;
-}
-
-static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
- enum btree_id id, unsigned level,
- const struct bkey_i *k)
-{
- bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
- id, level, k, k->k.u64s);
+ return entry;
}
static inline bool journal_entry_empty(struct jset *j)
.buf3_count = idx == 3,
}).v, &j->reservations.counter);
- EBUG_ON(((s.idx - idx) & 3) >
- ((s.idx - s.unwritten_idx) & 3));
-
if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
__bch2_journal_buf_put(j);
}
while (res->u64s)
bch2_journal_add_entry(j, res,
BCH_JSET_ENTRY_btree_keys,
- 0, 0, NULL, 0);
+ 0, 0, 0);
bch2_journal_buf_put(j, res->idx);
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned);
-#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
-#define JOURNAL_RES_GET_CHECK (1 << 1)
-#define JOURNAL_RES_GET_RESERVED (1 << 2)
+/* First two bits for JOURNAL_WATERMARK: */
+#define JOURNAL_RES_GET_NONBLOCK (1 << 2)
+#define JOURNAL_RES_GET_CHECK (1 << 3)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
+ unsigned u64s, offset;
do {
old.v = new.v = v;
+ /*
+ * Round up the end of the journal reservation to the next
+ * cacheline boundary:
+ */
+ u64s = res->u64s;
+ offset = sizeof(struct jset) / sizeof(u64) +
+ new.cur_entry_offset + u64s;
+ u64s += ((offset - 1) & ((SMP_CACHE_BYTES / sizeof(u64)) - 1)) + 1;
+
/*
* Check if there is still room in the current journal
* entry:
*/
- if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
+ if (new.cur_entry_offset + u64s > j->cur_entry_u64s)
return 0;
EBUG_ON(!journal_state_count(new, new.idx));
- if (!(flags & JOURNAL_RES_GET_RESERVED) &&
- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
+ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark)
return 0;
- new.cur_entry_offset += res->u64s;
+ new.cur_entry_offset += u64s;
journal_state_inc(&new);
/*
res->ref = true;
res->idx = old.idx;
+ res->u64s = u64s;
res->offset = old.cur_entry_offset;
res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
+
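+ /* prefetch the part of the journal buffer we just reserved, so copying keys into it doesn't stall: */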
+ offset = res->offset;
+ while (offset < res->offset + res->u64s) {
+ prefetchw(vstruct_idx(j->buf[res->idx].data, offset));
+ offset += SMP_CACHE_BYTES / sizeof(u64);
+ }
return 1;
}
/* journal_preres: */
-static inline bool journal_check_may_get_unreserved(struct journal *j)
+static inline void journal_set_watermark(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
- bool ret = s.reserved < s.remaining &&
- fifo_free(&j->pin) > 8;
-
- lockdep_assert_held(&j->lock);
-
- if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
- if (ret) {
- set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
- journal_wake(j);
- } else {
- clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
- }
- }
- return ret;
+ unsigned watermark = JOURNAL_WATERMARK_any;
+
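+ /* as the pin fifo and prereserved space fill up, only allow progressively higher-priority reservations (copygc, then reserved): */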
+ if (fifo_free(&j->pin) < j->pin.size / 4)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc);
+ if (fifo_free(&j->pin) < j->pin.size / 8)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved);
+
+ if (s.reserved > s.remaining)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc);
+ if (!s.remaining)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved);
+
+ if (watermark == j->watermark)
+ return;
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
}
static inline void bch2_journal_preres_put(struct journal *j,
closure_wake_up(&j->preres_wait);
}
- if (s.reserved <= s.remaining &&
- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
- spin_lock(&j->lock);
- journal_check_may_get_unreserved(j);
- spin_unlock(&j->lock);
- }
+ if (s.reserved <= s.remaining && j->watermark)
+ journal_set_watermark(j);
}
int __bch2_journal_preres_get(struct journal *,
old.v = new.v = v;
ret = 0;
- if ((flags & JOURNAL_RES_GET_RESERVED) ||
- test_bit(JOURNAL_NOCHANGES, &j->flags) ||
+ if ((flags & JOURNAL_WATERMARK_reserved) ||
new.reserved + d < new.remaining) {
new.reserved += d;
ret = 1;
int bch2_journal_flush(struct journal *);
bool bch2_journal_noflush_seq(struct journal *, u64);
int bch2_journal_meta(struct journal *);
+int bch2_journal_log_msg(struct journal *, const char *, ...);
void bch2_journal_halt(struct journal *);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
+int bch2_fs_journal_start(struct journal *, u64);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include <trace/events/bcachefs.h>
-static void __journal_replay_free(struct journal_replay *i)
+static struct nonce journal_nonce(const struct jset *jset)
+{
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = ((__le32 *) &jset->seq)[0],
+ [2] = ((__le32 *) &jset->seq)[1],
+ [3] = BCH_NONCE_JOURNAL,
+ }};
+}
+
+static bool jset_csum_good(struct bch_fs *c, struct jset *j)
+{
+ return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
+ !bch2_crc_cmp(j->csum,
+ csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+}
+
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
- list_del(&i->list);
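+ /* radix index: offset from the base sequence number, truncated to 31 bits (see comment in journal_entry_add()) */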
+ return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+ struct journal_replay *i)
+{
+ struct journal_replay **p =
+ genradix_ptr(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
+
+ BUG_ON(*p != i);
+ *p = NULL;
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
-
}
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
i->ignore = true;
if (!c->opts.read_entire_journal)
- __journal_replay_free(i);
+ __journal_replay_free(c, i);
}
struct journal_list {
struct closure cl;
+ u64 last_seq;
struct mutex lock;
- struct list_head *head;
int ret;
};
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
- struct bch_extent_ptr entry_ptr,
- struct journal_list *jlist, struct jset *j,
- bool bad)
+ struct journal_ptr entry_ptr,
+ struct journal_list *jlist, struct jset *j)
{
- struct journal_replay *i, *pos, *dup = NULL;
- struct bch_extent_ptr *ptr;
- struct list_head *where;
+ struct genradix_iter iter;
+ struct journal_replay **_i, *i, *dup;
+ struct journal_ptr *ptr;
size_t bytes = vstruct_bytes(j);
- u64 last_seq = 0;
+ u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
int ret = JOURNAL_ENTRY_ADD_OK;
- list_for_each_entry_reverse(i, jlist->head, list) {
- if (!JSET_NO_FLUSH(&i->j)) {
- last_seq = le64_to_cpu(i->j.last_seq);
- break;
- }
- }
-
/* Is this entry older than the range we need? */
if (!c->opts.read_entire_journal &&
- le64_to_cpu(j->seq) < last_seq) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
- }
+ le64_to_cpu(j->seq) < jlist->last_seq)
+ return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+
+ /*
+ * genradixes are indexed by a ulong, not a u64, so we can't index them
+ * by sequence number directly: Assume instead that they will all fall
+ * within the range of +-2 billion of the first one we find.
+ */
+ if (!c->journal_entries_base_seq)
+ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
/* Drop entries we don't need anymore */
- if (!JSET_NO_FLUSH(j)) {
- list_for_each_entry_safe(i, pos, jlist->head, list) {
- if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+ if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
+ genradix_for_each_from(&c->journal_entries, iter, _i,
+ journal_entry_radix_idx(c, jlist->last_seq)) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ if (le64_to_cpu(i->j.seq) >= last_seq)
break;
journal_replay_free(c, i);
}
}
- list_for_each_entry_reverse(i, jlist->head, list) {
- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
- where = &i->list;
- goto add;
- }
- }
-
- where = jlist->head;
-add:
- dup = where->next != jlist->head
- ? container_of(where->next, struct journal_replay, list)
- : NULL;
+ jlist->last_seq = max(jlist->last_seq, last_seq);
- if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
- dup = NULL;
+ _i = genradix_ptr_alloc(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
+ GFP_KERNEL);
+ if (!_i)
+ return -ENOMEM;
/*
* Duplicate journal entries? If so we want the one that didn't have a
* checksum error:
*/
+ dup = *_i;
if (dup) {
- if (dup->bad) {
- /* we'll replace @dup: */
- } else if (bad) {
+ if (bytes == vstruct_bytes(&dup->j) &&
+ !memcmp(j, &dup->j, bytes)) {
i = dup;
goto found;
- } else {
- fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
- memcmp(j, &dup->j, bytes), c,
- "found duplicate but non identical journal entries (seq %llu)",
- le64_to_cpu(j->seq));
+ }
+
+ if (!entry_ptr.csum_good) {
i = dup;
goto found;
}
- }
- i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
- if (!i) {
- ret = -ENOMEM;
- goto out;
+ if (!dup->csum_good)
+ goto replace;
+
+ fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
+ le64_to_cpu(j->seq));
+ i = dup;
+ goto found;
}
+replace:
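+ /* either there was no previous copy, or it had a bad checksum and this one is good: allocate a new entry for this jset */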
+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
- i->nr_ptrs = 0;
- i->bad = bad;
+ i->nr_ptrs = 0;
+ i->csum_good = entry_ptr.csum_good;
i->ignore = false;
memcpy(&i->j, j, bytes);
+ i->ptrs[i->nr_ptrs++] = entry_ptr;
if (dup) {
- i->nr_ptrs = dup->nr_ptrs;
- memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
- __journal_replay_free(dup);
+ if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
+ bch_err(c, "found too many copies of journal entry %llu",
+ le64_to_cpu(i->j.seq));
+ dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
+ }
+
+ /* The first ptr should represent the jset we kept: */
+ memcpy(i->ptrs + i->nr_ptrs,
+ dup->ptrs,
+ sizeof(dup->ptrs[0]) * dup->nr_ptrs);
+ i->nr_ptrs += dup->nr_ptrs;
+ __journal_replay_free(c, dup);
}
- list_add(&i->list, where);
+ *_i = i;
+ return 0;
found:
for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
if (ptr->dev == ca->dev_idx) {
return ret;
}
-static struct nonce journal_nonce(const struct jset *jset)
-{
- return (struct nonce) {{
- [0] = 0,
- [1] = ((__le32 *) &jset->seq)[0],
- [2] = ((__le32 *) &jset->seq)[1],
- [3] = BCH_NONCE_JOURNAL,
- }};
-}
-
/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
#define JOURNAL_ENTRY_NONE 6
#define JOURNAL_ENTRY_BAD 7
-#define journal_entry_err(c, msg, ...) \
+static void journal_entry_err_msg(struct printbuf *out,
+ struct jset *jset,
+ struct jset_entry *entry)
+{
+ prt_str(out, "invalid journal entry ");
+ if (entry)
+ prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);
+
+ if (!jset)
+ prt_printf(out, "in superblock");
+ else if (!entry)
+ prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
+ else
+ prt_printf(out, "at offset %zi/%u seq %llu",
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ le64_to_cpu(jset->seq));
+ prt_str(out, ": ");
+}
+
+#define journal_entry_err(c, jset, entry, msg, ...) \
({ \
+ struct printbuf buf = PRINTBUF; \
+ \
+ journal_entry_err_msg(&buf, jset, entry); \
+ prt_printf(&buf, msg, ##__VA_ARGS__); \
+ \
switch (write) { \
case READ: \
- mustfix_fsck_err(c, msg, ##__VA_ARGS__); \
+ mustfix_fsck_err(c, "%s", buf.buf); \
break; \
case WRITE: \
- bch_err(c, "corrupt metadata before write:\n" \
- msg, ##__VA_ARGS__); \
+ bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
if (bch2_fs_inconsistent(c)) { \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ ret = -BCH_ERR_fsck_errors_not_fixed; \
goto fsck_err; \
} \
break; \
} \
+ \
+ printbuf_exit(&buf); \
true; \
})
-#define journal_entry_err_on(cond, c, msg, ...) \
- ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \
+ ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)
#define FSCK_DELETED_KEY 5
-static int journal_validate_key(struct bch_fs *c, const char *where,
+static int journal_validate_key(struct bch_fs *c,
+ struct jset *jset,
struct jset_entry *entry,
unsigned level, enum btree_id btree_id,
- struct bkey_i *k, const char *type,
+ struct bkey_i *k,
unsigned version, int big_endian, int write)
{
void *next = vstruct_next(entry);
- const char *invalid;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (journal_entry_err_on(!k->k.u64s, c,
- "invalid %s in %s entry offset %zi/%u: k->u64s 0",
- type, where,
- (u64 *) k - entry->_data,
- le16_to_cpu(entry->u64s))) {
+ if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
return FSCK_DELETED_KEY;
}
if (journal_entry_err_on((void *) bkey_next(k) >
- (void *) vstruct_next(entry), c,
- "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
- type, where,
- (u64 *) k - entry->_data,
- le16_to_cpu(entry->u64s))) {
+ (void *) vstruct_next(entry),
+ c, jset, entry,
+ "extends past end of journal entry")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
return FSCK_DELETED_KEY;
}
- if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in %s entry offset %zi/%u: bad format %u",
- type, where,
- (u64 *) k - entry->_data,
- le16_to_cpu(entry->u64s),
- k->k.format)) {
+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
+ c, jset, entry,
+ "bad format %u", k->k.format)) {
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
bch2_bkey_compat(level, btree_id, version, big_endian,
write, NULL, bkey_to_packed(k));
- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
- __btree_node_type(level, btree_id));
- if (invalid) {
- char buf[160];
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id), write, &buf)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
+ bch2_jset_entry_types[entry->type],
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ le64_to_cpu(jset->seq));
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+ prt_newline(&buf);
+ bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id), write, &buf);
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
- type, where,
- (u64 *) k - entry->_data,
- le16_to_cpu(entry->u64s),
- invalid, buf);
+ mustfix_fsck_err(c, "%s", buf.buf);
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
+
+ printbuf_exit(&buf);
return FSCK_DELETED_KEY;
}
bch2_bkey_compat(level, btree_id, version, big_endian,
write, NULL, bkey_to_packed(k));
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
static int journal_entry_btree_keys_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
struct bkey_i *k = entry->start;
while (k != vstruct_last(entry)) {
- int ret = journal_validate_key(c, where, entry,
+ int ret = journal_validate_key(c, jset, entry,
entry->level,
entry->btree_id,
- k, "key", version, big_endian, write);
+ k, version, big_endian, write);
if (ret == FSCK_DELETED_KEY)
continue;
vstruct_for_each(entry, k) {
if (!first) {
- printbuf_newline(out);
- pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_newline(out);
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
}
- pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+ prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
first = false;
}
}
static int journal_entry_btree_root_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
int ret = 0;
if (journal_entry_err_on(!entry->u64s ||
- le16_to_cpu(entry->u64s) != k->k.u64s, c,
+ le16_to_cpu(entry->u64s) != k->k.u64s,
+ c, jset, entry,
"invalid btree root journal entry: wrong number of keys")) {
void *next = vstruct_next(entry);
/*
return 0;
}
- return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
- "btree root", version, big_endian, write);
+ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
+ version, big_endian, write);
fsck_err:
return ret;
}
}
static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
}
static int journal_entry_blacklist_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
int ret = 0;
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
+ c, jset, entry,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
struct jset_entry_blacklist *bl =
container_of(entry, struct jset_entry_blacklist, entry);
- pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq));
+ prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
+ c, jset, entry,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
goto out;
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
- le64_to_cpu(bl_entry->end), c,
+ le64_to_cpu(bl_entry->end),
+ c, jset, entry,
"invalid journal seq blacklist entry: start > end")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
struct jset_entry_blacklist_v2 *bl =
container_of(entry, struct jset_entry_blacklist_v2, entry);
- pr_buf(out, "start=%llu end=%llu",
+ prt_printf(out, "start=%llu end=%llu",
le64_to_cpu(bl->start),
le64_to_cpu(bl->end));
}
static int journal_entry_usage_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
int ret = 0;
if (journal_entry_err_on(bytes < sizeof(*u),
- c,
+ c, jset, entry,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- pr_buf(out, "type=%s v=%llu",
+ prt_printf(out, "type=%s v=%llu",
bch2_fs_usage_types[u->entry.btree_id],
le64_to_cpu(u->v));
}
static int journal_entry_data_usage_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
if (journal_entry_err_on(bytes < sizeof(*u) ||
bytes < sizeof(*u) + u->r.nr_devs,
- c,
+ c, jset, entry,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
container_of(entry, struct jset_entry_data_usage, entry);
bch2_replicas_entry_to_text(out, &u->r);
- pr_buf(out, "=%llu", le64_to_cpu(u->v));
+ prt_printf(out, "=%llu", le64_to_cpu(u->v));
}
static int journal_entry_clock_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
int ret = 0;
if (journal_entry_err_on(bytes != sizeof(*clock),
- c, "invalid journal entry clock: bad size")) {
+ c, jset, entry, "bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(clock->rw > 1,
- c, "invalid journal entry clock: bad rw")) {
+ c, jset, entry, "bad rw")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
- pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+ prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}
static int journal_entry_dev_usage_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
int ret = 0;
if (journal_entry_err_on(bytes < expected,
- c, "invalid journal entry dev usage: bad size (%u < %u)",
+ c, jset, entry, "bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
dev = le32_to_cpu(u->dev);
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
- c, "invalid journal entry dev usage: bad dev")) {
+ c, jset, entry, "bad dev")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(u->pad,
- c, "invalid journal entry dev usage: bad pad")) {
+ c, jset, entry, "bad pad")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
- pr_buf(out, "dev=%u", le32_to_cpu(u->dev));
+ prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
for (i = 0; i < nr_types; i++) {
if (i < BCH_DATA_NR)
- pr_buf(out, " %s", bch2_data_types[i]);
+ prt_printf(out, " %s", bch2_data_types[i]);
else
- pr_buf(out, " (unknown data type %u)", i);
- pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+ prt_printf(out, " (unknown data type %u)", i);
+ prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented));
}
- pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu",
- le64_to_cpu(u->buckets_ec),
- le64_to_cpu(u->buckets_unavailable));
+ prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}
static int journal_entry_log_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
- bch_scnmemcpy(out, l->d, strnlen(l->d, bytes));
+ prt_printf(out, "%.*s", bytes, l->d);
+}
+
+static int journal_entry_overwrite_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write);
+}
+
+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
}
struct jset_entry_ops {
- int (*validate)(struct bch_fs *, const char *,
+ int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int, int);
void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
#undef x
};
-int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
+int bch2_journal_entry_validate(struct bch_fs *c,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
return entry->type < BCH_JSET_ENTRY_NR
- ? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
+ ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
version, big_endian, write)
: 0;
}
struct jset_entry *entry)
{
if (entry->type < BCH_JSET_ENTRY_NR) {
- pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
} else {
- pr_buf(out, "(unknown type %u)", entry->type);
+ prt_printf(out, "(unknown type %u)", entry->type);
}
}
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
int write)
{
- char buf[100];
struct jset_entry *entry;
int ret = 0;
vstruct_for_each(jset, entry) {
- scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
- le64_to_cpu(jset->seq),
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s));
-
if (journal_entry_err_on(vstruct_next(entry) >
- vstruct_last(jset), c,
+ vstruct_last(jset), c, jset, entry,
"journal entry extends past end of jset")) {
jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
break;
}
- ret = bch2_journal_entry_validate(c, buf, entry,
+ ret = bch2_journal_entry_validate(c, jset, entry,
le32_to_cpu(jset->version),
JSET_BIG_ENDIAN(jset), write);
if (ret)
static int jset_validate(struct bch_fs *c,
struct bch_dev *ca,
struct jset *jset, u64 sector,
- unsigned bucket_sectors_left,
- unsigned sectors_read,
int write)
{
- size_t bytes = vstruct_bytes(jset);
- struct bch_csum csum;
unsigned version;
int ret = 0;
version = le32_to_cpu(jset->version);
if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
version < bcachefs_metadata_version_min) ||
- version >= bcachefs_metadata_version_max, c,
+ version >= bcachefs_metadata_version_max,
+ c, jset, NULL,
"%s sector %llu seq %llu: unknown journal entry version %u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
version)) {
/* don't try to continue: */
- return EINVAL;
+ return -EINVAL;
}
- if (bytes > (sectors_read << 9) &&
- sectors_read < bucket_sectors_left)
- return JOURNAL_ENTRY_REREAD;
-
- if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
- "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq), bytes)) {
- ret = JOURNAL_ENTRY_BAD;
- le32_add_cpu(&jset->u64s,
- -((bytes - (bucket_sectors_left << 9)) / 8));
- }
-
- if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
+ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
+ c, jset, NULL,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
- JSET_CSUM_TYPE(jset))) {
+ JSET_CSUM_TYPE(jset)))
ret = JOURNAL_ENTRY_BAD;
- goto csum_done;
- }
- if (write)
- goto csum_done;
-
- csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
- if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
- "%s sector %llu seq %llu: journal checksum bad",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq)))
- ret = JOURNAL_ENTRY_BAD;
-
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
- jset->encrypted_start,
- vstruct_end(jset) - (void *) jset->encrypted_start);
-csum_done:
/* last_seq is ignored when JSET_NO_FLUSH is true */
if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
- le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
+ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
+ c, jset, NULL,
"invalid journal entry: last_seq > seq (%llu > %llu)",
le64_to_cpu(jset->last_seq),
le64_to_cpu(jset->seq))) {
jset->last_seq = jset->seq;
return JOURNAL_ENTRY_BAD;
}
+
+ ret = jset_validate_entries(c, jset, write);
fsck_err:
return ret;
}
-static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
+static int jset_validate_early(struct bch_fs *c,
+ struct bch_dev *ca,
+ struct jset *jset, u64 sector,
+ unsigned bucket_sectors_left,
+ unsigned sectors_read)
{
- unsigned sectors = vstruct_sectors(jset, c->block_bits);
+ size_t bytes = vstruct_bytes(jset);
+ unsigned version;
+ int write = READ;
+ int ret = 0;
+
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
+ return JOURNAL_ENTRY_NONE;
- return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
- jset_validate_entries(c, jset, WRITE);
+ version = le32_to_cpu(jset->version);
+ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
+ version < bcachefs_metadata_version_min) ||
+ version >= bcachefs_metadata_version_max,
+ c, jset, NULL,
+ "%s sector %llu seq %llu: unknown journal entry version %u",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ version)) {
+ /* don't try to continue: */
+ return -EINVAL;
+ }
+
+ if (bytes > (sectors_read << 9) &&
+ sectors_read < bucket_sectors_left)
+ return JOURNAL_ENTRY_REREAD;
+
+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
+ c, jset, NULL,
+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq), bytes))
+ le32_add_cpu(&jset->u64s,
+ -((bytes - (bucket_sectors_left << 9)) / 8));
+fsck_err:
+ return ret;
}
struct journal_read_buf {
unsigned sectors, sectors_read = 0;
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
- bool saw_bad = false;
+ bool saw_bad = false, csum_good;
int ret = 0;
pr_debug("reading %u", bucket);
while (offset < end) {
if (!sectors_read) {
struct bio *bio;
+ unsigned nr_bvecs;
reread:
sectors_read = min_t(unsigned,
end - offset, buf->size >> 9);
+ nr_bvecs = buf_pages(buf->data, sectors_read << 9);
- bio = bio_kmalloc(GFP_KERNEL,
- buf_pages(buf->data,
- sectors_read << 9));
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = offset;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
+
+ bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);
ret = submit_bio_wait(bio);
- bio_put(bio);
+ kfree(bio);
if (bch2_dev_io_err_on(ret, ca,
"journal read error: sector %llu",
j = buf->data;
}
- ret = jset_validate(c, ca, j, offset,
- end - offset, sectors_read,
- READ);
+ ret = jset_validate_early(c, ca, j, offset,
+ end - offset, sectors_read);
switch (ret) {
- case BCH_FSCK_OK:
+ case 0:
sectors = vstruct_sectors(j, c->block_bits);
break;
case JOURNAL_ENTRY_REREAD:
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
return 0;
- sectors = block_sectors(c);
- goto next_block;
- case JOURNAL_ENTRY_BAD:
- saw_bad = true;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
* again at next block boundary:
*/
sectors = block_sectors(c);
- break;
+ goto next_block;
default:
return ret;
}
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+ csum_good = jset_csum_good(c, j);
+ if (!csum_good)
+ saw_bad = true;
+
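+ /* decrypt the entry payload in place (bch2_encrypt() with the same nonce reverses the encryption) before adding it: */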
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+ j->encrypted_start,
+ vstruct_end(j) - (void *) j->encrypted_start);
+ bch2_fs_fatal_err_on(ret, c,
+ "error decrypting journal entry: %i", ret);
+
mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, (struct bch_extent_ptr) {
- .dev = ca->dev_idx,
- .offset = offset,
- }, jlist, j, ret != 0);
+ ret = journal_entry_add(c, ca, (struct journal_ptr) {
+ .csum_good = csum_good,
+ .dev = ca->dev_idx,
+ .bucket = bucket,
+ .bucket_offset = offset -
+ bucket_to_sector(ca, ja->buckets[bucket]),
+ .sector = offset,
+ }, jlist, j);
mutex_unlock(&jlist->lock);
switch (ret) {
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
+ struct journal_replay *r, **_r;
+ struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
u64 min_seq = U64_MAX;
unsigned i;
* allocate
*/
while (ja->bucket_seq[ja->cur_idx] > min_seq &&
- ja->bucket_seq[ja->cur_idx] >
+ ja->bucket_seq[ja->cur_idx] ==
ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->sectors_free = 0;
+ ja->sectors_free = ca->mi.bucket_size;
+
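+ /* scan the entries we read to work out how much of the current journal bucket has already been written: */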
+ mutex_lock(&jlist->lock);
+ genradix_for_each(&c->journal_entries, iter, _r) {
+ r = *_r;
+
+ if (!r)
+ continue;
+
+ for (i = 0; i < r->nr_ptrs; i++) {
+ if (r->ptrs[i].dev == ca->dev_idx &&
+ sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
+ unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+ vstruct_sectors(&r->j, c->block_bits);
+
+ ja->sectors_free = min(ja->sectors_free,
+ ca->mi.bucket_size - wrote);
+ }
+ }
+ }
+ mutex_unlock(&jlist->lock);
+
+ if (ja->bucket_seq[ja->cur_idx] &&
+ ja->sectors_free == ca->mi.bucket_size) {
+ bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
+ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
+ for (i = 0; i < 3; i++) {
+ unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
+ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
+ }
+ ja->sectors_free = 0;
+ }
/*
* Set dirty_idx to indicate the entire journal is full and needs to be
goto out;
}
-static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
{
unsigned i;
struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
u64 offset;
- div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
+ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
if (i)
- pr_buf(out, " ");
- pr_buf(out, "%u:%llu (offset %llu)",
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
j->ptrs[i].dev,
- (u64) j->ptrs[i].offset, offset);
+ j->ptrs[i].bucket,
+ j->ptrs[i].bucket_offset,
+ j->ptrs[i].sector);
}
}
-int bch2_journal_read(struct bch_fs *c, struct list_head *list,
- u64 *blacklist_seq, u64 *start_seq)
+int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i, *t;
+ struct journal_replay *i, **_i, *prev = NULL;
+ struct genradix_iter radix_iter;
struct bch_dev *ca;
unsigned iter;
+ struct printbuf buf = PRINTBUF;
size_t keys = 0, entries = 0;
bool degraded = false;
u64 seq, last_seq = 0;
closure_init_stack(&jlist.cl);
mutex_init(&jlist.lock);
- jlist.head = list;
+ jlist.last_seq = 0;
jlist.ret = 0;
for_each_member_device(ca, c, iter) {
- if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+ if (!c->opts.fsck &&
!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
if (jlist.ret)
return jlist.ret;
- if (list_empty(list)) {
- bch_info(c, "journal read done, but no entries found");
- return 0;
- }
-
- i = list_last_entry(list, struct journal_replay, list);
- *start_seq = le64_to_cpu(i->j.seq) + 1;
+ *start_seq = 0;
/*
* Find most recent flush entry, and ignore newer non flush entries -
* those entries will be blacklisted:
*/
- list_for_each_entry_safe_reverse(i, t, list, list) {
- if (i->ignore)
+ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
+ if (!*start_seq)
+ *start_seq = le64_to_cpu(i->j.seq) + 1;
+
if (!JSET_NO_FLUSH(&i->j)) {
+ int write = READ;
+ if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+ c, &i->j, NULL,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(i->j.last_seq),
+ le64_to_cpu(i->j.seq)))
+ i->j.last_seq = i->j.seq;
+
last_seq = le64_to_cpu(i->j.last_seq);
*blacklist_seq = le64_to_cpu(i->j.seq) + 1;
break;
journal_replay_free(c, i);
}
+ if (!*start_seq) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
+ }
+
if (!last_seq) {
fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
- return -1;
+ ret = -1;
+ goto err;
}
/* Drop blacklisted entries and entries older than last_seq: */
- list_for_each_entry_safe(i, t, list, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
seq = le64_to_cpu(i->j.seq);
/* Check for missing entries: */
seq = last_seq;
- list_for_each_entry(i, list, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
while (seq < le64_to_cpu(i->j.seq)) {
u64 missing_start, missing_end;
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
while (seq < le64_to_cpu(i->j.seq) &&
bch2_journal_seq_is_blacklisted(c, seq, false))
!bch2_journal_seq_is_blacklisted(c, seq, false))
seq++;
- if (i->list.prev != list) {
- struct printbuf out = PBUF(buf1);
- struct journal_replay *p = list_prev_entry(i, list);
-
- bch2_journal_ptrs_to_text(&out, c, p);
- pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits));
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf1, c, prev);
+ prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
} else
- sprintf(buf1, "(none)");
- bch2_journal_ptrs_to_text(&PBUF(buf2), c, i);
+ prt_printf(&buf1, "(none)");
+ bch2_journal_ptrs_to_text(&buf2, c, i);
missing_end = seq - 1;
fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
" next at %s",
missing_start, missing_end,
last_seq, *blacklist_seq - 1,
- buf1, buf2);
+ buf1.buf, buf2.buf);
+
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
}
+ prev = i;
seq++;
}
- list_for_each_entry(i, list, list) {
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct bch_replicas_padded replicas = {
.e.nr_required = 1,
};
unsigned ptr;
- char buf[80];
- if (i->ignore)
+ i = *_i;
+ if (!i || i->ignore)
continue;
- ret = jset_validate_entries(c, &i->j, READ);
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+
+ if (!i->ptrs[ptr].csum_good)
+ printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n",
+ ca->name, i->ptrs[ptr].sector,
+ le64_to_cpu(i->j.seq),
+ i->csum_good ? " (had good copy on another device)" : "");
+ }
+
+ ret = jset_validate(c,
+ bch_dev_bkey_exists(c, i->ptrs[0].dev),
+ &i->j,
+ i->ptrs[0].sector,
+ READ);
if (ret)
- goto fsck_err;
+ goto err;
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
* the devices - this is wrong:
*/
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, &replicas.e);
+
if (!degraded &&
- (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
- "superblock not marked as containing replicas %s",
- (bch2_replicas_entry_to_text(&PBUF(buf),
- &replicas.e), buf)))) {
+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
+ "superblock not marked as containing replicas %s",
+ buf.buf)) {
ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
- return ret;
+ goto err;
}
for_each_jset_key(k, _n, entry, &i->j)
if (*start_seq != *blacklist_seq)
bch_info(c, "dropped unflushed entries %llu-%llu",
*blacklist_seq, *start_seq - 1);
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}
-static void journal_write_compact(struct jset *jset)
-{
- struct jset_entry *i, *next, *prev = NULL;
-
- /*
- * Simple compaction, dropping empty jset_entries (from journal
- * reservations that weren't fully used) and merging jset_entries that
- * can be.
- *
- * If we wanted to be really fancy here, we could sort all the keys in
- * the jset and drop keys that were overwritten - probably not worth it:
- */
- vstruct_for_each_safe(jset, i, next) {
- unsigned u64s = le16_to_cpu(i->u64s);
-
- /* Empty entry: */
- if (!u64s)
- continue;
-
- /* Can we merge with previous entry? */
- if (prev &&
- i->btree_id == prev->btree_id &&
- i->level == prev->level &&
- i->type == prev->type &&
- i->type == BCH_JSET_ENTRY_btree_keys &&
- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(vstruct_next(prev),
- i->_data,
- u64s);
- le16_add_cpu(&prev->u64s, u64s);
- continue;
- }
-
- /* Couldn't merge, move i into new position (after prev): */
- prev = prev ? vstruct_next(prev) : jset->start;
- if (i != prev)
- memmove_u64s_down(prev, i, jset_u64s(u64s));
- }
-
- prev = prev ? vstruct_next(prev) : jset->start;
- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
/* we aren't holding j->lock: */
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
- return j->buf + j->reservations.unwritten_idx;
+ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}
static void journal_write_done(struct closure *cl)
journal_seq_pin(j, seq)->devs = w->devs_written;
if (!err) {
- j->seq_ondisk = seq;
-
if (!JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
+
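+ /* advancing last_seq_ondisk may allow freed buckets to be discarded and reused: */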
+ bch2_do_discards(c);
+ closure_wake_up(&c->freelist_wait);
}
} else if (!j->err_seq || seq < j->err_seq)
j->err_seq = seq;
+ j->seq_ondisk = seq;
+
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* more buckets:
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
- journal_reclaim_kick(&c->journal);
+ if (j->watermark)
+ journal_reclaim_kick(&c->journal);
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
- BUG_ON(new.idx == new.unwritten_idx);
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
closure_wake_up(&w->wait);
journal_wake(j);
- if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
- mod_delayed_work(c->io_complete_wq, &j->write_work, 0);
- spin_unlock(&j->lock);
-
- if (new.unwritten_idx != new.idx &&
- !journal_state_count(new, new.unwritten_idx))
+ if (!journal_state_count(new, new.unwritten_idx) &&
+ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+ struct journal_buf *buf = journal_cur_buf(j);
+ long delta = buf->expires - jiffies;
+
+ /*
+ * We don't close a journal entry to write it while there are
+ * previous entries still in flight - the current journal entry
+ * might want to be written now:
+ */
+
+ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+ }
+
+ spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
sectors);
bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
- bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
ca->prev_journal_sector = bio->bi_iter.bi_sector;
bch2_bio_map(bio, w->data, sectors << 9);
- trace_journal_write(bio);
+ trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
ca->journal.bucket_seq[ca->journal.cur_idx] =
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
- char *journal_debug_buf = NULL;
+ struct printbuf journal_debug_buf = PRINTBUF;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
j->write_start_time = local_clock();
spin_lock(&j->lock);
- if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
- (w->noflush ||
- (!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
+ if (bch2_journal_error(j) ||
+ w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
le32_add_cpu(&jset->u64s, u64s);
BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
- journal_write_compact(jset);
-
jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
+ jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
validate_before_checksum = true;
if (validate_before_checksum &&
- jset_validate_for_write(c, jset))
+ jset_validate(c, NULL, jset, 0, WRITE))
goto err;
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting journal entry: %i", ret))
+ goto err;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
if (!validate_before_checksum &&
- jset_validate_for_write(c, jset))
+ jset_validate(c, NULL, jset, 0, WRITE))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
goto retry_alloc;
}
- if (ret) {
- journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
- if (journal_debug_buf)
- __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
- }
+ if (ret)
+ __bch2_journal_debug_to_text(&journal_debug_buf, j);
/*
* write is allocated, no longer need to account for it in
if (ret) {
bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf);
- kfree(journal_debug_buf);
+ journal_debug_buf.buf);
+ printbuf_exit(&journal_debug_buf);
bch2_fatal_error(c);
continue_at(cl, journal_write_done, c->io_complete_wq);
return;
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
- if (test_bit(JOURNAL_NOCHANGES, &j->flags))
+ if (c->opts.nochanges)
goto no_io;
for_each_rw_member(ca, c, i)
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_FLUSH;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
* during cache_registration
*/
struct journal_replay {
- struct list_head list;
- struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+ struct journal_ptr {
+ bool csum_good;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+ } ptrs[BCH_REPLICAS_MAX];
unsigned nr_ptrs;
- /* checksum error, but we may want to try using it anyways: */
- bool bad;
+ bool csum_good;
bool ignore;
/* must be last: */
struct jset j;
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
-int bch2_journal_entry_validate(struct bch_fs *, const char *,
+int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int, int);
void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
struct jset_entry *);
-int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
+void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct journal_replay *);
+
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *);
void bch2_journal_write(struct closure *);
#include "bcachefs.h"
#include "btree_key_cache.h"
+#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
struct journal_device *ja,
enum journal_space_from from)
{
- unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags)
- ? ((journal_space_from(ja, from) -
- ja->cur_idx - 1 + ja->nr) % ja->nr)
- : ja->nr;
+ unsigned available = (journal_space_from(ja, from) -
+ ja->cur_idx - 1 + ja->nr) % ja->nr;
/*
* Don't use the last bucket unless writing the new last_seq
old.v, new.v)) != old.v);
}
-static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
-{
- unsigned sectors = 0;
-
- while (!sectors && *idx != j->reservations.idx) {
- sectors = j->buf[*idx].sectors;
-
- *idx = (*idx + 1) & JOURNAL_BUF_MASK;
- }
-
- return sectors;
-}
-
static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
enum journal_space_from from)
{
struct journal_device *ja = &ca->journal;
- unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
+ unsigned sectors, buckets, unwritten;
+ u64 seq;
if (from == journal_space_total)
return (struct journal_space) {
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
- while ((unwritten = get_unwritten_sectors(j, &idx))) {
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
+
+ if (!unwritten)
+ continue;
+
/* entry won't fit on this device, skip: */
if (unwritten > ca->mi.bucket_size)
continue;
j->can_discard = can_discard;
if (nr_online < c->opts.metadata_replicas_required) {
- ret = cur_entry_insufficient_devices;
+ ret = JOURNAL_ERR_insufficient_devices;
goto out;
}
total = j->space[journal_space_total].total;
if (!clean_ondisk &&
- j->reservations.idx ==
- j->reservations.unwritten_idx) {
- char *buf = kmalloc(4096, GFP_ATOMIC);
-
- bch_err(c, "journal stuck");
- if (buf) {
- __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
- pr_err("\n%s", buf);
- kfree(buf);
- }
+ journal_cur_seq(j) == j->seq_ondisk) {
+ struct printbuf buf = PRINTBUF;
+ __bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "journal stuck\n%s", buf.buf);
+ printbuf_exit(&buf);
+
+ /*
+ * Hack: bch2_fatal_error() calls bch2_journal_halt() which
+ * takes journal lock:
+ */
+ spin_unlock(&j->lock);
bch2_fatal_error(c);
- ret = cur_entry_journal_stuck;
+ spin_lock(&j->lock);
+
+ ret = JOURNAL_ERR_journal_stuck;
} else if (!j->space[journal_space_discarded].next_entry)
- ret = cur_entry_journal_full;
- else if (!fifo_free(&j->pin))
- ret = cur_entry_journal_pin_full;
+ ret = JOURNAL_ERR_journal_full;
if ((j->space[journal_space_clean_ondisk].next_entry <
j->space[journal_space_clean_ondisk].total) &&
(clean - clean_ondisk <= total / 8) &&
- (clean_ondisk * 2 > clean ))
+ (clean_ondisk * 2 > clean))
set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
- journal_check_may_get_unreserved(j);
+ journal_set_watermark(j);
if (!ret)
journal_wake(j);
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ if (!c->opts.nochanges &&
+ ca->mi.discard &&
+ bdev_max_discard_sectors(ca->disk_sb.bdev))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
ja->buckets[ja->discard_idx]),
- ca->mi.bucket_size, GFP_NOIO, 0);
+ ca->mi.bucket_size, GFP_NOIO);
spin_lock(&j->lock);
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
list_del_init(&pin->list);
/*
- * Unpinning a journal entry make make journal_next_bucket() succeed, if
+ * Unpinning a journal entry may make journal_next_bucket() succeed if
* writing a new last_seq will now make another bucket available:
*/
if (atomic_dec_and_test(&pin_list->count) &&
pin_list == &fifo_peek_front(&j->pin))
bch2_journal_reclaim_fast(j);
- else if (fifo_used(&j->pin) == 1 &&
- atomic_read(&pin_list->count) == 1)
- journal_wake(j);
}
void bch2_journal_pin_drop(struct journal *j,
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
-static int __bch2_journal_reclaim(struct journal *j, bool direct)
+static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool kthread = (current->flags & PF_KTHREAD) != 0;
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
min_nr = 1;
- trace_journal_reclaim_start(c,
- min_nr,
+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
+
+ trace_and_count(c, journal_reclaim_start, c,
+ direct, kicked,
+ min_nr, min_key_cache,
j->prereserved.reserved,
j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
- min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
-
nr_flushed = journal_flush_pins(j, seq_to_flush,
min_nr, min_key_cache);
j->nr_direct_reclaim += nr_flushed;
else
j->nr_background_reclaim += nr_flushed;
- trace_journal_reclaim_finish(c, nr_flushed);
+ trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
if (nr_flushed)
wake_up(&j->reclaim_wait);
- } while ((min_nr || min_key_cache) && !direct);
+ } while ((min_nr || min_key_cache) && nr_flushed && !direct);
memalloc_noreclaim_restore(flags);
int bch2_journal_reclaim(struct journal *j)
{
- return __bch2_journal_reclaim(j, true);
+ return __bch2_journal_reclaim(j, true, true);
}
static int bch2_journal_reclaim_thread(void *arg)
struct journal *j = arg;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned long delay, now;
+ bool journal_empty;
int ret = 0;
set_freezable();
j->last_flushed = jiffies;
while (!ret && !kthread_should_stop()) {
+ bool kicked = j->reclaim_kicked;
+
j->reclaim_kicked = false;
mutex_lock(&j->reclaim_lock);
- ret = __bch2_journal_reclaim(j, false);
+ ret = __bch2_journal_reclaim(j, false, kicked);
mutex_unlock(&j->reclaim_lock);
now = jiffies;
break;
if (j->reclaim_kicked)
break;
- if (time_after_eq(jiffies, j->next_reclaim))
- break;
- freezable_schedule_timeout(j->next_reclaim - jiffies);
+ spin_lock(&j->lock);
+ journal_empty = fifo_empty(&j->pin);
+ spin_unlock(&j->lock);
+
+ if (journal_empty)
+ freezable_schedule();
+ else if (time_after(j->next_reclaim, jiffies))
+ freezable_schedule_timeout(j->next_reclaim - jiffies);
+ else
+ break;
}
__set_current_state(TASK_RUNNING);
}
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct task_struct *p;
+ int ret;
if (j->reclaim_thread)
return 0;
p = kthread_create(bch2_journal_reclaim_thread, j,
"bch-reclaim/%s", c->name);
- if (IS_ERR(p)) {
- bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p));
- return PTR_ERR(p);
+ ret = PTR_ERR_OR_ZERO(p);
+ if (ret) {
+ bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret));
+ return ret;
}
get_task_struct(p);
mutex_lock(&j->reclaim_lock);
- *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0;
+ if (journal_flush_pins(j, seq_to_flush, 0, 0))
+ *did_work = true;
spin_lock(&j->lock);
/*
*/
ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
- (fifo_used(&j->pin) == 1 &&
- atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+ !fifo_used(&j->pin);
spin_unlock(&j->lock);
mutex_unlock(&j->reclaim_lock);
seq = 0;
spin_lock(&j->lock);
- while (!ret && seq < j->pin.back) {
+ while (!ret) {
struct bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
+ if (seq >= j->pin.back)
+ break;
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
journal_seq_pin(j, seq)->devs);
seq++;
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal_sb.h"
+#include "darray.h"
+
+#include <linux/sort.h>
+
+/* BCH_SB_FIELD_journal: */
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+ const u64 *l = _l;
+ const u64 *r = _r;
+
+ return cmp_int(*l, *r);
+}
+
+static int bch2_sb_journal_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+ int ret = -EINVAL;
+ unsigned nr;
+ unsigned i;
+ u64 *b;
+
+ nr = bch2_nr_journal_buckets(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ for (i = 0; i < nr; i++)
+ b[i] = le64_to_cpu(journal->buckets[i]);
+
+ sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+ if (!b[0]) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0] < le16_to_cpu(m->first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0], le16_to_cpu(m->first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1], le64_to_cpu(m->nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (b[i] == b[i + 1]) {
+ prt_printf(err, "duplicate journal buckets %llu", b[i]);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ unsigned i, nr = bch2_nr_journal_buckets(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+ .validate = bch2_sb_journal_validate,
+ .to_text = bch2_sb_journal_to_text,
+};
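
bch_sb_field_ops acts as a per-field-type vtable: each superblock field type supplies its own validate and to_text callbacks, and callers dispatch through a table of these ops. A minimal standalone sketch of that dispatch pattern, with made-up field types rather than the real bcachefs tables:

#include <stdio.h>

struct field { int type; };

struct field_ops {
	int  (*validate)(const struct field *);
	void (*to_text)(const struct field *);
};

static int  journal_validate(const struct field *f) { (void) f; return 0; }
static void journal_to_text(const struct field *f) { (void) f; printf("journal field\n"); }

/* One ops struct per field type, indexed by the type tag. */
static const struct field_ops field_ops[] = {
	[0] = { .validate = journal_validate, .to_text = journal_to_text },
};

int main(void)
{
	struct field f = { .type = 0 };
	const struct field_ops *ops = &field_ops[f.type];

	if (!ops->validate(&f))		/* 0 means the field is valid */
		ops->to_text(&f);
	return 0;
}
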
+
+struct u64_range {
+ u64 start;
+ u64 end;
+};
+
+static int u64_range_cmp(const void *_l, const void *_r)
+{
+ const struct u64_range *l = _l;
+ const struct u64_range *r = _r;
+
+ return cmp_int(l->start, r->start);
+}
+
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+ int ret = -EINVAL;
+ unsigned nr;
+ unsigned i;
+ struct u64_range *b;
+
+ nr = bch2_sb_field_journal_v2_nr_entries(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ for (i = 0; i < nr; i++) {
+ b[i].start = le64_to_cpu(journal->d[i].start);
+ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+ }
+
+ sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+ if (!b[0].start) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0].start < le16_to_cpu(m->first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0].start, le16_to_cpu(m->first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1].end - 1, le64_to_cpu(m->nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++) {
+ if (b[i].end > b[i + 1].start) {
+ prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+ goto err;
+ }
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu-%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+ .validate = bch2_sb_journal_v2_validate,
+ .to_text = bch2_sb_journal_v2_to_text,
+};
+
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal_v2 *j;
+ unsigned i, dst = 0, nr = 1;
+
+ if (c)
+ lockdep_assert_held(&c->sb_lock);
+
+ if (!ja->nr) {
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+ return 0;
+ }
+
+ for (i = 0; i + 1 < ja->nr; i++)
+ if (ja->buckets[i] + 1 != ja->buckets[i + 1])
+ nr++;
+
+ j = bch2_sb_resize_journal_v2(&ca->disk_sb,
+ (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64));
+ if (!j)
+ return -BCH_ERR_ENOSPC_sb_journal;
+
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+ j->d[dst].start = le64_to_cpu(ja->buckets[0]);
+ j->d[dst].nr = le64_to_cpu(1);
+
+ for (i = 1; i < ja->nr; i++) {
+ if (ja->buckets[i] == ja->buckets[i - 1] + 1) {
+ le64_add_cpu(&j->d[dst].nr, 1);
+ } else {
+ dst++;
+ j->d[dst].start = le64_to_cpu(ja->buckets[i]);
+ j->d[dst].nr = le64_to_cpu(1);
+ }
+ }
+
+ BUG_ON(dst + 1 != nr);
+
+ return 0;
+}
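
bch2_journal_buckets_to_sb() collapses the in-memory bucket array into (start, nr) ranges: a new range starts whenever a bucket is not contiguous with the previous one. A minimal userspace sketch of the same run-length encoding (types simplified, not the on-disk layout):

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, nr; };

/* Encode a sorted bucket list as contiguous ranges; returns the range count. */
static unsigned encode_ranges(const uint64_t *buckets, unsigned n,
			      struct range *out)
{
	unsigned dst = 0;

	out[0].start = buckets[0];
	out[0].nr = 1;

	for (unsigned i = 1; i < n; i++) {
		if (buckets[i] == buckets[i - 1] + 1) {
			out[dst].nr++;
		} else {
			dst++;
			out[dst].start = buckets[i];
			out[dst].nr = 1;
		}
	}
	return dst + 1;
}

int main(void)
{
	uint64_t buckets[] = { 10, 11, 12, 20, 21, 40 };
	struct range r[6];
	unsigned nr = encode_ranges(buckets, 6, r);

	/* Same exclusive-end formatting as the v2 to_text above: 10-13, 20-22, 40-41 */
	for (unsigned i = 0; i < nr; i++)
		printf("%llu-%llu\n",
		       (unsigned long long) r[i].start,
		       (unsigned long long) (r[i].start + r[i].nr));
	return 0;
}
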
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "super-io.h"
+#include "vstructs.h"
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+ return j
+ ? (__le64 *) vstruct_end(&j->field) - j->buckets
+ : 0;
+}
+
+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
+{
+ if (!j)
+ return 0;
+
+ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
+
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *);
if (le64_to_cpu(e->start) >=
le64_to_cpu(e->end)) {
- pr_buf(err, "entry %u start >= end (%llu >= %llu)",
+ prt_printf(err, "entry %u start >= end (%llu >= %llu)",
i, le64_to_cpu(e->start), le64_to_cpu(e->end));
return -EINVAL;
}
if (i + 1 < nr &&
le64_to_cpu(e[0].end) >
le64_to_cpu(e[1].start)) {
- pr_buf(err, "entry %u out of order with next entry (%llu > %llu)",
+ prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
return -EINVAL;
}
for (i = bl->start; i < bl->start + nr; i++) {
if (i != bl->start)
- pr_buf(out, " ");
+ prt_printf(out, " ");
- pr_buf(out, "%llu-%llu",
+ prt_printf(out, "%llu-%llu",
le64_to_cpu(i->start),
le64_to_cpu(i->end));
}
+ prt_newline(out);
}
const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
!test_bit(BCH_FS_STOPPING, &c->flags))
b = bch2_btree_iter_next_node(&iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_iter_exit(&trans, &iter);
struct closure_waitlist wait;
u64 last_seq; /* copy of data->last_seq */
+ long expires;
+ u64 flush_time;
unsigned buf_size; /* size in bytes of @data */
unsigned sectors; /* maximum size for current entry */
journal_space_nr,
};
-/*
- * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
- * either because something's waiting on the write to complete or because it's
- * been dirty too long and the timer's expired.
- */
-
enum {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
- JOURNAL_NEED_WRITE,
- JOURNAL_MAY_GET_UNRESERVED,
JOURNAL_MAY_SKIP_FLUSH,
- JOURNAL_NOCHANGES,
+};
+
+#define JOURNAL_WATERMARKS() \
+ x(any) \
+ x(copygc) \
+ x(reserved)
+
+enum journal_watermark {
+#define x(n) JOURNAL_WATERMARK_##n,
+ JOURNAL_WATERMARKS()
+#undef x
+};
+
+#define JOURNAL_WATERMARK_MASK 3
+
+/* Reasons we may fail to get a journal reservation: */
+#define JOURNAL_ERRORS() \
+ x(ok) \
+ x(blocked) \
+ x(max_in_flight) \
+ x(journal_full) \
+ x(journal_pin_full) \
+ x(journal_stuck) \
+ x(insufficient_devices)
+
+enum journal_errors {
+#define x(n) JOURNAL_ERR_##n,
+ JOURNAL_ERRORS()
+#undef x
};
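
JOURNAL_WATERMARKS() and JOURNAL_ERRORS() use the X-macro pattern: a single list is expanded once to generate enum constants and can be expanded again elsewhere to generate a matching name table that stays in sync automatically. A standalone sketch of the technique, using a trimmed copy of the error list above:

#include <stdio.h>

#define JOURNAL_ERRORS()	\
	x(ok)			\
	x(blocked)		\
	x(journal_full)

/* First expansion: enum constants */
enum journal_errors {
#define x(n)	JOURNAL_ERR_##n,
	JOURNAL_ERRORS()
#undef x
};

/* Second expansion: matching string table, kept in sync with the enum */
static const char * const journal_error_strs[] = {
#define x(n)	#n,
	JOURNAL_ERRORS()
#undef x
};

int main(void)
{
	printf("%s\n", journal_error_strs[JOURNAL_ERR_blocked]);	/* prints "blocked" */
	return 0;
}
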
/* Embedded in struct bch_fs */
unsigned long flags;
union journal_res_state reservations;
+ enum journal_watermark watermark;
/* Max size of current journal entry */
unsigned cur_entry_u64s;
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
- enum {
- cur_entry_ok,
- cur_entry_blocked,
- cur_entry_journal_full,
- cur_entry_journal_pin_full,
- cur_entry_journal_stuck,
- cur_entry_insufficient_devices,
- } cur_entry_error;
+ enum journal_errors cur_entry_error;
union journal_preres_state prereserved;
spinlock_t err_lock;
struct mutex reclaim_lock;
+ /*
+ * Used for waiting until journal reclaim has freed up space in the
+ * journal:
+ */
wait_queue_head_t reclaim_wait;
struct task_struct *reclaim_thread;
bool reclaim_kicked;
unsigned long last_flush_write;
u64 res_get_blocked_start;
- u64 need_write_time;
u64 write_start_time;
u64 nr_flush_writes;
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey.h"
#include "keylist.h"
int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+
+int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
+{
+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+ if (bkey_val_bytes(k.k) < sizeof(*lru)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*lru));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+ prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
+}
+
+int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time,
+ struct bkey_s_c orig_k)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 existing_idx;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (!time)
+ return 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+ POS(id, time),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_lru) {
+ bch2_bkey_val_to_text(&buf, trans->c, orig_k);
+ bch2_trans_inconsistent(trans,
+ "pointer to nonexistent lru %llu:%llu\n%s",
+ id, time, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
+ if (existing_idx != idx) {
+ bch2_bkey_val_to_text(&buf, trans->c, orig_k);
+ bch2_trans_inconsistent(trans,
+ "lru %llu:%llu with wrong backpointer: got %llu, should be %llu\n%s",
+ id, time, existing_idx, idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_lru *lru;
+ int ret = 0;
+
+ if (!*time)
+ return 0;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_lru,
+ POS(lru_id, *time),
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES, k, ret)
+ if (bkey_deleted(k.k))
+ break;
+
+ if (ret)
+ goto err;
+
+ BUG_ON(iter.pos.inode != lru_id);
+ *time = iter.pos.offset;
+
+ lru = bch2_trans_kmalloc(trans, sizeof(*lru));
+ ret = PTR_ERR_OR_ZERO(lru);
+ if (ret)
+ goto err;
+
+ bkey_lru_init(&lru->k_i);
+ lru->k.p = iter.pos;
+ lru->v.idx = cpu_to_le64(idx);
+
+ ret = bch2_trans_update(trans, &iter, &lru->k_i, 0);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx,
+ u64 old_time, u64 *new_time,
+ struct bkey_s_c k)
+{
+ if (old_time == *new_time)
+ return 0;
+
+ return bch2_lru_delete(trans, id, idx, old_time, k) ?:
+ bch2_lru_set(trans, id, idx, new_time);
+}
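
bch2_lru_set() probes forward from the requested time until it finds an unused slot, then reports the slot it actually used back through *time so the caller's backpointer stays consistent; bch2_lru_change() is delete-old-then-insert-new. A toy standalone sketch of that collision-probing idea, using an array instead of a btree (all names here are illustrative):

#include <assert.h>
#include <stdint.h>

#define LRU_SLOTS 16

static uint64_t lru_idx[LRU_SLOTS];	/* 0 == empty; otherwise the backpointer */

/* Insert @idx at the first free slot at or after @time; return the slot used. */
static uint64_t lru_set(uint64_t time, uint64_t idx)
{
	while (lru_idx[time % LRU_SLOTS])
		time++;
	lru_idx[time % LRU_SLOTS] = idx;
	return time;
}

/* Move an entry: drop the old position, insert at (or after) the new one. */
static uint64_t lru_change(uint64_t old_time, uint64_t new_time, uint64_t idx)
{
	if (old_time == new_time)
		return new_time;
	lru_idx[old_time % LRU_SLOTS] = 0;
	return lru_set(new_time, idx);
}

int main(void)
{
	uint64_t t1 = lru_set(5, 100);		/* lands on 5 */
	uint64_t t2 = lru_set(5, 101);		/* 5 taken, probes to 6 */

	assert(t1 == 5 && t2 == 6);
	assert(lru_change(5, 6, 100) == 7);	/* 6 taken, probes to 7 */
	return 0;
}
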
+
+static int bch2_check_lru_key(struct btree_trans *trans,
+ struct btree_iter *lru_iter,
+ struct bkey_s_c lru_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ struct bpos alloc_pos;
+ int ret;
+
+ alloc_pos = POS(lru_k.k->p.inode,
+ le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx));
+
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
+ "lru key points to nonexistent device:bucket %llu:%llu",
+ alloc_pos.inode, alloc_pos.offset))
+ return bch2_btree_delete_at(trans, lru_iter, 0);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (fsck_err_on(a.data_type != BCH_DATA_cached ||
+ a.io_time[READ] != lru_k.k->p.offset, c,
+ "incorrect lru entry %s\n"
+ " for %s",
+ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.p = lru_iter->pos;
+
+ ret = bch2_trans_update(trans, lru_iter, update, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+int bch2_check_lrus(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_lru_key(&trans, &iter, k));
+
+ bch2_trans_exit(&trans);
+	return ret;
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_H
+#define _BCACHEFS_LRU_H
+
+int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_lru (struct bkey_ops) { \
+ .key_invalid = bch2_lru_invalid, \
+ .val_to_text = bch2_lru_to_text, \
+}
+
+int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c);
+int bch2_lru_set(struct btree_trans *, u64, u64, u64 *);
+int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c);
+
+int bch2_check_lrus(struct bch_fs *);
+
+#endif /* _BCACHEFS_LRU_H */
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
+#include "errcode.h"
#include "extents.h"
#include "io.h"
#include "journal.h"
return 0;
}
-static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags,
- enum btree_id btree_id)
+static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ unsigned dev_idx,
+ int flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
+
+ if (!bch2_bkey_has_device(k, dev_idx))
+ return 0;
+
+ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(n, k);
+
+ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
+ if (ret)
+ return ret;
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k))
+ n->k.size = 0;
+
+ return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_buf sk;
+ enum btree_id id;
int ret = 0;
- bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- while ((bch2_trans_begin(&trans),
- (k = bch2_btree_iter_peek(&iter)).k) &&
- !(ret = bkey_err(k))) {
- if (!bch2_bkey_has_device(k, dev_idx)) {
- bch2_btree_iter_advance(&iter);
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_ptrs(id))
continue;
- }
-
- bch2_bkey_buf_reassemble(&sk, c, k);
- ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
- dev_idx, flags, false);
- if (ret)
- break;
-
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_error key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, bkey_i_to_s(sk.k));
-
- /*
- * Since we're not inserting through an extent iterator
- * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
- * we aren't using the extent overwrite path to delete, we're
- * just using the normal key deletion path:
- */
- if (bkey_deleted(&sk.k->k))
- sk.k->k.size = 0;
-
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, sk.k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
-
- /*
- * don't want to leave ret == -EINTR, since if we raced and
- * something else overwrote the key we could spuriously return
- * -EINTR below:
- */
- if (ret == -EINTR)
- ret = 0;
+ ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags));
if (ret)
break;
}
- bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&sk, c);
-
- BUG_ON(ret == -EINTR);
return ret;
}
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
-{
- return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?:
- __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink);
-}
-
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans trans;
}
ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false);
- if (ret == -EINTR) {
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
}
if (ret) {
- bch_err(c, "Error updating btree node key: %i", ret);
+ bch_err(c, "Error updating btree node key: %s",
+ bch2_err_str(ret));
break;
}
next:
bch2_btree_iter_next_node(&iter);
}
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_iter_exit(&trans, &iter);
goto err;
}
- /* flush relevant btree updates */
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
-
+ bch2_btree_interior_updates_flush(c);
ret = 0;
err:
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&k, c);
- BUG_ON(ret == -EINTR);
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
return ret;
}
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
-#include "buckets.h"
#include "disk_groups.h"
#include "ec.h"
+#include "errcode.h"
+#include "error.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
-#include "subvolume.h"
#include "super-io.h"
#include "keylist.h"
#include <trace/events/bcachefs.h>
-#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
+static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
+{
+ mutex_lock(&c->data_progress_lock);
+ list_add(&stats->list, &c->data_progress_list);
+ mutex_unlock(&c->data_progress_lock);
+}
+
+static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
+{
+ mutex_lock(&c->data_progress_lock);
+ list_del(&stats->list);
+ mutex_unlock(&c->data_progress_lock);
+}
struct moving_io {
struct list_head list;
struct bch_read_bio rbio;
- struct migrate_write write;
+ struct data_update write;
/* Must be last since it is variable size */
struct bio_vec bi_inline_vecs[0];
};
-struct moving_context {
- /* Closure for waiting on all reads and writes to complete */
- struct closure cl;
-
- struct bch_move_stats *stats;
-
- struct list_head reads;
-
- /* in flight sectors: */
- atomic_t read_sectors;
- atomic_t write_sectors;
-
- wait_queue_head_t wait;
-};
-
-static int insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id id,
- struct bpos old_pos,
- struct bpos new_pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter, update_iter;
- struct bkey_s_c k;
- struct snapshots_seen s;
- int ret;
-
- if (!btree_type_has_snapshots(id))
- return 0;
-
- snapshots_seen_init(&s);
-
- if (!bkey_cmp(old_pos, new_pos))
- return 0;
-
- if (!snapshot_t(c, old_pos.snapshot)->children[0])
- return 0;
-
- bch2_trans_iter_init(trans, &iter, id, old_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
- while (1) {
-next:
- k = bch2_btree_iter_prev(&iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- if (bkey_cmp(old_pos, k.k->p))
- break;
-
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
- struct bkey_i *update;
- size_t i;
-
- for (i = 0; i < s.nr; i++)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
- goto next;
-
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
-
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- break;
-
- bkey_init(&update->k);
- update->k.p = new_pos;
- update->k.p.snapshot = k.k->p.snapshot;
-
- bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&update_iter) ?:
- bch2_trans_update(trans, &update_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- bch2_trans_iter_exit(trans, &update_iter);
- if (ret)
- break;
-
- ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
- if (ret)
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
- kfree(s.d);
-
- return ret;
-}
-
-static int bch2_migrate_index_update(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans trans;
- struct btree_iter iter;
- struct migrate_write *m =
- container_of(op, struct migrate_write, op);
- struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
- struct keylist *keys = &op->insert_keys;
- struct bkey_buf _new, _insert;
- int ret = 0;
-
- bch2_bkey_buf_init(&_new);
- bch2_bkey_buf_init(&_insert);
- bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
- bch2_trans_iter_init(&trans, &iter, m->btree_id,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- while (1) {
- struct bkey_s_c k;
- struct bkey_i *insert;
- struct bkey_i_extent *new;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bpos next_pos;
- bool did_work = false;
- bool should_check_enospc;
- s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-
- bch2_trans_begin(&trans);
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- new = bkey_i_to_extent(bch2_keylist_front(keys));
-
- if (bversion_cmp(k.k->version, new->k.version) ||
- !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
- goto nomatch;
-
- bkey_reassemble(_insert.k, k);
- insert = _insert.k;
-
- bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
- new = bkey_i_to_extent(_new.k);
- bch2_cut_front(iter.pos, &new->k_i);
-
- bch2_cut_front(iter.pos, insert);
- bch2_cut_back(new->k.p, insert);
- bch2_cut_back(insert->k.p, &new->k_i);
-
- if (m->data_cmd == DATA_REWRITE) {
- struct bch_extent_ptr *new_ptr, *old_ptr = (void *)
- bch2_bkey_has_device(bkey_i_to_s_c(insert),
- m->data_opts.rewrite_dev);
- if (!old_ptr)
- goto nomatch;
-
- if (old_ptr->cached)
- extent_for_each_ptr(extent_i_to_s(new), new_ptr)
- new_ptr->cached = true;
-
- __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
- }
-
- extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
- if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
- /*
- * raced with another move op? extent already
- * has a pointer to the device we just wrote
- * data to
- */
- continue;
- }
-
- bch2_extent_ptr_decoded_append(insert, &p);
- did_work = true;
- }
-
- if (!did_work)
- goto nomatch;
-
- bch2_bkey_narrow_crcs(insert,
- (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize(c, bkey_i_to_s(insert));
- bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
- op->opts.background_target,
- op->opts.data_replicas);
-
- ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
- &should_check_enospc,
- &i_sectors_delta,
- &disk_sectors_delta);
- if (ret)
- goto err;
-
- if (disk_sectors_delta > (s64) op->res.sectors) {
- ret = bch2_disk_reservation_add(c, &op->res,
- disk_sectors_delta - op->res.sectors,
- !should_check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- goto out;
- }
-
- next_pos = insert->k.p;
-
- ret = insert_snapshot_whiteouts(&trans, m->btree_id,
- k.k->p, insert->k.p) ?:
- bch2_trans_update(&trans, &iter, insert,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(&trans, &op->res,
- op_journal_seq(op),
- BTREE_INSERT_NOFAIL|
- m->data_opts.btree_insert_flags);
- if (!ret) {
- bch2_btree_iter_set_pos(&iter, next_pos);
- atomic_long_inc(&c->extent_migrate_done);
- if (ec_ob)
- bch2_ob_add_backpointer(c, ec_ob, &insert->k);
- }
-err:
- if (ret == -EINTR)
- ret = 0;
- if (ret)
- break;
-next:
- while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
- bch2_keylist_pop_front(keys);
- if (bch2_keylist_empty(keys))
- goto out;
- }
- continue;
-nomatch:
- if (m->ctxt) {
- BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->ctxt->stats->keys_raced);
- atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->ctxt->stats->sectors_raced);
- }
- atomic_long_inc(&c->extent_migrate_raced);
- trace_move_race(&new->k);
- bch2_btree_iter_advance(&iter);
- goto next;
- }
-out:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&_insert, c);
- bch2_bkey_buf_exit(&_new, c);
- BUG_ON(ret == -EINTR);
- return ret;
-}
-
-void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
-{
- /* write bio must own pages: */
- BUG_ON(!m->op.wbio.bio.bi_vcnt);
-
- m->ptr = rbio->pick.ptr;
- m->offset = rbio->data_pos.offset - rbio->pick.crc.offset;
- m->op.devs_have = rbio->devs_have;
- m->op.pos = rbio->data_pos;
- m->op.version = rbio->version;
- m->op.crc = rbio->pick.crc;
- m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
-
- if (m->data_cmd == DATA_REWRITE)
- bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
-}
-
-int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
- struct write_point_specifier wp,
- struct bch_io_opts io_opts,
- enum data_cmd data_cmd,
- struct data_opts data_opts,
- enum btree_id btree_id,
- struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- struct extent_ptr_decoded p;
- int ret;
-
- m->btree_id = btree_id;
- m->data_cmd = data_cmd;
- m->data_opts = data_opts;
- m->nr_ptrs_reserved = 0;
-
- bch2_write_op_init(&m->op, c, io_opts);
-
- if (!bch2_bkey_is_incompressible(k))
- m->op.compression_type =
- bch2_compression_opt_to_type[io_opts.background_compression ?:
- io_opts.compression];
- else
- m->op.incompressible = true;
-
- m->op.target = data_opts.target,
- m->op.write_point = wp;
-
- /*
- * op->csum_type is normally initialized from the fs/file's current
- * options - but if an extent is encrypted, we require that it stays
- * encrypted:
- */
- bkey_for_each_crc(k.k, ptrs, crc, entry)
- if (bch2_csum_type_is_encryption(crc.csum_type)) {
- m->op.nonce = crc.nonce + crc.offset;
- m->op.csum_type = crc.csum_type;
- break;
- }
-
- if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
- m->op.alloc_reserve = RESERVE_MOVINGGC;
- m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
- } else {
- /* XXX: this should probably be passed in */
- m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
- }
-
- m->op.flags |= BCH_WRITE_PAGES_STABLE|
- BCH_WRITE_PAGES_OWNED|
- BCH_WRITE_DATA_ENCODED|
- BCH_WRITE_FROM_INTERNAL;
-
- m->op.nr_replicas = data_opts.nr_replicas;
- m->op.nr_replicas_required = data_opts.nr_replicas;
- m->op.index_update_fn = bch2_migrate_index_update;
-
- switch (data_cmd) {
- case DATA_ADD_REPLICAS: {
- /*
- * DATA_ADD_REPLICAS is used for moving data to a different
- * device in the background, and due to compression the new copy
- * might take up more space than the old copy:
- */
-#if 0
- int nr = (int) io_opts.data_replicas -
- bch2_bkey_nr_ptrs_allocated(k);
-#endif
- int nr = (int) io_opts.data_replicas;
-
- if (nr > 0) {
- m->op.nr_replicas = m->nr_ptrs_reserved = nr;
-
- ret = bch2_disk_reservation_get(c, &m->op.res,
- k.k->size, m->op.nr_replicas, 0);
- if (ret)
- return ret;
- }
- break;
- }
- case DATA_REWRITE: {
- unsigned compressed_sectors = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == data_opts.rewrite_dev) {
- if (p.ptr.cached)
- m->op.flags |= BCH_WRITE_CACHED;
-
- if (!p.ptr.cached &&
- crc_is_compressed(p.crc))
- compressed_sectors += p.crc.compressed_size;
- }
-
- if (compressed_sectors) {
- ret = bch2_disk_reservation_add(c, &m->op.res,
- k.k->size * m->op.nr_replicas,
- BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- return ret;
- }
- break;
- }
- case DATA_PROMOTE:
- m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
- m->op.flags |= BCH_WRITE_CACHED;
- break;
- default:
- BUG();
- }
-
- return 0;
-}
-
static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
- struct bvec_iter_all iter;
- struct bio_vec *bv;
-
- bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
-
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
- if (bv->bv_page)
- __free_page(bv->bv_page);
+ struct bch_fs *c = ctxt->c;
+ bch2_data_update_exit(&io->write);
wake_up(&ctxt->wait);
-
+ percpu_ref_put(&c->writes);
kfree(io);
}
static void move_write_done(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_context *ctxt = io->write.ctxt;
+
+ if (io->write.op.error)
+ ctxt->write_error = true;
atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
closure_return_with_destructor(cl, move_free);
return;
}
- bch2_migrate_read_done(&io->write, &io->rbio);
-
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
- closure_call(&io->write.op.cl, bch2_write, NULL, cl);
+
+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl);
continue_at(cl, move_write_done, NULL);
}
atomic_sub(io->read_sectors, &ctxt->read_sectors);
io->read_completed = true;
- if (next_pending_write(ctxt))
- wake_up(&ctxt->wait);
-
+ wake_up(&ctxt->wait);
closure_put(&ctxt->cl);
}
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
+void bch2_moving_ctxt_exit(struct moving_context *ctxt)
+{
+ move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+ closure_sync(&ctxt->cl);
+ EBUG_ON(atomic_read(&ctxt->write_sectors));
+
+ if (ctxt->stats) {
+ progress_list_del(ctxt->c, ctxt->stats);
+
+ trace_move_data(ctxt->c,
+ atomic64_read(&ctxt->stats->sectors_moved),
+ atomic64_read(&ctxt->stats->keys_moved));
+ }
+}
+
+void bch2_moving_ctxt_init(struct moving_context *ctxt,
+ struct bch_fs *c,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc)
+{
+ memset(ctxt, 0, sizeof(*ctxt));
+
+ ctxt->c = c;
+ ctxt->rate = rate;
+ ctxt->stats = stats;
+ ctxt->wp = wp;
+ ctxt->wait_on_copygc = wait_on_copygc;
+
+ closure_init_stack(&ctxt->cl);
+ INIT_LIST_HEAD(&ctxt->reads);
+ init_waitqueue_head(&ctxt->wait);
+
+ if (stats) {
+ progress_list_add(c, stats);
+ stats->data_type = BCH_DATA_user;
+ }
+}
+
+void bch_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+ memset(stats, 0, sizeof(*stats));
+ scnprintf(stats->name, sizeof(stats->name), "%s", name);
+}
+
+static int bch2_extent_drop_ptrs(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct data_update_opts data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
+
+ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(n, k);
+
+ while (data_opts.kill_ptrs) {
+ unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+ data_opts.kill_ptrs ^= 1U << drop;
+ }
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k))
+ n->k.size = 0;
+
+ return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
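
The kill_ptrs loop above peels one pointer index off the bitmask per iteration: __fls() selects the highest set bit, the matching pointer is dropped, and that bit is cleared. A standalone sketch of the same bit-peeling idiom (userspace, so it uses a portable highest-set-bit helper instead of the kernel's __fls()):

#include <stdio.h>

/* Highest set bit index (equivalent to the kernel's __fls() for nonzero x). */
static unsigned fls_idx(unsigned x)
{
	unsigned r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned kill_ptrs = 0x15;	/* bits 0, 2 and 4: drop those pointers */

	while (kill_ptrs) {
		unsigned drop = fls_idx(kill_ptrs);

		printf("dropping pointer %u\n", drop);	/* 4, then 2, then 0 */
		kill_ptrs ^= 1U << drop;
	}
	return 0;
}
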
+
static int bch2_move_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
struct moving_context *ctxt,
- struct write_point_specifier wp,
struct bch_io_opts io_opts,
enum btree_id btree_id,
struct bkey_s_c k,
- enum data_cmd data_cmd,
- struct data_opts data_opts)
+ struct data_update_opts data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ bch2_data_update_opts_normalize(k, &data_opts);
+
+ if (!data_opts.rewrite_ptrs &&
+ !data_opts.extra_replicas) {
+ if (data_opts.kill_ptrs)
+ return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+ return 0;
+ }
+
+ if (!percpu_ref_tryget_live(&c->writes))
+ return -EROFS;
+
/* write path might have to decompress data: */
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
io->read_sectors = k.k->size;
io->write_sectors = k.k->size;
- bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
+ bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
bio_set_prio(&io->write.op.wbio.bio,
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.c = c;
io->rbio.opts = io_opts;
- bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
+ bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
io->rbio.bio.bi_vcnt = pages;
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
- ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
- data_cmd, data_opts, btree_id, k);
+ ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts,
+ data_opts, btree_id, k);
if (ret)
goto err_free_pages;
+ io->write.ctxt = ctxt;
+
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
-
- trace_move_extent(k.k);
+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
+ trace_move_extent_read(k.k);
atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
err_free:
kfree(io);
err:
- trace_move_alloc_fail(k.k);
+ percpu_ref_put(&c->writes);
+ trace_and_count(c, move_extent_alloc_mem_fail, k.k);
return ret;
}
return ret;
}
-static int __bch2_move_data(struct bch_fs *c,
- struct moving_context *ctxt,
- struct bch_ratelimit *rate,
- struct write_point_specifier wp,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- struct bch_move_stats *stats,
- enum btree_id btree_id)
+static int move_ratelimit(struct btree_trans *trans,
+ struct moving_context *ctxt)
{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct bch_fs *c = trans->c;
+ u64 delay;
+
+ if (ctxt->wait_on_copygc) {
+ bch2_trans_unlock(trans);
+ wait_event_killable(c->copygc_running_wq,
+ !c->copygc_running ||
+ kthread_should_stop());
+ }
+
+ do {
+ delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
+
+ if (delay) {
+ bch2_trans_unlock(trans);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 1;
+ }
+
+ if (delay)
+ schedule_timeout(delay);
+
+ if (unlikely(freezing(current))) {
+ move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+ try_to_freeze();
+ }
+ } while (delay);
+
+ move_ctxt_wait_event(ctxt, trans,
+ atomic_read(&ctxt->write_sectors) <
+ c->opts.move_bytes_in_flight >> 9);
+
+ move_ctxt_wait_event(ctxt, trans,
+ atomic_read(&ctxt->read_sectors) <
+ c->opts.move_bytes_in_flight >> 9);
+
+ return 0;
+}
+
+static int move_get_io_opts(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct bkey_s_c k, u64 *cur_inum)
+{
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ if (*cur_inum == k.k->p.inode)
+ return 0;
+
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+
+ ret = lookup_inode(trans,
+ SPOS(0, k.k->p.inode, k.k->p.snapshot),
+ &inode);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+
+ if (!ret)
+ bch2_io_opts_apply(io_opts, bch2_inode_opts_get(&inode));
+
+ *cur_inum = k.k->p.inode;
+ return 0;
+}
+
+static int __bch2_move_data(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id)
+{
+ struct bch_fs *c = ctxt->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct bkey_buf sk;
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct data_opts data_opts;
- enum data_cmd data_cmd;
- u64 delay, cur_inum = U64_MAX;
+ struct data_update_opts data_opts;
+ u64 cur_inum = U64_MAX;
int ret = 0, ret2;
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
- stats->data_type = BCH_DATA_user;
- stats->btree_id = btree_id;
- stats->pos = start;
+ ctxt->stats->data_type = BCH_DATA_user;
+ ctxt->stats->btree_id = btree_id;
+ ctxt->stats->pos = start;
bch2_trans_iter_init(&trans, &iter, btree_id, start,
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS);
- if (rate)
- bch2_ratelimit_reset(rate);
-
- while (1) {
- do {
- delay = rate ? bch2_ratelimit_delay(rate) : 0;
-
- if (delay) {
- bch2_trans_unlock(&trans);
- set_current_state(TASK_INTERRUPTIBLE);
- }
-
- if (kthread && (ret = kthread_should_stop())) {
- __set_current_state(TASK_RUNNING);
- goto out;
- }
-
- if (delay)
- schedule_timeout(delay);
-
- if (unlikely(freezing(current))) {
- move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads));
- try_to_freeze();
- }
- } while (delay);
-
- move_ctxt_wait_event(ctxt, &trans,
- atomic_read(&ctxt->write_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
-
- move_ctxt_wait_event(ctxt, &trans,
- atomic_read(&ctxt->read_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
+ if (ctxt->rate)
+ bch2_ratelimit_reset(ctxt->rate);
+ while (!move_ratelimit(&trans, ctxt)) {
bch2_trans_begin(&trans);
k = bch2_btree_iter_peek(&iter);
break;
ret = bkey_err(k);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
- stats->pos = iter.pos;
+ ctxt->stats->pos = iter.pos;
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- if (btree_id == BTREE_ID_extents &&
- cur_inum != k.k->p.inode) {
- struct bch_inode_unpacked inode;
-
- io_opts = bch2_opts_to_inode_opts(c->opts);
-
- ret = lookup_inode(&trans,
- SPOS(0, k.k->p.inode, k.k->p.snapshot),
- &inode);
- if (ret == -EINTR)
- continue;
-
- if (!ret)
- bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
-
- cur_inum = k.k->p.inode;
- }
+ ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+ if (ret)
+ continue;
- switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
- case DATA_SKIP:
+ memset(&data_opts, 0, sizeof(data_opts));
+ if (!pred(c, arg, k, &io_opts, &data_opts))
goto next;
- case DATA_SCRUB:
- BUG();
- case DATA_ADD_REPLICAS:
- case DATA_REWRITE:
- case DATA_PROMOTE:
- break;
- default:
- BUG();
- }
/*
* The iterator gets unlocked by __bch2_read_extent - need to
* save a copy of @k elsewhere:
- */
+ */
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
- data_cmd, data_opts);
+ ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts,
+ btree_id, k, data_opts);
if (ret2) {
- if (ret2 == -EINTR)
+ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
if (ret2 == -ENOMEM) {
goto next;
}
- if (rate)
- bch2_ratelimit_increment(rate, k.k->size);
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
- atomic64_add(k.k->size, &stats->sectors_seen);
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
bch2_btree_iter_advance(&iter);
}
-out:
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
-inline void bch_move_stats_init(struct bch_move_stats *stats, char *name)
-{
- memset(stats, 0, sizeof(*stats));
-
- scnprintf(stats->name, sizeof(stats->name),
- "%s", name);
-}
-
-static inline void progress_list_add(struct bch_fs *c,
- struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_add(&stats->list, &c->data_progress_list);
- mutex_unlock(&c->data_progress_lock);
-}
-
-static inline void progress_list_del(struct bch_fs *c,
- struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_del(&stats->list);
- mutex_unlock(&c->data_progress_lock);
-}
-
int bch2_move_data(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
enum btree_id end_btree_id, struct bpos end_pos,
struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
struct write_point_specifier wp,
- move_pred_fn pred, void *arg,
- struct bch_move_stats *stats)
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
{
- struct moving_context ctxt = { .stats = stats };
+ struct moving_context ctxt;
enum btree_id id;
int ret;
- progress_list_add(c, stats);
- closure_init_stack(&ctxt.cl);
- INIT_LIST_HEAD(&ctxt.reads);
- init_waitqueue_head(&ctxt.wait);
-
- stats->data_type = BCH_DATA_user;
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
id != BTREE_ID_reflink)
continue;
- ret = __bch2_move_data(c, &ctxt, rate, wp,
+ ret = __bch2_move_data(&ctxt,
id == start_btree_id ? start_pos : POS_MIN,
id == end_btree_id ? end_pos : POS_MAX,
- pred, arg, stats, id);
+ pred, arg, id);
if (ret)
break;
}
+ bch2_moving_ctxt_exit(&ctxt);
- move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));
- closure_sync(&ctxt.cl);
+ return ret;
+}
+
+static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED);
+again:
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
- EBUG_ON(atomic_read(&ctxt.write_sectors));
+ if (!ret && k.k->type == KEY_TYPE_alloc_v4) {
+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
- trace_move_data(c,
- atomic64_read(&stats->sectors_moved),
- atomic64_read(&stats->keys_moved));
+ if (a.v->gen == gen &&
+ a.v->dirty_sectors) {
+ struct printbuf buf = PRINTBUF;
+
+ if (a.v->data_type == BCH_DATA_btree) {
+ bch2_trans_unlock(trans);
+ if (bch2_btree_interior_updates_flush(c))
+ goto again;
+ }
+
+ prt_str(&buf, "failed to evacuate bucket ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
+ struct bpos bucket, int gen,
+ struct data_update_opts _data_opts)
+{
+ struct bch_fs *c = ctxt->c;
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bch_backpointer bp;
+ struct data_update_opts data_opts;
+ u64 bp_offset = 0, cur_inum = U64_MAX;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&sk);
+ bch2_trans_init(&trans, c, 0, 0);
+
+ while (!(ret = move_ratelimit(&trans, ctxt))) {
+ bch2_trans_begin(&trans);
+
+ ret = bch2_get_next_backpointer(&trans, bucket, gen,
+ &bp_offset, &bp,
+ BTREE_ITER_CACHED);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (bp_offset == U64_MAX)
+ break;
+
+ if (!bp.level) {
+ const struct bch_extent_ptr *ptr;
+ struct bkey_s_c k;
+ unsigned i = 0;
+
+ k = bch2_backpointer_get_key(&trans, &iter,
+ bucket, bp_offset, bp);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!k.k)
+ continue;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+ if (ret) {
+ bch2_trans_iter_exit(&trans, &iter);
+ continue;
+ }
+
+ data_opts = _data_opts;
+ data_opts.target = io_opts.background_target;
+ data_opts.rewrite_ptrs = 0;
+
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (ptr->dev == bucket.inode)
+ data_opts.rewrite_ptrs |= 1U << i;
+ i++;
+ }
+
+ ret = bch2_move_extent(&trans, &iter, ctxt, io_opts,
+ bp.btree_id, k, data_opts);
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt, &trans);
+ continue;
+ }
+ if (ret)
+ goto err;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+ } else {
+ struct btree *b;
+
+ b = bch2_backpointer_get_node(&trans, &iter,
+ bucket, bp_offset, bp);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ continue;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!b)
+ continue;
+
+ ret = bch2_btree_node_rewrite(&trans, &iter, b, 0);
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate,
+ c->opts.btree_node_size >> 9);
+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+ }
+
+ bp_offset++;
+ }
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
+ bch2_trans_unlock(&trans);
+ move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+ closure_sync(&ctxt->cl);
+ if (!ctxt->write_error)
+ lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen));
+ }
+err:
+ bch2_trans_exit(&trans);
+ bch2_bkey_buf_exit(&sk, c);
+ return ret;
+}
+
+int bch2_evacuate_bucket(struct bch_fs *c,
+ struct bpos bucket, int gen,
+ struct data_update_opts data_opts,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc)
+{
+ struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts);
+ bch2_moving_ctxt_exit(&ctxt);
- progress_list_del(c, stats);
return ret;
}
-typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
- struct btree *, struct bch_io_opts *,
- struct data_opts *);
+typedef bool (*move_btree_pred)(struct bch_fs *, void *,
+ struct btree *, struct bch_io_opts *,
+ struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
struct btree_iter iter;
struct btree *b;
enum btree_id id;
- struct data_opts data_opts;
- enum data_cmd cmd;
+ struct data_update_opts data_opts;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
stats->pos = iter.pos;
- switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
- case DATA_SKIP:
+ if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
- case DATA_SCRUB:
- BUG();
- case DATA_ADD_REPLICAS:
- case DATA_REWRITE:
- break;
- default:
- BUG();
- }
ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
next:
bch2_btree_iter_next_node(&iter);
}
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
if (ret)
- bch_err(c, "error %i in bch2_move_btree", ret);
+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
- /* flush relevant btree updates */
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+ bch2_btree_interior_updates_flush(c);
progress_list_del(c, stats);
return ret;
}
-#if 0
-static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- return DATA_SCRUB;
-}
-#endif
-
-static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rereplicate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
unsigned nr_good = bch2_bkey_durability(c, k);
unsigned replicas = bkey_is_btree_ptr(k.k)
: io_opts->data_replicas;
if (!nr_good || nr_good >= replicas)
- return DATA_SKIP;
+ return false;
data_opts->target = 0;
- data_opts->nr_replicas = 1;
+ data_opts->extra_replicas = replicas - nr_good;
data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
+ return true;
}
-static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool migrate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
struct bch_ioctl_data *op = arg;
+ unsigned i = 0;
- if (!bch2_bkey_has_device(k, op->migrate.dev))
- return DATA_SKIP;
-
+ data_opts->rewrite_ptrs = 0;
data_opts->target = 0;
- data_opts->nr_replicas = 1;
+ data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
- data_opts->rewrite_dev = op->migrate.dev;
- return DATA_REWRITE;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == op->migrate.dev)
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+
+ return data_opts->rewrite_ptrs != 0;
}
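
migrate_pred() encodes "which pointers to rewrite" as a bitmask indexed by each pointer's position within the extent, setting bit i when pointer i lives on the device being migrated away from. A standalone sketch of building such a mask, with a plain array standing in for bkey pointer iteration:

#include <assert.h>

/* Return a bitmask with bit i set for every pointer stored on @migrate_dev. */
static unsigned ptrs_on_dev(const unsigned *ptr_devs, unsigned nr_ptrs,
			    unsigned migrate_dev)
{
	unsigned mask = 0;

	for (unsigned i = 0; i < nr_ptrs; i++)
		if (ptr_devs[i] == migrate_dev)
			mask |= 1U << i;
	return mask;
}

int main(void)
{
	unsigned devs[] = { 0, 2, 1, 2 };	/* device of each of the extent's pointers */

	assert(ptrs_on_dev(devs, 4, 2) == 0xa);	/* pointers 1 and 3 */
	assert(ptrs_on_dev(devs, 4, 3) == 0);	/* nothing to rewrite */
	return 0;
}
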
-static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
-static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool migrate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
return false;
}
-static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
if (b->version_ondisk != c->sb.version ||
btree_node_need_rewrite(b) ||
bformat_needs_redo(&b->format)) {
data_opts->target = 0;
- data_opts->nr_replicas = 1;
+ data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
- return DATA_REWRITE;
+ return true;
}
- return DATA_SKIP;
+ return false;
}
int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
ret = bch2_move_data(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
- NULL, writepoint_hashed((unsigned long) current),
- rereplicate_pred, c, stats) ?: ret;
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_MIGRATE:
ret = bch2_move_data(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
- NULL, writepoint_hashed((unsigned long) current),
- migrate_pred, &op, stats) ?: ret;
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
#include "btree_iter.h"
#include "buckets.h"
-#include "io_types.h"
+#include "data_update.h"
#include "move_types.h"
struct bch_read_bio;
-struct moving_context;
-enum data_cmd {
- DATA_SKIP,
- DATA_SCRUB,
- DATA_ADD_REPLICAS,
- DATA_REWRITE,
- DATA_PROMOTE,
-};
-
-struct data_opts {
- u16 target;
- u8 rewrite_dev;
- u8 nr_replicas;
- int btree_insert_flags;
-};
+struct moving_context {
+ struct bch_fs *c;
+ struct bch_ratelimit *rate;
+ struct bch_move_stats *stats;
+ struct write_point_specifier wp;
+ bool wait_on_copygc;
+ bool write_error;
-struct migrate_write {
- enum btree_id btree_id;
- enum data_cmd data_cmd;
- struct data_opts data_opts;
+ /* For waiting on outstanding reads and writes: */
+ struct closure cl;
+ struct list_head reads;
- unsigned nr_ptrs_reserved;
+ /* in flight sectors: */
+ atomic_t read_sectors;
+ atomic_t write_sectors;
- struct moving_context *ctxt;
-
- /* what we read: */
- struct bch_extent_ptr ptr;
- u64 offset;
-
- struct bch_write_op op;
+ wait_queue_head_t wait;
};
-void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
-int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
- struct write_point_specifier,
- struct bch_io_opts,
- enum data_cmd, struct data_opts,
- enum btree_id, struct bkey_s_c);
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+ struct bch_io_opts *, struct data_update_opts *);
-typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
- struct bkey_s_c,
- struct bch_io_opts *, struct data_opts *);
+void bch2_moving_ctxt_exit(struct moving_context *);
+void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
+ struct bch_ratelimit *, struct bch_move_stats *,
+ struct write_point_specifier, bool);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
enum btree_id, struct bpos,
enum btree_id, struct bpos,
struct bch_ratelimit *,
+ struct bch_move_stats *,
struct write_point_specifier,
- move_pred_fn, void *,
- struct bch_move_stats *);
-
+ bool,
+ move_pred_fn, void *);
+
+int __bch2_evacuate_bucket(struct moving_context *,
+ struct bpos, int,
+ struct data_update_opts);
+int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
+ struct data_update_opts,
+ struct bch_ratelimit *,
+ struct bch_move_stats *,
+ struct write_point_specifier,
+ bool);
int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
+#include "errcode.h"
#include "error.h"
#include "extents.h"
#include "eytzinger.h"
#include <linux/sort.h>
#include <linux/wait.h>
-/*
- * We can't use the entire copygc reserve in one iteration of copygc: we may
- * need the buckets we're freeing up to go back into the copygc reserve to make
- * forward progress, but if the copygc reserve is full they'll be available for
- * any allocation - and it's possible that in a given iteration, we free up most
- * of the buckets we're going to free before we allocate most of the buckets
- * we're going to allocate.
- *
- * If we only use half of the reserve per iteration, then in steady state we'll
- * always have room in the reserve for the buckets we're going to need in the
- * next iteration:
- */
-#define COPYGC_BUCKETS_PER_ITER(ca) \
- ((ca)->free[RESERVE_MOVINGGC].size / 2)
-
-static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
-{
- const struct copygc_heap_entry *l = _l;
- const struct copygc_heap_entry *r = _r;
-
- return cmp_int(l->dev, r->dev) ?:
- cmp_int(l->offset, r->offset);
-}
-
-static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- copygc_heap *h = &c->copygc_heap;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p = { 0 };
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct copygc_heap_entry search = {
- .dev = p.ptr.dev,
- .offset = p.ptr.offset,
- };
- ssize_t i;
-
- if (p.ptr.cached)
- continue;
-
- i = eytzinger0_find_le(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, &search);
-#if 0
- /* eytzinger search verify code: */
- ssize_t j = -1, k;
-
- for (k = 0; k < h->used; k++)
- if (h->data[k].offset <= ptr->offset &&
- (j < 0 || h->data[k].offset > h->data[j].offset))
- j = k;
-
- BUG_ON(i != j);
-#endif
- if (i >= 0 &&
- p.ptr.dev == h->data[i].dev &&
- p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
- p.ptr.gen == h->data[i].gen) {
- /*
- * We need to use the journal reserve here, because
- * - journal reclaim depends on btree key cache
- * flushing to make forward progress,
- * - which has to make forward progress when the
- * journal is pre-reservation full,
- * - and depends on allocation - meaning allocator and
- * copygc
- */
-
- data_opts->target = io_opts->background_target;
- data_opts->nr_replicas = 1;
- data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_JOURNAL_RESERVED;
- data_opts->rewrite_dev = p.ptr.dev;
-
- if (p.has_ec)
- data_opts->nr_replicas += p.ec.redundancy;
-
- return DATA_REWRITE;
- }
- }
-
- return DATA_SKIP;
-}
-
-static bool have_copygc_reserve(struct bch_dev *ca)
-{
- bool ret;
-
- spin_lock(&ca->fs->freelist_lock);
- ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
- ca->allocator_state != ALLOCATOR_running;
- spin_unlock(&ca->fs->freelist_lock);
-
- return ret;
-}
-
static inline int fragmentation_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
return cmp_int(l.fragmentation, r.fragmentation);
}
-static int walk_buckets_to_copygc(struct bch_fs *c)
+static int find_buckets_to_copygc(struct bch_fs *c)
{
copygc_heap *h = &c->copygc_heap;
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_alloc_unpacked u;
+ struct bch_alloc_v4 a;
int ret;
bch2_trans_init(&trans, c, 0, 0);
+ /*
+ * Find buckets with lowest sector counts, skipping completely
+ * empty buckets, by building a maxheap sorted by sector count,
+ * and repeatedly replacing the maximum element until all
+ * buckets have been visited.
+ */
+ h->used = 0;
+
for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
struct copygc_heap_entry e;
- u = bch2_alloc_unpack(k);
+ bch2_alloc_to_v4(k, &a);
- if (u.data_type != BCH_DATA_user ||
- u.dirty_sectors >= ca->mi.bucket_size ||
+ if ((a.data_type != BCH_DATA_btree &&
+ a.data_type != BCH_DATA_user) ||
+ a.dirty_sectors >= ca->mi.bucket_size ||
bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
continue;
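+ /* fragmentation is the dirty fraction of the bucket, scaled to a 31-bit fixed-point value: */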
e = (struct copygc_heap_entry) {
.dev = iter.pos.inode,
- .gen = u.gen,
- .replicas = 1 + u.stripe_redundancy,
- .fragmentation = u.dirty_sectors * (1U << 15)
- / ca->mi.bucket_size,
- .sectors = u.dirty_sectors,
- .offset = bucket_to_sector(ca, iter.pos.offset),
+ .gen = a.gen,
+ .replicas = 1 + a.stripe_redundancy,
+ .fragmentation = div_u64((u64) a.dirty_sectors * (1ULL << 31),
+ ca->mi.bucket_size),
+ .sectors = a.dirty_sectors,
+ .bucket = iter.pos.offset,
};
heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
return ret;
}
-static int bucket_inorder_cmp(const void *_l, const void *_r)
-{
- const struct copygc_heap_entry *l = _l;
- const struct copygc_heap_entry *r = _r;
-
- return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset);
-}
-
-static int check_copygc_was_done(struct bch_fs *c,
- u64 *sectors_not_moved,
- u64 *buckets_not_moved)
-{
- copygc_heap *h = &c->copygc_heap;
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_alloc_unpacked u;
- struct copygc_heap_entry *i;
- int ret = 0;
-
- sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL);
-
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0);
-
- for (i = h->data; i < h->data + h->used; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
-
- bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset)));
-
- ret = lockrestart_do(&trans,
- bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- if (ret)
- break;
-
- u = bch2_alloc_unpack(k);
-
- if (u.gen == i->gen && u.dirty_sectors) {
- *sectors_not_moved += u.dirty_sectors;
- *buckets_not_moved += 1;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
-
- bch2_trans_exit(&trans);
- return ret;
-}
-
static int bch2_copygc(struct bch_fs *c)
{
copygc_heap *h = &c->copygc_heap;
- struct copygc_heap_entry e, *i;
+ struct copygc_heap_entry e;
struct bch_move_stats move_stats;
- u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
- u64 sectors_reserved = 0;
- u64 buckets_to_move, buckets_not_moved = 0;
struct bch_dev *ca;
unsigned dev_idx;
size_t heap_size = 0;
- int ret;
+ struct moving_context ctxt;
+ struct data_update_opts data_opts = {
+ .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
+ };
+ int ret = 0;
bch_move_stats_init(&move_stats, "copygc");
- /*
- * Find buckets with lowest sector counts, skipping completely
- * empty buckets, by building a maxheap sorted by sector count,
- * and repeatedly replacing the maximum element until all
- * buckets have been visited.
- */
- h->used = 0;
-
for_each_rw_member(ca, c, dev_idx)
heap_size += ca->mi.nbuckets >> 7;
}
}
- for_each_rw_member(ca, c, dev_idx) {
- closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
-
- spin_lock(&ca->fs->freelist_lock);
- sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
- spin_unlock(&ca->fs->freelist_lock);
- }
-
- ret = walk_buckets_to_copygc(c);
+ ret = find_buckets_to_copygc(c);
if (ret) {
bch2_fs_fatal_error(c, "error walking buckets to copygc!");
return ret;
}
if (!h->used) {
- bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!");
+ s64 wait = S64_MAX, dev_wait;
+ u64 dev_min_wait_fragmented = 0;
+ u64 dev_min_wait_allowed = 0;
+ int dev_min_wait = -1;
+
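+ /* Nothing to move: report the device with the least copygc headroom, to help debug why we were woken: */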
+ for_each_rw_member(ca, c, dev_idx) {
+ struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+ s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
+ ca->mi.bucket_size) >> 1);
+ s64 fragmented = usage.d[BCH_DATA_user].fragmented;
+
+ dev_wait = max(0LL, allowed - fragmented);
+
+ if (dev_min_wait < 0 || dev_wait < wait) {
+ wait = dev_wait;
+ dev_min_wait = dev_idx;
+ dev_min_wait_fragmented = fragmented;
+ dev_min_wait_allowed = allowed;
+ }
+ }
+
+ bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu",
+ dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed);
return 0;
}
- /*
- * Our btree node allocations also come out of RESERVE_MOVINGGC:
- */
- sectors_reserved = (sectors_reserved * 3) / 4;
- if (!sectors_reserved) {
- bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
- return -1;
- }
+ heap_resort(h, fragmentation_cmp, NULL);
- for (i = h->data; i < h->data + h->used; i++) {
- sectors_to_move += i->sectors;
- sectors_to_write += i->sectors * i->replicas;
- }
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
+ writepoint_ptr(&c->copygc_write_point),
+ false);
- while (sectors_to_write > sectors_reserved) {
+ /* not correct w.r.t. device removal */
+ while (h->used && !ret) {
BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
- sectors_to_write -= e.sectors * e.replicas;
+ ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen,
+ data_opts);
}
- buckets_to_move = h->used;
-
- if (!buckets_to_move) {
- bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
- sectors_reserved);
- return 0;
- }
-
- eytzinger0_sort(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, NULL);
-
- ret = bch2_move_data(c,
- 0, POS_MIN,
- BTREE_ID_NR, POS_MAX,
- NULL,
- writepoint_ptr(&c->copygc_write_point),
- copygc_pred, NULL,
- &move_stats);
- if (ret) {
- bch_err(c, "error %i from bch2_move_data() in copygc", ret);
- return ret;
- }
+ bch2_moving_ctxt_exit(&ctxt);
- ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved);
- if (ret) {
- bch_err(c, "error %i from check_copygc_was_done()", ret);
- return ret;
- }
+ if (ret < 0 && ret != -EROFS)
+ bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
- if (sectors_not_moved)
- bch_warn_ratelimited(c,
- "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
- sectors_not_moved, sectors_to_move,
- buckets_not_moved, buckets_to_move,
- atomic64_read(&move_stats.sectors_moved),
- atomic64_read(&move_stats.keys_raced),
- atomic64_read(&move_stats.sectors_raced));
-
- trace_copygc(c,
- atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
- buckets_to_move, buckets_not_moved);
- return 0;
+ trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
+ return ret;
}
/*
for_each_rw_member(ca, c, dev_idx) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
- fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
- ca->mi.bucket_size) >> 1);
+ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
+ ca->mi.bucket_size) >> 1);
fragmented = usage.d[BCH_DATA_user].fragmented;
wait = min(wait, max(0LL, fragmented_allowed - fragmented));
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
u64 last, wait;
+ int ret = 0;
set_freezable();
- while (!kthread_should_stop()) {
+ while (!ret && !kthread_should_stop()) {
cond_resched();
if (kthread_wait_freezable(c->copy_gc_enabled))
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
- trace_copygc_wait(c, wait, last + wait);
+ trace_and_count(c, copygc_wait, c, wait, last + wait);
c->copygc_wait = last + wait;
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
c->copygc_wait = 0;
- if (bch2_copygc(c))
- break;
+ c->copygc_running = true;
+ ret = bch2_copygc(c);
+ c->copygc_running = false;
+
+ wake_up(&c->copygc_running_wq);
}
return 0;
int bch2_copygc_start(struct bch_fs *c)
{
struct task_struct *t;
+ int ret;
if (c->copygc_thread)
return 0;
return -ENOMEM;
t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
- if (IS_ERR(t)) {
- bch_err(c, "error creating copygc thread: %li", PTR_ERR(t));
- return PTR_ERR(t);
+ ret = PTR_ERR_OR_ZERO(t);
+ if (ret) {
+ bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret));
+ return ret;
}
get_task_struct(t);
void bch2_fs_copygc_init(struct bch_fs *c)
{
+ init_waitqueue_head(&c->copygc_running_wq);
+ c->copygc_running = false;
}
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
+unsigned long bch2_copygc_wait_amount(struct bch_fs *);
void bch2_copygc_stop(struct bch_fs *);
int bch2_copygc_start(struct bch_fs *);
void bch2_fs_copygc_init(struct bch_fs *);
#include "super-io.h"
#include "util.h"
-#define x(t, n) #t,
+#include <linux/pretty-printers.h>
+
+#define x(t, n) [n] = #t,
+
+const char * const bch2_metadata_versions[] = {
+ BCH_METADATA_VERSIONS()
+ NULL
+};
const char * const bch2_error_actions[] = {
BCH_ERROR_ACTIONS()
const char * const bch2_btree_ids[] = {
BCH_BTREE_IDS()
+ "interior btree node",
NULL
};
[DT_SUBVOL] = "subvol",
};
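+/* Dummy accessors for options without a superblock field; these must never actually be called: */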
+u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
+{
+ BUG();
+}
+
+void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
+{
+ BUG();
+}
+
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
#define x(_name, ...) \
return bch2_opt_lookup(name);
}
-static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v)
+int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
{
if (v < opt->min) {
- if (msg)
- pr_err("invalid %s%s: too small (min %llu)",
- msg, opt->attr.name, opt->min);
+ if (err)
+ prt_printf(err, "%s: too small (min %llu)",
+ opt->attr.name, opt->min);
return -ERANGE;
}
if (opt->max && v >= opt->max) {
- if (msg)
- pr_err("invalid %s%s: too big (max %llu)",
- msg, opt->attr.name, opt->max);
+ if (err)
+ prt_printf(err, "%s: too big (max %llu)",
+ opt->attr.name, opt->max);
return -ERANGE;
}
if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
- if (msg)
- pr_err("invalid %s %s: not a multiple of 512",
- msg, opt->attr.name);
+ if (err)
+ prt_printf(err, "%s: not a multiple of 512",
+ opt->attr.name);
return -EINVAL;
}
if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
- if (msg)
- pr_err("invalid %s%s: must be a power of two",
- msg, opt->attr.name);
+ if (err)
+ prt_printf(err, "%s: must be a power of two",
+ opt->attr.name);
return -EINVAL;
}
return 0;
}
-int bch2_opt_parse(struct bch_fs *c, const char *msg,
+int bch2_opt_parse(struct bch_fs *c,
const struct bch_option *opt,
- const char *val, u64 *res)
+ const char *val, u64 *res,
+ struct printbuf *err)
{
ssize_t ret;
switch (opt->type) {
case BCH_OPT_BOOL:
ret = kstrtou64(val, 10, res);
- if (ret < 0)
+ if (ret < 0 || (*res != 0 && *res != 1)) {
+ if (err)
+ prt_printf(err, "%s: must be bool",
+ opt->attr.name);
- return ret;
+ return ret < 0 ? ret : -EINVAL;
+ }
break;
case BCH_OPT_UINT:
ret = opt->flags & OPT_HUMAN_READABLE
? bch2_strtou64_h(val, res)
: kstrtou64(val, 10, res);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: must be a number",
+ opt->attr.name);
return ret;
+ }
break;
case BCH_OPT_STR:
ret = match_string(opt->choices, -1, val);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: invalid selection",
+ opt->attr.name);
return ret;
+ }
*res = ret;
break;
return 0;
ret = opt->parse(c, val, res);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: parse error",
+ opt->attr.name);
return ret;
+ }
}
- return bch2_opt_validate(opt, msg, *res);
+ return bch2_opt_validate(opt, *res, err);
}
-void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
+void bch2_opt_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bch_sb *sb,
const struct bch_option *opt, u64 v,
unsigned flags)
{
if (flags & OPT_SHOW_MOUNT_STYLE) {
if (opt->type == BCH_OPT_BOOL) {
- pr_buf(out, "%s%s",
+ prt_printf(out, "%s%s",
v ? "" : "no",
opt->attr.name);
return;
}
- pr_buf(out, "%s=", opt->attr.name);
+ prt_printf(out, "%s=", opt->attr.name);
}
switch (opt->type) {
case BCH_OPT_BOOL:
case BCH_OPT_UINT:
if (opt->flags & OPT_HUMAN_READABLE)
- bch2_hprint(out, v);
+ prt_human_readable_u64(out, v);
else
- pr_buf(out, "%lli", v);
+ prt_printf(out, "%lli", v);
break;
case BCH_OPT_STR:
if (flags & OPT_SHOW_FULL_LIST)
- bch2_string_opt_to_text(out, opt->choices, v);
+ prt_string_option(out, opt->choices, v);
else
- pr_buf(out, opt->choices[v]);
+ prt_printf(out, "%s", opt->choices[v]);
break;
case BCH_OPT_FN:
- opt->to_text(out, c, v);
+ opt->to_text(out, c, sb, v);
break;
default:
BUG();
char *copied_opts, *copied_opts_start;
char *opt, *name, *val;
int ret, id;
+ struct printbuf err = PRINTBUF;
u64 v;
if (!options)
if (id < 0)
goto bad_opt;
- ret = bch2_opt_parse(c, "mount option ",
- &bch2_opt_table[id], val, &v);
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
if (ret < 0)
goto bad_val;
} else {
ret = -1;
goto out;
bad_val:
- pr_err("Invalid value %s for mount option %s", val, name);
+ pr_err("Invalid mount option %s", err.buf);
ret = -1;
goto out;
no_val:
goto out;
out:
kfree(copied_opts_start);
+ printbuf_exit(&err);
return ret;
}
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+{
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
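+ /* Superblock fields may be stored as ilog2 or in 512-byte sectors; decode to the option's native units: */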
+ v = opt->get_sb(sb);
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = 1ULL << v;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v <<= 9;
+
+ return v;
+}
+
/*
* Initial options from superblock - here we don't want any options undefined,
* any options the superblock doesn't specify are set to 0:
int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
{
unsigned id;
- int ret;
for (id = 0; id < bch2_opts_nr; id++) {
const struct bch_option *opt = bch2_opt_table + id;
- u64 v;
- if (opt->get_sb == NO_SB_OPT)
+ if (opt->get_sb == BCH2_NO_SB_OPT)
continue;
- v = opt->get_sb(sb);
-
- if (opt->flags & OPT_SB_FIELD_ILOG2)
- v = 1ULL << v;
-
- if (opt->flags & OPT_SB_FIELD_SECTORS)
- v <<= 9;
-
- ret = bch2_opt_validate(opt, "superblock option ", v);
- if (ret)
- return ret;
-
- bch2_opt_set_by_id(opts, id, v);
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
}
return 0;
void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_NO_SB_OPT)
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
return;
if (opt->flags & OPT_SB_FIELD_SECTORS)
void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_NO_SB_OPT)
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
return;
mutex_lock(&c->sb_lock);
#include <linux/sysfs.h>
#include "bcachefs_format.h"
+extern const char * const bch2_metadata_versions[];
extern const char * const bch2_error_actions[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
*/
/* dummy option, for options that aren't stored in the superblock */
-LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
+u64 BCH2_NO_SB_OPT(const struct bch_sb *);
+void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
/* When can be set: */
enum opt_flags {
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
- "(target)", "Device or disk group for metadata writes") \
+ "(target)", "Device or label for metadata writes") \
x(foreground_target, u16, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_FOREGROUND_TARGET, 0, \
- "(target)", "Device or disk group for foreground writes") \
+ "(target)", "Device or label for foreground writes") \
x(background_target, u16, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_BACKGROUND_TARGET, 0, \
- "(target)", "Device or disk group to move data to in the background")\
+ "(target)", "Device or label to move data to in the background")\
x(promote_target, u16, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_PROMOTE_TARGET, 0, \
- "(target)", "Device or disk group to promote data to on read")\
+ "(target)", "Device or label to promote data to on read") \
x(erasure_code, u16, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
x(btree_node_mem_ptr_optimization, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, true, \
+ BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
x(inline_data, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, true, \
+ BCH2_NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
x(acl, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
x(degraded, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in degraded mode") \
x(very_degraded, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
x(discard, u8, \
OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, true, \
NULL, "Enable discard/TRIM support") \
x(verbose, u8, \
- OPT_FS|OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Extra debugging information during mount/recovery")\
x(journal_flush_delay, u32, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, U32_MAX), \
+ OPT_UINT(1, U32_MAX), \
BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
NULL, "Delay in milliseconds before automatic journal commits")\
x(journal_flush_disabled, u8, \
OPT_UINT(0, U32_MAX), \
BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
NULL, "Delay in milliseconds before automatic journal reclaim")\
+ x(move_bytes_in_flight, u32, \
+ OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1024, U32_MAX), \
+ BCH2_NO_SB_OPT, 1U << 20, \
+ NULL, "Amount of IO in flight to keep in flight by the move path")\
x(fsck, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Run fsck on mount") \
x(fix_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Fix errors during fsck without asking") \
x(ratelimit_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
+ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
NULL, "Ratelimit error messages during fsck") \
x(nochanges, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Super read only mode - no writes at all will be issued,\n"\
"even if we have to replay the journal") \
x(norecovery, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don't replay the journal") \
- x(rebuild_replicas, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false, \
- NULL, "Rebuild the superblock replicas section") \
x(keep_journal, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don't free journal entries/keys after startup")\
x(read_entire_journal, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Read all journal entries, not just dirty ones")\
- x(journal_transaction_names, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ x(read_journal_only, u8, \
+ 0, \
OPT_BOOL(), \
- BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
- NULL, "Log transaction function names in journal") \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Only read the journal, skip the rest of recovery")\
x(noexcl, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don't open device in exclusive mode") \
+ x(direct_io, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Use O_DIRECT (userspace only)") \
x(sb, u64, \
OPT_MOUNT, \
OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, BCH_SB_SECTOR, \
+ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
OPT_FS, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, NULL) \
x(nostart, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don\'t start filesystem, only open devices") \
x(reconstruct_alloc, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Reconstruct alloc btree") \
x(version_upgrade, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Set superblock to latest version,\n" \
"allowing any new features to be used") \
x(buckets_nouse, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Allocate the buckets_nouse bitmap") \
x(project, u8, \
OPT_INODE, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, NULL) \
x(fs_size, u64, \
OPT_DEVICE, \
OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, 0, \
+ BCH2_NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(bucket, u32, \
OPT_DEVICE, \
OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, 0, \
+ BCH2_NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(durability, u8, \
OPT_DEVICE, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
- NO_SB_OPT, 1, \
+ BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times")
enum opt_flags flags;
u64 min, max;
- union {
- struct {
- };
- struct {
- const char * const *choices;
- };
- struct {
- int (*parse)(struct bch_fs *, const char *, u64 *);
- void (*to_text)(struct printbuf *, struct bch_fs *, u64);
- };
- };
+ const char * const *choices;
+ int (*parse)(struct bch_fs *, const char *, u64 *);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
const char *hint;
const char *help;
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
-int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *,
- const char *, u64 *);
+int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
+ const char *, u64 *, struct printbuf *);
#define OPT_SHOW_FULL_LIST (1 << 0)
#define OPT_SHOW_MOUNT_STYLE (1 << 1)
-void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
const struct bch_option *, u64, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update.h"
+#include "errcode.h"
#include "inode.h"
#include "quota.h"
#include "subvolume.h"
#include "super-io.h"
-static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f,
+static const char * const bch2_quota_types[] = {
+ "user",
+ "group",
+ "project",
+};
+
+static const char * const bch2_quota_counters[] = {
+ "space",
+ "inodes",
+};
+
+static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
if (vstruct_bytes(&q->field) < sizeof(*q)) {
- pr_buf(err, "wrong size (got %llu should be %zu)",
+ prt_printf(err, "wrong size (got %zu should be %zu)",
vstruct_bytes(&q->field), sizeof(*q));
+ return -EINVAL;
}
return 0;
}
+static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+ unsigned qtyp, counter;
+
+ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
+ prt_printf(out, "%s: flags %llx",
+ bch2_quota_types[qtyp],
+ le64_to_cpu(q->q[qtyp].flags));
+
+ for (counter = 0; counter < Q_COUNTERS; counter++)
+ prt_printf(out, " %s timelimit %u warnlimit %u",
+ bch2_quota_counters[counter],
+ le32_to_cpu(q->q[qtyp].c[counter].timelimit),
+ le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
+
+ prt_newline(out);
+ }
+}
+
const struct bch_sb_field_ops bch_sb_field_ops_quota = {
- .validate = bch2_sb_validate_quota,
+ .validate = bch2_sb_quota_validate,
+ .to_text = bch2_sb_quota_to_text,
};
-const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- if (k.k->p.inode >= QTYP_NR)
- return "invalid quota type";
+ if (k.k->p.inode >= QTYP_NR) {
+ prt_printf(err, "invalid quota type (%llu >= %u)",
+ k.k->p.inode, QTYP_NR);
+ return -EINVAL;
+ }
- if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_quota));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
-static const char * const bch2_quota_counters[] = {
- "space",
- "inodes",
-};
-
void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
unsigned i;
for (i = 0; i < Q_COUNTERS; i++)
- pr_buf(out, "%s hardlimit %llu softlimit %llu",
+ prt_printf(out, "%s hardlimit %llu softlimit %llu",
bch2_quota_counters[i],
le64_to_cpu(dq.v->c[i].hardlimit),
le64_to_cpu(dq.v->c[i].softlimit));
#include <linux/fs.h>
#include <linux/quota.h>
+static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_str(out, "i_fieldmask");
+ prt_tab(out);
+ prt_printf(out, "%x", i->i_fieldmask);
+ prt_newline(out);
+
+ prt_str(out, "i_flags");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_flags);
+ prt_newline(out);
+
+ prt_str(out, "i_spc_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_spc_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_ino_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_ino_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_rt_spc_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_rt_spc_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_spc_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_spc_warnlimit);
+ prt_newline(out);
+
+ prt_str(out, "i_ino_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_ino_warnlimit);
+ prt_newline(out);
+
+ prt_str(out, "i_rt_spc_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_rt_spc_warnlimit);
+ prt_newline(out);
+}
+
+static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_str(out, "d_fieldmask");
+ prt_tab(out);
+ prt_printf(out, "%x", q->d_fieldmask);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_hardlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_hardlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_softlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_softlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_hardlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_hardlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_softlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_softlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_space");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_space);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_count");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_count);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_timer");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_timer);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_timer");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_timer);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_warns");
+ prt_tab(out);
+ prt_printf(out, "%i", q->d_ino_warns);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_warns");
+ prt_tab(out);
+ prt_printf(out, "%i", q->d_spc_warns);
+ prt_newline(out);
+}
+
static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
{
qtypes >>= i;
if (qc->hardlimit &&
qc->hardlimit < n &&
!ignore_hardlimit(q)) {
- if (mode == KEY_TYPE_QUOTA_PREALLOC)
- return -EDQUOT;
-
prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+ return -EDQUOT;
}
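+ /* Over the soft limit: start the grace timer and warn the first time; fail with -EDQUOT once the timer expires: */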
if (qc->softlimit &&
- qc->softlimit < n &&
- qc->timer &&
- ktime_get_real_seconds() >= qc->timer &&
- !ignore_hardlimit(q)) {
- if (mode == KEY_TYPE_QUOTA_PREALLOC)
- return -EDQUOT;
-
- prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
- }
-
- if (qc->softlimit &&
- qc->softlimit < n &&
- qc->timer == 0) {
- if (mode == KEY_TYPE_QUOTA_PREALLOC)
+ qc->softlimit < n) {
+ if (qc->timer == 0) {
+ qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+ } else if (ktime_get_real_seconds() >= qc->timer &&
+ !ignore_hardlimit(q)) {
+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
return -EDQUOT;
-
- prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
-
- /* XXX is this the right one? */
- qc->timer = ktime_get_real_seconds() +
- q->limits[counter].warnlimit;
+ }
}
return 0;
return ret;
}
-static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
+ struct qc_dqblk *qdq)
{
struct bkey_s_c_quota dq;
struct bch_memquota_type *q;
BUG_ON(k.k->p.inode >= QTYP_NR);
+ if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
+ return 0;
+
switch (k.k->type) {
case KEY_TYPE_quota:
dq = bkey_s_c_to_quota(k);
mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
}
+ if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
+ mq->c[Q_SPC].timer = cpu_to_le64(qdq->d_spc_timer);
+ if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
+ mq->c[Q_SPC].warns = cpu_to_le64(qdq->d_spc_warns);
+ if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
+ mq->c[Q_INO].timer = cpu_to_le64(qdq->d_ino_timer);
+ if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
+ mq->c[Q_INO].warns = cpu_to_le64(qdq->d_ino_warns);
+
mutex_unlock(&q->lock);
}
return 0;
}
-static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0),
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->p.inode != type)
- break;
-
- ret = __bch2_quota_set(c, k);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
-
- bch2_trans_exit(&trans);
- return ret;
-}
-
void bch2_fs_quota_exit(struct bch_fs *c)
{
unsigned i;
mutex_init(&c->quotas[i].lock);
}
+static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
+{
+ struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb);
+
+ if (sb_quota)
+ return sb_quota;
+
+ sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64));
+ if (sb_quota) {
+ unsigned qtype, qc;
+
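+ /* Creating the quota section for the first time: default every counter's grace period to one week: */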
+ for (qtype = 0; qtype < QTYP_NR; qtype++)
+ for (qc = 0; qc < Q_COUNTERS; qc++)
+ sb_quota->q[qtype].c[qc].timelimit =
+ cpu_to_le32(7 * 24 * 60 * 60);
+ }
+
+ return sb_quota;
+}
+
static void bch2_sb_quota_read(struct bch_fs *c)
{
struct bch_sb_field_quota *sb_quota;
}
static int bch2_fs_quota_read_inode(struct btree_trans *trans,
- struct btree_iter *iter)
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
struct bch_subvolume subvolume;
- struct bkey_s_c k;
int ret;
- k = bch2_btree_iter_peek(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!k.k)
- return 1;
-
ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume);
if (ret)
return ret;
int bch2_fs_quota_read(struct bch_fs *c)
{
- unsigned i, qtypes = enabled_qtypes(c);
- struct bch_memquota_type *q;
+ struct bch_sb_field_quota *sb_quota;
struct btree_trans trans;
struct btree_iter iter;
+ struct bkey_s_c k;
int ret;
mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ mutex_unlock(&c->sb_lock);
+ return -BCH_ERR_ENOSPC_sb_quota;
+ }
+
bch2_sb_quota_read(c);
mutex_unlock(&c->sb_lock);
- for_each_set_qtype(c, i, q, qtypes) {
- ret = bch2_quota_init_type(c, i);
- if (ret)
- return ret;
- }
-
bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
- do {
- ret = lockrestart_do(&trans,
- bch2_fs_quota_read_inode(&trans, &iter));
- } while (!ret);
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ __bch2_quota_set(c, k, NULL)) ?:
+ for_each_btree_key2(&trans, iter, BTREE_ID_inodes,
+ POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ bch2_fs_quota_read_inode(&trans, &iter, k));
+ if (ret)
+ bch_err(c, "err in quota_read: %s", bch2_err_str(ret));
bch2_trans_exit(&trans);
- return ret < 0 ? ret : 0;
+ return ret;
}
/* Enable/disable/delete quotas for an entire filesystem: */
static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
{
struct bch_fs *c = sb->s_fs_info;
+ struct bch_sb_field_quota *sb_quota;
+ int ret = 0;
if (sb->s_flags & SB_RDONLY)
return -EROFS;
return -EINVAL;
mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
+ }
+
if (uflags & FS_QUOTA_UDQ_ENFD)
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
bch2_write_super(c);
+unlock:
mutex_unlock(&c->sb_lock);
- return 0;
+ return bch2_err_class(ret);
}
static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
struct bch_fs *c = sb->s_fs_info;
struct bch_sb_field_quota *sb_quota;
struct bch_memquota_type *q;
+ int ret = 0;
+
+ if (0) {
+ struct printbuf buf = PRINTBUF;
+
+ qc_info_to_text(&buf, info);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
if (sb->s_flags & SB_RDONLY)
return -EROFS;
q = &c->quotas[type];
mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
- sb_quota = bch2_sb_resize_quota(&c->disk_sb,
- sizeof(*sb_quota) / sizeof(u64));
- if (!sb_quota)
- return -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
}
if (info->i_fieldmask & QC_SPC_TIMER)
bch2_sb_quota_read(c);
bch2_write_super(c);
+unlock:
mutex_unlock(&c->sb_lock);
- return 0;
+ return bch2_err_class(ret);
}
/* Get/set individual quotas: */
struct bkey_i_quota new_quota;
int ret;
+ if (0) {
+ struct printbuf buf = PRINTBUF;
+
+ qc_dqblk_to_text(&buf, qdq);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
if (sb->s_flags & SB_RDONLY)
return -EROFS;
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
- __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
return ret;
}
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_quota (struct bkey_ops) { \
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
+#include "errcode.h"
#include "extents.h"
#include "io.h"
#include "move.h"
- * returns -1 if it should not be moved, or
- * device of pointer that should be moved, if known, or INT_MAX if unknown
+ * Decide whether an extent needs rebalancing: returns true and fills in
+ * data_opts if any of its pointers should be rewritten, false otherwise.
*/
-static int __bch2_rebalance_pred(struct bch_fs *c,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
+ unsigned i;
+
+ data_opts->rewrite_ptrs = 0;
+ data_opts->target = io_opts->background_target;
+ data_opts->extra_replicas = 0;
+ data_opts->btree_insert_flags = 0;
if (io_opts->background_compression &&
- !bch2_bkey_is_incompressible(k))
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ !bch2_bkey_is_incompressible(k)) {
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
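+ /* Flag non-cached pointers whose data isn't compressed with the current background_compression setting: */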
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached &&
p.crc.compression_type !=
bch2_compression_opt_to_type[io_opts->background_compression])
- return p.ptr.dev;
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
- if (io_opts->background_target)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target))
- return p.ptr.dev;
+ if (io_opts->background_target) {
+ const struct bch_extent_ptr *ptr;
+
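+ /* Flag non-cached pointers that don't live on the background_target: */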
+ i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached &&
+ !bch2_dev_in_target(c, ptr->dev, io_opts->background_target))
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
- return -1;
+ return data_opts->rewrite_ptrs != 0;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
- atomic64_t *counter;
- int dev;
+ struct data_update_opts update_opts = { 0 };
+ struct bkey_ptrs_c ptrs;
+ const struct bch_extent_ptr *ptr;
+ unsigned i;
- dev = __bch2_rebalance_pred(c, k, io_opts);
- if (dev < 0)
+ if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
return;
- counter = dev < INT_MAX
- ? &bch_dev_bkey_exists(c, dev)->rebalance_work
- : &c->rebalance.work_unknown_dev;
-
- if (atomic64_add_return(k.k->size, counter) == k.k->size)
- rebalance_wakeup(c);
-}
-
-static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
- data_opts->target = io_opts->background_target;
- data_opts->nr_replicas = 1;
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
- } else {
- return DATA_SKIP;
+ i = 0;
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (update_opts.rewrite_ptrs & (1U << i))
+ if (atomic64_add_return(k.k->size,
+ &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
+ k.k->size)
+ rebalance_wakeup(c);
+ i++;
}
}
BTREE_ID_NR, POS_MAX,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
+ &move_stats,
writepoint_ptr(&c->rebalance_write_point),
- rebalance_pred, NULL,
- &move_stats);
+ true,
+ rebalance_pred, NULL);
}
return 0;
{
struct bch_fs_rebalance *r = &c->rebalance;
struct rebalance_work w = rebalance_work(c);
- char h1[21], h2[21];
- bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9);
- bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9);
- pr_buf(out, "fullest_dev (%i):\t%s/%s\n",
- w.dev_most_full_idx, h1, h2);
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 20);
+
+ prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
+ prt_tab(out);
+
+ prt_human_readable_u64(out, w.dev_most_full_work << 9);
+ prt_printf(out, "/");
+ prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
+ prt_newline(out);
+
+ prt_printf(out, "total work:");
+ prt_tab(out);
- bch2_hprint(&PBUF(h1), w.total_work << 9);
- bch2_hprint(&PBUF(h2), c->capacity << 9);
- pr_buf(out, "total work:\t\t%s/%s\n", h1, h2);
+ prt_human_readable_u64(out, w.total_work << 9);
+ prt_printf(out, "/");
+ prt_human_readable_u64(out, c->capacity << 9);
+ prt_newline(out);
- pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate);
+ prt_printf(out, "rate:");
+ prt_tab(out);
+ prt_printf(out, "%u", r->pd.rate.rate);
+ prt_newline(out);
switch (r->state) {
case REBALANCE_WAITING:
- pr_buf(out, "waiting\n");
+ prt_printf(out, "waiting");
break;
case REBALANCE_THROTTLED:
- bch2_hprint(&PBUF(h1),
+ prt_printf(out, "throttled for %lu sec or ",
+ (r->throttled_until_cputime - jiffies) / HZ);
+ prt_human_readable_u64(out,
(r->throttled_until_iotime -
atomic64_read(&c->io_clock[WRITE].now)) << 9);
- pr_buf(out, "throttled for %lu sec or %s io\n",
- (r->throttled_until_cputime - jiffies) / HZ,
- h1);
+ prt_printf(out, " io");
break;
case REBALANCE_RUNNING:
- pr_buf(out, "running\n");
+ prt_printf(out, "running");
break;
}
+ prt_newline(out);
}
void bch2_rebalance_stop(struct bch_fs *c)
int bch2_rebalance_start(struct bch_fs *c)
{
struct task_struct *p;
+ int ret;
if (c->rebalance.thread)
return 0;
return 0;
p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
- if (IS_ERR(p)) {
- bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p));
- return PTR_ERR(p);
+ ret = PTR_ERR_OR_ZERO(p);
+ if (ret) {
+ bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret));
+ return ret;
}
get_task_struct(p);
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "backpointers.h"
#include "bkey_buf.h"
#include "alloc_background.h"
#include "btree_gc.h"
#include "buckets.h"
#include "dirent.h"
#include "ec.h"
+#include "errcode.h"
#include "error.h"
#include "fs-common.h"
#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
+#include "lru.h"
#include "move.h"
#include "quota.h"
#include "recovery.h"
return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
-size_t bch2_journal_key_search(struct journal_keys *journal_keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
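+/*
+ * The journal keys array is kept as a gap buffer: keys->nr entries are live,
+ * with an unused gap of (keys->size - keys->nr) slots at index keys->gap.
+ * Logical index i maps to array position i below the gap and to i + gap_size
+ * at or above it; keeping the gap at the insertion point makes mostly-ordered
+ * insertions cheap.
+ */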
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
- size_t l = 0, r = journal_keys->nr, m;
+ size_t gap_size = keys->size - keys->nr;
+
+ if (idx >= keys->gap)
+ idx += gap_size;
+ return idx;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+ return keys->d + idx_to_pos(keys, idx);
+}
+
+static size_t __bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ size_t l = 0, r = keys->nr, m;
while (l < r) {
m = l + ((r - l) >> 1);
- if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
+ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
l = m + 1;
else
r = m;
}
- BUG_ON(l < journal_keys->nr &&
- __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
+ BUG_ON(l < keys->nr &&
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
BUG_ON(l &&
- __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
return l;
}
-static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
+static size_t bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
- struct bkey_i *n = iter->keys->d[idx].k;
- struct btree_and_journal_iter *biter =
- container_of(iter, struct btree_and_journal_iter, journal);
-
- if (iter->idx > idx ||
- (iter->idx == idx &&
- biter->last &&
- bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
- iter->idx++;
+ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
+}
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ unsigned iters = 0;
+ struct journal_key *k;
+search:
+ if (!*idx)
+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+ while (*idx < keys->nr &&
+ (k = idx_to_key(keys, *idx),
+ k->btree_id == btree_id &&
+ k->level == level &&
+ bpos_cmp(k->k->k.p, end_pos) <= 0)) {
+ if (bpos_cmp(k->k->k.p, pos) >= 0 &&
+ !k->overwritten)
+ return k->k;
+
+ (*idx)++;
+ iters++;
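+ /* After linearly skipping several overwritten keys, redo the binary search from scratch: */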
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ return NULL;
+}
+
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos)
+{
+ size_t idx = 0;
+
+ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iters_fix(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ /* The key we just inserted is immediately before the gap: */
+ size_t gap_end = keys->gap + (keys->size - keys->nr);
+ struct btree_and_journal_iter *iter;
+
+ /*
+ * If an iterator points one after the key we just inserted, decrement
+ * the iterator so it points at the key we just inserted - if the
+ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
+ * handle that:
+ */
+ list_for_each_entry(iter, &c->journal_iters, journal.list)
+ if (iter->journal.idx == gap_end)
+ iter->journal.idx = keys->gap - 1;
+}
+
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_iter *iter;
+ size_t gap_size = keys->size - keys->nr;
+
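+ /* Convert each iterator's array position past the old gap to a logical index, then back to an array position relative to the new gap: */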
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ if (iter->idx > old_gap)
+ iter->idx -= gap_size;
+ if (iter->idx >= new_gap)
+ iter->idx += gap_size;
+ }
}
int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
.journal_seq = U32_MAX,
};
struct journal_keys *keys = &c->journal_keys;
- struct journal_iter *iter;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
BUG_ON(test_bit(BCH_FS_RW, &c->flags));
- if (idx < keys->nr &&
+ if (idx < keys->size &&
journal_key_cmp(&n, &keys->d[idx]) == 0) {
if (keys->d[idx].allocated)
kfree(keys->d[idx].k);
return 0;
}
+ if (idx > keys->gap)
+ idx -= keys->size - keys->nr;
+
if (keys->nr == keys->size) {
struct journal_keys new_keys = {
.nr = keys->nr,
- .size = keys->size * 2,
- .journal_seq_base = keys->journal_seq_base,
+ .size = max_t(size_t, keys->size, 8) * 2,
};
- new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL);
+ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
if (!new_keys.d) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
return -ENOMEM;
}
+ /* Since @keys was full, there was no gap: */
memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
kvfree(keys->d);
*keys = new_keys;
+
+ /* And now the gap is at the end: */
+ keys->gap = keys->nr;
}
- array_insert_item(keys->d, keys->nr, idx, n);
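+ /* Move the gap to the insertion point, then put the new key in the first slot of the gap: */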
+ journal_iters_move_gap(c, keys->gap, idx);
- list_for_each_entry(iter, &c->journal_iters, list)
- journal_iter_fix(c, iter, idx);
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
+ keys->gap = idx;
+
+ keys->nr++;
+ keys->d[keys->gap++] = n;
+
+ journal_iters_fix(c);
return 0;
}
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
- if (idx < keys->nr &&
+ if (idx < keys->size &&
keys->d[idx].btree_id == btree &&
keys->d[idx].level == level &&
!bpos_cmp(keys->d[idx].k->k.p, pos))
keys->d[idx].overwritten = true;
}
-static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+ if (iter->idx < iter->keys->size) {
+ iter->idx++;
+ if (iter->idx == iter->keys->gap)
+ iter->idx += iter->keys->size - iter->keys->nr;
+ }
+}
+
+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
struct journal_key *k = iter->keys->d + iter->idx;
- while (k < iter->keys->d + iter->keys->nr &&
+ while (k < iter->keys->d + iter->keys->size &&
k->btree_id == iter->btree_id &&
k->level == iter->level) {
if (!k->overwritten)
- return k->k;
+ return bkey_i_to_s_c(k->k);
- iter->idx++;
+ bch2_journal_iter_advance(iter);
k = iter->keys->d + iter->idx;
}
- return NULL;
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
- if (iter->idx < iter->keys->nr)
- iter->idx++;
+ return bkey_s_c_null;
}
static void bch2_journal_iter_exit(struct journal_iter *iter)
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
- switch (iter->last) {
- case none:
- break;
- case btree:
- bch2_journal_iter_advance_btree(iter);
- break;
- case journal:
- bch2_journal_iter_advance(&iter->journal);
- break;
- }
-
- iter->last = none;
+ if (!bpos_cmp(iter->pos, SPOS_MAX))
+ iter->at_end = true;
+ else
+ iter->pos = bpos_successor(iter->pos);
}
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
- struct bkey_s_c ret;
-
- while (1) {
- struct bkey_s_c btree_k =
- bch2_journal_iter_peek_btree(iter);
- struct bkey_s_c journal_k =
- bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
+ struct bkey_s_c btree_k, journal_k, ret;
+again:
+ if (iter->at_end)
+ return bkey_s_c_null;
- if (btree_k.k && journal_k.k) {
- int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
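+ /* Advance both sub-iterators past any keys that precede iter->pos: */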
+ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
+ bpos_cmp(btree_k.k->p, iter->pos) < 0)
+ bch2_journal_iter_advance_btree(iter);
- if (!cmp)
- bch2_journal_iter_advance_btree(iter);
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_cmp(journal_k.k->p, iter->pos) < 0)
+ bch2_journal_iter_advance(&iter->journal);
- iter->last = cmp < 0 ? btree : journal;
- } else if (btree_k.k) {
- iter->last = btree;
- } else if (journal_k.k) {
- iter->last = journal;
- } else {
- iter->last = none;
- return bkey_s_c_null;
- }
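+ /* Return whichever key comes first; on a tie the journal key overrides the btree key: */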
+ ret = journal_k.k &&
+ (!btree_k.k || bpos_cmp(journal_k.k->p, btree_k.k->p) <= 0)
+ ? journal_k
+ : btree_k;
- ret = iter->last == journal ? journal_k : btree_k;
+ if (ret.k && iter->b && bpos_cmp(ret.k->p, iter->b->data->max_key) > 0)
+ ret = bkey_s_c_null;
- if (iter->b &&
- bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
- iter->journal.idx = iter->journal.keys->nr;
- iter->last = none;
- return bkey_s_c_null;
+ if (ret.k) {
+ iter->pos = ret.k->p;
+ if (bkey_deleted(ret.k)) {
+ bch2_btree_and_journal_iter_advance(iter);
+ goto again;
}
-
- if (!bkey_deleted(ret.k))
- break;
-
- bch2_btree_and_journal_iter_advance(iter);
+ } else {
+ iter->pos = SPOS_MAX;
+ iter->at_end = true;
}
return ret;
}
-struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter)
-{
- bch2_btree_and_journal_iter_advance(iter);
-
- return bch2_btree_and_journal_iter_peek(iter);
-}
-
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
bch2_journal_iter_exit(&iter->journal);
iter->node_iter = node_iter;
bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
INIT_LIST_HEAD(&iter->journal.list);
+ iter->pos = b->data->min_key;
+ iter->at_end = false;
}
/*
/* sort and dedup all keys in the journal: */
-void bch2_journal_entries_free(struct list_head *list)
+void bch2_journal_entries_free(struct bch_fs *c)
{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
+ struct journal_replay **i;
+ struct genradix_iter iter;
+
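+ /* Journal entries now live in a genradix; free each populated slot, then the radix tree itself: */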
+ genradix_for_each(&c->journal_entries, iter, i)
+ if (*i)
+ kvpfree(*i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&(*i)->j));
+ genradix_free(&c->journal_entries);
}
/*
{
struct journal_key *i;
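+ /* Close the gap so the key array is contiguous before walking and freeing it: */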
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
for (i = keys->d; i < keys->d + keys->nr; i++)
if (i->allocated)
kfree(i->k);
kvfree(keys->d);
keys->d = NULL;
- keys->nr = 0;
+ keys->nr = keys->gap = keys->size = 0;
}
-static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
+static int journal_keys_sort(struct bch_fs *c)
{
- struct journal_replay *i;
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
- struct journal_keys keys = { NULL };
+ struct journal_keys *keys = &c->journal_keys;
struct journal_key *src, *dst;
size_t nr_keys = 0;
- if (list_empty(journal_entries))
- return keys;
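+ /* First pass: count keys so the array can be allocated in one go: */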
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
- list_for_each_entry(i, journal_entries, list) {
- if (i->ignore)
+ if (!i || i->ignore)
continue;
- if (!keys.journal_seq_base)
- keys.journal_seq_base = le64_to_cpu(i->j.seq);
-
for_each_jset_key(k, _n, entry, &i->j)
nr_keys++;
}
- keys.size = roundup_pow_of_two(nr_keys);
+ if (!nr_keys)
+ return 0;
- keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL);
- if (!keys.d)
- goto err;
+ keys->size = roundup_pow_of_two(nr_keys);
- list_for_each_entry(i, journal_entries, list) {
- if (i->ignore)
- continue;
+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+ if (!keys->d)
+ return -ENOMEM;
- BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
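+ /* Second pass: fill in the key array: */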
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
for_each_jset_key(k, _n, entry, &i->j)
- keys.d[keys.nr++] = (struct journal_key) {
+ keys->d[keys->nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
- .journal_seq = le64_to_cpu(i->j.seq) -
- keys.journal_seq_base,
+ .journal_seq = le64_to_cpu(i->j.seq),
.journal_offset = k->_data - i->j._data,
};
}
- sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
+ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
- src = dst = keys.d;
- while (src < keys.d + keys.nr) {
- while (src + 1 < keys.d + keys.nr &&
+ src = dst = keys->d;
+ while (src < keys->d + keys->nr) {
+ while (src + 1 < keys->d + keys->nr &&
src[0].btree_id == src[1].btree_id &&
src[0].level == src[1].level &&
!bpos_cmp(src[0].k->k.p, src[1].k->k.p))
*dst++ = *src++;
}
- keys.nr = dst - keys.d;
-err:
- return keys;
+ keys->nr = dst - keys->d;
+ keys->gap = keys->nr;
+ return 0;
}
/* journal replay: */
static void replay_now_at(struct journal *j, u64 seq)
{
BUG_ON(seq < j->replay_journal_seq);
- BUG_ON(seq > j->replay_journal_seq_end);
+
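+ /* Clamp to the end of the replay range instead of asserting: */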
+ seq = min(seq, j->replay_journal_seq_end);
while (j->replay_journal_seq < seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
size_t i;
int ret;
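+ /* Replay wants the key array contiguous, so move the gap to the end first: */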
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
if (!keys_sorted)
return -ENOMEM;
sizeof(keys_sorted[0]),
journal_sort_seq_cmp, NULL);
- if (keys->nr)
- replay_now_at(j, keys->journal_seq_base);
-
for (i = 0; i < keys->nr; i++) {
k = keys_sorted[i];
cond_resched();
- if (!k->allocated)
- replay_now_at(j, keys->journal_seq_base + k->journal_seq);
+ replay_now_at(j, k->journal_seq);
ret = bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED|
- (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0),
+ (!k->allocated
+ ? BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved
+ : 0),
bch2_journal_replay_key(&trans, k));
if (ret) {
bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
ret = bch2_journal_error(j);
+
+ if (keys->nr && !ret)
+ bch2_journal_log_msg(&c->journal, "journal replay finished");
err:
kvfree(keys_sorted);
return ret;
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
- ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
}
static int journal_replay_early(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct list_head *journal)
+ struct bch_sb_field_clean *clean)
{
- struct journal_replay *i;
struct jset_entry *entry;
int ret;
return ret;
}
} else {
- list_for_each_entry(i, journal, list) {
- if (i->ignore)
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
vstruct_for_each(&i->j, entry) {
{
unsigned i;
struct bch_sb_field_clean *clean = *cleanp;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
int ret = 0;
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
}
for (i = 0; i < BTREE_ID_NR; i++) {
- char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
unsigned l1 = 0, l2 = 0;
if (!k1 && !k2)
continue;
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ if (k1)
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+ else
+ prt_printf(&buf1, "(none)");
+
+ if (k2)
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+ else
+ prt_printf(&buf2, "(none)");
+
mustfix_fsck_err_on(!k1 || !k2 ||
IS_ERR(k1) ||
IS_ERR(k2) ||
"superblock btree root %u doesn't match journal after clean shutdown\n"
"sb: l=%u %s\n"
"journal: l=%u %s\n", i,
- l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1),
- l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2));
+ l1, buf1.buf,
+ l2, buf2.buf);
}
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
return ERR_PTR(-ENOMEM);
}
- ret = bch2_sb_clean_validate(c, clean, READ);
+ ret = bch2_sb_clean_validate_late(c, clean, READ);
if (ret) {
mutex_unlock(&c->sb_lock);
return ERR_PTR(ret);
return ERR_PTR(ret);
}
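+/* Allocation info now spans several btrees; reconstruct_alloc and root read errors treat them all alike: */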
+static bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int read_btree_roots(struct bch_fs *c)
{
unsigned i;
if (!r->alive)
continue;
- if (i == BTREE_ID_alloc &&
+ if (btree_id_is_alloc(i) &&
c->opts.reconstruct_alloc) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
continue;
}
if (r->error) {
- __fsck_err(c, i == BTREE_ID_alloc
+ __fsck_err(c, btree_id_is_alloc(i)
? FSCK_CAN_IGNORE : 0,
"invalid btree root %s",
bch2_btree_ids[i]);
ret = bch2_btree_root_read(c, i, &r->key, r->level);
if (ret) {
- __fsck_err(c, i == BTREE_ID_alloc
+ __fsck_err(c,
+ btree_id_is_alloc(i)
? FSCK_CAN_IGNORE : 0,
"error reading btree root %s",
bch2_btree_ids[i]);
if (ret)
return ret;
-
bkey_subvolume_init(&root_volume.k_i);
root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
root_volume.v.flags = 0;
c->opts.fix_errors = FSCK_OPT_YES;
}
- if (!c->replicas.entries ||
- c->opts.rebuild_replicas) {
- bch_info(c, "building replicas info");
- set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
- }
-
if (!c->opts.nochanges) {
- if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
- bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
+ if (c->sb.version < bcachefs_metadata_version_backpointers) {
+ bch_info(c, "version prior to backpointers, upgrade and fsck required");
c->opts.version_upgrade = true;
c->opts.fsck = true;
c->opts.fix_errors = FSCK_OPT_YES;
- } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) {
- bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
- c->opts.version_upgrade = true;
- c->opts.fsck = true;
- } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
- bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
- c->opts.version_upgrade = true;
+ } else if (c->sb.version < bcachefs_metadata_version_inode_v3) {
+ bch_info(c, "version prior to inode_v3, upgrade required");
+ c->opts.version_upgrade = true;
}
}
+ if (c->opts.fsck && c->opts.norecovery) {
+ bch_err(c, "cannot select both norecovery and fsck");
+ ret = -EINVAL;
+ goto err;
+ }
+
ret = bch2_blacklist_table_initialize(c);
if (ret) {
bch_err(c, "error initializing blacklist table");
}
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
- struct journal_replay *i;
+ struct genradix_iter iter;
+ struct journal_replay **i;
bch_verbose(c, "starting journal read");
- ret = bch2_journal_read(c, &c->journal_entries,
- &blacklist_seq, &journal_seq);
+ ret = bch2_journal_read(c, &blacklist_seq, &journal_seq);
if (ret)
goto err;
- list_for_each_entry_reverse(i, &c->journal_entries, list)
- if (!i->ignore) {
- last_journal_entry = &i->j;
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i && !(*i)->ignore) {
+ last_journal_entry = &(*i)->j;
break;
}
goto use_clean;
}
- c->journal_keys = journal_keys_sort(&c->journal_entries);
- if (!c->journal_keys.d) {
- ret = -ENOMEM;
+ ret = journal_keys_sort(c);
+ if (ret)
goto err;
- }
if (c->sb.clean && last_journal_entry) {
ret = verify_superblock_clean(c, &clean,
use_clean:
if (!clean) {
bch_err(c, "no superblock clean section found");
- ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
+ ret = -BCH_ERR_fsck_repair_impossible;
goto err;
}
zero_out_btree_mem_ptr(&c->journal_keys);
- ret = journal_replay_early(c, clean, &c->journal_entries);
+ ret = journal_replay_early(c, clean);
if (ret)
goto err;
}
}
- ret = bch2_fs_journal_start(&c->journal, journal_seq,
- &c->journal_entries);
+ /*
+ * note: cmd_list_journal needs the blacklist table fully up to date so
+ * it can asterisk ignored journal entries:
+ */
+ if (c->opts.read_journal_only)
+ goto out;
+
+ ret = bch2_fs_journal_start(&c->journal, journal_seq);
if (ret)
goto err;
+ /*
+ * Skip past versions that might have possibly been used (as nonces),
+ * but hadn't had their pointers written:
+ */
+ if (c->sb.encryption_type && !c->sb.clean)
+ atomic64_add(1 << 16, &c->key_version);
+
ret = read_btree_roots(c);
if (ret)
goto err;
err = "error reading allocation information";
down_read(&c->gc_lock);
- ret = bch2_alloc_read(c, false, false);
+ ret = bch2_alloc_read(c);
up_read(&c->gc_lock);
if (ret)
goto err;
bch_verbose(c, "stripes_read done");
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-
- /*
- * If we're not running fsck, this ensures bch2_fsck_err() calls are
- * instead interpreted as bch2_inconsistent_err() calls:
- */
- if (!c->opts.fsck)
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
+ bch2_stripes_heap_start(c);
- if (c->opts.fsck ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
- test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
+ if (c->opts.fsck) {
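+ /* Full fsck: run the allocation, lru and backpointer passes, with journal replay in between: */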
bool metadata_only = c->opts.norecovery;
bch_info(c, "checking allocations");
- err = "error in mark and sweep";
+ err = "error checking allocations";
ret = bch2_gc(c, true, metadata_only);
if (ret)
goto err;
bch_verbose(c, "done checking allocations");
- }
- bch2_stripes_heap_start(c);
+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
- clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
- set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ bch_info(c, "checking need_discard and freespace btrees");
+ err = "error checking need_discard and freespace btrees";
+ ret = bch2_check_alloc_info(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking need_discard and freespace btrees");
- /*
- * Skip past versions that might have possibly been used (as nonces),
- * but hadn't had their pointers written:
- */
- if (c->sb.encryption_type && !c->sb.clean)
- atomic64_add(1 << 16, &c->key_version);
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
- if (c->opts.norecovery)
- goto out;
+ bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+ err = "journal replay failed";
+ ret = bch2_journal_replay(c);
+ if (ret)
+ goto err;
+ if (c->opts.verbose || !c->sb.clean)
+ bch_info(c, "journal replay done");
+
+ bch_info(c, "checking lrus");
+ err = "error checking lrus";
+ ret = bch2_check_lrus(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking lrus");
+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+
+ bch_info(c, "checking backpointers to alloc keys");
+ err = "error checking backpointers to alloc keys";
+ ret = bch2_check_btree_backpointers(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking backpointers to alloc keys");
+
+ bch_info(c, "checking backpointers to extents");
+ err = "error checking backpointers to extents";
+ ret = bch2_check_backpointers_to_extents(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking backpointers to extents");
+
+ bch_info(c, "checking extents to backpointers");
+ err = "error checking extents to backpointers";
+ ret = bch2_check_extents_to_backpointers(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking extents to backpointers");
+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+
+ bch_info(c, "checking alloc to lru refs");
+ err = "error checking alloc to lru refs";
+ ret = bch2_check_alloc_to_lru_refs(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking alloc to lru refs");
+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
+ } else {
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+ if (c->opts.norecovery)
+ goto out;
+
+ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+ err = "journal replay failed";
+ ret = bch2_journal_replay(c);
+ if (ret)
+ goto err;
+ if (c->opts.verbose || !c->sb.clean)
+ bch_info(c, "journal replay done");
+ }
- bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
- err = "journal replay failed";
- ret = bch2_journal_replay(c);
+ err = "error initializing freespace";
+ ret = bch2_fs_freespace_init(c);
if (ret)
goto err;
- if (c->opts.verbose || !c->sb.clean)
- bch_info(c, "journal replay done");
if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
bch2_fs_lazy_rw(c);
if (!c->opts.keep_journal) {
bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(&c->journal_entries);
+ bch2_journal_entries_free(c);
}
kfree(clean);
+
+ if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
+ bch2_fs_read_write_early(c);
+ bch2_delete_dead_snapshots_async(c);
+ }
+
if (ret)
- bch_err(c, "Error in recovery: %s (%i)", err, ret);
+ bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret));
else
- bch_verbose(c, "ret %i", ret);
+ bch_verbose(c, "ret %s", bch2_err_str(ret));
return ret;
err:
fsck_err:
struct qstr lostfound = QSTR("lost+found");
const char *err = "cannot allocate memory";
struct bch_dev *ca;
- LIST_HEAD(journal);
unsigned i;
int ret;
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+ if (c->sb.version < bcachefs_metadata_version_inode_v3)
+ c->opts.version_upgrade = true;
+
if (c->opts.version_upgrade) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
}
mutex_unlock(&c->sb_lock);
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
set_bit(BCH_FS_FSCK_DONE, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
+ for_each_online_member(ca, c, i)
+ bch2_dev_usage_init(ca);
+
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i) {
ret = bch2_dev_journal_alloc(ca);
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_fs_journal_start(&c->journal, 1, &journal);
+ bch2_fs_journal_start(&c->journal, 1);
bch2_journal_set_replay_done(&c->journal);
err = "error going read-write";
* Write out the superblock and journal buckets, now that we can do
* btree updates
*/
+ bch_verbose(c, "marking superblocks");
err = "error marking superblock and journal";
for_each_member_device(ca, c, i) {
ret = bch2_trans_mark_dev_sb(c, ca);
ca->new_fs_bucket_idx = 0;
}
+ bch_verbose(c, "initializing freespace");
+ err = "error initializing freespace";
+ ret = bch2_fs_freespace_init(c);
+ if (ret)
+ goto err;
+
err = "error creating root snapshot node";
ret = bch2_fs_initialize_subvolumes(c);
if (ret)
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
- bch2_inode_pack(c, &packed_inode, &root_inode);
+ bch2_inode_pack(&packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;
err = "error creating root directory";
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-#define for_each_journal_key(keys, i) \
- for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
-
struct journal_iter {
struct list_head list;
enum btree_id btree_id;
struct bkey unpacked;
struct journal_iter journal;
-
- enum last_key_returned {
- none,
- btree,
- journal,
- } last;
+ struct bpos pos;
+ bool at_end;
};
-size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
- unsigned, struct bpos);
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct btree *);
void bch2_journal_keys_free(struct journal_keys *);
-void bch2_journal_entries_free(struct list_head *);
+void bch2_journal_entries_free(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
/* reflink pointers */
-const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- if (bkey_val_bytes(p.k) != sizeof(*p.v))
- return "incorrect value size";
+ if (bkey_val_bytes(p.k) != sizeof(*p.v)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(p.k), sizeof(*p.v));
+ return -EINVAL;
+ }
if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
- le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad))
- return "idx < front_pad";
+ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) {
+ prt_printf(err, "idx < front_pad (%llu < %u)",
+ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- pr_buf(out, "idx %llu front_pad %u back_pad %u",
+ prt_printf(out, "idx %llu front_pad %u back_pad %u",
le64_to_cpu(p.v->idx),
le32_to_cpu(p.v->front_pad),
le32_to_cpu(p.v->back_pad));
/* indirect extents */
-const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
- if (bkey_val_bytes(r.k) < sizeof(*r.v))
- return "incorrect value size";
+ if (bkey_val_bytes(r.k) < sizeof(*r.v)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(r.k), sizeof(*r.v));
+ return -EINVAL;
+ }
- return bch2_bkey_ptrs_invalid(c, k);
+ return bch2_bkey_ptrs_invalid(c, k, rw, err);
}
void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
- pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
bch2_bkey_ptrs_to_text(out, c, k);
}
return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
}
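+/* Transactional trigger for indirect extents: when the refcount drops to zero, rewrite the key as a deleted key; otherwise do the normal extent accounting: */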
+int bch2_trans_mark_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
+ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
+
+ if (!r->v.refcount) {
+ r->k.type = KEY_TYPE_deleted;
+ r->k.size = 0;
+ set_bkey_val_u64s(&r->k, 0);
+ return 0;
+ }
+ }
+
+ return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
+}
+
/* indirect inline data */
-const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data))
- return "incorrect value size";
- return NULL;
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data));
+ return -EINVAL;
+ }
+
+ return 0;
}
void bch2_indirect_inline_data_to_text(struct printbuf *out,
struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
unsigned datalen = bkey_inline_data_bytes(k.k);
- pr_buf(out, "refcount %llu datalen %u: %*phN",
+ prt_printf(out, "refcount %llu datalen %u: %*phN",
le64_to_cpu(d.v->refcount), datalen,
min(datalen, 32U), d.v->data);
}
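+/* Likewise for indirect inline data: a zero refcount means the key can be dropped: */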
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
+ struct bkey_i_indirect_inline_data *r =
+ bkey_i_to_indirect_inline_data(new);
+
+ if (!r->v.refcount) {
+ r->k.type = KEY_TYPE_deleted;
+ r->k.size = 0;
+ set_bkey_val_u64s(&r->k, 0);
+ }
+ }
+
+ return 0;
+}
+
static int bch2_make_extent_indirect(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i *orig)
u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
- if (!percpu_ref_tryget(&c->writes))
+ if (!percpu_ref_tryget_live(&c->writes))
return -EROFS;
bch2_check_set_feature(c, BCH_FEATURE_reflink);
bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
BTREE_ITER_INTENT);
- while ((ret == 0 || ret == -EINTR) &&
+ while ((ret == 0 ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
bkey_cmp(dst_iter.pos, dst_end) < 0) {
struct disk_reservation disk_res = { 0 };
}
bch2_trans_iter_exit(&trans, &inode_iter);
- } while (ret2 == -EINTR);
+ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&new_src, c);
#ifndef _BCACHEFS_REFLINK_H
#define _BCACHEFS_REFLINK_H
-const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c,
+ int, struct printbuf *);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
.val_to_text = bch2_reflink_p_to_text, \
- .key_merge = bch2_reflink_p_merge, \
+ .key_merge = bch2_reflink_p_merge, \
+ .trans_trigger = bch2_trans_mark_reflink_p, \
+ .atomic_trigger = bch2_mark_reflink_p, \
}
-const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
+ int, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
+int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_reflink_v, \
+ .atomic_trigger = bch2_mark_extent, \
}
-const char *bch2_indirect_inline_data_invalid(const struct bch_fs *,
- struct bkey_s_c);
+int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
+ int, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *,
+ unsigned);
#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
.val_to_text = bch2_indirect_inline_data_to_text, \
+ .trans_trigger = bch2_trans_mark_indirect_inline_data, \
}
static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
+void bch2_replicas_entry_v0_to_text(struct printbuf *out,
+ struct bch_replicas_entry_v0 *e)
+{
+ unsigned i;
+
+ if (e->data_type < BCH_DATA_NR)
+ prt_printf(out, "%s", bch2_data_types[e->data_type]);
+ else
+ prt_printf(out, "(invalid data type %u)", e->data_type);
+
+ prt_printf(out, ": %u [", e->nr_devs);
+ for (i = 0; i < e->nr_devs; i++)
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
+}
+
void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry *e)
{
unsigned i;
if (e->data_type < BCH_DATA_NR)
- pr_buf(out, "%s", bch2_data_types[e->data_type]);
+ prt_printf(out, "%s", bch2_data_types[e->data_type]);
else
- pr_buf(out, "(invalid data type %u)", e->data_type);
+ prt_printf(out, "(invalid data type %u)", e->data_type);
- pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs);
+ prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
for (i = 0; i < e->nr_devs; i++)
- pr_buf(out, i ? " %u" : "%u", e->devs[i]);
- pr_buf(out, "]");
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
}
void bch2_cpu_replicas_to_text(struct printbuf *out,
for_each_cpu_replicas_entry(r, e) {
if (!first)
- pr_buf(out, " ");
+ prt_printf(out, " ");
first = false;
bch2_replicas_entry_to_text(out, e);
bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
n = cpu_replicas_add_entry(&c->replicas_gc, e);
if (!n.entries) {
- ret = -ENOSPC;
+ ret = -ENOMEM;
goto err;
}
}
}
- if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
- ret = -ENOSPC;
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
+ if (ret)
goto err;
- }
ret = replicas_table_update(c, &c->replicas_gc);
err:
bch2_cpu_replicas_sort(&new);
- if (bch2_cpu_replicas_to_sb_replicas(c, &new)) {
- ret = -ENOSPC;
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
+ if (ret)
goto err;
- }
ret = replicas_table_update(c, &new);
err:
sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -ENOSPC;
+ return -BCH_ERR_ENOSPC_sb_replicas;
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);
sb_r = bch2_sb_resize_replicas(&c->disk_sb,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -ENOSPC;
+ return -BCH_ERR_ENOSPC_sb_replicas;
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
cpu_replicas_entry(cpu_r, i);
if (e->data_type >= BCH_DATA_NR) {
- pr_buf(err, "invalid data type in entry ");
+ prt_printf(err, "invalid data type in entry ");
bch2_replicas_entry_to_text(err, e);
return -EINVAL;
}
if (!e->nr_devs) {
- pr_buf(err, "no devices in entry ");
+ prt_printf(err, "no devices in entry ");
bch2_replicas_entry_to_text(err, e);
return -EINVAL;
}
if (e->nr_required > 1 &&
e->nr_required >= e->nr_devs) {
- pr_buf(err, "bad nr_required in entry ");
+ prt_printf(err, "bad nr_required in entry ");
bch2_replicas_entry_to_text(err, e);
return -EINVAL;
}
for (j = 0; j < e->nr_devs; j++)
if (!bch2_dev_exists(sb, mi, e->devs[j])) {
- pr_buf(err, "invalid device %u in entry ", e->devs[j]);
+ prt_printf(err, "invalid device %u in entry ", e->devs[j]);
bch2_replicas_entry_to_text(err, e);
return -EINVAL;
}
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
if (!memcmp(e, n, cpu_r->entry_size)) {
- pr_buf(err, "duplicate replicas entry ");
+ prt_printf(err, "duplicate replicas entry ");
bch2_replicas_entry_to_text(err, e);
return -EINVAL;
}
return 0;
}
-static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f,
+static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
for_each_replicas_entry(r, e) {
if (!first)
- pr_buf(out, " ");
+ prt_printf(out, " ");
first = false;
bch2_replicas_entry_to_text(out, e);
}
+ prt_newline(out);
}
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
- .validate = bch2_sb_validate_replicas,
+ .validate = bch2_sb_replicas_validate,
.to_text = bch2_sb_replicas_to_text,
};
-static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f,
+static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
return ret;
}
+static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+ struct bch_replicas_entry_v0 *e;
+ bool first = true;
+
+ for_each_replicas_entry(sb_r, e) {
+ if (!first)
+ prt_printf(out, " ");
+ first = false;
+
+ bch2_replicas_entry_v0_to_text(out, e);
+ }
+ prt_newline(out);
+}
+
const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
- .validate = bch2_sb_validate_replicas_v0,
+ .validate = bch2_sb_replicas_v0_validate,
+ .to_text = bch2_sb_replicas_v0_to_text,
};
/* Query replicas: */
if (dflags & ~flags) {
if (print) {
- char buf[100];
+ struct printbuf buf = PRINTBUF;
- bch2_replicas_entry_to_text(&PBUF(buf), e);
+ bch2_replicas_entry_to_text(&buf, e);
bch_err(c, "insufficient devices online (%u) for replicas entry %s",
- nr_online, buf);
+ nr_online, buf.buf);
+ printbuf_exit(&buf);
}
ret = false;
break;
return ret;
}
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
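+/* Compute the mask of data types present on a device directly from the superblock replicas sections (v1 or v0): */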
+unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
- struct bch_replicas_entry *e;
- unsigned i, ret = 0;
+ struct bch_sb_field_replicas *replicas;
+ struct bch_sb_field_replicas_v0 *replicas_v0;
+ unsigned i, data_has = 0;
+
+ replicas = bch2_sb_get_replicas(sb);
+ replicas_v0 = bch2_sb_get_replicas_v0(sb);
+
+ if (replicas) {
+ struct bch_replicas_entry *r;
+
+ for_each_replicas_entry(replicas, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ } else if (replicas_v0) {
+ struct bch_replicas_entry_v0 *r;
+
+ for_each_replicas_entry_v0(replicas_v0, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ }
- percpu_down_read(&c->mark_lock);
- for_each_cpu_replicas_entry(&c->replicas, e)
- for (i = 0; i < e->nr_devs; i++)
- if (e->devs[i] == ca->dev_idx)
- ret |= 1 << e->data_type;
+ return data_has;
+}
- percpu_up_read(&c->mark_lock);
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&c->sb_lock);
return ret;
}
#ifndef _BCACHEFS_REPLICAS_H
#define _BCACHEFS_REPLICAS_H
+#include "bkey.h"
#include "eytzinger.h"
#include "replicas_types.h"
bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
unsigned, bool);
+unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
memset(ctx, 0, sizeof(*ctx));
- return (r);
+ return r;
}
u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
{
return k.k->type == desc.key_type &&
- (!desc.is_visible || desc.is_visible(inum, k));
+ (!desc.is_visible ||
+ !inum.inum ||
+ desc.is_visible(inum, k));
}
static __always_inline int
if (ret)
return ret;
- for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+ POS(inum.inum, U64_MAX),
BTREE_ITER_SLOTS|flags, k, ret) {
- if (iter->pos.inode != inum.inum)
- break;
-
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
return 0;
if (ret)
return ret;
- for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter->pos.inode != inum.inum)
- break;
-
+ POS(inum.inum, U64_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
if (!is_visible_key(desc, inum, k))
return 0;
- }
bch2_trans_iter_exit(trans, iter);
- return ret ?: -ENOSPC;
+ return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
}
static __always_inline
}
static __always_inline
-int bch2_hash_set(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum,
- struct bkey_i *insert, int flags)
+int bch2_hash_set_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ int flags,
+ int update_flags)
{
struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
bool found = false;
- u32 snapshot;
int ret;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
- for_each_btree_key_norestart(trans, iter, desc.btree_id,
- SPOS(inum.inum,
+ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+ SPOS(insert->k.p.inode,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
+ POS(insert->k.p.inode, U64_MAX),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter.pos.inode != inum.inum)
- break;
-
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
}
if (!ret)
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_str_hash_create;
out:
bch2_trans_iter_exit(trans, &slot);
bch2_trans_iter_exit(trans, &iter);
goto out;
}
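+/* bch2_hash_set() is now a wrapper that resolves the subvolume to a snapshot and calls bch2_hash_set_snapshot(): */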
+static __always_inline
+int bch2_hash_set(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum,
+ struct bkey_i *insert, int flags)
+{
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ insert->k.p.inode = inum.inum;
+
+ return bch2_hash_set_snapshot(trans, desc, info, inum,
+ snapshot, insert, flags, 0);
+}
+
static __always_inline
int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "subvolume.h"
/* Snapshot tree: */
-static void bch2_delete_dead_snapshots_work(struct work_struct *);
-static void bch2_delete_dead_snapshots(struct bch_fs *);
-
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
+ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u",
BCH_SNAPSHOT_SUBVOL(s.v),
BCH_SNAPSHOT_DELETED(s.v),
le32_to_cpu(s.v->parent),
le32_to_cpu(s.v->subvol));
}
-const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
struct bkey_s_c_snapshot s;
u32 i, id;
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
- bkey_cmp(k.k->p, POS(0, 1)) < 0)
- return "bad pos";
+ bkey_cmp(k.k->p, POS(0, 1)) < 0) {
+ prt_printf(err, "bad pos");
+ return -EINVAL;
+ }
- if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
- return "bad val size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) {
+ prt_printf(err, "bad val size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_snapshot));
+ return -EINVAL;
+ }
s = bkey_s_c_to_snapshot(k);
id = le32_to_cpu(s.v->parent);
- if (id && id <= k.k->p.offset)
- return "bad parent node";
+ if (id && id <= k.k->p.offset) {
+ prt_printf(err, "bad parent node (%u <= %llu)",
+ id, k.k->p.offset);
+ return -EINVAL;
+ }
- if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
- return "children not normalized";
+ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
+ prt_printf(err, "children not normalized");
+ return -EINVAL;
+ }
if (s.v->children[0] &&
- s.v->children[0] == s.v->children[1])
- return "duplicate child nodes";
+ s.v->children[0] == s.v->children[1]) {
+ prt_printf(err, "duplicate child nodes");
+ return -EINVAL;
+ }
for (i = 0; i < 2; i++) {
id = le32_to_cpu(s.v->children[i]);
- if (id >= k.k->p.offset)
- return "bad child node";
+ if (id >= k.k->p.offset) {
+ prt_printf(err, "bad child node (%u >= %llu)",
+ id, k.k->p.offset);
+ return -EINVAL;
+ }
}
- return NULL;
+ return 0;
}
int bch2_mark_snapshot(struct btree_trans *trans,
if (!id)
return 0;
- ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ ret = snapshot_lookup(trans, id, &v);
if (ret == -ENOENT)
bch_err(trans->c, "snapshot node %u not found", id);
if (ret)
return !BCH_SNAPSHOT_DELETED(&v);
}
-static int bch2_snapshots_set_equiv(struct btree_trans *trans)
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
+ unsigned i, nr_live = 0, live_idx = 0;
struct bkey_s_c_snapshot snap;
- unsigned i;
- int ret;
+ u32 id = k.k->p.offset, child[2];
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
- u32 id = k.k->p.offset, child[2];
- unsigned nr_live = 0, live_idx;
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
+ snap = bkey_s_c_to_snapshot(k);
- snap = bkey_s_c_to_snapshot(k);
- child[0] = le32_to_cpu(snap.v->children[0]);
- child[1] = le32_to_cpu(snap.v->children[1]);
+ child[0] = le32_to_cpu(snap.v->children[0]);
+ child[1] = le32_to_cpu(snap.v->children[1]);
- for (i = 0; i < 2; i++) {
- ret = snapshot_live(trans, child[i]);
- if (ret < 0)
- break;
-
- if (ret)
- live_idx = i;
- nr_live += ret;
- }
+ for (i = 0; i < 2; i++) {
+ int ret = snapshot_live(trans, child[i]);
+ if (ret < 0)
+ return ret;
- snapshot_t(c, id)->equiv = nr_live == 1
- ? snapshot_t(c, child[live_idx])->equiv
- : id;
+ if (ret)
+ live_idx = i;
+ nr_live += ret;
}
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- bch_err(c, "error walking snapshots: %i", ret);
- return ret;
+ snapshot_t(c, id)->equiv = nr_live == 1
+ ? snapshot_t(c, child[live_idx])->equiv
+ : id;
+ return 0;
}
/* fsck: */
-static int bch2_snapshot_check(struct btree_trans *trans,
- struct bkey_s_c_snapshot s)
+static int check_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot s;
struct bch_subvolume subvol;
struct bch_snapshot v;
+ struct printbuf buf = PRINTBUF;
+ bool should_have_subvol;
u32 i, id;
- int ret;
-
- id = le32_to_cpu(s.v->subvol);
- ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol));
- if (ret == -ENOENT)
- bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
- s.k->p.offset, id);
- if (ret)
- return ret;
+ int ret = 0;
- if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
- bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
- s.k->p.offset);
- return -EINVAL;
- }
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+ s = bkey_s_c_to_snapshot(k);
id = le32_to_cpu(s.v->parent);
if (id) {
- ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ ret = snapshot_lookup(trans, id, &v);
if (ret == -ENOENT)
- bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
- s.k->p.offset, id);
+ bch_err(c, "snapshot with nonexistent parent:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
if (ret)
- return ret;
+ goto err;
if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
le32_to_cpu(v.children[1]) != s.k->p.offset) {
- bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
+ bch_err(c, "snapshot parent %u missing pointer to child %llu",
id, s.k->p.offset);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
}
for (i = 0; i < 2 && s.v->children[i]; i++) {
id = le32_to_cpu(s.v->children[i]);
- ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ ret = snapshot_lookup(trans, id, &v);
if (ret == -ENOENT)
- bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
+ bch_err(c, "snapshot node %llu has nonexistent child %u",
s.k->p.offset, id);
if (ret)
- return ret;
+ goto err;
if (le32_to_cpu(v.parent) != s.k->p.offset) {
- bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
+ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
id, le32_to_cpu(v.parent), s.k->p.offset);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
}
- return 0;
+ should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) &&
+ !BCH_SNAPSHOT_DELETED(s.v);
+
+ if (should_have_subvol) {
+ id = le32_to_cpu(s.v->subvol);
+ ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+ if (ret == -ENOENT)
+ bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
+ if (ret)
+ goto err;
+
+ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
+ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+ s.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ } else {
+ if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u));
+
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&u->k_i, s.s_c);
+ u->v.subvol = 0;
+ ret = bch2_trans_update(trans, iter, &u->k_i, 0);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (BCH_SNAPSHOT_DELETED(s.v))
+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
}
-int bch2_fs_snapshots_check(struct bch_fs *c)
+int bch2_fs_check_snapshots(struct bch_fs *c)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bch_snapshot s;
- unsigned id;
int ret;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_snapshots, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_snapshot(&trans, &iter, k));
+
+ if (ret)
+ bch_err(c, "error %i checking snapshots", ret);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
- ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
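+/* Per-key fsck pass for the subvolumes btree: check that the snapshot it points to exists, and finish deleting subvolumes flagged unlinked: */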
+static int check_subvol(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_subvolume subvol;
+ struct bch_snapshot snapshot;
+ unsigned snapid;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ subvol = bkey_s_c_to_subvolume(k);
+ snapid = le32_to_cpu(subvol.v->snapshot);
+ ret = snapshot_lookup(trans, snapid, &snapshot);
+
+ if (ret == -ENOENT)
+ bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u",
+ k.k->p.offset, snapid);
+ if (ret)
+ return ret;
+
+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(trans->c, "error deleting subvolume %llu: %s",
+ iter->pos.offset, bch2_err_str(ret));
if (ret)
- break;
+ return ret;
}
- bch2_trans_iter_exit(&trans, &iter);
- if (ret) {
- bch_err(c, "error %i checking snapshots", ret);
- goto err;
- }
+ return 0;
+}
+
+int bch2_fs_check_subvols(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_subvol(&trans, &iter, k));
- for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
- POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_subvolume)
- continue;
-again_2:
- id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
- ret = snapshot_lookup(&trans, id, &s);
-
- if (ret == -EINTR) {
- k = bch2_btree_iter_peek(&iter);
- goto again_2;
- } else if (ret == -ENOENT)
- bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
- k.k->p.offset, id);
- else if (ret)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
-err:
bch2_trans_exit(&trans);
+
return ret;
}
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- bool have_deleted = false;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
- if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
- break;
-
- if (k.k->type != KEY_TYPE_snapshot) {
- bch_err(c, "found wrong key type %u in snapshot node table",
- k.k->type);
- continue;
- }
-
- if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
- have_deleted = true;
-
- ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?:
+ bch2_snapshot_set_equiv(&trans, k));
- if (ret)
- goto err;
-
- ret = bch2_snapshots_set_equiv(&trans);
- if (ret)
- goto err;
-err:
bch2_trans_exit(&trans);
- if (!ret && have_deleted) {
- bch_info(c, "restarting deletion of dead snapshots");
- if (c->opts.fsck) {
- bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
- } else {
- bch2_delete_dead_snapshots(c);
- }
- }
-
+ if (ret)
+ bch_err(c, "error starting snapshots: %s", bch2_err_str(ret));
return ret;
}
goto err;
bkey_reassemble(&s->k_i, k);
-
SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
+ s->v.subvol = 0;
+
ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
if (ret)
goto err;
goto err;
if (!k.k || !k.k->p.offset) {
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_snapshot_create;
goto err;
}
n->v.children[0] = cpu_to_le32(new_snapids[0]);
n->v.children[1] = cpu_to_le32(new_snapids[1]);
+ n->v.subvol = 0;
SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
if (ret)
return ret;
}
-static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
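+/* Per-key helper for snapshot deletion: drop keys in snapshots being deleted, or whose equivalence class has already been seen at this position: */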
+static int snapshot_delete_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ snapshot_id_list *deleted,
+ snapshot_id_list *equiv_seen,
+ struct bpos *last_pos)
{
- BUG_ON(snapshot_list_has_id(s, id));
-
- if (s->nr == s->size) {
- size_t new_size = max(8U, s->size * 2);
- void *n = krealloc(s->d,
- new_size * sizeof(s->d[0]),
- GFP_KERNEL);
- if (!n) {
- pr_err("error allocating snapshot ID list");
- return -ENOMEM;
- }
+ struct bch_fs *c = trans->c;
+ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
- s->d = n;
- s->size = new_size;
- };
+ if (bkey_cmp(k.k->p, *last_pos))
+ equiv_seen->nr = 0;
+ *last_pos = k.k->p;
- s->d[s->nr++] = id;
- return 0;
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+ snapshot_list_has_id(equiv_seen, equiv)) {
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ } else {
+ return snapshot_list_add(c, equiv_seen, equiv);
+ }
}
-static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
- struct snapshot_id_list *deleted,
- enum btree_id btree_id)
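+/* A snapshot node with no live children and no subvolume pointing at it is redundant and can be marked deleted: */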
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct snapshot_id_list equiv_seen = { 0 };
- struct bpos last_pos = POS_MIN;
- int ret = 0;
+ struct bkey_s_c_snapshot snap;
+ u32 children[2];
+ int ret;
- /*
- * XXX: We should also delete whiteouts that no longer overwrite
- * anything
- */
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
- bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- while ((bch2_trans_begin(trans),
- (k = bch2_btree_iter_peek(&iter)).k) &&
- !(ret = bkey_err(k))) {
- u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
-
- if (bkey_cmp(k.k->p, last_pos))
- equiv_seen.nr = 0;
- last_pos = k.k->p;
-
- if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
- snapshot_list_has_id(&equiv_seen, equiv)) {
- if (btree_id == BTREE_ID_inodes &&
- bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
- continue;
-
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_delete_at(trans, &iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
- if (ret)
- break;
- } else {
- ret = snapshot_id_add(&equiv_seen, equiv);
- if (ret)
- break;
- }
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ BCH_SNAPSHOT_SUBVOL(snap.v))
+ return 0;
- bch2_btree_iter_advance(&iter);
- }
- bch2_trans_iter_exit(trans, &iter);
+ children[0] = le32_to_cpu(snap.v->children[0]);
+ children[1] = le32_to_cpu(snap.v->children[1]);
- kfree(equiv_seen.d);
+ ret = snapshot_live(trans, children[0]) ?:
+ snapshot_live(trans, children[1]);
+ if (ret < 0)
+ return ret;
- return ret;
+ if (!ret)
+ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+ return 0;
}
-static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+int bch2_delete_dead_snapshots(struct bch_fs *c)
{
- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_snapshot snap;
- struct snapshot_id_list deleted = { 0 };
- u32 i, id, children[2];
+ snapshot_id_list deleted = { 0 };
+ u32 i, id;
int ret = 0;
+ if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
+ return 0;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+ ret = bch2_fs_read_write_early(c);
+ if (ret) {
+ bch_err(c, "error deleting dead snapshots: error going rw: %s", bch2_err_str(ret));
+ return ret;
+ }
+ }
+
bch2_trans_init(&trans, c, 0, 0);
/*
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
- for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
-
- snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v) ||
- BCH_SNAPSHOT_SUBVOL(snap.v))
- continue;
-
- children[0] = le32_to_cpu(snap.v->children[0]);
- children[1] = le32_to_cpu(snap.v->children[1]);
-
- ret = snapshot_live(&trans, children[0]) ?:
- snapshot_live(&trans, children[1]);
- if (ret < 0)
- break;
- if (ret)
- continue;
-
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
- if (ret) {
- bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
- break;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
-
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ NULL, NULL, 0,
+ bch2_delete_redundant_snapshot(&trans, &iter, k));
if (ret) {
- bch_err(c, "error walking snapshots: %i", ret);
+ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
goto err;
}
- ret = bch2_snapshots_set_equiv(&trans);
- if (ret)
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ bch2_snapshot_set_equiv(&trans, k));
+ if (ret) {
+ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
goto err;
+ }
for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
POS_MIN, 0, k, ret) {
snap = bkey_s_c_to_snapshot(k);
if (BCH_SNAPSHOT_DELETED(snap.v)) {
- ret = snapshot_id_add(&deleted, k.k->p.offset);
+ ret = snapshot_list_add(c, &deleted, k.k->p.offset);
if (ret)
break;
}
bch2_trans_iter_exit(&trans, &iter);
if (ret) {
- bch_err(c, "error walking snapshots: %i", ret);
+ bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
goto err;
}
for (id = 0; id < BTREE_ID_NR; id++) {
+ struct bpos last_pos = POS_MIN;
+ snapshot_id_list equiv_seen = { 0 };
+
if (!btree_type_has_snapshots(id))
continue;
- ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
+ ret = for_each_btree_key_commit(&trans, iter,
+ id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
+
+ darray_exit(&equiv_seen);
+
if (ret) {
- bch_err(c, "error deleting snapshot keys: %i", ret);
+ bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
goto err;
}
}
for (i = 0; i < deleted.nr; i++) {
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(&trans, deleted.d[i]));
+ ret = commit_do(&trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(&trans, deleted.data[i]));
if (ret) {
- bch_err(c, "error deleting snapshot %u: %i",
- deleted.d[i], ret);
+ bch_err(c, "error deleting snapshot %u: %s",
+ deleted.data[i], bch2_err_str(ret));
goto err;
}
}
+
+ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
err:
- kfree(deleted.d);
+ darray_exit(&deleted);
bch2_trans_exit(&trans);
+ return ret;
+}
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+
+ bch2_delete_dead_snapshots(c);
percpu_ref_put(&c->writes);
}
-static void bch2_delete_dead_snapshots(struct bch_fs *c)
+void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
- if (unlikely(!percpu_ref_tryget(&c->writes)))
+ if (!percpu_ref_tryget_live(&c->writes))
return;
if (!queue_work(system_long_wq, &c->snapshot_delete_work))
static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
struct btree_trans_commit_hook *h)
{
- bch2_delete_dead_snapshots(trans->c);
+ struct bch_fs *c = trans->c;
+
+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+
+ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ return 0;
+
+ bch2_delete_dead_snapshots_async(c);
return 0;
}
/* Subvolumes: */
-const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
- if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
- return "invalid pos";
-
- if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
- return "invalid pos";
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 ||
+ bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) {
+ prt_printf(err, "invalid pos");
+ return -EINVAL;
+ }
- if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
- return "bad val size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_subvolume));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- pr_buf(out, "root %llu snapshot id %u",
+ prt_printf(out, "root %llu snapshot id %u",
le64_to_cpu(s.v->inode),
le32_to_cpu(s.v->snapshot));
}
struct bkey_s_c k;
struct bkey_s_c_subvolume subvol;
struct btree_trans_commit_hook *h;
- struct bkey_i *delete;
u32 snapid;
int ret = 0;
subvol = bkey_s_c_to_subvolume(k);
snapid = le32_to_cpu(subvol.v->snapshot);
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- ret = PTR_ERR_OR_ZERO(delete);
+ ret = bch2_btree_delete_at(trans, &iter, 0);
if (ret)
goto err;
- bkey_init(&delete->k);
- delete->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, delete, 0);
+ ret = bch2_snapshot_node_set_deleted(trans, snapid);
if (ret)
goto err;
- ret = bch2_snapshot_node_set_deleted(trans, snapid);
-
h = bch2_trans_kmalloc(trans, sizeof(*h));
ret = PTR_ERR_OR_ZERO(h);
if (ret)
{
struct bch_fs *c = container_of(work, struct bch_fs,
snapshot_wait_for_pagecache_and_delete_work);
- struct snapshot_id_list s;
+ snapshot_id_list s;
u32 *id;
int ret = 0;
while (!ret) {
mutex_lock(&c->snapshots_unlinked_lock);
s = c->snapshots_unlinked;
- memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+ darray_init(&c->snapshots_unlinked);
mutex_unlock(&c->snapshots_unlinked_lock);
if (!s.nr)
bch2_evict_subvolume_inodes(c, &s);
- for (id = s.d; id < s.d + s.nr; id++) {
+ for (id = s.data; id < s.data + s.nr; id++) {
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_subvolume_delete(&trans, *id));
if (ret) {
- bch_err(c, "error %i deleting subvolume %u", ret, *id);
+ bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
break;
}
}
- kfree(s.d);
+ darray_exit(&s);
}
percpu_ref_put(&c->writes);
mutex_lock(&c->snapshots_unlinked_lock);
if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
- ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol);
+ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
mutex_unlock(&c->snapshots_unlinked_lock);
if (ret)
return ret;
- if (unlikely(!percpu_ref_tryget(&c->writes)))
+ if (unlikely(!percpu_ref_tryget_live(&c->writes)))
return -EROFS;
if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
}
if (!ret)
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_subvolume_create;
goto err;
found_slot:
snapshot_subvols[0] = dst_iter.pos.offset;
#ifndef _BCACHEFS_SUBVOLUME_H
#define _BCACHEFS_SUBVOLUME_H
+#include "darray.h"
#include "subvolume_types.h"
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
+ int rw, struct printbuf *);
#define bch2_bkey_ops_snapshot (struct bkey_ops) { \
.key_invalid = bch2_snapshot_invalid, \
return snapshot_t(c, id)->parent;
}
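+/* Helpers for the snapshot "equivalence class" id (snapshot_t->equiv): */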
+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+ return snapshot_t(c, id)->equiv;
+}
+
+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
+{
+ return id == snapshot_t(c, id)->equiv;
+}
+
static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id)
{
struct snapshot_t *s = snapshot_t(c, id);
return id == ancestor;
}
-struct snapshots_seen {
- struct bpos pos;
- size_t nr;
- size_t size;
- u32 *d;
-};
-
-static inline void snapshots_seen_exit(struct snapshots_seen *s)
+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
- kfree(s->d);
- s->d = NULL;
-}
+ u32 *i;
-static inline void snapshots_seen_init(struct snapshots_seen *s)
-{
- memset(s, 0, sizeof(*s));
+ darray_for_each(*s, i)
+ if (*i == id)
+ return true;
+ return false;
}
-static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- if (s->nr == s->size) {
- size_t new_size = max(s->size, (size_t) 128) * 2;
- u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
-
- if (!d) {
- bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
- new_size);
- return -ENOMEM;
- }
-
- s->size = new_size;
- s->d = d;
- }
+ u32 *i;
- s->d[s->nr++] = id;
- return 0;
+ darray_for_each(*s, i)
+ if (bch2_snapshot_is_ancestor(c, id, *i))
+ return true;
+ return false;
}
-static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- unsigned i;
+ int ret;
- for (i = 0; i < s->nr; i++)
- if (id == s->d[i])
- return true;
- return false;
+ BUG_ON(snapshot_list_has_id(s, id));
+ ret = darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
}
-int bch2_fs_snapshots_check(struct bch_fs *);
+int bch2_fs_check_snapshots(struct bch_fs *);
+int bch2_fs_check_subvols(struct bch_fs *);
+
void bch2_fs_snapshots_exit(struct bch_fs *);
int bch2_fs_snapshots_start(struct bch_fs *);
-const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
+ int rw, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_subvolume (struct bkey_ops) { \
int bch2_snapshot_node_create(struct btree_trans *, u32,
u32 *, u32 *, unsigned);
+int bch2_delete_dead_snapshots(struct bch_fs *);
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
+
int bch2_subvolume_delete(struct btree_trans *, u32);
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32,
#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
#define _BCACHEFS_SUBVOLUME_TYPES_H
-struct snapshot_id_list {
- u32 nr;
- u32 size;
- u32 *d;
-};
+#include "darray.h"
+
+typedef DARRAY(u32) snapshot_id_list;
#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
#include "io.h"
#include "journal.h"
#include "journal_io.h"
+#include "journal_sb.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "quota.h"
#include "super-io.h"
#include "super.h"
#include "vstructs.h"
+#include "counters.h"
#include <linux/backing-dev.h>
+#include <linux/pretty-printers.h>
#include <linux/sort.h>
+#include <trace/events/bcachefs.h>
+
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
BCH_SB_FIELDS()
void bch2_free_super(struct bch_sb_handle *sb)
{
- if (sb->bio)
- bio_put(sb->bio);
+ kfree(sb->bio);
if (!IS_ERR_OR_NULL(sb->bdev))
blkdev_put(sb->bdev, sb->mode);
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
if (new_bytes > max_bytes) {
- char buf[BDEVNAME_SIZE];
-
- pr_err("%s: superblock too big: want %zu but have %llu",
- bdevname(sb->bdev, buf), new_bytes, max_bytes);
- return -ENOSPC;
+ pr_err("%pg: superblock too big: want %zu but have %llu",
+ sb->bdev, new_bytes, max_bytes);
+ return -BCH_ERR_ENOSPC_sb;
}
}
return -ENOMEM;
if (sb->have_bio) {
- bio = bio_kmalloc(GFP_KERNEL,
- DIV_ROUND_UP(new_buffer_size, PAGE_SIZE));
+ unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE);
+
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!bio)
return -ENOMEM;
- if (sb->bio)
- bio_put(sb->bio);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
+
+ kfree(sb->bio);
sb->bio = bio;
}
unsigned i;
if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) {
- pr_buf(out, "Not a bcachefs superblock layout");
+ prt_printf(out, "Not a bcachefs superblock layout");
return -EINVAL;
}
if (layout->layout_type != 0) {
- pr_buf(out, "Invalid superblock layout type %u",
+ prt_printf(out, "Invalid superblock layout type %u",
layout->layout_type);
return -EINVAL;
}
if (!layout->nr_superblocks) {
- pr_buf(out, "Invalid superblock layout: no superblocks");
+ prt_printf(out, "Invalid superblock layout: no superblocks");
return -EINVAL;
}
if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
- pr_buf(out, "Invalid superblock layout: too many superblocks");
+ prt_printf(out, "Invalid superblock layout: too many superblocks");
return -EINVAL;
}
offset = le64_to_cpu(layout->sb_offset[i]);
if (offset < prev_offset + max_sectors) {
- pr_buf(out, "Invalid superblock layout: superblocks overlap\n"
+ prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
" (sb %u ends at %llu next starts at %llu",
i - 1, prev_offset + max_sectors, offset);
return -EINVAL;
return 0;
}
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out)
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
+ int rw)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
struct bch_sb_field_members *mi;
+ enum bch_opt_id opt_id;
u32 version, version_min;
u16 block_size;
int ret;
version = le16_to_cpu(sb->version);
- version_min = version >= bcachefs_metadata_version_new_versioning
+ version_min = version >= bcachefs_metadata_version_bkey_renumber
? le16_to_cpu(sb->version_min)
: version;
if (version >= bcachefs_metadata_version_max) {
- pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+ prt_printf(out, "Unsupported superblock version %u (min %u, max %u)",
version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
return -EINVAL;
}
if (version_min < bcachefs_metadata_version_min) {
- pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+ prt_printf(out, "Unsupported superblock version %u (min %u, max %u)",
version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
return -EINVAL;
}
if (version_min > version) {
- pr_buf(out, "Bad minimum version %u, greater than version field %u",
+ prt_printf(out, "Bad minimum version %u, greater than version field %u",
version_min, version);
return -EINVAL;
}
if (sb->features[1] ||
(le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
- pr_buf(out, "Filesystem has incompatible features");
+ prt_printf(out, "Filesystem has incompatible features");
return -EINVAL;
}
block_size = le16_to_cpu(sb->block_size);
if (block_size > PAGE_SECTORS) {
- pr_buf(out, "Block size too big (got %u, max %u)",
+ prt_printf(out, "Block size too big (got %u, max %u)",
block_size, PAGE_SECTORS);
return -EINVAL;
}
if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) {
- pr_buf(out, "Bad user UUID (got zeroes)");
+ prt_printf(out, "Bad user UUID (got zeroes)");
return -EINVAL;
}
if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) {
- pr_buf(out, "Bad intenal UUID (got zeroes)");
+		prt_printf(out, "Bad internal UUID (got zeroes)");
return -EINVAL;
}
if (!sb->nr_devices ||
sb->nr_devices > BCH_SB_MEMBERS_MAX) {
- pr_buf(out, "Bad number of member devices %u (max %u)",
+ prt_printf(out, "Bad number of member devices %u (max %u)",
sb->nr_devices, BCH_SB_MEMBERS_MAX);
return -EINVAL;
}
if (sb->dev_idx >= sb->nr_devices) {
- pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)",
+ prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
sb->dev_idx, sb->nr_devices);
return -EINVAL;
}
if (!sb->time_precision ||
le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
- pr_buf(out, "Invalid time precision: %u (min 1, max %lu)",
+ prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
return -EINVAL;
}
+ if (rw == READ) {
+ /*
+		 * We've been seeing a bug where these are getting inexplicably
+		 * zeroed, so we're now validating them, but we have to be
+		 * careful not to prevent people's filesystems from mounting:
+ */
+ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
+ }
+
+ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
+ const struct bch_option *opt = bch2_opt_table + opt_id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, opt_id);
+
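+			/*
+			 * Print the error prefix up front; if the option
+			 * validates, the printbuf is reset below and nothing is
+			 * reported:
+			 */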
+ prt_printf(out, "Invalid option ");
+ ret = bch2_opt_validate(opt, v, out);
+ if (ret)
+ return ret;
+
+ printbuf_reset(out);
+ }
+ }
+
/* validate layout */
ret = validate_sb_layout(&sb->layout, out);
if (ret)
vstruct_for_each(sb, f) {
if (!f->u64s) {
- pr_buf(out, "Invalid superblock: optional with size 0 (type %u)",
+ prt_printf(out, "Invalid superblock: optional with size 0 (type %u)",
le32_to_cpu(f->type));
return -EINVAL;
}
if (vstruct_next(f) > vstruct_last(sb)) {
- pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+ prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
le32_to_cpu(f->type));
return -EINVAL;
}
/* members must be validated first: */
mi = bch2_sb_get_members(sb);
if (!mi) {
- pr_buf(out, "Invalid superblock: member info area missing");
+ prt_printf(out, "Invalid superblock: member info area missing");
return -EINVAL;
}
memcpy(dst->compat, src->compat, sizeof(dst->compat));
for (i = 0; i < BCH_SB_FIELD_NR; i++) {
- if (i == BCH_SB_FIELD_journal)
+ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
continue;
src_f = bch2_sb_field_get(src, i);
__copy_super(&c->disk_sb, src);
- if (BCH_SB_INITIALIZED(c->disk_sb.sb))
- set_bit(BCH_FS_INITIALIZED, &c->flags);
-
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret)
return ret;
size_t bytes;
int ret;
reread:
- bio_reset(sb->bio);
- bio_set_dev(sb->bio, sb->bdev);
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
sb->bio->bi_iter.bi_sector = offset;
- bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
ret = submit_bio_wait(sb->bio);
if (ret) {
- pr_buf(err, "IO error: %i", ret);
+ prt_printf(err, "IO error: %i", ret);
return ret;
}
if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) {
- pr_buf(err, "Not a bcachefs superblock");
+ prt_printf(err, "Not a bcachefs superblock");
return -EINVAL;
}
version = le16_to_cpu(sb->sb->version);
- version_min = version >= bcachefs_metadata_version_new_versioning
+ version_min = version >= bcachefs_metadata_version_bkey_renumber
? le16_to_cpu(sb->sb->version_min)
: version;
if (version >= bcachefs_metadata_version_max) {
- pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+ prt_printf(err, "Unsupported superblock version %u (min %u, max %u)",
version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
return -EINVAL;
}
if (version_min < bcachefs_metadata_version_min) {
- pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+ prt_printf(err, "Unsupported superblock version %u (min %u, max %u)",
version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
return -EINVAL;
}
bytes = vstruct_bytes(sb->sb);
if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
- pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
+ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
bytes, 512UL << sb->sb->layout.sb_max_size_bits);
return -EINVAL;
}
}
if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
- pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+ prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
return -EINVAL;
}
null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum)) {
- pr_buf(err, "bad checksum");
+ prt_printf(err, "bad checksum");
return -EINVAL;
}
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
- char *_err;
- struct printbuf err;
+ struct printbuf err = PRINTBUF;
__le64 *i;
int ret;
- _err = kmalloc(4096, GFP_KERNEL);
- if (!_err)
- return -ENOMEM;
- err = _PBUF(_err, 4096);
-
pr_verbose_init(*opts, "");
memset(sb, 0, sizeof(*sb));
ret = bch2_sb_realloc(sb, 0);
if (ret) {
- pr_buf(&err, "error allocating memory for superblock");
+ prt_printf(&err, "error allocating memory for superblock");
goto err;
}
if (bch2_fs_init_fault("read_super")) {
- pr_buf(&err, "dynamic fault");
+ prt_printf(&err, "dynamic fault");
ret = -EFAULT;
goto err;
}
goto err;
printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
- path, _err);
- err = _PBUF(_err, 4096);
+ path, err.buf);
+ printbuf_reset(&err);
/*
* Error reading primary superblock - read location of backup
* superblocks:
*/
- bio_reset(sb->bio);
- bio_set_dev(sb->bio, sb->bdev);
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
- bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
/*
* use sb buffer to read layout, since sb buffer is page aligned but
* layout won't be:
ret = submit_bio_wait(sb->bio);
if (ret) {
- pr_buf(&err, "IO error: %i", ret);
+ prt_printf(&err, "IO error: %i", ret);
goto err;
}
got_super:
if (le16_to_cpu(sb->sb->block_size) << 9 <
bdev_logical_block_size(sb->bdev)) {
- pr_buf(&err, "block size (%u) smaller than device block size (%u)",
+ prt_printf(&err, "block size (%u) smaller than device block size (%u)",
le16_to_cpu(sb->sb->block_size) << 9,
bdev_logical_block_size(sb->bdev));
ret = -EINVAL;
ret = 0;
sb->have_layout = true;
- ret = bch2_sb_validate(sb, &err);
+ ret = bch2_sb_validate(sb, &err, READ);
if (ret) {
printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
- path, _err);
+ path, err.buf);
goto err_no_print;
}
out:
pr_verbose_init(*opts, "ret %i", ret);
- kfree(_err);
+ printbuf_exit(&err);
return ret;
err:
printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
- path, _err);
+ path, err.buf);
err_no_print:
bch2_free_super(sb);
goto out;
struct bch_sb *sb = ca->disk_sb.sb;
struct bio *bio = ca->disk_sb.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
null_nonce(), sb);
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
bch2_bio_map(bio, sb,
roundup((size_t) vstruct_bytes(sb),
bdev_logical_block_size(ca->disk_sb.bdev)));
{
struct closure *cl = &c->sb_write;
struct bch_dev *ca;
+ struct printbuf err = PRINTBUF;
unsigned i, sb = 0, nr_wrote;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
int ret = 0;
+ trace_and_count(c, write_super, c, _RET_IP_);
+
if (c->opts.very_degraded)
degraded_flags |= BCH_FORCE_IF_LOST;
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
+ bch2_sb_counters_from_cpu(c);
+
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
for_each_online_member(ca, c, i) {
- struct printbuf buf = { NULL, NULL };
+ printbuf_reset(&err);
- ret = bch2_sb_validate(&ca->disk_sb, &buf);
+ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
if (ret) {
- char *_buf = kmalloc(4096, GFP_NOFS);
- if (_buf) {
- buf = _PBUF(_buf, 4096);
- bch2_sb_validate(&ca->disk_sb, &buf);
- }
-
- bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf);
- kfree(_buf);
+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
percpu_ref_put(&ca->io_ref);
goto out;
}
if (c->opts.nochanges)
goto out;
+ /*
+ * Defer writing the superblock until filesystem initialization is
+ * complete - don't write out a partly initialized superblock:
+ */
+ if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
+ goto out;
+
for_each_online_member(ca, c, i) {
__set_bit(ca->dev_idx, sb_written.d);
ca->sb_write_error = 0;
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
+ printbuf_exit(&err);
return ret;
}
mutex_unlock(&c->sb_lock);
}
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
-{
- u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
-
- return l < r ? -1 : l > r ? 1 : 0;
-}
-
-static int bch2_sb_validate_journal(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
- int ret = -EINVAL;
- unsigned nr;
- unsigned i;
- u64 *b;
-
- nr = bch2_nr_journal_buckets(journal);
- if (!nr)
- return 0;
-
- b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
- if (!b)
- return -ENOMEM;
-
- for (i = 0; i < nr; i++)
- b[i] = le64_to_cpu(journal->buckets[i]);
-
- sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
- if (!b[0]) {
- pr_buf(err, "journal bucket at sector 0");
- goto err;
- }
-
- if (b[0] < le16_to_cpu(m->first_bucket)) {
- pr_buf(err, "journal bucket %llu before first bucket %u",
- b[0], le16_to_cpu(m->first_bucket));
- goto err;
- }
-
- if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
- pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1], le64_to_cpu(m->nbuckets));
- goto err;
- }
-
- for (i = 0; i + 1 < nr; i++)
- if (b[i] == b[i + 1]) {
- pr_buf(err, "duplicate journal buckets %llu", b[i]);
- goto err;
- }
-
- ret = 0;
-err:
- kfree(b);
- return ret;
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
- .validate = bch2_sb_validate_journal,
-};
-
/* BCH_SB_FIELD_members: */
-static int bch2_sb_validate_members(struct bch_sb *sb,
+static int bch2_sb_members_validate(struct bch_sb *sb,
struct bch_sb_field *f,
struct printbuf *err)
{
if ((void *) (mi->members + sb->nr_devices) >
vstruct_end(&mi->field)) {
- pr_buf(err, "too many devices for section size");
+ prt_printf(err, "too many devices for section size");
return -EINVAL;
}
continue;
if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
- pr_buf(err, "device %u: too many buckets (got %llu, max %lu)",
+ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
i, le64_to_cpu(m->nbuckets), LONG_MAX);
return -EINVAL;
}
if (le64_to_cpu(m->nbuckets) -
le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
- pr_buf(err, "device %u: not enough buckets (got %llu, max %u)",
+ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
return -EINVAL;
}
if (le16_to_cpu(m->bucket_size) <
le16_to_cpu(sb->block_size)) {
- pr_buf(err, "device %u: bucket size %u smaller than block size %u",
+ prt_printf(err, "device %u: bucket size %u smaller than block size %u",
i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
return -EINVAL;
}
if (le16_to_cpu(m->bucket_size) <
BCH_SB_BTREE_NODE_SIZE(sb)) {
- pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu",
+ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
return -EINVAL;
}
return 0;
}
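+/* Print each device in the members section, one indented block per device: */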
+static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_members *mi = field_to_type(f, members);
+ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
+ unsigned i;
+
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member *m = mi->members + i;
+ unsigned data_have = bch2_sb_dev_has_data(sb, i);
+ u64 bucket_size = le16_to_cpu(m->bucket_size);
+ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
+
+ if (!bch2_member_exists(m))
+ continue;
+
+ prt_printf(out, "Device:");
+ prt_tab(out);
+ prt_printf(out, "%u", i);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "UUID:");
+ prt_tab(out);
+ pr_uuid(out, m->uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Size:");
+ prt_tab(out);
+ prt_units_u64(out, device_size << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Bucket size:");
+ prt_tab(out);
+ prt_units_u64(out, bucket_size << 9);
+ prt_newline(out);
+
+ prt_printf(out, "First bucket:");
+ prt_tab(out);
+ prt_printf(out, "%u", le16_to_cpu(m->first_bucket));
+ prt_newline(out);
+
+ prt_printf(out, "Buckets:");
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(m->nbuckets));
+ prt_newline(out);
+
+ prt_printf(out, "Last mount:");
+ prt_tab(out);
+ if (m->last_mount)
+ pr_time(out, le64_to_cpu(m->last_mount));
+ else
+ prt_printf(out, "(never)");
+ prt_newline(out);
+
+ prt_printf(out, "State:");
+ prt_tab(out);
+ prt_printf(out, "%s",
+ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
+ ? bch2_member_states[BCH_MEMBER_STATE(m)]
+ : "unknown");
+ prt_newline(out);
+
+ prt_printf(out, "Label:");
+ prt_tab(out);
+ if (BCH_MEMBER_GROUP(m)) {
+ unsigned idx = BCH_MEMBER_GROUP(m) - 1;
+
+ if (idx < disk_groups_nr(gi))
+ prt_printf(out, "%s (%u)",
+ gi->entries[idx].label, idx);
+ else
+ prt_printf(out, "(bad disk labels section)");
+ } else {
+ prt_printf(out, "(none)");
+ }
+ prt_newline(out);
+
+ prt_printf(out, "Data allowed:");
+ prt_tab(out);
+ if (BCH_MEMBER_DATA_ALLOWED(m))
+ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m));
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Has data:");
+ prt_tab(out);
+ if (data_have)
+ prt_bitflags(out, bch2_data_types, data_have);
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Discard:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m));
+ prt_newline(out);
+
+ prt_printf(out, "Freespace initialized:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+ }
+}
+
static const struct bch_sb_field_ops bch_sb_field_ops_members = {
- .validate = bch2_sb_validate_members,
+ .validate = bch2_sb_members_validate,
+ .to_text = bch2_sb_members_to_text,
};
/* BCH_SB_FIELD_crypt: */
-static int bch2_sb_validate_crypt(struct bch_sb *sb,
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
- pr_buf(err, "wrong size (got %llu should be %zu)",
+ prt_printf(err, "wrong size (got %zu should be %zu)",
vstruct_bytes(&crypt->field), sizeof(*crypt));
return -EINVAL;
}
if (BCH_CRYPT_KDF_TYPE(crypt)) {
- pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
return -EINVAL;
}
return 0;
}
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+	prt_printf(out, "KDF: %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
+ prt_newline(out);
+}
+
static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
- .validate = bch2_sb_validate_crypt,
+ .validate = bch2_sb_crypt_validate,
+ .to_text = bch2_sb_crypt_to_text,
};
/* BCH_SB_FIELD_clean: */
-int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
{
struct jset_entry *entry;
int ret;
for (entry = clean->start;
entry < (struct jset_entry *) vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
- ret = bch2_journal_entry_validate(c, "superblock", entry,
+ ret = bch2_journal_entry_validate(c, NULL, entry,
le16_to_cpu(c->disk_sb.sb->version),
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
write);
u->entry.type = BCH_JSET_ENTRY_dev_usage;
u->dev = cpu_to_le32(dev);
u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
- u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
for (i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
}
sb_clean->flags = 0;
- sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
* this should be in the write path, and we should be validating every
* superblock section:
*/
- ret = bch2_sb_clean_validate(c, sb_clean, WRITE);
+ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
if (ret) {
bch_err(c, "error writing marking filesystem clean: validate error");
goto out;
mutex_unlock(&c->sb_lock);
}
-static int bch2_sb_validate_clean(struct bch_sb *sb,
+static int bch2_sb_clean_validate(struct bch_sb *sb,
struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_clean *clean = field_to_type(f, clean);
if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
- pr_buf(err, "wrong size (got %llu should be %zu)",
+ prt_printf(err, "wrong size (got %zu should be %zu)",
vstruct_bytes(&clean->field), sizeof(*clean));
return -EINVAL;
}
return 0;
}
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+ struct jset_entry *entry;
+
+ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
+ prt_newline(out);
+ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
+ prt_newline(out);
+
+ for (entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+ !entry->u64s)
+ continue;
+
+ bch2_journal_entry_to_text(out, NULL, entry);
+ prt_newline(out);
+ }
+}
+
static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
- .validate = bch2_sb_validate_clean,
+ .validate = bch2_sb_clean_validate,
+ .to_text = bch2_sb_clean_to_text,
};
static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
};
static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *orig_err)
+ struct printbuf *err)
{
unsigned type = le32_to_cpu(f->type);
- struct printbuf err = *orig_err;
+ struct printbuf field_err = PRINTBUF;
int ret;
if (type >= BCH_SB_FIELD_NR)
return 0;
- pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]);
-
- ret = bch2_sb_field_ops[type]->validate(sb, f, &err);
+ ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err);
if (ret) {
- pr_buf(&err, "\n");
- bch2_sb_field_to_text(&err, sb, f);
- *orig_err = err;
+ prt_printf(err, "Invalid superblock section %s: %s",
+ bch2_sb_fields[type],
+ field_err.buf);
+ prt_newline(err);
+ bch2_sb_field_to_text(err, sb, f);
}
+ printbuf_exit(&field_err);
return ret;
}
const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR
? bch2_sb_field_ops[type] : NULL;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
if (ops)
- pr_buf(out, "%s", bch2_sb_fields[type]);
+ prt_printf(out, "%s", bch2_sb_fields[type]);
else
- pr_buf(out, "(unknown field %u)", type);
+ prt_printf(out, "(unknown field %u)", type);
- pr_buf(out, " (size %llu):", vstruct_bytes(f));
+ prt_printf(out, " (size %zu):", vstruct_bytes(f));
+ prt_newline(out);
- if (ops && ops->to_text)
+ if (ops && ops->to_text) {
+ printbuf_indent_add(out, 2);
bch2_sb_field_ops[type]->to_text(out, sb, f);
+ printbuf_indent_sub(out, 2);
+ }
+}
+
+void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
+{
+ unsigned i;
+
+ prt_printf(out, "Type: %u", l->layout_type);
+ prt_newline(out);
+
+ prt_str(out, "Superblock max size: ");
+ prt_units_u64(out, 512 << l->sb_max_size_bits);
+ prt_newline(out);
+
+ prt_printf(out, "Nr superblocks: %u", l->nr_superblocks);
+ prt_newline(out);
+
+ prt_str(out, "Offsets: ");
+ for (i = 0; i < l->nr_superblocks; i++) {
+ if (i)
+ prt_str(out, ", ");
+ prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
+ }
+ prt_newline(out);
+}
+
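+/*
+ * Dump a whole superblock in human readable form: the main fields, the layout
+ * if @print_layout is set, then every section selected by @fields:
+ */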
+void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
+ bool print_layout, unsigned fields)
+{
+ struct bch_sb_field_members *mi;
+ struct bch_sb_field *f;
+ u64 fields_have = 0;
+ unsigned nr_devices = 0;
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 44);
+
+ mi = bch2_sb_get_members(sb);
+ if (mi) {
+ struct bch_member *m;
+
+ for (m = mi->members;
+ m < mi->members + sb->nr_devices;
+ m++)
+ nr_devices += bch2_member_exists(m);
+ }
+
+ prt_printf(out, "External UUID:");
+ prt_tab(out);
+ pr_uuid(out, sb->user_uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Internal UUID:");
+ prt_tab(out);
+ pr_uuid(out, sb->uuid.b);
+ prt_newline(out);
+
+ prt_str(out, "Device index:");
+ prt_tab(out);
+ prt_printf(out, "%u", sb->dev_idx);
+ prt_newline(out);
+
+ prt_str(out, "Label:");
+ prt_tab(out);
+ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+ prt_newline(out);
+
+ prt_str(out, "Version:");
+ prt_tab(out);
+ prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]);
+ prt_newline(out);
+
+ prt_printf(out, "Oldest version on disk:");
+ prt_tab(out);
+ prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]);
+ prt_newline(out);
+
+ prt_printf(out, "Created:");
+ prt_tab(out);
+ if (sb->time_base_lo)
+ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+ else
+ prt_printf(out, "(not set)");
+ prt_newline(out);
+
+ prt_printf(out, "Sequence number:");
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(sb->seq));
+ prt_newline(out);
+
+ prt_printf(out, "Superblock size:");
+ prt_tab(out);
+ prt_printf(out, "%zu", vstruct_bytes(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Clean:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_SB_CLEAN(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Devices:");
+ prt_tab(out);
+ prt_printf(out, "%u", nr_devices);
+ prt_newline(out);
+
+ prt_printf(out, "Sections:");
+ vstruct_for_each(sb, f)
+ fields_have |= 1 << le32_to_cpu(f->type);
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_fields, fields_have);
+ prt_newline(out);
+
+ prt_printf(out, "Features:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
+ prt_newline(out);
+
+ prt_printf(out, "Compat features:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
+ prt_newline(out);
+
+ prt_newline(out);
+ prt_printf(out, "Options:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ {
+ enum bch_opt_id id;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, id);
+
+ prt_printf(out, "%s:", opt->attr.name);
+ prt_tab(out);
+ bch2_opt_to_text(out, NULL, sb, opt, v,
+ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
+ prt_newline(out);
+ }
+ }
+ }
+
+ printbuf_indent_sub(out, 2);
+
+ if (print_layout) {
+ prt_newline(out);
+ prt_printf(out, "layout:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ bch2_sb_layout_to_text(out, &sb->layout);
+ printbuf_indent_sub(out, 2);
+ }
+
+ vstruct_for_each(sb, f)
+ if (fields & (1 << le32_to_cpu(f->type))) {
+ prt_newline(out);
+ bch2_sb_field_to_text(out, sb, f);
+ }
}
__bch2_check_set_feature(c, feat);
}
-/* BCH_SB_FIELD_journal: */
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
- return j
- ? (__le64 *) vstruct_end(&j->field) - j->buckets
- : 0;
-}
-
/* BCH_SB_FIELD_members: */
static inline bool bch2_member_exists(struct bch_member *m)
.durability = BCH_MEMBER_DURABILITY(mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
+ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
.valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
};
}
void bch2_journal_super_entries_add_common(struct bch_fs *,
struct jset_entry **, u64);
-int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
int bch2_fs_mark_dirty(struct bch_fs *);
void bch2_fs_mark_clean(struct bch_fs *);
void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
+void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
+void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
#endif /* _BCACHEFS_SUPER_IO_H */
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
+#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
+#include "counters.h"
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
#include <linux/device.h>
-#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/percpu.h>
+#include <linux/pretty-printers.h>
#include <linux/random.h>
#include <linux/sysfs.h>
#include <crypto/hash.h>
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
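+/*
+ * Wrap the attributes in an attribute_group so the ktype can use
+ * default_groups instead of default_attrs:
+ */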
#define KTYPE(type) \
-struct kobj_type type ## _ktype = { \
+static const struct attribute_group type ## _group = { \
+ .attrs = type ## _files \
+}; \
+ \
+static const struct attribute_group *type ## _groups[] = { \
+ &type ## _group, \
+ NULL \
+}; \
+ \
+static const struct kobj_type type ## _ktype = { \
.release = type ## _release, \
.sysfs_ops = &type ## _sysfs_ops, \
- .default_attrs = type ## _files \
+ .default_groups = type ## _groups \
}
static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);
+static void bch2_fs_counters_release(struct kobject *k)
+{
+}
static void bch2_fs_internal_release(struct kobject *k)
{
{
}
-static KTYPE(bch2_fs);
-static KTYPE(bch2_fs_internal);
-static KTYPE(bch2_fs_opts_dir);
-static KTYPE(bch2_fs_time_stats);
-static KTYPE(bch2_dev);
+KTYPE(bch2_fs);
+KTYPE(bch2_fs_counters);
+KTYPE(bch2_fs_internal);
+KTYPE(bch2_fs_opts_dir);
+KTYPE(bch2_fs_time_stats);
+KTYPE(bch2_dev);
static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
{
struct bch_dev *ca;
unsigned i, clean_passes = 0;
+ u64 seq = 0;
bch2_rebalance_stop(c);
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
- /*
- * Flush journal before stopping allocators, because flushing journal
- * blacklist entries involves allocating new btree nodes:
- */
- bch2_journal_flush_all_pins(&c->journal);
-
- /*
- * If the allocator threads didn't all start up, the btree updates to
- * write out alloc info aren't going to work:
- */
- if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
- goto nowrote_alloc;
-
bch_verbose(c, "flushing journal and stopping allocators");
- bch2_journal_flush_all_pins(&c->journal);
- set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
-
do {
clean_passes++;
- if (bch2_journal_flush_all_pins(&c->journal))
- clean_passes = 0;
-
- /*
- * In flight interior btree updates will generate more journal
- * updates and btree updates (alloc btree):
- */
- if (bch2_btree_interior_updates_nr_pending(c)) {
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
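+		/*
+		 * Flushing interior btree updates and the journal can generate
+		 * more work; keep looping until we get two consecutive clean
+		 * passes:
+		 */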
+ if (bch2_btree_interior_updates_flush(c) ||
+ bch2_journal_flush_all_pins(&c->journal) ||
+ bch2_btree_flush_all_writes(c) ||
+ seq != atomic64_read(&c->journal.seq)) {
+ seq = atomic64_read(&c->journal.seq);
clean_passes = 0;
}
- flush_work(&c->btree_interior_update_work);
-
- if (bch2_journal_flush_all_pins(&c->journal))
- clean_passes = 0;
} while (clean_passes < 2);
- bch_verbose(c, "flushing journal and stopping allocators complete");
-
- set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
-nowrote_alloc:
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
- flush_work(&c->btree_interior_update_work);
-
- for_each_member_device(ca, c, i)
- bch2_dev_allocator_stop(ca);
- clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
- clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
+ bch_verbose(c, "flushing journal and stopping allocators complete");
+ if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
bch2_fs_journal_stop(&c->journal);
- /*
- * the journal kicks off btree writes via reclaim - wait for in flight
- * writes after stopping journal:
- */
- bch2_btree_flush_all_writes(c);
-
/*
* After stopping journal:
*/
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch2_dev_allocator_stop()).
*/
percpu_ref_kill(&c->writes);
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
test_bit(BCH_FS_STARTED, &c->flags) &&
- test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) &&
+ test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
!c->opts.norecovery) {
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
{
int ret;
- ret = bch2_gc_thread_start(c);
- if (ret) {
- bch_err(c, "error starting gc thread");
- return ret;
- }
-
- ret = bch2_copygc_start(c);
- if (ret) {
- bch_err(c, "error starting copygc thread");
- return ret;
- }
-
ret = bch2_rebalance_start(c);
if (ret) {
bch_err(c, "error starting rebalance thread");
return ret;
}
- schedule_work(&c->ec_stripe_delete_work);
-
return 0;
}
if (ret)
goto err;
- clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
+ clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- for_each_rw_member(ca, c, i) {
- ret = bch2_dev_allocator_start(ca);
- if (ret) {
- bch_err(c, "error starting allocator threads");
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
+ ret = bch2_gc_thread_start(c);
+ if (ret) {
+ bch_err(c, "error starting gc thread");
+ return ret;
}
- set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+ ret = bch2_copygc_start(c);
+ if (ret) {
+ bch_err(c, "error starting copygc thread");
+ return ret;
+ }
- for_each_rw_member(ca, c, i)
- bch2_wake_allocator(ca);
+ schedule_work(&c->ec_stripe_delete_work);
+
+ bch2_do_discards(c);
+ bch2_do_invalidates(c);
if (!early) {
ret = bch2_fs_read_write_late(c);
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(&c->journal_entries);
+ bch2_journal_entries_free(c);
percpu_free_rwsem(&c->mark_lock);
if (c->btree_paths_bufs)
kfree(c->unused_inode_hints);
free_heap(&c->copygc_heap);
- if (c->io_complete_wq )
- destroy_workqueue(c->io_complete_wq );
+ if (c->io_complete_wq)
+ destroy_workqueue(c->io_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->btree_io_complete_wq)
bch2_fs_debug_exit(c);
bch2_fs_chardev_exit(c);
+ kobject_put(&c->counters_kobj);
kobject_put(&c->time_stats);
kobject_put(&c->opts_dir);
kobject_put(&c->internal);
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+ kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) {
bch_err(c, "error creating sysfs objects");
{
struct bch_sb_field_members *mi;
struct bch_fs *c;
+ struct printbuf name = PRINTBUF;
unsigned i, iter_size;
int ret = 0;
kobject_init(&c->internal, &bch2_fs_internal_ktype);
kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+ kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
c->minor = -1;
c->disk_sb.fs_sb = true;
bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);
bch2_fs_quota_init(c);
+ bch2_fs_ec_init_early(c);
INIT_LIST_HEAD(&c->list);
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
- INIT_LIST_HEAD(&c->journal_entries);
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_errors);
seqcount_init(&c->usage_lock);
- sema_init(&c->io_in_flight, 64);
-
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
c->promote_whole_extents = true;
if (ret)
goto err;
- uuid_unparse_lower(c->sb.user_uuid.b, c->name);
+ pr_uuid(&name, c->sb.user_uuid.b);
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
+ ret = name.allocation_failure ? -ENOMEM : 0;
+ if (ret)
+ goto err;
/* Compat: */
if (sb->version <= bcachefs_metadata_version_inode_v2 &&
goto err;
}
- ret = bch2_io_clock_init(&c->io_clock[READ]) ?:
+ ret = bch2_fs_counters_init(c) ?:
+ bch2_io_clock_init(&c->io_clock[READ]) ?:
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
bch2_fs_replicas_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?:
- bch2_fs_buckets_waiting_for_journal_init(c);
+ bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
bch2_fs_io_init(c) ?:
bch2_fs_encryption_init(c) ?:
if (ret)
goto err;
- if (c->opts.nochanges)
- set_bit(JOURNAL_NOCHANGES, &c->journal.flags);
-
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
static void print_mount_opts(struct bch_fs *c)
{
enum bch_opt_id i;
- char buf[512];
- struct printbuf p = PBUF(buf);
+ struct printbuf p = PRINTBUF;
bool first = true;
- strcpy(buf, "(null)");
-
if (c->opts.read_only) {
- pr_buf(&p, "ro");
+ prt_printf(&p, "ro");
first = false;
}
continue;
if (!first)
- pr_buf(&p, ",");
+ prt_printf(&p, ",");
first = false;
- bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE);
+ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
}
- bch_info(c, "mounted with opts: %s", buf);
+ if (!p.pos)
+ prt_printf(&p, "(null)");
+
+ bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf);
+ printbuf_exit(&p);
}
int bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
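+	/* Initialize per-transaction stats for btree lock hold times: */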
+ for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
+ mutex_lock(&c->btree_transaction_stats[i].lock);
+ bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
+ mutex_unlock(&c->btree_transaction_stats[i].lock);
+ }
+
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
set_bit(BCH_FS_STARTED, &c->flags);
- /*
- * Allocator threads don't start filling copygc reserve until after we
- * set BCH_FS_STARTED - wake them now:
- *
- * XXX ugly hack:
- * Need to set ca->allocator_state here instead of relying on the
- * allocator threads to do it to avoid racing with the copygc threads
- * checking it and thinking they have no alloc reserve:
- */
- for_each_online_member(ca, c, i) {
- ca->allocator_state = ALLOCATOR_running;
- bch2_wake_allocator(ca);
- }
-
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
up_write(&c->state_lock);
return ret;
err:
- switch (ret) {
- case BCH_FSCK_ERRORS_NOT_FIXED:
- bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("mount with -o fix_errors to repair\n");
- break;
- case BCH_FSCK_REPAIR_UNIMPLEMENTED:
- bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("repair unimplemented: inform the developers so that it can be added\n");
- break;
- case BCH_FSCK_REPAIR_IMPOSSIBLE:
- bch_err(c, "filesystem contains errors, but repair impossible");
- break;
- case BCH_FSCK_UNKNOWN_VERSION:
- bch_err(c, "unknown metadata version");
- break;
- case -ENOMEM:
- bch_err(c, "cannot allocate memory");
- break;
- case -EIO:
- bch_err(c, "IO error");
- break;
- }
+ bch_err(c, "error starting filesystem: %s", bch2_err_str(ret));
- if (ret >= 0)
- ret = -EIO;
+ if (ret < -BCH_ERR_START)
+ ret = -EINVAL;
goto out;
}
static void bch2_dev_free(struct bch_dev *ca)
{
- bch2_dev_allocator_stop(ca);
-
cancel_work_sync(&ca->io_error_work);
if (ca->kobj.state_in_sysfs &&
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
- if (opt_defined(c->opts, discard))
- ca->mi.discard = opt_get(c->opts, discard);
+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / btree_sectors(c));
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
ca->fs = c;
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- bch2_dev_allocator_start(ca)) {
- bch2_dev_free(ca);
- goto err;
- }
-
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
bch2_dev_sysfs_online(c, ca);
if (c->sb.nr_devices == 1)
- bdevname(ca->disk_sb.bdev, c->name);
- bdevname(ca->disk_sb.bdev, ca->name);
+ snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
+ snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
rebalance_wakeup(c);
return 0;
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
- /*
- * Device going read only means the copygc reserve get smaller, so we
- * don't want that happening while copygc is in progress:
- */
- bch2_copygc_stop(c);
-
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
- bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca);
bch2_dev_journal_stop(&c->journal, ca);
-
- bch2_copygc_start(c);
}
-static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
-
- return bch2_dev_allocator_start(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
mutex_unlock(&c->sb_lock);
if (new_state == BCH_MEMBER_STATE_rw)
- ret = __bch2_dev_read_write(c, ca);
+ __bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- struct btree_trans trans;
- size_t i;
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
- for (i = 0; i < ca->mi.nbuckets; i++) {
- ret = lockrestart_do(&trans,
- bch2_btree_key_cache_flush(&trans,
- BTREE_ID_alloc, POS(ca->dev_idx, i)));
- if (ret)
- break;
- }
- bch2_trans_exit(&trans);
-
- if (ret) {
- bch_err(c, "error %i removing dev alloc info", ret);
- return ret;
- }
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_NORUN, NULL);
+ if (ret)
+ bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret));
- return bch2_btree_delete_range(c, BTREE_ID_alloc,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx + 1, 0),
- 0, NULL);
+ return ret;
}
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
- bch_err(ca, "Remove failed: error %i dropping data", ret);
+ bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret));
goto err;
}
- ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
+ ret = bch2_dev_remove_alloc(c, ca);
if (ret) {
- bch_err(ca, "Remove failed: error %i flushing journal", ret);
+ bch_err(ca, "Remove failed, error deleting alloc info");
goto err;
}
- ret = bch2_dev_remove_alloc(c, ca);
+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
if (ret) {
- bch_err(ca, "Remove failed, error deleting alloc info");
+ bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret));
goto err;
}
- /*
- * must flush all existing journal entries, they might have
- * (overwritten) keys that point to the device we're removing:
- */
- bch2_journal_flush_all_pins(&c->journal);
- /*
- * hack to ensure bch2_replicas_gc2() clears out entries to this device
- */
- bch2_journal_meta(&c->journal);
- ret = bch2_journal_error(&c->journal);
+ ret = bch2_journal_flush(&c->journal);
if (ret) {
bch_err(ca, "Remove failed, journal error");
goto err;
ret = bch2_replicas_gc2(c);
if (ret) {
- bch_err(ca, "Remove failed: error %i from replicas gc", ret);
+ bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret));
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
- char data_has_str[100];
+ struct printbuf data_has = PRINTBUF;
- bch2_flags_to_text(&PBUF(data_has_str),
- bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ prt_bitflags(&data_has, bch2_data_types, data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+ printbuf_exit(&data_has);
ret = -EBUSY;
goto err;
}
struct bch_sb_field_members *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
- char *_errbuf;
- struct printbuf errbuf;
+ struct printbuf errbuf = PRINTBUF;
+ struct printbuf label = PRINTBUF;
int ret;
- _errbuf = kmalloc(4096, GFP_KERNEL);
- if (!_errbuf)
- return -ENOMEM;
-
- errbuf = _PBUF(_errbuf, 4096);
-
ret = bch2_read_super(path, &opts, &sb);
if (ret) {
- bch_err(c, "device add error: error reading super: %i", ret);
+ bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret));
goto err;
}
dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+ if (label.allocation_failure) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+
err = bch2_dev_may_add(sb.sb, c);
if (err) {
bch_err(c, "device add error: %s", err);
goto err;
}
+ bch2_dev_usage_init(ca);
+
ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret) {
bch2_dev_free(ca);
le32_to_cpu(mi->field.u64s) +
sizeof(dev_mi) / sizeof(u64))) {
bch_err(c, "device add error: new device superblock too small");
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_sb_members;
goto err_unlock;
}
goto have_slot;
no_slot:
bch_err(c, "device add error: already have maximum number of devices");
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_sb_members;
goto err_unlock;
have_slot:
mi = bch2_sb_resize_members(&c->disk_sb, u64s);
if (!mi) {
bch_err(c, "device add error: no room in superblock for member info");
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_sb_members;
goto err_unlock;
}
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ ret = __bch2_dev_group_set(c, ca, label.buf);
+ if (ret) {
+ bch_err(c, "device add error: error setting label");
+ goto err_unlock;
+ }
+ }
+
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
- bch_err(c, "device add error: error marking new superblock: %i", ret);
+ bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret));
+ goto err_late;
+ }
+
+ ret = bch2_fs_freespace_init(c);
+ if (ret) {
+ bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
goto err_late;
}
ca->new_fs_bucket_idx = 0;
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret) {
- bch_err(c, "device add error: error going RW on new device: %i", ret);
- goto err_late;
- }
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return 0;
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
- kfree(_errbuf);
+ printbuf_exit(&label);
+ printbuf_exit(&errbuf);
return ret;
err_late:
up_write(&c->state_lock);
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
- bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb",
- path, ret);
+ bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s",
+ path, bch2_err_str(ret));
goto err;
}
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret)
- goto err;
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
if (ret) {
- bch_err(ca, "Resize error: %i", ret);
+ bch_err(ca, "Resize error: %s", bch2_err_str(ret));
goto err;
}
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
+ if (ret)
goto err;
- }
mutex_lock(&c->sb_lock);
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
struct bch_sb_field_members *mi;
unsigned i, best_sb = 0;
const char *err;
- char *_errbuf = NULL;
- struct printbuf errbuf;
+ struct printbuf errbuf = PRINTBUF;
int ret = 0;
if (!try_module_get(THIS_MODULE))
goto err;
}
- _errbuf = kmalloc(4096, GFP_KERNEL);
- if (!_errbuf) {
- ret = -ENOMEM;
- goto err;
- }
-
- errbuf = _PBUF(_errbuf, 4096);
-
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
if (!sb) {
ret = -ENOMEM;
while (i < nr_devices) {
if (i != best_sb &&
!bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) {
- char buf[BDEVNAME_SIZE];
- pr_info("%s has been removed, skipping",
- bdevname(sb[i].bdev, buf));
+ pr_info("%pg has been removed, skipping", sb[i].bdev);
bch2_free_super(&sb[i]);
array_remove_item(sb, nr_devices, i);
continue;
}
out:
kfree(sb);
- kfree(_errbuf);
+ printbuf_exit(&errbuf);
module_put(THIS_MODULE);
pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
return c;
return remainder;
}
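+/* Return the bucket containing sector @s; *offset is set to @s's offset within that bucket: */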
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+ u32 *offset)
+{
+ return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
+
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
return !percpu_ref_is_zero(&ca->io_ref);
unsigned dev)
{
BUG_ON(bch2_dev_list_has_dev(*devs, dev));
- BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
devs->devs[devs->nr++] = dev;
}
u8 discard;
u8 data_allowed;
u8 durability;
+ u8 freespace_initialized;
u8 valid;
};
#include "tests.h"
#include <linux/blkdev.h>
+#include <linux/pretty-printers.h>
#include <linux/sort.h>
#include <linux/sched/clock.h>
#include "util.h"
#define SYSFS_OPS(type) \
-struct sysfs_ops type ## _sysfs_ops = { \
+const struct sysfs_ops type ## _sysfs_ops = { \
.show = type ## _show, \
.store = type ## _store \
}
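+/*
+ * SHOW() implementations now write to a struct printbuf; the generated
+ * fn##_show() wrapper formats into a printbuf, appends a trailing newline if
+ * missing, and copies at most PAGE_SIZE - 1 bytes into the sysfs buffer:
+ */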
#define SHOW(fn) \
+static ssize_t fn ## _to_text(struct printbuf *, \
+ struct kobject *, struct attribute *); \
+ \
static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
char *buf) \
+{ \
+ struct printbuf out = PRINTBUF; \
+ ssize_t ret = fn ## _to_text(&out, kobj, attr); \
+ \
+ if (out.pos && out.buf[out.pos - 1] != '\n') \
+ prt_newline(&out); \
+ \
+ if (!ret && out.allocation_failure) \
+ ret = -ENOMEM; \
+ \
+ if (!ret) { \
+ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \
+ memcpy(buf, out.buf, ret); \
+ } \
+ printbuf_exit(&out); \
+ return bch2_err_class(ret); \
+} \
+ \
+static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
+ struct attribute *attr)
#define STORE(fn) \
+static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
+ const char *, size_t); \
+ \
static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
const char *buf, size_t size) \
+{ \
+ return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
+} \
+ \
+static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size)
#define __sysfs_attribute(_name, _mode) \
static struct attribute sysfs_##_name = \
#define sysfs_printf(file, fmt, ...) \
do { \
if (attr == &sysfs_ ## file) \
- return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\
+ prt_printf(out, fmt "\n", __VA_ARGS__); \
} while (0)
#define sysfs_print(file, var) \
do { \
if (attr == &sysfs_ ## file) \
- return snprint(buf, PAGE_SIZE, var); \
+ snprint(out, var); \
} while (0)
#define sysfs_hprint(file, val) \
do { \
- if (attr == &sysfs_ ## file) { \
- bch2_hprint(&out, val); \
- pr_buf(&out, "\n"); \
- return out.pos - buf; \
- } \
+ if (attr == &sysfs_ ## file) \
+ prt_human_readable_s64(out, val); \
} while (0)
#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
} while (0)
write_attribute(trigger_gc);
+write_attribute(trigger_discards);
+write_attribute(trigger_invalidates);
write_attribute(prune_cache);
+write_attribute(btree_wakeup);
rw_attribute(btree_gc_periodic);
rw_attribute(gc_gens_pos);
read_attribute(bucket_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
-read_attribute(durability);
+rw_attribute(durability);
read_attribute(iodone);
read_attribute(io_latency_read);
read_attribute(btree_avg_write_size);
-read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(journal_pins);
read_attribute(btree_updates);
-read_attribute(dirty_btree_nodes);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
-read_attribute(btree_transactions);
read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(has_data);
read_attribute(alloc_debug);
-write_attribute(wake_allocator);
-read_attribute(read_realloc_races);
-read_attribute(extent_migrate_done);
-read_attribute(extent_migrate_raced);
+#define x(t, n, ...) read_attribute(t);
+BCH_PERSISTENT_COUNTERS()
+#undef x
rw_attribute(discard);
rw_attribute(label);
mutex_lock(&c->data_progress_lock);
list_for_each_entry(stats, &c->data_progress_list, list) {
- pr_buf(out, "%s: data type %s btree_id %s position: ",
+ prt_printf(out, "%s: data type %s btree_id %s position: ",
stats->name,
bch2_data_types[stats->data_type],
bch2_btree_ids[stats->btree_id]);
bch2_bpos_to_text(out, stats->pos);
- pr_buf(out, "%s", "\n");
+ prt_printf(out, "%s", "\n");
}
mutex_unlock(&c->data_progress_lock);
bch2_trans_init(&trans, c, 0, 0);
for (id = 0; id < BTREE_ID_NR; id++) {
- if (!((1U << id) & BTREE_ID_HAS_PTRS))
+ if (!btree_type_has_ptrs(id))
continue;
for_each_btree_key(&trans, iter, id, POS_MIN,
if (ret)
return ret;
- pr_buf(out, "uncompressed:\n");
- pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents);
- pr_buf(out, " size: ");
- bch2_hprint(out, uncompressed_sectors << 9);
- pr_buf(out, "\n");
-
- pr_buf(out, "compressed:\n");
- pr_buf(out, " nr extents: %llu\n", nr_compressed_extents);
- pr_buf(out, " compressed size: ");
- bch2_hprint(out, compressed_sectors_compressed << 9);
- pr_buf(out, "\n");
- pr_buf(out, " uncompressed size: ");
- bch2_hprint(out, compressed_sectors_uncompressed << 9);
- pr_buf(out, "\n");
-
- pr_buf(out, "incompressible:\n");
- pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents);
- pr_buf(out, " size: ");
- bch2_hprint(out, incompressible_sectors << 9);
- pr_buf(out, "\n");
+ prt_printf(out, "uncompressed:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents);
+ prt_printf(out, " size: ");
+ prt_human_readable_u64(out, uncompressed_sectors << 9);
+ prt_printf(out, "\n");
+
+ prt_printf(out, "compressed:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_compressed_extents);
+ prt_printf(out, " compressed size: ");
+ prt_human_readable_u64(out, compressed_sectors_compressed << 9);
+ prt_printf(out, "\n");
+ prt_printf(out, " uncompressed size: ");
+ prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
+ prt_printf(out, "\n");
+
+ prt_printf(out, "incompressible:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents);
+ prt_printf(out, " size: ");
+ prt_human_readable_u64(out, incompressible_sectors << 9);
+ prt_printf(out, "\n");
return 0;
}
static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
{
- pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
+ prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
bch2_bpos_to_text(out, c->gc_gens_pos);
- pr_buf(out, "\n");
+ prt_printf(out, "\n");
+}
+
+static void bch2_btree_wakeup_all(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+
+ mutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
+
+ if (b)
+ six_lock_wakeup_all(&b->lock);
+
+ }
+ mutex_unlock(&c->btree_trans_lock);
}
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
- sysfs_print(read_realloc_races,
- atomic_long_read(&c->read_realloc_races));
- sysfs_print(extent_migrate_done,
- atomic_long_read(&c->extent_migrate_done));
- sysfs_print(extent_migrate_raced,
- atomic_long_read(&c->extent_migrate_raced));
-
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
- if (attr == &sysfs_gc_gens_pos) {
- bch2_gc_gens_pos_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_gc_gens_pos)
+ bch2_gc_gens_pos_to_text(out, c);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
max(0LL, c->copygc_wait -
atomic64_read(&c->io_clock[WRITE].now)) << 9);
- if (attr == &sysfs_rebalance_work) {
- bch2_rebalance_work_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_rebalance_work)
+ bch2_rebalance_work_to_text(out, c);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
/* Debugging: */
- if (attr == &sysfs_journal_debug) {
- bch2_journal_debug_to_text(&out, &c->journal);
- return out.pos - buf;
- }
-
- if (attr == &sysfs_journal_pins) {
- bch2_journal_pins_to_text(&out, &c->journal);
- return out.pos - buf;
- }
+ if (attr == &sysfs_journal_debug)
+ bch2_journal_debug_to_text(out, &c->journal);
- if (attr == &sysfs_btree_updates) {
- bch2_btree_updates_to_text(&out, c);
- return out.pos - buf;
- }
-
- if (attr == &sysfs_dirty_btree_nodes) {
- bch2_dirty_btree_nodes_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_updates)
+ bch2_btree_updates_to_text(out, c);
- if (attr == &sysfs_btree_cache) {
- bch2_btree_cache_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_cache)
+ bch2_btree_cache_to_text(out, &c->btree_cache);
- if (attr == &sysfs_btree_key_cache) {
- bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_key_cache)
+ bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
- if (attr == &sysfs_btree_transactions) {
- bch2_btree_trans_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_stripes_heap)
+ bch2_stripes_heap_to_text(out, c);
- if (attr == &sysfs_stripes_heap) {
- bch2_stripes_heap_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c);
- if (attr == &sysfs_open_buckets) {
- bch2_open_buckets_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_compression_stats)
+ bch2_compression_stats_to_text(out, c);
- if (attr == &sysfs_compression_stats) {
- bch2_compression_stats_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_new_stripes)
+ bch2_new_stripes_to_text(out, c);
- if (attr == &sysfs_new_stripes) {
- bch2_new_stripes_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_io_timers_read)
+ bch2_io_timers_to_text(out, &c->io_clock[READ]);
- if (attr == &sysfs_io_timers_read) {
- bch2_io_timers_to_text(&out, &c->io_clock[READ]);
- return out.pos - buf;
- }
- if (attr == &sysfs_io_timers_write) {
- bch2_io_timers_to_text(&out, &c->io_clock[WRITE]);
- return out.pos - buf;
- }
+ if (attr == &sysfs_io_timers_write)
+ bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
- if (attr == &sysfs_data_jobs) {
- data_progress_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_data_jobs)
+ data_progress_to_text(out, c);
return 0;
}
c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
}
+ if (attr == &sysfs_btree_wakeup)
+ bch2_btree_wakeup_all(c);
+
if (attr == &sysfs_trigger_gc) {
/*
* Full gc is currently incompatible with btree key cache:
#endif
}
+ if (attr == &sysfs_trigger_discards)
+ bch2_do_discards(c);
+
+ if (attr == &sysfs_trigger_invalidates)
+ bch2_do_invalidates(c);
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
NULL
};
+/* counters dir */
+
+SHOW(bch2_fs_counters)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
+ u64 counter = 0;
+ u64 counter_since_mount = 0;
+
+ printbuf_tabstop_push(out, 32);
+
+ #define x(t, ...) \
+ if (attr == &sysfs_##t) { \
+ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
+ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
+ prt_printf(out, "since mount:"); \
+ prt_tab(out); \
+ prt_human_readable_u64(out, counter_since_mount << 9); \
+ prt_newline(out); \
+ \
+ prt_printf(out, "since filesystem creation:"); \
+ prt_tab(out); \
+ prt_human_readable_u64(out, counter << 9); \
+ prt_newline(out); \
+ }
+ BCH_PERSISTENT_COUNTERS()
+ #undef x
+ return 0;
+}
+
+STORE(bch2_fs_counters) {
+ return 0;
+}
+
+SYSFS_OPS(bch2_fs_counters);
+
+struct attribute *bch2_fs_counters_files[] = {
+#define x(t, ...) \
+ &sysfs_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
/* internal dir - just a wrapper */
SHOW(bch2_fs_internal)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
- return bch2_fs_show(&c->kobj, attr, buf);
+ return bch2_fs_to_text(out, &c->kobj, attr);
}
STORE(bch2_fs_internal)
struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_debug,
- &sysfs_journal_pins,
&sysfs_btree_updates,
- &sysfs_dirty_btree_nodes,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
- &sysfs_btree_transactions,
&sysfs_new_stripes,
&sysfs_stripes_heap,
&sysfs_open_buckets,
&sysfs_io_timers_write,
&sysfs_trigger_gc,
+ &sysfs_trigger_discards,
+ &sysfs_trigger_invalidates,
&sysfs_prune_cache,
-
- &sysfs_read_realloc_races,
- &sysfs_extent_migrate_done,
- &sysfs_extent_migrate_raced,
+ &sysfs_btree_wakeup,
&sysfs_gc_gens_pos,
SHOW(bch2_fs_opts_dir)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
int id = opt - bch2_opt_table;
u64 v = bch2_opt_get_by_id(&c->opts, id);
- bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST);
- pr_buf(&out, "\n");
+ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
+ prt_char(out, '\n');
- return out.pos - buf;
+ return 0;
}
STORE(bch2_fs_opts_dir)
char *tmp;
u64 v;
+ /*
+ * We don't need to take c->writes for correctness, but it eliminates an
+ * unsightly error message in the dmesg log when we're RO:
+ */
+ if (unlikely(!percpu_ref_tryget_live(&c->writes)))
+ return -EROFS;
+
tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto err;
+ }
- ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v);
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
kfree(tmp);
if (ret < 0)
- return ret;
+ goto err;
ret = bch2_opt_check_may_set(c, id, v);
if (ret < 0)
- return ret;
+ goto err;
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
rebalance_wakeup(c);
}
- return size;
+ ret = size;
+err:
+ percpu_ref_put(&c->writes);
+ return ret;
}
SYSFS_OPS(bch2_fs_opts_dir);
SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
#define x(name) \
- if (attr == &sysfs_time_stat_##name) { \
- bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\
- return out.pos - buf; \
- }
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
BCH_TIME_STATS()
#undef x
NULL
};
-static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- enum alloc_reserve i;
-
- spin_lock(&ca->fs->freelist_lock);
-
- pr_buf(out, "free_inc:\t%zu\t%zu\n",
- fifo_used(&ca->free_inc),
- ca->free_inc.size);
-
- for (i = 0; i < RESERVE_NR; i++)
- pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
- fifo_used(&ca->free[i]),
- ca->free[i].size);
-
- spin_unlock(&ca->fs->freelist_lock);
-}
-
static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
- pr_buf(out,
- "\t\t buckets\t sectors fragmented\n"
- "capacity%16llu\n",
+ prt_printf(out,
+ "\t\t\t buckets\t sectors fragmented\n"
+ "capacity\t%16llu\n",
ca->mi.nbuckets - ca->mi.first_bucket);
- for (i = 1; i < BCH_DATA_NR; i++)
- pr_buf(out, "%-8s%16llu%16llu%16llu\n",
+ for (i = 0; i < BCH_DATA_NR; i++)
+ prt_printf(out, "%-16s%16llu%16llu%16llu\n",
bch2_data_types[i], stats.d[i].buckets,
stats.d[i].sectors, stats.d[i].fragmented);
- pr_buf(out,
- "ec\t%16llu\n"
- "available%15llu\n"
+ prt_printf(out,
+ "ec\t\t%16llu\n"
"\n"
- "free_inc\t\t%zu/%zu\n"
- "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
- "free[RESERVE_NONE]\t%zu/%zu\n"
"freelist_wait\t\t%s\n"
"open buckets allocated\t%u\n"
"open buckets this dev\t%u\n"
"open_buckets_wait\t%s\n"
"open_buckets_btree\t%u\n"
"open_buckets_user\t%u\n"
- "btree reserve cache\t%u\n"
- "thread state:\t\t%s\n",
+ "buckets_to_invalidate\t%llu\n"
+ "btree reserve cache\t%u\n",
stats.buckets_ec,
- __dev_buckets_available(ca, stats),
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
c->freelist_wait.list.first ? "waiting" : "empty",
OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
ca->nr_open_buckets,
c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_btree],
nr[BCH_DATA_user],
- c->btree_reserve_cache_nr,
- bch2_allocator_states[ca->allocator_state]);
+ should_invalidate_buckets(ca, stats),
+ c->btree_reserve_cache_nr);
}
static const char * const bch2_rw[] = {
int rw, i;
for (rw = 0; rw < 2; rw++) {
- pr_buf(out, "%s:\n", bch2_rw[rw]);
+ prt_printf(out, "%s:\n", bch2_rw[rw]);
for (i = 1; i < BCH_DATA_NR; i++)
- pr_buf(out, "%-12s:%12llu\n",
+ prt_printf(out, "%-12s:%12llu\n",
bch2_data_types[i],
percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
if (attr == &sysfs_label) {
if (ca->mi.group) {
mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(&out, &c->disk_sb,
+ bch2_disk_path_to_text(out, c->disk_sb.sb,
ca->mi.group - 1);
mutex_unlock(&c->sb_lock);
}
- pr_buf(&out, "\n");
- return out.pos - buf;
+ prt_char(out, '\n');
}
if (attr == &sysfs_has_data) {
- bch2_flags_to_text(&out, bch2_data_types,
- bch2_dev_has_data(c, ca));
- pr_buf(&out, "\n");
- return out.pos - buf;
+ prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_char(out, '\n');
}
if (attr == &sysfs_state_rw) {
- bch2_string_opt_to_text(&out, bch2_member_states,
- ca->mi.state);
- pr_buf(&out, "\n");
- return out.pos - buf;
+ prt_string_option(out, bch2_member_states, ca->mi.state);
+ prt_char(out, '\n');
}
- if (attr == &sysfs_iodone) {
- dev_iodone_to_text(&out, ca);
- return out.pos - buf;
- }
+ if (attr == &sysfs_iodone)
+ dev_iodone_to_text(out, ca);
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
- if (attr == &sysfs_io_latency_stats_read) {
- bch2_time_stats_to_text(&out, &ca->io_latency[READ]);
- return out.pos - buf;
- }
- if (attr == &sysfs_io_latency_stats_write) {
- bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]);
- return out.pos - buf;
- }
+ if (attr == &sysfs_io_latency_stats_read)
+ bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+
+ if (attr == &sysfs_io_latency_stats_write)
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
- if (attr == &sysfs_reserve_stats) {
- reserve_stats_to_text(&out, ca);
- return out.pos - buf;
- }
- if (attr == &sysfs_alloc_debug) {
- dev_alloc_debug_to_text(&out, ca);
- return out.pos - buf;
- }
+ if (attr == &sysfs_alloc_debug)
+ dev_alloc_debug_to_text(out, ca);
return 0;
}
mutex_unlock(&c->sb_lock);
}
+ if (attr == &sysfs_durability) {
+ u64 v = strtoul_or_return(buf);
+
+ mutex_lock(&c->sb_lock);
+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+
+ if (v != BCH_MEMBER_DURABILITY(mi)) {
+ SET_BCH_MEMBER_DURABILITY(mi, v + 1);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
if (attr == &sysfs_label) {
char *tmp;
int ret;
return ret;
}
- if (attr == &sysfs_wake_allocator)
- bch2_wake_allocator(ca);
-
return size;
}
SYSFS_OPS(bch2_dev);
&sysfs_io_latency_stats_write,
&sysfs_congested,
- &sysfs_reserve_stats,
-
/* debug: */
&sysfs_alloc_debug,
- &sysfs_wake_allocator,
NULL
};
struct sysfs_ops;
extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_counters_files[];
extern struct attribute *bch2_fs_internal_files[];
extern struct attribute *bch2_fs_opts_dir_files[];
extern struct attribute *bch2_fs_time_stats_files[];
extern struct attribute *bch2_dev_files[];
-extern struct sysfs_ops bch2_fs_sysfs_ops;
-extern struct sysfs_ops bch2_fs_internal_sysfs_ops;
-extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-extern struct sysfs_ops bch2_dev_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern const struct sysfs_ops bch2_dev_sysfs_ops;
int bch2_opts_create_sysfs_files(struct kobject *);
#else
static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_counters_files[] = {};
static struct attribute *bch2_fs_internal_files[] = {};
static struct attribute *bch2_fs_opts_dir_files[] = {};
static struct attribute *bch2_fs_time_stats_files[] = {};
static struct attribute *bch2_dev_files[] = {};
static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_extents,
- POS_MIN, SPOS_MAX,
- BTREE_ITER_ALL_SNAPSHOTS,
+ SPOS(0, 0, U32_MAX), SPOS_MAX,
+ 0,
NULL);
BUG_ON(ret);
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- POS_MIN, SPOS_MAX,
- BTREE_ITER_ALL_SNAPSHOTS,
- NULL);
+ SPOS(0, 0, U32_MAX), SPOS_MAX,
+ 0, NULL);
BUG_ON(ret);
}
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
BTREE_ITER_INTENT);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(&trans, &iter, &k.k_i, 0));
if (ret) {
- bch_err(c, "update error in test_delete: %i", ret);
+ bch_err(c, "update error in test_delete: %s", bch2_err_str(ret));
goto err;
}
pr_info("deleting once");
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
- bch_err(c, "delete error (first) in test_delete: %i", ret);
+ bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret));
goto err;
}
pr_info("deleting twice");
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
- bch_err(c, "delete error (second) in test_delete: %i", ret);
+ bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret));
goto err;
}
err:
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
BTREE_ITER_INTENT);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(&trans, &iter, &k.k_i, 0));
if (ret) {
- bch_err(c, "update error in test_delete_written: %i", ret);
+ bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret));
goto err;
}
bch2_trans_unlock(&trans);
bch2_journal_flush_all_pins(&c->journal);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
- bch_err(c, "delete error in test_delete_written: %i", ret);
+ bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret));
goto err;
}
err:
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
NULL, NULL, 0);
if (ret) {
- bch_err(c, "insert error in test_iterate: %i", ret);
+ bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret));
goto err;
}
}
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0, k, ret) {
- if (k.k->p.inode)
- break;
-
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0, k, ({
BUG_ON(k.k->p.offset != i++);
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
BUG_ON(i != nr);
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
- BUG_ON(k.k->p.offset != --i);
+ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, U64_MAX, U32_MAX), 0, k,
+ ({
+ BUG_ON(k.k->p.offset != --i);
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret));
+ goto err;
+ }
BUG_ON(i);
err:
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
NULL, NULL, 0);
if (ret) {
- bch_err(c, "insert error in test_iterate_extents: %i", ret);
+ bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret));
goto err;
}
}
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), 0, k, ret) {
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), 0, k, ({
BUG_ON(bkey_start_offset(k.k) != i);
i = k.k->p.offset;
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
BUG_ON(i != nr);
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
- BUG_ON(k.k->p.offset != i);
- i = bkey_start_offset(k.k);
+ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents,
+ SPOS(0, U64_MAX, U32_MAX), 0, k,
+ ({
+ BUG_ON(k.k->p.offset != i);
+ i = bkey_start_offset(k.k);
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
BUG_ON(i);
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
NULL, NULL, 0);
if (ret) {
- bch_err(c, "insert error in test_iterate_slots: %i", ret);
+ bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret));
goto err;
}
}
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0, k, ret) {
- if (k.k->p.inode)
- break;
-
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0, k, ({
BUG_ON(k.k->p.offset != i);
i += 2;
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
- bch2_trans_iter_exit(&trans, &iter);
BUG_ON(i != nr * 2);
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_SLOTS, k, ret) {
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i >= nr * 2)
+ break;
+
BUG_ON(k.k->p.offset != i);
BUG_ON(bkey_deleted(k.k) != (i & 1));
i++;
- if (i == nr * 2)
- break;
+ 0;
+ }));
+ if (ret < 0) {
+ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret));
+ goto err;
}
- bch2_trans_iter_exit(&trans, &iter);
+ ret = 0;
err:
bch2_trans_exit(&trans);
return ret;
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
NULL, NULL, 0);
if (ret) {
- bch_err(c, "insert error in test_iterate_slots_extents: %i", ret);
+ bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret));
goto err;
}
}
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), 0, k, ret) {
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), 0, k, ({
BUG_ON(bkey_start_offset(k.k) != i + 8);
BUG_ON(k.k->size != 8);
i += 16;
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
- bch2_trans_iter_exit(&trans, &iter);
BUG_ON(i != nr);
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_SLOTS, k, ret) {
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i == nr)
+ break;
BUG_ON(bkey_deleted(k.k) != !(i % 16));
BUG_ON(bkey_start_offset(k.k) != i);
BUG_ON(k.k->size != 8);
i = k.k->p.offset;
-
- if (i == nr)
- break;
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret));
+ goto err;
}
- bch2_trans_iter_exit(&trans, &iter);
+ ret = 0;
err:
bch2_trans_exit(&trans);
return 0;
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
BUG_ON(k.k);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
BUG_ON(k.k);
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), 0);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
BUG_ON(k.k);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
BUG_ON(k.k);
bch2_trans_iter_exit(&trans, &iter);
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
NULL, NULL, 0);
if (ret)
- bch_err(c, "insert error in insert_test_extent: %i", ret);
+ bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret));
return ret;
}
bch2_trans_init(&trans, c, 0, 0);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, snapid_lo), 0);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
BUG_ON(k.k->p.snapshot != U32_MAX);
ret = test_snapshot_filter(c, snapids[0], snapids[1]);
if (ret) {
- bch_err(c, "err %i from test_snapshot_filter", ret);
+ bch_err(c, "err from test_snapshot_filter: %s", bch2_err_str(ret));
return ret;
}
k.k.p.offset = test_rand();
k.k.p.snapshot = U32_MAX;
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
if (ret) {
- bch_err(c, "error in rand_insert: %i", ret);
+ bch_err(c, "error in rand_insert: %s", bch2_err_str(ret));
break;
}
}
k[j].k.p.snapshot = U32_MAX;
}
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?:
__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?:
__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?:
__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?:
__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i));
if (ret) {
- bch_err(c, "error in rand_insert_multi: %i", ret);
+ bch_err(c, "error in rand_insert_multi: %s", bch2_err_str(ret));
break;
}
}
for (i = 0; i < nr; i++) {
bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
ret = bkey_err(k);
if (ret) {
- bch_err(c, "error in rand_lookup: %i", ret);
+ bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret));
break;
}
}
k = bch2_btree_iter_peek(iter);
ret = bkey_err(k);
- if (ret && ret != -EINTR)
- bch_err(trans->c, "lookup error in rand_mixed: %i", ret);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret));
if (ret)
return ret;
for (i = 0; i < nr; i++) {
rand = test_rand();
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
rand_mixed_trans(&trans, &iter, &cookie, i, rand));
if (ret) {
- bch_err(c, "update error in rand_mixed: %i", ret);
+ bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret));
break;
}
}
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
ret = bkey_err(k);
if (ret)
goto err;
for (i = 0; i < nr; i++) {
struct bpos pos = SPOS(0, test_rand(), U32_MAX);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
__do_delete(&trans, pos));
if (ret) {
- bch_err(c, "error in rand_delete: %i", ret);
+ bch_err(c, "error in rand_delete: %s", bch2_err_str(ret));
break;
}
}
struct bkey_s_c k;
struct bkey_i_cookie insert;
int ret = 0;
- u64 i = 0;
bkey_cookie_init(&insert.k_i);
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- insert.k.p = iter.pos;
-
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, &insert.k_i, 0));
- if (ret) {
- bch_err(c, "error in seq_insert: %i", ret);
- break;
- }
-
- if (++i == nr)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
+ NULL, NULL, 0,
+ ({
+ if (iter.pos.offset >= nr)
+ break;
+ insert.k.p = iter.pos;
+ bch2_trans_update(&trans, &iter, &insert.k_i, 0);
+ }));
+ if (ret)
+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
bch2_trans_exit(&trans);
return ret;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0, k, ret)
- ;
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0, k,
+ 0);
+ if (ret)
+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
bch2_trans_exit(&trans);
return ret;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_INTENT, k, ret) {
- struct bkey_i_cookie u;
-
- bkey_reassemble(&u.k_i, k);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, 0,
+ ({
+ struct bkey_i_cookie u;
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, &u.k_i, 0));
- if (ret) {
- bch_err(c, "error in seq_overwrite: %i", ret);
- break;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
+ bkey_reassemble(&u.k_i, k);
+ bch2_trans_update(&trans, &iter, &u.k_i, 0);
+ }));
+ if (ret)
+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
bch2_trans_exit(&trans);
return ret;
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- POS_MIN, SPOS_MAX,
- BTREE_ITER_ALL_SNAPSHOTS,
- NULL);
+ SPOS(0, 0, U32_MAX), SPOS_MAX,
+ 0, NULL);
if (ret)
- bch_err(c, "error in seq_delete: %i", ret);
+ bch_err(c, "error in seq_delete: %s", bch2_err_str(ret));
return ret;
}
ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
if (ret) {
- bch_err(j->c, "%ps: error %i", j->fn, ret);
+ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
j->ret = ret;
}
u64 nr, unsigned nr_threads)
{
struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
- char name_buf[20], nr_buf[20], per_sec_buf[20];
+ char name_buf[20];
+ struct printbuf nr_buf = PRINTBUF;
+ struct printbuf per_sec_buf = PRINTBUF;
unsigned i;
u64 time;
time = j.finish - j.start;
scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
- bch2_hprint(&PBUF(nr_buf), nr);
- bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time));
+ prt_human_readable_u64(&nr_buf, nr);
+ prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
- name_buf, nr_buf, nr_threads,
+ name_buf, nr_buf.buf, nr_threads,
div_u64(time, NSEC_PER_SEC),
div_u64(time * nr_threads, nr),
- per_sec_buf);
+ per_sec_buf.buf);
+ printbuf_exit(&per_sec_buf);
+ printbuf_exit(&nr_buf);
return j.ret;
}
#include "bcachefs.h"
#include "alloc_types.h"
#include "buckets.h"
-#include "btree_types.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
#include "keylist.h"
+#include "opts.h"
#include <linux/blktrace_api.h>
-#include "keylist.h"
+#include <linux/six.h>
#define CREATE_TRACE_POINTS
#include <trace/events/bcachefs.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/console.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/freezer.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/sched/clock.h>
+#include <linux/mean_and_variance.h>
#include "eytzinger.h"
#include "util.h"
static const char si_units[] = "?kMGTPEZY";
-static int __bch2_strtoh(const char *cp, u64 *res,
- u64 t_max, bool t_signed)
+/* string_get_size units: */
+static const char *const units_2[] = {
+ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+};
+static const char *const units_10[] = {
+ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+};
+
+static int parse_u64(const char *cp, u64 *res)
{
- bool positive = *cp != '-';
- unsigned u;
+ const char *start = cp;
u64 v = 0;
- if (*cp == '+' || *cp == '-')
- cp++;
-
if (!isdigit(*cp))
return -EINVAL;
cp++;
} while (isdigit(*cp));
+ *res = v;
+ return cp - start;
+}
+
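+/* Compute n^p in *res; returns -ERANGE if the result would overflow a u64: */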
+static int bch2_pow(u64 n, u64 p, u64 *res)
+{
+ *res = 1;
+
+ while (p--) {
+ if (*res > div_u64(U64_MAX, n))
+ return -ERANGE;
+ *res *= n;
+ }
+ return 0;
+}
+
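+/* Parse an optional size suffix ("k", "MiB", "GB", ...); *res is set to the multiplier (1 if no suffix): */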
+static int parse_unit_suffix(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 base = 1024;
+ unsigned u;
+ int ret;
+
+ if (*cp == ' ')
+ cp++;
+
for (u = 1; u < strlen(si_units); u++)
if (*cp == si_units[u]) {
cp++;
goto got_unit;
}
- u = 0;
+
+ for (u = 0; u < ARRAY_SIZE(units_2); u++)
+ if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
+ cp += strlen(units_2[u]);
+ goto got_unit;
+ }
+
+ for (u = 0; u < ARRAY_SIZE(units_10); u++)
+ if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
+ cp += strlen(units_10[u]);
+ base = 1000;
+ goto got_unit;
+ }
+
+ *res = 1;
+ return 0;
got_unit:
- if (*cp == '\n')
+ ret = bch2_pow(base, u, res);
+ if (ret)
+ return ret;
+
+ return cp - start;
+}
+
+#define parse_or_ret(cp, _f) \
+do { \
+ int ret = _f; \
+ if (ret < 0) \
+ return ret; \
+ cp += ret; \
+} while (0)
+
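+/* Parse a human-readable u64 with an optional fractional part and unit suffix, e.g. "1.5GiB": */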
+static int __bch2_strtou64_h(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 v = 0, b, f_n = 0, f_d = 1;
+ int ret;
+
+ parse_or_ret(cp, parse_u64(cp, &v));
+
+ if (*cp == '.') {
cp++;
- if (*cp)
- return -EINVAL;
+ ret = parse_u64(cp, &f_n);
+ if (ret < 0)
+ return ret;
+ cp += ret;
+
+ ret = bch2_pow(10, ret, &f_d);
+ if (ret)
+ return ret;
+ }
+
+ parse_or_ret(cp, parse_unit_suffix(cp, &b));
+
+ if (v > div_u64(U64_MAX, b))
+ return -ERANGE;
+ v *= b;
+
+ if (f_n > div_u64(U64_MAX, b))
+ return -ERANGE;
- if (fls64(v) + u * 10 > 64)
+ f_n = div_u64(f_n * b, f_d);
+ if (v + f_n < v)
return -ERANGE;
+ v += f_n;
- v <<= u * 10;
+ *res = v;
+ return cp - start;
+}
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+ u64 t_max, bool t_signed)
+{
+ bool positive = *cp != '-';
+ u64 v = 0;
+
+ if (*cp == '+' || *cp == '-')
+ cp++;
+
+ parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
+
+ if (*cp == '\n')
+ cp++;
+ if (*cp)
+ return -EINVAL;
if (positive) {
if (v > t_max)
#define STRTO_H(name, type) \
int bch2_ ## name ## _h(const char *cp, type *res) \
{ \
- u64 v; \
+ u64 v = 0; \
int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
ANYSINT_MAX(type) != ((type) ~0ULL)); \
*res = v; \
STRTO_H(strtoull, unsigned long long)
STRTO_H(strtou64, u64)
-void bch2_hprint(struct printbuf *buf, s64 v)
-{
- int u, t = 0;
-
- for (u = 0; v >= 1024 || v <= -1024; u++) {
- t = v & ~(~0U << 10);
- v >>= 10;
- }
-
- pr_buf(buf, "%lli", v);
-
- /*
- * 103 is magic: t is in the range [-1023, 1023] and we want
- * to turn it into [-9, 9]
- */
- if (u && t && v < 100 && v > -100)
- pr_buf(buf, ".%i", t / 103);
- if (u)
- pr_buf(buf, "%c", si_units[u]);
-}
-
-void bch2_string_opt_to_text(struct printbuf *out,
- const char * const list[],
- size_t selected)
-{
- size_t i;
-
- for (i = 0; list[i]; i++)
- pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]);
-}
-
-void bch2_flags_to_text(struct printbuf *out,
- const char * const list[], u64 flags)
-{
- unsigned bit, nr = 0;
- bool first = true;
-
- if (out->pos != out->end)
- *out->pos = '\0';
-
- while (list[nr])
- nr++;
-
- while (flags && (bit = __ffs(flags)) < nr) {
- if (!first)
- pr_buf(out, ",");
- first = false;
- pr_buf(out, "%s", list[bit]);
- flags ^= 1 << bit;
- }
-}
-
u64 bch2_read_flag_list(char *opt, const char * const list[])
{
u64 ret = 0;
}
}
-/* time stats: */
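+/* Print the low @nr_bits bits of @v in binary, most significant bit first: */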
+void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+{
+ while (nr_bits)
+ prt_char(out, '0' + ((v >> --nr_bits) & 1));
+}
-static void bch2_time_stats_update_one(struct time_stats *stats,
- u64 start, u64 end)
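+/*
+ * Print a multi-line string via printk(), one line at a time, prefixing the
+ * first line with @prefix; the console lock is held so the lines are not
+ * interleaved with other messages:
+ */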
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
- u64 duration, freq;
+ const char *p;
- duration = time_after64(end, start)
- ? end - start : 0;
- freq = time_after64(end, stats->last_event)
- ? end - stats->last_event : 0;
+ if (!lines) {
+ printk("%s (null)\n", prefix);
+ return;
+ }
- stats->count++;
+ console_lock();
+ while (1) {
+ p = strchrnul(lines, '\n');
+ printk("%s%.*s\n", prefix, (int) (p - lines), lines);
+ if (!*p)
+ break;
+ lines = p + 1;
+ prefix = KERN_CONT;
+ }
+ console_unlock();
+}
- stats->average_duration = stats->average_duration
- ? ewma_add(stats->average_duration, duration, 6)
- : duration;
+int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task)
+{
+ unsigned long entries[32];
+ unsigned i, nr_entries;
+ int ret;
+
+ ret = down_read_killable(&task->signal->exec_update_lock);
+ if (ret)
+ return ret;
+
+ nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0);
+ for (i = 0; i < nr_entries; i++) {
+ prt_printf(out, "[<0>] %pB", (void *)entries[i]);
+ prt_newline(out);
+ }
+
+ up_read(&task->signal->exec_update_lock);
+ return 0;
+}
- stats->average_frequency = stats->average_frequency
- ? ewma_add(stats->average_frequency, freq, 6)
- : freq;
+/* time stats: */
- stats->max_duration = max(stats->max_duration, duration);
+static void bch2_time_stats_update_one(struct time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
- stats->last_event = end;
+ if (time_after64(end, start)) {
+ duration = end - start;
+ stats->duration_stats = mean_and_variance_update(stats->duration_stats,
+ duration);
+ stats->duration_stats_weighted = mean_and_variance_weighted_update(
+ stats->duration_stats_weighted,
+ duration);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ bch2_quantiles_update(&stats->quantiles, duration);
+ }
- bch2_quantiles_update(&stats->quantiles, duration);
+ if (time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq);
+ stats->freq_stats_weighted = mean_and_variance_weighted_update(
+ stats->freq_stats_weighted,
+ freq);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ stats->last_event = end;
+ }
}
void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
+ WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
+ "time_stats: min_duration = %llu, min_freq = %llu",
+ stats->min_duration, stats->min_freq);
+
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
bch2_time_stats_update_one(stats, start, end);
- if (stats->average_frequency < 32 &&
- stats->count > 1024)
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
+ stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
GFP_ATOMIC);
static const struct time_unit {
const char *name;
- u32 nsecs;
+ u64 nsecs;
} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "sec", NSEC_PER_SEC },
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", NSEC_PER_SEC * 60},
+ { "h", NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
};
static const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
- pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+ prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
+ prt_tab_rjust(out);
+ prt_printf(out, "%s", u->name);
+}
+
+#define TABSTOP_SIZE 12
+
+static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
+{
+ prt_str(out, name);
+ prt_tab(out);
+ pr_time_units(out, ns);
+ prt_newline(out);
}
void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
{
const struct time_unit *u;
- u64 freq = READ_ONCE(stats->average_frequency);
- u64 q, last_q = 0;
+ s64 f_mean = 0, d_mean = 0;
+ u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i;
+ /*
+ * avoid divide by zero
+ */
+ if (stats->freq_stats.n) {
+ f_mean = mean_and_variance_get_mean(stats->freq_stats);
+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+ d_mean = mean_and_variance_get_mean(stats->duration_stats);
+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+ }
- pr_buf(out, "count:\t\t%llu\n",
- stats->count);
- pr_buf(out, "rate:\t\t%llu/sec\n",
- freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
-
- pr_buf(out, "frequency:\t");
- pr_time_units(out, freq);
-
- pr_buf(out, "\navg duration:\t");
- pr_time_units(out, stats->average_duration);
-
- pr_buf(out, "\nmax duration:\t");
- pr_time_units(out, stats->max_duration);
+ printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
+ prt_printf(out, "count:");
+ prt_tab(out);
+ prt_printf(out, "%llu ",
+ stats->duration_stats.n);
+ printbuf_tabstop_pop(out);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+ printbuf_tabstop_push(out, 0);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+
+ prt_tab(out);
+ prt_printf(out, "since mount");
+ prt_tab_rjust(out);
+ prt_tab(out);
+ prt_printf(out, "recent");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+ printbuf_tabstop_push(out, 2);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+
+ prt_printf(out, "duration of events");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_duration);
+ pr_name_and_units(out, "max:", stats->max_duration);
+
+ prt_printf(out, "mean:");
+ prt_tab(out);
+ pr_time_units(out, d_mean);
+ prt_tab(out);
+ pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:");
+ prt_tab(out);
+ pr_time_units(out, d_stddev);
+ prt_tab(out);
+ pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ prt_printf(out, "time between events");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_freq);
+ pr_name_and_units(out, "max:", stats->max_freq);
+
+ prt_printf(out, "mean:");
+ prt_tab(out);
+ pr_time_units(out, f_mean);
+ prt_tab(out);
+ pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:");
+ prt_tab(out);
+ pr_time_units(out, f_stddev);
+ prt_tab(out);
+ pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
i = eytzinger0_first(NR_QUANTILES);
u = pick_time_units(stats->quantiles.entries[i].m);
- pr_buf(out, "\nquantiles (%s):\t", u->name);
+ prt_printf(out, "quantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
q = max(stats->quantiles.entries[i].m, last_q);
- pr_buf(out, "%llu%s",
- div_u64(q, u->nsecs),
- is_last ? "\n" : " ");
+ prt_printf(out, "%llu ",
+ div_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
last_q = q;
}
}
void bch2_time_stats_init(struct time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
+ stats->duration_stats_weighted.w = 8;
+ stats->freq_stats_weighted.w = 8;
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
}
pd->backpressure = 1;
}
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
{
- /* 2^64 - 1 is 20 digits, plus null byte */
- char rate[21];
- char actual[21];
- char target[21];
- char proportional[21];
- char derivative[21];
- char change[21];
- s64 next_io;
-
- bch2_hprint(&PBUF(rate), pd->rate.rate);
- bch2_hprint(&PBUF(actual), pd->last_actual);
- bch2_hprint(&PBUF(target), pd->last_target);
- bch2_hprint(&PBUF(proportional), pd->last_proportional);
- bch2_hprint(&PBUF(derivative), pd->last_derivative);
- bch2_hprint(&PBUF(change), pd->last_change);
-
- next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
-
- return sprintf(buf,
- "rate:\t\t%s/sec\n"
- "target:\t\t%s\n"
- "actual:\t\t%s\n"
- "proportional:\t%s\n"
- "derivative:\t%s\n"
- "change:\t\t%s/sec\n"
- "next io:\t%llims\n",
- rate, target, actual, proportional,
- derivative, change, next_io);
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 20);
+
+ prt_printf(out, "rate:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->rate.rate);
+ prt_newline(out);
+
+ prt_printf(out, "target:");
+ prt_tab(out);
+ prt_human_readable_u64(out, pd->last_target);
+ prt_newline(out);
+
+ prt_printf(out, "actual:");
+ prt_tab(out);
+ prt_human_readable_u64(out, pd->last_actual);
+ prt_newline(out);
+
+ prt_printf(out, "proportional:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_proportional);
+ prt_newline(out);
+
+ prt_printf(out, "derivative:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_derivative);
+ prt_newline(out);
+
+ prt_printf(out, "change:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_change);
+ prt_newline(out);
+
+ prt_printf(out, "next io:");
+ prt_tab(out);
+ prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
+ prt_newline(out);
}
/* misc: */
}
}
-void bch_scnmemcpy(struct printbuf *out,
- const char *src, size_t len)
-{
- size_t n = printbuf_remaining(out);
-
- if (n) {
- n = min(n - 1, len);
- memcpy(out->pos, src, n);
- out->pos += n;
- *out->pos = '\0';
- }
-}
-
-#include "eytzinger.h"
-
static int alignment_ok(const void *base, size_t align)
{
return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/log2.h>
+#include <linux/printbuf.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
+#include <linux/mean_and_variance.h>
struct closure;
\
BUG_ON(_i >= (h)->used); \
(h)->used--; \
- heap_swap(h, _i, (h)->used, set_backpointer); \
- heap_sift_up(h, _i, cmp, set_backpointer); \
- heap_sift_down(h, _i, cmp, set_backpointer); \
+ if ((_i) < (h)->used) { \
+ heap_swap(h, _i, (h)->used, set_backpointer); \
+ heap_sift_up(h, _i, cmp, set_backpointer); \
+ heap_sift_down(h, _i, cmp, set_backpointer); \
+ } \
} while (0)
#define heap_pop(h, d, cmp, set_backpointer) \
#define ANYSINT_MAX(t) \
((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
-struct printbuf {
- char *pos;
- char *end;
- unsigned indent;
-};
-static inline size_t printbuf_remaining(struct printbuf *buf)
+#ifdef __KERNEL__
+static inline void pr_time(struct printbuf *out, u64 time)
{
- return buf->end - buf->pos;
+ prt_printf(out, "%llu", time);
}
-
-#define _PBUF(_buf, _len) \
- ((struct printbuf) { \
- .pos = _buf, \
- .end = _buf + _len, \
- })
-
-#define PBUF(_buf) _PBUF(_buf, sizeof(_buf))
-
-#define pr_buf(_out, ...) \
-do { \
- (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \
- __VA_ARGS__); \
-} while (0)
-
-static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces)
-{
- buf->indent += spaces;
- while (spaces--)
- pr_buf(buf, " ");
+#else
+#include <time.h>
+static inline void pr_time(struct printbuf *out, u64 _time)
+{
+ char time_str[64];
+ time_t time = _time;
+ struct tm *tm = localtime(&time);
+ size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
+ if (!err)
+ prt_printf(out, "(formatting error)");
+ else
+ prt_printf(out, "%s", time_str);
}
+#endif
-static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces)
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
{
- buf->indent -= spaces;
+ sprintf(out, "%pUb", uuid);
}
+#else
+#include <uuid/uuid.h>
+#endif
-static inline void printbuf_newline(struct printbuf *buf)
+static inline void pr_uuid(struct printbuf *out, u8 *uuid)
{
- unsigned i;
+ char uuid_str[40];
- pr_buf(buf, "\n");
- for (i = 0; i < buf->indent; i++)
- pr_buf(buf, " ");
+ uuid_unparse_lower(uuid, uuid_str);
+ prt_printf(out, "%s", uuid_str);
}
-void bch_scnmemcpy(struct printbuf *, const char *, size_t);
-
int bch2_strtoint_h(const char *, int *);
int bch2_strtouint_h(const char *, unsigned int *);
int bch2_strtoll_h(const char *, long long *);
_r; \
})
-#define snprint(buf, size, var) \
- snprintf(buf, size, \
+#define snprint(out, var) \
+ prt_printf(out, \
type_is(var, int) ? "%i\n" \
: type_is(var, unsigned) ? "%u\n" \
: type_is(var, long) ? "%li\n" \
: type_is(var, char *) ? "%s\n" \
: "%i\n", var)
-void bch2_hprint(struct printbuf *, s64);
-
bool bch2_is_zero(const void *, size_t);
-void bch2_string_opt_to_text(struct printbuf *,
- const char * const [], size_t);
-
-void bch2_flags_to_text(struct printbuf *, const char * const[], u64);
u64 bch2_read_flag_list(char *, const char * const[]);
+void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines);
+int bch2_prt_backtrace(struct printbuf *, struct task_struct *);
+
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
struct time_stats {
spinlock_t lock;
- u64 count;
/* all fields are in nanoseconds */
- u64 average_duration;
- u64 average_frequency;
u64 max_duration;
+ u64 min_duration;
+ u64 max_freq;
+ u64 min_freq;
u64 last_event;
struct quantiles quantiles;
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance freq_stats;
+ struct mean_and_variance_weighted freq_stats_weighted;
struct time_stat_buffer __percpu *buffer;
};
void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
void bch2_pd_controller_init(struct bch_pd_controller *);
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *);
+void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
#define sysfs_pd_controller_attribute(name) \
rw_attribute(name##_rate); \
sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
\
if (attr == &sysfs_##name##_rate_debug) \
- return bch2_pd_controller_print_debug(var, buf); \
+ bch2_pd_controller_debug_to_text(out, var); \
} while (0)
#define sysfs_pd_controller_store(name, var) \
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
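+/*
+ * Gap buffer helper: live elements occupy [0, gap) and [gap + size - nr, size);
+ * this moves the gap from old_gap to new_gap by memmove()ing the elements in
+ * between:
+ */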
+static inline void __move_gap(void *array, size_t element_size,
+ size_t nr, size_t size,
+ size_t old_gap, size_t new_gap)
+{
+ size_t gap_end = old_gap + size - nr;
+
+ if (new_gap < old_gap) {
+ size_t move = old_gap - new_gap;
+
+ memmove(array + element_size * (gap_end - move),
+ array + element_size * (old_gap - move),
+ element_size * move);
+ } else if (new_gap > old_gap) {
+ size_t move = new_gap - old_gap;
+
+ memmove(array + element_size * old_gap,
+ array + element_size * gap_end,
+ element_size * move);
+ }
+}
+
+/* Move the gap in a gap buffer: */
+#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \
+ __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+
#define bubble_sort(_base, _nr, _cmp) \
do { \
ssize_t _i, _end; \
return cmp_int(l, r);
}
-#ifdef __KERNEL__
-static inline void uuid_unparse_lower(u8 *uuid, char *out)
-{
- sprintf(out, "%plU", uuid);
-}
-#else
-#include <uuid/uuid.h>
-#endif
-
#endif /* _BCACHEFS_UTIL_H */
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
+#include <linux/math.h>
#include <linux/string.h>
#include <asm/unaligned.h>
({ \
BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
\
- (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
+ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
})
#define vstruct_bytes(_s) \
.cmp_bkey = xattr_cmp_bkey,
};
-const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
{
const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
- return "value too small";
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*xattr.v));
+ return -EINVAL;
+ }
if (bkey_val_u64s(k.k) <
xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len)))
- return "value too small";
+ le16_to_cpu(xattr.v->x_val_len))) {
+ prt_printf(err, "value too small (%zu < %u)",
+ bkey_val_u64s(k.k),
+ xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len)));
+ return -EINVAL;
+ }
+ /* XXX why +4 ? */
if (bkey_val_u64s(k.k) >
xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4))
- return "value too big";
+ le16_to_cpu(xattr.v->x_val_len) + 4)) {
+ prt_printf(err, "value too big (%zu > %u)",
+ bkey_val_u64s(k.k),
+ xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len) + 4));
+ return -EINVAL;
+ }
handler = bch2_xattr_type_to_handler(xattr.v->x_type);
- if (!handler)
- return "invalid type";
+ if (!handler) {
+ prt_printf(err, "invalid type (%u)", xattr.v->x_type);
+ return -EINVAL;
+ }
- if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
- return "xattr name has invalid characters";
+ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) {
+ prt_printf(err, "xattr name has invalid characters");
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
handler = bch2_xattr_type_to_handler(xattr.v->x_type);
if (handler && handler->prefix)
- pr_buf(out, "%s", handler->prefix);
+ prt_printf(out, "%s", handler->prefix);
else if (handler)
- pr_buf(out, "(type %u)", xattr.v->x_type);
+ prt_printf(out, "(type %u)", xattr.v->x_type);
else
- pr_buf(out, "(unknown type %u)", xattr.v->x_type);
+ prt_printf(out, "(unknown type %u)", xattr.v->x_type);
- bch_scnmemcpy(out, xattr.v->x_name,
- xattr.v->x_name_len);
- pr_buf(out, ":");
- bch_scnmemcpy(out, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ prt_printf(out, "%.*s:%.*s",
+ xattr.v->x_name_len,
+ xattr.v->x_name,
+ le16_to_cpu(xattr.v->x_val_len),
+ (char *) xattr_val(xattr.v));
}
static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
if (ret)
goto err;
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs,
- SPOS(inum, offset, snapshot), 0, k, ret) {
- BUG_ON(k.k->p.inode < inum);
-
- if (k.k->p.inode > inum)
- break;
-
+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs,
+ SPOS(inum, offset, snapshot),
+ POS(inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_xattr)
continue;
offset = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
if (ret)
- return ret;
+ goto out;
ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
if (ret)
- return ret;
+ goto out;
ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
if (ret)
- return ret;
+ goto out;
return buf.used;
+out:
+ return bch2_err_class(ret);
}
static int bch2_xattr_get_handler(const struct xattr_handler *handler,
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret;
- return bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+ ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+ return bch2_err_class(ret);
}
static int bch2_xattr_set_handler(const struct xattr_handler *handler,
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+ int ret;
- return bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_xattr_set(&trans, inode_inum(inode), &hash,
name, value, size,
handler->flags, flags));
+ return bch2_err_class(ret);
}
static const struct xattr_handler bch_xattr_user_handler = {
bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
const struct bch_option *opt;
int id, inode_opt_id;
- char buf[512];
- struct printbuf out = PBUF(buf);
- unsigned val_len;
+ struct printbuf out = PRINTBUF;
+ int ret;
u64 v;
id = bch2_opt_lookup(name);
return -ENODATA;
v = bch2_opt_get_by_id(&opts, id);
- bch2_opt_to_text(&out, c, opt, v, 0);
+ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
- val_len = out.pos - buf;
+ ret = out.pos;
- if (buffer && val_len > size)
- return -ERANGE;
+ if (out.allocation_failure) {
+ ret = -ENOMEM;
+ } else if (buffer) {
+ if (out.pos > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, out.buf, out.pos);
+ }
- if (buffer)
- memcpy(buffer, buf, val_len);
- return val_len;
+ printbuf_exit(&out);
+ return ret;
}
static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
memcpy(buf, value, size);
buf[size] = '\0';
- ret = bch2_opt_parse(c, NULL, opt, buf, &v);
+ ret = bch2_opt_parse(c, opt, buf, &v, NULL);
kfree(buf);
if (ret < 0)
extern const struct bch_hash_desc bch2_xattr_hash_desc;
-const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_xattr (struct bkey_ops) { \
}
}
-void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
+static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
- /*
- * most users will be overriding ->bi_bdev with a new target,
- * so we don't set nor calculate new physical/hw segment counts here
- */
- bio->bi_bdev = bio_src->bi_bdev;
bio_set_flag(bio, BIO_CLONED);
- bio->bi_opf = bio_src->bi_opf;
+ bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_iter = bio_src->bi_iter;
- bio->bi_io_vec = bio_src->bi_io_vec;
+ return 0;
}
-struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
+struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
+ gfp_t gfp, struct bio_set *bs)
{
- struct bio *b;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);
+ if (!bio)
+ return NULL;
- b = bio_alloc_bioset(gfp_mask, 0, bs);
- if (!b)
+ if (__bio_clone(bio, bio_src, gfp) < 0) {
+ bio_put(bio);
return NULL;
+ }
+ bio->bi_io_vec = bio_src->bi_io_vec;
- __bio_clone_fast(b, bio);
- return b;
+ return bio;
}
struct bio *bio_split(struct bio *bio, int sectors,
BUG_ON(sectors <= 0);
BUG_ON(sectors >= bio_sectors(bio));
- /*
- * Discards need a mutable bio_vec to accommodate the payload
- * required by the DSM TRIM and UNMAP commands.
- */
- if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
- split = bio_clone_bioset(bio, gfp, bs);
- else
- split = bio_clone_fast(bio, gfp, bs);
-
+ split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
if (!split)
return NULL;
static void bio_free(struct bio *bio)
{
- unsigned front_pad = bio->bi_pool ? bio->bi_pool->front_pad : 0;
+ struct bio_set *bs = bio->bi_pool;
+
+ if (bs) {
+ if (bio->bi_max_vecs > BIO_INLINE_VECS)
+ mempool_free(bio->bi_io_vec, &bs->bvec_pool);
- kfree((void *) bio - front_pad);
+ mempool_free((void *) bio - bs->front_pad, &bs->bio_pool);
+ } else {
+ kfree(bio);
+ }
}
void bio_put(struct bio *bio)
bio->bi_end_io(bio);
}
-void bio_reset(struct bio *bio)
+void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
{
unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
memset(bio, 0, BIO_RESET_BYTES);
- bio->bi_flags = flags;
+ bio->bi_bdev = bdev;
+ bio->bi_opf = opf;
+ bio->bi_flags = flags;
atomic_set(&bio->__bi_remaining, 1);
}
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_kmalloc(unsigned int nr_iovecs, gfp_t gfp_mask)
{
- unsigned front_pad = bs ? bs->front_pad : 0;
struct bio *bio;
- void *p;
-
- p = kmalloc(front_pad +
- sizeof(struct bio) +
- nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
- if (unlikely(!p))
+ bio = kmalloc(sizeof(struct bio) +
+ sizeof(struct bio_vec) * nr_iovecs, gfp_mask);
+ if (unlikely(!bio))
return NULL;
+ bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, 0);
+ bio->bi_pool = NULL;
+ return bio;
+}
- bio = p + front_pad;
- bio_init(bio, bio->bi_inline_vecs, nr_iovecs);
- bio->bi_pool = bs;
+static struct bio_vec *bvec_alloc(mempool_t *pool, int *nr_vecs,
+ gfp_t gfp_mask)
+{
+ *nr_vecs = roundup_pow_of_two(*nr_vecs);
+ /*
+ * Try a slab allocation first for all smaller allocations. If that
+ * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
+ * The mempool is sized to handle up to BIO_MAX_VECS entries.
+ */
+ if (*nr_vecs < BIO_MAX_VECS) {
+ struct bio_vec *bvl;
- return bio;
+ bvl = kmalloc(sizeof(*bvl) * *nr_vecs, gfp_mask);
+ if (likely(bvl))
+ return bvl;
+ *nr_vecs = BIO_MAX_VECS;
+ }
+
+ return mempool_alloc(pool, gfp_mask);
}
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+struct bio *bio_alloc_bioset(struct block_device *bdev,
+ unsigned nr_iovecs,
+ unsigned opf,
+ gfp_t gfp_mask,
struct bio_set *bs)
{
- struct bvec_iter iter;
- struct bio_vec bv;
struct bio *bio;
+ void *p;
- bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
- if (!bio)
+ if (nr_iovecs > BIO_MAX_VECS)
+ return NULL;
+
+ p = mempool_alloc(&bs->bio_pool, gfp_mask);
+ if (unlikely(!p))
return NULL;
- bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_opf = bio_src->bi_opf;
- bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- break;
- case REQ_OP_WRITE_SAME:
- bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
- break;
- default:
- bio_for_each_segment(bv, bio_src, iter)
- bio->bi_io_vec[bio->bi_vcnt++] = bv;
- break;
+ bio = p + bs->front_pad;
+ if (nr_iovecs > BIO_INLINE_VECS) {
+ struct bio_vec *bvl = NULL;
+
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+ if (unlikely(!bvl))
+ goto err_free;
+
+ bio_init(bio, bdev, bvl, nr_iovecs, opf);
+ } else if (nr_iovecs) {
+ bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
+ } else {
+ bio_init(bio, bdev, NULL, 0, opf);
}
+ bio->bi_pool = bs;
return bio;
+
+err_free:
+ mempool_free(p, &bs->bio_pool);
+ return NULL;
+}
+
+void bioset_exit(struct bio_set *bs)
+{
+ mempool_exit(&bs->bio_pool);
+ mempool_exit(&bs->bvec_pool);
+}
+
+int bioset_init(struct bio_set *bs,
+ unsigned int pool_size,
+ unsigned int front_pad,
+ int flags)
+{
+ int ret;
+
+ bs->front_pad = front_pad;
+ if (flags & BIOSET_NEED_BVECS)
+ bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
+ else
+ bs->back_pad = 0;
+
+ ret = mempool_init_kmalloc_pool(&bs->bio_pool, pool_size, bs->front_pad +
+ sizeof(struct bio) + bs->back_pad) ?:
+ mempool_init_kmalloc_pool(&bs->bvec_pool, pool_size,
+ sizeof(struct bio_vec) * BIO_MAX_VECS);
+ if (ret)
+ bioset_exit(bs);
+ return ret;
}
int blkdev_issue_discard(struct block_device *bdev,
sector_t sector, sector_t nr_sects,
- gfp_t gfp_mask, unsigned long flags)
+ gfp_t gfp_mask)
{
return 0;
}
BUG_ON(ret);
if (!S_ISBLK(statbuf.st_mode))
- return statbuf.st_blksize >> 9;
+ return statbuf.st_blksize;
- ret = ioctl(bdev->bd_fd, BLKPBSZGET, &blksize);
- BUG_ON(ret);
-
- return blksize >> 9;
+ xioctl(bdev->bd_fd, BLKPBSZGET, &blksize);
+ return blksize;
}
sector_t get_capacity(struct gendisk *disk)
void *holder)
{
struct block_device *bdev;
- int fd, sync_fd, flags = O_DIRECT;
+ int fd, sync_fd, buffered_fd, flags = 0;
if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE))
flags = O_RDWR;
flags |= O_EXCL;
#endif
- fd = open(path, flags);
+ fd = open(path, flags|O_DIRECT);
if (fd < 0)
return ERR_PTR(-errno);
- sync_fd = open(path, flags|O_SYNC);
- if (sync_fd < 0) {
- assert(0);
- close(fd);
- return ERR_PTR(-errno);
- }
+ sync_fd = xopen(path, flags|O_DIRECT|O_SYNC);
+ buffered_fd = xopen(path, flags);
bdev = malloc(sizeof(*bdev));
memset(bdev, 0, sizeof(*bdev));
bdev->bd_dev = xfstat(fd).st_rdev;
bdev->bd_fd = fd;
bdev->bd_sync_fd = sync_fd;
+ bdev->bd_buffered_fd = buffered_fd;
bdev->bd_holder = holder;
bdev->bd_disk = &bdev->__bd_disk;
bdev->bd_disk->bdi = &bdev->bd_disk->__bdi;
#include <linux/export.h>
#include <linux/generic-radix-tree.h>
#include <linux/gfp.h>
+#include <linux/kmemleak.h>
#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
-unsigned genradix_root_to_depth(struct genradix_root *r)
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
return (unsigned long) r & GENRADIX_DEPTH_MASK;
}
-struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}
}
EXPORT_SYMBOL(__genradix_ptr);
+static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
+{
+ struct genradix_node *node;
+
+ node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO);
+
+ /*
+ * We're using pages (not slab allocations) directly for kernel data
+ * structures, so we need to explicitly inform kmemleak of them in order
+ * to avoid false positive memory leak reports.
+ */
+ kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask);
+ return node;
+}
+
+static inline void genradix_free_node(struct genradix_node *node)
+{
+ kmemleak_free(node);
+ free_page((unsigned long)node);
+}
+
/*
* Returns pointer to the specified byte @offset within @radix, allocating it if
* necessary - newly allocated slots are always zeroed out:
break;
if (!new_node) {
- new_node = (void *)
- __get_free_page(gfp_mask|__GFP_ZERO);
+ new_node = genradix_alloc_node(gfp_mask);
if (!new_node)
return NULL;
}
n = READ_ONCE(*p);
if (!n) {
if (!new_node) {
- new_node = (void *)
- __get_free_page(gfp_mask|__GFP_ZERO);
+ new_node = genradix_alloc_node(gfp_mask);
if (!new_node)
return NULL;
}
}
if (new_node)
- free_page((unsigned long) new_node);
+ genradix_free_node(new_node);
return &n->data[offset];
}
}
EXPORT_SYMBOL(__genradix_iter_peek);
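+
+/*
+ * Returns a pointer to the last allocated entry at or preceding the
+ * iterator's current position, skipping over unallocated regions, or NULL if
+ * no such entry exists:
+ */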
+void *__genradix_iter_peek_prev(struct genradix_iter *iter,
+ struct __genradix *radix,
+ size_t objs_per_page,
+ size_t obj_size_plus_page_remainder)
+{
+ struct genradix_root *r;
+ struct genradix_node *n;
+ unsigned level, i;
+
+ if (iter->offset == SIZE_MAX)
+ return NULL;
+
+restart:
+ r = READ_ONCE(radix->root);
+ if (!r)
+ return NULL;
+
+ n = genradix_root_to_node(r);
+ level = genradix_root_to_depth(r);
+
+ if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
+ iter->offset = genradix_depth_size(level);
+ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+ iter->offset -= obj_size_plus_page_remainder;
+ iter->pos--;
+ }
+
+ while (level) {
+ level--;
+
+ i = (iter->offset >> genradix_depth_shift(level)) &
+ (GENRADIX_ARY - 1);
+
+ while (!n->children[i]) {
+ size_t objs_per_ptr = genradix_depth_size(level);
+
+ iter->offset = round_down(iter->offset, objs_per_ptr);
+ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+ if (!iter->offset)
+ return NULL;
+
+ iter->offset -= obj_size_plus_page_remainder;
+ iter->pos--;
+
+ if (!i)
+ goto restart;
+ --i;
+ }
+
+ n = n->children[i];
+ }
+
+ return &n->data[iter->offset & (PAGE_SIZE - 1)];
+}
+EXPORT_SYMBOL(__genradix_iter_peek_prev);
+
static void genradix_free_recurse(struct genradix_node *n, unsigned level)
{
if (level) {
genradix_free_recurse(n->children[i], level - 1);
}
- free_page((unsigned long) n);
+ genradix_free_node(n);
}
int __genradix_prealloc(struct __genradix *radix, size_t size,
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com>
+ *
+ * Based on the shift-and-subtract algorithm for computing integer
+ * square root from Guy L. Steele.
+ */
+
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+
+/**
+ * int_sqrt - computes the integer square root
+ * @x: integer of which to calculate the sqrt
+ *
+ * Computes: floor(sqrt(x))
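+ *
+ * e.g. int_sqrt(16) == 4 and int_sqrt(17) == 4.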
+ */
+unsigned long int_sqrt(unsigned long x)
+{
+ unsigned long b, m, y = 0;
+
+ if (x <= 1)
+ return x;
+
+ m = 1UL << (__fls(x) & ~1UL);
+ while (m != 0) {
+ b = y + m;
+ y >>= 1;
+
+ if (x >= b) {
+ x -= b;
+ y += m;
+ }
+ m >>= 2;
+ }
+
+ return y;
+}
+EXPORT_SYMBOL(int_sqrt);
+
+#if BITS_PER_LONG < 64
+/**
+ * int_sqrt64 - strongly typed int_sqrt function for use when a 64 bit input
+ * is expected.
+ * @x: 64bit integer of which to calculate the sqrt
+ */
+u32 int_sqrt64(u64 x)
+{
+ u64 b, m, y = 0;
+
+ if (x <= ULONG_MAX)
+ return int_sqrt((unsigned long) x);
+
+ m = 1ULL << ((fls64(x) - 1) & ~1ULL);
+ while (m != 0) {
+ b = y + m;
+ y >>= 1;
+
+ if (x >= b) {
+ x -= b;
+ y += m;
+ }
+ m >>= 2;
+ }
+
+ return y;
+}
+EXPORT_SYMBOL(int_sqrt64);
+#endif
p->thread_fn = thread_fn;
p->thread_data = thread_data;
p->state = TASK_UNINTERRUPTIBLE;
+ p->signal = &p->_signal;
atomic_set(&p->usage, 1);
init_completion(&p->exited);
+ init_rwsem(&p->_signal.exec_update_lock);
pthread_attr_t attr;
pthread_attr_init(&attr);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions for incremental mean and variance.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Copyright © 2022 Daniel B. Hill
+ *
+ * Author: Daniel B. Hill <daniel@gluo.nz>
+ *
+ * Description:
+ *
+ * This includes some incremental algorithms for mean and variance calculation
+ *
+ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ *
+ * Create a struct and, if it's the weighted variant, set the w field (weight = 2^k).
+ *
+ * Use mean_and_variance[_weighted]_update() on the struct to update its state.
+ *
+ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean
+ * and variance; some computation is deferred to these functions for
+ * performance reasons.
+ *
+ * see lib/math/mean_and_variance_test.c for examples of usage.
+ *
+ * DO NOT access the mean and variance fields of the weighted variants directly.
+ * DO NOT change the weight after calling update.
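+ *
+ * Illustrative usage (a sketch; the sample values are made up):
+ *
+ *	struct mean_and_variance mv = {};
+ *	s64 samples[] = { 10, 20, 30 }, i;
+ *
+ *	for (i = 0; i < 3; i++)
+ *		mv = mean_and_variance_update(mv, samples[i]);
+ *
+ *	pr_info("mean %lli stddev %u\n",
+ *		mean_and_variance_get_mean(mv),
+ *		mean_and_variance_get_stddev(mv));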
+ */
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+#include <linux/mean_and_variance.h>
+#include <linux/module.h>
+#include <linux/printbuf.h>
+
+
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
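+ * e.g. fast_divpow2(-7, 1) == -3 (matching C's truncating -7 / 2), where a
+ * plain arithmetic shift would give -4.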
+ */
+inline s64 fast_divpow2(s64 n, u8 d)
+{
+ return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
+
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1
+ * and return it.
+ * @s1: the mean_and_variance to update.
+ * @v1: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
+struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1)
+{
+ struct mean_and_variance s2;
+ u64 v2 = abs(v1);
+
+ s2.n = s1.n + 1;
+ s2.sum = s1.sum + v1;
+ s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2));
+ return s2;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_update);
+
+/**
+ * mean_and_variance_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_get_mean(struct mean_and_variance s)
+{
+ return div64_u64(s.sum, s.n);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
+
+/**
+ * mean_and_variance_get_variance() - get variance from @s1
+ *
+ * see linked pdf equation 12.
+ */
+u64 mean_and_variance_get_variance(struct mean_and_variance s1)
+{
+ u128 s2 = u128_div(s1.sum_squares, s1.n);
+ u64 s3 = abs(mean_and_variance_get_mean(s1));
+
+ return u128_to_u64(u128_sub(s2, u128_square(s3)));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
+
+/**
+ * mean_and_variance_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_get_stddev(struct mean_and_variance s)
+{
+ return int_sqrt64(mean_and_variance_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
+
+/**
+ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
+ * @s1: the mean_and_variance_weighted to update.
+ * @x: the new sample.
+ *
+ * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
+ * values are stored bitshifted for performance and added precision.
+ */
+struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1,
+ s64 x)
+{
+ struct mean_and_variance_weighted s2;
+ // previous weighted variance.
+ u64 var_w0 = s1.variance;
+ u8 w = s2.w = s1.w;
+ // new value weighted.
+ s64 x_w = x << w;
+ s64 diff_w = x_w - s1.mean;
+ s64 diff = fast_divpow2(diff_w, w);
+ // new mean weighted.
+ s64 u_w1 = s1.mean + diff;
+
+ BUG_ON(w % 2 != 0);
+
+ if (!s1.init) {
+ s2.mean = x_w;
+ s2.variance = 0;
+ } else {
+ s2.mean = u_w1;
+ s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+ }
+ s2.init = true;
+
+ return s2;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
+
+/**
+ * mean_and_variance_weighted_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+{
+ return fast_divpow2(s.mean, s.w);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
+
+/**
+ * mean_and_variance_weighted_get_variance() - get variance from @s
+ */
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+{
+	// always positive, so we don't need fast_divpow2()
+ return s.variance >> s.w;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
+
+/**
+ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+{
+ return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
--- /dev/null
+// SPDX-License-Identifier: LGPL-2.1+
+/* Copyright (C) 2022 Kent Overstreet */
+
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/printbuf.h>
+#include <linux/pretty-printers.h>
+
+/**
+ * prt_string_option - Given a list of strings, print out the list and indicate
+ * which option is selected, with square brackets (sysfs style)
+ *
+ * @out: The printbuf to output to
+ * @list: List of strings to choose from
+ * @selected: The option to highlight, with square brackets
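+ *
+ * e.g. a NULL terminated list { "lru", "fifo" } with @selected == 0 prints
+ * "[lru] fifo".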
+ */
+void prt_string_option(struct printbuf *out,
+ const char * const list[],
+ size_t selected)
+{
+ size_t i;
+
+ for (i = 0; list[i]; i++) {
+ if (i)
+ prt_char(out, ' ');
+ if (i == selected)
+ prt_char(out, '[');
+ prt_str(out, list[i]);
+ if (i == selected)
+ prt_char(out, ']');
+ }
+}
+EXPORT_SYMBOL(prt_string_option);
+
+/**
+ * prt_bitflags - Given a bitmap and a list of names for each bit, print out which
+ * bits are on, comma separated
+ *
+ * @out: The printbuf to output to
+ * @list: List of names for each bit
+ * @flags: Bits to print
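+ *
+ * e.g. with @list = { "read", "write", "exec", NULL } and @flags == 0x5,
+ * this prints "read,exec".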
+ */
+void prt_bitflags(struct printbuf *out,
+ const char * const list[], u64 flags)
+{
+ unsigned bit, nr = 0;
+ bool first = true;
+
+ while (list[nr])
+ nr++;
+
+ while (flags && (bit = __ffs(flags)) < nr) {
+ if (!first)
+ prt_char(out, ',');
+ first = false;
+ prt_str(out, list[bit]);
+		flags ^= 1ULL << bit;
+ }
+}
+EXPORT_SYMBOL(prt_bitflags);
--- /dev/null
+// SPDX-License-Identifier: LGPL-2.1+
+/* Copyright (C) 2022 Kent Overstreet */
+
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/printbuf.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+static inline unsigned printbuf_linelen(struct printbuf *buf)
+{
+ return buf->pos - buf->last_newline;
+}
+
+int printbuf_make_room(struct printbuf *out, unsigned extra)
+{
+ unsigned new_size;
+ char *buf;
+
+ if (!out->heap_allocated)
+ return 0;
+
+ /* Reserved space for terminating nul: */
+ extra += 1;
+
+ if (out->pos + extra < out->size)
+ return 0;
+
+ new_size = roundup_pow_of_two(out->size + extra);
+
+ /*
+	 * Note: output buffer must be freeable with kfree(); it's not required
+ * that the user use printbuf_exit().
+ */
+ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+
+ if (!buf) {
+ out->allocation_failure = true;
+ return -ENOMEM;
+ }
+
+ out->buf = buf;
+ out->size = new_size;
+ return 0;
+}
+EXPORT_SYMBOL(printbuf_make_room);
+
+/**
+ * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null
+ * terminated
+ */
+const char *printbuf_str(const struct printbuf *buf)
+{
+ /*
+ * If we've written to a printbuf then it's guaranteed to be a null
+ * terminated string - but if we haven't, then we might not have
+ * allocated a buffer at all:
+ */
+ return buf->pos
+ ? buf->buf
+ : "";
+}
+EXPORT_SYMBOL(printbuf_str);
+
+/**
+ * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it
+ * against accidental use.
+ */
+void printbuf_exit(struct printbuf *buf)
+{
+ if (buf->heap_allocated) {
+ kfree(buf->buf);
+ buf->buf = ERR_PTR(-EINTR); /* poison value */
+ }
+}
+EXPORT_SYMBOL(printbuf_exit);
+
+void printbuf_tabstops_reset(struct printbuf *buf)
+{
+ buf->nr_tabstops = 0;
+}
+EXPORT_SYMBOL(printbuf_tabstops_reset);
+
+void printbuf_tabstop_pop(struct printbuf *buf)
+{
+ if (buf->nr_tabstops)
+ --buf->nr_tabstops;
+}
+EXPORT_SYMBOL(printbuf_tabstop_pop);
+
+/*
+ * printbuf_tabstop_push - add a tabstop, n spaces from the previous tabstop
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces from previous tabstop
+ *
+ * In the future this function may allocate memory if setting more than
+ * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
+ * of line.
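+ *
+ * e.g. printbuf_tabstop_push(buf, 16) followed by printbuf_tabstop_push(buf, 8)
+ * sets tabstops 16 and 24 spaces from the start of the line.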
+ */
+int printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
+{
+ unsigned prev_tabstop = buf->nr_tabstops
+ ? buf->_tabstops[buf->nr_tabstops - 1]
+ : 0;
+
+ if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
+ return -EINVAL;
+
+ buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
+ buf->has_indent_or_tabstops = true;
+ return 0;
+}
+EXPORT_SYMBOL(printbuf_tabstop_push);
+
+/**
+ * printbuf_indent_add - add to the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to add to the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces more spaces.
+ */
+void printbuf_indent_add(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
+ spaces = 0;
+
+ buf->indent += spaces;
+ prt_chars(buf, ' ', spaces);
+
+ buf->has_indent_or_tabstops = true;
+}
+EXPORT_SYMBOL(printbuf_indent_add);
+
+/**
+ * printbuf_indent_sub - subtract from the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to subtract from the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces less spaces.
+ */
+void printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(spaces > buf->indent))
+ spaces = buf->indent;
+
+ if (buf->last_newline + buf->indent == buf->pos) {
+ buf->pos -= spaces;
+ printbuf_nul_terminate(buf);
+ }
+ buf->indent -= spaces;
+
+ if (!buf->indent && !buf->nr_tabstops)
+ buf->has_indent_or_tabstops = false;
+}
+EXPORT_SYMBOL(printbuf_indent_sub);
+
+void prt_newline(struct printbuf *buf)
+{
+ unsigned i;
+
+ printbuf_make_room(buf, 1 + buf->indent);
+
+ __prt_char(buf, '\n');
+
+ buf->last_newline = buf->pos;
+
+ for (i = 0; i < buf->indent; i++)
+ __prt_char(buf, ' ');
+
+ printbuf_nul_terminate(buf);
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop = 0;
+}
+EXPORT_SYMBOL(prt_newline);
+
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+ return buf->cur_tabstop < buf->nr_tabstops
+ ? buf->_tabstops[buf->cur_tabstop]
+ : 0;
+}
+
+static void __prt_tab(struct printbuf *out)
+{
+ int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
+
+ prt_chars(out, ' ', spaces);
+
+ out->last_field = out->pos;
+ out->cur_tabstop++;
+}
+
+/**
+ * prt_tab - Advance printbuf to the next tabstop
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by printing spaces.
+ */
+void prt_tab(struct printbuf *out)
+{
+ if (WARN_ON(!cur_tabstop(out)))
+ return;
+
+ __prt_tab(out);
+}
+EXPORT_SYMBOL(prt_tab);
+
+static void __prt_tab_rjust(struct printbuf *buf)
+{
+ unsigned move = buf->pos - buf->last_field;
+ int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
+
+ if (pad > 0) {
+ printbuf_make_room(buf, pad);
+
+ if (buf->last_field + pad < buf->size)
+ memmove(buf->buf + buf->last_field + pad,
+ buf->buf + buf->last_field,
+ min(move, buf->size - 1 - buf->last_field - pad));
+
+ if (buf->last_field < buf->size)
+ memset(buf->buf + buf->last_field, ' ',
+ min((unsigned) pad, buf->size - buf->last_field));
+
+ buf->pos += pad;
+ printbuf_nul_terminate(buf);
+ }
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop++;
+}
+
+/**
+ * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
+ * previous output
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by inserting spaces immediately after the
+ * previous tabstop, right justifying previously outputted text.
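+ *
+ * e.g. with a tabstop at column 8, printing "42" and then calling
+ * prt_tab_rjust() yields "      42" - the padding is inserted before the
+ * previously printed text.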
+ */
+void prt_tab_rjust(struct printbuf *buf)
+{
+ if (WARN_ON(!cur_tabstop(buf)))
+ return;
+
+ __prt_tab_rjust(buf);
+}
+EXPORT_SYMBOL(prt_tab_rjust);
+
+/**
+ * prt_bytes_indented - Print an array of chars, handling embedded control characters
+ *
+ * @out: printbuf to output to
+ * @str: string to print
+ * @count: number of bytes to print
+ *
+ * The following control characters are handled as follows:
+ * \n: prt_newline newline that obeys current indent level
+ * \t: prt_tab advance to next tabstop
+ * \r: prt_tab_rjust advance to next tabstop, with right justification
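+ *
+ * e.g. with a tabstop pushed at 16, printing "name:\tvalue\n" emits "name:",
+ * pads with spaces to column 16, emits "value", then starts a new line at the
+ * current indent level.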
+ */
+void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
+{
+ const char *unprinted_start = str;
+ const char *end = str + count;
+
+ if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) {
+ prt_bytes(out, str, count);
+ return;
+ }
+
+ while (str != end) {
+ switch (*str) {
+ case '\n':
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ prt_newline(out);
+ break;
+ case '\t':
+ if (likely(cur_tabstop(out))) {
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ __prt_tab(out);
+ }
+ break;
+ case '\r':
+ if (likely(cur_tabstop(out))) {
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ __prt_tab_rjust(out);
+ }
+ break;
+ }
+
+ str++;
+ }
+
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+}
+EXPORT_SYMBOL(prt_bytes_indented);
+
+/**
+ * prt_human_readable_u64 - Print out a u64 in human readable units
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ */
+void prt_human_readable_u64(struct printbuf *buf, u64 v)
+{
+ printbuf_make_room(buf, 10);
+ buf->pos += string_get_size(v, 1, !buf->si_units,
+ buf->buf + buf->pos,
+ printbuf_remaining_size(buf));
+}
+EXPORT_SYMBOL(prt_human_readable_u64);
+
+/**
+ * prt_human_readable_s64 - Print out a s64 in human readable units
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ */
+void prt_human_readable_s64(struct printbuf *buf, s64 v)
+{
+ if (v < 0)
+ prt_char(buf, '-');
+ prt_human_readable_u64(buf, abs(v));
+}
+EXPORT_SYMBOL(prt_human_readable_s64);
+
+/**
+ * prt_units_u64 - Print out a u64 according to printbuf unit options
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void prt_units_u64(struct printbuf *out, u64 v)
+{
+ if (out->human_readable_units)
+ prt_human_readable_u64(out, v);
+ else
+ prt_printf(out, "%llu", v);
+}
+EXPORT_SYMBOL(prt_units_u64);
+
+/**
+ * prt_units_s64 - Print out a s64 according to printbuf unit options
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void prt_units_s64(struct printbuf *out, s64 v)
+{
+ if (v < 0)
+ prt_char(out, '-');
+ prt_units_u64(out, abs(v));
+}
+EXPORT_SYMBOL(prt_units_s64);
--- /dev/null
+
+#include <stdio.h>
+#include <linux/printbuf.h>
+
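+/*
+ * printf into a printbuf, growing the (heap allocated) output buffer with
+ * printbuf_make_room() until the formatted string fits:
+ */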
+void prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
+{
+ int len;
+
+ do {
+ va_list args2;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ } while (len + 1 >= printbuf_remaining(out) &&
+ !printbuf_make_room(out, len + 1));
+
+ len = min_t(size_t, len,
+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+ out->pos += len;
+}
+
+void prt_printf(struct printbuf *out, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ prt_vprintf(out, fmt, args);
+ va_end(args);
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ratelimit.c - Do something with rate limit.
+ *
+ * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
+ *
+ * 2008-05-01 rewrite the function and use a ratelimit_state data struct as
+ * parameter. Now every user can use their own standalone ratelimit_state.
+ */
+
+#include <linux/ratelimit.h>
+#include <linux/jiffies.h>
+#include <linux/export.h>
+
+/*
+ * __ratelimit - rate limiting
+ * @rs: ratelimit_state data
+ * @func: name of calling function
+ *
+ * This enforces a rate limit: not more than @rs->burst callbacks
+ * in every @rs->interval
+ *
+ * RETURNS:
+ * 0 means callbacks will be suppressed.
+ * 1 means go ahead and do it.
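+ *
+ * Typical use (sketch): pair a static DEFINE_RATELIMIT_STATE(rs, 5 * HZ, 10)
+ * with "if (__ratelimit(&rs)) printk(...)" to allow at most 10 messages per
+ * 5 second interval.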
+ */
+int ___ratelimit(struct ratelimit_state *rs, const char *func)
+{
+ int ret;
+
+ if (!rs->interval)
+ return 1;
+
+ /*
+ * If we contend on this state's lock then almost
+ * by definition we are too busy to print a message,
+ * in addition to the one that will be printed by
+ * the entity that is holding the lock already:
+ */
+ if (!raw_spin_trylock(&rs->lock))
+ return 0;
+
+ if (!rs->begin)
+ rs->begin = jiffies;
+
+ if (time_is_before_jiffies(rs->begin + rs->interval)) {
+ if (rs->missed) {
+ if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
+ printk(KERN_WARNING
+ "%s: %d callbacks suppressed\n",
+ func, rs->missed);
+ rs->missed = 0;
+ }
+ }
+ rs->begin = jiffies;
+ rs->printed = 0;
+ }
+ if (rs->burst && rs->burst > rs->printed) {
+ rs->printed++;
+ ret = 1;
+ } else {
+ rs->missed++;
+ ret = 0;
+ }
+ raw_spin_unlock(&rs->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(___ratelimit);
#include <stdio.h>
#include <linux/list.h>
+#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/shrinker.h>
static LIST_HEAD(shrinker_list);
static DEFINE_MUTEX(shrinker_lock);
-int register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
mutex_lock(&shrinker_lock);
list_add_tail(&shrinker->list, &shrinker_list);
return v << 10;
}
-static struct meminfo read_meminfo(void)
+void si_meminfo(struct sysinfo *val)
{
- struct meminfo ret = { 0 };
size_t len, n = 0;
char *line = NULL;
const char *v;
FILE *f;
+ memset(val, 0, sizeof(*val));
+ val->mem_unit = 1;
+
f = fopen("/proc/meminfo", "r");
if (!f)
- return ret;
+ return;
while ((len = getline(&line, &n, f)) != -1) {
if ((v = strcmp_prefix(line, "MemTotal:")))
- ret.total = parse_meminfo_line(v);
+ val->totalram = parse_meminfo_line(v);
if ((v = strcmp_prefix(line, "MemAvailable:")))
- ret.available = parse_meminfo_line(v);
+ val->freeram = parse_meminfo_line(v);
}
fclose(f);
free(line);
+}
+
+static void run_shrinkers_allocation_failed(gfp_t gfp_mask)
+{
+ struct shrinker *shrinker;
+
+ mutex_lock(&shrinker_lock);
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ struct shrink_control sc = { .gfp_mask = gfp_mask, };
+
+ unsigned long have = shrinker->count_objects(shrinker, &sc);
- return ret;
+ sc.nr_to_scan = have / 8;
+
+ shrinker->scan_objects(shrinker, &sc);
+ }
+ mutex_unlock(&shrinker_lock);
}
-void run_shrinkers(void)
+void run_shrinkers(gfp_t gfp_mask, bool allocation_failed)
{
struct shrinker *shrinker;
- struct meminfo info;
+ struct sysinfo info;
s64 want_shrink;
+ if (!(gfp_mask & GFP_KERNEL))
+ return;
+
/* Fast out if there are no shrinkers to run. */
if (list_empty(&shrinker_list))
return;
- info = read_meminfo();
+ if (allocation_failed) {
+ run_shrinkers_allocation_failed(gfp_mask);
+ return;
+ }
+
+ si_meminfo(&info);
- if (info.total && info.available) {
- want_shrink = (info.total >> 2) - info.available;
+ if (info.totalram && info.freeram) {
+ want_shrink = (info.totalram >> 2) - info.freeram;
if (want_shrink <= 0)
return;
mutex_lock(&shrinker_lock);
list_for_each_entry(shrinker, &shrinker_list, list) {
struct shrink_control sc = {
- .nr_to_scan = want_shrink >> PAGE_SHIFT
+ .gfp_mask = gfp_mask,
+ .nr_to_scan = want_shrink >> PAGE_SHIFT
};
shrinker->scan_objects(shrinker, &sc);
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
#include <linux/six.h>
#include <linux/slab.h>
#define EBUG_ON(cond) do {} while (0)
#endif
-#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
+#define six_acquire(l, t, r) lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_)
#define six_release(l) lock_release(l, _RET_IP_)
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
+
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */
u64 lock_val;
}
static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
- union six_lock_state old)
+ union six_lock_state old,
+ struct task_struct *owner)
{
if (type != SIX_LOCK_intent)
return;
if (!old.intent_lock) {
EBUG_ON(lock->owner);
- lock->owner = current;
+ lock->owner = owner;
} else {
EBUG_ON(lock->owner != current);
}
return read_count;
}
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
-};
-
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-static inline void six_lock_wakeup(struct six_lock *lock,
- union six_lock_state state,
- unsigned waitlist_id)
-{
- if (waitlist_id == SIX_LOCK_write) {
- if (state.write_locking && !state.read_lock) {
- struct task_struct *p = READ_ONCE(lock->owner);
- if (p)
- wake_up_process(p);
- }
- } else {
- struct list_head *wait_list = &lock->wait_list[waitlist_id];
- struct six_lock_waiter *w, *next;
-
- if (!(state.waiters & (1 << waitlist_id)))
- return;
-
- clear_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
-
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, wait_list, list) {
- list_del_init(&w->list);
-
- if (wake_up_process(w->task) &&
- waitlist_id != SIX_LOCK_read) {
- if (!list_empty(wait_list))
- set_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
- break;
- }
- }
-
- raw_spin_unlock(&lock->wait_lock);
- }
-}
-
-static __always_inline bool do_six_trylock_type(struct six_lock *lock,
- enum six_lock_type type,
- bool try)
+static int __do_six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type,
+ struct task_struct *task,
+ bool try)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
- bool ret;
+ int ret;
u64 v;
- EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+ EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
-
EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
/*
*/
if (type == SIX_LOCK_read && lock->readers) {
-retry:
preempt_disable();
this_cpu_inc(*lock->readers); /* signal that we own lock */
* lock, issue a wakeup because we might have caused a
* spurious trylock failure:
*/
- if (old.write_locking) {
- struct task_struct *p = READ_ONCE(lock->owner);
-
- if (p)
- wake_up_process(p);
- }
-
- /*
- * If we failed from the lock path and the waiting bit wasn't
- * set, set it:
- */
- if (!try && !ret) {
- v = old.v;
-
- do {
- new.v = old.v = v;
-
- if (!(old.v & l[type].lock_fail))
- goto retry;
-
- if (new.waiters & (1 << type))
- break;
-
- new.waiters |= 1 << type;
- } while ((v = atomic64_cmpxchg(&lock->state.counter,
- old.v, new.v)) != old.v);
- }
+ if (old.write_locking)
+ ret = -1 - SIX_LOCK_write;
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
atomic64_add(__SIX_VAL(write_locking, 1),
&lock->state.counter);
smp_mb__after_atomic();
+ } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) {
+ atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write),
+ &lock->state.counter);
+ /*
+ * pairs with barrier after unlock and before checking
+ * for readers in unlock path
+ */
+ smp_mb__after_atomic();
}
ret = !pcpu_read_count(lock);
if (try && !ret) {
old.v = atomic64_add_return(v, &lock->state.counter);
- six_lock_wakeup(lock, old, SIX_LOCK_read);
+ if (old.waiters & (1 << SIX_LOCK_read))
+ ret = -1 - SIX_LOCK_read;
} else {
atomic64_add(v, &lock->state.counter);
}
if (type == SIX_LOCK_write)
new.write_locking = 0;
- } else if (!try && type != SIX_LOCK_write &&
- !(new.waiters & (1 << type)))
+ } else if (!try && !(new.waiters & (1 << type)))
new.waiters |= 1 << type;
else
break; /* waiting bit already set */
EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
}
- if (ret)
- six_set_owner(lock, type, old);
+ if (ret > 0)
+ six_set_owner(lock, type, old, task);
- EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
+ EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking));
return ret;
}
+static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+{
+ struct six_lock_waiter *w, *next;
+ struct task_struct *task;
+ bool saw_one;
+ int ret;
+again:
+ ret = 0;
+ saw_one = false;
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry_safe(w, next, &lock->wait_list, list) {
+ if (w->lock_want != lock_type)
+ continue;
+
+ if (saw_one && lock_type != SIX_LOCK_read)
+ goto unlock;
+ saw_one = true;
+
+ ret = __do_six_trylock_type(lock, lock_type, w->task, false);
+ if (ret <= 0)
+ goto unlock;
+
+ __list_del(w->list.prev, w->list.next);
+ task = w->task;
+ /*
+ * Do no writes to @w besides setting lock_acquired - otherwise
+ * we would need a memory barrier:
+ */
+ barrier();
+ w->lock_acquired = true;
+ wake_up_process(task);
+ }
+
+ clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v);
+unlock:
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (ret < 0) {
+ lock_type = -ret - 1;
+ goto again;
+ }
+}
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+ union six_lock_state state,
+ enum six_lock_type lock_type)
+{
+ if (lock_type == SIX_LOCK_write && state.read_lock)
+ return;
+
+ if (!(state.waiters & (1 << lock_type)))
+ return;
+
+ __six_lock_wakeup(lock, lock_type);
+}
+
+static bool do_six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type,
+ bool try)
+{
+ int ret;
+
+ ret = __do_six_trylock_type(lock, type, current, try);
+ if (ret < 0)
+ __six_lock_wakeup(lock, -ret - 1);
+
+ return ret > 0;
+}
+
__always_inline __flatten
static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
{
return false;
if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 1);
+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read);
return true;
}
* Similar to the lock path, we may have caused a spurious write
* lock fail and need to issue a wakeup:
*/
- if (old.write_locking) {
- struct task_struct *p = READ_ONCE(lock->owner);
-
- if (p)
- wake_up_process(p);
- }
+ if (old.write_locking)
+ six_lock_wakeup(lock, old, SIX_LOCK_write);
if (ret)
- six_acquire(&lock->dep_map, 1);
+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read);
return ret;
}
old.v,
old.v + l[type].lock_val)) != old.v);
- six_set_owner(lock, type, old);
+ six_set_owner(lock, type, old, current);
if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 1);
+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read);
return true;
}
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
-static inline int six_can_spin_on_owner(struct six_lock *lock)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait)
{
- struct task_struct *owner;
- int retval = 1;
+ struct task_struct *owner, *task = current;
- if (need_resched())
- return 0;
+ switch (wait->lock_want) {
+ case SIX_LOCK_read:
+ break;
+ case SIX_LOCK_intent:
+ if (lock->wait_list.next != &wait->list)
+ return false;
+ break;
+ case SIX_LOCK_write:
+ return false;
+ }
rcu_read_lock();
owner = READ_ONCE(lock->owner);
- if (owner)
- retval = owner->on_cpu;
- rcu_read_unlock();
- /*
- * if lock->owner is not set, the mutex owner may have just acquired
- * it and not set the owner yet or the mutex has been released.
- */
- return retval;
-}
-
-static inline bool six_spin_on_owner(struct six_lock *lock,
- struct task_struct *owner)
-{
- bool ret = true;
- rcu_read_lock();
- while (lock->owner == owner) {
+ while (owner && lock->owner == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
* checking lock->owner still matches owner. If that fails,
*/
barrier();
- if (!owner->on_cpu || need_resched()) {
- ret = false;
- break;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
- struct task_struct *task = current;
-
- if (type == SIX_LOCK_write)
- return false;
-
- preempt_disable();
- if (!six_can_spin_on_owner(lock))
- goto fail;
-
- if (!osq_lock(&lock->osq))
- goto fail;
-
- while (1) {
- struct task_struct *owner;
-
/*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
- */
- owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner))
- break;
-
- if (do_six_trylock_type(lock, type, false)) {
- osq_unlock(&lock->osq);
- preempt_enable();
- return true;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
+		 * If we're an RT task we could live-lock, because we won't let
* the owner complete.
*/
- if (!owner && (need_resched() || rt_task(task)))
+ if (wait->lock_acquired ||
+ !owner->on_cpu ||
+ rt_task(task) ||
+ need_resched())
break;
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
cpu_relax();
}
+ rcu_read_unlock();
- osq_unlock(&lock->osq);
-fail:
- preempt_enable();
-
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock again. This avoids getting
- * scheduled out right after we obtained the lock.
- */
- if (need_resched())
- schedule();
-
- return false;
+ return wait->lock_acquired;
}
#else /* CONFIG_LOCK_SPIN_ON_OWNER */
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait)
{
return false;
}
noinline
static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
union six_lock_state old;
- struct six_lock_waiter wait;
int ret = 0;
if (type == SIX_LOCK_write) {
smp_mb__after_atomic();
}
- ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (ret)
- goto out_before_sleep;
+ lock_contended(&lock->dep_map, _RET_IP_);
- if (six_optimistic_spin(lock, type))
- goto out_before_sleep;
+ wait->task = current;
+ wait->lock_want = type;
+ wait->lock_acquired = false;
- lock_contended(&lock->dep_map, _RET_IP_);
+ raw_spin_lock(&lock->wait_lock);
+ if (!(lock->state.waiters & (1 << type)))
+ set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v);
+ /*
+ * Retry taking the lock after taking waitlist lock, have raced with an
+ * unlock:
+ */
+ ret = __do_six_trylock_type(lock, type, current, false);
+ if (ret <= 0) {
+ wait->start_time = local_clock();
- INIT_LIST_HEAD(&wait.list);
- wait.task = current;
+ if (!list_empty(&lock->wait_list)) {
+ struct six_lock_waiter *last =
+ list_last_entry(&lock->wait_list,
+ struct six_lock_waiter, list);
+
+ if (time_before_eq64(wait->start_time, last->start_time))
+ wait->start_time = last->start_time + 1;
+ }
+
+ list_add_tail(&wait->list, &lock->wait_list);
+ }
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (unlikely(ret > 0)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (unlikely(ret < 0)) {
+ __six_lock_wakeup(lock, -ret - 1);
+ ret = 0;
+ }
+
+ if (six_optimistic_spin(lock, wait))
+ goto out;
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (type == SIX_LOCK_write)
- EBUG_ON(lock->owner != current);
- else if (list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_add_tail(&wait.list, &lock->wait_list[type]);
- raw_spin_unlock(&lock->wait_lock);
- }
- if (do_six_trylock_type(lock, type, false))
+ if (wait->lock_acquired)
break;
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (ret)
+ if (unlikely(ret)) {
+ raw_spin_lock(&lock->wait_lock);
+ if (!wait->lock_acquired)
+ list_del(&wait->list);
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (wait->lock_acquired)
+ do_six_unlock_type(lock, type);
break;
+ }
schedule();
}
__set_current_state(TASK_RUNNING);
-
- if (!list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_del_init(&wait.list);
- raw_spin_unlock(&lock->wait_lock);
- }
-out_before_sleep:
- if (ret && type == SIX_LOCK_write) {
+out:
+ if (ret && type == SIX_LOCK_write && lock->state.write_locking) {
old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
&lock->state.counter);
six_lock_wakeup(lock, old, SIX_LOCK_read);
return ret;
}
-__always_inline
-static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
+__always_inline __flatten
+static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
{
int ret;
+ wait->start_time = 0;
+
if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 0);
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read);
ret = do_six_trylock_type(lock, type, true) ? 0
- : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
+ : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p);
if (ret && type != SIX_LOCK_write)
six_release(&lock->dep_map);
return ret;
}
+__always_inline
+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ struct six_lock_waiter wait;
+
+ return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p);
+}
+
__always_inline __flatten
-static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state state;
- EBUG_ON(type == SIX_LOCK_write &&
- !(lock->state.v & __SIX_LOCK_HELD_intent));
-
- if (type != SIX_LOCK_write)
- six_release(&lock->dep_map);
-
- if (type == SIX_LOCK_intent) {
- EBUG_ON(lock->owner != current);
-
- if (lock->intent_lock_recurse) {
- --lock->intent_lock_recurse;
- return;
- }
-
+ if (type == SIX_LOCK_intent)
lock->owner = NULL;
- }
if (type == SIX_LOCK_read &&
lock->readers) {
six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}
+__always_inline __flatten
+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ EBUG_ON(type == SIX_LOCK_write &&
+ !(lock->state.v & __SIX_LOCK_HELD_intent));
+ EBUG_ON((type == SIX_LOCK_write ||
+ type == SIX_LOCK_intent) &&
+ lock->owner != current);
+
+ if (type != SIX_LOCK_write)
+ six_release(&lock->dep_map);
+
+ if (type == SIX_LOCK_intent &&
+ lock->intent_lock_recurse) {
+ --lock->intent_lock_recurse;
+ return;
+ }
+
+ do_six_unlock_type(lock, type);
+}
+
#define __SIX_LOCK(type) \
bool six_trylock_##type(struct six_lock *lock) \
{ \
} \
EXPORT_SYMBOL_GPL(six_lock_##type); \
\
+int six_lock_waiter_##type(struct six_lock *lock, \
+ struct six_lock_waiter *wait, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p)\
+{ \
+ return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\
+} \
+EXPORT_SYMBOL_GPL(six_lock_waiter_##type); \
+ \
void six_unlock_##type(struct six_lock *lock) \
{ \
__six_unlock_type(lock, SIX_LOCK_##type); \
if (lock->readers)
this_cpu_dec(*lock->readers);
- six_set_owner(lock, SIX_LOCK_intent, old);
+ six_set_owner(lock, SIX_LOCK_intent, old, current);
return true;
}
{
const struct six_lock_vals l[] = LOCK_VALS;
- six_acquire(&lock->dep_map, 0);
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read);
/* XXX: assert already locked, and that we don't overflow: */
void six_lock_wakeup_all(struct six_lock *lock)
{
+ union six_lock_state state = lock->state;
struct six_lock_waiter *w;
- raw_spin_lock(&lock->wait_lock);
+ six_lock_wakeup(lock, state, SIX_LOCK_read);
+ six_lock_wakeup(lock, state, SIX_LOCK_intent);
+ six_lock_wakeup(lock, state, SIX_LOCK_write);
- list_for_each_entry(w, &lock->wait_list[0], list)
- wake_up_process(w->task);
- list_for_each_entry(w, &lock->wait_list[1], list)
+ raw_spin_lock(&lock->wait_lock);
+ list_for_each_entry(w, &lock->wait_list, list)
wake_up_process(w->task);
-
raw_spin_unlock(&lock->wait_lock);
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-struct free_pcpu_rcu {
- struct rcu_head rcu;
- void __percpu *p;
-};
-
-static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
-{
- struct free_pcpu_rcu *rcu =
- container_of(_rcu, struct free_pcpu_rcu, rcu);
-
- free_percpu(rcu->p);
- kfree(rcu);
-}
-
-void six_lock_pcpu_free_rcu(struct six_lock *lock)
-{
- struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
-
- if (!rcu)
- return;
-
- rcu->p = lock->readers;
- lock->readers = NULL;
-
- call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
-
void six_lock_pcpu_free(struct six_lock *lock)
{
BUG_ON(lock->readers && pcpu_read_count(lock));
#endif
}
EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
+
+/*
+ * Returns lock held counts, for both read and intent
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+ struct six_lock_count ret;
+
+ ret.n[SIX_LOCK_read] = 0;
+ ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse;
+ ret.n[SIX_LOCK_write] = lock->state.seq & 1;
+
+ if (!lock->readers)
+ ret.n[SIX_LOCK_read] += lock->state.read_lock;
+ else {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
#include <ctype.h>
#include <errno.h>
+#include <limits.h>
#include <string.h>
+#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/string.h>
return ret;
}
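+
+/*
+ * Copy a NUL terminated string into a sized buffer: returns the number of
+ * characters copied (excluding the trailing NUL), or -E2BIG if @count is zero
+ * or @src was truncated.
+ */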
+ssize_t strscpy(char *dest, const char *src, size_t count)
+{
+ long res = 0;
+
+ if (count == 0 || WARN_ON_ONCE(count > INT_MAX))
+ return -E2BIG;
+
+ while (count) {
+ char c;
+
+ c = src[res];
+ dest[res] = c;
+ if (!c)
+ return res;
+ res++;
+ count--;
+ }
+
+ /* Hit buffer length without finding a NUL; force NUL-termination. */
+ if (res)
+ dest[res-1] = '\0';
+
+ return -E2BIG;
+}
+
void memzero_explicit(void *s, size_t count)
{
memset(s, 0, count);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Helpers for formatting and printing strings
+ *
+ * Copyright 31 August 2008 James Bottomley
+ * Copyright (C) 2013, Intel Corporation
+ */
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/export.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/printbuf.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/string_helpers.h>
+
+/**
+ * string_get_size - get the size in the specified units
+ * @size: The size to be converted in blocks
+ * @blk_size: Size of the block (use 1 for size in bytes)
+ * @units: units to use (powers of 1000 or 1024)
+ * @buf: buffer to format to
+ * @len: length of buffer
+ *
+ * This function returns a string formatted to 3 significant figures
+ * giving the size in the required units. @buf should have room for
+ * at least 9 bytes and will always be zero terminated.
+ *
+ */
+int string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
+ char *buf, int len)
+{
+ static const char *const units_10[] = {
+ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+ };
+ static const char *const units_2[] = {
+ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+ };
+ static const char *const *const units_str[] = {
+ [STRING_UNITS_10] = units_10,
+ [STRING_UNITS_2] = units_2,
+ };
+ static const unsigned int divisor[] = {
+ [STRING_UNITS_10] = 1000,
+ [STRING_UNITS_2] = 1024,
+ };
+ static const unsigned int rounding[] = { 500, 50, 5 };
+ int i = 0, j;
+ u32 remainder = 0, sf_cap;
+ char tmp[12];
+ const char *unit;
+
+ tmp[0] = '\0';
+
+ if (blk_size == 0)
+ size = 0;
+ if (size == 0)
+ goto out;
+
+ /* This is Napier's algorithm. Reduce the original block size to
+ *
+ * coefficient * divisor[units]^i
+ *
+ * we do the reduction so both coefficients are just under 32 bits so
+ * that multiplying them together won't overflow 64 bits and we keep
+ * as much precision as possible in the numbers.
+ *
+ * Note: it's safe to throw away the remainders here because all the
+ * precision is in the coefficients.
+ */
+ while (blk_size >> 32) {
+ do_div(blk_size, divisor[units]);
+ i++;
+ }
+
+ while (size >> 32) {
+ do_div(size, divisor[units]);
+ i++;
+ }
+
+ /* now perform the actual multiplication keeping i as the sum of the
+ * two logarithms */
+ size *= blk_size;
+
+ /* and logarithmically reduce it until it's just under the divisor */
+ while (size >= divisor[units]) {
+ remainder = do_div(size, divisor[units]);
+ i++;
+ }
+
+ /* work out in j how many digits of precision we need from the
+ * remainder */
+ sf_cap = size;
+ for (j = 0; sf_cap*10 < 1000; j++)
+ sf_cap *= 10;
+
+ if (units == STRING_UNITS_2) {
+ /* express the remainder as a decimal. It's currently the
+ * numerator of a fraction whose denominator is
+ * divisor[units], which is 1 << 10 for STRING_UNITS_2 */
+ remainder *= 1000;
+ remainder >>= 10;
+ }
+
+ /* add a 5 to the digit below what will be printed to ensure
+ * an arithmetical round up and carry it through to size */
+ remainder += rounding[j];
+ if (remainder >= 1000) {
+ remainder -= 1000;
+ size += 1;
+ }
+
+ if (j) {
+ snprintf(tmp, sizeof(tmp), ".%03u", remainder);
+ tmp[j+1] = '\0';
+ }
+
+ out:
+ if (i >= ARRAY_SIZE(units_2))
+ unit = "UNK";
+ else
+ unit = units_str[units][i];
+
+ return snprintf(buf, len, "%u%s %s", (u32)size, tmp, unit);
+}
+EXPORT_SYMBOL(string_get_size);
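/*
 * Usage sketch (editorial, not part of the file above; print_dev_size() is
 * a hypothetical helper): formatting a device size of 4096000 512-byte
 * blocks.  With STRING_UNITS_2 this yields "1.95 GiB"; per the comment on
 * string_get_size(), the buffer must hold at least 9 bytes.
 */
static void print_dev_size(u64 nr_blocks)
{
	char buf[16];

	string_get_size(nr_blocks, 512, STRING_UNITS_2, buf, sizeof(buf));
	printk(KERN_INFO "device size: %s\n", buf);
}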
\
BUG_ON(_i >= (h)->used); \
(h)->used--; \
- heap_swap(h, _i, (h)->used); \
- heap_sift_down(h, _i, cmp); \
- heap_sift(h, _i, cmp); \
+ if ((_i) < (h)->used) { \
+ heap_swap(h, _i, (h)->used); \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+ } \
} while (0)
#define heap_pop(h, d, cmp) \
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/zstd.h>
+
+#define ZSTD_FORWARD_IF_ERR(ret) \
+ do { \
+ size_t const __ret = (ret); \
+ if (ZSTD_isError(__ret)) \
+ return __ret; \
+ } while (0)
+
+static size_t zstd_cctx_init(zstd_cctx *cctx, const zstd_parameters *parameters,
+ unsigned long long pledged_src_size)
+{
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_reset(
+ cctx, ZSTD_reset_session_and_parameters));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setPledgedSrcSize(
+ cctx, pledged_src_size));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_windowLog, parameters->cParams.windowLog));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_hashLog, parameters->cParams.hashLog));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_chainLog, parameters->cParams.chainLog));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_searchLog, parameters->cParams.searchLog));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_minMatch, parameters->cParams.minMatch));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_targetLength, parameters->cParams.targetLength));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_strategy, parameters->cParams.strategy));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_contentSizeFlag, parameters->fParams.contentSizeFlag));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_checksumFlag, parameters->fParams.checksumFlag));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_dictIDFlag, !parameters->fParams.noDictIDFlag));
+ return 0;
+}
+
+int zstd_min_clevel(void)
+{
+ return ZSTD_minCLevel();
+}
+EXPORT_SYMBOL(zstd_min_clevel);
+
+int zstd_max_clevel(void)
+{
+ return ZSTD_maxCLevel();
+}
+EXPORT_SYMBOL(zstd_max_clevel);
+
+size_t zstd_compress_bound(size_t src_size)
+{
+ return ZSTD_compressBound(src_size);
+}
+EXPORT_SYMBOL(zstd_compress_bound);
+
+zstd_parameters zstd_get_params(int level,
+ unsigned long long estimated_src_size)
+{
+ return ZSTD_getParams(level, estimated_src_size, 0);
+}
+EXPORT_SYMBOL(zstd_get_params);
+
+size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams)
+{
+ return ZSTD_estimateCCtxSize_usingCParams(*cparams);
+}
+EXPORT_SYMBOL(zstd_cctx_workspace_bound);
+
+zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size)
+{
+ if (workspace == NULL)
+ return NULL;
+ return ZSTD_initStaticCCtx(workspace, workspace_size);
+}
+EXPORT_SYMBOL(zstd_init_cctx);
+
+size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
+ const void *src, size_t src_size, const zstd_parameters *parameters)
+{
+ ZSTD_FORWARD_IF_ERR(zstd_cctx_init(cctx, parameters, src_size));
+ return ZSTD_compress2(cctx, dst, dst_capacity, src, src_size);
+}
+EXPORT_SYMBOL(zstd_compress_cctx);
+
+size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams)
+{
+ return ZSTD_estimateCStreamSize_usingCParams(*cparams);
+}
+EXPORT_SYMBOL(zstd_cstream_workspace_bound);
+
+zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters,
+ unsigned long long pledged_src_size, void *workspace, size_t workspace_size)
+{
+ zstd_cstream *cstream;
+
+ if (workspace == NULL)
+ return NULL;
+
+ cstream = ZSTD_initStaticCStream(workspace, workspace_size);
+ if (cstream == NULL)
+ return NULL;
+
+ /* 0 means unknown in linux zstd API but means 0 in new zstd API */
+ if (pledged_src_size == 0)
+ pledged_src_size = ZSTD_CONTENTSIZE_UNKNOWN;
+
+ if (ZSTD_isError(zstd_cctx_init(cstream, parameters, pledged_src_size)))
+ return NULL;
+
+ return cstream;
+}
+EXPORT_SYMBOL(zstd_init_cstream);
+
+size_t zstd_reset_cstream(zstd_cstream *cstream,
+ unsigned long long pledged_src_size)
+{
+ return ZSTD_resetCStream(cstream, pledged_src_size);
+}
+EXPORT_SYMBOL(zstd_reset_cstream);
+
+size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output,
+ zstd_in_buffer *input)
+{
+ return ZSTD_compressStream(cstream, output, input);
+}
+EXPORT_SYMBOL(zstd_compress_stream);
+
+size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output)
+{
+ return ZSTD_flushStream(cstream, output);
+}
+EXPORT_SYMBOL(zstd_flush_stream);
+
+size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output)
+{
+ return ZSTD_endStream(cstream, output);
+}
+EXPORT_SYMBOL(zstd_end_stream);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Compressor");
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/zstd.h>
+
+/* Common symbols. zstd_compress must depend on zstd_decompress. */
+
+unsigned int zstd_is_error(size_t code)
+{
+ return ZSTD_isError(code);
+}
+EXPORT_SYMBOL(zstd_is_error);
+
+zstd_error_code zstd_get_error_code(size_t code)
+{
+ return ZSTD_getErrorCode(code);
+}
+EXPORT_SYMBOL(zstd_get_error_code);
+
+const char *zstd_get_error_name(size_t code)
+{
+ return ZSTD_getErrorName(code);
+}
+EXPORT_SYMBOL(zstd_get_error_name);
+
+/* Decompression symbols. */
+
+size_t zstd_dctx_workspace_bound(void)
+{
+ return ZSTD_estimateDCtxSize();
+}
+EXPORT_SYMBOL(zstd_dctx_workspace_bound);
+
+zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size)
+{
+ if (workspace == NULL)
+ return NULL;
+ return ZSTD_initStaticDCtx(workspace, workspace_size);
+}
+EXPORT_SYMBOL(zstd_init_dctx);
+
+size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
+ const void *src, size_t src_size)
+{
+ return ZSTD_decompressDCtx(dctx, dst, dst_capacity, src, src_size);
+}
+EXPORT_SYMBOL(zstd_decompress_dctx);
+
+size_t zstd_dstream_workspace_bound(size_t max_window_size)
+{
+ return ZSTD_estimateDStreamSize(max_window_size);
+}
+EXPORT_SYMBOL(zstd_dstream_workspace_bound);
+
+zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
+ size_t workspace_size)
+{
+ if (workspace == NULL)
+ return NULL;
+ (void)max_window_size;
+ return ZSTD_initStaticDStream(workspace, workspace_size);
+}
+EXPORT_SYMBOL(zstd_init_dstream);
+
+size_t zstd_reset_dstream(zstd_dstream *dstream)
+{
+ return ZSTD_resetDStream(dstream);
+}
+EXPORT_SYMBOL(zstd_reset_dstream);
+
+size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output,
+ zstd_in_buffer *input)
+{
+ return ZSTD_decompressStream(dstream, output, input);
+}
+EXPORT_SYMBOL(zstd_decompress_stream);
+
+size_t zstd_find_frame_compressed_size(const void *src, size_t src_size)
+{
+ return ZSTD_findFrameCompressedSize(src, src_size);
+}
+EXPORT_SYMBOL(zstd_find_frame_compressed_size);
+
+size_t zstd_get_frame_header(zstd_frame_header *header, const void *src,
+ size_t src_size)
+{
+ return ZSTD_getFrameHeader(header, src, src_size);
+}
+EXPORT_SYMBOL(zstd_get_frame_header);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Decompressor");
tools = final.callPackage ../default.nix {
testWithValgrind = false;
filter = filter.lib;
- lastModified = builtins.substring 0 8 self.lastModifiedDate;
versionString = self.version;
};
toolsValgrind = final.bcachefs.tools.override {
ranges_sort_merge(data);
/* Write data: */
- darray_foreach(r, *data)
+ darray_for_each(*data, r)
for (src_offset = r->start;
src_offset < r->end;
src_offset += block_size) {
--- /dev/null
+{ kversion ? "linux_5_15"
+, pkgs ? import <nixpkgs> {} }:
+
+with pkgs;
+
+let
+ tools = pkgs.callPackage ./default.nix { doCheck = false; };
+in
+mkShell {
+ buildInputs = [
+ linuxKernel.packages.${kversion}.perf
+ gdb
+ ccls # code completion in neovim/emacs
+ ];
+ inputsFrom = [
+ tools
+ ];
+}
...
fun:call_rcu_data_init
}
+{
+ urcu_memb_call_rcu
+ Memcheck:Leak
+ match-leak-kinds: possible
+ ...
+ fun:pthread_create*
+ obj:/*/liburcu.so.*
+ ...
+ fun:urcu_memb_call_rcu
+}
+{
+ pthread_create
+ Memcheck:Leak
+ match-leak-kinds: possible
+ fun:calloc
+ ...
+ fun:allocate_stack
+ fun:pthread_create*
+ fun:kthread_create
+ fun:bch2_rebalance_start
+}
return statbuf;
}
-/* Formatting: */
-
-int printf_pad(unsigned pad, const char * fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = vprintf(fmt, args);
- va_end(args);
-
- while (ret++ < pad)
- putchar(' ');
-
- return ret;
-}
+/* File parsing (i.e. sysfs) */
-struct units_buf __pr_units(s64 _v, enum units units)
+void write_file_str(int dirfd, const char *path, const char *str)
{
- struct units_buf ret;
- char *out = ret.b, *end = out + sizeof(ret.b);
- u64 v = _v;
+ int fd = xopenat(dirfd, path, O_WRONLY);
+ ssize_t wrote, len = strlen(str);
- if (_v < 0) {
- out += scnprintf(out, end - out, "-");
- v = -_v;
- }
-
- switch (units) {
- case BYTES:
- snprintf(out, end - out, "%llu", v << 9);
- break;
- case SECTORS:
- snprintf(out, end - out, "%llu", v);
- break;
- case HUMAN_READABLE:
- v <<= 9;
-
- if (v >= 1024) {
- int exp = log(v) / log(1024);
- snprintf(out, end - out, "%.1f%c",
- v / pow(1024, exp),
- "KMGTPE"[exp-1]);
- } else {
- snprintf(out, end - out, "%llu", v);
- }
-
- break;
- }
-
- return ret;
+ wrote = write(fd, str, len);
+ if (wrote != len)
+ die("read error: %m");
+ close(fd);
}
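/*
 * Usage sketch (editorial; the sysfs attribute path and helper name are
 * hypothetical): the helpers above read and write whole sysfs attributes
 * as strings.
 */
static void set_readahead(int sysfs_dirfd)
{
	char *cur = read_file_str(sysfs_dirfd, "queue/read_ahead_kb");

	printf("read_ahead_kb was %s\n", cur);
	write_file_str(sysfs_dirfd, "queue/read_ahead_kb", "128\n");
	free(cur);
}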
-/* Argument parsing stuff: */
-
-/* File parsing (i.e. sysfs) */
-
char *read_file_str(int dirfd, const char *path)
{
int fd = xopenat(dirfd, path, O_RDONLY);
void ranges_sort_merge(ranges *r)
{
struct range *t, *i;
- ranges tmp = { NULL };
+ ranges tmp = { 0 };
- sort(&darray_item(*r, 0), darray_size(*r),
- sizeof(darray_item(*r, 0)), range_cmp, NULL);
+ sort(r->data, r->nr, sizeof(r->data[0]), range_cmp, NULL);
/* Merge contiguous ranges: */
- darray_foreach(i, *r) {
- t = tmp.size ? &tmp.item[tmp.size - 1] : NULL;
+ darray_for_each(*r, i) {
+ t = tmp.nr ? &tmp.data[tmp.nr - 1] : NULL;
if (t && t->end >= i->start)
t->end = max(t->end, i->end);
else
- darray_append(tmp, *i);
+ darray_push(&tmp, *i);
}
- darray_free(*r);
+ darray_exit(r);
*r = tmp;
}
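/*
 * Illustrative use of the darray-based ranges API above (editorial sketch;
 * range_add() is the inline helper declared in the corresponding header,
 * shown later in this patch): overlapping extents collapse into a single
 * range after ranges_sort_merge().
 */
static void ranges_example(void)
{
	ranges r = { 0 };

	range_add(&r, 0, 4096);		/* covers [0, 4096) */
	range_add(&r, 2048, 4096);	/* covers [2048, 6144), overlaps the first */

	ranges_sort_merge(&r);		/* r now holds the single range [0, 6144) */
	darray_exit(&r);
}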
{
struct range *i;
- darray_foreach(i, *r) {
+ darray_for_each(*r, i) {
i->start = round_down(i->start, block_size);
i->end = round_up(i->end, block_size);
}
{
struct range *i;
- darray_foreach(i, *r) {
+ darray_for_each(*r, i) {
i->start = round_up(i->start, block_size);
i->end = round_down(i->end, block_size);
i->end = max(i->end, i->start);
#include <linux/string.h>
#include <linux/types.h>
#include <linux/uuid.h>
-#include "ccan/darray/darray.h"
+#include "libbcachefs/darray.h"
#define noreturn __attribute__((noreturn))
_ret; \
})
-int printf_pad(unsigned pad, const char * fmt, ...);
-
-enum units {
- BYTES,
- SECTORS,
- HUMAN_READABLE,
-};
-
-struct units_buf __pr_units(s64, enum units);
-
-struct units_buf {
- char b[20];
-};
-
-#define pr_units(_v, _u) &(__pr_units(_v, _u).b[0])
-
+void write_file_str(int, const char *, const char *);
char *read_file_str(int, const char *);
u64 read_file_u64(int, const char *);
u64 end;
};
-typedef darray(struct range) ranges;
+typedef DARRAY(struct range) ranges;
static inline void range_add(ranges *data, u64 offset, u64 size)
{
- darray_append(*data, (struct range) {
+ darray_push(data, ((struct range) {
.start = offset,
.end = offset + size
- });
+ }));
}
void ranges_sort_merge(ranges *);
static inline struct range hole_iter_next(struct hole_iter *iter)
{
struct range r = {
- .start = iter->idx ? iter->r.item[iter->idx - 1].end : 0,
- .end = iter->idx < iter->r.size
- ? iter->r.item[iter->idx].start : iter->end,
+ .start = iter->idx ? iter->r.data[iter->idx - 1].end : 0,
+ .end = iter->idx < iter->r.nr
+ ? iter->r.data[iter->idx].start : iter->end,
};
BUG_ON(r.start > r.end);
#define for_each_hole(_iter, _ranges, _end, _i) \
for (_iter = (struct hole_iter) { .r = _ranges, .end = _end }; \
- (_iter.idx <= _iter.r.size && \
+ (_iter.idx <= _iter.r.nr && \
(_i = hole_iter_next(&_iter), true));)
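/*
 * Sketch of walking the gaps between ranges with for_each_hole() (editorial;
 * print_holes() is a hypothetical helper): after ranges_sort_merge(), the
 * iterator yields the regions of [0, dev_size) not covered by any range.
 */
static void print_holes(ranges *r, u64 dev_size)
{
	struct hole_iter iter;
	struct range hole;

	ranges_sort_merge(r);
	for_each_hole(iter, *r, dev_size, hole)
		printf("hole: %llu-%llu\n",
		       (unsigned long long) hole.start,
		       (unsigned long long) hole.end);
}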
#include <linux/fiemap.h>