-d83b992f653d9f742f3f8567dbcfd1f4f72e858f
+6f603b8d79efa7d9ac04ea0c38ef1bbaa10fd678
int fd, opt;
opt_set(opts, nochanges, true);
- opt_set(opts, noreplay, true);
+ opt_set(opts, norecovery, true);
opt_set(opts, degraded, true);
opt_set(opts, errors, BCH_ON_ERROR_CONTINUE);
struct btree_iter *iter;
struct bkey_s_c k;
char buf[512];
+ int ret;
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, btree_id, start,
- BTREE_ITER_PREFETCH, k) {
+ BTREE_ITER_PREFETCH, k, ret) {
if (bkey_cmp(k.k->p, end) > 0)
break;
int opt, ret = 0;
opt_set(opts, degraded, true);
+ opt_set(opts, fsck, true);
opt_set(opts, fix_errors, FSCK_OPT_ASK);
while ((opt = getopt(argc, argv, "apynfvh")) != -1)
mark_unreserved_space(c, extents);
- const char *err = bch2_fs_start(c);
- if (err)
- die("Error starting new filesystem: %s", err);
+ int ret = bch2_fs_start(c);
+ if (ret)
+ die("Error starting new filesystem: %s", strerror(-ret));
copy_fs(c, fs_fd, fs_path, bcachefs_inum, &extents);
return ts.tv_sec;
}
-static inline void ktime_get_real_ts64(struct timespec64 *ts)
+static inline void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
clock_gettime(CLOCK_REALTIME, ts);
}
#define kcalloc(n, size, flags) kmalloc_array(n, size, flags|__GFP_ZERO)
#define kfree(p) free(p)
-#define kvfree(p) free(p)
#define kzfree(p) free(p)
+#define kvmalloc(size, flags) kmalloc(size, flags)
+#define kvfree(p) kfree(p)
+
static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
{
size_t size = PAGE_SIZE << order;
#include "debug.h"
#include "ec.h"
#include "error.h"
-#include "journal_io.h"
+#include "recovery.h"
#include <linux/kthread.h>
#include <linux/math64.h>
*p += bytes;
}
-struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a)
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
- struct bkey_alloc_unpacked ret = { .gen = a->gen };
- const void *d = a->data;
- unsigned idx = 0;
+ struct bkey_alloc_unpacked ret = { .gen = 0 };
+
+ if (k.k->type == KEY_TYPE_alloc) {
+ const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
+ const void *d = a->data;
+ unsigned idx = 0;
+
+ ret.gen = a->gen;
#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
- BCH_ALLOC_FIELDS()
+ BCH_ALLOC_FIELDS()
#undef x
+ }
return ret;
}
-static void bch2_alloc_pack(struct bkey_i_alloc *dst,
- const struct bkey_alloc_unpacked src)
+void bch2_alloc_pack(struct bkey_i_alloc *dst,
+ const struct bkey_alloc_unpacked src)
{
unsigned idx = 0;
void *d = dst->v.data;
get_alloc_field(a.v, &d, i));
}
-static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
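+/*
+ * Construct an unpacked alloc key from the in memory bucket state - the in
+ * memory state is authoritative until journal replay has finished:
+ */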
+static inline struct bkey_alloc_unpacked
+alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
{
- const void *d = a->data;
- unsigned idx = 0, data_type, dirty_sectors, cached_sectors;
- struct bucket_mark m;
-
- g->io_time[READ] = get_alloc_field(a, &d, idx++);
- g->io_time[WRITE] = get_alloc_field(a, &d, idx++);
- data_type = get_alloc_field(a, &d, idx++);
- dirty_sectors = get_alloc_field(a, &d, idx++);
- cached_sectors = get_alloc_field(a, &d, idx++);
- g->oldest_gen = get_alloc_field(a, &d, idx++);
-
- bucket_cmpxchg(g, m, ({
- m.gen = a->gen;
- m.data_type = data_type;
- m.dirty_sectors = dirty_sectors;
- m.cached_sectors = cached_sectors;
- }));
-
- g->gen_valid = 1;
+ return (struct bkey_alloc_unpacked) {
+ .gen = m.gen,
+ .oldest_gen = g->oldest_gen,
+ .data_type = m.data_type,
+ .dirty_sectors = m.dirty_sectors,
+ .cached_sectors = m.cached_sectors,
+ .read_time = g->io_time[READ],
+ .write_time = g->io_time[WRITE],
+ };
}
-static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
- struct bucket_mark m)
+int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
- unsigned idx = 0;
- void *d = a->v.data;
-
- a->v.fields = 0;
- a->v.gen = m.gen;
-
- d = a->v.data;
- put_alloc_field(a, &d, idx++, g->io_time[READ]);
- put_alloc_field(a, &d, idx++, g->io_time[WRITE]);
- put_alloc_field(a, &d, idx++, m.data_type);
- put_alloc_field(a, &d, idx++, m.dirty_sectors);
- put_alloc_field(a, &d, idx++, m.cached_sectors);
- put_alloc_field(a, &d, idx++, g->oldest_gen);
-
- set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
-}
-
-static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bch_dev *ca;
- struct bkey_s_c_alloc a;
-
- if (k.k->type != KEY_TYPE_alloc)
- return;
-
- a = bkey_s_c_to_alloc(k);
- ca = bch_dev_bkey_exists(c, a.k->p.inode);
-
- if (a.k->p.offset >= ca->mi.nbuckets)
- return;
-
- percpu_down_read_preempt_disable(&c->mark_lock);
- __alloc_read_key(bucket(ca, a.k->p.offset), a.v);
- percpu_up_read_preempt_enable(&c->mark_lock);
-}
-
-int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
-{
- struct journal_replay *r;
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
struct bch_dev *ca;
+ struct journal_key *j;
unsigned i;
int ret;
bch2_trans_init(&trans, c);
- for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k) {
- bch2_alloc_read_key(c, k);
- bch2_trans_cond_resched(&trans);
- }
+ for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
+ bch2_mark_key(c, k, true, 0, NULL, 0, 0);
- ret = bch2_trans_exit(&trans);
- if (ret)
+ ret = bch2_trans_exit(&trans) ?: ret;
+ if (ret) {
+ bch_err(c, "error reading alloc info: %i", ret);
return ret;
-
- list_for_each_entry(r, journal_replay_list, list) {
- struct bkey_i *k, *n;
- struct jset_entry *entry;
-
- for_each_jset_key(k, n, entry, &r->j)
- if (entry->btree_id == BTREE_ID_ALLOC)
- bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
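+	/* keys in the journal are newer than the btree, so they take precedence: */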
+ for_each_journal_key(*journal_keys, j)
+ if (j->btree_id == BTREE_ID_ALLOC)
+ bch2_mark_key(c, bkey_i_to_s_c(j->k),
+ true, 0, NULL, 0, 0);
+
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
return ret;
}
-static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca,
- size_t b, struct btree_iter *iter,
- u64 *journal_seq, unsigned flags)
+int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
{
- struct bch_fs *c = trans->c;
-#if 0
- __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
-#else
- /* hack: */
- __BKEY_PADDED(k, 8) alloc_key;
-#endif
- struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ struct bucket_array *buckets;
+ struct bch_dev *ca;
struct bucket *g;
struct bucket_mark m, new;
- int ret;
-
- BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
- a->k.p = POS(ca->dev_idx, b);
-
- bch2_btree_iter_set_pos(iter, a->k.p);
-
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- percpu_down_read_preempt_disable(&c->mark_lock);
- g = bucket(ca, b);
- m = READ_ONCE(g->mark);
-
- if (!m.dirty) {
- percpu_up_read_preempt_enable(&c->mark_lock);
- return 0;
- }
-
- __alloc_write_key(a, g, m);
- percpu_up_read_preempt_enable(&c->mark_lock);
-
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
-
- ret = bch2_trans_commit(trans, NULL, journal_seq,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
- BTREE_INSERT_NOMARK|
- flags);
- if (ret)
- return ret;
-
- new = m;
- new.dirty = false;
- atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
-
- if (ca->buckets_written)
- set_bit(b, ca->buckets_written);
-
- return 0;
-}
-
-int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote)
-{
- struct bch_dev *ca;
+ struct bkey_alloc_unpacked old_u, new_u;
+ __BKEY_PADDED(k, 8) alloc_key; /* hack: */
+ struct bkey_i_alloc *a;
+ struct bkey_s_c k;
unsigned i;
+ size_t b;
int ret = 0;
- *wrote = false;
-
- for_each_rw_member(ca, c, i) {
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bucket_array *buckets;
- size_t b;
+ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
- bch2_trans_init(&trans, c);
+ bch2_trans_init(&trans, c);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ for_each_rw_member(ca, c, i) {
down_read(&ca->bucket_lock);
+restart:
buckets = bucket_array(ca);
for (b = buckets->first_bucket;
if (!buckets->b[b].mark.dirty)
continue;
- ret = __bch2_alloc_write_key(&trans, ca, b, iter, NULL,
- nowait
- ? BTREE_INSERT_NOWAIT
- : 0);
+ bch2_btree_iter_set_pos(iter, POS(i, b));
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ old_u = bch2_alloc_unpack(k);
+
+ percpu_down_read_preempt_disable(&c->mark_lock);
+ g = bucket(ca, b);
+ m = READ_ONCE(g->mark);
+ new_u = alloc_mem_to_key(g, m);
+ percpu_up_read_preempt_enable(&c->mark_lock);
+
+ if (!m.dirty)
+ continue;
+
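+		/*
+		 * If the fs isn't read-write yet, we have to go read-write
+		 * before we can commit - and that means dropping locks, so
+		 * restart the scan afterwards:
+		 */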
+ if ((flags & BTREE_INSERT_LAZY_RW) &&
+ percpu_ref_is_zero(&c->writes)) {
+ up_read(&ca->bucket_lock);
+ bch2_trans_unlock(&trans);
+
+ ret = bch2_fs_read_write_early(c);
+ down_read(&ca->bucket_lock);
+
+ if (ret)
+ goto err;
+ goto restart;
+ }
+
+ a = bkey_alloc_init(&alloc_key.k);
+ a->k.p = iter->pos;
+ bch2_alloc_pack(a, new_u);
+
+ bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
+ ret = bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOMARK|
+ flags);
+err:
+ if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
+ bch_err(c, "error %i writing alloc info", ret);
+ printk(KERN_CONT "dev %llu bucket %llu\n",
+ iter->pos.inode, iter->pos.offset);
+ printk(KERN_CONT "gen %u -> %u\n", old_u.gen, new_u.gen);
+#define x(_name, _bits) printk(KERN_CONT #_name " %u -> %u\n", old_u._name, new_u._name);
+ BCH_ALLOC_FIELDS()
+#undef x
+ }
if (ret)
break;
+ new = m;
+ new.dirty = false;
+ atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
+
+ if (ca->buckets_written)
+ set_bit(b, ca->buckets_written);
+
bch2_trans_cond_resched(&trans);
*wrote = true;
}
up_read(&ca->bucket_lock);
- bch2_trans_exit(&trans);
-
if (ret) {
percpu_ref_put(&ca->io_ref);
break;
}
}
+ bch2_trans_exit(&trans);
+
return ret;
}
unsigned long gc_count = c->gc_count;
int ret = 0;
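+	/* tell waiters - e.g. bch2_dev_allocator_quiesce() - that we're blocked: */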
+ ca->allocator_state = ALLOCATOR_BLOCKED;
+ closure_wake_up(&c->freelist_wait);
+
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
}
__set_current_state(TASK_RUNNING);
+ ca->allocator_state = ALLOCATOR_RUNNING;
+ closure_wake_up(&c->freelist_wait);
+
return ret;
}
struct alloc_heap_entry l,
struct alloc_heap_entry r)
{
- return (l.key > r.key) - (l.key < r.key) ?:
- (l.nr < r.nr) - (l.nr > r.nr) ?:
- (l.bucket > r.bucket) - (l.bucket < r.bucket);
+ return cmp_int(l.key, r.key) ?:
+ cmp_int(r.nr, l.nr) ?:
+ cmp_int(l.bucket, r.bucket);
}
static inline int bucket_idx_cmp(const void *_l, const void *_r)
{
const struct alloc_heap_entry *l = _l, *r = _r;
- return (l->bucket > r->bucket) - (l->bucket < r->bucket);
+ return cmp_int(l->bucket, r->bucket);
}
static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
struct bch_fs *c = trans->c;
struct bkey_i_alloc *a;
struct bkey_alloc_unpacked u;
+ struct bucket *g;
struct bucket_mark m;
struct bkey_s_c k;
bool invalidating_cached_data;
BUG_ON(!fifo_push(&ca->free_inc, b));
bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
- m = bucket(ca, b)->mark;
spin_unlock(&c->freelist_lock);
percpu_up_read_preempt_enable(&c->mark_lock);
if (ret)
return ret;
- if (k.k && k.k->type == KEY_TYPE_alloc)
- u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
- else
- memset(&u, 0, sizeof(u));
+ /*
+ * The allocator has to start before journal replay is finished - thus,
+ * we have to trust the in memory bucket @m, not the version in the
+ * btree:
+ */
+ percpu_down_read_preempt_disable(&c->mark_lock);
+ g = bucket(ca, b);
+ m = READ_ONCE(g->mark);
+ u = alloc_mem_to_key(g, m);
+ percpu_up_read_preempt_enable(&c->mark_lock);
invalidating_cached_data = m.cached_sectors != 0;
- //BUG_ON(u.dirty_sectors);
+ u.gen++;
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
u.read_time = c->bucket_clock[READ].hand;
u.write_time = c->bucket_clock[WRITE].hand;
- /*
- * The allocator has to start before journal replay is finished - thus,
- * we have to trust the in memory bucket @m, not the version in the
- * btree:
- */
- u.gen = m.gen + 1;
-
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
fifo_pop(&ca->free_inc, bucket);
closure_wake_up(&c->freelist_wait);
- ca->allocator_blocked_full = false;
+ ca->allocator_state = ALLOCATOR_RUNNING;
spin_unlock(&c->freelist_lock);
goto out;
}
- if (!ca->allocator_blocked_full) {
- ca->allocator_blocked_full = true;
+ if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) {
+ ca->allocator_state = ALLOCATOR_BLOCKED_FULL;
closure_wake_up(&c->freelist_wait);
}
int ret;
set_freezable();
+ ca->allocator_state = ALLOCATOR_RUNNING;
while (1) {
cond_resched();
if (!nr ||
(nr < ALLOC_SCAN_BATCH(ca) &&
!fifo_full(&ca->free[RESERVE_MOVINGGC]))) {
- ca->allocator_blocked = true;
- closure_wake_up(&c->freelist_wait);
-
ret = wait_buckets_available(c, ca);
if (ret) {
up_read(&c->gc_lock);
}
} while (!nr);
- ca->allocator_blocked = false;
up_read(&c->gc_lock);
pr_debug("%zu buckets to invalidate", nr);
stop:
pr_debug("alloc thread stopping (ret %i)", ret);
+ ca->allocator_state = ALLOCATOR_STOPPED;
+ closure_wake_up(&c->freelist_wait);
return 0;
}
void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
{
if (ca->alloc_thread)
- closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full);
+ closure_wait_event(&c->freelist_wait,
+ ca->allocator_state != ALLOCATOR_RUNNING);
}
/* stop allocator thread: */
* XXX: it's possible for this to deadlock waiting on journal reclaim,
* since we're holding btree writes. What then?
*/
- ret = bch2_alloc_write(c, true, &wrote);
+ ret = bch2_alloc_write(c,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_USE_ALLOC_RESERVE|
+ BTREE_INSERT_NOWAIT, &wrote);
/*
* If bch2_alloc_write() did anything, it may have used some
#undef x
};
-struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *);
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
+void bch2_alloc_pack(struct bkey_i_alloc *,
+ const struct bkey_alloc_unpacked);
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
.val_to_text = bch2_alloc_to_text, \
}
-int bch2_alloc_read(struct bch_fs *, struct list_head *);
+struct journal_keys;
+int bch2_alloc_read(struct bch_fs *, struct journal_keys *);
int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
-int bch2_alloc_write(struct bch_fs *, bool, bool *);
+int bch2_alloc_write(struct bch_fs *, unsigned, bool *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_background_init(struct bch_fs *);
* XXX: this should be an enum for allocator state, so as to include
* error state
*/
- bool allocator_blocked;
- bool allocator_blocked_full;
+ enum {
+ ALLOCATOR_STOPPED,
+ ALLOCATOR_RUNNING,
+ ALLOCATOR_BLOCKED,
+ ALLOCATOR_BLOCKED_FULL,
+ } allocator_state;
alloc_heap alloc_heap;
struct percpu_rw_semaphore mark_lock;
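+	/*
+	 * usage_base plus the two percpu accumulators - indexed by journal
+	 * sequence number parity - make up the current filesystem usage;
+	 * usage_gc is gc's private copy:
+	 */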
+ seqcount_t usage_lock;
+ struct bch_fs_usage *usage_base;
struct bch_fs_usage __percpu *usage[2];
+ struct bch_fs_usage __percpu *usage_gc;
/* single element mempool: */
struct mutex usage_scratch_lock;
{
struct timespec64 now;
- ktime_get_real_ts64(&now);
+ ktime_get_coarse_real_ts64(&now);
return timespec_to_bch2_time(c, now);
}
enum bch_sb_compat {
BCH_COMPAT_FEAT_ALLOC_INFO = 0,
+ BCH_COMPAT_FEAT_ALLOC_METADATA = 1,
};
/* options: */
r_v = *r;
}
- return (l_v > r_v) - (l_v < r_v);
+ return cmp_int(l_v, r_v);
}
#endif
static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
{
- return (l.hi > r.hi) - (l.hi < r.hi) ?:
- (l.lo > r.lo) - (l.lo < r.lo);
+ return cmp_int(l.hi, r.hi) ?:
+ cmp_int(l.lo, r.lo);
}
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
{
return bkey_cmp_packed(b, l, r)
?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
- ?: (l > r) - (l < r);
+ ?: cmp_int(l, r);
}
static inline int btree_node_iter_cmp(struct btree *b,
#include "error.h"
#include "extents.h"
#include "journal.h"
-#include "journal_io.h"
#include "keylist.h"
#include "move.h"
+#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
struct btree_iter *iter;
struct btree *b;
struct range_checks r;
- unsigned depth = btree_node_type_needs_gc(btree_id) ? 0 : 1;
+ unsigned depth = metadata_only ? 1
+ : expensive_debug_checks(c) ? 0
+ : !btree_node_type_needs_gc(btree_id) ? 1
+ : 0;
u8 max_stale;
int ret = 0;
gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
- /*
- * if expensive_debug_checks is on, run range_checks on all leaf nodes:
- *
- * and on startup, we have to read every btree node (XXX: only if it was
- * an unclean shutdown)
- */
- if (metadata_only)
- depth = 1;
- else if (initial || expensive_debug_checks(c))
- depth = 0;
-
btree_node_range_checks_init(&r, depth);
__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
(int) btree_id_to_gc_phase(r);
}
-static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
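+/*
+ * Mark a key that's still in the journal: mark the key itself, plus whatever
+ * it overwrites in the btree (it hasn't been replayed yet):
+ */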
+static int mark_journal_key(struct bch_fs *c, enum btree_id id,
+ struct bkey_i *insert)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ u8 max_stale;
+ int ret = 0;
+
+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
+ if (ret)
+ return ret;
+
+ bch2_trans_init(&trans, c);
+
+ for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
+ BTREE_ITER_SLOTS, k, ret) {
+ percpu_down_read_preempt_disable(&c->mark_lock);
+ ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
+ BCH_BUCKET_MARK_GC|
+ BCH_BUCKET_MARK_NOATOMIC);
+ percpu_up_read_preempt_enable(&c->mark_lock);
+
+		if (ret <= 0)
+			break;
+ }
+
+ return bch2_trans_exit(&trans) ?: ret;
+}
+
+static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
enum btree_id ids[BTREE_ID_NR];
- u8 max_stale;
unsigned i;
for (i = 0; i < BTREE_ID_NR; i++)
if (ret)
return ret;
- if (journal && !metadata_only &&
+ if (journal_keys && !metadata_only &&
btree_node_type_needs_gc(type)) {
- struct bkey_i *k, *n;
- struct jset_entry *j;
- struct journal_replay *r;
+ struct journal_key *j;
int ret;
- list_for_each_entry(r, journal, list)
- for_each_jset_key(k, n, j, &r->j) {
- if (type == __btree_node_type(j->level, j->btree_id)) {
- ret = bch2_gc_mark_key(c,
- bkey_i_to_s_c(k),
- &max_stale, initial);
- if (ret)
- return ret;
- }
+ for_each_journal_key(*journal_keys, j)
+ if (j->btree_id == id) {
+ ret = mark_journal_key(c, id, j->k);
+ if (ret)
+ return ret;
}
}
}
ca->usage[1] = NULL;
}
- free_percpu(c->usage[1]);
- c->usage[1] = NULL;
+ free_percpu(c->usage_gc);
+ c->usage_gc = NULL;
}
static int bch2_gc_done(struct bch_fs *c,
}
};
+ bch2_fs_usage_acc_to_base(c, 0);
+ bch2_fs_usage_acc_to_base(c, 1);
+
bch2_dev_usage_from_buckets(c);
{
unsigned nr = fs_usage_u64s(c);
- struct bch_fs_usage *dst = (void *)
- bch2_acc_percpu_u64s((void *) c->usage[0], nr);
+ struct bch_fs_usage *dst = c->usage_base;
struct bch_fs_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) c->usage[1], nr);
+ bch2_acc_percpu_u64s((void *) c->usage_gc, nr);
copy_fs_field(hidden, "hidden");
copy_fs_field(btree, "btree");
*/
gc_pos_set(c, gc_phase(GC_PHASE_START));
- BUG_ON(c->usage[1]);
+ BUG_ON(c->usage_gc);
- c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
+ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
sizeof(u64), GFP_KERNEL);
- if (!c->usage[1])
+ if (!c->usage_gc)
return -ENOMEM;
for_each_member_device(ca, c, i) {
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
-int bch2_gc(struct bch_fs *c, struct list_head *journal,
+int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
struct bch_dev *ca;
bch2_mark_superblocks(c);
- ret = bch2_gc_btrees(c, journal, initial, metadata_only);
+ ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only);
if (ret)
goto out;
ret = -EINVAL;
}
- percpu_down_write(&c->mark_lock);
+ if (!ret) {
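+		/* block the journal so usage counters can't change under us: */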
+ bch2_journal_block(&c->journal);
- if (!ret)
+ percpu_down_write(&c->mark_lock);
ret = bch2_gc_done(c, initial, metadata_only);
+ bch2_journal_unblock(&c->journal);
+ } else {
+ percpu_down_write(&c->mark_lock);
+ }
+
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
#include "btree_types.h"
void bch2_coalesce(struct bch_fs *);
-int bch2_gc(struct bch_fs *, struct list_head *, bool, bool);
+
+struct journal_keys;
+int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
goto retry_all;
}
- ret = btree_trans_has_multiple_iters(trans) ? -EINTR : 0;
+ ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
out:
bch2_btree_cache_cannibalize_unlock(c);
return ret;
if (unlikely(ret))
ret = __btree_iter_traverse_all(iter->trans, iter, ret);
- BUG_ON(ret == -EINTR && !btree_trans_has_multiple_iters(iter->trans));
-
return ret;
}
: bch2_btree_iter_next(iter);
}
-#define for_each_btree_key(_trans, _iter, _btree_id, _start, _flags, _k)\
- for (iter = bch2_trans_get_iter((_trans), (_btree_id), \
- (_start), (_flags)), \
- (_k) = __bch2_btree_iter_peek(_iter, _flags); \
- !IS_ERR_OR_NULL((_k).k); \
- (_k) = __bch2_btree_iter_next(_iter, _flags))
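+/*
+ * Iterate over keys in one btree, stashing errors - including failure to
+ * allocate the iterator - in _ret:
+ */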
+#define for_each_btree_key(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \
+ bch2_trans_get_iter((_trans), (_btree_id), \
+ (_start), (_flags))) ?: \
+ PTR_ERR_OR_ZERO(((_k) = \
+ __bch2_btree_iter_peek(_iter, _flags)).k); \
+ !(_ret) && (_k).k; \
+ (_ret) = PTR_ERR_OR_ZERO(((_k) = \
+ __bch2_btree_iter_next(_iter, _flags)).k))
#define for_each_btree_key_continue(_iter, _flags, _k) \
for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
#include <linux/six.h>
#include "bkey_methods.h"
+#include "buckets_types.h"
#include "journal_types.h"
struct open_bucket;
};
bool deferred;
+ bool triggered;
};
#define BTREE_ITER_MAX 64
struct btree_iter iters_onstack[2];
struct btree_insert_entry updates_onstack[6];
+
+ struct replicas_delta_list fs_usage_deltas;
};
#define BTREE_FLAG(flag) \
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
+ __BTREE_INSERT_NOMARK_INSERT,
+ __BTREE_INSERT_NOMARK_OVERWRITES,
__BTREE_INSERT_NOMARK,
+ __BTREE_INSERT_MARK_INMEM,
+ __BTREE_INSERT_NO_CLEAR_REPLICAS,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
-/* Don't call bch2_mark_key: */
+/* Don't mark new key, just overwrites: */
+#define BTREE_INSERT_NOMARK_INSERT (1 << __BTREE_INSERT_NOMARK_INSERT)
+
+/* Don't mark overwrites, just new key: */
+#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES)
+
+/* Don't mark the key at all: */
#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)
+/* Don't mark transactionally: */
+#define BTREE_INSERT_MARK_INMEM (1 << __BTREE_INSERT_MARK_INMEM)
+
+#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS)
+
/* Don't block on allocation failure (for new btree nodes): */
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&old->key),
fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
+ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
bch2_fs_usage_scratch_put(c, fs_usage);
percpu_up_read(&c->mark_lock);
bkey_disassemble(b, k, &tmp),
fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
+ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
bch2_fs_usage_scratch_put(c, fs_usage);
percpu_up_read(&c->mark_lock);
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&b->key),
fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
+ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
bch2_fs_usage_scratch_put(c, fs_usage);
percpu_up_read(&c->mark_lock);
static inline int btree_trans_cmp(struct btree_insert_entry l,
struct btree_insert_entry r)
{
- return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?:
+ return cmp_int(l.deferred, r.deferred) ?:
btree_iter_cmp(l.iter, r.iter);
}
btree_insert_key_deferred(trans, insert);
}
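+/* for now, only extent and inode updates run their triggers transactionally: */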
+static inline bool update_triggers_transactional(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) &&
+ (i->iter->btree_id == BTREE_ID_EXTENTS ||
+ i->iter->btree_id == BTREE_ID_INODES);
+}
+
+static inline bool update_has_triggers(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ return likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
+ !i->deferred &&
+ btree_node_type_needs_gc(i->iter->btree_id);
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
struct btree_iter *linked;
int ret;
+ if (likely(!(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS))) {
+ memset(&trans->fs_usage_deltas.fs_usage, 0,
+ sizeof(trans->fs_usage_deltas.fs_usage));
+ trans->fs_usage_deltas.top = trans->fs_usage_deltas.d;
+ }
+
trans_for_each_update_iter(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
- btree_trans_lock_write(c, trans);
-
- trans_for_each_update_iter(trans, i) {
- if (i->deferred ||
- !btree_node_type_needs_gc(i->iter->btree_id))
- continue;
-
- if (!fs_usage) {
- percpu_down_read(&c->mark_lock);
- fs_usage = bch2_fs_usage_scratch_get(c);
+ trans_for_each_update_iter(trans, i)
+ if (update_has_triggers(trans, i) &&
+ update_triggers_transactional(trans, i)) {
+ ret = bch2_trans_mark_update(trans, i,
+ &trans->fs_usage_deltas);
+ if (ret)
+ return ret;
}
- if (!bch2_bkey_replicas_marked_locked(c,
- bkey_i_to_s_c(i->k), true)) {
- ret = BTREE_INSERT_NEED_MARK_REPLICAS;
- goto out;
- }
- }
+ btree_trans_lock_write(c, trans);
if (race_fault()) {
ret = -EINTR;
if (ret)
goto out;
+ trans_for_each_update_iter(trans, i) {
+ if (i->deferred ||
+ !btree_node_type_needs_gc(i->iter->btree_id))
+ continue;
+
+ if (!fs_usage) {
+ percpu_down_read(&c->mark_lock);
+ fs_usage = bch2_fs_usage_scratch_get(c);
+ }
+
+ if (!bch2_bkey_replicas_marked_locked(c,
+ bkey_i_to_s_c(i->k), true)) {
+ ret = BTREE_INSERT_NEED_MARK_REPLICAS;
+ goto out;
+ }
+ }
+
/*
* Don't get journal reservation until after we know insert will
* succeed:
}
trans_for_each_update_iter(trans, i)
- bch2_mark_update(trans, i, fs_usage, 0);
- if (fs_usage)
+ if (update_has_triggers(trans, i) &&
+ !update_triggers_transactional(trans, i))
+ bch2_mark_update(trans, i, fs_usage, 0);
+
+ if (fs_usage) {
+ bch2_replicas_delta_list_apply(c, fs_usage,
+ &trans->fs_usage_deltas);
bch2_trans_fs_usage_apply(trans, fs_usage);
+ }
- if (unlikely(c->gc_pos.phase)) {
+ if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
+ unlikely(c->gc_pos.phase))
trans_for_each_update_iter(trans, i)
if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
bch2_mark_update(trans, i, NULL,
BCH_BUCKET_MARK_GC);
- }
trans_for_each_update(trans, i)
do_btree_insert_one(trans, i);
{
struct bch_fs *c = trans->c;
unsigned flags = trans->flags;
+ struct btree_insert_entry *src, *dst;
+
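+	/* strip out updates queued by triggers - rerunning the triggers will recreate them: */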
+ src = dst = trans->updates;
+
+ while (src < trans->updates + trans->nr_updates) {
+ if (!src->triggered) {
+ *dst = *src;
+ dst++;
+ }
+ src++;
+ }
+
+ trans->nr_updates = dst - trans->updates;
/*
* BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
+ unsigned orig_mem_top = trans->mem_top;
int ret = 0;
if (!trans->nr_updates)
return ret;
err:
ret = bch2_trans_commit_error(trans, i, ret);
- if (!ret)
+
+	/* can't retry: the caller passed in the replicas delta list and we've modified it: */
+ if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret)
+ ret = -EINTR;
+
+ if (!ret) {
+ /* free memory used by triggers, they'll be reexecuted: */
+ trans->mem_top = orig_mem_top;
goto retry;
+ }
goto out;
}
int ret = 0;
bch2_trans_init(&trans, c);
+ bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT);
}
bch2_trans_exit(&trans);
+ BUG_ON(ret == -EINTR);
return ret;
}
unsigned i;
percpu_down_write(&c->mark_lock);
- usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0],
- fs_usage_u64s(c));
+ usage = c->usage_base;
+
+ bch2_fs_usage_acc_to_base(c, 0);
+ bch2_fs_usage_acc_to_base(c, 1);
for (i = 0; i < BCH_REPLICAS_MAX; i++)
usage->reserved += usage->persistent_reserved[i];
return ret;
}
+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
+ unsigned journal_seq,
+ bool gc)
+{
+ return this_cpu_ptr(gc
+ ? c->usage_gc
+ : c->usage[journal_seq & 1]);
+}
+
+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
+{
+ ssize_t offset = v - (u64 *) c->usage_base;
+ unsigned seq;
+ u64 ret;
+
+ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
+ percpu_rwsem_assert_held(&c->mark_lock);
+
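+	/* sum the base counter plus both unflushed percpu accumulators: */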
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ ret = *v +
+ percpu_u64_get((u64 __percpu *) c->usage[0] + offset) +
+ percpu_u64_get((u64 __percpu *) c->usage[1] + offset);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
+
+ return ret;
+}
+
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
struct bch_fs_usage *ret;
- unsigned v, u64s = fs_usage_u64s(c);
+ unsigned seq, v, u64s = fs_usage_u64s(c);
retry:
- ret = kzalloc(u64s * sizeof(u64), GFP_NOFS);
+ ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
if (unlikely(!ret))
return NULL;
goto retry;
}
- acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ memcpy(ret, c->usage_base, u64s * sizeof(u64));
+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
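+/* fold the accumulator for journal seq parity @idx into the base usage counters: */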
+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
+{
+ unsigned u64s = fs_usage_u64s(c);
+
+ BUG_ON(idx >= 2);
+
+ write_seqcount_begin(&c->usage_lock);
+
+ acc_u64s_percpu((u64 *) c->usage_base,
+ (u64 __percpu *) c->usage[idx], u64s);
+ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
+
+ write_seqcount_end(&c->usage_lock);
+}
+
+void bch2_fs_usage_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_fs_usage *fs_usage)
+{
+ unsigned i;
+
+ pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
+
+ pr_buf(out, "hidden:\t\t\t\t%llu\n",
+ fs_usage->hidden);
+ pr_buf(out, "data:\t\t\t\t%llu\n",
+ fs_usage->data);
+ pr_buf(out, "cached:\t\t\t\t%llu\n",
+ fs_usage->cached);
+ pr_buf(out, "reserved:\t\t\t%llu\n",
+ fs_usage->reserved);
+ pr_buf(out, "nr_inodes:\t\t\t%llu\n",
+ fs_usage->nr_inodes);
+ pr_buf(out, "online reserved:\t\t%llu\n",
+ fs_usage->online_reserved);
+
+ for (i = 0;
+ i < ARRAY_SIZE(fs_usage->persistent_reserved);
+ i++) {
+ pr_buf(out, "%u replicas:\n", i + 1);
+ pr_buf(out, "\treserved:\t\t%llu\n",
+ fs_usage->persistent_reserved[i]);
+ }
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ pr_buf(out, "\t");
+ bch2_replicas_entry_to_text(out, e);
+ pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]);
+ }
+}
+
#define RESERVE_FACTOR 6
static u64 reserve_factor(u64 r)
u64 data, reserved;
ret.capacity = c->capacity -
- percpu_u64_get(&c->usage[0]->hidden);
+ bch2_fs_usage_read_one(c, &c->usage_base->hidden);
- data = percpu_u64_get(&c->usage[0]->data) +
- percpu_u64_get(&c->usage[0]->btree);
- reserved = percpu_u64_get(&c->usage[0]->reserved) +
- percpu_u64_get(&c->usage[0]->online_reserved);
+ data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
+ bch2_fs_usage_read_one(c, &c->usage_base->btree);
+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+ bch2_fs_usage_read_one(c, &c->usage_base->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
- ret.nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes);
+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
return ret;
}
int bch2_fs_usage_apply(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
- struct disk_reservation *disk_res)
+ struct disk_reservation *disk_res,
+ unsigned journal_seq)
{
s64 added = fs_usage->data + fs_usage->reserved;
s64 should_not_have_added;
}
preempt_disable();
- acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
+ acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false),
(u64 *) fs_usage, fs_usage_u64s(c));
preempt_enable();
{
struct bch_dev *ca;
struct bucket_mark old = { .v.counter = 0 };
- struct bch_fs_usage *fs_usage;
struct bucket_array *buckets;
struct bucket *g;
unsigned i;
int cpu;
- percpu_u64_set(&c->usage[0]->hidden, 0);
+ c->usage_base->hidden = 0;
for_each_member_device(ca, c, i) {
for_each_possible_cpu(cpu)
memset(per_cpu_ptr(ca->usage[0], cpu), 0,
sizeof(*ca->usage[0]));
- preempt_disable();
- fs_usage = this_cpu_ptr(c->usage[0]);
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
- bch2_dev_usage_update(c, ca, fs_usage,
+ bch2_dev_usage_update(c, ca, c->usage_base,
old, g->mark, false);
- preempt_enable();
}
}
size_t b, struct bucket_mark *ret,
bool gc)
{
- struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
+ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
size_t b, bool owned_by_allocator,
bool gc)
{
- struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
+ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
if (flags & BCH_BUCKET_MARK_GC)
return 0;
- u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = __bucket(ca, k.k->p.offset, gc);
- /*
- * this should currently only be getting called from the bucket
- * invalidate path:
- */
- BUG_ON(u.dirty_sectors);
- BUG_ON(u.cached_sectors);
- BUG_ON(!g->mark.owned_by_allocator);
+ if (k.k->p.offset >= ca->mi.nbuckets)
+ return 0;
+
+ g = __bucket(ca, k.k->p.offset, gc);
+ u = bch2_alloc_unpack(k);
old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
m.gen = u.gen;
m.data_type = u.data_type;
m.dirty_sectors = u.dirty_sectors;
m.cached_sectors = u.cached_sectors;
+
+ if (!(flags & BCH_BUCKET_MARK_GC)) {
+ m.journal_seq_valid = 1;
+ m.journal_seq = journal_seq;
+ }
}));
g->io_time[READ] = u.read_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
+	/*
+	 * XXX: we need to know whether we're being called from the invalidate
+	 * path or not:
+	 */
+
if (old.cached_sectors) {
update_cached_sectors(c, fs_usage, ca->dev_idx,
-old.cached_sectors);
old.dirty_sectors, sectors);
if (c)
- bch2_dev_usage_update(c, ca, this_cpu_ptr(c->usage[gc]),
+ bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
old, new, gc);
return 0;
}
}
-/*
- * Checking against gc's position has to be done here, inside the cmpxchg()
- * loop, to avoid racing with the start of gc clearing all the marks - GC does
- * that with the gc pos seqlock held.
- */
+static void bucket_set_stripe(struct bch_fs *c,
+ const struct bch_stripe *v,
+ bool enabled,
+ struct bch_fs_usage *fs_usage,
+ u64 journal_seq,
+ bool gc)
+{
+ unsigned i;
+
+ for (i = 0; i < v->nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = v->ptrs + i;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, gc);
+ struct bucket_mark new, old;
+
+ BUG_ON(ptr_stale(ca, ptr));
+
+ old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ new.dirty = true;
+ new.stripe = enabled;
+ if (journal_seq) {
+ new.journal_seq_valid = 1;
+ new.journal_seq = journal_seq;
+ }
+ }));
+ }
+}
+
static bool bch2_mark_pointer(struct bch_fs *c,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
{
struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- size_t b = PTR_BUCKET_NR(ca, &p.ptr);
- struct bucket *g = __bucket(ca, b, gc);
+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
bool overflow;
u64 v;
return 0;
}
-static void bucket_set_stripe(struct bch_fs *c,
- const struct bch_stripe *v,
- bool enabled,
- struct bch_fs_usage *fs_usage,
- u64 journal_seq,
- bool gc)
-{
- unsigned i;
-
- for (i = 0; i < v->nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = v->ptrs + i;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- size_t b = PTR_BUCKET_NR(ca, ptr);
- struct bucket *g = __bucket(ca, b, gc);
- struct bucket_mark new, old;
-
- BUG_ON(ptr_stale(ca, ptr));
-
- old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
- new.dirty = true;
- new.stripe = enabled;
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
- }));
- }
-}
-
static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
bool inserting,
struct bch_fs_usage *fs_usage,
m->nr_blocks = s.v->nr_blocks;
m->nr_redundant = s.v->nr_redundant;
- memset(&m->r, 0, sizeof(m->r));
-
- m->r.e.data_type = BCH_DATA_USER;
- m->r.e.nr_devs = s.v->nr_blocks;
- m->r.e.nr_required = s.v->nr_blocks - s.v->nr_redundant;
-
- for (i = 0; i < s.v->nr_blocks; i++)
- m->r.e.devs[i] = s.v->ptrs[i].dev;
+ bch2_bkey_to_replicas(&m->r.e, k);
/*
* XXX: account for stripes somehow here
preempt_disable();
if (!fs_usage || gc)
- fs_usage = this_cpu_ptr(c->usage[gc]);
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
switch (k.k->type) {
case KEY_TYPE_alloc:
return ret;
}
-void bch2_mark_update(struct btree_trans *trans,
- struct btree_insert_entry *insert,
- struct bch_fs_usage *fs_usage,
- unsigned flags)
+inline int bch2_mark_overwrite(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ struct bch_fs_usage *fs_usage,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = iter->l[0].b;
+ s64 sectors = 0;
+
+ if (btree_node_is_extents(b)
+ ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
+ : bkey_cmp(new->k.p, old.k->p))
+ return 0;
+
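+	/* how many sectors of @old does @new overwrite? */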
+ if (btree_node_is_extents(b)) {
+ switch (bch2_extent_overlap(&new->k, old.k)) {
+ case BCH_EXTENT_OVERLAP_ALL:
+ sectors = -((s64) old.k->size);
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ sectors = bkey_start_offset(&new->k) -
+ old.k->p.offset;
+ break;
+ case BCH_EXTENT_OVERLAP_FRONT:
+ sectors = bkey_start_offset(old.k) -
+ new->k.p.offset;
+ break;
+ case BCH_EXTENT_OVERLAP_MIDDLE:
+ sectors = old.k->p.offset - new->k.p.offset;
+ BUG_ON(sectors <= 0);
+
+ bch2_mark_key_locked(c, old, true, sectors,
+ fs_usage, trans->journal_res.seq,
+ flags);
+
+ sectors = bkey_start_offset(&new->k) -
+ old.k->p.offset;
+ break;
+ }
+
+ BUG_ON(sectors >= 0);
+ }
+
+ return bch2_mark_key_locked(c, old, false, sectors, fs_usage,
+ trans->journal_res.seq, flags) ?: 1;
+}
+
+int bch2_mark_update(struct btree_trans *trans,
+ struct btree_insert_entry *insert,
+ struct bch_fs_usage *fs_usage,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
struct btree *b = iter->l[0].b;
struct btree_node_iter node_iter = iter->l[0].iter;
struct bkey_packed *_k;
+ int ret = 0;
if (!btree_node_type_needs_gc(iter->btree_id))
- return;
+ return 0;
- if (!(trans->flags & BTREE_INSERT_NOMARK))
+ if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
bpos_min(insert->k->k.p, b->key.k.p).offset -
bkey_start_offset(&insert->k->k),
fs_usage, trans->journal_res.seq, flags);
+ if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
+ return 0;
+
+ /*
+ * For non extents, we only mark the new key, not the key being
+ * overwritten - unless we're actually deleting:
+ */
+ if ((iter->btree_id == BTREE_ID_ALLOC ||
+ iter->btree_id == BTREE_ID_EC) &&
+ !bkey_deleted(&insert->k->k))
+ return 0;
+
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
KEY_TYPE_discard))) {
struct bkey unpacked;
- struct bkey_s_c k;
- s64 sectors = 0;
-
- k = bkey_disassemble(b, _k, &unpacked);
+ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
- if (btree_node_is_extents(b)
- ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
- : bkey_cmp(insert->k->k.p, k.k->p))
+ ret = bch2_mark_overwrite(trans, iter, k, insert->k,
+ fs_usage, flags);
+ if (ret <= 0)
break;
- if (btree_node_is_extents(b)) {
- switch (bch2_extent_overlap(&insert->k->k, k.k)) {
- case BCH_EXTENT_OVERLAP_ALL:
- sectors = -((s64) k.k->size);
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- sectors = bkey_start_offset(&insert->k->k) -
- k.k->p.offset;
- break;
- case BCH_EXTENT_OVERLAP_FRONT:
- sectors = bkey_start_offset(k.k) -
- insert->k->k.p.offset;
- break;
- case BCH_EXTENT_OVERLAP_MIDDLE:
- sectors = k.k->p.offset - insert->k->k.p.offset;
- BUG_ON(sectors <= 0);
-
- bch2_mark_key_locked(c, k, true, sectors,
- fs_usage, trans->journal_res.seq,
- flags);
-
- sectors = bkey_start_offset(&insert->k->k) -
- k.k->p.offset;
- break;
- }
-
- BUG_ON(sectors >= 0);
- }
-
- bch2_mark_key_locked(c, k, false, sectors,
- fs_usage, trans->journal_res.seq, flags);
-
bch2_btree_node_iter_advance(&node_iter, b);
}
+
+ return ret;
}
void bch2_trans_fs_usage_apply(struct btree_trans *trans,
u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
char buf[200];
- if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res) ||
+ if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res,
+ trans->journal_res.seq) ||
warned_disk_usage ||
xchg(&warned_disk_usage, 1))
return;
}
}
+/* trans_mark: */
+
+static inline void update_replicas_list(struct replicas_delta_list *d,
+ struct bch_replicas_entry *r,
+ s64 sectors)
+{
+ d->top->delta = sectors;
+ memcpy(&d->top->r, r, replicas_entry_bytes(r));
+
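+	/* advance past the entry plus its 8 byte delta field: */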
+ d->top = (void *) d->top + replicas_entry_bytes(r) + 8;
+
+ BUG_ON((void *) d->top > (void *) d->d + sizeof(d->pad));
+}
+
+static inline void update_cached_sectors_list(struct replicas_delta_list *d,
+ unsigned dev, s64 sectors)
+{
+ struct bch_replicas_padded r;
+
+ bch2_replicas_entry_cached(&r.e, dev);
+
+ update_replicas_list(d, &r.e, sectors);
+}
+
+void bch2_replicas_delta_list_apply(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct replicas_delta_list *r)
+{
+ struct replicas_delta *d = r->d;
+
+ acc_u64s((u64 *) fs_usage,
+ (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64));
+
+ while (d != r->top) {
+ BUG_ON((void *) d > (void *) r->top);
+
+ update_replicas(c, fs_usage, &d->r, d->delta);
+
+ d = (void *) d + replicas_entry_bytes(&d->r) + 8;
+ }
+}
+
+static int trans_get_key(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos,
+ struct btree_insert_entry **insert,
+ struct btree_iter **iter,
+ struct bkey_s_c *k)
+{
+ unsigned i;
+ int ret;
+
+ *insert = NULL;
+
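+	/* check if the transaction already has an update for this key: */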
+ for (i = 0; i < trans->nr_updates; i++)
+ if (!trans->updates[i].deferred &&
+ trans->updates[i].iter->btree_id == btree_id &&
+ !bkey_cmp(pos, trans->updates[i].iter->pos)) {
+ *insert = &trans->updates[i];
+ *iter = (*insert)->iter;
+ *k = bkey_i_to_s_c((*insert)->k);
+ return 0;
+ }
+
+ *iter = __bch2_trans_get_iter(trans, btree_id, pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, 0);
+ if (IS_ERR(*iter))
+ return PTR_ERR(*iter);
+
+ *k = bch2_btree_iter_peek_slot(*iter);
+ ret = bkey_err(*k);
+ if (ret)
+ bch2_trans_iter_put(trans, *iter);
+ return ret;
+}
+
+static int trans_update_key(struct btree_trans *trans,
+ struct btree_insert_entry **insert,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ unsigned extra_u64s)
+{
+ struct bkey_i *new_k;
+
+ if (*insert)
+ return 0;
+
+ new_k = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+ extra_u64s * sizeof(u64));
+ if (IS_ERR(new_k))
+ return PTR_ERR(new_k);
+
+ *insert = bch2_trans_update(trans, ((struct btree_insert_entry) {
+ .iter = iter,
+ .k = new_k,
+ .triggered = true,
+ }));
+
+ bkey_reassemble((*insert)->k, k);
+ return 0;
+}
+
+static int bch2_trans_mark_pointer(struct btree_trans *trans,
+ struct extent_ptr_decoded p,
+ s64 sectors, enum bch_data_type data_type,
+ struct replicas_delta_list *d)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct btree_insert_entry *insert;
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked u;
+ struct bkey_i_alloc *a;
+ bool overflow;
+ int ret;
+
+ ret = trans_get_key(trans, BTREE_ID_ALLOC,
+ POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)),
+ &insert, &iter, &k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_alloc) {
+ bch_err_ratelimited(c, "pointer to nonexistent bucket %u:%zu",
+ p.ptr.dev,
+ PTR_BUCKET_NR(ca, &p.ptr));
+ ret = -1;
+ goto out;
+ }
+
+ u = bch2_alloc_unpack(k);
+
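+	/* the bucket gen having advanced means the pointer is stale: */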
+ if (gen_after(u.gen, p.ptr.gen)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (!p.ptr.cached)
+ overflow = checked_add(u.dirty_sectors, sectors);
+ else
+ overflow = checked_add(u.cached_sectors, sectors);
+
+ u.data_type = u.dirty_sectors || u.cached_sectors
+ ? data_type : 0;
+
+ bch2_fs_inconsistent_on(overflow, c,
+ "bucket sector count overflow: %u + %lli > U16_MAX",
+ !p.ptr.cached
+ ? u.dirty_sectors
+ : u.cached_sectors, sectors);
+
+ ret = trans_update_key(trans, &insert, iter, k, 1);
+ if (ret)
+ goto out;
+
+ a = bkey_alloc_init(insert->k);
+ a->k.p = iter->pos;
+ bch2_alloc_pack(a, u);
+out:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
+ struct bch_extent_stripe_ptr p,
+ s64 sectors, enum bch_data_type data_type,
+ struct replicas_delta_list *d)
+{
+ struct bch_replicas_padded r;
+ struct btree_insert_entry *insert;
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ struct bkey_s_stripe s;
+ unsigned nr_data;
+ s64 parity_sectors;
+ int ret = 0;
+
+ BUG_ON(!sectors);
+
+ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx),
+ &insert, &iter, &k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_stripe) {
+ bch_err_ratelimited(trans->c,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.idx);
+ ret = -1;
+ goto out;
+ }
+
+ ret = trans_update_key(trans, &insert, iter, k, 1);
+ if (ret)
+ goto out;
+
+ s = bkey_i_to_s_stripe(insert->k);
+
+ nr_data = s.v->nr_blocks - s.v->nr_redundant;
+
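+	/* charge a proportional share of the stripe's parity blocks: */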
+ parity_sectors = DIV_ROUND_UP(abs(sectors) * s.v->nr_redundant, nr_data);
+
+ if (sectors < 0)
+ parity_sectors = -parity_sectors;
+
+ stripe_blockcount_set(s.v, p.block,
+ stripe_blockcount_get(s.v, p.block) +
+ sectors + parity_sectors);
+
+ bch2_bkey_to_replicas(&r.e, s.s_c);
+
+ update_replicas_list(d, &r.e, sectors);
+out:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
+static int bch2_trans_mark_extent(struct btree_trans *trans,
+ struct bkey_s_c k,
+ s64 sectors, enum bch_data_type data_type,
+ struct replicas_delta_list *d)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_replicas_padded r;
+ s64 dirty_sectors = 0;
+ bool stale;
+ unsigned i;
+ int ret;
+
+ r.e.data_type = data_type;
+ r.e.nr_devs = 0;
+ r.e.nr_required = 1;
+
+ BUG_ON(!sectors);
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ s64 disk_sectors = data_type == BCH_DATA_BTREE
+ ? sectors
+ : ptr_disk_sectors_delta(p, sectors);
+
+ ret = bch2_trans_mark_pointer(trans, p, disk_sectors,
+ data_type, d);
+ if (ret < 0)
+ return ret;
+
+ stale = ret > 0;
+
+ if (p.ptr.cached) {
+ if (disk_sectors && !stale)
+ update_cached_sectors_list(d, p.ptr.dev,
+ disk_sectors);
+ } else if (!p.ec_nr) {
+ dirty_sectors += disk_sectors;
+ r.e.devs[r.e.nr_devs++] = p.ptr.dev;
+ } else {
+ for (i = 0; i < p.ec_nr; i++) {
+ ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i],
+ disk_sectors, data_type, d);
+ if (ret)
+ return ret;
+ }
+
+ r.e.nr_required = 0;
+ }
+ }
+
+ if (dirty_sectors)
+ update_replicas_list(d, &r.e, dirty_sectors);
+
+ return 0;
+}
+
+int bch2_trans_mark_key(struct btree_trans *trans,
+ struct bkey_s_c k,
+ bool inserting, s64 sectors,
+ struct replicas_delta_list *d)
+{
+ struct bch_fs *c = trans->c;
+
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ return bch2_trans_mark_extent(trans, k, inserting
+ ? c->opts.btree_node_size
+ : -c->opts.btree_node_size,
+ BCH_DATA_BTREE, d);
+ case KEY_TYPE_extent:
+ return bch2_trans_mark_extent(trans, k,
+ sectors, BCH_DATA_USER, d);
+ case KEY_TYPE_inode:
+ if (inserting)
+ d->fs_usage.nr_inodes++;
+ else
+ d->fs_usage.nr_inodes--;
+ return 0;
+ case KEY_TYPE_reservation: {
+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+
+ sectors *= replicas;
+ replicas = clamp_t(unsigned, replicas, 1,
+ ARRAY_SIZE(d->fs_usage.persistent_reserved));
+
+ d->fs_usage.reserved += sectors;
+ d->fs_usage.persistent_reserved[replicas - 1] += sectors;
+ return 0;
+ }
+ default:
+ return 0;
+ }
+}
+
+int bch2_trans_mark_update(struct btree_trans *trans,
+ struct btree_insert_entry *insert,
+ struct replicas_delta_list *d)
+{
+ struct btree_iter *iter = insert->iter;
+ struct btree *b = iter->l[0].b;
+ struct btree_node_iter node_iter = iter->l[0].iter;
+ struct bkey_packed *_k;
+ int ret;
+
+ if (!btree_node_type_needs_gc(iter->btree_id))
+ return 0;
+
+ ret = bch2_trans_mark_key(trans,
+ bkey_i_to_s_c(insert->k), true,
+ bpos_min(insert->k->k.p, b->key.k.p).offset -
+ bkey_start_offset(&insert->k->k), d);
+ if (ret)
+ return ret;
+
+ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+ KEY_TYPE_discard))) {
+ struct bkey unpacked;
+ struct bkey_s_c k;
+ s64 sectors = 0;
+
+ k = bkey_disassemble(b, _k, &unpacked);
+
+ if (btree_node_is_extents(b)
+ ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
+ : bkey_cmp(insert->k->k.p, k.k->p))
+ break;
+
+ if (btree_node_is_extents(b)) {
+ switch (bch2_extent_overlap(&insert->k->k, k.k)) {
+ case BCH_EXTENT_OVERLAP_ALL:
+ sectors = -((s64) k.k->size);
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ sectors = bkey_start_offset(&insert->k->k) -
+ k.k->p.offset;
+ break;
+ case BCH_EXTENT_OVERLAP_FRONT:
+ sectors = bkey_start_offset(k.k) -
+ insert->k->k.p.offset;
+ break;
+ case BCH_EXTENT_OVERLAP_MIDDLE:
+ sectors = k.k->p.offset - insert->k->k.p.offset;
+ BUG_ON(sectors <= 0);
+
+ ret = bch2_trans_mark_key(trans, k, true,
+ sectors, d);
+ if (ret)
+ return ret;
+
+ sectors = bkey_start_offset(&insert->k->k) -
+ k.k->p.offset;
+ break;
+ }
+
+ BUG_ON(sectors >= 0);
+ }
+
+ ret = bch2_trans_mark_key(trans, k, false, sectors, d);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_iter_advance(&node_iter, b);
+ }
+
+ return 0;
+}
+
/* Disk reservations: */
static u64 bch2_recalc_sectors_available(struct bch_fs *c)
struct bucket_mark m;
rcu_read_lock();
- m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark);
+ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark);
rcu_read_unlock();
return m;
void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);
+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
+
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
+
+void bch2_fs_usage_to_text(struct printbuf *,
+ struct bch_fs *, struct bch_fs_usage *);
+
u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *);
struct bch_fs_usage_short
bool, s64, struct bch_fs_usage *,
u64, unsigned);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
- struct disk_reservation *);
-
-void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
- struct bch_fs_usage *, unsigned);
+ struct disk_reservation *, unsigned);
+
+int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
+ struct bkey_s_c, struct bkey_i *,
+ struct bch_fs_usage *, unsigned);
+int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
+ struct bch_fs_usage *, unsigned);
+
+void bch2_replicas_delta_list_apply(struct bch_fs *,
+ struct bch_fs_usage *,
+ struct replicas_delta_list *);
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
+ bool, s64, struct replicas_delta_list *);
+int bch2_trans_mark_update(struct btree_trans *,
+ struct btree_insert_entry *,
+ struct replicas_delta_list *);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
/* disk reservations: */
u64 nr_inodes;
};
+struct replicas_delta {
+ s64 delta;
+ struct bch_replicas_entry r;
+};
+
+struct replicas_delta_list {
+ struct bch_fs_usage fs_usage;
+
+ struct replicas_delta *top;
+ struct replicas_delta d[0];
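+	/* pad reserves the space the d[] entries actually occupy: */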
+ u8 pad[256];
+};
+
/*
* A reservation for space on disk:
*/
if (arg.flags || arg.pad)
return -EINVAL;
- return bch2_fs_start(c) ? -EIO : 0;
+ return bch2_fs_start(c);
}
static long bch2_ioctl_stop(struct bch_fs *c)
{
struct btree_iter *iter;
struct bkey_s_c k;
- int ret = 0;
-
- iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS,
- POS(dir_inum, 0), 0);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ int ret;
- for_each_btree_key_continue(iter, 0, k) {
+ for_each_btree_key(trans, iter, BTREE_ID_DIRENTS,
+ POS(dir_inum, 0), 0, k, ret) {
if (k.k->p.inode > dir_inum)
break;
struct bkey_s_c k;
struct bkey_s_c_dirent dirent;
unsigned len;
+ int ret;
if (!dir_emit_dots(file, ctx))
return 0;
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
- POS(inode->v.i_ino, ctx->pos), 0, k) {
+ POS(inode->v.i_ino, ctx->pos), 0, k, ret) {
if (k.k->type != KEY_TYPE_dirent)
continue;
ctx->pos = k.k->p.offset + 1;
}
- bch2_trans_exit(&trans);
+ ret = bch2_trans_exit(&trans) ?: ret;
- return 0;
+ return ret;
}
#include "ec.h"
#include "error.h"
#include "io.h"
-#include "journal_io.h"
#include "keylist.h"
+#include "recovery.h"
#include "super-io.h"
#include "util.h"
struct btree_iter *iter)
{
size_t idx = iter->pos.offset;
+ int ret = 0;
if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT))
- return 0;
+ return ret;
bch2_btree_trans_unlock(iter->trans);
+ ret = -EINTR;
if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
- return -EINTR;
+ return ret;
+
return -ENOMEM;
}
bch2_trans_begin(&trans);
/* XXX: start pos hint */
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
break;
goto found_slot;
}
- ret = -ENOSPC;
- goto out;
+ if (!ret)
+ ret = -ENOSPC;
+ goto err;
found_slot:
ret = ec_stripe_mem_alloc(c, iter);
-
- if (ret == -EINTR)
- goto retry;
if (ret)
- return ret;
+ goto err;
stripe->k.p = iter->pos;
bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i));
ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
-out:
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL);
+err:
+ if (ret == -EINTR)
+ goto retry;
bch2_trans_exit(&trans);
return ret;
int ret = 0, dev, idx;
bch2_trans_init(&trans, c);
+ bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
bkey_start_pos(pos),
unsigned l = *((const unsigned *) _l);
unsigned r = *((const unsigned *) _r);
- return (l > r) - (l < r);
+ return cmp_int(l, r);
}
/* pick most common bucket size: */
BTREE_INSERT_NOFAIL|flags);
}
-int bch2_stripes_write(struct bch_fs *c, bool *wrote)
+int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
{
struct btree_trans trans;
struct btree_iter *iter;
continue;
ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos,
- new_key, BTREE_INSERT_NOCHECK_RW);
+ new_key, flags);
if (ret)
break;
return ret;
}
-static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
-{
- bch2_mark_key(c, k, true, 0, NULL, 0, 0);
-}
-
-int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)
+int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
- struct journal_replay *r;
+ struct journal_key *i;
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
bch2_trans_init(&trans, c);
- for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k) {
- bch2_stripe_read_key(c, k);
- bch2_trans_cond_resched(&trans);
- }
+ for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret)
+ bch2_mark_key(c, k, true, 0, NULL, 0, 0);
- ret = bch2_trans_exit(&trans);
- if (ret)
+ ret = bch2_trans_exit(&trans) ?: ret;
+ if (ret) {
+ bch_err(c, "error reading stripes: %i", ret);
return ret;
-
- list_for_each_entry(r, journal_replay_list, list) {
- struct bkey_i *k, *n;
- struct jset_entry *entry;
-
- for_each_jset_key(k, n, entry, &r->j)
- if (entry->btree_id == BTREE_ID_EC)
- bch2_stripe_read_key(c, bkey_i_to_s_c(k));
}
+ for_each_journal_key(*journal_keys, i)
+ if (i->btree_id == BTREE_ID_EC)
+ bch2_mark_key(c, bkey_i_to_s_c(i->k),
+ true, 0, NULL, 0, 0);
+
return 0;
}
void bch2_ec_flush_new_stripes(struct bch_fs *);
-int bch2_stripes_read(struct bch_fs *, struct list_head *);
-int bch2_stripes_write(struct bch_fs *, bool *);
+struct journal_keys;
+int bch2_stripes_read(struct bch_fs *, struct journal_keys *);
+int bch2_stripes_write(struct bch_fs *, unsigned, bool *);
int bch2_ec_mem_alloc(struct bch_fs *, bool);
bch2_btree_iter_verify(iter, l->b);
}
+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ unsigned ret = 0;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ ret++;
+ }
+ }
+
+ return ret;
+}
+
static inline struct bpos
-bch2_extent_atomic_end(struct bkey_i *k, struct btree_iter *iter)
+bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter)
{
struct btree *b = iter->l[0].b;
+ struct btree_node_iter node_iter = iter->l[0].iter;
+ struct bkey_packed *_k;
+ unsigned nr_alloc_ptrs =
+ bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert));
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
- BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0);
+ BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
+
+ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+ KEY_TYPE_discard))) {
+ struct bkey unpacked;
+ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+
+ if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0)
+ break;
+
+ nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k);
+
+ if (nr_alloc_ptrs > 20) {
+ BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0);
+ return bpos_min(insert->k.p, k.k->p);
+ }
+
+ bch2_btree_node_iter_advance(&node_iter, b);
+ }
- return bpos_min(k->k.p, b->key.k.p);
+ return bpos_min(insert->k.p, b->key.k.p);
}
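
bch2_extent_atomic_end() used to clamp an insert only to the btree node boundary; it now also walks the existing keys the insert overlaps, totals their data and stripe pointers with bch2_bkey_nr_alloc_ptrs() (starting from the insert's own count), and trims the insert at the first key that pushes the running total past 20, a heuristic cap introduced by this patch, so one transaction commit never has to update an unbounded number of alloc keys. A standalone sketch of the trimming logic with made-up types:

#include <stdio.h>

struct item { unsigned start, end, nr_ptrs; };

/*
 * Trim [0, end) so that the summed pointer count of overlapped items
 * stays under cap -- the shape of the loop above.
 */
static unsigned trim_end(unsigned end, const struct item *v,
			 unsigned nr, unsigned cap)
{
	unsigned total = 0;

	for (unsigned i = 0; i < nr; i++) {
		if (v[i].start >= end)
			break;

		total += v[i].nr_ptrs;
		if (total > cap)
			return v[i].end < end ? v[i].end : end;
	}

	return end;
}

int main(void)
{
	struct item v[] = { { 0, 8, 12 }, { 8, 16, 12 }, { 16, 24, 12 } };

	/* cap of 20 is exceeded at the second item: insert ends at 16 */
	printf("trimmed end: %u\n", trim_end(100, v, 3, 20));
	return 0;
}
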
void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
struct bpos end = pos;
struct bkey_s_c k;
bool ret = true;
+ int err;
end.offset += size;
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos,
- BTREE_ITER_SLOTS, k) {
+ BTREE_ITER_SLOTS, k, err) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
bch2_trans_init(&trans, c);
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
- POS(inode->v.i_ino, offset >> 9), 0, k) {
+ POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
break;
} else if (bkey_extent_is_data(k.k)) {
break;
}
- ret = bch2_trans_exit(&trans);
+ ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
return ret;
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
POS(inode->v.i_ino, offset >> 9),
- BTREE_ITER_SLOTS, k) {
+ BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_next_pagecache_hole(&inode->v,
offset, MAX_LFS_FILESIZE);
}
}
- ret = bch2_trans_exit(&trans);
+ ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
return ret;
down_write(&sb->s_umount);
sb->s_flags |= SB_RDONLY;
- bch2_fs_emergency_read_only(c);
+ if (bch2_fs_emergency_read_only(c))
+ bch_err(c, "emergency read only due to ioctl");
up_write(&sb->s_umount);
return 0;
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
- POS(ei->v.i_ino, start >> 9), 0, k)
+ POS(ei->v.i_ino, start >> 9), 0, k, ret)
if (bkey_extent_is_data(k.k) ||
k.k->type == KEY_TYPE_reservation) {
if (bkey_cmp(bkey_start_pos(k.k),
if (have_extent) {
ret = bch2_fill_extent(info, &tmp.k, 0);
if (ret)
- goto out;
+ break;
}
bkey_reassemble(&tmp.k, k);
have_extent = true;
}
- if (have_extent)
+ if (!ret && have_extent)
ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
-out:
- bch2_trans_exit(&trans);
+
+ ret = bch2_trans_exit(&trans) ?: ret;
return ret < 0 ? ret : 0;
}
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
if (IS_ERR(vinode)) {
+ bch_err(c, "error mounting: error getting root inode %i",
+ (int) PTR_ERR(vinode));
ret = PTR_ERR(vinode);
goto err_put_super;
}
sb->s_root = d_make_root(vinode);
if (!sb->s_root) {
+ bch_err(c, "error mounting: error allocating root dentry");
ret = -ENOMEM;
goto err_put_super;
}
static inline int ptrcmp(void *l, void *r)
{
- return (l > r) - (l < r);
+ return cmp_int(l, r);
}
#define __bch2_lock_inodes(_lock, ...) \
struct btree_iter *iter;
struct bkey_s_c k;
u64 sectors = 0;
+ int ret;
- for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) {
+ for_each_btree_key(trans, iter, BTREE_ID_EXTENTS,
+ POS(inum, 0), 0, k, ret) {
if (k.k->p.inode != inum)
break;
sectors += k.k->size;
}
- return bch2_trans_iter_free(trans, iter) ?: sectors;
+ bch2_trans_iter_free(trans, iter);
+
+ return ret ?: sectors;
}
static int remove_dirent(struct btree_trans *trans,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
if (ret) {
- bch_err(c, "error in fs gc: error %i "
- "updating inode", ret);
+ bch_err(c, "error in fsck: error %i updating inode", ret);
goto err;
}
goto up;
for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
- POS(e->inum, e->offset + 1), 0, k) {
+ POS(e->inum, e->offset + 1), 0, k, ret) {
if (k.k->p.inode != e->inum)
break;
}
goto next;
}
- ret = bch2_trans_iter_free(&trans, iter);
+ ret = bch2_trans_iter_free(&trans, iter) ?: ret;
if (ret) {
bch_err(c, "btree error %i in fsck", ret);
goto err;
link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
if (!link) {
- bch_verbose(c, "allocation failed during fs gc - will need another pass");
+ bch_verbose(c, "allocation failed during fsck - will need another pass");
*range_end = inum;
return;
}
inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false);
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) {
switch (k.k->type) {
case KEY_TYPE_dirent:
d = bkey_s_c_to_dirent(k);
bch2_trans_cond_resched(&trans);
}
- ret = bch2_trans_exit(&trans);
+ ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
- bch_err(c, "error in fs gc: btree error %i while walking dirents", ret);
+ bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
return ret;
}
ret = bch2_inode_rm(c, u.bi_inum);
if (ret)
- bch_err(c, "error in fs gc: error %i "
- "while deleting inode", ret);
+ bch_err(c, "error in fsck: error %i while deleting inode", ret);
return ret;
}
ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size);
if (ret) {
- bch_err(c, "error in fs gc: error %i "
- "truncating inode", ret);
+ bch_err(c, "error in fsck: error %i truncating inode", ret);
return ret;
}
sectors = bch2_count_inode_sectors(trans, u.bi_inum);
if (sectors < 0) {
- bch_err(c, "error in fs gc: error %i "
- "recounting inode sectors",
+ bch_err(c, "error in fsck: error %i recounting inode sectors",
(int) sectors);
return sectors;
}
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
if (ret && ret != -EINTR)
- bch_err(c, "error in fs gc: error %i "
+ bch_err(c, "error in fsck: error %i "
"updating inode", ret);
}
fsck_err:
bch2_trans_exit(&trans);
if (ret2)
- bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2);
+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret2);
return ret ?: ret2;
}
return ret;
}
-noinline_for_stack
-static int check_inodes_fast(struct bch_fs *c)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- struct bkey_s_c_inode inode;
- int ret = 0, ret2;
-
- bch2_trans_init(&trans, c);
- bch2_trans_preload_iters(&trans);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES,
- POS_MIN, 0);
-
- for_each_btree_key_continue(iter, 0, k) {
- if (k.k->type != KEY_TYPE_inode)
- continue;
-
- inode = bkey_s_c_to_inode(k);
-
- if (inode.v->bi_flags &
- (BCH_INODE_I_SIZE_DIRTY|
- BCH_INODE_I_SECTORS_DIRTY|
- BCH_INODE_UNLINKED)) {
- ret = check_inode(&trans, NULL, iter, inode, NULL);
- BUG_ON(ret == -EINTR);
- if (ret)
- break;
- }
- }
-
- ret2 = bch2_trans_exit(&trans);
-
- return ret ?: ret2;
-}
-
/*
* Checks for inconsistencies that shouldn't happen, unless we have a bug.
* Doesn't fix them yet, mainly because they haven't yet been observed:
*/
-static int bch2_fsck_full(struct bch_fs *c)
+int bch2_fsck_full(struct bch_fs *c)
{
struct bch_inode_unpacked root_inode, lostfound_inode;
- int ret;
- bch_verbose(c, "starting fsck:");
- ret = check_extents(c) ?:
+ return check_extents(c) ?:
check_dirents(c) ?:
check_xattrs(c) ?:
check_root(c, &root_inode) ?:
check_lostfound(c, &root_inode, &lostfound_inode) ?:
check_directory_structure(c, &lostfound_inode) ?:
check_inode_nlinks(c, &lostfound_inode);
-
- bch2_flush_fsck_errs(c);
- bch_verbose(c, "fsck done");
-
- return ret;
}
-static int bch2_fsck_inode_nlink(struct bch_fs *c)
+int bch2_fsck_inode_nlink(struct bch_fs *c)
{
struct bch_inode_unpacked root_inode, lostfound_inode;
- int ret;
- bch_verbose(c, "checking inode link counts:");
- ret = check_root(c, &root_inode) ?:
+ return check_root(c, &root_inode) ?:
check_lostfound(c, &root_inode, &lostfound_inode) ?:
check_inode_nlinks(c, &lostfound_inode);
-
- bch2_flush_fsck_errs(c);
- bch_verbose(c, "done");
-
- return ret;
}
-static int bch2_fsck_walk_inodes_only(struct bch_fs *c)
+int bch2_fsck_walk_inodes_only(struct bch_fs *c)
{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_inode inode;
int ret;
- bch_verbose(c, "walking inodes:");
- ret = check_inodes_fast(c);
-
- bch2_flush_fsck_errs(c);
- bch_verbose(c, "done");
+ bch2_trans_init(&trans, c);
+ bch2_trans_preload_iters(&trans);
- return ret;
-}
+ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_inode)
+ continue;
-int bch2_fsck(struct bch_fs *c)
-{
- if (c->opts.fsck)
- return bch2_fsck_full(c);
+ inode = bkey_s_c_to_inode(k);
- if (c->sb.clean)
- return 0;
+ if (inode.v->bi_flags &
+ (BCH_INODE_I_SIZE_DIRTY|
+ BCH_INODE_I_SECTORS_DIRTY|
+ BCH_INODE_UNLINKED)) {
+ ret = check_inode(&trans, NULL, iter, inode, NULL);
+ BUG_ON(ret == -EINTR);
+ if (ret)
+ break;
+ }
+ }
+ BUG_ON(ret == -EINTR);
- return c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK)
- ? bch2_fsck_walk_inodes_only(c)
- : bch2_fsck_inode_nlink(c);
+ return bch2_trans_exit(&trans) ?: ret;
}
#ifndef _BCACHEFS_FSCK_H
#define _BCACHEFS_FSCK_H
-int bch2_fsck(struct bch_fs *);
+int bch2_fsck_full(struct bch_fs *);
+int bch2_fsck_inode_nlink(struct bch_fs *);
+int bch2_fsck_walk_inodes_only(struct bch_fs *);
#endif /* _BCACHEFS_FSCK_H */
retry:
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_SLOTS, k) {
+ BTREE_ITER_SLOTS, k, ret) {
BKEY_PADDED(k) tmp;
unsigned bytes;
* If we get here, it better have been because there was an error
* reading a btree node
*/
- BUG_ON(!btree_iter_err(iter));
- __bcache_io_error(c, "btree IO error");
+ BUG_ON(!ret);
+ __bcache_io_error(c, "btree IO error: %i", ret);
err:
rbio->bio.bi_status = BLK_STS_IOERR;
out:
unsigned flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED;
+ int ret;
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
POS(inode, rbio->bio.bi_iter.bi_sector),
- BTREE_ITER_SLOTS, k) {
+ BTREE_ITER_SLOTS, k, ret) {
BKEY_PADDED(k) tmp;
unsigned bytes;
* If we get here, it better have been because there was an error
* reading a btree node
*/
- BUG_ON(!btree_iter_err(iter));
- bcache_io_error(c, &rbio->bio, "btree IO error");
+ BUG_ON(!ret);
+ bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
bch2_trans_exit(&trans);
bch2_rbio_done(rbio);
buf->data->u64s = 0;
}
-static inline bool journal_entry_empty(struct jset *j)
-{
- struct jset_entry *i;
-
- if (j->seq != j->last_seq)
- return false;
-
- vstruct_for_each(j, i)
- if (i->type || i->u64s)
- return false;
- return true;
-}
-
void bch2_journal_halt(struct journal *j)
{
union journal_res_state old, new;
u64 last_seq = cur_seq, nr, seq;
if (!list_empty(journal_entries))
- last_seq = le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay,
- list)->j.last_seq);
+ last_seq = le64_to_cpu(list_first_entry(journal_entries,
+ struct journal_replay,
+ list)->j.seq);
nr = cur_seq - last_seq;
}
}
+ j->replay_journal_seq = last_seq;
+ j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
j->pin.front = last_seq;
j->pin.back = cur_seq;
fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 0);
+ atomic_set(&p->count, 1);
p->devs.nr = 0;
}
BUG_ON(seq < last_seq || seq >= cur_seq);
- p = journal_seq_pin(j, seq);
-
- atomic_set(&p->count, 1);
- p->devs = i->devs;
+ journal_seq_pin(j, seq)->devs = i->devs;
}
spin_lock(&j->lock);
id, 0, k, k->k.u64s);
}
+static inline bool journal_entry_empty(struct jset *j)
+{
+ struct jset_entry *i;
+
+ if (j->seq != j->last_seq)
+ return false;
+
+ vstruct_for_each(j, i)
+ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
+ return false;
+ return true;
+}
+
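Note that the relocated journal_entry_empty() also changes meaning: a jset now only counts as non-empty if it contains an actual btree_keys entry with keys in it, so entries carrying nothing but usage/accounting or btree_root payloads no longer trip the "filesystem marked clean but journal not empty" check — which is what lets the open-coded journal_empty() scan in recovery (removed further down) collapse to a one-liner.
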
void __bch2_journal_buf_put(struct journal *, bool);
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
#include "bcachefs.h"
-#include "alloc_background.h"
#include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "error.h"
goto out;
}
-void bch2_journal_entries_free(struct list_head *list)
-{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
-}
-
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal_list jlist;
return ret;
}
-/* journal replay: */
-
-static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- /*
- * We might cause compressed extents to be
- * split, so we need to pass in a
- * disk_reservation:
- */
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- BKEY_PADDED(k) split;
- int ret;
-
- bch2_trans_init(&trans, c);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
- do {
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- break;
-
- bkey_copy(&split.k, k);
- bch2_cut_front(iter->pos, &split.k);
- bch2_extent_trim_atomic(&split.k, iter);
-
- ret = bch2_disk_reservation_add(c, &disk_res,
- split.k.k.size *
- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
- BCH_DISK_RESERVATION_NOFAIL);
- BUG_ON(ret);
-
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
- ret = bch2_trans_commit(&trans, &disk_res, NULL,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY);
- } while ((!ret || ret == -EINTR) &&
- bkey_cmp(k->k.p, iter->pos));
-
- bch2_disk_reservation_put(c, &disk_res);
-
- /*
- * This isn't strictly correct - we should only be relying on the btree
- * node lock for synchronization with gc when we've got a write lock
- * held.
- *
- * but - there are other correctness issues if btree gc were to run
- * before journal replay finishes
- */
- BUG_ON(c->gc_pos.phase);
-
- bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
- NULL, 0, 0);
- bch2_trans_exit(&trans);
-
- return ret;
-}
-
-int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct bkey_i *k, *_n;
- struct jset_entry *entry;
- struct journal_replay *i, *n;
- int ret = 0;
-
- list_for_each_entry_safe(i, n, list, list) {
- j->replay_journal_seq = le64_to_cpu(i->j.seq);
-
- for_each_jset_key(k, _n, entry, &i->j) {
- switch (entry->btree_id) {
- case BTREE_ID_ALLOC:
- ret = bch2_alloc_replay_key(c, k);
- break;
- case BTREE_ID_EXTENTS:
- ret = bch2_extent_replay_key(c, k);
- break;
- default:
- ret = bch2_btree_insert(c, entry->btree_id, k,
- NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY|
- BTREE_INSERT_NOMARK);
- break;
- }
-
- if (ret) {
- bch_err(c, "journal replay: error %d while replaying key",
- ret);
- goto err;
- }
-
- cond_resched();
- }
-
- bch2_journal_pin_put(j, j->replay_journal_seq);
- }
-
- j->replay_journal_seq = 0;
-
- bch2_journal_set_replay_done(j);
- bch2_journal_flush_all_pins(j);
- ret = bch2_journal_error(j);
-err:
- bch2_journal_entries_free(list);
- return ret;
-}
-
/* journal write: */
static void __journal_write_alloc(struct journal *j,
return;
err:
bch2_fatal_error(c);
- bch2_journal_halt(j);
spin_lock(&j->lock);
goto out;
}
j->write_start_time = local_clock();
start = vstruct_last(jset);
- end = bch2_journal_super_entries_add_common(c, start);
+ end = bch2_journal_super_entries_add_common(c, start,
+ le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
spin_unlock(&j->lock);
if (ret) {
- bch2_journal_halt(j);
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
vstruct_for_each_safe(entry, k, _n)
int bch2_journal_read(struct bch_fs *, struct list_head *);
-void bch2_journal_entries_free(struct list_head *);
-int bch2_journal_replay(struct bch_fs *, struct list_head *);
void bch2_journal_write(struct closure *);
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
- return (l->start > r->start) - (l->start < r->start);
+ return cmp_int(l->start, r->start);
}
bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
} pin;
u64 replay_journal_seq;
+ u64 replay_journal_seq_end;
struct write_point wp;
spinlock_t err_lock;
int ret = 0;
bch2_trans_init(&trans, c);
+ bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
POS_MIN, BTREE_ITER_PREFETCH);
break;
}
+ BUG_ON(ret == -EINTR);
+
bch2_trans_exit(&trans);
bch2_replicas_gc_end(c, ret);
int ret = 0;
bch2_trans_init(&trans, c);
+ bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
bkey_start_pos(&bch2_keylist_front(keys)->k),
}
out:
bch2_trans_exit(&trans);
+ BUG_ON(ret == -EINTR);
return ret;
}
bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_PREFETCH, k) {
+ BTREE_ITER_PREFETCH, k, ret) {
ret = bch2_mark_bkey_replicas(c, k);
if (ret)
break;
struct copygc_heap_entry l,
struct copygc_heap_entry r)
{
- return (l.sectors > r.sectors) - (l.sectors < r.sectors);
+ return cmp_int(l.sectors, r.sectors);
}
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
const struct copygc_heap_entry *l = _l;
const struct copygc_heap_entry *r = _r;
- return (l->offset > r->offset) - (l->offset < r->offset);
+ return cmp_int(l->offset, r->offset);
}
static bool __copygc_pred(struct bch_dev *ca,
spin_lock(&ca->freelist_lock);
ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
- ca->allocator_blocked;
+ ca->allocator_state != ALLOCATOR_RUNNING;
spin_unlock(&ca->freelist_lock);
return ret;
NO_SB_OPT, false, \
NULL, "Super read only mode - no writes at all will be issued,\n"\
"even if we have to replay the journal") \
- x(noreplay, u8, \
- OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false, \
- NULL, "Don't replay the journal (only for internal tools)")\
x(norecovery, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
- NULL, NULL) \
+ NULL, "Don't replay the journal") \
x(noexcl, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0),
- BTREE_ITER_PREFETCH, k) {
+ BTREE_ITER_PREFETCH, k, ret) {
if (k.k->p.inode != type)
break;
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN,
- BTREE_ITER_PREFETCH, k) {
+ BTREE_ITER_PREFETCH, k, ret) {
switch (k.k->type) {
case KEY_TYPE_inode:
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
+#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
+#include <linux/sort.h>
#include <linux/stat.h>
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static struct bkey_i *btree_root_find(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct jset *j,
- enum btree_id id, unsigned *level)
+/* sort and dedup all keys in the journal: */
+
+static void journal_entries_free(struct list_head *list)
{
- struct bkey_i *k;
- struct jset_entry *entry, *start, *end;
- if (clean) {
- start = clean->start;
- end = vstruct_end(&clean->field);
- } else {
- start = j->start;
- end = vstruct_last(j);
+ while (!list_empty(list)) {
+ struct journal_replay *i =
+ list_first_entry(list, struct journal_replay, list);
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
}
+}
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root &&
- entry->btree_id == id)
- goto found;
-
- return NULL;
-found:
- if (!entry->u64s)
- return ERR_PTR(-EINVAL);
+static int journal_sort_key_cmp(const void *_l, const void *_r)
+{
+ const struct journal_key *l = _l;
+ const struct journal_key *r = _r;
- k = entry->start;
- *level = entry->level;
- return k;
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ bkey_cmp(l->pos, r->pos) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset);
}
-static int verify_superblock_clean(struct bch_fs *c,
- struct bch_sb_field_clean **cleanp,
- struct jset *j)
+static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
- unsigned i;
- struct bch_sb_field_clean *clean = *cleanp;
- int ret = 0;
+ const struct journal_key *l = _l;
+ const struct journal_key *r = _r;
- if (!clean || !j)
- return 0;
+ return cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->btree_id, r->btree_id) ?:
+ bkey_cmp(l->pos, r->pos);
+}
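
Recovery now sorts the same flat key array twice: journal_sort_key_cmp() orders by (btree id, start position, journal seq, offset) so every version of a key lands adjacent for the dedup pass, and journal_sort_seq_cmp() then re-orders by sequence number for replay, so journal pins can be dropped strictly in order. Both chain comparisons with the GNU `?:` extension (`a ?: b` evaluates to `a` unless it is zero). A standalone sketch of the two passes:

#include <stdio.h>
#include <stdlib.h>

#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct key { int btree_id, pos, seq; };

static int by_id_pos_seq(const void *_l, const void *_r)
{
	const struct key *l = _l, *r = _r;

	/* GNU ?: extension, as in the kernel code above */
	return cmp_int(l->btree_id, r->btree_id) ?:
		cmp_int(l->pos, r->pos) ?:
		cmp_int(l->seq, r->seq);
}

static int by_seq_id_pos(const void *_l, const void *_r)
{
	const struct key *l = _l, *r = _r;

	return cmp_int(l->seq, r->seq) ?:
		cmp_int(l->btree_id, r->btree_id) ?:
		cmp_int(l->pos, r->pos);
}

int main(void)
{
	struct key keys[] = {
		{ 1, 10, 7 }, { 0, 10, 9 }, { 1, 10, 3 }, { 0, 4, 9 },
	};

	/* pass 1: versions of the same key become adjacent, oldest first */
	qsort(keys, 4, sizeof(keys[0]), by_id_pos_seq);
	/* (the dedup pass would run here) */
	/* pass 2: replay order */
	qsort(keys, 4, sizeof(keys[0]), by_seq_id_pos);

	for (int i = 0; i < 4; i++)
		printf("id %d pos %d seq %d\n",
		       keys[i].btree_id, keys[i].pos, keys[i].seq);
	return 0;
}
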
- if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
- "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
- le64_to_cpu(clean->journal_seq),
- le64_to_cpu(j->seq))) {
- kfree(clean);
- *cleanp = NULL;
- return 0;
+static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i)
+{
+ while (i + 1 < keys->d + keys->nr &&
+ journal_sort_key_cmp(i, i + 1) > 0) {
+ swap(i[0], i[1]);
+ i++;
}
+}
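
journal_keys_sift() is one pass of insertion sort, needed because the dedup loop edits keys in place: bch2_cut_front() advances a key's start position, which can leave that single entry out of order with respect to its right neighbours. The same operation on a plain array:

#include <stdio.h>

/* bubble v[i] rightward until the array is sorted again */
static void sift_right(int *v, int nr, int i)
{
	for (; i + 1 < nr && v[i] > v[i + 1]; i++) {
		int tmp = v[i];

		v[i] = v[i + 1];
		v[i + 1] = tmp;
	}
}

int main(void)
{
	int v[] = { 1, 9, 3, 5, 7 };	/* v[1] was just modified in place */

	sift_right(v, 5, 1);
	for (int i = 0; i < 5; i++)
		printf("%d ", v[i]);	/* 1 3 5 7 9 */
	printf("\n");
	return 0;
}
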
- mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
- "superblock read clock doesn't match journal after clean shutdown");
- mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
- "superblock read clock doesn't match journal after clean shutdown");
+static void journal_keys_free(struct journal_keys *keys)
+{
+ struct journal_key *i;
+
+ for_each_journal_key(*keys, i)
+ if (i->allocated)
+ kfree(i->k);
+ kvfree(keys->d);
+ keys->d = NULL;
+ keys->nr = 0;
+}
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct bkey_i *k1, *k2;
- unsigned l1 = 0, l2 = 0;
+static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
+{
+ struct journal_replay *p;
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
+ struct journal_keys keys = { NULL }, keys_deduped = { NULL };
+ struct journal_key *i;
+ size_t nr_keys = 0;
+
+ list_for_each_entry(p, journal_entries, list)
+ for_each_jset_key(k, _n, entry, &p->j)
+ nr_keys++;
+
+ keys.journal_seq_base = keys_deduped.journal_seq_base =
+ le64_to_cpu(list_first_entry(journal_entries,
+ struct journal_replay,
+ list)->j.seq);
+
+ keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
+ if (!keys.d)
+ goto err;
- k1 = btree_root_find(c, clean, NULL, i, &l1);
- k2 = btree_root_find(c, NULL, j, i, &l2);
+ keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL);
+ if (!keys_deduped.d)
+ goto err;
- if (!k1 && !k2)
+ list_for_each_entry(p, journal_entries, list)
+ for_each_jset_key(k, _n, entry, &p->j)
+ keys.d[keys.nr++] = (struct journal_key) {
+ .btree_id = entry->btree_id,
+ .pos = bkey_start_pos(&k->k),
+ .k = k,
+ .journal_seq = le64_to_cpu(p->j.seq) -
+ keys.journal_seq_base,
+ .journal_offset = k->_data - p->j._data,
+ };
+
+ sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
+
+ i = keys.d;
+ while (i < keys.d + keys.nr) {
+ if (i + 1 < keys.d + keys.nr &&
+ i[0].btree_id == i[1].btree_id &&
+ !bkey_cmp(i[0].pos, i[1].pos)) {
+ if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
+ i++;
+ } else {
+ bch2_cut_front(i[1].k->k.p, i[0].k);
+ i[0].pos = i[1].k->k.p;
+ journal_keys_sift(&keys, i);
+ }
continue;
+ }
- mustfix_fsck_err_on(!k1 || !k2 ||
- IS_ERR(k1) ||
- IS_ERR(k2) ||
- k1->k.u64s != k2->k.u64s ||
- memcmp(k1, k2, bkey_bytes(k1)) ||
- l1 != l2, c,
- "superblock btree root doesn't match journal after clean shutdown");
+ if (i + 1 < keys.d + keys.nr &&
+ i[0].btree_id == i[1].btree_id &&
+ bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) {
+ if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
+ cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
+ if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
+ bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k);
+ } else {
+ struct bkey_i *split =
+ kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
+
+ if (!split)
+ goto err;
+
+ bkey_copy(split, i[0].k);
+ bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k);
+ keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
+ .btree_id = i[0].btree_id,
+ .allocated = true,
+ .pos = bkey_start_pos(&split->k),
+ .k = split,
+ .journal_seq = i[0].journal_seq,
+ .journal_offset = i[0].journal_offset,
+ };
+
+ bch2_cut_front(i[1].k->k.p, i[0].k);
+ i[0].pos = i[1].k->k.p;
+ journal_keys_sift(&keys, i);
+ continue;
+ }
+ } else {
+ if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) {
+ i[1] = i[0];
+ i++;
+ continue;
+ } else {
+ bch2_cut_front(i[0].k->k.p, i[1].k);
+ i[1].pos = i[0].k->k.p;
+ journal_keys_sift(&keys, i + 1);
+ continue;
+ }
+ }
+ }
+
+ keys_deduped.d[keys_deduped.nr++] = *i++;
}
-fsck_err:
- return ret;
+
+ kvfree(keys.d);
+ return keys_deduped;
+err:
+ journal_keys_free(&keys_deduped);
+ kvfree(keys.d);
+ return (struct journal_keys) { NULL };
+}
+
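A worked example of the dedup pass: if seq 10 wrote an extent covering [0,100) and seq 12 rewrote [25,50), the scan emits three non-overlapping entries — [0,25) as a split copy of the older key (the kmalloc'd fragment marked `allocated` so journal_keys_free() knows to kfree() it), the newer [25,50) untouched, and the older key cut down to [50,100) and sifted back into position. That one-fragment-per-overlap bound is why keys_deduped is allocated at twice nr_keys.
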
+/* journal replay: */
+
+static void replay_now_at(struct journal *j, u64 seq)
+{
+ BUG_ON(seq < j->replay_journal_seq);
+ BUG_ON(seq > j->replay_journal_seq_end);
+
+ while (j->replay_journal_seq < seq)
+ bch2_journal_pin_put(j, j->replay_journal_seq++);
+}
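
replay_now_at() is what lets journal reclaim make progress during replay: because the keys are replayed in sequence order, every pin up to the current position can be dropped as the cursor advances — which is also why bch2_fs_journal_start() above now initializes every pin list's refcount to 1 rather than 0.
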
+
+static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter, *split_iter;
+ /*
+ * We might cause compressed extents to be split, so we need to pass in
+ * a disk_reservation:
+ */
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ struct bkey_i *split;
+ bool split_compressed = false;
+ int ret;
+
+ bch2_trans_init(&trans, c);
+ bch2_trans_preload_iters(&trans);
+retry:
+ bch2_trans_begin(&trans);
+
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+
+ do {
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto err;
+
+ split_iter = bch2_trans_copy_iter(&trans, iter);
+ ret = PTR_ERR_OR_ZERO(split_iter);
+ if (ret)
+ goto err;
+
+ split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
+ ret = PTR_ERR_OR_ZERO(split);
+ if (ret)
+ goto err;
+
+ if (!split_compressed &&
+ bch2_extent_is_compressed(bkey_i_to_s_c(k)) &&
+ !bch2_extent_is_atomic(k, split_iter)) {
+ ret = bch2_disk_reservation_add(c, &disk_res,
+ k->k.size *
+ bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
+ BCH_DISK_RESERVATION_NOFAIL);
+ BUG_ON(ret);
+
+ split_compressed = true;
+ }
+
+ bkey_copy(split, k);
+ bch2_cut_front(split_iter->pos, split);
+ bch2_extent_trim_atomic(split, split_iter);
+
+ bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
+ bch2_btree_iter_set_pos(iter, split->k.p);
+ } while (bkey_cmp(iter->pos, k->k.p) < 0);
+
+ if (split_compressed) {
+ memset(&trans.fs_usage_deltas.fs_usage, 0,
+ sizeof(trans.fs_usage_deltas.fs_usage));
+ trans.fs_usage_deltas.top = trans.fs_usage_deltas.d;
+
+ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), false,
+ -((s64) k->k.size),
+ &trans.fs_usage_deltas) ?:
+ bch2_trans_commit(&trans, &disk_res, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOMARK_OVERWRITES|
+ BTREE_INSERT_NO_CLEAR_REPLICAS);
+ } else {
+ ret = bch2_trans_commit(&trans, &disk_res, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_JOURNAL_REPLAY|
+ BTREE_INSERT_NOMARK);
+ }
+
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_disk_reservation_put(c, &disk_res);
+
+ return bch2_trans_exit(&trans) ?: ret;
+}
+
+static int bch2_journal_replay(struct bch_fs *c,
+ struct journal_keys keys)
+{
+ struct journal *j = &c->journal;
+ struct journal_key *i;
+ int ret;
+
+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
+
+ for_each_journal_key(keys, i) {
+ replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+
+ switch (i->btree_id) {
+ case BTREE_ID_ALLOC:
+ ret = bch2_alloc_replay_key(c, i->k);
+ break;
+ case BTREE_ID_EXTENTS:
+ ret = bch2_extent_replay_key(c, i->k);
+ break;
+ default:
+ ret = bch2_btree_insert(c, i->btree_id, i->k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_JOURNAL_REPLAY|
+ BTREE_INSERT_NOMARK);
+ break;
+ }
+
+ if (ret) {
+ bch_err(c, "journal replay: error %d while replaying key",
+ ret);
+ return ret;
+ }
+
+ cond_resched();
+ }
+
+ replay_now_at(j, j->replay_journal_seq_end);
+ j->replay_journal_seq = 0;
+
+ bch2_journal_set_replay_done(j);
+ bch2_journal_flush_all_pins(j);
+ return bch2_journal_error(j);
+}
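
The replay loop keeps the old per-btree dispatch — alloc keys through bch2_alloc_replay_key(), extents through the splitting bch2_extent_replay_key() above, everything else through plain bch2_btree_insert() with BTREE_INSERT_JOURNAL_REPLAY|BTREE_INSERT_NOMARK — but it now walks the deduped, seq-sorted array and calls replay_now_at() per key instead of dropping one pin per on-disk journal entry.
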
+
+static bool journal_empty(struct list_head *journal)
+{
+ return list_empty(journal) ||
+ journal_entry_empty(&list_last_entry(journal,
+ struct journal_replay, list)->j);
}
static int
return ret;
}
-static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *clean, *sb_clean;
- int ret;
-
- mutex_lock(&c->sb_lock);
- sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
-
- if (fsck_err_on(!sb_clean, c,
- "superblock marked clean but clean section not present")) {
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->sb.clean = false;
- mutex_unlock(&c->sb_lock);
- return NULL;
- }
-
- clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
- GFP_KERNEL);
- if (!clean) {
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(-ENOMEM);
- }
-
- if (le16_to_cpu(c->disk_sb.sb->version) <
- bcachefs_metadata_version_bkey_renumber)
- bch2_sb_clean_renumber(clean, READ);
-
- mutex_unlock(&c->sb_lock);
-
- return clean;
-fsck_err:
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
-}
+/* journal replay early: */
static int journal_replay_entry_early(struct bch_fs *c,
struct jset_entry *entry)
switch (entry->btree_id) {
case FS_USAGE_RESERVED:
if (entry->level < BCH_REPLICAS_MAX)
- percpu_u64_set(&c->usage[0]->
- persistent_reserved[entry->level],
- le64_to_cpu(u->v));
+ c->usage_base->persistent_reserved[entry->level] =
+ le64_to_cpu(u->v);
break;
case FS_USAGE_INODES:
- percpu_u64_set(&c->usage[0]->nr_inodes,
- le64_to_cpu(u->v));
+ c->usage_base->nr_inodes = le64_to_cpu(u->v);
break;
case FS_USAGE_KEY_VERSION:
atomic64_set(&c->key_version,
return 0;
}
+/* sb clean section: */
+
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+ struct bch_sb_field_clean *clean,
+ struct jset *j,
+ enum btree_id id, unsigned *level)
+{
+ struct bkey_i *k;
+ struct jset_entry *entry, *start, *end;
+
+ if (clean) {
+ start = clean->start;
+ end = vstruct_end(&clean->field);
+ } else {
+ start = j->start;
+ end = vstruct_last(j);
+ }
+
+ for (entry = start; entry < end; entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_btree_root &&
+ entry->btree_id == id)
+ goto found;
+
+ return NULL;
+found:
+ if (!entry->u64s)
+ return ERR_PTR(-EINVAL);
+
+ k = entry->start;
+ *level = entry->level;
+ return k;
+}
+
+static int verify_superblock_clean(struct bch_fs *c,
+ struct bch_sb_field_clean **cleanp,
+ struct jset *j)
+{
+ unsigned i;
+ struct bch_sb_field_clean *clean = *cleanp;
+ int ret = 0;
+
+ if (!c->sb.clean || !j)
+ return 0;
+
+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+ le64_to_cpu(clean->journal_seq),
+ le64_to_cpu(j->seq))) {
+ kfree(clean);
+ *cleanp = NULL;
+ return 0;
+ }
+
+ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
+ "superblock read clock doesn't match journal after clean shutdown");
+ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
+ "superblock read clock doesn't match journal after clean shutdown");
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct bkey_i *k1, *k2;
+ unsigned l1 = 0, l2 = 0;
+
+ k1 = btree_root_find(c, clean, NULL, i, &l1);
+ k2 = btree_root_find(c, NULL, j, i, &l2);
+
+ if (!k1 && !k2)
+ continue;
+
+ mustfix_fsck_err_on(!k1 || !k2 ||
+ IS_ERR(k1) ||
+ IS_ERR(k2) ||
+ k1->k.u64s != k2->k.u64s ||
+ memcmp(k1, k2, bkey_bytes(k1)) ||
+ l1 != l2, c,
+ "superblock btree root doesn't match journal after clean shutdown");
+ }
+fsck_err:
+ return ret;
+}
+
+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *clean, *sb_clean;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
+
+ if (fsck_err_on(!sb_clean, c,
+ "superblock marked clean but clean section not present")) {
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->sb.clean = false;
+ mutex_unlock(&c->sb_lock);
+ return NULL;
+ }
+
+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+ GFP_KERNEL);
+ if (!clean) {
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (le16_to_cpu(c->disk_sb.sb->version) <
+ bcachefs_metadata_version_bkey_renumber)
+ bch2_sb_clean_renumber(clean, READ);
+
+ mutex_unlock(&c->sb_lock);
+
+ return clean;
+fsck_err:
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(ret);
+}
+
static int read_btree_roots(struct bch_fs *c)
{
unsigned i;
return ret;
}
-static bool journal_empty(struct list_head *journal)
-{
- struct journal_replay *i;
- struct jset_entry *entry;
-
- if (list_empty(journal))
- return true;
-
- i = list_last_entry(journal, struct journal_replay, list);
-
- if (i->j.last_seq != i->j.seq)
- return false;
-
- list_for_each_entry(i, journal, list) {
- vstruct_for_each(&i->j, entry) {
- if (entry->type == BCH_JSET_ENTRY_btree_root ||
- entry->type == BCH_JSET_ENTRY_usage ||
- entry->type == BCH_JSET_ENTRY_data_usage)
- continue;
-
- if (entry->type == BCH_JSET_ENTRY_btree_keys &&
- !entry->u64s)
- continue;
- return false;
- }
- }
-
- return true;
-}
-
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL;
u64 journal_seq;
- LIST_HEAD(journal);
+ LIST_HEAD(journal_entries);
+ struct journal_keys journal_keys = { NULL };
+ bool wrote = false, write_sb = false;
int ret;
if (c->sb.clean)
if (!c->sb.clean || c->opts.fsck) {
struct jset *j;
- ret = bch2_journal_read(c, &journal);
+ ret = bch2_journal_read(c, &journal_entries);
if (ret)
goto err;
- fsck_err_on(c->sb.clean && !journal_empty(&journal), c,
- "filesystem marked clean but journal not empty");
+ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c,
+ "filesystem marked clean but journal not empty")) {
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->sb.clean = false;
+ }
- if (!c->sb.clean && list_empty(&journal)){
+ if (!c->sb.clean && list_empty(&journal_entries)) {
bch_err(c, "no journal entries found");
ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
goto err;
}
- j = &list_last_entry(&journal, struct journal_replay, list)->j;
+ journal_keys = journal_keys_sort(&journal_entries);
+ if (!journal_keys.d) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ j = &list_last_entry(&journal_entries,
+ struct journal_replay, list)->j;
ret = verify_superblock_clean(c, &clean, j);
if (ret)
journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
- ret = journal_replay_early(c, clean, &journal);
+ ret = journal_replay_early(c, clean, &journal_entries);
if (ret)
goto err;
if (!c->sb.clean) {
ret = bch2_journal_seq_blacklist_add(c,
- journal_seq,
- journal_seq + 4);
+ journal_seq,
+ journal_seq + 4);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
goto err;
ret = bch2_blacklist_table_initialize(c);
- ret = verify_journal_entries_not_blacklisted_or_missing(c, &journal);
+ ret = verify_journal_entries_not_blacklisted_or_missing(c,
+ &journal_entries);
if (ret)
goto err;
- ret = bch2_fs_journal_start(&c->journal, journal_seq, &journal);
+ ret = bch2_fs_journal_start(&c->journal, journal_seq,
+ &journal_entries);
if (ret)
goto err;
if (ret)
goto err;
+ bch_verbose(c, "starting alloc read");
err = "error reading allocation information";
- ret = bch2_alloc_read(c, &journal);
+ ret = bch2_alloc_read(c, &journal_keys);
if (ret)
goto err;
+ bch_verbose(c, "alloc read done");
bch_verbose(c, "starting stripes_read");
- ret = bch2_stripes_read(c, &journal);
+ err = "error reading stripes";
+ ret = bch2_stripes_read(c, &journal_keys);
if (ret)
goto err;
bch_verbose(c, "stripes_read done");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) &&
+ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) {
+ /*
+ * interior btree node updates aren't consistent with the
+ * journal; after an unclean shutdown we have to walk all
+ * pointers to metadata:
+ */
+ bch_info(c, "starting metadata mark and sweep");
+ err = "error in mark and sweep";
+ ret = bch2_gc(c, NULL, true, true);
+ if (ret)
+ goto err;
+ bch_verbose(c, "mark and sweep done");
+ }
+
if (c->opts.fsck ||
!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
- bch_verbose(c, "starting mark and sweep:");
- err = "error in recovery";
- ret = bch2_gc(c, &journal, true, false);
+ bch_info(c, "starting mark and sweep");
+ err = "error in mark and sweep";
+ ret = bch2_gc(c, &journal_keys, true, false);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
if (c->sb.encryption_type && !c->sb.clean)
atomic64_add(1 << 16, &c->key_version);
- if (c->opts.noreplay)
+ if (c->opts.norecovery)
goto out;
- bch_verbose(c, "starting journal replay:");
+ bch_verbose(c, "starting journal replay");
err = "journal replay failed";
- ret = bch2_journal_replay(c, &journal);
+ ret = bch2_journal_replay(c, journal_keys);
if (ret)
goto err;
bch_verbose(c, "journal replay done");
- if (c->opts.norecovery)
- goto out;
+ if (!c->opts.nochanges) {
+ /*
+ * note that even when filesystem was clean there might be work
+ * to do here, if we ran gc (because of fsck) which recalculated
+ * oldest_gen:
+ */
+ bch_verbose(c, "writing allocation info");
+ err = "error writing out alloc info";
+ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?:
+ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote);
+ if (ret) {
+ bch_err(c, "error writing alloc info");
+ goto err;
+ }
+ bch_verbose(c, "alloc write done");
+ }
- err = "error in fsck";
- ret = bch2_fsck(c);
- if (ret)
- goto err;
+ if (!c->sb.clean) {
+ if (!(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+ bch_info(c, "checking inode link counts");
+ err = "error in recovery";
+ ret = bch2_fsck_inode_nlink(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "check inodes done");
+
+ } else {
+ bch_verbose(c, "checking for deleted inodes");
+ err = "error in recovery";
+ ret = bch2_fsck_walk_inodes_only(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "check inodes done");
+ }
+ }
+
+ if (c->opts.fsck) {
+ bch_info(c, "starting fsck");
+ err = "error in fsck";
+ ret = bch2_fsck_full(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "fsck done");
+ }
if (enabled_qtypes(c)) {
- bch_verbose(c, "reading quotas:");
+ bch_verbose(c, "reading quotas");
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
c->disk_sb.sb->version_min =
le16_to_cpu(bcachefs_metadata_version_min);
c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
+ write_sb = true;
+ }
+
+ if (!test_bit(BCH_FS_ERROR, &c->flags)) {
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
+ write_sb = true;
}
if (c->opts.fsck &&
!test_bit(BCH_FS_ERROR, &c->flags)) {
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+ write_sb = true;
}
+
+ if (write_sb)
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
if (c->journal_seq_blacklist_table &&
c->journal_seq_blacklist_table->nr > 128)
queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
out:
- bch2_journal_entries_free(&journal);
- kfree(clean);
- return ret;
+ ret = 0;
err:
fsck_err:
- pr_err("Error in recovery: %s (%i)", err, ret);
- goto out;
+ bch2_flush_fsck_errs(c);
+ journal_keys_free(&journal_keys);
+ journal_entries_free(&journal_entries);
+ kfree(clean);
+ if (ret)
+ bch_err(c, "Error in recovery: %s (%i)", err, ret);
+ else
+ bch_verbose(c, "ret %i", ret);
+ return ret;
}
int bch2_fs_initialize(struct bch_fs *c)
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
+struct journal_keys {
+ struct journal_key {
+ enum btree_id btree_id:8;
+ unsigned allocated:1;
+ struct bpos pos;
+ struct bkey_i *k;
+ u32 journal_seq;
+ u32 journal_offset;
+ } *d;
+ size_t nr;
+ u64 journal_seq_base;
+};
+
+#define for_each_journal_key(keys, i) \
+ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
+
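struct journal_keys is the flat, sorted array that recovery now hands around in place of the raw list of journal entries, and for_each_journal_key() is the iteration idiom; consumers that care about a single btree filter on btree_id, as bch2_stripes_read() does above. A sketch of that shape (mark_journal_keys_for_btree() is a hypothetical helper, not part of the patch):

/* usage sketch only, mirroring bch2_stripes_read()'s loop */
static void mark_journal_keys_for_btree(struct bch_fs *c,
					struct journal_keys *journal_keys,
					enum btree_id id)
{
	struct journal_key *i;

	for_each_journal_key(*journal_keys, i)
		if (i->btree_id == id)
			bch2_mark_key(c, bkey_i_to_s_c(i->k),
				      true, 0, NULL, 0, 0);
}
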
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
#include "bcachefs.h"
+#include "buckets.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"
static inline int u8_cmp(u8 l, u8 r)
{
- return (l > r) - (l < r);
+ return cmp_int(l, r);
}
static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
r->devs[r->nr_devs++] = ptr->dev;
}
-static void bkey_to_replicas(struct bch_replicas_entry *e,
- struct bkey_s_c k)
+void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+ struct bkey_s_c k)
{
e->nr_devs = 0;
return marked;
}
-static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
+static void __replicas_table_update(struct bch_fs_usage *dst,
struct bch_replicas_cpu *dst_r,
- struct bch_fs_usage __percpu *src_p,
+ struct bch_fs_usage *src,
struct bch_replicas_cpu *src_r)
{
- unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
- struct bch_fs_usage *dst, *src = (void *)
- bch2_acc_percpu_u64s((void *) src_p, src_nr);
int src_idx, dst_idx;
- preempt_disable();
- dst = this_cpu_ptr(dst_p);
- preempt_enable();
-
*dst = *src;
for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
}
}
+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
+ struct bch_replicas_cpu *dst_r,
+ struct bch_fs_usage __percpu *src_p,
+ struct bch_replicas_cpu *src_r)
+{
+ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+ struct bch_fs_usage *dst, *src = (void *)
+ bch2_acc_percpu_u64s((void *) src_p, src_nr);
+
+ preempt_disable();
+ dst = this_cpu_ptr(dst_p);
+ preempt_enable();
+
+ __replicas_table_update(dst, dst_r, src, src_r);
+}
+
/*
* Resize filesystem accounting:
*/
{
struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
struct bch_fs_usage *new_scratch = NULL;
+ struct bch_fs_usage __percpu *new_gc = NULL;
+ struct bch_fs_usage *new_base = NULL;
unsigned bytes = sizeof(struct bch_fs_usage) +
sizeof(u64) * new_r->nr;
int ret = -ENOMEM;
- if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
+ if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
+ !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
+ GFP_NOIO)) ||
+ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
GFP_NOIO)) ||
- (c->usage[1] &&
- !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
- GFP_NOIO))) ||
- !(new_scratch = kmalloc(bytes, GFP_NOIO)))
+ !(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
+ (c->usage_gc &&
+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
goto err;
+ if (c->usage_base)
+ __replicas_table_update(new_base, new_r,
+ c->usage_base, &c->replicas);
if (c->usage[0])
- __replicas_table_update(new_usage[0], new_r,
- c->usage[0], &c->replicas);
+ __replicas_table_update_pcpu(new_usage[0], new_r,
+ c->usage[0], &c->replicas);
if (c->usage[1])
- __replicas_table_update(new_usage[1], new_r,
- c->usage[1], &c->replicas);
+ __replicas_table_update_pcpu(new_usage[1], new_r,
+ c->usage[1], &c->replicas);
+ if (c->usage_gc)
+ __replicas_table_update_pcpu(new_gc, new_r,
+ c->usage_gc, &c->replicas);
+ swap(c->usage_base, new_base);
swap(c->usage[0], new_usage[0]);
swap(c->usage[1], new_usage[1]);
swap(c->usage_scratch, new_scratch);
+ swap(c->usage_gc, new_gc);
swap(c->replicas, *new_r);
ret = 0;
err:
+ free_percpu(new_gc);
kfree(new_scratch);
free_percpu(new_usage[1]);
free_percpu(new_usage[0]);
+ kfree(new_base);
return ret;
}
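
replicas_table_update() now keeps four parallel fs_usage allocations in sync whenever the replicas table grows: the new plain-memory usage_base holding accumulated totals, the two per-cpu usage[0]/usage[1] buffers (selected by journal sequence parity — see the `journal_seq & 1` in bch2_journal_super_entries_add_common() below), and the gc copy; __replicas_table_update_pcpu() is the per-cpu wrapper that first flattens the source with bch2_acc_percpu_u64s() and then remaps entries via the plain __replicas_table_update().
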
return false;
}
- bkey_to_replicas(&search.e, k);
+ bch2_bkey_to_replicas(&search.e, k);
return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas);
}
return ret;
}
- bkey_to_replicas(&search.e, k);
+ bch2_bkey_to_replicas(&search.e, k);
return bch2_mark_replicas(c, &search.e);
}
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
-
- if (ret)
- goto err;
+ percpu_down_write(&c->mark_lock);
/*
* this is kind of crappy; the replicas gc mechanism needs to be ripped
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
struct bch_replicas_cpu n;
- u64 v;
-
- if (__replicas_has_entry(&c->replicas_gc, e))
- continue;
- v = percpu_u64_get(&c->usage[0]->replicas[i]);
- if (!v)
- continue;
+ if (!__replicas_has_entry(&c->replicas_gc, e) &&
+ (c->usage_base->replicas[i] ||
+ percpu_u64_get(&c->usage[0]->replicas[i]) ||
+ percpu_u64_get(&c->usage[1]->replicas[i]))) {
+ n = cpu_replicas_add_entry(&c->replicas_gc, e);
+ if (!n.entries) {
+ ret = -ENOSPC;
+ goto err;
+ }
- n = cpu_replicas_add_entry(&c->replicas_gc, e);
- if (!n.entries) {
- ret = -ENOSPC;
- goto err;
+ swap(n, c->replicas_gc);
+ kfree(n.entries);
}
-
- percpu_down_write(&c->mark_lock);
- swap(n, c->replicas_gc);
- percpu_up_write(&c->mark_lock);
-
- kfree(n.entries);
}
if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
goto err;
}
- bch2_write_super(c);
-
- /* don't update in memory replicas until changes are persistent */
+ ret = replicas_table_update(c, &c->replicas_gc);
err:
- percpu_down_write(&c->mark_lock);
- if (!ret)
- ret = replicas_table_update(c, &c->replicas_gc);
-
kfree(c->replicas_gc.entries);
c->replicas_gc.entries = NULL;
+
percpu_up_write(&c->mark_lock);
+ if (!ret)
+ bch2_write_super(c);
+
mutex_unlock(&c->sb_lock);
+
return ret;
}
BUG_ON(ret < 0);
}
- percpu_u64_set(&c->usage[0]->replicas[idx], sectors);
+ c->usage_base->replicas[idx] = sectors;
return 0;
}
bool bch2_bkey_replicas_marked_locked(struct bch_fs *,
struct bkey_s_c, bool);
+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
bool bch2_bkey_replicas_marked(struct bch_fs *,
struct bkey_s_c, bool);
int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
{
struct btree_iter *iter;
struct bkey_s_c k;
+ int ret;
- iter = bch2_trans_get_iter(trans, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_SLOTS|flags);
- if (IS_ERR(iter))
- return iter;
-
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ for_each_btree_key(trans, iter, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)),
+ BTREE_ITER_SLOTS|flags, k, ret) {
if (iter->pos.inode != inode)
break;
}
}
- return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT);
+ return ERR_PTR(ret ?: -ENOENT);
}
static __always_inline struct btree_iter *
{
struct btree_iter *iter;
struct bkey_s_c k;
+ int ret;
- iter = bch2_trans_get_iter(trans, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return iter;
-
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ for_each_btree_key(trans, iter, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (iter->pos.inode != inode)
break;
return iter;
}
- return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC);
+ return ERR_PTR(ret ?: -ENOSPC);
}
static __always_inline
struct btree_iter *iter, *slot = NULL;
struct bkey_s_c k;
bool found = false;
- int ret = 0;
-
- iter = bch2_trans_get_iter(trans, desc.btree_id,
- POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ int ret;
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ for_each_btree_key(trans, iter, desc.btree_id,
+ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (iter->pos.inode != inode)
break;
}
if (slot)
- bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_free(trans, slot);
+ bch2_trans_iter_free(trans, iter);
- return bch2_trans_iter_free(trans, iter) ?: -ENOSPC;
+ return ret ?: -ENOSPC;
found:
found = true;
not_found:
#include "bcachefs.h"
+#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "ec.h"
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
- bio->bi_iter.bi_size = 4096;
+ bio->bi_iter.bi_size = PAGE_SIZE;
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
struct jset_entry *
bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry *entry)
+ struct jset_entry *entry,
+ u64 journal_seq)
{
struct btree_root *r;
unsigned i;
mutex_unlock(&c->btree_root_lock);
- percpu_down_read_preempt_disable(&c->mark_lock);
+ percpu_down_write(&c->mark_lock);
+
+ if (!journal_seq) {
+ bch2_fs_usage_acc_to_base(c, 0);
+ bch2_fs_usage_acc_to_base(c, 1);
+ } else {
+ bch2_fs_usage_acc_to_base(c, journal_seq & 1);
+ }
{
- u64 nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes);
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1;
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
- u->v = cpu_to_le64(nr_inodes);
+ u->v = cpu_to_le64(c->usage_base->nr_inodes);
entry = vstruct_next(entry);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- u64 sectors = percpu_u64_get(&c->usage[0]->persistent_reserved[i]);
-
- if (!sectors)
- continue;
memset(u, 0, sizeof(*u));
u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1;
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
- u->v = sectors;
+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
entry = vstruct_next(entry);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- u64 sectors = percpu_u64_get(&c->usage[0]->replicas[i]);
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs,
sizeof(u64)) - 1;
u->entry.type = BCH_JSET_ENTRY_data_usage;
- u->v = cpu_to_le64(sectors);
+ u->v = cpu_to_le64(c->usage_base->replicas[i]);
memcpy(&u->r, e, replicas_entry_bytes(e));
entry = vstruct_next(entry);
}
- percpu_up_read_preempt_enable(&c->mark_lock);
+ percpu_up_write(&c->mark_lock);
return entry;
}
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start;
- entry = bch2_journal_super_entries_add_common(c, entry);
+ entry = bch2_journal_super_entries_add_common(c, entry, 0);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
memset(entry, 0,
struct jset_entry *
bch2_journal_super_entries_add_common(struct bch_fs *,
- struct jset_entry *);
+ struct jset_entry *, u64);
void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
goto allocator_not_running;
do {
- ret = bch2_stripes_write(c, &wrote);
- if (ret) {
- bch2_fs_inconsistent(c, "error writing out stripes");
- break;
- }
+ wrote = false;
- ret = bch2_alloc_write(c, false, &wrote);
- if (ret) {
+ ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
+ bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
+
+ if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
+
+ if (ret)
break;
- }
for_each_member_device(ca, c, i)
bch2_dev_allocator_quiesce(c, ca);
if (!bch2_journal_error(&c->journal) &&
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
- test_bit(BCH_FS_STARTED, &c->flags))
+ test_bit(BCH_FS_STARTED, &c->flags) &&
+ !c->opts.norecovery)
bch2_fs_mark_clean(c);
clear_bit(BCH_FS_RW, &c->flags);
if (test_bit(BCH_FS_RW, &c->flags))
return 0;
+ /*
+ * nochanges is used for fsck -n mode - we have to allow going rw
+ * during recovery for that to work:
+ */
+ if (c->opts.norecovery ||
+ (c->opts.nochanges &&
+ (!early || c->opts.read_only)))
+ return -EROFS;
+
ret = bch2_fs_mark_dirty(c);
if (ret)
goto err;
bch2_fs_compress_exit(c);
percpu_free_rwsem(&c->mark_lock);
kfree(c->usage_scratch);
+ free_percpu(c->usage[1]);
free_percpu(c->usage[0]);
+ kfree(c->usage_base);
free_percpu(c->pcpu);
mempool_exit(&c->btree_iters_pool);
mempool_exit(&c->btree_bounce_pool);
seqcount_init(&c->gc_pos_lock);
+ seqcount_init(&c->usage_lock);
+
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
c->promote_whole_extents = true;
c->block_bits = ilog2(c->opts.block_size);
c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
- c->opts.nochanges |= c->opts.noreplay;
- c->opts.read_only |= c->opts.nochanges;
-
if (bch2_fs_init_fault("fs_alloc"))
goto err;
goto out;
}
-const char *bch2_fs_start(struct bch_fs *c)
+noinline_for_stack
+static void print_mount_opts(struct bch_fs *c)
+{
+ enum bch_opt_id i;
+ char buf[512];
+ struct printbuf p = PBUF(buf);
+ bool first = true;
+
+ strcpy(buf, "(null)");
+
+ if (c->opts.read_only) {
+ pr_buf(&p, "ro");
+ first = false;
+ }
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+ u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+ if (!(opt->mode & OPT_MOUNT))
+ continue;
+
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ if (!first)
+ pr_buf(&p, ",");
+ first = false;
+ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE);
+ }
+
+ bch_info(c, "mounted with opts: %s", buf);
+}
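
print_mount_opts() reports only the options that differ from the defaults, plus read_only, so a line might look like `mounted with opts: ro,degraded` (hypothetical output); when nothing was overridden the initial strcpy() survives and it prints `mounted with opts: (null)`.
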
+
+int bch2_fs_start(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
goto err;
err = "dynamic fault";
+ ret = -EINVAL;
if (bch2_fs_init_fault("fs_start"))
goto err;
- if (c->opts.read_only) {
+ if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
- if (!test_bit(BCH_FS_RW, &c->flags)
- ? bch2_fs_read_write(c)
- : bch2_fs_read_write_late(c)) {
- err = "error going read write";
+ err = "error going read write";
+ ret = !test_bit(BCH_FS_RW, &c->flags)
+ ? bch2_fs_read_write(c)
+ : bch2_fs_read_write_late(c);
+ if (ret)
goto err;
- }
}
set_bit(BCH_FS_STARTED, &c->flags);
-
- err = NULL;
+ print_mount_opts(c);
+ ret = 0;
out:
mutex_unlock(&c->state_lock);
- return err;
+ return ret;
err:
switch (ret) {
case BCH_FSCK_ERRORS_NOT_FIXED:
break;
}
- BUG_ON(!err);
+ BUG_ON(!ret);
goto out;
}
free_percpu(ca->io_done);
bioset_exit(&ca->replica_set);
bch2_dev_buckets_free(ca);
- kfree(ca->sb_read_scratch);
+ free_page((unsigned long) ca->sb_read_scratch);
bch2_time_stats_exit(&ca->io_latency[WRITE]);
bch2_time_stats_exit(&ca->io_latency[READ]);
0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
- !(ca->sb_read_scratch = kmalloc(4096, GFP_KERNEL)) ||
+ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio), 0) ||
static void dev_usage_clear(struct bch_dev *ca)
{
struct bucket_array *buckets;
- int cpu;
- for_each_possible_cpu(cpu) {
- struct bch_dev_usage *p =
- per_cpu_ptr(ca->usage[0], cpu);
- memset(p, 0, sizeof(*p));
- }
+ percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
goto err_print;
if (!c->opts.nostart) {
- err = bch2_fs_start(c);
- if (err)
- goto err_print;
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err;
}
out:
kfree(sb);
const char *err;
struct bch_fs *c;
bool allocated_fs = false;
+ int ret;
err = bch2_sb_validate(sb);
if (err)
mutex_unlock(&c->sb_lock);
if (!c->opts.nostart && bch2_fs_may_start(c)) {
- err = bch2_fs_start(c);
- if (err)
+ err = "error starting filesystem";
+ ret = bch2_fs_start(c);
+ if (ret)
goto err;
}
void bch2_fs_stop(struct bch_fs *);
-const char *bch2_fs_start(struct bch_fs *);
+int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
const char *bch2_fs_open_incremental(const char *path);
{
struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
- unsigned i;
if (!fs_usage)
return -ENOMEM;
- pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity);
-
- pr_buf(&out, "hidden:\t\t\t\t%llu\n",
- fs_usage->hidden);
- pr_buf(&out, "data:\t\t\t\t%llu\n",
- fs_usage->data);
- pr_buf(&out, "cached:\t\t\t\t%llu\n",
- fs_usage->cached);
- pr_buf(&out, "reserved:\t\t\t%llu\n",
- fs_usage->reserved);
- pr_buf(&out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->nr_inodes);
- pr_buf(&out, "online reserved:\t\t%llu\n",
- fs_usage->online_reserved);
-
- for (i = 0;
- i < ARRAY_SIZE(fs_usage->persistent_reserved);
- i++) {
- pr_buf(&out, "%u replicas:\n", i + 1);
- pr_buf(&out, "\treserved:\t\t%llu\n",
- fs_usage->persistent_reserved[i]);
- }
-
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
- cpu_replicas_entry(&c->replicas, i);
-
- pr_buf(&out, "\t");
- bch2_replicas_entry_to_text(&out, e);
- pr_buf(&out, ":\t%llu\n", fs_usage->replicas[i]);
- }
+ bch2_fs_usage_to_text(&out, c, fs_usage);
percpu_up_read_preempt_enable(&c->mark_lock);
nr_compressed_extents = 0,
compressed_sectors_compressed = 0,
compressed_sectors_uncompressed = 0;
+ int ret;
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EPERM;
bch2_trans_init(&trans, c);
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k)
+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret)
if (k.k->type == KEY_TYPE_extent) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
break;
}
}
- bch2_trans_exit(&trans);
+
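+ /* bch2_trans_exit() can itself return an error - fold it in: */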
+ ret = bch2_trans_exit(&trans) ?: ret;
+ if (ret)
+ return ret;
return scnprintf(buf, PAGE_SIZE,
"uncompressed data:\n"
if (attr == &sysfs_trigger_alloc_write) {
bool wrote;
- bch2_alloc_write(c, false, &wrote);
+ bch2_alloc_write(c, 0, &wrote);
}
if (attr == &sysfs_prune_cache) {
static int unsigned_cmp(const void *_l, const void *_r)
{
- unsigned l = *((unsigned *) _l);
- unsigned r = *((unsigned *) _r);
+ const unsigned *l = _l;
+ const unsigned *r = _r;
- return (l > r) - (l < r);
+ return cmp_int(*l, *r);
}
static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), 0, k)
+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
+ POS_MIN, 0, k, ret)
BUG_ON(k.k->p.offset != i++);
BUG_ON(i != nr);
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), 0, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+ POS_MIN, 0, k, ret) {
BUG_ON(bkey_start_offset(k.k) != i);
i = k.k->p.offset;
}
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), 0, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+ 0, k, ret) {
BUG_ON(k.k->p.offset != i);
i += 2;
}
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0),
- BTREE_ITER_SLOTS, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+ BTREE_ITER_SLOTS, k, ret) {
BUG_ON(bkey_deleted(k.k) != (i & 1));
BUG_ON(k.k->p.offset != i++);
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), 0, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN,
+ 0, k, ret) {
BUG_ON(bkey_start_offset(k.k) != i + 8);
BUG_ON(k.k->size != 8);
i += 16;
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0),
- BTREE_ITER_SLOTS, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN,
+ BTREE_ITER_SLOTS, k, ret) {
BUG_ON(bkey_deleted(k.k) != !(i % 16));
BUG_ON(bkey_start_offset(k.k) != i);
bch2_trans_init(&trans, c);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
insert.k.p = iter->pos;
bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &insert.k_i));
bch2_trans_init(&trans, c);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN,
- BTREE_ITER_INTENT);
-
- for_each_btree_key_continue(iter, 0, k) {
+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+ BTREE_ITER_INTENT, k, ret) {
struct bkey_i_cookie u;
bkey_reassemble(&u.k_i, k);
acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
}
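+/*
+ * memset() each CPU's copy of a percpu allocation; @bytes is the size
+ * of a single copy, not the total:
+ */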
+static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memset(per_cpu_ptr(p, cpu), c, bytes);
+}
+
u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
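+/* three way comparison: returns -1, 0 or 1 without risking overflow: */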
+#define cmp_int(l, r) (((l) > (r)) - ((l) < (r)))
+
#endif /* _BCACHEFS_UTIL_H */
return ret;
}
-static void __bch2_xattr_emit(const char *prefix,
- const char *name, size_t name_len,
- char **buffer, size_t *buffer_size,
- ssize_t *ret)
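+/*
+ * State for building the xattr name list: when @buf is NULL we only
+ * accumulate the size needed in @used, per listxattr(2):
+ */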
+struct xattr_buf {
+ char *buf;
+ size_t len;
+ size_t used;
+};
+
+static int __bch2_xattr_emit(const char *prefix,
+ const char *name, size_t name_len,
+ struct xattr_buf *buf)
{
const size_t prefix_len = strlen(prefix);
const size_t total_len = prefix_len + name_len + 1;
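+ /* the + 1 is for the NUL written after prefix and name: */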
- if (*buffer) {
- if (total_len > *buffer_size) {
- *ret = -ERANGE;
- return;
- }
+ if (buf->buf) {
+ if (buf->used + total_len > buf->len)
+ return -ERANGE;
- memcpy(*buffer, prefix, prefix_len);
- memcpy(*buffer + prefix_len,
+ memcpy(buf->buf + buf->used, prefix, prefix_len);
+ memcpy(buf->buf + buf->used + prefix_len,
name, name_len);
- (*buffer)[prefix_len + name_len] = '\0';
-
- *buffer += total_len;
- *buffer_size -= total_len;
+ buf->buf[buf->used + prefix_len + name_len] = '\0';
}
- *ret += total_len;
+ buf->used += total_len;
+ return 0;
}
-static void bch2_xattr_emit(struct dentry *dentry,
+static int bch2_xattr_emit(struct dentry *dentry,
const struct bch_xattr *xattr,
- char **buffer, size_t *buffer_size,
- ssize_t *ret)
+ struct xattr_buf *buf)
{
const struct xattr_handler *handler =
bch2_xattr_type_to_handler(xattr->x_type);
- if (handler && (!handler->list || handler->list(dentry)))
- __bch2_xattr_emit(handler->prefix ?: handler->name,
- xattr->x_name, xattr->x_name_len,
- buffer, buffer_size, ret);
+ return handler && (!handler->list || handler->list(dentry))
+ ? __bch2_xattr_emit(handler->prefix ?: handler->name,
+ xattr->x_name, xattr->x_name_len, buf)
+ : 0;
}
-static void bch2_xattr_list_bcachefs(struct bch_fs *c,
- struct bch_inode_info *inode,
- char **buffer,
- size_t *buffer_size,
- ssize_t *ret,
- bool all)
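+/*
+ * List the bcachefs.* inode option xattrs; @all selects the
+ * "bcachefs_effective." prefix:
+ */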
+static int bch2_xattr_list_bcachefs(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct xattr_buf *buf,
+ bool all)
{
const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
unsigned id;
+ int ret = 0;
u64 v;
for (id = 0; id < Inode_opt_nr; id++) {
!(inode->ei_inode.bi_fields_set & (1 << id)))
continue;
- __bch2_xattr_emit(prefix,
- bch2_inode_opts[id],
- strlen(bch2_inode_opts[id]),
- buffer, buffer_size, ret);
- if (*ret < 0)
+ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
+ strlen(bch2_inode_opts[id]), buf);
+ if (ret)
break;
}
+
+ return ret;
}
ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
u64 inum = dentry->d_inode->i_ino;
- ssize_t ret = 0;
+ int ret;
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, BTREE_ID_XATTRS,
- POS(inum, 0), 0, k) {
+ POS(inum, 0), 0, k, ret) {
BUG_ON(k.k->p.inode < inum);
if (k.k->p.inode > inum)
if (k.k->type != KEY_TYPE_xattr)
continue;
- bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v,
- &buffer, &buffer_size, &ret);
- if (ret < 0)
+ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+ if (ret)
break;
}
- bch2_trans_exit(&trans);
+ ret = bch2_trans_exit(&trans) ?: ret;
- if (ret < 0)
+ if (ret)
return ret;
- bch2_xattr_list_bcachefs(c, inode, &buffer,
- &buffer_size, &ret, false);
- if (ret < 0)
+ ret = bch2_xattr_list_bcachefs(c, inode, &buf, false);
+ if (ret)
return ret;
- bch2_xattr_list_bcachefs(c, inode, &buffer,
- &buffer_size, &ret, true);
- if (ret < 0)
+ ret = bch2_xattr_list_bcachefs(c, inode, &buf, true);
+ if (ret)
return ret;
- return ret;
+ return buf.used;
}
static int bch2_xattr_get_handler(const struct xattr_handler *handler,