-4231dd5cf0f04dd61b0b8bae44a357da8331c0e2
+9ceb982d7790f552e2f5c96bebeab176516cf144
bucket_bytes(ca));
}
- /* Prios/gens: */
- for (i = 0; i < prio_buckets(ca); i++)
- range_add(&data,
- bucket_bytes(ca) * ca->prio_last_buckets[i],
- bucket_bytes(ca));
-
/* Btree: */
for (i = 0; i < BTREE_ID_NR; i++) {
const struct bch_extent_ptr *ptr;
opts.nochanges = true;
opts.noreplay = true;
opts.errors = BCH_ON_ERROR_CONTINUE;
+ opts.degraded = true;
while ((opt = getopt(argc, argv, "o:fh")) != -1)
switch (opt) {
opts.nochanges = true;
opts.norecovery = true;
opts.errors = BCH_ON_ERROR_CONTINUE;
+ opts.degraded = true;
while ((opt = getopt(argc, argv, "b:s:e:i:m:fvh")) != -1)
switch (opt) {
const char *err;
int opt;
+ opts.degraded = true;
+
while ((opt = getopt(argc, argv, "pynfvh")) != -1)
switch (opt) {
case 'p':
die("error reserving space in new filesystem: %s",
strerror(-ret));
- bch2_check_mark_super(c, &e->k_i, false);
+ bch2_check_mark_super(c, extent_i_to_s_c(e), false);
ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
&res, NULL, NULL, 0);
return __builtin_popcountl(w);
}
+static inline unsigned long hweight8(unsigned long w)
+{
+ return __builtin_popcountl(w);
+}
+
/**
* rol64 - rotate a 64-bit value left
* @word: value to rotate
SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size);
SET_BCH_SB_GC_RESERVE(sb, 8);
SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas);
- SET_BCH_SB_META_REPLICAS_HAVE(sb, opts.meta_replicas);
SET_BCH_SB_META_REPLICAS_REQ(sb, opts.meta_replicas_required);
SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas);
- SET_BCH_SB_DATA_REPLICAS_HAVE(sb, opts.data_replicas);
SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required);
SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action);
SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH);
BCH_SB_CLEAN(sb),
- BCH_SB_META_REPLICAS_HAVE(sb),
+ 0LLU, //BCH_SB_META_REPLICAS_HAVE(sb),
BCH_SB_META_REPLICAS_WANT(sb),
- BCH_SB_DATA_REPLICAS_HAVE(sb),
+ 0LLU, //BCH_SB_DATA_REPLICAS_HAVE(sb),
BCH_SB_DATA_REPLICAS_WANT(sb),
BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_NR
: "unknown",
BCH_MEMBER_TIER(m),
- BCH_MEMBER_HAS_METADATA(m),
- BCH_MEMBER_HAS_DATA(m),
+ 0LLU, //BCH_MEMBER_HAS_METADATA(m),
+ 0LLU, //BCH_MEMBER_HAS_DATA(m),
BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR
? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-static void __bch2_bucket_free(struct bch_dev *, struct bucket *);
static void bch2_recalc_min_prio(struct bch_dev *, int);
/* Allocation groups: */
c->pd_controllers_update_seconds * HZ);
}
-/*
- * Bucket priorities/gens:
- *
- * For each bucket, we store on disk its
- * 8 bit gen
- * 16 bit priority
- *
- * See alloc.c for an explanation of the gen. The priority is used to implement
- * lru (and in the future other) cache replacement policies; for most purposes
- * it's just an opaque integer.
- *
- * The gens and the priorities don't have a whole lot to do with each other, and
- * it's actually the gens that must be written out at specific times - it's no
- * big deal if the priorities don't get written, if we lose them we just reuse
- * buckets in suboptimal order.
- *
- * On disk they're stored in a packed array, and in as many buckets are required
- * to fit them all. The buckets we use to store them form a list; the journal
- * header points to the first bucket, the first bucket points to the second
- * bucket, et cetera.
- *
- * This code is used by the allocation code; periodically (whenever it runs out
- * of buckets to allocate from) the allocation code will invalidate some
- * buckets, but it can't use those buckets until their new gens are safely on
- * disk.
- */
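+/*
+ * Size of an alloc key's value in u64s: the fixed header plus two bytes for
+ * each optional field present in the ->fields bitmap, rounded up to u64s:
+ */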
+static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
+{
+ unsigned bytes = offsetof(struct bch_alloc, data);
+
+ if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+ bytes += 2;
+ if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+ bytes += 2;
+
+ return DIV_ROUND_UP(bytes, sizeof(u64));
+}
-static int prio_io(struct bch_dev *ca, uint64_t bucket, int op)
+static const char *bch2_alloc_invalid(const struct bch_fs *c,
+ struct bkey_s_c k)
{
- bio_init(ca->bio_prio, ca->bio_prio->bi_inline_vecs, bucket_pages(ca));
- ca->bio_prio->bi_opf = op|REQ_SYNC|REQ_META;
- ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size;
- ca->bio_prio->bi_bdev = ca->disk_sb.bdev;
- ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca);
- bch2_bio_map(ca->bio_prio, ca->disk_buckets);
-
- return submit_bio_wait(ca->bio_prio);
+ if (k.k->p.inode >= c->sb.nr_devices ||
+ !c->devs[k.k->p.inode])
+ return "invalid device";
+
+ switch (k.k->type) {
+ case BCH_ALLOC: {
+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
+ if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
+ return "incorrect value size";
+ break;
+ }
+ default:
+ return "invalid type";
+ }
+
+ return NULL;
}
-static struct nonce prio_nonce(struct prio_set *p)
+static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
- return (struct nonce) {{
- [0] = 0,
- [1] = p->nonce[0],
- [2] = p->nonce[1],
- [3] = p->nonce[2]^BCH_NONCE_PRIO,
- }};
+ buf[0] = '\0';
+
+ switch (k.k->type) {
+ case BCH_ALLOC:
+ break;
+ }
}
-int bch2_prio_write(struct bch_dev *ca)
+const struct bkey_ops bch2_bkey_alloc_ops = {
+ .key_invalid = bch2_alloc_invalid,
+ .val_to_text = bch2_alloc_to_text,
+};
+
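+/*
+ * Read/write one variable width (1, 2 or 4 byte) little endian field from an
+ * alloc key's packed data area, advancing the cursor past it:
+ */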
+static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
{
- struct bch_fs *c = ca->fs;
- struct journal *j = &c->journal;
- struct journal_res res = { 0 };
- bool need_new_journal_entry;
- int i, ret = 0;
+ unsigned v;
- if (c->opts.nochanges)
- return 0;
+ switch (bytes) {
+ case 1:
+ v = **p;
+ break;
+ case 2:
+ v = le16_to_cpup((void *) *p);
+ break;
+ case 4:
+ v = le32_to_cpup((void *) *p);
+ break;
+ default:
+ BUG();
+ }
- mutex_lock(&ca->prio_write_lock);
- trace_prio_write_start(ca);
+ *p += bytes;
+ return v;
+}
- ca->need_prio_write = false;
+static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v)
+{
+ switch (bytes) {
+ case 1:
+ **p = v;
+ break;
+ case 2:
+ *((__le16 *) *p) = cpu_to_le16(v);
+ break;
+ case 4:
+ *((__le32 *) *p) = cpu_to_le32(v);
+ break;
+ default:
+ BUG();
+ }
- atomic64_add(ca->mi.bucket_size * prio_buckets(ca),
- &ca->meta_sectors_written);
+ *p += bytes;
+}
- for (i = prio_buckets(ca) - 1; i >= 0; --i) {
- struct bucket *g;
- struct prio_set *p = ca->disk_buckets;
- struct bucket_disk *d = p->data;
- struct bucket_disk *end = d + prios_per_bucket(ca);
- size_t r;
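+/*
+ * Initialize a bucket's in memory state (gen, read/write prios) from a key in
+ * the alloc btree:
+ */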
+static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_dev *ca;
+ struct bkey_s_c_alloc a;
+ struct bucket_mark new;
+ struct bucket *g;
+ const u8 *d;
- for (r = i * prios_per_bucket(ca);
- r < ca->mi.nbuckets && d < end;
- r++, d++) {
- g = ca->buckets + r;
- d->prio[READ] = cpu_to_le16(g->prio[READ]);
- d->prio[WRITE] = cpu_to_le16(g->prio[WRITE]);
- d->gen = ca->buckets[r].mark.gen;
- }
+ if (k.k->type != BCH_ALLOC)
+ return;
- p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]);
- p->magic = cpu_to_le64(pset_magic(c));
- get_random_bytes(&p->nonce, sizeof(p->nonce));
+ a = bkey_s_c_to_alloc(k);
+ ca = c->devs[a.k->p.inode];
- spin_lock(&ca->prio_buckets_lock);
- r = bch2_bucket_alloc(ca, RESERVE_PRIO);
- BUG_ON(!r);
+ if (a.k->p.offset >= ca->mi.nbuckets)
+ return;
- /*
- * goes here before dropping prio_buckets_lock to guard against
- * it getting gc'd from under us
- */
- ca->prio_buckets[i] = r;
- bch2_mark_metadata_bucket(ca, ca->buckets + r,
- BUCKET_PRIOS, false);
- spin_unlock(&ca->prio_buckets_lock);
-
- SET_PSET_CSUM_TYPE(p, bch2_meta_checksum_type(c));
-
- bch2_encrypt(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- p->encrypted_start,
- bucket_bytes(ca) -
- offsetof(struct prio_set, encrypted_start));
-
- p->csum = bch2_checksum(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- (void *) p + sizeof(p->csum),
- bucket_bytes(ca) - sizeof(p->csum));
-
- ret = prio_io(ca, r, REQ_OP_WRITE);
- if (bch2_dev_fatal_io_err_on(ret, ca,
- "prio write to bucket %zu", r) ||
- bch2_meta_write_fault("prio"))
- goto err;
- }
+ g = ca->buckets + a.k->p.offset;
+ bucket_cmpxchg(g, new, ({
+ new.gen = a.v->gen;
+ new.gen_valid = 1;
+ }));
+
+ d = a.v->data;
+ if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+ g->prio[READ] = get_alloc_field(&d, 2);
+ if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+ g->prio[WRITE] = get_alloc_field(&d, 2);
+}
- spin_lock(&j->lock);
- j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]);
- j->nr_prio_buckets = max_t(unsigned,
- ca->dev_idx + 1,
- j->nr_prio_buckets);
- spin_unlock(&j->lock);
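+/*
+ * At startup: read bucket gens/prios from the alloc btree, then from any alloc
+ * keys sitting in the journal that haven't been replayed yet:
+ */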
+int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
+{
+ struct journal_replay *r;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
- do {
- unsigned u64s = jset_u64s(0);
+ if (!c->btree_roots[BTREE_ID_ALLOC].b)
+ return 0;
- if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
- break;
+ for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
+ bch2_alloc_read_key(c, k);
+ bch2_btree_iter_cond_resched(&iter);
+ }
- ret = bch2_journal_res_get(j, &res, u64s, u64s);
- if (ret)
- goto err;
+ ret = bch2_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
- need_new_journal_entry = j->buf[res.idx].nr_prio_buckets <
- ca->dev_idx + 1;
- bch2_journal_res_put(j, &res);
+ list_for_each_entry(r, journal_replay_list, list) {
+ struct bkey_i *k, *n;
+ struct jset_entry *entry;
- ret = bch2_journal_flush_seq(j, res.seq);
- if (ret)
- goto err;
- } while (need_new_journal_entry);
+ for_each_jset_key(k, n, entry, &r->j)
+ if (entry->btree_id == BTREE_ID_ALLOC)
+ bch2_alloc_read_key(c, bkey_i_to_s_c(k));
+ }
- /*
- * Don't want the old priorities to get garbage collected until after we
- * finish writing the new ones, and they're journalled
- */
+ return 0;
+}
- spin_lock(&ca->prio_buckets_lock);
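+/*
+ * Write out a single bucket's current gen as a key in the alloc btree,
+ * retrying the update if it returns -EINTR:
+ */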
+static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *g, struct btree_iter *iter,
+ u64 *journal_seq)
+{
+ struct bucket_mark m = READ_ONCE(g->mark);
+ __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
+ struct bkey_i_alloc *a;
+ u8 *d;
+ int ret;
- for (i = 0; i < prio_buckets(ca); i++) {
- if (ca->prio_last_buckets[i])
- __bch2_bucket_free(ca,
- &ca->buckets[ca->prio_last_buckets[i]]);
+ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, g - ca->buckets));
- ca->prio_last_buckets[i] = ca->prio_buckets[i];
- }
+ do {
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ break;
- spin_unlock(&ca->prio_buckets_lock);
+ a = bkey_alloc_init(&alloc_key.k);
+ a->k.p = iter->pos;
+ a->v.fields = 0;
+ a->v.gen = m.gen;
+ set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
+
+ d = a->v.data;
+ if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+ put_alloc_field(&d, 2, g->prio[READ]);
+ if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+ put_alloc_field(&d, 2, g->prio[WRITE]);
+
+ bch2_btree_iter_set_pos(iter, a->k.p);
+ ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE|
+ BTREE_INSERT_NOWAIT,
+ BTREE_INSERT_ENTRY(iter, &a->k_i));
+ bch2_btree_iter_cond_resched(iter);
+ } while (ret == -EINTR);
- trace_prio_write_end(ca);
-err:
- mutex_unlock(&ca->prio_write_lock);
return ret;
}
-int bch2_prio_read(struct bch_dev *ca)
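+/*
+ * Journal replay of an alloc key: regenerate the key from the in memory bucket
+ * state, after checking that the device and bucket still exist:
+ */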
+int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
{
- struct bch_fs *c = ca->fs;
- struct prio_set *p = ca->disk_buckets;
- struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
- struct bucket_mark new;
- struct bch_csum csum;
- unsigned bucket_nr = 0;
- u64 bucket, expect, got;
- size_t b;
- int ret = 0;
+ struct bch_dev *ca;
+ struct bucket *g;
+ struct btree_iter iter;
+ int ret;
- if (ca->prio_read_done)
- return 0;
+ lockdep_assert_held(&c->state_lock);
- ca->prio_read_done = true;
+ if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
+ return 0;
- spin_lock(&c->journal.lock);
- bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]);
- spin_unlock(&c->journal.lock);
+ ca = c->devs[pos.inode];
- /*
- * If the device hasn't been used yet, there won't be a prio bucket ptr
- */
- if (!bucket)
+ if (pos.offset >= ca->mi.nbuckets)
return 0;
- if (mustfix_fsck_err_on(bucket < ca->mi.first_bucket ||
- bucket >= ca->mi.nbuckets, c,
- "bad prio bucket %llu", bucket))
- return 0;
+ g = ca->buckets + pos.offset;
- for (b = 0; b < ca->mi.nbuckets; b++, d++) {
- if (d == end) {
- ca->prio_last_buckets[bucket_nr] = bucket;
- bucket_nr++;
-
- ret = prio_io(ca, bucket, REQ_OP_READ) ||
- bch2_meta_read_fault("prio");
-
- if (mustfix_fsck_err_on(ret, c,
- "IO error reading bucket gens (%i)",
- ret))
- return 0;
-
- got = le64_to_cpu(p->magic);
- expect = pset_magic(c);
- if (mustfix_fsck_err_on(got != expect, c,
- "bad magic (got %llu expect %llu) while reading prios from bucket %llu",
- got, expect, bucket))
- return 0;
-
- if (mustfix_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c,
- "prio bucket with unknown csum type %llu bucket %lluu",
- PSET_CSUM_TYPE(p), bucket))
- return 0;
-
- csum = bch2_checksum(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- (void *) p + sizeof(p->csum),
- bucket_bytes(ca) - sizeof(p->csum));
- if (fsck_err_on(bch2_crc_cmp(csum, p->csum), c,
- "bad checksum reading prios from bucket %llu",
- bucket))
- return 0;
-
- bch2_encrypt(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- p->encrypted_start,
- bucket_bytes(ca) -
- offsetof(struct prio_set, encrypted_start));
-
- bucket = le64_to_cpu(p->next_bucket);
- d = p->data;
- }
+ bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
+ BTREE_ITER_INTENT);
- ca->buckets[b].prio[READ] = le16_to_cpu(d->prio[READ]);
- ca->buckets[b].prio[WRITE] = le16_to_cpu(d->prio[WRITE]);
+ ret = __bch2_alloc_write_key(c, ca, g, &iter, NULL);
+ bch2_btree_iter_unlock(&iter);
+ return ret;
+}
- bucket_cmpxchg(&ca->buckets[b], new, ({
- new.gen = d->gen;
- new.gen_valid = 1;
- }));
- }
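+/* Write out alloc keys for every bucket on @ca: */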
+int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
+{
+ struct btree_iter iter;
+ struct bucket *g;
+ int ret = 0;
- mutex_lock(&c->bucket_lock);
- bch2_recalc_min_prio(ca, READ);
- bch2_recalc_min_prio(ca, WRITE);
- mutex_unlock(&c->bucket_lock);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
+ BTREE_ITER_INTENT);
+
+ for_each_bucket(g, ca) {
+ ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq);
+ if (ret)
+ break;
+ }
- ret = 0;
-fsck_err:
+ bch2_btree_iter_unlock(&iter);
return ret;
}
long i;
unsigned j;
- for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
- BUG_ON(ca->prio_buckets[iter] == bucket);
-
for (j = 0; j < RESERVE_NR; j++)
fifo_for_each_entry(i, &ca->free[j], iter)
BUG_ON(i == bucket);
static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
{
- spin_lock(&ca->freelist_lock);
-
- bch2_invalidate_bucket(ca, g);
+ struct bch_fs *c = ca->fs;
+ struct bucket_mark m;
- g->prio[READ] = ca->fs->prio_clock[READ].hand;
- g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
+ spin_lock(&ca->freelist_lock);
+ if (!bch2_invalidate_bucket(ca, g, &m)) {
+ spin_unlock(&ca->freelist_lock);
+ return;
+ }
verify_not_on_freelist(ca, g - ca->buckets);
BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
-
spin_unlock(&ca->freelist_lock);
+
+ g->prio[READ] = c->prio_clock[READ].hand;
+ g->prio[WRITE] = c->prio_clock[WRITE].hand;
+
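+	/*
+	 * If the bucket held cached data, its new gen has to reach the journal
+	 * before the bucket can be reused; otherwise, if the bucket mark holds
+	 * a truncated (16 bit) journal seq, reconstruct the full seq relative
+	 * to the current one and track the newest seq that needs flushing:
+	 */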
+ if (m.cached_sectors) {
+ ca->allocator_invalidating_data = true;
+ } else if (m.journal_seq_valid) {
+ u64 journal_seq = atomic64_read(&c->journal.seq);
+ u64 bucket_seq = journal_seq;
+
+ bucket_seq &= ~((u64) U16_MAX);
+ bucket_seq |= m.journal_seq;
+
+ if (bucket_seq > journal_seq)
+ bucket_seq -= 1 << 16;
+
+ ca->allocator_journal_seq_flush =
+ max(ca->allocator_journal_seq_flush, bucket_seq);
+ }
}
/*
struct bucket *g,
struct bucket_mark m)
{
+ /*
+ * Time since last read, scaled to [0, 8) where larger value indicates
+ * more recently read data:
+ */
unsigned long hotness =
(g->prio[READ] - ca->min_prio[READ]) * 7 /
(ca->fs->prio_clock[READ].hand - ca->min_prio[READ]);
- return (((hotness + 1) * bucket_sectors_used(m)) << 8) |
+ /* How much we want to keep the data in this bucket: */
+ unsigned long data_wantness =
+ (hotness + 1) * bucket_sectors_used(m);
+
+ unsigned long needs_journal_commit =
+ bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk);
+
+ return (data_wantness << 9) |
+ (needs_journal_commit << 8) |
bucket_gc_gen(ca, g);
}
static void invalidate_buckets(struct bch_dev *ca)
{
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
+ ca->inc_gen_needs_gc = 0;
+ ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) {
case CACHE_REPLACEMENT_LRU:
}
}
-static bool __bch2_allocator_push(struct bch_dev *ca, long bucket)
+static int size_t_cmp(const void *_l, const void *_r)
{
- if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
- goto success;
-
- if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket))
- goto success;
-
- if (fifo_push(&ca->free[RESERVE_BTREE], bucket))
- goto success;
-
- if (fifo_push(&ca->free[RESERVE_NONE], bucket))
- goto success;
+ const size_t *l = _l, *r = _r;
- return false;
-success:
- closure_wake_up(&ca->fs->freelist_wait);
- return true;
+ return (*l > *r) - (*l < *r);
}
-static bool bch2_allocator_push(struct bch_dev *ca, long bucket)
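+/*
+ * Write new alloc keys (i.e. new gens) for the buckets currently sitting in
+ * free_inc; returns the number of keys written, or an error if none were:
+ */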
+static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
+ u64 *journal_seq)
{
- bool ret;
+ struct btree_iter iter;
+ unsigned nr_invalidated = 0;
+ size_t b, i;
+ int ret = 0;
- spin_lock(&ca->freelist_lock);
- ret = __bch2_allocator_push(ca, bucket);
- if (ret)
- fifo_pop(&ca->free_inc, bucket);
- spin_unlock(&ca->freelist_lock);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
+ BTREE_ITER_INTENT);
- return ret;
+ fifo_for_each_entry(b, &ca->free_inc, i) {
+ ret = __bch2_alloc_write_key(c, ca, ca->buckets + b,
+ &iter, journal_seq);
+ if (ret)
+ break;
+
+ nr_invalidated++;
+ }
+
+ bch2_btree_iter_unlock(&iter);
+ return nr_invalidated ?: ret;
}
-static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca)
+/*
+ * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
+ * then add it to the freelist, waiting until there's room if necessary:
+ */
+static void discard_invalidated_bucket(struct bch_dev *ca, long bucket)
{
- u16 last_seq_ondisk = c->journal.last_seq_ondisk;
- struct bucket *g;
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca, bucket),
+ ca->mi.bucket_size, GFP_NOIO, 0);
- for_each_bucket(g, ca) {
- struct bucket_mark m = READ_ONCE(g->mark);
- if (is_available_bucket(m) &&
- !m.cached_sectors &&
- !m.had_metadata &&
- !bucket_needs_journal_commit(m, last_seq_ondisk)) {
- spin_lock(&ca->freelist_lock);
+ while (1) {
+ bool pushed = false;
+ unsigned i;
- bch2_mark_alloc_bucket(ca, g, true);
- g->prio[READ] = c->prio_clock[READ].hand;
- g->prio[WRITE] = c->prio_clock[WRITE].hand;
+ set_current_state(TASK_INTERRUPTIBLE);
- verify_not_on_freelist(ca, g - ca->buckets);
- BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
+ /*
+ * Don't remove from free_inc until after it's added to
+ * freelist, so gc can find it:
+ */
+ spin_lock(&ca->freelist_lock);
+ for (i = 0; i < RESERVE_NR; i++)
+ if (fifo_push(&ca->free[i], bucket)) {
+ fifo_pop(&ca->free_inc, bucket);
+ closure_wake_up(&ca->fs->freelist_wait);
+ pushed = true;
+ break;
+ }
+ spin_unlock(&ca->freelist_lock);
- spin_unlock(&ca->freelist_lock);
+ if (pushed)
+ break;
- if (fifo_full(&ca->free_inc))
- break;
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
}
+ schedule();
+ try_to_freeze();
}
-}
-
-static int size_t_cmp(const void *_l, const void *_r)
-{
- const size_t *l = _l, *r = _r;
- return (*l > *r) - (*l < *r);
+ __set_current_state(TASK_RUNNING);
}
/**
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
- long bucket;
+ size_t bucket;
int ret;
set_freezable();
- bch2_find_empty_buckets(c, ca);
-
- while (1) {
- /*
- * First, we pull buckets off of the free_inc list, possibly
- * issue discards to them, then we add the bucket to a
- * free list:
- */
-
- while (!fifo_empty(&ca->free_inc)) {
- bucket = fifo_peek(&ca->free_inc);
-
- /*
- * Don't remove from free_inc until after it's added
- * to freelist, so gc doesn't miss it while we've
- * dropped bucket lock
- */
-
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca, bucket),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (bch2_allocator_push(ca, bucket))
- break;
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- goto out;
- }
- schedule();
- try_to_freeze();
- }
-
- __set_current_state(TASK_RUNNING);
- }
-
- /* We've run out of free buckets! */
+ while (!kthread_should_stop()) {
+ u64 journal_seq = 0;
+ /* Reset front/back so we can easily sort fifo entries later: */
BUG_ON(fifo_used(&ca->free_inc));
- ca->free_inc.front = ca->free_inc.back = 0;
+ ca->free_inc.front = ca->free_inc.back = 0;
+ ca->allocator_journal_seq_flush = 0;
+ ca->allocator_invalidating_data = false;
down_read(&c->gc_lock);
+ if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
+ up_read(&c->gc_lock);
+ goto out;
+ }
+
while (1) {
/*
* Find some buckets that we can invalidate, either
*/
invalidate_buckets(ca);
-
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
spin_unlock(&ca->freelist_lock);
/*
- * free_inc is full of newly-invalidated buckets, must write out
- * prios and gens before they can be re-used
+ * free_inc is now full of newly-invalidated buckets: next,
+ * write out the new bucket gens:
*/
- ret = bch2_prio_write(ca);
- if (ret) {
- /*
- * Emergency read only - allocator thread has to
- * shutdown.
- *
- * N.B. we better be going into RO mode, else
- * allocations would hang indefinitely - whatever
- * generated the error will have sent us into RO mode.
- *
- * Clear out the free_inc freelist so things are
- * consistent-ish:
- */
- spin_lock(&ca->freelist_lock);
- while (fifo_pop(&ca->free_inc, bucket))
- bch2_mark_free_bucket(ca, ca->buckets + bucket);
- spin_unlock(&ca->freelist_lock);
- goto out;
+
+ while (!fifo_empty(&ca->free_inc) && !kthread_should_stop()) {
+ ret = bch2_invalidate_free_inc(c, ca, &journal_seq);
+ if (bch2_fs_fatal_err_on(ret < 0, c,
+ "error invalidating buckets: %i", ret))
+ goto err;
+
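+			/*
+			 * If any invalidated buckets held cached data, the
+			 * journal entries carrying their new gens must be
+			 * flushed before they can be reused; otherwise it's
+			 * enough to flush up to the newest seq that dirtied
+			 * any of them:
+			 */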
+ if (ca->allocator_invalidating_data)
+ bch2_journal_flush_seq(&c->journal, journal_seq);
+ else if (ca->allocator_journal_seq_flush)
+ bch2_journal_flush_seq(&c->journal,
+ ca->allocator_journal_seq_flush);
+
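+			/*
+			 * ret is the number of buckets whose new alloc keys
+			 * were just written: discard them and move them from
+			 * free_inc onto the freelists:
+			 */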
+ while (ret && !kthread_should_stop()) {
+ BUG_ON(fifo_empty(&ca->free_inc));
+
+ bucket = fifo_peek(&ca->free_inc);
+ discard_invalidated_bucket(ca, bucket);
+ --ret;
+ }
}
+
+ ca->alloc_thread_started = true;
}
out:
/*
*/
synchronize_rcu();
return 0;
+err:
+ /*
+ * Emergency read only - allocator thread has to shutdown.
+ *
+ * N.B. we better be going into RO mode, else allocations would hang
+ * indefinitely - whatever generated the error will have sent us into RO
+ * mode.
+ *
+ * Clear out the free_inc freelist so things are consistent-ish:
+ */
+ spin_lock(&ca->freelist_lock);
+ while (fifo_pop(&ca->free_inc, bucket))
+ bch2_mark_free_bucket(ca, ca->buckets + bucket);
+ spin_unlock(&ca->freelist_lock);
+ goto out;
}
/* Allocation */
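+/*
+ * Startup allocation path: before the allocator thread has filled the
+ * freelists, take an available bucket that hasn't been touched since mount
+ * straight from the bucket array:
+ */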
+static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bucket *g;
+ long r = -1;
+
+ if (!down_read_trylock(&c->gc_lock))
+ return r;
+
+ if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+ goto out;
+
+ for_each_bucket(g, ca)
+ if (!g->mark.touched_this_mount &&
+ is_available_bucket(g->mark) &&
+ bch2_mark_alloc_bucket_startup(ca, g)) {
+ r = g - ca->buckets;
+ break;
+ }
+out:
+ up_read(&c->gc_lock);
+ return r;
+}
+
/**
* bch_bucket_alloc - allocate a single bucket from a specific device
*
- * Returns index of bucket on success, 0 on failure
+ * Returns index of bucket on success, -1 on failure
* */
-size_t bch2_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve)
+long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve)
{
- struct bucket *g;
- long r;
+ size_t r;
spin_lock(&ca->freelist_lock);
- if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
- fifo_pop(&ca->free[reserve], r))
+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], r)))
goto out;
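+	/*
+	 * General freelist is empty - fall back to the requested reserve:
+	 * RESERVE_ALLOC may always take from the btree node freelist, while
+	 * RESERVE_BTREE only takes from it while it's at least half full:
+	 */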
+ switch (reserve) {
+ case RESERVE_ALLOC:
+ if (fifo_pop(&ca->free[RESERVE_BTREE], r))
+ goto out;
+ break;
+ case RESERVE_BTREE:
+ if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
+ ca->free[RESERVE_BTREE].size &&
+ fifo_pop(&ca->free[RESERVE_BTREE], r))
+ goto out;
+ break;
+ case RESERVE_MOVINGGC:
+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r))
+ goto out;
+ break;
+ default:
+ break;
+ }
+
spin_unlock(&ca->freelist_lock);
+ if (unlikely(!ca->alloc_thread_started) &&
+ (r = bch2_bucket_alloc_startup(c, ca)) >= 0) {
+ verify_not_on_freelist(ca, r);
+ goto out2;
+ }
+
trace_bucket_alloc_fail(ca, reserve);
- return 0;
+ return -1;
out:
verify_not_on_freelist(ca, r);
spin_unlock(&ca->freelist_lock);
- trace_bucket_alloc(ca, reserve);
-
bch2_wake_allocator(ca);
+out2:
+ ca->buckets[r].prio[READ] = c->prio_clock[READ].hand;
+ ca->buckets[r].prio[WRITE] = c->prio_clock[WRITE].hand;
- g = ca->buckets + r;
-
- g->prio[READ] = ca->fs->prio_clock[READ].hand;
- g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
-
+ trace_bucket_alloc(ca, reserve);
return r;
}
-static void __bch2_bucket_free(struct bch_dev *ca, struct bucket *g)
-{
- bch2_mark_free_bucket(ca, g);
-}
-
enum bucket_alloc_ret {
ALLOC_SUCCESS,
NO_DEVICES, /* -EROFS */
while (ob->nr_ptrs < nr_replicas) {
struct bch_dev *ca;
- u64 bucket;
+ long bucket;
if (!available) {
ret = NO_DEVICES;
get_random_int() > devs->d[i].weight)
continue;
- bucket = bch2_bucket_alloc(ca, reserve);
- if (!bucket) {
+ bucket = bch2_bucket_alloc(c, ca, reserve);
+ if (bucket < 0) {
if (fail_idx == -1)
fail_idx = i;
continue;
? 0 : BTREE_NODE_RESERVE;
int ret;
- BUG_ON(!reserve);
BUG_ON(!nr_replicas);
retry:
ob = lock_writepoint(c, wp);
capacity *= (100 - c->opts.gc_reserve_percent);
capacity = div64_u64(capacity, 100);
- BUG_ON(capacity + reserved_sectors > total_capacity);
+ BUG_ON(reserved_sectors > total_capacity);
+
+ capacity = min(capacity, total_capacity - reserved_sectors);
c->capacity = capacity;
closure_wake_up(&c->freelist_wait);
}
-static void bch2_stop_write_point(struct bch_dev *ca,
- struct write_point *wp)
+static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
+ struct write_point *wp)
{
- struct bch_fs *c = ca->fs;
struct open_bucket *ob;
struct bch_extent_ptr *ptr;
bch2_open_bucket_put(c, ob);
}
-static bool bch2_dev_has_open_write_point(struct bch_dev *ca)
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
struct bch_extent_ptr *ptr;
struct open_bucket *ob;
}
/* device goes ro: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
- struct task_struct *p;
struct closure cl;
unsigned i;
+ BUG_ON(ca->alloc_thread);
+
closure_init_stack(&cl);
/* First, remove device from allocation groups: */
+ bch2_dev_group_remove(&c->journal.devs, ca);
bch2_dev_group_remove(tier, ca);
bch2_dev_group_remove(&c->all_devs, ca);
- bch2_recalc_capacity(c);
-
/*
- * Stopping the allocator thread comes after removing from allocation
- * groups, else pending allocations will hang:
- */
-
- p = ca->alloc_thread;
- ca->alloc_thread = NULL;
- smp_wmb();
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid a race with bch2_usage_update() -
- * the allocator thread itself does a synchronize_rcu() on exit.
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
+ * Capacity is calculated based off of devices in allocation groups:
*/
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
+ bch2_recalc_capacity(c);
/* Next, close write points that point to this device... */
-
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- bch2_stop_write_point(ca, &c->write_points[i]);
+ bch2_stop_write_point(c, ca, &c->write_points[i]);
- bch2_stop_write_point(ca, &ca->copygc_write_point);
- bch2_stop_write_point(ca, &c->promote_write_point);
- bch2_stop_write_point(ca, &ca->tiering_write_point);
- bch2_stop_write_point(ca, &c->migration_write_point);
- bch2_stop_write_point(ca, &c->btree_write_point);
+ bch2_stop_write_point(c, ca, &ca->copygc_write_point);
+ bch2_stop_write_point(c, ca, &c->promote_write_point);
+ bch2_stop_write_point(c, ca, &ca->tiering_write_point);
+ bch2_stop_write_point(c, ca, &c->migration_write_point);
+ bch2_stop_write_point(c, ca, &c->btree_write_point);
mutex_lock(&c->btree_reserve_cache_lock);
while (c->btree_reserve_cache_nr) {
}
mutex_unlock(&c->btree_reserve_cache_lock);
- /* Avoid deadlocks.. */
-
+ /*
+ * Wake up threads that were blocked on allocation, so they can notice
+	 * the device is no longer usable for allocation and capacity has changed:
+ */
closure_wake_up(&c->freelist_wait);
+
+ /*
+ * journal_res_get() can block waiting for free space in the journal -
+ * it needs to notice there may not be devices to allocate from anymore:
+ */
wake_up(&c->journal.wait);
/* Now wait for any in flight writes: */
while (1) {
closure_wait(&c->open_buckets_wait, &cl);
- if (!bch2_dev_has_open_write_point(ca)) {
+ if (!bch2_dev_has_open_write_point(c, ca)) {
closure_wake_up(&c->open_buckets_wait);
break;
}
}
}
-/*
- * Startup the allocator thread for transition to RW mode:
- */
-int bch2_dev_allocator_start(struct bch_dev *ca)
+/* device goes rw: */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
struct bch_sb_field_journal *journal_buckets;
bool has_journal;
- struct task_struct *k;
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- k = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
- if (IS_ERR(k))
- return 0;
-
- get_task_struct(k);
- ca->alloc_thread = k;
-
- bch2_dev_group_add(tier, ca);
bch2_dev_group_add(&c->all_devs, ca);
+ bch2_dev_group_add(tier, ca);
mutex_lock(&c->sb_lock);
journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb);
if (has_journal)
bch2_dev_group_add(&c->journal.devs, ca);
+}
- bch2_recalc_capacity(c);
+/* stop allocator thread: */
+void bch2_dev_allocator_stop(struct bch_dev *ca)
+{
+ struct task_struct *p = ca->alloc_thread;
+
+ ca->alloc_thread = NULL;
+ smp_wmb();
+
+ /*
+ * We need an rcu barrier between setting ca->alloc_thread = NULL and
+ * the thread shutting down to avoid a race with bch2_usage_update() -
+ * the allocator thread itself does a synchronize_rcu() on exit.
+ *
+ * XXX: it would be better to have the rcu barrier be asynchronous
+ * instead of blocking us here
+ */
+ if (p)
+ kthread_stop(p);
+}
+
+/* start allocator thread: */
+int bch2_dev_allocator_start(struct bch_dev *ca)
+{
+ struct task_struct *p;
/*
- * Don't wake up allocator thread until after adding device to
- * allocator groups - otherwise, alloc thread could get a spurious
- * -EROFS due to prio_write() -> journal_meta() not finding any devices:
+ * allocator thread already started?
*/
- wake_up_process(k);
+ if (ca->alloc_thread)
+ return 0;
+
+ p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ ca->alloc_thread = p;
return 0;
}
struct bch_fs;
struct dev_group;
-static inline size_t prios_per_bucket(const struct bch_dev *ca)
-{
- return (bucket_bytes(ca) - sizeof(struct prio_set)) /
- sizeof(struct bucket_disk);
-}
-
-static inline size_t prio_buckets(const struct bch_dev *ca)
-{
- return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca));
-}
-
void bch2_dev_group_remove(struct dev_group *, struct bch_dev *);
void bch2_dev_group_add(struct dev_group *, struct bch_dev *);
-int bch2_prio_read(struct bch_dev *);
-int bch2_prio_write(struct bch_dev *);
+int bch2_alloc_read(struct bch_fs *, struct list_head *);
+int bch2_alloc_write(struct bch_fs *, struct bch_dev *, u64 *);
+int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
-size_t bch2_bucket_alloc(struct bch_dev *, enum alloc_reserve);
+long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
(_ptr)++)
void bch2_recalc_capacity(struct bch_fs *);
+
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
+
void bch2_fs_allocator_init(struct bch_fs *);
+extern const struct bkey_ops bch2_bkey_alloc_ops;
+
#endif /* _BCACHE_ALLOC_H */
-/* There is one reserve for each type of btree, one for prios and gens
- * and one for moving GC */
+/* Freelist reserves: one for btree nodes, one for moving GC, one for normal
+ * allocations; RESERVE_ALLOC (for the alloc btree itself) draws from the
+ * btree reserve */
enum alloc_reserve {
- RESERVE_PRIO,
- RESERVE_BTREE,
- RESERVE_METADATA_LAST = RESERVE_BTREE,
- RESERVE_MOVINGGC,
-
- RESERVE_NONE,
- RESERVE_NR,
+ RESERVE_ALLOC = -1,
+ RESERVE_BTREE = 0,
+ RESERVE_MOVINGGC = 1,
+ RESERVE_NONE = 2,
+ RESERVE_NR = 3,
};
-static inline bool allocation_is_metadata(enum alloc_reserve id)
-{
- return id <= RESERVE_METADATA_LAST;
-}
-
struct dev_group {
spinlock_t lock;
unsigned nr;
(btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
/* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2)
+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
struct btree;
struct crypto_blkcipher;
u16 bucket_size; /* sectors */
u8 state;
u8 tier;
- u8 has_metadata;
- u8 has_data;
u8 replacement;
u8 discard;
u8 valid;
};
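+/*
+ * In memory copy of the superblock's replicas section: each entry is a data
+ * type plus a bitmap of the devices that replica combination lives on:
+ */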
+struct bch_replicas_cpu_entry {
+ u8 data_type;
+ u8 devs[BCH_SB_MEMBERS_MAX / 8];
+};
+
+struct bch_replicas_cpu {
+ struct rcu_head rcu;
+ unsigned nr;
+ unsigned entry_size;
+ struct bch_replicas_cpu_entry entries[];
+};
+
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
struct task_struct *alloc_thread;
- struct prio_set *disk_buckets;
-
- /*
- * When allocating new buckets, prio_write() gets first dibs - since we
- * may not be allocate at all without writing priorities and gens.
- * prio_last_buckets[] contains the last buckets we wrote priorities to
- * (so gc can mark them as metadata).
- */
- u64 *prio_buckets;
- u64 *prio_last_buckets;
- spinlock_t prio_buckets_lock;
- struct bio *bio_prio;
- bool prio_read_done;
- bool need_prio_write;
- struct mutex prio_write_lock;
+ bool need_alloc_write;
/*
* free: Buckets that are ready to be used
DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
spinlock_t freelist_lock;
+ bool alloc_thread_started;
size_t fifo_last_bucket;
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
+ u64 allocator_journal_seq_flush;
+ bool allocator_invalidating_data;
alloc_heap alloc_heap;
bucket_heap copygc_heap;
BCH_FS_FSCK_FIXED_ERRORS,
BCH_FS_FSCK_DONE,
BCH_FS_FIXED_GENS,
+ BCH_FS_REBUILD_REPLICAS,
};
struct btree_debug {
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
+ struct bch_replicas_cpu __rcu *replicas;
+ struct bch_replicas_cpu __rcu *replicas_gc;
+ struct mutex replicas_gc_lock;
+
struct bch_opts opts;
/* Updated by bch2_sb_update():*/
u8 nr_devices;
u8 clean;
- u8 meta_replicas_have;
- u8 data_replicas_have;
-
u8 str_hash_type;
u8 encryption_type;
#define _BCACHEFS_FORMAT_H
/*
- * Bcache on disk data structures
+ * bcachefs on disk data structures
*/
#include <asm/types.h>
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(xattr, BCH_XATTR);
+/* Bucket/allocation information: */
+
+enum {
+ BCH_ALLOC = 128,
+};
+
+enum {
+ BCH_ALLOC_FIELD_READ_TIME = 0,
+ BCH_ALLOC_FIELD_WRITE_TIME = 1,
+};
+
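+/*
+ * Value of an alloc btree key: the bucket's gen, plus optional fields (last
+ * read/write times) packed into data[] as indicated by the fields bitmap:
+ */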
+struct bch_alloc {
+ struct bch_val v;
+ __u8 fields;
+ __u8 gen;
+ __u8 data[];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(alloc, BCH_ALLOC);
+
/* Superblock */
/* Version 0: Cache device
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8)
-LE64_BITMASK(BCH_MEMBER_HAS_METADATA, struct bch_member, flags[0], 8, 9)
-LE64_BITMASK(BCH_MEMBER_HAS_DATA, struct bch_member, flags[0], 9, 10)
+/* 8-10 unused, was HAS_(META)DATA */
LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15);
BCH_SB_FIELD_journal = 0,
BCH_SB_FIELD_members = 1,
BCH_SB_FIELD_crypt = 2,
- BCH_SB_FIELD_NR = 3,
+ BCH_SB_FIELD_replicas = 3,
+ BCH_SB_FIELD_NR = 4,
};
struct bch_sb_field_journal {
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-struct bch_sb_field_replication {
+enum bch_data_types {
+ BCH_DATA_NONE = 0,
+ BCH_DATA_SB = 1,
+ BCH_DATA_JOURNAL = 2,
+ BCH_DATA_BTREE = 3,
+ BCH_DATA_USER = 4,
+ BCH_DATA_NR = 5,
+};
+
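+/*
+ * On disk replicas section: a list of variable length entries, each giving a
+ * data type and the list of devices that combination of replicas lives on:
+ */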
+struct bch_replicas_entry {
+ u8 data_type;
+ u8 nr;
+ u8 devs[0];
+};
+
+struct bch_sb_field_replicas {
struct bch_sb_field field;
+ struct bch_replicas_entry entries[0];
};
/*
LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
-LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE, struct bch_sb, flags[0], 56, 60);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE, struct bch_sb, flags[0], 60, 64);
+/* 56-64 unused, was REPLICAS_HAVE */
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8);
LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
+
/* 14-20 unused, was JOURNAL_ENTRY_SIZE */
LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
BCH_COMPRESSION_NR = 3,
};
-/* backing device specific stuff: */
-
-struct backingdev_sb {
- __le64 csum;
- __le64 offset; /* sector where this sb was written */
- __le64 version; /* of on disk format */
-
- uuid_le magic; /* bcachefs superblock UUID */
-
- uuid_le disk_uuid;
-
- /*
- * Internal cache set UUID - xored with various magic numbers and thus
- * must never change:
- */
- union {
- uuid_le set_uuid;
- __le64 set_magic;
- };
- __u8 label[BCH_SB_LABEL_SIZE];
-
- __le64 flags;
-
- /* Incremented each time superblock is written: */
- __le64 seq;
-
- /*
- * User visible UUID for identifying the cache set the user is allowed
- * to change:
- *
- * XXX hooked up?
- */
- uuid_le user_uuid;
- __le64 pad1[6];
-
- __le64 data_offset;
- __le16 block_size; /* sectors */
- __le16 pad2[3];
-
- __le32 last_mount; /* time_t */
- __le16 pad3;
- /* size of variable length portion - always 0 for backingdev superblock */
- __le16 u64s;
- __u64 _data[0];
-};
-
-LE64_BITMASK(BDEV_CACHE_MODE, struct backingdev_sb, flags, 0, 4);
-#define CACHE_MODE_WRITETHROUGH 0U
-#define CACHE_MODE_WRITEBACK 1U
-#define CACHE_MODE_WRITEAROUND 2U
-#define CACHE_MODE_NONE 3U
-
-LE64_BITMASK(BDEV_STATE, struct backingdev_sb, flags, 61, 63);
-#define BDEV_STATE_NONE 0U
-#define BDEV_STATE_CLEAN 1U
-#define BDEV_STATE_DIRTY 2U
-#define BDEV_STATE_STALE 3U
-
-#define BDEV_DATA_START_DEFAULT 16 /* sectors */
-
-static inline _Bool __SB_IS_BDEV(__u64 version)
-{
- return version == BCACHE_SB_VERSION_BDEV
- || version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
-}
-
-static inline _Bool SB_IS_BDEV(const struct bch_sb *sb)
-{
- return __SB_IS_BDEV(sb->version);
-}
-
/*
* Magic numbers
*
#define BCACHE_STATFS_MAGIC 0xca451a4e
#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
-#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
}
-static inline __u64 __pset_magic(struct bch_sb *sb)
-{
- return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
-}
-
static inline __u64 __bset_magic(struct bch_sb *sb)
{
return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
LE32_BITMASK(JOURNAL_ENTRY_TYPE, struct jset_entry, flags, 0, 8);
enum {
- JOURNAL_ENTRY_BTREE_KEYS = 0,
- JOURNAL_ENTRY_BTREE_ROOT = 1,
- JOURNAL_ENTRY_PRIO_PTRS = 2,
+ JOURNAL_ENTRY_BTREE_KEYS = 0,
+ JOURNAL_ENTRY_BTREE_ROOT = 1,
+ JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */
/*
* Journal sequence numbers can be blacklisted: bsets record the max
* and then record that we skipped it so that the next time we crash and
* recover we don't think there was a missing journal entry.
*/
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
+ JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
};
/*
#define BCH_JOURNAL_BUCKETS_MIN 20
-/* Bucket prios/gens */
-
-struct prio_set {
- struct bch_csum csum;
-
- __le64 magic;
- __le32 nonce[3];
- __le16 version;
- __le16 flags;
-
- __u8 encrypted_start[0];
-
- __le64 next_bucket;
-
- struct bucket_disk {
- __le16 prio[2];
- __u8 gen;
- } __attribute__((packed)) data[];
-} __attribute__((packed, aligned(8)));
-
-LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
-
/* Btree: */
#define DEFINE_BCH_BTREE_IDS() \
- DEF_BTREE_ID(EXTENTS, 0, "extents") \
- DEF_BTREE_ID(INODES, 1, "inodes") \
- DEF_BTREE_ID(DIRENTS, 2, "dirents") \
- DEF_BTREE_ID(XATTRS, 3, "xattrs")
+ DEF_BTREE_ID(EXTENTS, 0, "extents") \
+ DEF_BTREE_ID(INODES, 1, "inodes") \
+ DEF_BTREE_ID(DIRENTS, 2, "dirents") \
+ DEF_BTREE_ID(XATTRS, 3, "xattrs") \
+ DEF_BTREE_ID(ALLOC, 4, "alloc")
#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
};
} __attribute__((packed, aligned(8)));
+/* Obsolete: */
+
+struct prio_set {
+ struct bch_csum csum;
+
+ __le64 magic;
+ __le32 nonce[3];
+ __le16 version;
+ __le16 flags;
+
+ __u8 encrypted_start[0];
+
+ __le64 next_bucket;
+
+ struct bucket_disk {
+ __le16 prio[2];
+ __u8 gen;
+ } __attribute__((packed)) data[];
+} __attribute__((packed, aligned(8)));
+
+LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
+
+#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
+
+static inline __u64 __pset_magic(struct bch_sb *sb)
+{
+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
+}
+
#endif /* _BCACHEFS_FORMAT_H */
-#ifndef _LINUX_BCACHE_IOCTL_H
-#define _LINUX_BCACHE_IOCTL_H
+#ifndef _BCACHEFS_IOCTL_H
+#define _BCACHEFS_IOCTL_H
#include <linux/uuid.h>
#include "bcachefs_format.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
__u64 end_offset;
};
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LINUX_BCACHE_IOCTL_H */
+#endif /* _BCACHEFS_IOCTL_H */
BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
+BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
+
/* byte order helpers */
#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_types.h"
+#include "alloc.h"
#include "dirent.h"
#include "error.h"
#include "extents.h"
[BKEY_TYPE_INODES] = &bch2_bkey_inode_ops,
[BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops,
[BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops,
+ [BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops,
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
};
int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
+ enum bch_data_types data_type = type == BKEY_TYPE_BTREE
+ ? BCH_DATA_BTREE : BCH_DATA_USER;
int ret = 0;
switch (k.k->type) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
+ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+ (!c->opts.nofsck &&
+ fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
+ "superblock not marked as containing replicas"))) {
+ ret = bch2_check_mark_super(c, e, data_type);
+ if (ret)
+ return ret;
+ }
+
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bucket *g = PTR_BUCKET(ca, ptr);
new.gen = ptr->gen;
new.gen_valid = 1;
}));
- ca->need_prio_write = true;
+ ca->need_alloc_write = true;
}
if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
new.gen = ptr->gen;
new.gen_valid = 1;
}));
- ca->need_prio_write = true;
+ ca->need_alloc_write = true;
set_bit(BCH_FS_FIXED_GENS, &c->flags);
}
}
}
+
atomic64_set(&c->key_version,
max_t(u64, k.k->version.lo,
atomic64_read(&c->key_version)));
}
spin_unlock(&c->journal.lock);
-
- spin_lock(&ca->prio_buckets_lock);
-
- for (i = 0; i < prio_buckets(ca) * 2; i++) {
- b = ca->prio_buckets[i];
- if (b)
- bch2_mark_metadata_bucket(ca, ca->buckets + b,
- BUCKET_PRIOS, true);
- }
-
- spin_unlock(&ca->prio_buckets_lock);
}
static void bch2_mark_metadata(struct bch_fs *c)
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
-
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
- return;
-
trace_gc_start(c);
/*
bch2_recalc_sectors_available(c);
down_write(&c->gc_lock);
+ if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+ goto out;
bch2_gc_start(c);
if (ret) {
bch_err(c, "btree gc failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
- up_write(&c->gc_lock);
- return;
+ goto out;
}
gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
/* Indicates that gc is no longer in progress: */
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
c->gc_count++;
-
+out:
up_write(&c->gc_lock);
trace_gc_end(c);
bch2_time_stats_update(&c->btree_gc_time, start_time);
*/
for_each_member_device(ca, c, i)
bch2_wake_allocator(ca);
+
+ /*
+ * At startup, allocations can happen directly instead of via the
+ * allocator thread - issue wakeup in case they blocked on gc_lock:
+ */
+ closure_wake_up(&c->freelist_wait);
}
/* Btree coalescing */
unsigned iter = 0;
enum btree_id id;
int ret;
+
+ mutex_lock(&c->sb_lock);
+ if (!bch2_sb_get_replicas(c->disk_sb)) {
+ if (BCH_SB_INITIALIZED(c->disk_sb))
+ bch_info(c, "building replicas info");
+ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
+ }
+ mutex_unlock(&c->sb_lock);
again:
bch2_gc_start(c);
return ret;
}
- if (journal) {
- ret = bch2_journal_mark(c, journal);
- if (ret)
- return ret;
- }
+ ret = bch2_journal_mark(c, journal);
+ if (ret)
+ return ret;
bch2_mark_metadata(c);
ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
if (ret)
- bch2_fatal_error(c);
+ bch2_inconsistent_error(c);
return ret;
}
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
- bool use_reserve,
- struct disk_reservation *res,
- struct closure *cl)
+ struct disk_reservation *res,
+ struct closure *cl,
+ unsigned flags)
{
BKEY_PADDED(k) tmp;
struct open_bucket *ob;
struct btree *b;
- unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE;
+ unsigned nr_reserve;
+ enum alloc_reserve alloc_reserve;
+
+ if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
+ nr_reserve = 0;
+ alloc_reserve = RESERVE_ALLOC;
+ } else if (flags & BTREE_INSERT_USE_RESERVE) {
+ nr_reserve = BTREE_NODE_RESERVE / 2;
+ alloc_reserve = RESERVE_BTREE;
+ } else {
+ nr_reserve = BTREE_NODE_RESERVE;
+ alloc_reserve = RESERVE_NONE;
+ }
mutex_lock(&c->btree_reserve_cache_lock);
- if (c->btree_reserve_cache_nr > reserve) {
+ if (c->btree_reserve_cache_nr > nr_reserve) {
struct btree_alloc *a =
&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
bkey_i_to_extent(&tmp.k),
res->nr_replicas,
c->opts.metadata_replicas_required,
- use_reserve ? RESERVE_BTREE : RESERVE_NONE,
- cl);
+ alloc_reserve, cl);
if (IS_ERR(ob))
return ERR_CAST(ob);
bch2_btree_build_aux_trees(b);
- bch2_check_mark_super(c, &b->key, true);
+ bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), BCH_DATA_BTREE);
trace_btree_node_alloc(c, b);
return b;
if (flags & BTREE_INSERT_NOFAIL)
disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
- if (flags & BTREE_INSERT_NOWAIT)
- cl = NULL;
-
/*
* This check isn't necessary for correctness - it's just to potentially
* prevent us from doing a lot of work that'll end up being wasted:
reserve->nr = 0;
while (reserve->nr < nr_nodes) {
- b = __bch2_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE,
- &disk_res, cl);
+ b = __bch2_btree_node_alloc(c, &disk_res,
+ flags & BTREE_INSERT_NOWAIT
+ ? NULL : cl, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err_free;
struct btree_write *w = btree_current_write(b);
EBUG_ON(iter->level || b->level);
- EBUG_ON(!trans->journal_res.ref &&
- test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+ EBUG_ON(trans->journal_res.ref !=
+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
if (!journal_pin_active(&w->journal))
bch2_journal_pin_add(j, &trans->journal_res,
*/
six_unlock_read(&b->lock);
mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * Bit of funny circularity going on here we have to break:
+ *
+ * We have to drop our journal pin before writing the journal
+ * entry that points to the new btree root: else, we could
+ * deadlock if the journal currently happens to be full.
+ *
+	 * This means we're dropping the journal pin _before_ the new
+ * nodes are technically reachable - but this is safe, because
+ * after the bch2_btree_set_root_ondisk() call above they will
+ * be reachable as of the very next journal write:
+ */
+ bch2_journal_pin_drop(&c->journal, &as->journal);
+
+ /*
+ * And, do a journal write to write the pointer to the new root,
+ * then wait for it to complete before freeing the nodes we
+ * replaced:
+ */
+ bch2_journal_meta_async(&c->journal, cl);
break;
}
mutex_unlock(&c->btree_interior_update_lock);
+ /*
+ * In general, when you're staging things in a journal that will later
+ * be written elsewhere, and you also want to guarantee ordering: that
+ * is, if you have updates a, b, c, after a crash you should never see c
+ * and not a or b - there's a problem:
+ *
+ * If the final destination of the update(s) (i.e. btree node) can be
+ * written/flushed _before_ the relevant journal entry - oops, that
+ * breaks ordering, since the various leaf nodes can be written in any
+ * order.
+ *
+ * Normally we use bset->journal_seq to deal with this - if during
+ * recovery we find a btree node write that's newer than the newest
+ * journal entry, we just ignore it - we don't need it, anything we're
+ * supposed to have (that we reported as completed via fsync()) will
+ * still be in the journal, and as far as the state of the journal is
+ * concerned that btree node write never happened.
+ *
+ * That breaks when we're rewriting/splitting/merging nodes, since we're
+ * mixing btree node writes that haven't happened yet with previously
+ * written data that has been reported as completed to the journal.
+ *
+	 * Thus, before making the new nodes reachable, we have to wait for the
+ * newest journal sequence number we have data for to be written (if it
+ * hasn't been yet).
+ */
bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
continue_at(&as->cl, btree_interior_update_nodes_written,
system_freezable_wq);
}
-static void btree_interior_update_reparent(struct btree_interior_update *as,
+static void interior_update_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
+{
+ struct btree_interior_update *as =
+ container_of(pin, struct btree_interior_update, journal);
+
+ bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
+}
+
+static void btree_interior_update_reparent(struct bch_fs *c,
+ struct btree_interior_update *as,
struct btree_interior_update *child)
{
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
child->parent_as = as;
closure_get(&as->cl);
+
+ /*
+ * When we write a new btree root, we have to drop our journal pin
+ * _before_ the new nodes are technically reachable; see
+ * btree_interior_update_nodes_written().
+ *
+ * This goes for journal pins that are recursively blocked on us - so,
+ * just transfer the journal pin to the new interior update so
+ * btree_interior_update_nodes_written() can drop it.
+ */
+ bch2_journal_pin_add_if_older(&c->journal, &child->journal,
+ &as->journal, interior_update_flush);
+ bch2_journal_pin_drop(&c->journal, &child->journal);
+
+ as->journal_seq = max(as->journal_seq, child->journal_seq);
}
static void btree_interior_update_updated_root(struct bch_fs *c,
* btree_interior_update operation to point to us:
*/
if (r->as)
- btree_interior_update_reparent(as, r->as);
+ btree_interior_update_reparent(c, as, r->as);
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
as->b = r->b;
mutex_unlock(&c->btree_interior_update_lock);
+ /*
+ * When we're rewriting nodes and updating interior nodes, there's an
+ * issue with updates that haven't been written in the journal getting
+	 * mixed together with older data - see btree_interior_update_updated_btree()
+ * for the explanation.
+ *
+ * However, this doesn't affect us when we're writing a new btree root -
+ * because to make that new root reachable we have to write out a new
+ * journal entry, which must necessarily be newer than as->journal_seq.
+ */
+
continue_at(&as->cl, btree_interior_update_nodes_written,
system_freezable_wq);
}
-static void interior_update_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
-{
- struct btree_interior_update *as =
- container_of(pin, struct btree_interior_update, journal);
-
- bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
-}
-
/*
* @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_interior_updates - redirect @b's
*/
list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
list_del(&p->write_blocked_list);
- btree_interior_update_reparent(as, p);
+ btree_interior_update_reparent(c, as, p);
}
clear_btree_node_dirty(b);
/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE (1 << 2)
+#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3)
/*
* Insert is for journal replay: don't get journal reservations, or mark extents
* (bch_mark_key)
*/
-#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3)
+#define BTREE_INSERT_JOURNAL_REPLAY (1 << 4)
/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT (1 << 4)
-#define BTREE_INSERT_GC_LOCK_HELD (1 << 5)
+#define BTREE_INSERT_NOWAIT (1 << 5)
+#define BTREE_INSERT_GC_LOCK_HELD (1 << 6)
+
+#define BCH_HASH_SET_MUST_CREATE (1 << 7)
+#define BCH_HASH_SET_MUST_REPLACE (1 << 8)
int bch2_btree_delete_at(struct btree_iter *, unsigned);
_old; \
})
-void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
+bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
+ struct bucket_mark *old)
{
struct bch_fs_usage stats = { 0 };
- struct bucket_mark old, new;
+ struct bucket_mark new;
+
+ *old = bucket_data_cmpxchg(ca, g, new, ({
+ if (!is_available_bucket(new))
+ return false;
- old = bucket_data_cmpxchg(ca, g, new, ({
new.owned_by_allocator = 1;
- new.had_metadata = 0;
+ new.touched_this_mount = 1;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
}));
/* XXX: we're not actually updating fs usage's cached sectors... */
- bch2_fs_usage_update(&stats, old, new);
+ bch2_fs_usage_update(&stats, *old, new);
- if (!old.owned_by_allocator && old.cached_sectors)
+ if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, g - ca->buckets,
- old.cached_sectors);
+ old->cached_sectors);
+ return true;
+}
+
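+/*
+ * Startup allocation: atomically mark a bucket as owned by the allocator, but
+ * only if it's still available and hasn't been touched since mount:
+ */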
+bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
+{
+ struct bucket_mark new, old;
+
+ old = bucket_data_cmpxchg(ca, g, new, ({
+ if (new.touched_this_mount ||
+ !is_available_bucket(new))
+ return false;
+
+ new.owned_by_allocator = 1;
+ new.touched_this_mount = 1;
+ }));
+
+ return true;
}
void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
struct bucket_mark old, new;
old = bucket_data_cmpxchg(ca, g, new, ({
+ new.touched_this_mount = 1;
new.owned_by_allocator = 0;
new.data_type = 0;
new.cached_sectors = 0;
struct bucket_mark new;
bucket_data_cmpxchg(ca, g, new, ({
- new.owned_by_allocator = owned_by_allocator;
+ new.touched_this_mount = 1;
+ new.owned_by_allocator = owned_by_allocator;
}));
}
old = bucket_data_cmpxchg(ca, g, new, ({
saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size,
GC_MAX_SECTORS_USED);
- new.data_type = type;
- new.had_metadata = 1;
+ new.data_type = type;
+ new.touched_this_mount = 1;
}));
if (old.data_type != type &&
if (gc_will_visit) {
if (journal_seq)
bucket_cmpxchg(g, new, ({
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
+ new.touched_this_mount = 1;
+ new.journal_seq_valid = 1;
+ new.journal_seq = journal_seq;
}));
goto out;
return;
}
- EBUG_ON(type != S_CACHED &&
- !may_make_unavailable &&
- is_available_bucket(new) &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
-
if (type != S_CACHED &&
new.dirty_sectors == GC_MAX_SECTORS_USED &&
disk_sectors < 0)
new.data_type = data_type;
}
- new.had_metadata |= is_meta_bucket(new);
+ new.touched_this_mount = 1;
}));
if (old.data_type != data_type &&
void bch2_bucket_seq_cleanup(struct bch_fs *);
-void bch2_invalidate_bucket(struct bch_dev *, struct bucket *);
+bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *,
+ struct bucket_mark *);
+bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *);
void bch2_mark_free_bucket(struct bch_dev *, struct bucket *);
void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,
#include "util.h"
+/* TODO: kill this enum, switch to bch_data_types */
enum bucket_data_type {
BUCKET_DATA = 0,
BUCKET_BTREE,
struct {
u8 gen;
-
- unsigned gen_valid:1;
- unsigned journal_seq_valid:1;
-
- /*
- * If this bucket had metadata while at the current generation
- * number, the allocator must increment its gen before we reuse
- * it:
- */
- unsigned had_metadata:1;
-
- unsigned owned_by_allocator:1;
-
- unsigned data_type:3;
-
- unsigned nouse:1;
-
+ u8 data_type:3,
+ gen_valid:1,
+ owned_by_allocator:1,
+ nouse:1,
+ journal_seq_valid:1,
+ touched_this_mount:1;
u16 dirty_sectors;
u16 cached_sectors;
size_ondisk > ca->mi.bucket_size)
return "spans multiple buckets";
- if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
- return "device not marked as containing data";
-
return NULL;
}
goto err;
}
- if (replicas < c->sb.meta_replicas_have) {
+ if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), k);
bch2_fs_bug(c,
- "btree key bad (too few replicas, %u < %u): %s",
- replicas, c->sb.meta_replicas_have, buf);
+ "btree key bad (replicas not marked in superblock):\n%s",
+ buf);
return;
}
}
if (!bkey_extent_is_cached(e.k) &&
- replicas < c->sb.data_replicas_have) {
- bch2_bkey_val_to_text(c, btree_node_type(b), buf,
- sizeof(buf), e.s_c);
+ !bch2_sb_has_replicas(c, e, BCH_DATA_USER)) {
+ bch2_bkey_val_to_text(c, btree_node_type(b),
+ buf, sizeof(buf), e.s_c);
bch2_fs_bug(c,
- "extent key bad (too few replicas, %u < %u): %s",
- replicas, c->sb.data_replicas_have, buf);
+ "extent key bad (replicas not marked in superblock):\n%s",
+ buf);
return;
}
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
- bch2_check_mark_super(c, key_to_write, false);
+ bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
+ BCH_DATA_USER);
bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
return ret;
return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
}
-static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
- struct jset_entry *entry, unsigned type)
-{
- while (entry < vstruct_last(jset)) {
- if (JOURNAL_ENTRY_TYPE(entry) == type)
- return entry;
-
- entry = vstruct_next(entry);
- }
-
- return NULL;
-}
-
-#define for_each_jset_entry_type(entry, jset, type) \
- for (entry = (jset)->start; \
- (entry = __jset_entry_type_next(jset, entry, type)); \
- entry = vstruct_next(entry))
-
-#define for_each_jset_key(k, _n, entry, jset) \
- for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
- vstruct_for_each_safe(entry, k, _n)
-
static inline void bch2_journal_add_entry(struct journal_buf *buf,
const void *data, size_t u64s,
unsigned type, enum btree_id id,
JOURNAL_ENTRY_BTREE_ROOT, id, level);
}
-static inline void bch2_journal_add_prios(struct journal *j,
- struct journal_buf *buf)
-{
- /*
- * no prio bucket ptrs yet... XXX should change the allocator so this
- * can't happen:
- */
- if (!buf->nr_prio_buckets)
- return;
-
- bch2_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets,
- JOURNAL_ENTRY_PRIO_PTRS, 0, 0);
-}
-
static void journal_seq_blacklist_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
{
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
- struct jset_entry *prio_ptrs;
struct journal_list jlist;
struct journal_replay *i;
struct journal_entry_pin_list *p;
bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
keys, entries, (u64) atomic64_read(&j->seq));
-
- i = list_last_entry(list, struct journal_replay, list);
- prio_ptrs = bch2_journal_find_entry(&i->j, JOURNAL_ENTRY_PRIO_PTRS, 0);
- if (prio_ptrs) {
- memcpy_u64s(j->prio_buckets,
- prio_ptrs->_data,
- le16_to_cpu(prio_ptrs->u64s));
- j->nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
- }
fsck_err:
return ret;
}
static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
{
- unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
-
- if (buf->nr_prio_buckets)
- ret += JSET_KEYS_U64s + buf->nr_prio_buckets;
-
- return ret;
+ return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
}
static enum {
buf->disk_sectors = sectors;
sectors = min_t(unsigned, sectors, buf->size >> 9);
-
j->cur_buf_sectors = sectors;
- buf->nr_prio_buckets = j->nr_prio_buckets;
u64s = (sectors << 9) / sizeof(u64);
for_each_jset_key(k, _n, entry, &i->j) {
struct disk_reservation disk_res;
- /*
- * We might cause compressed extents to be split, so we
- * need to pass in a disk_reservation:
- */
- BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
+ if (entry->btree_id == BTREE_ID_ALLOC) {
+ /*
+ * allocation code handles replay for
+ * BTREE_ID_ALLOC keys:
+ */
+ ret = bch2_alloc_replay_key(c, k->k.p);
+ } else {
+ /*
+ * We might cause compressed extents to be
+ * split, so we need to pass in a
+ * disk_reservation:
+ */
+ BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
- ret = bch2_btree_insert(c, entry->btree_id, k,
- &disk_res, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY);
- bch2_disk_reservation_put(c, &disk_res);
+ ret = bch2_btree_insert(c, entry->btree_id, k,
+ &disk_res, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_REPLAY);
+ bch2_disk_reservation_put(c, &disk_res);
+ }
if (ret) {
bch_err(c, "journal replay: error %d while replaying key",
return ret;
}
-#if 0
/*
* Allocate more journal space at runtime - not currently making use of it, but
* the code works:
*/
static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
+ unsigned nr)
{
struct journal *j = &c->journal;
struct journal_device *ja = &ca->journal;
while (ja->nr < nr) {
/* must happen under journal lock, to avoid racing with gc: */
- u64 b = bch2_bucket_alloc(ca, RESERVE_NONE);
- if (!b) {
+ long b = bch2_bucket_alloc(c, ca, RESERVE_NONE);
+ if (b < 0) {
if (!closure_wait(&c->freelist_wait, &cl)) {
spin_unlock(&j->lock);
closure_sync(&cl);
}
spin_unlock(&j->lock);
- BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi));
+ BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi));
bch2_write_super(c);
kfree(new_buckets);
bch2_disk_reservation_put(c, &disk_res);
+ if (!ret)
+ bch2_dev_allocator_add(c, ca);
+
return ret;
}
-#endif
int bch2_dev_journal_alloc(struct bch_dev *ca)
{
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
- unsigned i, nr;
- u64 b, *p;
+ unsigned nr;
if (dynamic_fault("bcachefs:add:journal_alloc"))
return -ENOMEM;
min(1 << 10,
(1 << 20) / ca->mi.bucket_size));
- p = krealloc(ja->bucket_seq, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
-
- ja->bucket_seq = p;
-
- p = krealloc(ja->buckets, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
-
- ja->buckets = p;
-
- journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
- return -ENOMEM;
-
- for (i = 0, b = ca->mi.first_bucket;
- i < nr && b < ca->mi.nbuckets; b++) {
- if (!is_available_bucket(ca->buckets[b].mark))
- continue;
-
- bch2_mark_metadata_bucket(ca, &ca->buckets[b],
- BUCKET_JOURNAL, true);
- ja->buckets[i] = b;
- journal_buckets->buckets[i] = cpu_to_le64(b);
- i++;
- }
-
- if (i < nr)
- return -ENOSPC;
-
- BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi));
-
- ja->nr = nr;
-
- return 0;
+ return bch2_set_nr_journal_buckets(ca->fs, ca, nr);
}
/* Journalling */
jset = w->data;
j->write_start_time = local_clock();
-
- bch2_journal_add_prios(j, w);
-
mutex_lock(&c->btree_root_lock);
for (i = 0; i < BTREE_ID_NR; i++) {
struct btree_root *r = &c->btree_roots[i];
closure_return_with_destructor(cl, journal_write_done);
}
- bch2_check_mark_super(c, &j->key, true);
+ bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
+ BCH_DATA_JOURNAL);
/*
* XXX: we really should just disable the entire journal in nochanges
closure_return_with_destructor(cl, journal_write_done);
err:
- bch2_fatal_error(c);
+ bch2_inconsistent_error(c);
closure_return_with_destructor(cl, journal_write_done);
}
struct jset j;
};
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+ struct jset_entry *entry, unsigned type)
+{
+ while (entry < vstruct_last(jset)) {
+ if (JOURNAL_ENTRY_TYPE(entry) == type)
+ return entry;
+
+ entry = vstruct_next(entry);
+ }
+
+ return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type) \
+ for (entry = (jset)->start; \
+ (entry = __jset_entry_type_next(jset, entry, type)); \
+ entry = vstruct_next(entry))
+
+#define for_each_jset_key(k, _n, entry, jset) \
+ for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
+ vstruct_for_each_safe(entry, k, _n)
+
#define JOURNAL_PIN (32 * 1024)
static inline bool journal_pin_active(struct journal_entry_pin *pin)
unsigned size;
unsigned disk_sectors;
-
- /*
- * ugh, prio_buckets are stupid - need to convert them to new
- * transaction machinery when it arrives
- */
- unsigned nr_prio_buckets;
-
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
/* protects advancing ja->last_idx: */
struct mutex reclaim_lock;
-
- /*
- * ugh: need to get prio_buckets converted over to the eventual new
- * transaction machinery
- */
- __le64 prio_buckets[BCH_SB_MEMBERS_MAX];
- unsigned nr_prio_buckets;
-
unsigned write_delay_ms;
unsigned reclaim_delay_ms;
{
struct moving_context ctxt;
struct bch_fs *c = ca->fs;
- struct bch_sb_field_members *mi;
unsigned pass = 0;
u64 seen_key_count;
int ret = 0;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
- if (!ca->mi.has_data)
+ if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
return 0;
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+
bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
ctxt.avoid = ca;
BUG_ON(ret);
seen_key_count++;
+ continue;
next:
+ if (bkey_extent_is_data(k.k))
+ bch2_check_mark_super(c, bkey_s_c_to_extent(k),
+ BCH_DATA_USER);
bch2_btree_iter_advance_pos(&iter);
bch2_btree_iter_cond_resched(&iter);
bch2_move_ctxt_exit(&ctxt);
if (ret)
- return ret;
+ goto err;
} while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
if (seen_key_count) {
pr_err("Unable to migrate all data in %d iterations.",
MAX_DATA_OFF_ITER);
- return -1;
+ ret = -1;
+ goto err;
}
- mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb);
- SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
+err:
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+ return ret;
}
/*
int bch2_move_metadata_off_device(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- struct bch_sb_field_members *mi;
unsigned i;
- int ret;
+ int ret = 0;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
- if (!ca->mi.has_metadata)
+ if (!(bch2_dev_has_data(c, ca) &
+ ((1 << BCH_DATA_JOURNAL)|
+ (1 << BCH_DATA_BTREE))))
return 0;
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c,
+ (1 << BCH_DATA_JOURNAL)|
+ (1 << BCH_DATA_BTREE));
+
/* 1st, Move the btree nodes off the device */
for (i = 0; i < BTREE_ID_NR; i++) {
ret = bch2_move_btree_off(c, ca, i);
if (ret)
- return ret;
+ goto err;
}
/* There are no prios/gens to move -- they are already in the device. */
ret = bch2_journal_move(ca);
if (ret)
- return ret;
-
- mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb);
- SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
+ goto err;
- return 0;
+err:
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+ return ret;
}
/*
*/
int bch2_flag_data_bad(struct bch_dev *ca)
{
- int ret = 0;
+ struct bch_fs *c = ca->fs;
struct bkey_s_c k;
struct bkey_s_c_extent e;
struct btree_iter iter;
+ int ret = 0;
- bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS,
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
POS_MIN, BTREE_ITER_PREFETCH);
while ((k = bch2_btree_iter_peek(&iter)).k &&
*/
continue;
advance:
+ if (bkey_extent_is_data(k.k))
+ bch2_check_mark_super(c, bkey_s_c_to_extent(k),
+ BCH_DATA_USER);
bch2_btree_iter_advance_pos(&iter);
}
bch2_btree_iter_unlock(&iter);
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
return ret;
}
s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
BCH_OPT(data_replicas_required, 0444, BCH_SB_DATA_REPLICAS_REQ,\
s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
+ BCH_OPT(degraded, 0444, NO_SB_OPT, \
+ s8, OPT_BOOL()) \
BCH_OPT(metadata_checksum, 0644, BCH_SB_META_CSUM_TYPE, \
s8, OPT_STR(bch2_csum_types)) \
BCH_OPT(data_checksum, 0644, BCH_SB_DATA_CSUM_TYPE, \
}
}
-#define BCH_HASH_SET_MUST_CREATE (1 << 4)
-#define BCH_HASH_SET_MUST_REPLACE (1 << 5)
-
static inline int bch2_hash_set(const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct bch_fs *c, u64 inode,
#include <linux/backing-dev.h>
#include <linux/sort.h>
+static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+static const char *bch2_sb_validate_replicas(struct bch_sb *);
+
static inline void __bch2_sb_layout_size_assert(void)
{
BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
return l < r ? -1 : l > r ? 1 : 0;
}
-const char *bch2_validate_journal_layout(struct bch_sb *sb,
- struct bch_member_cpu mi)
+const char *bch2_sb_validate_journal(struct bch_sb *sb,
+ struct bch_member_cpu mi)
{
struct bch_sb_field_journal *journal;
const char *err;
return "Invalid superblock: bad member info";
for (i = 0; i < sb->nr_devices; i++) {
- if (bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
+ if (!bch2_dev_exists(sb, mi, i))
continue;
if (le16_to_cpu(mi->members[i].bucket_size) <
return NULL;
}
-const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
+const char *bch2_sb_validate(struct bcache_superblock *disk_sb)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
- if (!BCH_SB_META_REPLICAS_HAVE(sb) ||
- BCH_SB_META_REPLICAS_HAVE(sb) >
- BCH_SB_META_REPLICAS_WANT(sb))
- return "Invalid number of metadata replicas";
-
if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of data replicas";
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
- if (!BCH_SB_DATA_REPLICAS_HAVE(sb) ||
- BCH_SB_DATA_REPLICAS_HAVE(sb) >
- BCH_SB_DATA_REPLICAS_WANT(sb))
- return "Invalid number of data replicas";
-
if (!BCH_SB_BTREE_NODE_SIZE(sb))
return "Btree node size not set";
mi.bucket_size * mi.nbuckets)
return "Invalid superblock: device too small";
- err = bch2_validate_journal_layout(sb, mi);
+ err = bch2_sb_validate_journal(sb, mi);
+ if (err)
+ return err;
+
+ err = bch2_sb_validate_replicas(sb);
if (err)
return err;
c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src);
c->sb.nr_devices = src->nr_devices;
c->sb.clean = BCH_SB_CLEAN(src);
- c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src);
- c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src);
c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src);
c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
c->sb.time_base_lo = le64_to_cpu(src->time_base_lo);
unsigned journal_u64s = journal_buckets
? le32_to_cpu(journal_buckets->field.u64s)
: 0;
+ int ret;
lockdep_assert_held(&c->sb_lock);
return -ENOMEM;
__copy_super(c->disk_sb, src);
- bch2_sb_update(c);
+ ret = bch2_sb_replicas_to_cpu_replicas(c);
+ if (ret)
+ return ret;
+
+ bch2_sb_update(c);
return 0;
}
struct closure *cl = &c->sb_write;
struct bch_dev *ca;
unsigned i, super_idx = 0;
+ const char *err;
bool wrote;
lockdep_assert_held(&c->sb_lock);
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
- if (c->opts.nochanges)
+ for_each_online_member(ca, c, i) {
+ err = bch2_sb_validate(&ca->disk_sb);
+ if (err) {
+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
+ goto out;
+ }
+ }
+
+ if (c->opts.nochanges ||
+ test_bit(BCH_FS_ERROR, &c->flags))
goto out;
do {
bch2_sb_update(c);
}
-void bch2_check_mark_super_slowpath(struct bch_fs *c, const struct bkey_i *k,
- bool meta)
+/* replica information: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+ return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
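+/*
+ * Walk the superblock replicas field, returning the number of entries, their
+ * total size in bytes (including the field header) and the highest device
+ * index referenced:
+ */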
+static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
+ unsigned *nr,
+ unsigned *bytes,
+ unsigned *max_dev)
+{
+ struct bch_replicas_entry *i;
+ unsigned j;
+
+ *nr = 0;
+ *bytes = sizeof(*r);
+ *max_dev = 0;
+
+ if (!r)
+ return;
+
+ for_each_replicas_entry(r, i) {
+ for (j = 0; j < i->nr; j++)
+ *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
+ (*nr)++;
+ }
+
+ *bytes = (void *) i - (void *) r;
+}
+
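+/*
+ * Build the in-memory replicas table from the superblock field: fixed size
+ * entries with a device bitmap, sorted for eytzinger lookups:
+ */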
+static struct bch_replicas_cpu *
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
+{
+ struct bch_replicas_cpu *cpu_r;
+ unsigned i, nr, bytes, max_dev, entry_size;
+
+ bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+
+ entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+ DIV_ROUND_UP(max_dev + 1, 8);
+
+ cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
+ nr * entry_size, GFP_NOIO);
+ if (!cpu_r)
+ return NULL;
+
+ cpu_r->nr = nr;
+ cpu_r->entry_size = entry_size;
+
+ if (nr) {
+ struct bch_replicas_cpu_entry *dst =
+ cpu_replicas_entry(cpu_r, 0);
+ struct bch_replicas_entry *src = sb_r->entries;
+
+ while (dst < cpu_replicas_entry(cpu_r, nr)) {
+ dst->data_type = src->data_type;
+ for (i = 0; i < src->nr; i++)
+ replicas_set_dev(dst, src->devs[i]);
+
+ src = replicas_entry_next(src);
+ dst = (void *) dst + entry_size;
+ }
+ }
+
+ eytzinger0_sort(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ memcmp, NULL);
+ return cpu_r;
+}
+
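+/* Rebuild c->replicas from the current superblock replicas field: */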
+static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_cpu *cpu_r, *old_r;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ sb_r = bch2_sb_get_replicas(c->disk_sb);
+ cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+ if (!cpu_r)
+ return -ENOMEM;
+
+ old_r = c->replicas;
+ rcu_assign_pointer(c->replicas, cpu_r);
+ if (old_r)
+ kfree_rcu(old_r, rcu);
+
+ return 0;
+}
+
+/*
+ * for when gc of replica information is in progress:
+ */
+static int bch2_update_gc_replicas(struct bch_fs *c,
+ struct bch_replicas_cpu *gc_r,
+ struct bkey_s_c_extent e,
+ enum bch_data_types data_type)
{
- struct bch_member *mi;
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
- unsigned nr_replicas = 0;
+ struct bch_replicas_cpu_entry *new_e;
+ struct bch_replicas_cpu *new;
+ unsigned i, nr, entry_size, max_dev = 0;
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached)
+ max_dev = max_t(unsigned, max_dev, ptr->dev);
+
+ entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+ DIV_ROUND_UP(max_dev + 1, 8);
+ entry_size = max(entry_size, gc_r->entry_size);
+ nr = gc_r->nr + 1;
+
+ new = kzalloc(sizeof(struct bch_replicas_cpu) +
+ nr * entry_size, GFP_NOIO);
+ if (!new)
+ return -ENOMEM;
+
+ new->nr = nr;
+ new->entry_size = entry_size;
+
+ for (i = 0; i < gc_r->nr; i++)
+ memcpy(cpu_replicas_entry(new, i),
+ cpu_replicas_entry(gc_r, i),
+ gc_r->entry_size);
+
+ new_e = cpu_replicas_entry(new, nr - 1);
+ new_e->data_type = data_type;
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached)
+ replicas_set_dev(new_e, ptr->dev);
+
+ eytzinger0_sort(new->entries,
+ new->nr,
+ new->entry_size,
+ memcmp, NULL);
+
+ rcu_assign_pointer(c->replicas_gc, new);
+ kfree_rcu(gc_r, rcu);
+ return 0;
+}
+
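+/*
+ * Slowpath: the replica list for @e wasn't marked yet - add it to the
+ * in-progress gc table (if any) and to the superblock, then write the
+ * superblock:
+ */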
+int bch2_check_mark_super_slowpath(struct bch_fs *c, struct bkey_s_c_extent e,
+ enum bch_data_types data_type)
+{
+ struct bch_replicas_cpu *gc_r;
+ const struct bch_extent_ptr *ptr;
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_entry *new_entry;
+ unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
+ int ret = 0;
mutex_lock(&c->sb_lock);
+ gc_r = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+ if (gc_r &&
+ !replicas_has_extent(gc_r, e, data_type)) {
+ ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
+ if (ret)
+ goto err;
+ }
+
/* recheck, might have raced */
- if (bch2_check_super_marked(c, k, meta)) {
+ if (bch2_sb_has_replicas(c, e, data_type)) {
mutex_unlock(&c->sb_lock);
- return;
+ return 0;
}
- mi = bch2_sb_get_members(c->disk_sb)->members;
+ new_entry_bytes = sizeof(struct bch_replicas_entry) +
+ bch2_extent_nr_dirty_ptrs(e.s_c);
+
+ sb_r = bch2_sb_get_replicas(c->disk_sb);
+
+ bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+
+ new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
+
+ sb_r = bch2_fs_sb_resize_replicas(c,
+ DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
+ sizeof(u64)));
+ if (!sb_r) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ new_entry = (void *) sb_r + bytes;
+ new_entry->data_type = data_type;
+ new_entry->nr = 0;
extent_for_each_ptr(e, ptr)
- if (!ptr->cached) {
- (meta
- ? SET_BCH_MEMBER_HAS_METADATA
- : SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true);
- nr_replicas++;
+ if (!ptr->cached)
+ new_entry->devs[new_entry->nr++] = ptr->dev;
+
+ ret = bch2_sb_replicas_to_cpu_replicas(c);
+ if (ret) {
+ memset(new_entry, 0,
+ vstruct_end(&sb_r->field) - (void *) new_entry);
+ goto err;
+ }
+
+ bch2_write_super(c);
+err:
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
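+/*
+ * For each data type, compute the minimum number of online replicas and the
+ * maximum number of offline replicas over all replicas entries;
+ * @dev_to_offline, if non NULL, is counted as offline:
+ */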
+struct replicas_status __bch2_replicas_status(struct bch_fs *c,
+ struct bch_dev *dev_to_offline)
+{
+ struct bch_replicas_cpu_entry *e;
+ struct bch_replicas_cpu *r;
+ unsigned i, dev, dev_slots, nr_online, nr_offline;
+ struct replicas_status ret;
+
+ memset(&ret, 0, sizeof(ret));
+
+ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
+ ret.replicas[i].nr_online = UINT_MAX;
+
+ rcu_read_lock();
+ r = rcu_dereference(c->replicas);
+ dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
+
+ for (i = 0; i < r->nr; i++) {
+ e = cpu_replicas_entry(r, i);
+
+ BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
+
+ nr_online = nr_offline = 0;
+
+ for (dev = 0; dev < dev_slots; dev++) {
+ if (!replicas_test_dev(e, dev))
+ continue;
+
+ if (bch2_dev_is_online(c->devs[dev]) &&
+ c->devs[dev] != dev_to_offline)
+ nr_online++;
+ else
+ nr_offline++;
}
- nr_replicas = min_t(unsigned, nr_replicas,
- (meta
- ? BCH_SB_META_REPLICAS_HAVE
- : BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb));
- (meta
- ? SET_BCH_SB_META_REPLICAS_HAVE
- : SET_BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb, nr_replicas);
+ ret.replicas[e->data_type].nr_online =
+ min(ret.replicas[e->data_type].nr_online,
+ nr_online);
+
+ ret.replicas[e->data_type].nr_offline =
+ max(ret.replicas[e->data_type].nr_offline,
+ nr_offline);
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+struct replicas_status bch2_replicas_status(struct bch_fs *c)
+{
+ return __bch2_replicas_status(c, NULL);
+}
+
+unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
+{
+ struct replicas_status s = bch2_replicas_status(c);
+
+ return meta
+ ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
+ s.replicas[BCH_DATA_BTREE].nr_online)
+ : s.replicas[BCH_DATA_USER].nr_online;
+}
+
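+/* Bitmask of the data types the replicas table says @ca holds: */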
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bch_replicas_cpu_entry *e;
+ struct bch_replicas_cpu *r;
+ unsigned i, ret = 0;
+
+ rcu_read_lock();
+ r = rcu_dereference(c->replicas);
+
+ if (ca->dev_idx >= replicas_dev_slots(r))
+ goto out;
+
+ for (i = 0; i < r->nr; i++) {
+ e = cpu_replicas_entry(r, i);
+
+ if (replicas_test_dev(e, ca->dev_idx))
+ ret |= 1 << e->data_type;
+ }
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+
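+/*
+ * Validate the superblock replicas field: known data types, sane device
+ * counts, only existing devices, no duplicate entries:
+ */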
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
+{
+ struct bch_sb_field_members *mi;
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_cpu *cpu_r = NULL;
+ struct bch_replicas_entry *e;
+ const char *err;
+ unsigned i;
+
+ mi = bch2_sb_get_members(sb);
+ sb_r = bch2_sb_get_replicas(sb);
+ if (!sb_r)
+ return NULL;
+
+ for_each_replicas_entry(sb_r, e) {
+ err = "invalid replicas entry: invalid data type";
+ if (e->data_type >= BCH_DATA_NR)
+ goto err;
+
+ err = "invalid replicas entry: too many devices";
+ if (e->nr >= BCH_REPLICAS_MAX)
+ goto err;
+
+ err = "invalid replicas entry: invalid device";
+ for (i = 0; i < e->nr; i++)
+ if (!bch2_dev_exists(sb, mi, e->devs[i]))
+ goto err;
+ }
+
+ err = "cannot allocate memory";
+ cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+ if (!cpu_r)
+ goto err;
+
+ sort_cmp_size(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ memcmp, NULL);
+
+ for (i = 0; i + 1 < cpu_r->nr; i++) {
+ struct bch_replicas_cpu_entry *l =
+ cpu_replicas_entry(cpu_r, i);
+ struct bch_replicas_cpu_entry *r =
+ cpu_replicas_entry(cpu_r, i + 1);
+
+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
+
+ err = "duplicate replicas entry";
+ if (!memcmp(l, r, cpu_r->entry_size))
+ goto err;
+ }
+
+ err = NULL;
+err:
+ kfree(cpu_r);
+ return err;
+}
+
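+/*
+ * Finish a replicas gc: rewrite the superblock replicas field from the gc
+ * table and make it the live table, or throw it away on error:
+ */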
+int bch2_replicas_gc_end(struct bch_fs *c, int err)
+{
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_cpu *r, *old_r;
+ struct bch_replicas_entry *dst_e;
+ size_t i, j, bytes, dev_slots;
+ int ret = 0;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+
+ r = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+
+ if (err) {
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(r, rcu);
+ goto err;
+ }
+
+ dev_slots = replicas_dev_slots(r);
+
+ bytes = sizeof(struct bch_sb_field_replicas);
+
+ for (i = 0; i < r->nr; i++) {
+ struct bch_replicas_cpu_entry *e =
+ cpu_replicas_entry(r, i);
+
+ bytes += sizeof(struct bch_replicas_entry);
+ for (j = 0; j < r->entry_size - 1; j++)
+ bytes += hweight8(e->devs[j]);
+ }
+
+ sb_r = bch2_fs_sb_resize_replicas(c,
+ DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+ if (!sb_r) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
+
+ dst_e = sb_r->entries;
+ for (i = 0; i < r->nr; i++) {
+ struct bch_replicas_cpu_entry *src_e =
+ cpu_replicas_entry(r, i);
+
+ dst_e->data_type = src_e->data_type;
+
+ for (j = 0; j < dev_slots; j++)
+ if (replicas_test_dev(src_e, j))
+ dst_e->devs[dst_e->nr++] = j;
+
+ dst_e = replicas_entry_next(dst_e);
+ }
+
+ old_r = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+ rcu_assign_pointer(c->replicas, r);
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(old_r, rcu);
bch2_write_super(c);
+err:
mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
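+/*
+ * Start a replicas gc: the gc table starts with only the entries whose data
+ * types are not in @typemask - entries for the gc'd types get re-added as the
+ * caller re-marks its keys:
+ */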
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+ struct bch_replicas_cpu *r, *src;
+ unsigned i;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+ BUG_ON(c->replicas_gc);
+
+ src = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+
+ r = kzalloc(sizeof(struct bch_replicas_cpu) +
+ src->nr * src->entry_size, GFP_NOIO);
+ if (!r) {
+ mutex_unlock(&c->sb_lock);
+ return -ENOMEM;
+ }
+
+ r->entry_size = src->entry_size;
+ r->nr = 0;
+
+ for (i = 0; i < src->nr; i++) {
+ struct bch_replicas_cpu_entry *dst_e =
+ cpu_replicas_entry(r, r->nr);
+ struct bch_replicas_cpu_entry *src_e =
+ cpu_replicas_entry(src, i);
+
+ if (!(src_e->data_type & typemask)) {
+ memcpy(dst_e, src_e, r->entry_size);
+ r->nr++;
+ }
+ }
+
+ eytzinger0_sort(r->entries,
+ r->nr,
+ r->entry_size,
+ memcmp, NULL);
+
+ rcu_assign_pointer(c->replicas_gc, r);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
}
#define _BCACHE_SUPER_IO_H
#include "extents.h"
+#include "eytzinger.h"
#include "super_types.h"
#include <asm/byteorder.h>
BCH_SB_FIELD_TYPE(journal);
BCH_SB_FIELD_TYPE(members);
BCH_SB_FIELD_TYPE(crypt);
+BCH_SB_FIELD_TYPE(replicas);
+
+static inline bool bch2_dev_exists(struct bch_sb *sb,
+ struct bch_sb_field_members *mi,
+ unsigned dev)
+{
+ return dev < sb->nr_devices &&
+ !bch2_is_zero(mi->members[dev].uuid.b, sizeof(uuid_le));
+}
static inline bool bch2_sb_test_feature(struct bch_sb *sb,
enum bch_sb_features f)
.bucket_size = le16_to_cpu(mi->bucket_size),
.state = BCH_MEMBER_STATE(mi),
.tier = BCH_MEMBER_TIER(mi),
- .has_metadata = BCH_MEMBER_HAS_METADATA(mi),
- .has_data = BCH_MEMBER_HAS_DATA(mi),
.replacement = BCH_MEMBER_REPLACEMENT(mi),
.discard = BCH_MEMBER_DISCARD(mi),
.valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
void bch2_free_super(struct bcache_superblock *);
int bch2_super_realloc(struct bcache_superblock *, unsigned);
-const char *bch2_validate_journal_layout(struct bch_sb *,
+const char *bch2_sb_validate_journal(struct bch_sb *,
struct bch_member_cpu);
-const char *bch2_validate_cache_super(struct bcache_superblock *);
+const char *bch2_sb_validate(struct bcache_superblock *);
const char *bch2_read_super(struct bcache_superblock *,
struct bch_opts, const char *);
void bch2_write_super(struct bch_fs *);
-void bch2_check_mark_super_slowpath(struct bch_fs *,
- const struct bkey_i *, bool);
+static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
+ unsigned dev)
+{
+ return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
+}
-static inline bool bch2_check_super_marked(struct bch_fs *c,
- const struct bkey_i *k, bool meta)
+static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
+ unsigned dev)
{
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
- const struct bch_extent_ptr *ptr;
- unsigned nr_replicas = 0;
- bool ret = true;
+ e->devs[dev >> 3] |= 1 << (dev & 7);
+}
- extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
+{
+ return (r->entry_size -
+ offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
+}
- if (ptr->cached)
- continue;
+static inline struct bch_replicas_cpu_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+ return (void *) r->entries + r->entry_size * i;
+}
- if (!(meta
- ? ca->mi.has_metadata
- : ca->mi.has_data)) {
- ret = false;
- break;
+int bch2_check_mark_super_slowpath(struct bch_fs *, struct bkey_s_c_extent,
+ enum bch_data_types);
+
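+/*
+ * Look up the entry for @data_type whose device bitmap exactly matches @e's
+ * dirty (non cached) pointers:
+ */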
+static inline bool replicas_has_extent(struct bch_replicas_cpu *r,
+ struct bkey_s_c_extent e,
+ enum bch_data_types data_type)
+{
+ const struct bch_extent_ptr *ptr;
+ struct bch_replicas_cpu_entry search = {
+ .data_type = data_type,
+ };
+ unsigned max_dev = 0;
+
+ BUG_ON(!data_type ||
+ data_type == BCH_DATA_SB ||
+ data_type >= BCH_DATA_NR);
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached) {
+ max_dev = max_t(unsigned, max_dev, ptr->dev);
+ replicas_set_dev(&search, ptr->dev);
}
- nr_replicas++;
- }
+ return max_dev < replicas_dev_slots(r) &&
+ eytzinger0_find(r->entries, r->nr,
+ r->entry_size,
+ memcmp, &search) < r->nr;
+}
+
+static inline bool bch2_sb_has_replicas(struct bch_fs *c,
+ struct bkey_s_c_extent e,
+ enum bch_data_types data_type)
+{
+ bool ret;
- if (nr_replicas <
- (meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have))
- ret = false;
+ rcu_read_lock();
+ ret = replicas_has_extent(rcu_dereference(c->replicas),
+ e, data_type);
+ rcu_read_unlock();
return ret;
}
-static inline void bch2_check_mark_super(struct bch_fs *c,
- const struct bkey_i *k, bool meta)
+static inline int bch2_check_mark_super(struct bch_fs *c,
+ struct bkey_s_c_extent e,
+ enum bch_data_types data_type)
{
- if (bch2_check_super_marked(c, k, meta))
- return;
+ struct bch_replicas_cpu *gc_r;
+ bool marked;
- bch2_check_mark_super_slowpath(c, k, meta);
+ rcu_read_lock();
+ marked = replicas_has_extent(rcu_dereference(c->replicas),
+ e, data_type) &&
+ (!(gc_r = rcu_dereference(c->replicas_gc)) ||
+ replicas_has_extent(gc_r, e, data_type));
+ rcu_read_unlock();
+
+ if (marked)
+ return 0;
+
+ return bch2_check_mark_super_slowpath(c, e, data_type);
}
+struct replicas_status {
+ struct {
+ unsigned nr_online;
+ unsigned nr_offline;
+ } replicas[BCH_DATA_NR];
+};
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *,
+ struct bch_dev *);
+struct replicas_status bch2_replicas_status(struct bch_fs *);
+
+unsigned bch2_replicas_online(struct bch_fs *, bool);
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
+
+int bch2_replicas_gc_end(struct bch_fs *, int);
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+
#endif /* _BCACHE_SUPER_IO_H */
bch2_dev_allocator_stop(ca);
bch2_fs_journal_stop(&c->journal);
+
+ for_each_member_device(ca, c, i)
+ bch2_dev_allocator_remove(c, ca);
}
static void bch2_writes_disabled(struct percpu_ref *writes)
c->state != BCH_FS_RO)
goto out;
+ for_each_rw_member(ca, c, i)
+ bch2_dev_allocator_add(c, ca);
+ bch2_recalc_capacity(c);
+
err = "error starting allocator thread";
for_each_rw_member(ca, c, i)
if (bch2_dev_allocator_start(ca)) {
mutex_init(&c->state_lock);
mutex_init(&c->sb_lock);
+ mutex_init(&c->replicas_gc_lock);
mutex_init(&c->btree_cache_lock);
mutex_init(&c->bucket_lock);
mutex_init(&c->btree_root_lock);
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->sb.nr_devices; i++)
- if (!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
+ if (bch2_dev_exists(c->disk_sb, mi, i) &&
bch2_dev_alloc(c, i))
goto err;
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
struct bch_dev *ca;
- unsigned i, id;
- time64_t now;
LIST_HEAD(journal);
struct jset *j;
+ struct closure cl;
+ u64 journal_seq = 0;
+ time64_t now;
+ unsigned i;
int ret = -EINVAL;
+ closure_init_stack(&cl);
+
BUG_ON(c->state != BCH_FS_STARTING);
mutex_lock(&c->sb_lock);
bch2_sb_from_fs(c, ca);
mutex_unlock(&c->sb_lock);
+ for_each_rw_member(ca, c, i)
+ bch2_dev_allocator_add(c, ca);
+ bch2_recalc_capacity(c);
+
if (BCH_SB_INITIALIZED(c->disk_sb)) {
ret = bch2_journal_read(c, &journal);
if (ret)
c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
- err = "error reading priorities";
- for_each_readable_member(ca, c, i) {
- ret = bch2_prio_read(ca);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
- }
-
- for (id = 0; id < BTREE_ID_NR; id++) {
+ for (i = 0; i < BTREE_ID_NR; i++) {
unsigned level;
struct bkey_i *k;
- err = "bad btree root";
- k = bch2_journal_find_btree_root(c, j, id, &level);
- if (!k && id == BTREE_ID_EXTENTS)
+ err = "missing btree root";
+ k = bch2_journal_find_btree_root(c, j, i, &level);
+ if (!k && i < BTREE_ID_ALLOC)
goto err;
- if (!k) {
- pr_debug("missing btree root: %d", id);
+
+ if (!k)
continue;
- }
err = "error reading btree root";
- if (bch2_btree_root_read(c, id, k, level))
+ if (bch2_btree_root_read(c, i, k, level))
goto err;
}
- bch_verbose(c, "starting mark and sweep:");
+ err = "error reading allocation information";
+ ret = bch2_alloc_read(c, &journal);
+ if (ret)
+ goto err;
+ bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
+ bch_verbose(c, "mark and sweep done");
if (c->opts.noreplay)
goto recovery_done;
- bch_verbose(c, "mark and sweep done");
+ err = "cannot allocate new btree root";
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (!c->btree_roots[i].b &&
+ bch2_btree_root_alloc(c, i, &cl))
+ goto err;
+
+ closure_sync(&cl);
/*
* bch2_journal_start() can't happen sooner, or btree_gc_finish()
}
bch_verbose(c, "starting journal replay:");
-
err = "journal replay failed";
ret = bch2_journal_replay(c, &journal);
if (ret)
goto err;
-
bch_verbose(c, "journal replay done");
if (c->opts.norecovery)
ret = bch2_fsck(c, !c->opts.nofsck);
if (ret)
goto err;
+ bch_verbose(c, "fsck done");
for_each_rw_member(ca, c, i)
- if (ca->need_prio_write) {
- ret = bch2_prio_write(ca);
+ if (ca->need_alloc_write) {
+ ret = bch2_alloc_write(c, ca, &journal_seq);
if (ret) {
percpu_ref_put(&ca->io_ref);
goto err;
}
}
- bch_verbose(c, "fsck done");
+ bch2_journal_flush_seq(&c->journal, journal_seq);
} else {
struct bch_inode_unpacked inode;
struct bkey_inode_buf packed_inode;
- struct closure cl;
-
- closure_init_stack(&cl);
bch_notice(c, "initializing new filesystem");
goto err;
}
+ err = "cannot allocate new btree root";
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (bch2_btree_root_alloc(c, i, &cl))
+ goto err;
+
/*
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
goto err;
}
- err = "cannot allocate new btree root";
- for (id = 0; id < BTREE_ID_NR; id++)
- if (bch2_btree_root_alloc(c, id, &cl)) {
- closure_sync(&cl);
- goto err;
- }
-
/* Wait for new btree roots to be written: */
closure_sync(&cl);
bch2_journal_entries_free(&journal);
return err;
err:
+ closure_sync(&cl);
+
switch (ret) {
case BCH_FSCK_ERRORS_NOT_FIXED:
bch_err(c, "filesystem contains errors: please report this to the developers");
if (uuid_le_cmp(fs->uuid, sb->uuid))
return "device not a member of filesystem";
- if (sb->dev_idx >= newest->nr_devices)
- return "device has invalid dev_idx";
-
- if (bch2_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
+ if (!bch2_dev_exists(newest, mi, sb->dev_idx))
return "device has been removed";
if (fs->block_size != sb->block_size)
free_percpu(ca->sectors_written);
bioset_exit(&ca->replica_set);
free_percpu(ca->usage_percpu);
- kvpfree(ca->disk_buckets, bucket_bytes(ca));
- kfree(ca->prio_buckets);
- kfree(ca->bio_prio);
kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
free_heap(&ca->copygc_heap);
lockdep_assert_held(&c->state_lock);
- __bch2_dev_read_only(ca->fs, ca);
+ __bch2_dev_read_only(c, ca);
reinit_completion(&ca->offline_complete);
percpu_ref_kill(&ca->io_ref);
return 0;
if (!ca->kobj.state_in_sysfs) {
- ret = kobject_add(&ca->kobj, &ca->fs->kobj,
+ ret = kobject_add(&ca->kobj, &c->kobj,
"dev-%u", ca->dev_idx);
if (ret)
return ret;
struct bch_member *member;
size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
size_t heap_size;
- unsigned i;
+ unsigned i, btree_node_reserve_buckets;
struct bch_dev *ca;
if (bch2_fs_init_fault("dev_alloc"))
ca->dev_idx = dev_idx;
spin_lock_init(&ca->freelist_lock);
- spin_lock_init(&ca->prio_buckets_lock);
- mutex_init(&ca->prio_write_lock);
bch2_dev_moving_gc_init(ca);
INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work);
free_inc_reserve = movinggc_reserve / 2;
heap_size = movinggc_reserve * 8;
+ btree_node_reserve_buckets =
+ DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / c->sb.btree_node_size);
+
if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
+ GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_MOVINGGC],
movinggc_reserve, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
!(ca->buckets = kvpmalloc(ca->mi.nbuckets *
sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO)) ||
- !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
- 2, GFP_KERNEL)) ||
- !(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
- !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
!(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
goto err;
- ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
-
total_reserve = ca->free_inc.size;
for (i = 0; i < RESERVE_NR; i++)
total_reserve += ca->free[i].size;
lg_local_lock(&c->usage_lock);
if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
- bch2_mark_dev_metadata(ca->fs, ca);
+ bch2_mark_dev_metadata(c, ca);
lg_local_unlock(&c->usage_lock);
+ if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+ struct bch_sb_field_journal *journal_buckets =
+ bch2_sb_get_journal(ca->disk_sb.sb);
+ bool has_journal =
+ bch2_nr_journal_buckets(journal_buckets) >=
+ BCH_JOURNAL_BUCKETS_MIN;
+
+ bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
+ bch2_dev_group_add(&c->all_devs, ca);
+
+ if (has_journal)
+ bch2_dev_group_add(&c->journal.devs, ca);
+ }
+
percpu_ref_reinit(&ca->io_ref);
return 0;
}
/* Device management: */
-bool bch2_fs_may_start(struct bch_fs *c, int flags)
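+/*
+ * Given replicas status @s, decide whether the filesystem is usable: degraded
+ * or lost metadata/data is only tolerated if the matching BCH_FORCE_IF_* flag
+ * is set:
+ */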
+static bool have_enough_devs(struct bch_fs *c,
+ struct replicas_status s,
+ unsigned flags)
{
- struct bch_sb_field_members *mi;
- unsigned meta_missing = 0;
- unsigned data_missing = 0;
- bool degraded = false;
- unsigned i;
-
- mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb);
-
- for (i = 0; i < c->disk_sb->nr_devices; i++)
- if (!c->devs[i] &&
- !bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
- degraded = true;
- if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
- meta_missing++;
- if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
- data_missing++;
- }
- mutex_unlock(&c->sb_lock);
-
- if (degraded &&
- !(flags & BCH_FORCE_IF_DEGRADED))
- return false;
-
- if (meta_missing &&
+ if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
+ s.replicas[BCH_DATA_BTREE].nr_offline) &&
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
return false;
- if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
+ if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
+ !s.replicas[BCH_DATA_BTREE].nr_online) &&
!(flags & BCH_FORCE_IF_METADATA_LOST))
return false;
- if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+ if (s.replicas[BCH_DATA_USER].nr_offline &&
+ !(flags & BCH_FORCE_IF_DATA_DEGRADED))
return false;
- if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
+ if (!s.replicas[BCH_DATA_USER].nr_online &&
!(flags & BCH_FORCE_IF_DATA_LOST))
return false;
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
- lockdep_assert_held(&c->state_lock);
-
- if (new_state == BCH_MEMBER_STATE_RW)
- return true;
+ struct replicas_status s;
+ struct bch_dev *ca2;
+ int i, nr_rw = 0, required;
- if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
- return true;
+ lockdep_assert_held(&c->state_lock);
- /*
- * If the device is already offline - whatever is going on with it can't
- * possible make the FS need to go RO:
- */
- if (!bch2_dev_is_online(ca))
+ switch (new_state) {
+ case BCH_MEMBER_STATE_RW:
return true;
+ case BCH_MEMBER_STATE_RO:
+ if (ca->mi.state != BCH_MEMBER_STATE_RW)
+ return true;
+
+ /* do we have enough devices to write to? */
+ for_each_member_device(ca2, c, i)
+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+
+ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
+ ? c->opts.metadata_replicas
+ : c->opts.metadata_replicas_required,
+ !(flags & BCH_FORCE_IF_DATA_DEGRADED)
+ ? c->opts.data_replicas
+ : c->opts.data_replicas_required);
+
+ return nr_rw - 1 <= required;
+ case BCH_MEMBER_STATE_FAILED:
+ case BCH_MEMBER_STATE_SPARE:
+ if (ca->mi.state != BCH_MEMBER_STATE_RW &&
+ ca->mi.state != BCH_MEMBER_STATE_RO)
+ return true;
+
+ /* do we have enough devices to read from? */
+ s = __bch2_replicas_status(c, ca);
+
+ pr_info("replicas: j %u %u b %u %u d %u %u",
+ s.replicas[BCH_DATA_JOURNAL].nr_online,
+ s.replicas[BCH_DATA_JOURNAL].nr_offline,
+
+ s.replicas[BCH_DATA_BTREE].nr_online,
+ s.replicas[BCH_DATA_BTREE].nr_offline,
+
+ s.replicas[BCH_DATA_USER].nr_online,
+ s.replicas[BCH_DATA_USER].nr_offline);
+
+ return have_enough_devs(c, s, flags);
+ default:
+ BUG();
+ }
+}
- if (ca->mi.has_data &&
- !(flags & BCH_FORCE_IF_DATA_DEGRADED))
- return false;
-
- if (ca->mi.has_data &&
- c->sb.data_replicas_have <= 1 &&
- !(flags & BCH_FORCE_IF_DATA_LOST))
- return false;
+static bool bch2_fs_may_start(struct bch_fs *c, int flags)
+{
+ struct replicas_status s;
+ struct bch_sb_field_members *mi;
+ unsigned i;
- if (ca->mi.has_metadata &&
- !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
- return false;
+ if (!c->opts.degraded) {
+ mutex_lock(&c->sb_lock);
+ mi = bch2_sb_get_members(c->disk_sb);
+
+ for (i = 0; i < c->disk_sb->nr_devices; i++)
+ if (bch2_dev_exists(c->disk_sb, mi, i) &&
+ !bch2_dev_is_online(c->devs[i]) &&
+ (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
+ c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
+ mutex_unlock(&c->sb_lock);
+ return false;
+ }
+ mutex_unlock(&c->sb_lock);
+ }
- if (ca->mi.has_metadata &&
- c->sb.meta_replicas_have <= 1 &&
- !(flags & BCH_FORCE_IF_METADATA_LOST))
- return false;
+ s = bch2_replicas_status(c);
- return true;
+ return have_enough_devs(c, s, flags);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
* complete.
*/
bch2_dev_allocator_stop(ca);
-
- bch2_dev_group_remove(&c->journal.devs, ca);
+ bch2_dev_allocator_remove(c, ca);
}
static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
+ bch2_dev_allocator_add(c, ca);
+ bch2_recalc_capacity(c);
+
if (bch2_dev_allocator_start(ca))
return "error starting allocator thread";
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_sb_field_members *mi;
- unsigned dev_idx = ca->dev_idx;
+ unsigned dev_idx = ca->dev_idx, data;
int ret = -EINVAL;
mutex_lock(&c->state_lock);
goto err;
}
- if (ca->mi.has_data || ca->mi.has_metadata) {
- bch_err(ca, "Remove failed, still has data");
+ data = bch2_dev_has_data(c, ca);
+ if (data) {
+ bch_err(ca, "Remove failed, still has data (%x)", data);
goto err;
}
- /*
- * Ok, really doing the remove:
- * Drop device's prio pointer before removing it from superblock:
- */
- spin_lock(&c->journal.lock);
- c->journal.prio_buckets[dev_idx] = 0;
- spin_unlock(&c->journal.lock);
-
bch2_journal_meta(&c->journal);
__bch2_dev_offline(ca);
return ret;
}
+/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
struct bcache_superblock sb;
if (err)
return -EINVAL;
- err = bch2_validate_cache_super(&sb);
+ err = bch2_sb_validate(&sb);
if (err)
return -EINVAL;
mi = bch2_sb_get_members(c->disk_sb);
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
- if (dev_idx >= c->sb.nr_devices ||
- bch2_is_zero(mi->members[dev_idx].uuid.b,
- sizeof(uuid_le)))
+ if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
goto have_slot;
no_slot:
err = "no slots available in superblock";
return ret ?: -EINVAL;
}
+/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
struct bcache_superblock sb = { 0 };
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
- int ret;
mutex_lock(&c->state_lock);
mutex_unlock(&c->sb_lock);
ca = c->devs[dev_idx];
- ret = bch2_prio_read(ca);
- if (ret) {
- err = "error reading priorities";
- goto err;
- }
-
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
+ unsigned data;
int ret;
mutex_lock(&c->state_lock);
return ret;
}
- if (ca->mi.has_data || ca->mi.has_metadata) {
- bch_err(ca, "Migrate error: data still present");
+ data = bch2_dev_has_data(c, ca);
+ if (data) {
+ bch_err(ca, "Migrate error: data still present (%x)", data);
return -EINVAL;
}
if (err)
goto err;
- err = "attempting to register backing device";
- if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
- goto err;
-
- err = bch2_validate_cache_super(&sb[i]);
+ err = bch2_sb_validate(&sb[i]);
if (err)
goto err;
}
struct bch_fs *c;
bool allocated_fs = false;
- err = bch2_validate_cache_super(sb);
+ err = bch2_sb_validate(sb);
if (err)
return err;
if (err)
return err;
- if (!__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
- err = __bch2_fs_open_incremental(&sb, opts);
- else
- err = "not a bcachefs superblock";
-
+ err = __bch2_fs_open_incremental(&sb, opts);
bch2_free_super(&sb);
return err;
sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */
- sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have);
- sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have);
+ sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
+ sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
/* Debugging: */
return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n"
- "free[RESERVE_PRIO]: %zu/%zu\n"
"free[RESERVE_BTREE]: %zu/%zu\n"
"free[RESERVE_MOVINGGC]: %zu/%zu\n"
"free[RESERVE_NONE]: %zu/%zu\n"
"open buckets: %u/%u (reserved %u)\n"
"open_buckets_wait: %s\n",
fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_PRIO]), ca->free[RESERVE_PRIO].size,
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
sysfs_print(alloc_buckets, stats.buckets_alloc);
sysfs_print(available_buckets, dev_buckets_available(ca));
sysfs_print(free_buckets, dev_buckets_free(ca));
- sysfs_print(has_data, ca->mi.has_data);
- sysfs_print(has_metadata, ca->mi.has_metadata);
+ sysfs_print(has_data, bch2_dev_has_data(c, ca) &
+ (1 << BCH_DATA_USER));
+ sysfs_print(has_metadata, bch2_dev_has_data(c, ca) &
+ ((1 << BCH_DATA_JOURNAL)|
+ (1 << BCH_DATA_BTREE)));
sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
}
}
}
+
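+/*
+ * Heapsort, same as lib/sort.c's sort() except the comparison callback also
+ * gets the element size - so memcmp() can be passed directly as @cmp_func:
+ */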
+void sort_cmp_size(void *base, size_t num, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t size))
+{
+ /* pre-scale counters for performance */
+ int i = (num/2 - 1) * size, n = num * size, c, r;
+
+ if (!swap_func) {
+ if (size == 4 && alignment_ok(base, 4))
+ swap_func = u32_swap;
+ else if (size == 8 && alignment_ok(base, 8))
+ swap_func = u64_swap;
+ else
+ swap_func = generic_swap;
+ }
+
+ /* heapify */
+ for ( ; i >= 0; i -= size) {
+ for (r = i; r * 2 + size < n; r = c) {
+ c = r * 2 + size;
+ if (c < n - size &&
+ cmp_func(base + c, base + c + size, size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c, size) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+
+ /* sort */
+ for (i = n - size; i > 0; i -= size) {
+ swap_func(base, base + i, size);
+ for (r = 0; r * 2 + size < i; r = c) {
+ c = r * 2 + size;
+ if (c < i - size &&
+ cmp_func(base + c, base + c + size, size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c, size) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+}
size_t bch_scnmemcpy(char *, size_t, const char *, size_t);
+void sort_cmp_size(void *base, size_t num, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t));
+
#endif /* _BCACHE_UTIL_H */