Update bcachefs sources to 9ceb982d77 bcachefs: Store bucket gens in a btree
author    Kent Overstreet <kent.overstreet@gmail.com>
          Mon, 8 May 2017 10:28:15 +0000 (02:28 -0800)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Mon, 8 May 2017 14:57:17 +0000 (06:57 -0800)
35 files changed:
.bcachefs_revision
cmd_debug.c
cmd_fsck.c
cmd_migrate.c
include/linux/bitops.h
libbcachefs.c
libbcachefs/alloc.c
libbcachefs/alloc.h
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bcachefs_ioctl.h
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_update.c
libbcachefs/btree_update.h
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/extents.c
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_types.h
libbcachefs/migrate.c
libbcachefs/opts.h
libbcachefs/str_hash.h
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/util.c
libbcachefs/util.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 81d9f67c08723f0151963622c884eead1a327ad1..c5ef77333bce8841bcb4494f0e855a24a52034c5 100644
@@ -1 +1 @@
-4231dd5cf0f04dd61b0b8bae44a357da8331c0e2
+9ceb982d7790f552e2f5c96bebeab176516cf144
diff --git a/cmd_debug.c b/cmd_debug.c
index 974e862e2eab107293cdb08c72d52946c424716b..d4613ecb8ccb810ad77fdfa89d94f7c14e111908 100644
@@ -55,12 +55,6 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
                                  bucket_bytes(ca));
                }
 
-       /* Prios/gens: */
-       for (i = 0; i < prio_buckets(ca); i++)
-               range_add(&data,
-                         bucket_bytes(ca) * ca->prio_last_buckets[i],
-                         bucket_bytes(ca));
-
        /* Btree: */
        for (i = 0; i < BTREE_ID_NR; i++) {
                const struct bch_extent_ptr *ptr;
@@ -97,6 +91,7 @@ int cmd_dump(int argc, char *argv[])
        opts.nochanges  = true;
        opts.noreplay   = true;
        opts.errors     = BCH_ON_ERROR_CONTINUE;
+       opts.degraded   = true;
 
        while ((opt = getopt(argc, argv, "o:fh")) != -1)
                switch (opt) {
@@ -273,6 +268,7 @@ int cmd_list(int argc, char *argv[])
        opts.nochanges  = true;
        opts.norecovery = true;
        opts.errors     = BCH_ON_ERROR_CONTINUE;
+       opts.degraded   = true;
 
        while ((opt = getopt(argc, argv, "b:s:e:i:m:fvh")) != -1)
                switch (opt) {
diff --git a/cmd_fsck.c b/cmd_fsck.c
index 17750675ffe5dbb8a2b6270ee304f1f4ec67a7e0..5ca9b8254536d39f2cb7e405e42037f9a33b8f20 100644
@@ -27,6 +27,8 @@ int cmd_fsck(int argc, char *argv[])
        const char *err;
        int opt;
 
+       opts.degraded = true;
+
        while ((opt = getopt(argc, argv, "pynfvh")) != -1)
                switch (opt) {
                case 'p':
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 72cc004d0d42aba63dae1a8a9df74f28c1bed01f..bf8f0bea7a2520cda9252512fd15e48a4c5fbf62 100644
@@ -333,7 +333,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                        die("error reserving space in new filesystem: %s",
                            strerror(-ret));
 
-               bch2_check_mark_super(c, &e->k_i, false);
+               bch2_check_mark_super(c, extent_i_to_s_c(e), false);
 
                ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
                                        &res, NULL, NULL, 0);
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 47fffb79bb4cf0f98b7c390a46b636a311207e5b..239574c15b01c79b319f20591bedb61111ee3c99 100644
@@ -112,6 +112,11 @@ static inline unsigned long hweight_long(unsigned long w)
        return __builtin_popcountl(w);
 }
 
+static inline unsigned long hweight8(unsigned long w)
+{
+       return __builtin_popcountl(w);
+}
+
 /**
  * rol64 - rotate a 64-bit value left
  * @word: value to rotate
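hweight8() simply reuses the full-word popcount; a one-byte input has at most eight bits set, so the wider builtin gives the same answer. It is presumably here for the replicas tracking added later in this commit, which records member devices in per-entry byte bitmaps (struct bch_replicas_cpu_entry below). A minimal sketch of that use, with a hypothetical helper name:

	/* Sketch: count the devices referenced by a replicas entry,
	 * assuming the devs[] byte-bitmap layout introduced below. */
	static unsigned replicas_entry_nr_devs(const u8 *devs, unsigned bytes)
	{
		unsigned i, nr = 0;

		for (i = 0; i < bytes; i++)
			nr += hweight8(devs[i]);
		return nr;
	}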
diff --git a/libbcachefs.c b/libbcachefs.c
index 73ea2d131b227d6faac057e4950e0db5a4c375b3..f68a45f23d8314b71f4ad10792ed2b43167afa2a 100644
@@ -176,10 +176,8 @@ struct bch_sb *bch2_format(struct format_opts opts,
        SET_BCH_SB_BTREE_NODE_SIZE(sb,          opts.btree_node_size);
        SET_BCH_SB_GC_RESERVE(sb,               8);
        SET_BCH_SB_META_REPLICAS_WANT(sb,       opts.meta_replicas);
-       SET_BCH_SB_META_REPLICAS_HAVE(sb,       opts.meta_replicas);
        SET_BCH_SB_META_REPLICAS_REQ(sb,        opts.meta_replicas_required);
        SET_BCH_SB_DATA_REPLICAS_WANT(sb,       opts.data_replicas);
-       SET_BCH_SB_DATA_REPLICAS_HAVE(sb,       opts.data_replicas);
        SET_BCH_SB_DATA_REPLICAS_REQ(sb,        opts.data_replicas_required);
        SET_BCH_SB_ERROR_ACTION(sb,             opts.on_error_action);
        SET_BCH_SB_STR_HASH_TYPE(sb,            BCH_STR_HASH_SIPHASH);
@@ -339,9 +337,9 @@ void bch2_super_print(struct bch_sb *sb, int units)
 
               BCH_SB_CLEAN(sb),
 
-              BCH_SB_META_REPLICAS_HAVE(sb),
+              0LLU, //BCH_SB_META_REPLICAS_HAVE(sb),
               BCH_SB_META_REPLICAS_WANT(sb),
-              BCH_SB_DATA_REPLICAS_HAVE(sb),
+              0LLU, //BCH_SB_DATA_REPLICAS_HAVE(sb),
               BCH_SB_DATA_REPLICAS_WANT(sb),
 
               BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_NR
@@ -405,8 +403,8 @@ void bch2_super_print(struct bch_sb *sb, int units)
                       : "unknown",
 
                       BCH_MEMBER_TIER(m),
-                      BCH_MEMBER_HAS_METADATA(m),
-                      BCH_MEMBER_HAS_DATA(m),
+                      0LLU, //BCH_MEMBER_HAS_METADATA(m),
+                      0LLU, //BCH_MEMBER_HAS_DATA(m),
 
                       BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR
                       ? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 9d54dd8023ea92ae82df115d4f10ee08a0a95a90..5a258cb65db48f50f158639a54e1d412665b359d 100644
@@ -75,7 +75,6 @@
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
-static void __bch2_bucket_free(struct bch_dev *, struct bucket *);
 static void bch2_recalc_min_prio(struct bch_dev *, int);
 
 /* Allocation groups: */
@@ -206,268 +205,244 @@ static void pd_controllers_update(struct work_struct *work)
                              c->pd_controllers_update_seconds * HZ);
 }
 
-/*
- * Bucket priorities/gens:
- *
- * For each bucket, we store on disk its
-   * 8 bit gen
-   * 16 bit priority
- *
- * See alloc.c for an explanation of the gen. The priority is used to implement
- * lru (and in the future other) cache replacement policies; for most purposes
- * it's just an opaque integer.
- *
- * The gens and the priorities don't have a whole lot to do with each other, and
- * it's actually the gens that must be written out at specific times - it's no
- * big deal if the priorities don't get written, if we lose them we just reuse
- * buckets in suboptimal order.
- *
- * On disk they're stored in a packed array, and in as many buckets are required
- * to fit them all. The buckets we use to store them form a list; the journal
- * header points to the first bucket, the first bucket points to the second
- * bucket, et cetera.
- *
- * This code is used by the allocation code; periodically (whenever it runs out
- * of buckets to allocate from) the allocation code will invalidate some
- * buckets, but it can't use those buckets until their new gens are safely on
- * disk.
- */
+static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
+{
+       unsigned bytes = offsetof(struct bch_alloc, data);
+
+       if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+               bytes += 2;
+       if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+               bytes += 2;
+
+       return DIV_ROUND_UP(bytes, sizeof(u64));
+}
 
-static int prio_io(struct bch_dev *ca, uint64_t bucket, int op)
+static const char *bch2_alloc_invalid(const struct bch_fs *c,
+                                     struct bkey_s_c k)
 {
-       bio_init(ca->bio_prio, ca->bio_prio->bi_inline_vecs, bucket_pages(ca));
-       ca->bio_prio->bi_opf            = op|REQ_SYNC|REQ_META;
-       ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size;
-       ca->bio_prio->bi_bdev           = ca->disk_sb.bdev;
-       ca->bio_prio->bi_iter.bi_size   = bucket_bytes(ca);
-       bch2_bio_map(ca->bio_prio, ca->disk_buckets);
-
-       return submit_bio_wait(ca->bio_prio);
+       if (k.k->p.inode >= c->sb.nr_devices ||
+           !c->devs[k.k->p.inode])
+               return "invalid device";
+
+       switch (k.k->type) {
+       case BCH_ALLOC: {
+               struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
+               if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
+                       return "incorrect value size";
+               break;
+       }
+       default:
+               return "invalid type";
+       }
+
+       return NULL;
 }
 
-static struct nonce prio_nonce(struct prio_set *p)
+static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
+                              size_t size, struct bkey_s_c k)
 {
-       return (struct nonce) {{
-               [0] = 0,
-               [1] = p->nonce[0],
-               [2] = p->nonce[1],
-               [3] = p->nonce[2]^BCH_NONCE_PRIO,
-       }};
+       buf[0] = '\0';
+
+       switch (k.k->type) {
+       case BCH_ALLOC:
+               break;
+       }
 }
 
-int bch2_prio_write(struct bch_dev *ca)
+const struct bkey_ops bch2_bkey_alloc_ops = {
+       .key_invalid    = bch2_alloc_invalid,
+       .val_to_text    = bch2_alloc_to_text,
+};
+
+static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
 {
-       struct bch_fs *c = ca->fs;
-       struct journal *j = &c->journal;
-       struct journal_res res = { 0 };
-       bool need_new_journal_entry;
-       int i, ret = 0;
+       unsigned v;
 
-       if (c->opts.nochanges)
-               return 0;
+       switch (bytes) {
+       case 1:
+               v = **p;
+               break;
+       case 2:
+               v = le16_to_cpup((void *) *p);
+               break;
+       case 4:
+               v = le32_to_cpup((void *) *p);
+               break;
+       default:
+               BUG();
+       }
 
-       mutex_lock(&ca->prio_write_lock);
-       trace_prio_write_start(ca);
+       *p += bytes;
+       return v;
+}
 
-       ca->need_prio_write = false;
+static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v)
+{
+       switch (bytes) {
+       case 1:
+               **p = v;
+               break;
+       case 2:
+               *((__le16 *) *p) = cpu_to_le16(v);
+               break;
+       case 4:
+               *((__le32 *) *p) = cpu_to_le32(v);
+               break;
+       default:
+               BUG();
+       }
 
-       atomic64_add(ca->mi.bucket_size * prio_buckets(ca),
-                    &ca->meta_sectors_written);
+       *p += bytes;
+}
 
-       for (i = prio_buckets(ca) - 1; i >= 0; --i) {
-               struct bucket *g;
-               struct prio_set *p = ca->disk_buckets;
-               struct bucket_disk *d = p->data;
-               struct bucket_disk *end = d + prios_per_bucket(ca);
-               size_t r;
+static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bch_dev *ca;
+       struct bkey_s_c_alloc a;
+       struct bucket_mark new;
+       struct bucket *g;
+       const u8 *d;
 
-               for (r = i * prios_per_bucket(ca);
-                    r < ca->mi.nbuckets && d < end;
-                    r++, d++) {
-                       g = ca->buckets + r;
-                       d->prio[READ] = cpu_to_le16(g->prio[READ]);
-                       d->prio[WRITE] = cpu_to_le16(g->prio[WRITE]);
-                       d->gen = ca->buckets[r].mark.gen;
-               }
+       if (k.k->type != BCH_ALLOC)
+               return;
 
-               p->next_bucket  = cpu_to_le64(ca->prio_buckets[i + 1]);
-               p->magic        = cpu_to_le64(pset_magic(c));
-               get_random_bytes(&p->nonce, sizeof(p->nonce));
+       a = bkey_s_c_to_alloc(k);
+       ca = c->devs[a.k->p.inode];
 
-               spin_lock(&ca->prio_buckets_lock);
-               r = bch2_bucket_alloc(ca, RESERVE_PRIO);
-               BUG_ON(!r);
+       if (a.k->p.offset >= ca->mi.nbuckets)
+               return;
 
-               /*
-                * goes here before dropping prio_buckets_lock to guard against
-                * it getting gc'd from under us
-                */
-               ca->prio_buckets[i] = r;
-               bch2_mark_metadata_bucket(ca, ca->buckets + r,
-                                        BUCKET_PRIOS, false);
-               spin_unlock(&ca->prio_buckets_lock);
-
-               SET_PSET_CSUM_TYPE(p, bch2_meta_checksum_type(c));
-
-               bch2_encrypt(c, PSET_CSUM_TYPE(p),
-                           prio_nonce(p),
-                           p->encrypted_start,
-                           bucket_bytes(ca) -
-                           offsetof(struct prio_set, encrypted_start));
-
-               p->csum  = bch2_checksum(c, PSET_CSUM_TYPE(p),
-                                       prio_nonce(p),
-                                       (void *) p + sizeof(p->csum),
-                                       bucket_bytes(ca) - sizeof(p->csum));
-
-               ret = prio_io(ca, r, REQ_OP_WRITE);
-               if (bch2_dev_fatal_io_err_on(ret, ca,
-                                         "prio write to bucket %zu", r) ||
-                   bch2_meta_write_fault("prio"))
-                       goto err;
-       }
+       g = ca->buckets + a.k->p.offset;
+       bucket_cmpxchg(g, new, ({
+               new.gen = a.v->gen;
+               new.gen_valid = 1;
+       }));
+
+       d = a.v->data;
+       if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+               g->prio[READ] = get_alloc_field(&d, 2);
+       if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+               g->prio[WRITE] = get_alloc_field(&d, 2);
+}
 
-       spin_lock(&j->lock);
-       j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]);
-       j->nr_prio_buckets = max_t(unsigned,
-                                  ca->dev_idx + 1,
-                                  j->nr_prio_buckets);
-       spin_unlock(&j->lock);
+int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
+{
+       struct journal_replay *r;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
 
-       do {
-               unsigned u64s = jset_u64s(0);
+       if (!c->btree_roots[BTREE_ID_ALLOC].b)
+               return 0;
 
-               if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
-                       break;
+       for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
+               bch2_alloc_read_key(c, k);
+               bch2_btree_iter_cond_resched(&iter);
+       }
 
-               ret = bch2_journal_res_get(j, &res, u64s, u64s);
-               if (ret)
-                       goto err;
+       ret = bch2_btree_iter_unlock(&iter);
+       if (ret)
+               return ret;
 
-               need_new_journal_entry = j->buf[res.idx].nr_prio_buckets <
-                       ca->dev_idx + 1;
-               bch2_journal_res_put(j, &res);
+       list_for_each_entry(r, journal_replay_list, list) {
+               struct bkey_i *k, *n;
+               struct jset_entry *entry;
 
-               ret = bch2_journal_flush_seq(j, res.seq);
-               if (ret)
-                       goto err;
-       } while (need_new_journal_entry);
+               for_each_jset_key(k, n, entry, &r->j)
+                       if (entry->btree_id == BTREE_ID_ALLOC)
+                               bch2_alloc_read_key(c, bkey_i_to_s_c(k));
+       }
 
-       /*
-        * Don't want the old priorities to get garbage collected until after we
-        * finish writing the new ones, and they're journalled
-        */
+       return 0;
+}
 
-       spin_lock(&ca->prio_buckets_lock);
+static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
+                                 struct bucket *g, struct btree_iter *iter,
+                                 u64 *journal_seq)
+{
+       struct bucket_mark m = READ_ONCE(g->mark);
+       __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
+       struct bkey_i_alloc *a;
+       u8 *d;
+       int ret;
 
-       for (i = 0; i < prio_buckets(ca); i++) {
-               if (ca->prio_last_buckets[i])
-                       __bch2_bucket_free(ca,
-                               &ca->buckets[ca->prio_last_buckets[i]]);
+       bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, g - ca->buckets));
 
-               ca->prio_last_buckets[i] = ca->prio_buckets[i];
-       }
+       do {
+               ret = bch2_btree_iter_traverse(iter);
+               if (ret)
+                       break;
 
-       spin_unlock(&ca->prio_buckets_lock);
+               a = bkey_alloc_init(&alloc_key.k);
+               a->k.p          = iter->pos;
+               a->v.fields     = 0;
+               a->v.gen        = m.gen;
+               set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
+
+               d = a->v.data;
+               if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+                       put_alloc_field(&d, 2, g->prio[READ]);
+               if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+                       put_alloc_field(&d, 2, g->prio[WRITE]);
+
+               bch2_btree_iter_set_pos(iter, a->k.p);
+               ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
+                                          BTREE_INSERT_ATOMIC|
+                                          BTREE_INSERT_NOFAIL|
+                                          BTREE_INSERT_USE_RESERVE|
+                                          BTREE_INSERT_USE_ALLOC_RESERVE|
+                                          BTREE_INSERT_NOWAIT,
+                                          BTREE_INSERT_ENTRY(iter, &a->k_i));
+               bch2_btree_iter_cond_resched(iter);
+       } while (ret == -EINTR);
 
-       trace_prio_write_end(ca);
-err:
-       mutex_unlock(&ca->prio_write_lock);
        return ret;
 }
 
-int bch2_prio_read(struct bch_dev *ca)
+int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
 {
-       struct bch_fs *c = ca->fs;
-       struct prio_set *p = ca->disk_buckets;
-       struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
-       struct bucket_mark new;
-       struct bch_csum csum;
-       unsigned bucket_nr = 0;
-       u64 bucket, expect, got;
-       size_t b;
-       int ret = 0;
+       struct bch_dev *ca;
+       struct bucket *g;
+       struct btree_iter iter;
+       int ret;
 
-       if (ca->prio_read_done)
-               return 0;
+       lockdep_assert_held(&c->state_lock);
 
-       ca->prio_read_done = true;
+       if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
+               return 0;
 
-       spin_lock(&c->journal.lock);
-       bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]);
-       spin_unlock(&c->journal.lock);
+       ca = c->devs[pos.inode];
 
-       /*
-        * If the device hasn't been used yet, there won't be a prio bucket ptr
-        */
-       if (!bucket)
+       if (pos.offset >= ca->mi.nbuckets)
                return 0;
 
-       if (mustfix_fsck_err_on(bucket < ca->mi.first_bucket ||
-                               bucket >= ca->mi.nbuckets, c,
-                               "bad prio bucket %llu", bucket))
-               return 0;
+       g = ca->buckets + pos.offset;
 
-       for (b = 0; b < ca->mi.nbuckets; b++, d++) {
-               if (d == end) {
-                       ca->prio_last_buckets[bucket_nr] = bucket;
-                       bucket_nr++;
-
-                       ret = prio_io(ca, bucket, REQ_OP_READ) ||
-                               bch2_meta_read_fault("prio");
-
-                       if (mustfix_fsck_err_on(ret, c,
-                                       "IO error reading bucket gens (%i)",
-                                       ret))
-                               return 0;
-
-                       got = le64_to_cpu(p->magic);
-                       expect = pset_magic(c);
-                       if (mustfix_fsck_err_on(got != expect, c,
-                                       "bad magic (got %llu expect %llu) while reading prios from bucket %llu",
-                                       got, expect, bucket))
-                               return 0;
-
-                       if (mustfix_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c,
-                                       "prio bucket with unknown csum type %llu bucket %lluu",
-                                       PSET_CSUM_TYPE(p), bucket))
-                               return 0;
-
-                       csum = bch2_checksum(c, PSET_CSUM_TYPE(p),
-                                           prio_nonce(p),
-                                           (void *) p + sizeof(p->csum),
-                                           bucket_bytes(ca) - sizeof(p->csum));
-                       if (fsck_err_on(bch2_crc_cmp(csum, p->csum), c,
-                                       "bad checksum reading prios from bucket %llu",
-                                       bucket))
-                               return 0;
-
-                       bch2_encrypt(c, PSET_CSUM_TYPE(p),
-                                   prio_nonce(p),
-                                   p->encrypted_start,
-                                   bucket_bytes(ca) -
-                                   offsetof(struct prio_set, encrypted_start));
-
-                       bucket = le64_to_cpu(p->next_bucket);
-                       d = p->data;
-               }
+       bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
+                            BTREE_ITER_INTENT);
 
-               ca->buckets[b].prio[READ] = le16_to_cpu(d->prio[READ]);
-               ca->buckets[b].prio[WRITE] = le16_to_cpu(d->prio[WRITE]);
+       ret = __bch2_alloc_write_key(c, ca, g, &iter, NULL);
+       bch2_btree_iter_unlock(&iter);
+       return ret;
+}
 
-               bucket_cmpxchg(&ca->buckets[b], new, ({
-                       new.gen = d->gen;
-                       new.gen_valid = 1;
-               }));
-       }
+int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
+{
+       struct btree_iter iter;
+       struct bucket *g;
+       int ret = 0;
 
-       mutex_lock(&c->bucket_lock);
-       bch2_recalc_min_prio(ca, READ);
-       bch2_recalc_min_prio(ca, WRITE);
-       mutex_unlock(&c->bucket_lock);
+       bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
+                            BTREE_ITER_INTENT);
+
+       for_each_bucket(g, ca) {
+               ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq);
+               if (ret)
+                       break;
+       }
 
-       ret = 0;
-fsck_err:
+       bch2_btree_iter_unlock(&iter);
        return ret;
 }
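The alloc value written above is variable length: the fields bitmap declares which timestamps are present, and each present field is appended in order as a little-endian integer by put_alloc_field(). __bch2_alloc_write_key() currently sets fields to 0, so only the gen byte is persisted; a hypothetical variant that also stores both prios (assuming the caller padded the bkey accordingly):

	/* Sketch only: pack gen plus both bucket prios into an alloc key.
	 * With both fields set, the value is 2 + 2 + 2 = 6 bytes, which
	 * bch_alloc_val_u64s() rounds up to a single u64. */
	static void alloc_pack_prios(struct bkey_i_alloc *a, u8 gen,
				     u16 read_prio, u16 write_prio)
	{
		u8 *d = a->v.data;

		a->v.gen	= gen;
		a->v.fields	= (1 << BCH_ALLOC_FIELD_READ_TIME)|
				  (1 << BCH_ALLOC_FIELD_WRITE_TIME);
		set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));

		put_alloc_field(&d, 2, read_prio);	/* field order is fixed */
		put_alloc_field(&d, 2, write_prio);
	}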
 
@@ -516,9 +491,6 @@ static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket)
                long i;
                unsigned j;
 
-               for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
-                       BUG_ON(ca->prio_buckets[iter] == bucket);
-
                for (j = 0; j < RESERVE_NR; j++)
                        fifo_for_each_entry(i, &ca->free[j], iter)
                                BUG_ON(i == bucket);
@@ -651,17 +623,37 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
 
 static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
 {
-       spin_lock(&ca->freelist_lock);
-
-       bch2_invalidate_bucket(ca, g);
+       struct bch_fs *c = ca->fs;
+       struct bucket_mark m;
 
-       g->prio[READ] = ca->fs->prio_clock[READ].hand;
-       g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
+       spin_lock(&ca->freelist_lock);
+       if (!bch2_invalidate_bucket(ca, g, &m)) {
+               spin_unlock(&ca->freelist_lock);
+               return;
+       }
 
        verify_not_on_freelist(ca, g - ca->buckets);
        BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
-
        spin_unlock(&ca->freelist_lock);
+
+       g->prio[READ] = c->prio_clock[READ].hand;
+       g->prio[WRITE] = c->prio_clock[WRITE].hand;
+
+       if (m.cached_sectors) {
+               ca->allocator_invalidating_data = true;
+       } else if (m.journal_seq_valid) {
+               u64 journal_seq = atomic64_read(&c->journal.seq);
+               u64 bucket_seq  = journal_seq;
+
+               bucket_seq &= ~((u64) U16_MAX);
+               bucket_seq |= m.journal_seq;
+
+               if (bucket_seq > journal_seq)
+                       bucket_seq -= 1 << 16;
+
+               ca->allocator_journal_seq_flush =
+                       max(ca->allocator_journal_seq_flush, bucket_seq);
+       }
 }
 
 /*
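bch2_invalidate_one_bucket() above reconstructs a full 64-bit journal sequence number from the 16 bits kept in the bucket mark; the splice is only valid if the bucket was marked within the last 2^16 journal entries. Isolated as a sketch:

	/* Sketch of the splice above: put the saved low 16 bits under the
	 * current sequence's high bits, stepping back one wrap if that
	 * would land in the future. */
	static u64 bucket_journal_seq(u64 cur_seq, u16 mark_seq)
	{
		u64 seq = (cur_seq & ~((u64) U16_MAX)) | mark_seq;

		return seq <= cur_seq ? seq : seq - (1 << 16);
	}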
@@ -686,11 +678,23 @@ static unsigned long bucket_sort_key(struct bch_dev *ca,
                                     struct bucket *g,
                                     struct bucket_mark m)
 {
+       /*
+        * Time since last read, scaled to [0, 8) where larger value indicates
+        * more recently read data:
+        */
        unsigned long hotness =
                (g->prio[READ]                  - ca->min_prio[READ]) * 7 /
                (ca->fs->prio_clock[READ].hand  - ca->min_prio[READ]);
 
-       return (((hotness + 1) * bucket_sectors_used(m)) << 8) |
+       /* How much we want to keep the data in this bucket: */
+       unsigned long data_wantness =
+               (hotness + 1) * bucket_sectors_used(m);
+
+       unsigned long needs_journal_commit =
+                   bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk);
+
+       return  (data_wantness << 9) |
+               (needs_journal_commit << 8) |
                bucket_gc_gen(ca, g);
 }
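The returned key packs three criteria so that one integer compare gives the invalidation order: data_wantness occupies the high bits and dominates, the journal-commit bit breaks ties, and the bucket's gc gen (at most 8 bits) breaks the rest. Restated standalone, presumably with the smallest keys invalidated first:

	/* Sketch: least-wanted data sorts first, then buckets whose reuse
	 * won't force a journal flush, then by gc gen. */
	static unsigned long lru_sort_key(unsigned long hotness,	/* 0..7 */
					  unsigned long sectors_used,
					  bool needs_journal_commit,
					  u8 gc_gen)
	{
		unsigned long data_wantness = (hotness + 1) * sectors_used;

		return (data_wantness << 9) |
		       ((unsigned long) needs_journal_commit << 8) |
		       gc_gen;
	}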
 
@@ -790,8 +794,8 @@ static void invalidate_buckets_random(struct bch_dev *ca)
 
 static void invalidate_buckets(struct bch_dev *ca)
 {
-       ca->inc_gen_needs_gc = 0;
-       ca->inc_gen_really_needs_gc = 0;
+       ca->inc_gen_needs_gc                    = 0;
+       ca->inc_gen_really_needs_gc             = 0;
 
        switch (ca->mi.replacement) {
        case CACHE_REPLACEMENT_LRU:
@@ -806,73 +810,82 @@ static void invalidate_buckets(struct bch_dev *ca)
        }
 }
 
-static bool __bch2_allocator_push(struct bch_dev *ca, long bucket)
+static int size_t_cmp(const void *_l, const void *_r)
 {
-       if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
-               goto success;
-
-       if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket))
-               goto success;
-
-       if (fifo_push(&ca->free[RESERVE_BTREE], bucket))
-               goto success;
-
-       if (fifo_push(&ca->free[RESERVE_NONE], bucket))
-               goto success;
+       const size_t *l = _l, *r = _r;
 
-       return false;
-success:
-       closure_wake_up(&ca->fs->freelist_wait);
-       return true;
+       return (*l > *r) - (*l < *r);
 }
 
-static bool bch2_allocator_push(struct bch_dev *ca, long bucket)
+static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
+                                   u64 *journal_seq)
 {
-       bool ret;
+       struct btree_iter iter;
+       unsigned nr_invalidated = 0;
+       size_t b, i;
+       int ret = 0;
 
-       spin_lock(&ca->freelist_lock);
-       ret = __bch2_allocator_push(ca, bucket);
-       if (ret)
-               fifo_pop(&ca->free_inc, bucket);
-       spin_unlock(&ca->freelist_lock);
+       bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
+                            BTREE_ITER_INTENT);
 
-       return ret;
+       fifo_for_each_entry(b, &ca->free_inc, i) {
+               ret = __bch2_alloc_write_key(c, ca, ca->buckets + b,
+                                            &iter, journal_seq);
+               if (ret)
+                       break;
+
+               nr_invalidated++;
+       }
+
+       bch2_btree_iter_unlock(&iter);
+       return nr_invalidated ?: ret;
 }
 
-static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca)
+/*
+ * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
+ * then add it to the freelist, waiting until there's room if necessary:
+ */
+static void discard_invalidated_bucket(struct bch_dev *ca, long bucket)
 {
-       u16 last_seq_ondisk = c->journal.last_seq_ondisk;
-       struct bucket *g;
+       if (ca->mi.discard &&
+           blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+               blkdev_issue_discard(ca->disk_sb.bdev,
+                                    bucket_to_sector(ca, bucket),
+                                    ca->mi.bucket_size, GFP_NOIO, 0);
 
-       for_each_bucket(g, ca) {
-               struct bucket_mark m = READ_ONCE(g->mark);
 
-               if (is_available_bucket(m) &&
-                   !m.cached_sectors &&
-                   !m.had_metadata &&
-                   !bucket_needs_journal_commit(m, last_seq_ondisk)) {
-                       spin_lock(&ca->freelist_lock);
+       while (1) {
+               bool pushed = false;
+               unsigned i;
 
-                       bch2_mark_alloc_bucket(ca, g, true);
-                       g->prio[READ] = c->prio_clock[READ].hand;
-                       g->prio[WRITE] = c->prio_clock[WRITE].hand;
+               set_current_state(TASK_INTERRUPTIBLE);
 
-                       verify_not_on_freelist(ca, g - ca->buckets);
-                       BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
+               /*
+                * Don't remove from free_inc until after it's added to
+                * freelist, so gc can find it:
+                */
+               spin_lock(&ca->freelist_lock);
+               for (i = 0; i < RESERVE_NR; i++)
+                       if (fifo_push(&ca->free[i], bucket)) {
+                               fifo_pop(&ca->free_inc, bucket);
+                               closure_wake_up(&ca->fs->freelist_wait);
+                               pushed = true;
+                               break;
+                       }
+               spin_unlock(&ca->freelist_lock);
 
-                       spin_unlock(&ca->freelist_lock);
+               if (pushed)
+                       break;
 
-                       if (fifo_full(&ca->free_inc))
-                               break;
+               if (kthread_should_stop()) {
+                       __set_current_state(TASK_RUNNING);
+                       break;
                }
+               schedule();
+               try_to_freeze();
        }
-}
-
-static int size_t_cmp(const void *_l, const void *_r)
-{
-       const size_t *l = _l, *r = _r;
 
-       return (*l > *r) - (*l < *r);
+       __set_current_state(TASK_RUNNING);
 }
 
 /**
@@ -887,57 +900,26 @@ static int bch2_allocator_thread(void *arg)
 {
        struct bch_dev *ca = arg;
        struct bch_fs *c = ca->fs;
-       long bucket;
+       size_t bucket;
        int ret;
 
        set_freezable();
 
-       bch2_find_empty_buckets(c, ca);
-
-       while (1) {
-               /*
-                * First, we pull buckets off of the free_inc list, possibly
-                * issue discards to them, then we add the bucket to a
-                * free list:
-                */
-
-               while (!fifo_empty(&ca->free_inc)) {
-                       bucket = fifo_peek(&ca->free_inc);
-
-                       /*
-                        * Don't remove from free_inc until after it's added
-                        * to freelist, so gc doesn't miss it while we've
-                        * dropped bucket lock
-                        */
-
-                       if (ca->mi.discard &&
-                           blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
-                               blkdev_issue_discard(ca->disk_sb.bdev,
-                                       bucket_to_sector(ca, bucket),
-                                       ca->mi.bucket_size, GFP_NOIO, 0);
-
-                       while (1) {
-                               set_current_state(TASK_INTERRUPTIBLE);
-                               if (bch2_allocator_push(ca, bucket))
-                                       break;
-
-                               if (kthread_should_stop()) {
-                                       __set_current_state(TASK_RUNNING);
-                                       goto out;
-                               }
-                               schedule();
-                               try_to_freeze();
-                       }
-
-                       __set_current_state(TASK_RUNNING);
-               }
-
-               /* We've run out of free buckets! */
+       while (!kthread_should_stop()) {
+               u64 journal_seq = 0;
 
+               /* Reset front/back so we can easily sort fifo entries later: */
                BUG_ON(fifo_used(&ca->free_inc));
-               ca->free_inc.front = ca->free_inc.back = 0;
+               ca->free_inc.front = ca->free_inc.back  = 0;
+               ca->allocator_journal_seq_flush         = 0;
+               ca->allocator_invalidating_data         = false;
 
                down_read(&c->gc_lock);
+               if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
+                       up_read(&c->gc_lock);
+                       goto out;
+               }
+
                while (1) {
                        /*
                         * Find some buckets that we can invalidate, either
@@ -947,7 +929,6 @@ static int bch2_allocator_thread(void *arg)
                         */
 
                        invalidate_buckets(ca);
-
                        trace_alloc_batch(ca, fifo_used(&ca->free_inc),
                                          ca->free_inc.size);
 
@@ -980,28 +961,32 @@ static int bch2_allocator_thread(void *arg)
                spin_unlock(&ca->freelist_lock);
 
                /*
-                * free_inc is full of newly-invalidated buckets, must write out
-                * prios and gens before they can be re-used
+                * free_inc is now full of newly-invalidated buckets: next,
+                * write out the new bucket gens:
                 */
-               ret = bch2_prio_write(ca);
-               if (ret) {
-                       /*
-                        * Emergency read only - allocator thread has to
-                        * shutdown.
-                        *
-                        * N.B. we better be going into RO mode, else
-                        * allocations would hang indefinitely - whatever
-                        * generated the error will have sent us into RO mode.
-                        *
-                        * Clear out the free_inc freelist so things are
-                        * consistent-ish:
-                        */
-                       spin_lock(&ca->freelist_lock);
-                       while (fifo_pop(&ca->free_inc, bucket))
-                               bch2_mark_free_bucket(ca, ca->buckets + bucket);
-                       spin_unlock(&ca->freelist_lock);
-                       goto out;
+
+               while (!fifo_empty(&ca->free_inc) && !kthread_should_stop()) {
+                       ret = bch2_invalidate_free_inc(c, ca, &journal_seq);
+                       if (bch2_fs_fatal_err_on(ret < 0, c,
+                                       "error invalidating buckets: %i", ret))
+                               goto err;
+
+                       if (ca->allocator_invalidating_data)
+                               bch2_journal_flush_seq(&c->journal, journal_seq);
+                       else if (ca->allocator_journal_seq_flush)
+                               bch2_journal_flush_seq(&c->journal,
+                                                      ca->allocator_journal_seq_flush);
+
+                       while (ret && !kthread_should_stop()) {
+                               BUG_ON(fifo_empty(&ca->free_inc));
+
+                               bucket = fifo_peek(&ca->free_inc);
+                               discard_invalidated_bucket(ca, bucket);
+                               --ret;
+                       }
                }
+
+               ca->alloc_thread_started = true;
        }
 out:
        /*
@@ -1010,50 +995,104 @@ out:
         */
        synchronize_rcu();
        return 0;
+err:
+       /*
+        * Emergency read only - allocator thread has to shutdown.
+        *
+        * N.B. we better be going into RO mode, else allocations would hang
+        * indefinitely - whatever generated the error will have sent us into RO
+        * mode.
+        *
+        * Clear out the free_inc freelist so things are consistent-ish:
+        */
+       spin_lock(&ca->freelist_lock);
+       while (fifo_pop(&ca->free_inc, bucket))
+               bch2_mark_free_bucket(ca, ca->buckets + bucket);
+       spin_unlock(&ca->freelist_lock);
+       goto out;
 }
 
 /* Allocation */
 
+static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct bucket *g;
+       long r = -1;
+
+       if (!down_read_trylock(&c->gc_lock))
+               return r;
+
+       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+               goto out;
+
+       for_each_bucket(g, ca)
+               if (!g->mark.touched_this_mount &&
+                   is_available_bucket(g->mark) &&
+                   bch2_mark_alloc_bucket_startup(ca, g)) {
+                       r = g - ca->buckets;
+                       break;
+               }
+out:
+       up_read(&c->gc_lock);
+       return r;
+}
+
 /**
  * bch_bucket_alloc - allocate a single bucket from a specific device
  *
  * Returns index of bucket on success, 0 on failure
  * */
-size_t bch2_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve)
+long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+                      enum alloc_reserve reserve)
 {
-       struct bucket *g;
-       long r;
+       size_t r;
 
        spin_lock(&ca->freelist_lock);
-       if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
-           fifo_pop(&ca->free[reserve], r))
+       if (likely(fifo_pop(&ca->free[RESERVE_NONE], r)))
                goto out;
 
+       switch (reserve) {
+       case RESERVE_ALLOC:
+               if (fifo_pop(&ca->free[RESERVE_BTREE], r))
+                       goto out;
+               break;
+       case RESERVE_BTREE:
+               if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
+                   ca->free[RESERVE_BTREE].size &&
+                   fifo_pop(&ca->free[RESERVE_BTREE], r))
+                       goto out;
+               break;
+       case RESERVE_MOVINGGC:
+               if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r))
+                       goto out;
+               break;
+       default:
+               break;
+       }
+
        spin_unlock(&ca->freelist_lock);
 
+       if (unlikely(!ca->alloc_thread_started) &&
+           (r = bch2_bucket_alloc_startup(c, ca)) >= 0) {
+               verify_not_on_freelist(ca, r);
+               goto out2;
+       }
+
        trace_bucket_alloc_fail(ca, reserve);
-       return 0;
+       return -1;
 out:
        verify_not_on_freelist(ca, r);
        spin_unlock(&ca->freelist_lock);
 
-       trace_bucket_alloc(ca, reserve);
-
        bch2_wake_allocator(ca);
+out2:
+       ca->buckets[r].prio[READ]       = c->prio_clock[READ].hand;
+       ca->buckets[r].prio[WRITE]      = c->prio_clock[WRITE].hand;
 
-       g = ca->buckets + r;
-
-       g->prio[READ] = ca->fs->prio_clock[READ].hand;
-       g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
-
+       trace_bucket_alloc(ca, reserve);
        return r;
 }
 
-static void __bch2_bucket_free(struct bch_dev *ca, struct bucket *g)
-{
-       bch2_mark_free_bucket(ca, g);
-}
-
 enum bucket_alloc_ret {
        ALLOC_SUCCESS,
        NO_DEVICES,             /* -EROFS */
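bch2_bucket_alloc() above always tries the general RESERVE_NONE freelist first, then falls back according to who is asking: the allocator's own allocations (RESERVE_ALLOC) may drain the btree reserve, ordinary btree allocations may take from it only while it remains at least half full, and copygc has its own fifo. The policy restated as a hypothetical helper:

	/* Sketch: which fifo may serve once RESERVE_NONE is empty;
	 * -1 means the allocation fails (modulo the startup path in
	 * bch2_bucket_alloc_startup()). */
	static int reserve_fallback(enum alloc_reserve reserve,
				    size_t btree_free, size_t btree_size)
	{
		switch (reserve) {
		case RESERVE_ALLOC:
			return RESERVE_BTREE;
		case RESERVE_BTREE:
			return btree_free * 2 >= btree_size
				? RESERVE_BTREE : -1;
		case RESERVE_MOVINGGC:
			return RESERVE_MOVINGGC;
		default:
			return -1;
		}
	}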
@@ -1116,7 +1155,7 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
 
        while (ob->nr_ptrs < nr_replicas) {
                struct bch_dev *ca;
-               u64 bucket;
+               long bucket;
 
                if (!available) {
                        ret = NO_DEVICES;
@@ -1139,8 +1178,8 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
                    get_random_int() > devs->d[i].weight)
                        continue;
 
-               bucket = bch2_bucket_alloc(ca, reserve);
-               if (!bucket) {
+               bucket = bch2_bucket_alloc(c, ca, reserve);
+               if (bucket < 0) {
                        if (fail_idx == -1)
                                fail_idx = i;
                        continue;
@@ -1456,7 +1495,6 @@ struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c,
                ? 0 : BTREE_NODE_RESERVE;
        int ret;
 
-       BUG_ON(!reserve);
        BUG_ON(!nr_replicas);
 retry:
        ob = lock_writepoint(c, wp);
@@ -1705,7 +1743,9 @@ set_capacity:
        capacity *= (100 - c->opts.gc_reserve_percent);
        capacity = div64_u64(capacity, 100);
 
-       BUG_ON(capacity + reserved_sectors > total_capacity);
+       BUG_ON(reserved_sectors > total_capacity);
+
+       capacity = min(capacity, total_capacity - reserved_sectors);
 
        c->capacity = capacity;
 
@@ -1725,10 +1765,9 @@ set_capacity:
        closure_wake_up(&c->freelist_wait);
 }
 
-static void bch2_stop_write_point(struct bch_dev *ca,
-                                struct write_point *wp)
+static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
+                                 struct write_point *wp)
 {
-       struct bch_fs *c = ca->fs;
        struct open_bucket *ob;
        struct bch_extent_ptr *ptr;
 
@@ -1750,9 +1789,8 @@ found:
        bch2_open_bucket_put(c, ob);
 }
 
-static bool bch2_dev_has_open_write_point(struct bch_dev *ca)
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
 {
-       struct bch_fs *c = ca->fs;
        struct bch_extent_ptr *ptr;
        struct open_bucket *ob;
 
@@ -1773,55 +1811,36 @@ static bool bch2_dev_has_open_write_point(struct bch_dev *ca)
 }
 
 /* device goes ro: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 {
-       struct bch_fs *c = ca->fs;
        struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
-       struct task_struct *p;
        struct closure cl;
        unsigned i;
 
+       BUG_ON(ca->alloc_thread);
+
        closure_init_stack(&cl);
 
        /* First, remove device from allocation groups: */
 
+       bch2_dev_group_remove(&c->journal.devs, ca);
        bch2_dev_group_remove(tier, ca);
        bch2_dev_group_remove(&c->all_devs, ca);
 
-       bch2_recalc_capacity(c);
-
        /*
-        * Stopping the allocator thread comes after removing from allocation
-        * groups, else pending allocations will hang:
-        */
-
-       p = ca->alloc_thread;
-       ca->alloc_thread = NULL;
-       smp_wmb();
-
-       /*
-        * We need an rcu barrier between setting ca->alloc_thread = NULL and
-        * the thread shutting down to avoid a race with bch2_usage_update() -
-        * the allocator thread itself does a synchronize_rcu() on exit.
-        *
-        * XXX: it would be better to have the rcu barrier be asynchronous
-        * instead of blocking us here
+        * Capacity is calculated based off of devices in allocation groups:
         */
-       if (p) {
-               kthread_stop(p);
-               put_task_struct(p);
-       }
+       bch2_recalc_capacity(c);
 
        /* Next, close write points that point to this device... */
-
        for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
-               bch2_stop_write_point(ca, &c->write_points[i]);
+               bch2_stop_write_point(c, ca, &c->write_points[i]);
 
-       bch2_stop_write_point(ca, &ca->copygc_write_point);
-       bch2_stop_write_point(ca, &c->promote_write_point);
-       bch2_stop_write_point(ca, &ca->tiering_write_point);
-       bch2_stop_write_point(ca, &c->migration_write_point);
-       bch2_stop_write_point(ca, &c->btree_write_point);
+       bch2_stop_write_point(c, ca, &ca->copygc_write_point);
+       bch2_stop_write_point(c, ca, &c->promote_write_point);
+       bch2_stop_write_point(c, ca, &ca->tiering_write_point);
+       bch2_stop_write_point(c, ca, &c->migration_write_point);
+       bch2_stop_write_point(c, ca, &c->btree_write_point);
 
        mutex_lock(&c->btree_reserve_cache_lock);
        while (c->btree_reserve_cache_nr) {
@@ -1832,9 +1851,16 @@ void bch2_dev_allocator_stop(struct bch_dev *ca)
        }
        mutex_unlock(&c->btree_reserve_cache_lock);
 
-       /* Avoid deadlocks.. */
-
+       /*
+        * Wake up threads that were blocked on allocation, so they can notice
+        * the device can no longer be removed and the capacity has changed:
+        */
        closure_wake_up(&c->freelist_wait);
+
+       /*
+        * journal_res_get() can block waiting for free space in the journal -
+        * it needs to notice there may not be devices to allocate from anymore:
+        */
        wake_up(&c->journal.wait);
 
        /* Now wait for any in flight writes: */
@@ -1842,7 +1868,7 @@ void bch2_dev_allocator_stop(struct bch_dev *ca)
        while (1) {
                closure_wait(&c->open_buckets_wait, &cl);
 
-               if (!bch2_dev_has_open_write_point(ca)) {
+               if (!bch2_dev_has_open_write_point(c, ca)) {
                        closure_wake_up(&c->open_buckets_wait);
                        break;
                }
@@ -1851,32 +1877,15 @@ void bch2_dev_allocator_stop(struct bch_dev *ca)
        }
 }
 
-/*
- * Startup the allocator thread for transition to RW mode:
- */
-int bch2_dev_allocator_start(struct bch_dev *ca)
+/* device goes rw: */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
 {
-       struct bch_fs *c = ca->fs;
        struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
        struct bch_sb_field_journal *journal_buckets;
        bool has_journal;
-       struct task_struct *k;
 
-       /*
-        * allocator thread already started?
-        */
-       if (ca->alloc_thread)
-               return 0;
-
-       k = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
-       if (IS_ERR(k))
-               return 0;
-
-       get_task_struct(k);
-       ca->alloc_thread = k;
-
-       bch2_dev_group_add(tier, ca);
        bch2_dev_group_add(&c->all_devs, ca);
+       bch2_dev_group_add(tier, ca);
 
        mutex_lock(&c->sb_lock);
        journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb);
@@ -1886,15 +1895,44 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
 
        if (has_journal)
                bch2_dev_group_add(&c->journal.devs, ca);
+}
 
-       bch2_recalc_capacity(c);
+/* stop allocator thread: */
+void bch2_dev_allocator_stop(struct bch_dev *ca)
+{
+       struct task_struct *p = ca->alloc_thread;
+
+       ca->alloc_thread = NULL;
+       smp_wmb();
+
+       /*
+        * We need an rcu barrier between setting ca->alloc_thread = NULL and
+        * the thread shutting down to avoid a race with bch2_usage_update() -
+        * the allocator thread itself does a synchronize_rcu() on exit.
+        *
+        * XXX: it would be better to have the rcu barrier be asynchronous
+        * instead of blocking us here
+        */
+       if (p)
+               kthread_stop(p);
+}
+
+/* start allocator thread: */
+int bch2_dev_allocator_start(struct bch_dev *ca)
+{
+       struct task_struct *p;
 
        /*
-        * Don't wake up allocator thread until after adding device to
-        * allocator groups - otherwise, alloc thread could get a spurious
-        * -EROFS due to prio_write() -> journal_meta() not finding any devices:
+        * allocator thread already started?
         */
-       wake_up_process(k);
+       if (ca->alloc_thread)
+               return 0;
+
+       p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator");
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       ca->alloc_thread = p;
        return 0;
 }
 
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index 195108c20b4c18b9d9fe712b519e5fefbaed1412..cfd1c8efb74b2e983859c1c6128358208dadec60 100644
@@ -10,24 +10,14 @@ struct bch_dev;
 struct bch_fs;
 struct dev_group;
 
-static inline size_t prios_per_bucket(const struct bch_dev *ca)
-{
-       return (bucket_bytes(ca) - sizeof(struct prio_set)) /
-               sizeof(struct bucket_disk);
-}
-
-static inline size_t prio_buckets(const struct bch_dev *ca)
-{
-       return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca));
-}
-
 void bch2_dev_group_remove(struct dev_group *, struct bch_dev *);
 void bch2_dev_group_add(struct dev_group *, struct bch_dev *);
 
-int bch2_prio_read(struct bch_dev *);
-int bch2_prio_write(struct bch_dev *);
+int bch2_alloc_read(struct bch_fs *, struct list_head *);
+int bch2_alloc_write(struct bch_fs *, struct bch_dev *, u64 *);
+int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
 
-size_t bch2_bucket_alloc(struct bch_dev *, enum alloc_reserve);
+long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
 
 void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
 
@@ -80,8 +70,15 @@ static inline struct bch_dev *dev_group_next(struct dev_group *devs,
             (_ptr)++)
 
 void bch2_recalc_capacity(struct bch_fs *);
+
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+
 void bch2_dev_allocator_stop(struct bch_dev *);
 int bch2_dev_allocator_start(struct bch_dev *);
+
 void bch2_fs_allocator_init(struct bch_fs *);
 
+extern const struct bkey_ops bch2_bkey_alloc_ops;
+
 #endif /* _BCACHE_ALLOC_H */
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index ae58d083d731320ea62f5bed3b29a68422f64516..ce3a919ef2531961e442fc2ad6b488a80549be91 100644
@@ -35,20 +35,13 @@ struct prio_clock {
 /* There is one reserve for each type of btree, one for prios and gens
  * and one for moving GC */
 enum alloc_reserve {
-       RESERVE_PRIO,
-       RESERVE_BTREE,
-       RESERVE_METADATA_LAST = RESERVE_BTREE,
-       RESERVE_MOVINGGC,
-
-       RESERVE_NONE,
-       RESERVE_NR,
+       RESERVE_ALLOC           = -1,
+       RESERVE_BTREE           = 0,
+       RESERVE_MOVINGGC        = 1,
+       RESERVE_NONE            = 2,
+       RESERVE_NR              = 3,
 };
 
-static inline bool allocation_is_metadata(enum alloc_reserve id)
-{
-       return id <= RESERVE_METADATA_LAST;
-}
-
 struct dev_group {
        spinlock_t              lock;
        unsigned                nr;
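Note the renumbering: RESERVE_ALLOC is deliberately negative, so it can never index the ca->free[RESERVE_NR] fifos; it exists only to let bch2_bucket_alloc() (see the switch in alloc.c above) recognize allocations made on behalf of the allocator itself. As compile-time checks, a sketch:

	_Static_assert(RESERVE_ALLOC < 0, "never a direct free[] index");
	_Static_assert(RESERVE_NONE + 1 == RESERVE_NR, "NONE is the last fifo");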
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 977ac364b5368030bdc153a3a093503052ebf02d..ab99af7b3a24d075731b805e14de32e60d471daf 100644
@@ -305,7 +305,7 @@ do {                                                                        \
        (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
 
 /* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE             (BTREE_RESERVE_MAX * 2)
+#define BTREE_NODE_RESERVE             (BTREE_RESERVE_MAX * 4)
 
 struct btree;
 struct crypto_blkcipher;
@@ -329,13 +329,23 @@ struct bch_member_cpu {
        u16                     bucket_size;    /* sectors */
        u8                      state;
        u8                      tier;
-       u8                      has_metadata;
-       u8                      has_data;
        u8                      replacement;
        u8                      discard;
        u8                      valid;
 };
 
+struct bch_replicas_cpu_entry {
+       u8                      data_type;
+       u8                      devs[BCH_SB_MEMBERS_MAX / 8];
+};
+
+struct bch_replicas_cpu {
+       struct rcu_head         rcu;
+       unsigned                nr;
+       unsigned                entry_size;
+       struct bch_replicas_cpu_entry entries[];
+};
+
 struct bch_dev {
        struct kobject          kobj;
        struct percpu_ref       ref;
@@ -363,21 +373,7 @@ struct bch_dev {
 
        struct task_struct      *alloc_thread;
 
-       struct prio_set         *disk_buckets;
-
-       /*
-        * When allocating new buckets, prio_write() gets first dibs - since we
-        * may not be allocate at all without writing priorities and gens.
-        * prio_last_buckets[] contains the last buckets we wrote priorities to
-        * (so gc can mark them as metadata).
-        */
-       u64                     *prio_buckets;
-       u64                     *prio_last_buckets;
-       spinlock_t              prio_buckets_lock;
-       struct bio              *bio_prio;
-       bool                    prio_read_done;
-       bool                    need_prio_write;
-       struct mutex            prio_write_lock;
+       bool                    need_alloc_write;
 
        /*
         * free: Buckets that are ready to be used
@@ -391,6 +387,7 @@ struct bch_dev {
        DECLARE_FIFO(long, free)[RESERVE_NR];
        DECLARE_FIFO(long, free_inc);
        spinlock_t              freelist_lock;
+       bool                    alloc_thread_started;
 
        size_t                  fifo_last_bucket;
 
@@ -415,6 +412,8 @@ struct bch_dev {
        atomic_long_t           saturated_count;
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
+       u64                     allocator_journal_seq_flush;
+       bool                    allocator_invalidating_data;
 
        alloc_heap              alloc_heap;
        bucket_heap             copygc_heap;
@@ -458,6 +457,7 @@ enum {
        BCH_FS_FSCK_FIXED_ERRORS,
        BCH_FS_FSCK_DONE,
        BCH_FS_FIXED_GENS,
+       BCH_FS_REBUILD_REPLICAS,
 };
 
 struct btree_debug {
@@ -507,6 +507,10 @@ struct bch_fs {
 
        struct bch_dev __rcu    *devs[BCH_SB_MEMBERS_MAX];
 
+       struct bch_replicas_cpu __rcu *replicas;
+       struct bch_replicas_cpu __rcu *replicas_gc;
+       struct mutex            replicas_gc_lock;
+
        struct bch_opts         opts;
 
        /* Updated by bch2_sb_update():*/
@@ -520,9 +524,6 @@ struct bch_fs {
                u8              nr_devices;
                u8              clean;
 
-               u8              meta_replicas_have;
-               u8              data_replicas_have;
-
                u8              str_hash_type;
                u8              encryption_type;
 
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 2d64bcae041119b981c23aef280d87d5043bd10a..3f6d51acb3b6fea23f07400fec5df5fcf35fb8fb 100644
@@ -2,7 +2,7 @@
 #define _BCACHEFS_FORMAT_H
 
 /*
- * Bcache on disk data structures
+ * bcachefs on disk data structures
  */
 
 #include <asm/types.h>
@@ -714,6 +714,25 @@ struct bch_xattr {
 } __attribute__((packed, aligned(8)));
 BKEY_VAL_TYPE(xattr,           BCH_XATTR);
 
+/* Bucket/allocation information: */
+
+enum {
+       BCH_ALLOC               = 128,
+};
+
+enum {
+       BCH_ALLOC_FIELD_READ_TIME       = 0,
+       BCH_ALLOC_FIELD_WRITE_TIME      = 1,
+};
+
+struct bch_alloc {
+       struct bch_val          v;
+       __u8                    fields;
+       __u8                    gen;
+       __u8                    data[];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(alloc,   BCH_ALLOC);
+
 /* Superblock */
 
 /* Version 0: Cache device
@@ -752,8 +771,7 @@ struct bch_member {
 
 LE64_BITMASK(BCH_MEMBER_STATE,         struct bch_member, flags[0],  0,  4)
 LE64_BITMASK(BCH_MEMBER_TIER,          struct bch_member, flags[0],  4,  8)
-LE64_BITMASK(BCH_MEMBER_HAS_METADATA,  struct bch_member, flags[0],  8,  9)
-LE64_BITMASK(BCH_MEMBER_HAS_DATA,      struct bch_member, flags[0],  9, 10)
+/* 8-10 unused, was HAS_(META)DATA */
 LE64_BITMASK(BCH_MEMBER_REPLACEMENT,   struct bch_member, flags[0], 10, 14)
 LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags[0], 14, 15);
 
@@ -800,7 +818,8 @@ enum bch_sb_field_type {
        BCH_SB_FIELD_journal    = 0,
        BCH_SB_FIELD_members    = 1,
        BCH_SB_FIELD_crypt      = 2,
-       BCH_SB_FIELD_NR         = 3,
+       BCH_SB_FIELD_replicas   = 3,
+       BCH_SB_FIELD_NR         = 4,
 };
 
 struct bch_sb_field_journal {
@@ -861,8 +880,24 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N,     struct bch_sb_field_crypt, kdf_flags,  0, 16);
 LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
 LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
 
-struct bch_sb_field_replication {
+enum bch_data_types {
+       BCH_DATA_NONE           = 0,
+       BCH_DATA_SB             = 1,
+       BCH_DATA_JOURNAL        = 2,
+       BCH_DATA_BTREE          = 3,
+       BCH_DATA_USER           = 4,
+       BCH_DATA_NR             = 5,
+};
+
+struct bch_replicas_entry {
+       u8                      data_type;
+       u8                      nr;
+       u8                      devs[0];
+};
+
+struct bch_sb_field_replicas {
        struct bch_sb_field     field;
+       struct bch_replicas_entry entries[0];
 };
 
 /*
@@ -937,8 +972,7 @@ LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48);
 LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,        struct bch_sb, flags[0], 48, 52);
 LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,        struct bch_sb, flags[0], 52, 56);
 
-LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE,        struct bch_sb, flags[0], 56, 60);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE,        struct bch_sb, flags[0], 60, 64);
+/* 56-64 unused, was REPLICAS_HAVE */
 
 LE64_BITMASK(BCH_SB_STR_HASH_TYPE,     struct bch_sb, flags[1],  0,  4);
 LE64_BITMASK(BCH_SB_COMPRESSION_TYPE,  struct bch_sb, flags[1],  4,  8);
@@ -946,6 +980,7 @@ LE64_BITMASK(BCH_SB_INODE_32BIT,    struct bch_sb, flags[1],  8,  9);
 
 LE64_BITMASK(BCH_SB_128_BIT_MACS,      struct bch_sb, flags[1],  9, 10);
 LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,   struct bch_sb, flags[1], 10, 14);
+
 /* 14-20 unused, was JOURNAL_ENTRY_SIZE */
 
 LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
@@ -1003,77 +1038,6 @@ enum bch_compression_opts {
        BCH_COMPRESSION_NR              = 3,
 };
 
-/* backing device specific stuff: */
-
-struct backingdev_sb {
-       __le64                  csum;
-       __le64                  offset; /* sector where this sb was written */
-       __le64                  version; /* of on disk format */
-
-       uuid_le                 magic;  /* bcachefs superblock UUID */
-
-       uuid_le                 disk_uuid;
-
-       /*
-        * Internal cache set UUID - xored with various magic numbers and thus
-        * must never change:
-        */
-       union {
-               uuid_le         set_uuid;
-               __le64          set_magic;
-       };
-       __u8                    label[BCH_SB_LABEL_SIZE];
-
-       __le64                  flags;
-
-       /* Incremented each time superblock is written: */
-       __le64                  seq;
-
-       /*
-        * User visible UUID for identifying the cache set the user is allowed
-        * to change:
-        *
-        * XXX hooked up?
-        */
-       uuid_le                 user_uuid;
-       __le64                  pad1[6];
-
-       __le64                  data_offset;
-       __le16                  block_size;     /* sectors */
-       __le16                  pad2[3];
-
-       __le32                  last_mount;     /* time_t */
-       __le16                  pad3;
-       /* size of variable length portion - always 0 for backingdev superblock */
-       __le16                  u64s;
-       __u64                   _data[0];
-};
-
-LE64_BITMASK(BDEV_CACHE_MODE,          struct backingdev_sb, flags, 0, 4);
-#define CACHE_MODE_WRITETHROUGH                0U
-#define CACHE_MODE_WRITEBACK           1U
-#define CACHE_MODE_WRITEAROUND         2U
-#define CACHE_MODE_NONE                        3U
-
-LE64_BITMASK(BDEV_STATE,               struct backingdev_sb, flags, 61, 63);
-#define BDEV_STATE_NONE                        0U
-#define BDEV_STATE_CLEAN               1U
-#define BDEV_STATE_DIRTY               2U
-#define BDEV_STATE_STALE               3U
-
-#define BDEV_DATA_START_DEFAULT                16      /* sectors */
-
-static inline _Bool __SB_IS_BDEV(__u64 version)
-{
-       return version == BCACHE_SB_VERSION_BDEV
-               || version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
-}
-
-static inline _Bool SB_IS_BDEV(const struct bch_sb *sb)
-{
-       return __SB_IS_BDEV(sb->version);
-}
-
 /*
  * Magic numbers
  *
@@ -1088,7 +1052,6 @@ static inline _Bool SB_IS_BDEV(const struct bch_sb *sb)
 #define BCACHE_STATFS_MAGIC            0xca451a4e
 
 #define JSET_MAGIC             __cpu_to_le64(0x245235c1a3625032ULL)
-#define PSET_MAGIC             __cpu_to_le64(0x6750e15f87337f91ULL)
 #define BSET_MAGIC             __cpu_to_le64(0x90135c78b99e07f5ULL)
 
 static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
@@ -1103,11 +1066,6 @@ static inline __u64 __jset_magic(struct bch_sb *sb)
        return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
 }
 
-static inline __u64 __pset_magic(struct bch_sb *sb)
-{
-       return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
-}
-
 static inline __u64 __bset_magic(struct bch_sb *sb)
 {
        return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
@@ -1136,9 +1094,9 @@ struct jset_entry {
 
 LE32_BITMASK(JOURNAL_ENTRY_TYPE,       struct jset_entry, flags, 0, 8);
 enum {
-       JOURNAL_ENTRY_BTREE_KEYS        = 0,
-       JOURNAL_ENTRY_BTREE_ROOT        = 1,
-       JOURNAL_ENTRY_PRIO_PTRS         = 2,
+       JOURNAL_ENTRY_BTREE_KEYS                = 0,
+       JOURNAL_ENTRY_BTREE_ROOT                = 1,
+       JOURNAL_ENTRY_PRIO_PTRS                 = 2, /* Obsolete */
 
        /*
         * Journal sequence numbers can be blacklisted: bsets record the max
@@ -1150,7 +1108,7 @@ enum {
         * and then record that we skipped it so that the next time we crash and
         * recover we don't think there was a missing journal entry.
         */
-       JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
+       JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED   = 3,
 };
 
 /*
@@ -1193,35 +1151,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN,   struct jset, flags, 4, 5);
 
 #define BCH_JOURNAL_BUCKETS_MIN                20
 
-/* Bucket prios/gens */
-
-struct prio_set {
-       struct bch_csum         csum;
-
-       __le64                  magic;
-       __le32                  nonce[3];
-       __le16                  version;
-       __le16                  flags;
-
-       __u8                    encrypted_start[0];
-
-       __le64                  next_bucket;
-
-       struct bucket_disk {
-               __le16          prio[2];
-               __u8            gen;
-       } __attribute__((packed)) data[];
-} __attribute__((packed, aligned(8)));
-
-LE32_BITMASK(PSET_CSUM_TYPE,   struct prio_set, flags, 0, 4);
-
 /* Btree: */
 
 #define DEFINE_BCH_BTREE_IDS()                                 \
-       DEF_BTREE_ID(EXTENTS, 0, "extents")                     \
-       DEF_BTREE_ID(INODES,  1, "inodes")                      \
-       DEF_BTREE_ID(DIRENTS, 2, "dirents")                     \
-       DEF_BTREE_ID(XATTRS,  3, "xattrs")
+       DEF_BTREE_ID(EXTENTS,   0, "extents")                   \
+       DEF_BTREE_ID(INODES,    1, "inodes")                    \
+       DEF_BTREE_ID(DIRENTS,   2, "dirents")                   \
+       DEF_BTREE_ID(XATTRS,    3, "xattrs")                    \
+       DEF_BTREE_ID(ALLOC,     4, "alloc")
 
 #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
 
@@ -1318,4 +1255,33 @@ struct btree_node_entry {
        };
 } __attribute__((packed, aligned(8)));
 
+/* Obsolete: */
+
+struct prio_set {
+       struct bch_csum         csum;
+
+       __le64                  magic;
+       __le32                  nonce[3];
+       __le16                  version;
+       __le16                  flags;
+
+       __u8                    encrypted_start[0];
+
+       __le64                  next_bucket;
+
+       struct bucket_disk {
+               __le16          prio[2];
+               __u8            gen;
+       } __attribute__((packed)) data[];
+} __attribute__((packed, aligned(8)));
+
+LE32_BITMASK(PSET_CSUM_TYPE,   struct prio_set, flags, 0, 4);
+
+#define PSET_MAGIC             __cpu_to_le64(0x6750e15f87337f91ULL)
+
+static inline __u64 __pset_magic(struct bch_sb *sb)
+{
+       return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
+}
+
 #endif /* _BCACHEFS_FORMAT_H */
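The new BCH_ALLOC key above stores its per-bucket timestamps as optional trailing fields, selected by the `fields` bitmask and packed in field-number order. The decoder below is a minimal sketch of that layout, assuming 16-bit fields (matching the __le16 prios of the old prio_set); the width and helper name are illustrative, not taken from this commit:

    #include <stdint.h>
    #include <string.h>

    struct bch_alloc_sketch {
            uint8_t fields;         /* bitmask of BCH_ALLOC_FIELD_* */
            uint8_t gen;
            uint8_t data[];         /* present fields, in field-number order */
    };

    /* Fetch one optional field; returns 0 if the field is absent. */
    static int get_alloc_field(const struct bch_alloc_sketch *a,
                               unsigned field, uint16_t *out)
    {
            const uint8_t *p = a->data;
            unsigned f;

            for (f = 0; f < field; f++)
                    if (a->fields & (1U << f))
                            p += sizeof(*out);      /* skip earlier present fields */

            if (!(a->fields & (1U << field)))
                    return 0;

            memcpy(out, p, sizeof(*out));           /* byte-swapping omitted */
            return 1;
    }
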
index 22d6845e2b06d3bf2d66802e9e0d7670c9df14b3..5bdbbe6ef1ef1b47d42471676d83b7c44e65d6da 100644 (file)
@@ -1,13 +1,9 @@
-#ifndef _LINUX_BCACHE_IOCTL_H
-#define _LINUX_BCACHE_IOCTL_H
+#ifndef _BCACHEFS_IOCTL_H
+#define _BCACHEFS_IOCTL_H
 
 #include <linux/uuid.h>
 #include "bcachefs_format.h"
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 #define BCH_FORCE_IF_DATA_LOST         (1 << 0)
 #define BCH_FORCE_IF_METADATA_LOST     (1 << 1)
 #define BCH_FORCE_IF_DATA_DEGRADED     (1 << 2)
@@ -97,8 +93,4 @@ struct bch_ioctl_data {
        __u64                   end_offset;
 };
 
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _LINUX_BCACHE_IOCTL_H */
+#endif /* _BCACHEFS_IOCTL_H */
index 1383c96b09e0b613f982466432b1e437b37f1413..0511e1fa41c844f5ccb9ccf4f268d846cbbb74ec 100644 (file)
@@ -580,6 +580,8 @@ BKEY_VAL_ACCESSORS(dirent,          BCH_DIRENT);
 
 BKEY_VAL_ACCESSORS(xattr,              BCH_XATTR);
 
+BKEY_VAL_ACCESSORS(alloc,              BCH_ALLOC);
+
 /* byte order helpers */
 
 #if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
index cd9a60c1144a7fed0ea610dcc5c89506ba72d043..dbec8b32df3cfb7132ffff34dae748eed8c84ba0 100644 (file)
@@ -2,6 +2,7 @@
 #include "bcachefs.h"
 #include "bkey_methods.h"
 #include "btree_types.h"
+#include "alloc.h"
 #include "dirent.h"
 #include "error.h"
 #include "extents.h"
@@ -13,6 +14,7 @@ const struct bkey_ops *bch2_bkey_ops[] = {
        [BKEY_TYPE_INODES]      = &bch2_bkey_inode_ops,
        [BKEY_TYPE_DIRENTS]     = &bch2_bkey_dirent_ops,
        [BKEY_TYPE_XATTRS]      = &bch2_bkey_xattr_ops,
+       [BKEY_TYPE_ALLOC]       = &bch2_bkey_alloc_ops,
        [BKEY_TYPE_BTREE]       = &bch2_bkey_btree_ops,
 };
 
index 78132e40330e2b865ff2d286434619f2cbc07384..815260bc25809de0e1bcf6c2e55958df77d2135d 100644 (file)
@@ -129,6 +129,8 @@ static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
 int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
                                struct bkey_s_c k)
 {
+       enum bch_data_types data_type = type == BKEY_TYPE_BTREE
+               ? BCH_DATA_BTREE : BCH_DATA_USER;
        int ret = 0;
 
        switch (k.k->type) {
@@ -137,6 +139,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const struct bch_extent_ptr *ptr;
 
+               if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+                   (!c->opts.nofsck &&
+                    fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
+                                "superblock not marked as containing replicas"))) {
+                       ret = bch2_check_mark_super(c, e, data_type);
+                       if (ret)
+                               return ret;
+               }
+
                extent_for_each_ptr(e, ptr) {
                        struct bch_dev *ca = c->devs[ptr->dev];
                        struct bucket *g = PTR_BUCKET(ca, ptr);
@@ -147,7 +158,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
                                        new.gen = ptr->gen;
                                        new.gen_valid = 1;
                                }));
-                               ca->need_prio_write = true;
+                               ca->need_alloc_write = true;
                        }
 
                        if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
@@ -159,7 +170,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
                                        new.gen = ptr->gen;
                                        new.gen_valid = 1;
                                }));
-                               ca->need_prio_write = true;
+                               ca->need_alloc_write = true;
                                set_bit(BCH_FS_FIXED_GENS, &c->flags);
                        }
 
@@ -348,17 +360,6 @@ void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
        }
 
        spin_unlock(&c->journal.lock);
-
-       spin_lock(&ca->prio_buckets_lock);
-
-       for (i = 0; i < prio_buckets(ca) * 2; i++) {
-               b = ca->prio_buckets[i];
-               if (b)
-                       bch2_mark_metadata_bucket(ca, ca->buckets + b,
-                                                BUCKET_PRIOS, true);
-       }
-
-       spin_unlock(&ca->prio_buckets_lock);
 }
 
 static void bch2_mark_metadata(struct bch_fs *c)
@@ -474,10 +475,6 @@ void bch2_gc(struct bch_fs *c)
         *    move around - if references move backwards in the ordering GC
         *    uses, GC could skip past them
         */
-
-       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
-               return;
-
        trace_gc_start(c);
 
        /*
@@ -487,6 +484,8 @@ void bch2_gc(struct bch_fs *c)
        bch2_recalc_sectors_available(c);
 
        down_write(&c->gc_lock);
+       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+               goto out;
 
        bch2_gc_start(c);
 
@@ -502,8 +501,7 @@ void bch2_gc(struct bch_fs *c)
                if (ret) {
                        bch_err(c, "btree gc failed: %d", ret);
                        set_bit(BCH_FS_GC_FAILURE, &c->flags);
-                       up_write(&c->gc_lock);
-                       return;
+                       goto out;
                }
 
                gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
@@ -518,7 +516,7 @@ void bch2_gc(struct bch_fs *c)
        /* Indicates that gc is no longer in progress: */
        gc_pos_set(c, gc_phase(GC_PHASE_DONE));
        c->gc_count++;
-
+out:
        up_write(&c->gc_lock);
        trace_gc_end(c);
        bch2_time_stats_update(&c->btree_gc_time, start_time);
@@ -529,6 +527,12 @@ void bch2_gc(struct bch_fs *c)
         */
        for_each_member_device(ca, c, i)
                bch2_wake_allocator(ca);
+
+       /*
+        * At startup, allocations can happen directly instead of via the
+        * allocator thread - issue wakeup in case they blocked on gc_lock:
+        */
+       closure_wake_up(&c->freelist_wait);
 }
 
 /* Btree coalescing */
@@ -997,6 +1001,14 @@ int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
        unsigned iter = 0;
        enum btree_id id;
        int ret;
+
+       mutex_lock(&c->sb_lock);
+       if (!bch2_sb_get_replicas(c->disk_sb)) {
+               if (BCH_SB_INITIALIZED(c->disk_sb))
+                       bch_info(c, "building replicas info");
+               set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
+       }
+       mutex_unlock(&c->sb_lock);
 again:
        bch2_gc_start(c);
 
@@ -1006,11 +1018,9 @@ again:
                        return ret;
        }
 
-       if (journal) {
-               ret = bch2_journal_mark(c, journal);
-               if (ret)
-                       return ret;
-       }
+       ret = bch2_journal_mark(c, journal);
+       if (ret)
+               return ret;
 
        bch2_mark_metadata(c);
 
index 18469486f6b269c5bc317a6e66734caab845ebbb..571a8140369c58f509f6a9ad75a7e1ad36769772 100644 (file)
@@ -1402,7 +1402,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
 
        ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
        if (ret)
-               bch2_fatal_error(c);
+               bch2_inconsistent_error(c);
 
        return ret;
 }
index 8a4ee6d199ae9de83354bd2cf8fe188f751915f9..9794ac3b3d38a3e8c59ebcce3cbd5d29af323552 100644 (file)
@@ -233,17 +233,29 @@ void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
 }
 
 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
-                                           bool use_reserve,
-                                           struct disk_reservation *res,
-                                           struct closure *cl)
+                                            struct disk_reservation *res,
+                                            struct closure *cl,
+                                            unsigned flags)
 {
        BKEY_PADDED(k) tmp;
        struct open_bucket *ob;
        struct btree *b;
-       unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE;
+       unsigned nr_reserve;
+       enum alloc_reserve alloc_reserve;
+
+       if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
+               nr_reserve      = 0;
+               alloc_reserve   = RESERVE_ALLOC;
+       } else if (flags & BTREE_INSERT_USE_RESERVE) {
+               nr_reserve      = BTREE_NODE_RESERVE / 2;
+               alloc_reserve   = RESERVE_BTREE;
+       } else {
+               nr_reserve      = BTREE_NODE_RESERVE;
+               alloc_reserve   = RESERVE_NONE;
+       }
 
        mutex_lock(&c->btree_reserve_cache_lock);
-       if (c->btree_reserve_cache_nr > reserve) {
+       if (c->btree_reserve_cache_nr > nr_reserve) {
                struct btree_alloc *a =
                        &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
 
@@ -263,8 +275,7 @@ retry:
                               bkey_i_to_extent(&tmp.k),
                               res->nr_replicas,
                               c->opts.metadata_replicas_required,
-                              use_reserve ? RESERVE_BTREE : RESERVE_NONE,
-                              cl);
+                              alloc_reserve, cl);
        if (IS_ERR(ob))
                return ERR_CAST(ob);
 
@@ -311,7 +322,7 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
 
        bch2_btree_build_aux_trees(b);
 
-       bch2_check_mark_super(c, &b->key, true);
+       bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), BCH_DATA_BTREE);
 
        trace_btree_node_alloc(c, b);
        return b;
@@ -533,9 +544,6 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
        if (flags & BTREE_INSERT_NOFAIL)
                disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
 
-       if (flags & BTREE_INSERT_NOWAIT)
-               cl = NULL;
-
        /*
         * This check isn't necessary for correctness - it's just to potentially
         * prevent us from doing a lot of work that'll end up being wasted:
@@ -565,8 +573,9 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
        reserve->nr = 0;
 
        while (reserve->nr < nr_nodes) {
-               b = __bch2_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE,
-                                          &disk_res, cl);
+               b = __bch2_btree_node_alloc(c, &disk_res,
+                                           flags & BTREE_INSERT_NOWAIT
+                                           ? NULL : cl, flags);
                if (IS_ERR(b)) {
                        ret = PTR_ERR(b);
                        goto err_free;
@@ -793,8 +802,8 @@ void bch2_btree_journal_key(struct btree_insert *trans,
        struct btree_write *w = btree_current_write(b);
 
        EBUG_ON(iter->level || b->level);
-       EBUG_ON(!trans->journal_res.ref &&
-               test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+       EBUG_ON(trans->journal_res.ref !=
+               !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
 
        if (!journal_pin_active(&w->journal))
                bch2_journal_pin_add(j, &trans->journal_res,
@@ -1026,6 +1035,27 @@ retry:
                 */
                six_unlock_read(&b->lock);
                mutex_unlock(&c->btree_interior_update_lock);
+
+               /*
+        * There's a bit of circularity going on here that we have to break:
+                *
+                * We have to drop our journal pin before writing the journal
+                * entry that points to the new btree root: else, we could
+                * deadlock if the journal currently happens to be full.
+                *
+        * This means we're dropping the journal pin _before_ the new
+                * nodes are technically reachable - but this is safe, because
+                * after the bch2_btree_set_root_ondisk() call above they will
+                * be reachable as of the very next journal write:
+                */
+               bch2_journal_pin_drop(&c->journal, &as->journal);
+
+               /*
+                * And, do a journal write to write the pointer to the new root,
+                * then wait for it to complete before freeing the nodes we
+                * replaced:
+                */
+               bch2_journal_meta_async(&c->journal, cl);
                break;
        }
 
@@ -1051,19 +1081,70 @@ static void btree_interior_update_updated_btree(struct bch_fs *c,
 
        mutex_unlock(&c->btree_interior_update_lock);
 
+       /*
+        * In general, when you're staging things in a journal that will later
+        * be written elsewhere and you also want to guarantee ordering - that
+        * is, if you have updates a, b and c, then after a crash you should
+        * never see c without a and b - there's a problem:
+        *
+        * If the final destination of the update(s) (i.e. btree node) can be
+        * written/flushed _before_ the relevant journal entry - oops, that
+        * breaks ordering, since the various leaf nodes can be written in any
+        * order.
+        *
+        * Normally we use bset->journal_seq to deal with this - if during
+        * recovery we find a btree node write that's newer than the newest
+        * journal entry, we just ignore it - we don't need it, anything we're
+        * supposed to have (that we reported as completed via fsync()) will
+        * still be in the journal, and as far as the state of the journal is
+        * concerned that btree node write never happened.
+        *
+        * That breaks when we're rewriting/splitting/merging nodes, since we're
+        * mixing btree node writes that haven't happened yet with previously
+        * written data that has been reported as completed to the journal.
+        *
+        * Thus, before making the new nodes reachable, we have to wait for the
+        * newest journal sequence number we have data for to be written (if it
+        * hasn't been yet).
+        */
        bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
 
        continue_at(&as->cl, btree_interior_update_nodes_written,
                    system_freezable_wq);
 }
 
-static void btree_interior_update_reparent(struct btree_interior_update *as,
+static void interior_update_flush(struct journal *j,
+                       struct journal_entry_pin *pin, u64 seq)
+{
+       struct btree_interior_update *as =
+               container_of(pin, struct btree_interior_update, journal);
+
+       bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
+}
+
+static void btree_interior_update_reparent(struct bch_fs *c,
+                                          struct btree_interior_update *as,
                                           struct btree_interior_update *child)
 {
        child->b = NULL;
        child->mode = BTREE_INTERIOR_UPDATING_AS;
        child->parent_as = as;
        closure_get(&as->cl);
+
+       /*
+        * When we write a new btree root, we have to drop our journal pin
+        * _before_ the new nodes are technically reachable; see
+        * btree_interior_update_nodes_written().
+        *
+        * This goes for journal pins that are recursively blocked on us - so,
+        * just transfer the journal pin to the new interior update so
+        * btree_interior_update_nodes_written() can drop it.
+        */
+       bch2_journal_pin_add_if_older(&c->journal, &child->journal,
+                                     &as->journal, interior_update_flush);
+       bch2_journal_pin_drop(&c->journal, &child->journal);
+
+       as->journal_seq = max(as->journal_seq, child->journal_seq);
 }
 
 static void btree_interior_update_updated_root(struct bch_fs *c,
@@ -1081,7 +1162,7 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
         * btree_interior_update operation to point to us:
         */
        if (r->as)
-               btree_interior_update_reparent(as, r->as);
+               btree_interior_update_reparent(c, as, r->as);
 
        as->mode = BTREE_INTERIOR_UPDATING_ROOT;
        as->b = r->b;
@@ -1089,19 +1170,21 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
 
        mutex_unlock(&c->btree_interior_update_lock);
 
+       /*
+        * When we're rewriting nodes and updating interior nodes, there's an
+        * issue with updates that haven't been written in the journal getting
+        * mixed together with older data - see btree_interior_update_updated_btree()
+        * for the explanation.
+        *
+        * However, this doesn't affect us when we're writing a new btree root -
+        * because to make that new root reachable we have to write out a new
+        * journal entry, which must necessarily be newer than as->journal_seq.
+        */
+
        continue_at(&as->cl, btree_interior_update_nodes_written,
                    system_freezable_wq);
 }
 
-static void interior_update_flush(struct journal *j,
-                       struct journal_entry_pin *pin, u64 seq)
-{
-       struct btree_interior_update *as =
-               container_of(pin, struct btree_interior_update, journal);
-
-       bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
-}
-
 /*
  * @b is being split/rewritten: it may have pointers to not-yet-written btree
  * nodes and thus outstanding btree_interior_updates - redirect @b's
@@ -1150,7 +1233,7 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
         */
        list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
                list_del(&p->write_blocked_list);
-               btree_interior_update_reparent(as, p);
+               btree_interior_update_reparent(c, as, p);
        }
 
        clear_btree_node_dirty(b);
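
The long comment in btree_interior_update_updated_btree() reduces to one invariant: nodes carrying data journalled up to sequence number S may only become reachable once the journal has written through S. As a minimal model (names are illustrative, not from this tree):

    /* Illustrative only: if new nodes were linked in before the journal
     * persisted as_journal_seq, a crash could expose update "c" without
     * the earlier "a" and "b" it depends on. */
    static int may_make_reachable(uint64_t as_journal_seq,
                                  uint64_t journal_seq_written)
    {
            return journal_seq_written >= as_journal_seq;
    }

bch2_journal_wait_on_seq() enforces exactly this before continuing to btree_interior_update_nodes_written(); the new-root path can skip the wait because making the root reachable itself requires writing a newer journal entry.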
index 7c4abe4aec3681f602d679833c6ba289f2c9b389..b5cfa890ca0d4bb27b149ab8a066ca375e20ac6b 100644 (file)
@@ -373,16 +373,20 @@ int __bch2_btree_insert_at(struct btree_insert *);
 
 /* for copygc, or when merging btree nodes */
 #define BTREE_INSERT_USE_RESERVE       (1 << 2)
+#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3)
 
 /*
  * Insert is for journal replay: don't get journal reservations, or mark extents
  * (bch_mark_key)
  */
-#define BTREE_INSERT_JOURNAL_REPLAY    (1 << 3)
+#define BTREE_INSERT_JOURNAL_REPLAY    (1 << 4)
 
 /* Don't block on allocation failure (for new btree nodes): */
-#define BTREE_INSERT_NOWAIT            (1 << 4)
-#define BTREE_INSERT_GC_LOCK_HELD      (1 << 5)
+#define BTREE_INSERT_NOWAIT            (1 << 5)
+#define BTREE_INSERT_GC_LOCK_HELD      (1 << 6)
+
+#define BCH_HASH_SET_MUST_CREATE       (1 << 7)
+#define BCH_HASH_SET_MUST_REPLACE      (1 << 8)
 
 int bch2_btree_delete_at(struct btree_iter *, unsigned);
 
index 1c2f692160874b396a0a5ffc51532a0747b1be58..e5227058f47c2f389d9b08723567ffdc55f44723 100644 (file)
@@ -306,14 +306,18 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
        _old;                                                   \
 })
 
-void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
+bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
+                           struct bucket_mark *old)
 {
        struct bch_fs_usage stats = { 0 };
-       struct bucket_mark old, new;
+       struct bucket_mark new;
+
+       *old = bucket_data_cmpxchg(ca, g, new, ({
+               if (!is_available_bucket(new))
+                       return false;
 
-       old = bucket_data_cmpxchg(ca, g, new, ({
                new.owned_by_allocator  = 1;
-               new.had_metadata        = 0;
+               new.touched_this_mount  = 1;
                new.data_type           = 0;
                new.cached_sectors      = 0;
                new.dirty_sectors       = 0;
@@ -321,11 +325,28 @@ void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
        }));
 
        /* XXX: we're not actually updating fs usage's cached sectors... */
-       bch2_fs_usage_update(&stats, old, new);
+       bch2_fs_usage_update(&stats, *old, new);
 
-       if (!old.owned_by_allocator && old.cached_sectors)
+       if (!old->owned_by_allocator && old->cached_sectors)
                trace_invalidate(ca, g - ca->buckets,
-                                       old.cached_sectors);
+                                       old->cached_sectors);
+       return true;
+}
+
+bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
+{
+       struct bucket_mark new, old;
+
+       old = bucket_data_cmpxchg(ca, g, new, ({
+               if (new.touched_this_mount ||
+                   !is_available_bucket(new))
+                       return false;
+
+               new.owned_by_allocator  = 1;
+               new.touched_this_mount  = 1;
+       }));
+
+       return true;
 }
 
 void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
@@ -333,6 +354,7 @@ void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
        struct bucket_mark old, new;
 
        old = bucket_data_cmpxchg(ca, g, new, ({
+               new.touched_this_mount  = 1;
                new.owned_by_allocator  = 0;
                new.data_type           = 0;
                new.cached_sectors      = 0;
@@ -348,7 +370,8 @@ void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
        struct bucket_mark new;
 
        bucket_data_cmpxchg(ca, g, new, ({
-               new.owned_by_allocator = owned_by_allocator;
+               new.touched_this_mount  = 1;
+               new.owned_by_allocator  = owned_by_allocator;
        }));
 }
 
@@ -376,8 +399,8 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
        old = bucket_data_cmpxchg(ca, g, new, ({
                saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size,
                              GC_MAX_SECTORS_USED);
-               new.data_type = type;
-               new.had_metadata = 1;
+               new.data_type           = type;
+               new.touched_this_mount  = 1;
        }));
 
        if (old.data_type != type &&
@@ -458,8 +481,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
        if (gc_will_visit) {
                if (journal_seq)
                        bucket_cmpxchg(g, new, ({
-                               new.journal_seq_valid = 1;
-                               new.journal_seq = journal_seq;
+                               new.touched_this_mount  = 1;
+                               new.journal_seq_valid   = 1;
+                               new.journal_seq         = journal_seq;
                        }));
 
                goto out;
@@ -479,11 +503,6 @@ static void bch2_mark_pointer(struct bch_fs *c,
                        return;
                }
 
-               EBUG_ON(type != S_CACHED &&
-                       !may_make_unavailable &&
-                       is_available_bucket(new) &&
-                       test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
-
                if (type != S_CACHED &&
                    new.dirty_sectors == GC_MAX_SECTORS_USED &&
                    disk_sectors < 0)
@@ -508,7 +527,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
                        new.data_type = data_type;
                }
 
-               new.had_metadata |= is_meta_bucket(new);
+               new.touched_this_mount  = 1;
        }));
 
        if (old.data_type != data_type &&
index f99a62bcc9bf418cc6f60b257629615b0eecdbf8..37eb471f644aa46b281040a1ce8d744dac4f79e9 100644 (file)
@@ -191,7 +191,9 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
 
 void bch2_bucket_seq_cleanup(struct bch_fs *);
 
-void bch2_invalidate_bucket(struct bch_dev *, struct bucket *);
+bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *,
+                           struct bucket_mark *);
+bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *);
 void bch2_mark_free_bucket(struct bch_dev *, struct bucket *);
 void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
 void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,
index 3c8b64477cb6adcfc45285bc8cc6f17f83e07e8d..c25c9fabee9e4366cdb1f2dae7f2c1c9a032ed97 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "util.h"
 
+/* kill, switch to bch_data_types */
 enum bucket_data_type {
        BUCKET_DATA     = 0,
        BUCKET_BTREE,
@@ -19,23 +20,12 @@ struct bucket_mark {
 
        struct {
                u8              gen;
-
-               unsigned        gen_valid:1;
-               unsigned        journal_seq_valid:1;
-
-               /*
-                * If this bucket had metadata while at the current generation
-                * number, the allocator must increment its gen before we reuse
-                * it:
-                */
-               unsigned        had_metadata:1;
-
-               unsigned        owned_by_allocator:1;
-
-               unsigned        data_type:3;
-
-               unsigned        nouse:1;
-
+               u8              data_type:3,
+                               gen_valid:1,
+                               owned_by_allocator:1,
+                               nouse:1,
+                               journal_seq_valid:1,
+                               touched_this_mount:1;
                u16             dirty_sectors;
                u16             cached_sectors;
 
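Repacking these flags into a single byte keeps the whole bucket_mark updatable with one compare-and-swap; the bucket_cmpxchg()/bucket_data_cmpxchg() callers earlier in this diff are retry loops over it. A standalone sketch of that pattern, assuming (as those helpers imply) that the fields alias one 64-bit word - the field layout here is illustrative:

    #include <stdatomic.h>
    #include <stdint.h>

    union mark {
            uint64_t v;
            struct {
                    uint8_t  gen;
                    uint8_t  flags;         /* owned_by_allocator etc. */
                    uint16_t dirty_sectors;
                    uint16_t cached_sectors;
            };
    };

    /* Atomically set a flag bit, returning the pre-update mark so the
     * caller can act on the old state (as bch2_invalidate_bucket does). */
    static union mark mark_update(_Atomic uint64_t *m)
    {
            union mark old, new;

            old.v = atomic_load(m);
            do {
                    new = old;
                    new.flags |= 1;
            } while (!atomic_compare_exchange_weak(m, &old.v, new.v));

            return old;
    }
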
index 57bfb4a618b64cf9628338426a665429800ca921..74d54ab172a696e4d9d9500f1d623bd879c7d72b 100644 (file)
@@ -412,9 +412,6 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
            size_ondisk > ca->mi.bucket_size)
                return "spans multiple buckets";
 
-       if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
-               return "device not marked as containing data";
-
        return NULL;
 }
 
@@ -547,12 +544,12 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
                        goto err;
        }
 
-       if (replicas < c->sb.meta_replicas_have) {
+       if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) {
                bch2_bkey_val_to_text(c, btree_node_type(b),
                                     buf, sizeof(buf), k);
                bch2_fs_bug(c,
-                       "btree key bad (too few replicas, %u < %u): %s",
-                       replicas, c->sb.meta_replicas_have, buf);
+                       "btree key bad (replicas not marked in superblock):\n%s",
+                       buf);
                return;
        }
 
@@ -1755,12 +1752,12 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
        }
 
        if (!bkey_extent_is_cached(e.k) &&
-           replicas < c->sb.data_replicas_have) {
-               bch2_bkey_val_to_text(c, btree_node_type(b), buf,
-                                    sizeof(buf), e.s_c);
+           !bch2_sb_has_replicas(c, e, BCH_DATA_USER)) {
+               bch2_bkey_val_to_text(c, btree_node_type(b),
+                                    buf, sizeof(buf), e.s_c);
                bch2_fs_bug(c,
-                       "extent key bad (too few replicas, %u < %u): %s",
-                       replicas, c->sb.data_replicas_have, buf);
+                       "extent key bad (replicas not marked in superblock):\n%s",
+                       buf);
                return;
        }
 
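These debugcheck changes replace the old scalar replicas_have counters with a set-membership test: bch2_sb_has_replicas() asks whether some replicas entry of the right data type records exactly the devices this extent points to. A conceptual sketch of the match (assuming, per the on-disk format added above, that each entry's devs[] lists one extent's device set, kept sorted):

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    struct replicas_entry_sketch {
            uint8_t data_type;      /* BCH_DATA_BTREE, BCH_DATA_USER, ... */
            uint8_t nr;
            uint8_t devs[];         /* sorted device indexes */
    };

    /* Illustrative only: with both sides sorted, "same device set"
     * reduces to a length check plus memcmp(). */
    static bool entry_matches(const struct replicas_entry_sketch *e,
                              uint8_t data_type,
                              const uint8_t *devs, uint8_t nr)
    {
            return e->data_type == data_type &&
                   e->nr == nr &&
                   !memcmp(e->devs, devs, nr);
    }
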
index 1145a1903abca58438f5d468c6eeb78470461e95..54b523d435dd63e9b561fabfe5ace49ddf131272 100644 (file)
@@ -531,7 +531,8 @@ static int bch2_write_extent(struct bch_write_op *op,
 
        key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
 
-       bch2_check_mark_super(c, key_to_write, false);
+       bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
+                             BCH_DATA_USER);
 
        bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
        return ret;
index 92364fea48e1795f426bfb0c3c679fb5c4f4e4da..b0011b43e1e0a3208ba18d5adf9d918cf8fd2a25 100644 (file)
@@ -53,28 +53,6 @@ static inline u64 journal_pin_seq(struct journal *j,
        return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
 }
 
-static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
-                                       struct jset_entry *entry, unsigned type)
-{
-       while (entry < vstruct_last(jset)) {
-               if (JOURNAL_ENTRY_TYPE(entry) == type)
-                       return entry;
-
-               entry = vstruct_next(entry);
-       }
-
-       return NULL;
-}
-
-#define for_each_jset_entry_type(entry, jset, type)                    \
-       for (entry = (jset)->start;                                     \
-            (entry = __jset_entry_type_next(jset, entry, type));       \
-            entry = vstruct_next(entry))
-
-#define for_each_jset_key(k, _n, entry, jset)                          \
-       for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
-               vstruct_for_each_safe(entry, k, _n)
-
 static inline void bch2_journal_add_entry(struct journal_buf *buf,
                                         const void *data, size_t u64s,
                                         unsigned type, enum btree_id id,
@@ -123,20 +101,6 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf,
                              JOURNAL_ENTRY_BTREE_ROOT, id, level);
 }
 
-static inline void bch2_journal_add_prios(struct journal *j,
-                                        struct journal_buf *buf)
-{
-       /*
-        * no prio bucket ptrs yet... XXX should change the allocator so this
-        * can't happen:
-        */
-       if (!buf->nr_prio_buckets)
-               return;
-
-       bch2_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets,
-                             JOURNAL_ENTRY_PRIO_PTRS, 0, 0);
-}
-
 static void journal_seq_blacklist_flush(struct journal *j,
                                struct journal_entry_pin *pin, u64 seq)
 {
@@ -986,7 +950,6 @@ static inline bool journal_has_keys(struct list_head *list)
 int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 {
        struct journal *j = &c->journal;
-       struct jset_entry *prio_ptrs;
        struct journal_list jlist;
        struct journal_replay *i;
        struct journal_entry_pin_list *p;
@@ -1094,15 +1057,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 
        bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
                 keys, entries, (u64) atomic64_read(&j->seq));
-
-       i = list_last_entry(list, struct journal_replay, list);
-       prio_ptrs = bch2_journal_find_entry(&i->j, JOURNAL_ENTRY_PRIO_PTRS, 0);
-       if (prio_ptrs) {
-               memcpy_u64s(j->prio_buckets,
-                           prio_ptrs->_data,
-                           le16_to_cpu(prio_ptrs->u64s));
-               j->nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
-       }
 fsck_err:
        return ret;
 }
@@ -1189,12 +1143,7 @@ static void __bch2_journal_next_entry(struct journal *j)
 
 static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
 {
-       unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
-
-       if (buf->nr_prio_buckets)
-               ret += JSET_KEYS_U64s + buf->nr_prio_buckets;
-
-       return ret;
+       return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
 }
 
 static enum {
@@ -1395,9 +1344,7 @@ static int journal_entry_open(struct journal *j)
        buf->disk_sectors       = sectors;
 
        sectors = min_t(unsigned, sectors, buf->size >> 9);
-
        j->cur_buf_sectors      = sectors;
-       buf->nr_prio_buckets    = j->nr_prio_buckets;
 
        u64s = (sectors << 9) / sizeof(u64);
 
@@ -1510,17 +1457,27 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
                for_each_jset_key(k, _n, entry, &i->j) {
                        struct disk_reservation disk_res;
 
-                       /*
-                        * We might cause compressed extents to be split, so we
-                        * need to pass in a disk_reservation:
-                        */
-                       BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
+                       if (entry->btree_id == BTREE_ID_ALLOC) {
+                               /*
+                                * allocation code handles replay for
+                                * BTREE_ID_ALLOC keys:
+                                */
+                               ret = bch2_alloc_replay_key(c, k->k.p);
+                       } else {
+                       } else {
+                                * We might cause compressed extents to be
+                                * split, so we need to pass in a
+                                * disk_reservation:
+                                */
+                               BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
 
-                       ret = bch2_btree_insert(c, entry->btree_id, k,
-                                              &disk_res, NULL, NULL,
-                                              BTREE_INSERT_NOFAIL|
-                                              BTREE_INSERT_JOURNAL_REPLAY);
-                       bch2_disk_reservation_put(c, &disk_res);
+                               ret = bch2_btree_insert(c, entry->btree_id, k,
+                                                       &disk_res, NULL, NULL,
+                                                       BTREE_INSERT_NOFAIL|
+                                                       BTREE_INSERT_JOURNAL_REPLAY);
+                               bch2_disk_reservation_put(c, &disk_res);
+                       }
 
                        if (ret) {
                                bch_err(c, "journal replay: error %d while replaying key",
@@ -1560,13 +1517,12 @@ err:
        return ret;
 }
 
-#if 0
 /*
  * Allocate more journal space at runtime - not currently making use of it, but
  * the code works:
  */
 static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
-                                     unsigned nr)
+                                      unsigned nr)
 {
        struct journal *j = &c->journal;
        struct journal_device *ja = &ca->journal;
@@ -1614,8 +1570,8 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
 
        while (ja->nr < nr) {
                /* must happen under journal lock, to avoid racing with gc: */
-               u64 b = bch2_bucket_alloc(ca, RESERVE_NONE);
-               if (!b) {
+               long b = bch2_bucket_alloc(c, ca, RESERVE_NONE);
+               if (b < 0) {
                        if (!closure_wait(&c->freelist_wait, &cl)) {
                                spin_unlock(&j->lock);
                                closure_sync(&cl);
@@ -1651,7 +1607,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
        }
        spin_unlock(&j->lock);
 
-       BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi));
+       BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi));
 
        bch2_write_super(c);
 
@@ -1663,16 +1619,15 @@ err:
        kfree(new_buckets);
        bch2_disk_reservation_put(c, &disk_res);
 
+       if (!ret)
+               bch2_dev_allocator_add(c, ca);
+
        return ret;
 }
-#endif
 
 int bch2_dev_journal_alloc(struct bch_dev *ca)
 {
-       struct journal_device *ja = &ca->journal;
-       struct bch_sb_field_journal *journal_buckets;
-       unsigned i, nr;
-       u64 b, *p;
+       unsigned nr;
 
        if (dynamic_fault("bcachefs:add:journal_alloc"))
                return -ENOMEM;
@@ -1686,45 +1641,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
                     min(1 << 10,
                         (1 << 20) / ca->mi.bucket_size));
 
-       p = krealloc(ja->bucket_seq, nr * sizeof(u64),
-                    GFP_KERNEL|__GFP_ZERO);
-       if (!p)
-               return -ENOMEM;
-
-       ja->bucket_seq = p;
-
-       p = krealloc(ja->buckets, nr * sizeof(u64),
-                    GFP_KERNEL|__GFP_ZERO);
-       if (!p)
-               return -ENOMEM;
-
-       ja->buckets = p;
-
-       journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
-                               nr + sizeof(*journal_buckets) / sizeof(u64));
-       if (!journal_buckets)
-               return -ENOMEM;
-
-       for (i = 0, b = ca->mi.first_bucket;
-            i < nr && b < ca->mi.nbuckets; b++) {
-               if (!is_available_bucket(ca->buckets[b].mark))
-                       continue;
-
-               bch2_mark_metadata_bucket(ca, &ca->buckets[b],
-                                        BUCKET_JOURNAL, true);
-               ja->buckets[i] = b;
-               journal_buckets->buckets[i] = cpu_to_le64(b);
-               i++;
-       }
-
-       if (i < nr)
-               return -ENOSPC;
-
-       BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi));
-
-       ja->nr = nr;
-
-       return 0;
+       return bch2_set_nr_journal_buckets(ca->fs, ca, nr);
 }
 
 /* Journalling */
@@ -2274,9 +2191,6 @@ static void journal_write(struct closure *cl)
        jset = w->data;
 
        j->write_start_time = local_clock();
-
-       bch2_journal_add_prios(j, w);
-
        mutex_lock(&c->btree_root_lock);
        for (i = 0; i < BTREE_ID_NR; i++) {
                struct btree_root *r = &c->btree_roots[i];
@@ -2324,7 +2238,8 @@ static void journal_write(struct closure *cl)
                closure_return_with_destructor(cl, journal_write_done);
        }
 
-       bch2_check_mark_super(c, &j->key, true);
+       bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
+                             BCH_DATA_JOURNAL);
 
        /*
         * XXX: we really should just disable the entire journal in nochanges
@@ -2380,7 +2295,7 @@ no_io:
 
        closure_return_with_destructor(cl, journal_write_done);
 err:
-       bch2_fatal_error(c);
+       bch2_inconsistent_error(c);
        closure_return_with_destructor(cl, journal_write_done);
 }
 
index d0dd0d330c6a393db3495184dd2394b56d9ad972..88a9bd12447d803a11f91656a87da543c0d15024 100644 (file)
@@ -121,6 +121,28 @@ struct journal_replay {
        struct jset             j;
 };
 
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+                                       struct jset_entry *entry, unsigned type)
+{
+       while (entry < vstruct_last(jset)) {
+               if (JOURNAL_ENTRY_TYPE(entry) == type)
+                       return entry;
+
+               entry = vstruct_next(entry);
+       }
+
+       return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type)                    \
+       for (entry = (jset)->start;                                     \
+            (entry = __jset_entry_type_next(jset, entry, type));       \
+            entry = vstruct_next(entry))
+
+#define for_each_jset_key(k, _n, entry, jset)                          \
+       for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
+               vstruct_for_each_safe(entry, k, _n)
+
 #define JOURNAL_PIN    (32 * 1024)
 
 static inline bool journal_pin_active(struct journal_entry_pin *pin)
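
With the iterators exported here, callers outside journal.c (such as the replay loop above) can walk a journal entry's keys directly. Typical usage, mirroring bch2_journal_replay():

    /* Count the btree keys in one journal set; entry/k/_n types are as
     * used by the macros above. */
    static unsigned count_jset_keys(struct jset *j)
    {
            struct jset_entry *entry;
            struct bkey_i *k, *_n;
            unsigned nr = 0;

            for_each_jset_key(k, _n, entry, j)
                    nr++;

            return nr;
    }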
index 4b01b14a6c69870a9fa0c7b0c188d9d9455e9d1a..3314fc0fffd45485d752f30b29c5eb9745c0855a 100644 (file)
@@ -20,13 +20,6 @@ struct journal_buf {
 
        unsigned                size;
        unsigned                disk_sectors;
-
-       /*
-        * ugh, prio_buckets are stupid - need to convert them to new
-        * transaction machinery when it arrives
-        */
-       unsigned                nr_prio_buckets;
-
        /* bloom filter: */
        unsigned long           has_inode[1024 / sizeof(unsigned long)];
 };
@@ -189,14 +182,6 @@ struct journal {
 
        /* protects advancing ja->last_idx: */
        struct mutex            reclaim_lock;
-
-       /*
-        * ugh: need to get prio_buckets converted over to the eventual new
-        * transaction machinery
-        */
-       __le64                  prio_buckets[BCH_SB_MEMBERS_MAX];
-       unsigned                nr_prio_buckets;
-
        unsigned                write_delay_ms;
        unsigned                reclaim_delay_ms;
 
index 8c9e3c259ab416e59fcbca46d2c2d4eb1ffca3ba..ba0cc0e45c783e75ff202302bf897621987d086a 100644 (file)
@@ -59,16 +59,18 @@ int bch2_move_data_off_device(struct bch_dev *ca)
 {
        struct moving_context ctxt;
        struct bch_fs *c = ca->fs;
-       struct bch_sb_field_members *mi;
        unsigned pass = 0;
        u64 seen_key_count;
        int ret = 0;
 
        BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
 
-       if (!ca->mi.has_data)
+       if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
                return 0;
 
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+
        bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
        ctxt.avoid = ca;
 
@@ -124,7 +126,11 @@ int bch2_move_data_off_device(struct bch_dev *ca)
                        BUG_ON(ret);
 
                        seen_key_count++;
+                       continue;
 next:
+                       if (bkey_extent_is_data(k.k))
+                               bch2_check_mark_super(c, bkey_s_c_to_extent(k),
+                                                     BCH_DATA_USER);
                        bch2_btree_iter_advance_pos(&iter);
                        bch2_btree_iter_cond_resched(&iter);
 
@@ -133,23 +139,20 @@ next:
                bch2_move_ctxt_exit(&ctxt);
 
                if (ret)
-                       return ret;
+                       goto err;
        } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
 
        if (seen_key_count) {
                pr_err("Unable to migrate all data in %d iterations.",
                       MAX_DATA_OFF_ITER);
-               return -1;
+               ret = -1;
+               goto err;
        }
 
-       mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb);
-       SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return 0;
+err:
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+       return ret;
 }
 
 /*
@@ -245,21 +248,27 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
 int bch2_move_metadata_off_device(struct bch_dev *ca)
 {
        struct bch_fs *c = ca->fs;
-       struct bch_sb_field_members *mi;
        unsigned i;
-       int ret;
+       int ret = 0;
 
        BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
 
-       if (!ca->mi.has_metadata)
+       if (!(bch2_dev_has_data(c, ca) &
+             ((1 << BCH_DATA_JOURNAL)|
+              (1 << BCH_DATA_BTREE))))
                return 0;
 
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c,
+                              (1 << BCH_DATA_JOURNAL)|
+                              (1 << BCH_DATA_BTREE));
+
        /* 1st, Move the btree nodes off the device */
 
        for (i = 0; i < BTREE_ID_NR; i++) {
                ret = bch2_move_btree_off(c, ca, i);
                if (ret)
-                       return ret;
+                       goto err;
        }
 
        /* There are no prios/gens to move -- they are already in the device. */
@@ -268,16 +277,12 @@ int bch2_move_metadata_off_device(struct bch_dev *ca)
 
        ret = bch2_journal_move(ca);
        if (ret)
-               return ret;
-
-       mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb);
-       SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
+               goto err;
 
-       return 0;
+err:
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+       return ret;
 }
 
 /*
@@ -326,12 +331,16 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
  */
 int bch2_flag_data_bad(struct bch_dev *ca)
 {
-       int ret = 0;
+       struct bch_fs *c = ca->fs;
        struct bkey_s_c k;
        struct bkey_s_c_extent e;
        struct btree_iter iter;
+       int ret = 0;
 
-       bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS,
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
                             POS_MIN, BTREE_ITER_PREFETCH);
 
        while ((k = bch2_btree_iter_peek(&iter)).k &&
@@ -377,10 +386,16 @@ int bch2_flag_data_bad(struct bch_dev *ca)
                 */
                continue;
 advance:
+               if (bkey_extent_is_data(k.k))
+                       bch2_check_mark_super(c, bkey_s_c_to_extent(k),
+                                             BCH_DATA_USER);
                bch2_btree_iter_advance_pos(&iter);
        }
 
        bch2_btree_iter_unlock(&iter);
 
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
        return ret;
 }
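
All three paths in this file now share one bracketing discipline, which is what lets stale superblock replicas entries be dropped once a device is emptied: start a GC over the affected data types, re-mark every replica set still referenced while walking keys, then end the GC. The skeleton, extracted from the code above:

    mutex_lock(&c->replicas_gc_lock);
    bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);

    /* walk keys; bch2_check_mark_super() re-marks each replica set
     * still referenced, so unmarked entries can be garbage collected */

    bch2_replicas_gc_end(c, ret);   /* ret passed so a failed walk can
                                     * presumably keep the old table */
    mutex_unlock(&c->replicas_gc_lock);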
index 6fa707db24d74030a2ac93d1f089cb3af31f2c86..53eb15adc8ab5463c26151408bbcac33372a1d80 100644 (file)
@@ -59,6 +59,8 @@ enum opt_type {
                s8,  OPT_UINT(1, BCH_REPLICAS_MAX))                     \
        BCH_OPT(data_replicas_required, 0444,   BCH_SB_DATA_REPLICAS_REQ,\
                s8,  OPT_UINT(1, BCH_REPLICAS_MAX))                     \
+       BCH_OPT(degraded,               0444,   NO_SB_OPT,              \
+               s8,  OPT_BOOL())                                        \
        BCH_OPT(metadata_checksum,      0644,   BCH_SB_META_CSUM_TYPE,  \
                s8,  OPT_STR(bch2_csum_types))                          \
        BCH_OPT(data_checksum,          0644,   BCH_SB_DATA_CSUM_TYPE,  \
index b237b751053bb3dbceb98e2a1402bf063b70161c..ab28b07a72ae2864a2e5fa5b6314c74984a2d0b4 100644 (file)
@@ -267,9 +267,6 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
        }
 }
 
-#define BCH_HASH_SET_MUST_CREATE       (1 << 4)
-#define BCH_HASH_SET_MUST_REPLACE      (1 << 5)
-
 static inline int bch2_hash_set(const struct bch_hash_desc desc,
                               const struct bch_hash_info *info,
                               struct bch_fs *c, u64 inode,
index 130b130fa4cf02833d5dd9274fe9090a0854c693..1eae0fcb97cb3cc81f522aeaf7dc996765c3ef93 100644 (file)
@@ -11,6 +11,9 @@
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 
+static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+static const char *bch2_sb_validate_replicas(struct bch_sb *);
+
 static inline void __bch2_sb_layout_size_assert(void)
 {
        BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
@@ -228,8 +231,8 @@ static int u64_cmp(const void *_l, const void *_r)
        return l < r ? -1 : l > r ? 1 : 0;
 }
 
-const char *bch2_validate_journal_layout(struct bch_sb *sb,
-                                       struct bch_member_cpu mi)
+const char *bch2_sb_validate_journal(struct bch_sb *sb,
+                                    struct bch_member_cpu mi)
 {
        struct bch_sb_field_journal *journal;
        const char *err;
@@ -291,7 +294,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb)
                return "Invalid superblock: bad member info";
 
        for (i = 0; i < sb->nr_devices; i++) {
-               if (bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
+               if (!bch2_dev_exists(sb, mi, i))
                        continue;
 
                if (le16_to_cpu(mi->members[i].bucket_size) <
@@ -302,7 +305,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb)
        return NULL;
 }
 
-const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
+const char *bch2_sb_validate(struct bcache_superblock *disk_sb)
 {
        struct bch_sb *sb = disk_sb->sb;
        struct bch_sb_field *f;
@@ -347,11 +350,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
            BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";
 
-       if (!BCH_SB_META_REPLICAS_HAVE(sb) ||
-           BCH_SB_META_REPLICAS_HAVE(sb) >
-           BCH_SB_META_REPLICAS_WANT(sb))
-               return "Invalid number of metadata replicas";
-
        if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
            BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";
@@ -360,11 +358,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
            BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";
 
-       if (!BCH_SB_DATA_REPLICAS_HAVE(sb) ||
-           BCH_SB_DATA_REPLICAS_HAVE(sb) >
-           BCH_SB_DATA_REPLICAS_WANT(sb))
-               return "Invalid number of data replicas";
-
        if (!BCH_SB_BTREE_NODE_SIZE(sb))
                return "Btree node size not set";
 
@@ -419,7 +412,11 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
            mi.bucket_size * mi.nbuckets)
                return "Invalid superblock: device too small";
 
-       err = bch2_validate_journal_layout(sb, mi);
+       err = bch2_sb_validate_journal(sb, mi);
+       if (err)
+               return err;
+
+       err = bch2_sb_validate_replicas(sb);
        if (err)
                return err;
 
@@ -464,8 +461,6 @@ static void bch2_sb_update(struct bch_fs *c)
        c->sb.btree_node_size   = BCH_SB_BTREE_NODE_SIZE(src);
        c->sb.nr_devices        = src->nr_devices;
        c->sb.clean             = BCH_SB_CLEAN(src);
-       c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src);
-       c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src);
        c->sb.str_hash_type     = BCH_SB_STR_HASH_TYPE(src);
        c->sb.encryption_type   = BCH_SB_ENCRYPTION_TYPE(src);
        c->sb.time_base_lo      = le64_to_cpu(src->time_base_lo);
@@ -517,6 +512,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
        unsigned journal_u64s = journal_buckets
                ? le32_to_cpu(journal_buckets->field.u64s)
                : 0;
+       int ret;
 
        lockdep_assert_held(&c->sb_lock);
 
@@ -524,8 +520,12 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
                return -ENOMEM;
 
        __copy_super(c->disk_sb, src);
-       bch2_sb_update(c);
 
+       ret = bch2_sb_replicas_to_cpu_replicas(c);
+       if (ret)
+               return ret;
+
+       bch2_sb_update(c);
        return 0;
 }
 
@@ -743,6 +743,7 @@ void bch2_write_super(struct bch_fs *c)
        struct closure *cl = &c->sb_write;
        struct bch_dev *ca;
        unsigned i, super_idx = 0;
+       const char *err;
        bool wrote;
 
        lockdep_assert_held(&c->sb_lock);
@@ -754,7 +755,16 @@ void bch2_write_super(struct bch_fs *c)
        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);
 
-       if (c->opts.nochanges)
+       for_each_online_member(ca, c, i) {
+               err = bch2_sb_validate(&ca->disk_sb);
+               if (err) {
+                       bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
+                       goto out;
+               }
+       }
+
+       if (c->opts.nochanges ||
+           test_bit(BCH_FS_ERROR, &c->flags))
                goto out;
 
        do {
@@ -771,40 +781,482 @@ out:
        bch2_sb_update(c);
 }
 
-void bch2_check_mark_super_slowpath(struct bch_fs *c, const struct bkey_i *k,
-                                  bool meta)
+/* replica information: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+       return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i)                                        \
+       for (_i = (_r)->entries;                                        \
+            (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+            (_i) = replicas_entry_next(_i))
+
+static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
+                                       unsigned *nr,
+                                       unsigned *bytes,
+                                       unsigned *max_dev)
+{
+       struct bch_replicas_entry *i;
+       unsigned j;
+
+       *nr     = 0;
+       *bytes  = sizeof(*r);
+       *max_dev = 0;
+
+       if (!r)
+               return;
+
+       for_each_replicas_entry(r, i) {
+               for (j = 0; j < i->nr; j++)
+                       *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
+               (*nr)++;
+       }
+
+       *bytes = (void *) i - (void *) r;
+}
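
The replicas section packs variable-length entries back to back, and the walk above stops at the section end or at a zero data type byte. A standalone sketch of the same layout and traversal, with the struct inferred from this patch's accessors (one byte of data type, one byte device count, then that many device indexes):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct entry {
	uint8_t	data_type;
	uint8_t	nr;
	uint8_t	devs[];		/* nr device indexes follow */
};

static struct entry *entry_next(struct entry *e)
{
	return (struct entry *)
		((uint8_t *) e + offsetof(struct entry, devs) + e->nr);
}

int main(void)
{
	/* two packed entries: type 3 on devs {0, 2}, type 4 on dev {1} */
	uint8_t buf[] = { 3, 2, 0, 2, 4, 1, 1, 0 };
	struct entry *e;

	for (e = (struct entry *) buf; e->data_type; e = entry_next(e))
		printf("type %u on %u device(s)\n", e->data_type, e->nr);
	return 0;
}
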
+
+static struct bch_replicas_cpu *
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
+{
+       struct bch_replicas_cpu *cpu_r;
+       unsigned i, nr, bytes, max_dev, entry_size;
+
+       bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+
+       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+               DIV_ROUND_UP(max_dev + 1, 8);
+
+       cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
+                       nr * entry_size, GFP_NOIO);
+       if (!cpu_r)
+               return NULL;
+
+       cpu_r->nr               = nr;
+       cpu_r->entry_size       = entry_size;
+
+       if (nr) {
+               struct bch_replicas_cpu_entry *dst =
+                       cpu_replicas_entry(cpu_r, 0);
+               struct bch_replicas_entry *src = sb_r->entries;
+
+               while (dst < cpu_replicas_entry(cpu_r, nr)) {
+                       dst->data_type = src->data_type;
+                       for (i = 0; i < src->nr; i++)
+                               replicas_set_dev(dst, src->devs[i]);
+
+                       src     = replicas_entry_next(src);
+                       dst     = (void *) dst + entry_size;
+               }
+       }
+
+       eytzinger0_sort(cpu_r->entries,
+                       cpu_r->nr,
+                       cpu_r->entry_size,
+                       memcmp, NULL);
+       return cpu_r;
+}
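
Both the superblock conversion above and the gc path below leave their tables in eytzinger order: a sorted array stored in BFS layout, so the first few levels of the implicit tree sit in the first cache lines. Lookups then go through eytzinger0_find() from eytzinger.h, which this diff does not show; conceptually the search walks the tree, assuming the usual 0-based convention where node i has children 2*i+1 and 2*i+2:

#include <stddef.h>
#include <string.h>

/* conceptual eytzinger0 lookup over fixed-size records; returns nr
 * when the key is absent, matching how callers below compare the
 * result against r->nr */
static size_t eytz0_find(const void *base, size_t nr, size_t size,
			 const void *search)
{
	size_t i = 0;

	while (i < nr) {
		int cmp = memcmp(search, (const char *) base + i * size, size);

		if (!cmp)
			return i;
		i = 2 * i + (cmp < 0 ? 1 : 2);
	}
	return nr;
}
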
+
+static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+       struct bch_sb_field_replicas *sb_r;
+       struct bch_replicas_cpu *cpu_r, *old_r;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       sb_r    = bch2_sb_get_replicas(c->disk_sb);
+       cpu_r   = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+       if (!cpu_r)
+               return -ENOMEM;
+
+       old_r = c->replicas;
+       rcu_assign_pointer(c->replicas, cpu_r);
+       if (old_r)
+               kfree_rcu(old_r, rcu);
+
+       return 0;
+}
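
This function and bch2_replicas_gc_end() below share the standard RCU publish pattern: build the replacement table off to the side while holding sb_lock, swap it in with rcu_assign_pointer(), then hand the old table to kfree_rcu() so lock-free readers drain before it is freed. Schematically, with build_table() and table_lookup() as hypothetical stand-ins:

/* writer, under c->sb_lock */
new = build_table(...);			/* hypothetical */
old = rcu_dereference_protected(c->replicas,
				lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, new);
kfree_rcu(old, rcu);

/* reader, no locks taken */
rcu_read_lock();
found = table_lookup(rcu_dereference(c->replicas), key);	/* hypothetical */
rcu_read_unlock();
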
+
+/*
+ * for when gc of replica information is in progress:
+ */
+static int bch2_update_gc_replicas(struct bch_fs *c,
+                                  struct bch_replicas_cpu *gc_r,
+                                  struct bkey_s_c_extent e,
+                                  enum bch_data_types data_type)
 {
-       struct bch_member *mi;
-       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
        const struct bch_extent_ptr *ptr;
-       unsigned nr_replicas = 0;
+       struct bch_replicas_cpu_entry *new_e;
+       struct bch_replicas_cpu *new;
+       unsigned i, nr, entry_size, max_dev = 0;
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached)
+                       max_dev = max_t(unsigned, max_dev, ptr->dev);
+
+       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+               DIV_ROUND_UP(max_dev + 1, 8);
+       entry_size = max(entry_size, gc_r->entry_size);
+       nr = gc_r->nr + 1;
+
+       new = kzalloc(sizeof(struct bch_replicas_cpu) +
+                     nr * entry_size, GFP_NOIO);
+       if (!new)
+               return -ENOMEM;
+
+       new->nr         = nr;
+       new->entry_size = entry_size;
+
+       for (i = 0; i < gc_r->nr; i++)
+               memcpy(cpu_replicas_entry(new, i),
+                      cpu_replicas_entry(gc_r, i),
+                      gc_r->entry_size);
+
+       new_e = cpu_replicas_entry(new, nr - 1);
+       new_e->data_type = data_type;
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached)
+                       replicas_set_dev(new_e, ptr->dev);
+
+       eytzinger0_sort(new->entries,
+                       new->nr,
+                       new->entry_size,
+                       memcmp, NULL);
+
+       rcu_assign_pointer(c->replicas_gc, new);
+       kfree_rcu(gc_r, rcu);
+       return 0;
+}
+
+int bch2_check_mark_super_slowpath(struct bch_fs *c, struct bkey_s_c_extent e,
+                                  enum bch_data_types data_type)
+{
+       struct bch_replicas_cpu *gc_r;
+       const struct bch_extent_ptr *ptr;
+       struct bch_sb_field_replicas *sb_r;
+       struct bch_replicas_entry *new_entry;
+       unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
+       int ret = 0;
 
        mutex_lock(&c->sb_lock);
 
+       gc_r = rcu_dereference_protected(c->replicas_gc,
+                                        lockdep_is_held(&c->sb_lock));
+       if (gc_r &&
+           !replicas_has_extent(gc_r, e, data_type)) {
+               ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
+               if (ret)
+                       goto err;
+       }
+
        /* recheck, might have raced */
-       if (bch2_check_super_marked(c, k, meta)) {
+       if (bch2_sb_has_replicas(c, e, data_type)) {
                mutex_unlock(&c->sb_lock);
-               return;
+               return 0;
        }
 
-       mi = bch2_sb_get_members(c->disk_sb)->members;
+       new_entry_bytes = sizeof(struct bch_replicas_entry) +
+               bch2_extent_nr_dirty_ptrs(e.s_c);
+
+       sb_r = bch2_sb_get_replicas(c->disk_sb);
+
+       bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+
+       new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
+
+       sb_r = bch2_fs_sb_resize_replicas(c,
+                       DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
+                                    sizeof(u64)));
+       if (!sb_r) {
+               ret = -ENOSPC;
+               goto err;
+       }
+
+       new_entry = (void *) sb_r + bytes;
+       new_entry->data_type = data_type;
+       new_entry->nr = 0;
 
        extent_for_each_ptr(e, ptr)
-               if (!ptr->cached) {
-                       (meta
-                        ? SET_BCH_MEMBER_HAS_METADATA
-                        : SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true);
-                       nr_replicas++;
+               if (!ptr->cached)
+                       new_entry->devs[new_entry->nr++] = ptr->dev;
+
+       ret = bch2_sb_replicas_to_cpu_replicas(c);
+       if (ret) {
+               memset(new_entry, 0,
+                      vstruct_end(&sb_r->field) - (void *) new_entry);
+               goto err;
+       }
+
+       bch2_write_super(c);
+err:
+       mutex_unlock(&c->sb_lock);
+       return ret;
+}
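
The corresponding fast path is bch2_check_mark_super() in super-io.h below: a lock-free RCU lookup that only drops into this slowpath, which takes sb_lock and rechecks before paying for a superblock rewrite. The same check, lock, recheck shape in a self-contained userspace form with illustrative names:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool marked;

static int mark_slowpath(void)
{
	pthread_mutex_lock(&lock);
	/* recheck, might have raced with another marker */
	if (!atomic_load(&marked)) {
		/* ... the expensive update goes here ... */
		atomic_store(&marked, true);
	}
	pthread_mutex_unlock(&lock);
	return 0;
}

int mark(void)
{
	if (atomic_load(&marked))	/* lock-free fast path */
		return 0;
	return mark_slowpath();
}
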
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *c,
+                                             struct bch_dev *dev_to_offline)
+{
+       struct bch_replicas_cpu_entry *e;
+       struct bch_replicas_cpu *r;
+       unsigned i, dev, dev_slots, nr_online, nr_offline;
+       struct replicas_status ret;
+
+       memset(&ret, 0, sizeof(ret));
+
+       for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
+               ret.replicas[i].nr_online = UINT_MAX;
+
+       rcu_read_lock();
+       r = rcu_dereference(c->replicas);
+       dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
+
+       for (i = 0; i < r->nr; i++) {
+               e = cpu_replicas_entry(r, i);
+
+               BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
+
+               nr_online = nr_offline = 0;
+
+               for (dev = 0; dev < dev_slots; dev++) {
+                       if (!replicas_test_dev(e, dev))
+                               continue;
+
+                       if (bch2_dev_is_online(c->devs[dev]) &&
+                           c->devs[dev] != dev_to_offline)
+                               nr_online++;
+                       else
+                               nr_offline++;
                }
 
-       nr_replicas = min_t(unsigned, nr_replicas,
-                           (meta
-                            ? BCH_SB_META_REPLICAS_HAVE
-                            : BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb));
-       (meta
-        ? SET_BCH_SB_META_REPLICAS_HAVE
-        : SET_BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb, nr_replicas);
+               ret.replicas[e->data_type].nr_online =
+                       min(ret.replicas[e->data_type].nr_online,
+                           nr_online);
+
+               ret.replicas[e->data_type].nr_offline =
+                       max(ret.replicas[e->data_type].nr_offline,
+                           nr_offline);
+       }
+
+       rcu_read_unlock();
+
+       return ret;
+}
+
+struct replicas_status bch2_replicas_status(struct bch_fs *c)
+{
+       return __bch2_replicas_status(c, NULL);
+}
+
+unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
+{
+       struct replicas_status s = bch2_replicas_status(c);
+
+       return meta
+               ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
+                     s.replicas[BCH_DATA_BTREE].nr_online)
+               : s.replicas[BCH_DATA_USER].nr_online;
+}
+
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct bch_replicas_cpu_entry *e;
+       struct bch_replicas_cpu *r;
+       unsigned i, ret = 0;
+
+       rcu_read_lock();
+       r = rcu_dereference(c->replicas);
+
+       if (ca->dev_idx >= replicas_dev_slots(r))
+               goto out;
+
+       for (i = 0; i < r->nr; i++) {
+               e = cpu_replicas_entry(r, i);
+
+               if (replicas_test_dev(e, ca->dev_idx))
+                       ret |= 1 << e->data_type;
+       }
+out:
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
+{
+       struct bch_sb_field_members *mi;
+       struct bch_sb_field_replicas *sb_r;
+       struct bch_replicas_cpu *cpu_r = NULL;
+       struct bch_replicas_entry *e;
+       const char *err;
+       unsigned i;
+
+       mi      = bch2_sb_get_members(sb);
+       sb_r    = bch2_sb_get_replicas(sb);
+       if (!sb_r)
+               return NULL;
+
+       for_each_replicas_entry(sb_r, e) {
+               err = "invalid replicas entry: invalid data type";
+               if (e->data_type >= BCH_DATA_NR)
+                       goto err;
+
+               err = "invalid replicas entry: too many devices";
+               if (e->nr >= BCH_REPLICAS_MAX)
+                       goto err;
+
+               err = "invalid replicas entry: invalid device";
+               for (i = 0; i < e->nr; i++)
+                       if (!bch2_dev_exists(sb, mi, e->devs[i]))
+                               goto err;
+       }
+
+       err = "cannot allocate memory";
+       cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+       if (!cpu_r)
+               goto err;
+
+       sort_cmp_size(cpu_r->entries,
+                     cpu_r->nr,
+                     cpu_r->entry_size,
+                     memcmp, NULL);
+
+       for (i = 0; i + 1 < cpu_r->nr; i++) {
+               struct bch_replicas_cpu_entry *l =
+                       cpu_replicas_entry(cpu_r, i);
+               struct bch_replicas_cpu_entry *r =
+                       cpu_replicas_entry(cpu_r, i + 1);
+
+               BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
+
+               err = "duplicate replicas entry";
+               if (!memcmp(l, r, cpu_r->entry_size))
+                       goto err;
+       }
+
+       err = NULL;
+err:
+       kfree(cpu_r);
+       return err;
+}
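
Note that the table is re-sorted here with sort_cmp_size() into plain linear order before the duplicate scan: the constructor left it in eytzinger order, where equal entries are not necessarily adjacent. Once linearly sorted, one pass over neighbouring pairs suffices; the check distilled:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* after a byte-wise sort, duplicate records sit next to each other */
static bool has_dup(const void *base, size_t nr, size_t size)
{
	size_t i;

	for (i = 0; i + 1 < nr; i++)
		if (!memcmp((const char *) base + i * size,
			    (const char *) base + (i + 1) * size, size))
			return true;
	return false;
}
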
+
+int bch2_replicas_gc_end(struct bch_fs *c, int err)
+{
+       struct bch_sb_field_replicas *sb_r;
+       struct bch_replicas_cpu *r, *old_r;
+       struct bch_replicas_entry *dst_e;
+       size_t i, j, bytes, dev_slots;
+       int ret = 0;
+
+       lockdep_assert_held(&c->replicas_gc_lock);
+
+       mutex_lock(&c->sb_lock);
+
+       r = rcu_dereference_protected(c->replicas_gc,
+                                     lockdep_is_held(&c->sb_lock));
+
+       if (err) {
+               rcu_assign_pointer(c->replicas_gc, NULL);
+               kfree_rcu(r, rcu);
+               goto err;
+       }
+
+       dev_slots = replicas_dev_slots(r);
+
+       bytes = sizeof(struct bch_sb_field_replicas);
+
+       for (i = 0; i < r->nr; i++) {
+               struct bch_replicas_cpu_entry *e =
+                       cpu_replicas_entry(r, i);
+
+               bytes += sizeof(struct bch_replicas_entry);
+               for (j = 0; j < r->entry_size - 1; j++)
+                       bytes += hweight8(e->devs[j]);
+       }
+
+       sb_r = bch2_fs_sb_resize_replicas(c,
+                       DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+       if (!sb_r) {
+               ret = -ENOSPC;
+               goto err;
+       }
+
+       memset(&sb_r->entries, 0,
+              vstruct_end(&sb_r->field) -
+              (void *) &sb_r->entries);
+
+       dst_e = sb_r->entries;
+       for (i = 0; i < r->nr; i++) {
+               struct bch_replicas_cpu_entry *src_e =
+                       cpu_replicas_entry(r, i);
+
+               dst_e->data_type = src_e->data_type;
+
+               for (j = 0; j < dev_slots; j++)
+                       if (replicas_test_dev(src_e, j))
+                               dst_e->devs[dst_e->nr++] = j;
+
+               dst_e = replicas_entry_next(dst_e);
+       }
+
+       old_r = rcu_dereference_protected(c->replicas,
+                                         lockdep_is_held(&c->sb_lock));
+       rcu_assign_pointer(c->replicas, r);
+       rcu_assign_pointer(c->replicas_gc, NULL);
+       kfree_rcu(old_r, rcu);
 
        bch2_write_super(c);
+err:
        mutex_unlock(&c->sb_lock);
+       return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+       struct bch_replicas_cpu *r, *src;
+       unsigned i;
+
+       lockdep_assert_held(&c->replicas_gc_lock);
+
+       mutex_lock(&c->sb_lock);
+       BUG_ON(c->replicas_gc);
+
+       src = rcu_dereference_protected(c->replicas,
+                                       lockdep_is_held(&c->sb_lock));
+
+       r = kzalloc(sizeof(struct bch_replicas_cpu) +
+                   src->nr * src->entry_size, GFP_NOIO);
+       if (!r) {
+               mutex_unlock(&c->sb_lock);
+               return -ENOMEM;
+       }
+
+       r->entry_size = src->entry_size;
+       r->nr = 0;
+
+       for (i = 0; i < src->nr; i++) {
+               struct bch_replicas_cpu_entry *dst_e =
+                       cpu_replicas_entry(r, r->nr);
+               struct bch_replicas_cpu_entry *src_e =
+                       cpu_replicas_entry(src, i);
+
+               if (!(src_e->data_type & typemask)) {
+                       memcpy(dst_e, src_e, r->entry_size);
+                       r->nr++;
+               }
+       }
+
+       eytzinger0_sort(r->entries,
+                       r->nr,
+                       r->entry_size,
+                       memcmp, NULL);
+
+       rcu_assign_pointer(c->replicas_gc, r);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
 }
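
That is the whole of the new replicas machinery in super-io.c. Its callers are not part of this diff, but gc_start()/gc_end() plus the gc branch in the slowpath imply roughly the following flow; prune_replicas() and walk_keys() are assumed names, not functions from the tree:

int prune_replicas(struct bch_fs *c, unsigned typemask)
{
	int ret;

	/* seed c->replicas_gc with the entries whose types are not
	 * being collected */
	ret = bch2_replicas_gc_start(c, typemask);
	if (ret)
		return ret;

	/* rewalk the keys: bch2_check_mark_super() re-adds every
	 * replica set still referenced to the gc table */
	ret = walk_keys(c);		/* hypothetical */

	/* on success the pruned gc table replaces c->replicas and is
	 * written back out with the superblock */
	return bch2_replicas_gc_end(c, ret);
}
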
index 8f0d82dbe2607aa94cc874209ebaa9c5eed0c832..879fddad940a2dd40b2b5cd86c4221b109e0878f 100644 (file)
@@ -2,6 +2,7 @@
 #define _BCACHE_SUPER_IO_H
 
 #include "extents.h"
+#include "eytzinger.h"
 #include "super_types.h"
 
 #include <asm/byteorder.h>
@@ -40,6 +41,15 @@ bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s)           \
 BCH_SB_FIELD_TYPE(journal);
 BCH_SB_FIELD_TYPE(members);
 BCH_SB_FIELD_TYPE(crypt);
+BCH_SB_FIELD_TYPE(replicas);
+
+static inline bool bch2_dev_exists(struct bch_sb *sb,
+                                  struct bch_sb_field_members *mi,
+                                  unsigned dev)
+{
+       return dev < sb->nr_devices &&
+               !bch2_is_zero(mi->members[dev].uuid.b, sizeof(uuid_le));
+}
 
 static inline bool bch2_sb_test_feature(struct bch_sb *sb,
                                        enum bch_sb_features f)
@@ -91,8 +101,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
                .bucket_size    = le16_to_cpu(mi->bucket_size),
                .state          = BCH_MEMBER_STATE(mi),
                .tier           = BCH_MEMBER_TIER(mi),
-               .has_metadata   = BCH_MEMBER_HAS_METADATA(mi),
-               .has_data       = BCH_MEMBER_HAS_DATA(mi),
                .replacement    = BCH_MEMBER_REPLACEMENT(mi),
                .discard        = BCH_MEMBER_DISCARD(mi),
                .valid          = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
@@ -105,55 +113,116 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
 void bch2_free_super(struct bcache_superblock *);
 int bch2_super_realloc(struct bcache_superblock *, unsigned);
 
-const char *bch2_validate_journal_layout(struct bch_sb *,
+const char *bch2_sb_validate_journal(struct bch_sb *,
                                         struct bch_member_cpu);
-const char *bch2_validate_cache_super(struct bcache_superblock *);
+const char *bch2_sb_validate(struct bcache_superblock *);
 
 const char *bch2_read_super(struct bcache_superblock *,
                           struct bch_opts, const char *);
 void bch2_write_super(struct bch_fs *);
 
-void bch2_check_mark_super_slowpath(struct bch_fs *,
-                                   const struct bkey_i *, bool);
+static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
+                                    unsigned dev)
+{
+       return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
+}
 
-static inline bool bch2_check_super_marked(struct bch_fs *c,
-                                          const struct bkey_i *k, bool meta)
+static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
+                                   unsigned dev)
 {
-       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
-       const struct bch_extent_ptr *ptr;
-       unsigned nr_replicas = 0;
-       bool ret = true;
+       e->devs[dev >> 3] |= 1 << (dev & 7);
+}
 
-       extent_for_each_ptr(e, ptr) {
-               struct bch_dev *ca = c->devs[ptr->dev];
+static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
+{
+       return (r->entry_size -
+               offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
+}
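
On the cpu side each entry carries its devices as a bitmap, bit (dev & 7) of byte (dev >> 3), so membership is a bit test, set equality is a memcmp, and replicas_dev_slots() is simply the bitmap width in bits. A standalone check of the encoding:

#include <assert.h>
#include <stdint.h>

static void set_dev(uint8_t *devs, unsigned dev)
{
	devs[dev >> 3] |= 1 << (dev & 7);
}

static int test_dev(const uint8_t *devs, unsigned dev)
{
	return (devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

int main(void)
{
	uint8_t devs[2] = { 0, 0 };	/* room for 16 devices */

	set_dev(devs, 10);		/* byte 1, bit 2 */
	assert(devs[1] == 0x04);
	assert(test_dev(devs, 10) && !test_dev(devs, 11));
	return 0;
}
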
 
-               if (ptr->cached)
-                       continue;
+static inline struct bch_replicas_cpu_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+       return (void *) r->entries + r->entry_size * i;
+}
 
-               if (!(meta
-                     ? ca->mi.has_metadata
-                     : ca->mi.has_data)) {
-                       ret = false;
-                       break;
+int bch2_check_mark_super_slowpath(struct bch_fs *, struct bkey_s_c_extent,
+                                  enum bch_data_types);
+
+static inline bool replicas_has_extent(struct bch_replicas_cpu *r,
+                                      struct bkey_s_c_extent e,
+                                      enum bch_data_types data_type)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_replicas_cpu_entry search = {
+               .data_type = data_type,
+       };
+       unsigned max_dev = 0;
+
+       BUG_ON(!data_type ||
+              data_type == BCH_DATA_SB ||
+              data_type >= BCH_DATA_NR);
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached) {
+                       max_dev = max_t(unsigned, max_dev, ptr->dev);
+                       replicas_set_dev(&search, ptr->dev);
                }
 
-               nr_replicas++;
-       }
+       return max_dev < replicas_dev_slots(r) &&
+               eytzinger0_find(r->entries, r->nr,
+                               r->entry_size,
+                               memcmp, &search) < r->nr;
+}
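
The zero-initialized search key is what lets memcmp act as the comparator here: entries are fixed-size and zero padded, so identical replica sets are byte-for-byte equal, and the max_dev guard rejects any extent whose bitmap would overflow r->entry_size before the search runs. In miniature, with illustrative field widths:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct key {
	uint8_t	data_type;
	uint8_t	devs[3];	/* zero-padded device bitmap */
};

int main(void)
{
	struct key a = { .data_type = 4 }, b = { .data_type = 4 };

	a.devs[10 >> 3] |= 1 << (10 & 7);	/* dev 10 */
	b.devs[10 >> 3] |= 1 << (10 & 7);

	printf("same replica set: %d\n", !memcmp(&a, &b, sizeof(a)));
	return 0;
}
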
+
+static inline bool bch2_sb_has_replicas(struct bch_fs *c,
+                                       struct bkey_s_c_extent e,
+                                       enum bch_data_types data_type)
+{
+       bool ret;
 
-       if (nr_replicas <
-           (meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have))
-               ret = false;
+       rcu_read_lock();
+       ret = replicas_has_extent(rcu_dereference(c->replicas),
+                                 e, data_type);
+       rcu_read_unlock();
 
        return ret;
 }
 
-static inline void bch2_check_mark_super(struct bch_fs *c,
-                                        const struct bkey_i *k, bool meta)
+static inline int bch2_check_mark_super(struct bch_fs *c,
+                                       struct bkey_s_c_extent e,
+                                       enum bch_data_types data_type)
 {
-       if (bch2_check_super_marked(c, k, meta))
-               return;
+       struct bch_replicas_cpu *gc_r;
+       bool marked;
 
-       bch2_check_mark_super_slowpath(c, k, meta);
+       rcu_read_lock();
+       marked = replicas_has_extent(rcu_dereference(c->replicas),
+                                    e, data_type) &&
+               (!(gc_r = rcu_dereference(c->replicas_gc)) ||
+                replicas_has_extent(gc_r, e, data_type));
+       rcu_read_unlock();
+
+       if (marked)
+               return 0;
+
+       return bch2_check_mark_super_slowpath(c, e, data_type);
 }
 
+struct replicas_status {
+       struct {
+               unsigned        nr_online;
+               unsigned        nr_offline;
+       }                       replicas[BCH_DATA_NR];
+};
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *,
+                                             struct bch_dev *);
+struct replicas_status bch2_replicas_status(struct bch_fs *);
+
+unsigned bch2_replicas_online(struct bch_fs *, bool);
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
+
+int bch2_replicas_gc_end(struct bch_fs *, int);
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+
 #endif /* _BCACHE_SUPER_IO_H */
index 2a3947e26e8812fb0db51fcc9e69de88f211feb0..692eb417dd478c2d9273f3d11744fc4cf8595ccd 100644 (file)
@@ -224,6 +224,9 @@ static void __bch2_fs_read_only(struct bch_fs *c)
                bch2_dev_allocator_stop(ca);
 
        bch2_fs_journal_stop(&c->journal);
+
+       for_each_member_device(ca, c, i)
+               bch2_dev_allocator_remove(c, ca);
 }
 
 static void bch2_writes_disabled(struct percpu_ref *writes)
@@ -330,6 +333,10 @@ const char *bch2_fs_read_write(struct bch_fs *c)
            c->state != BCH_FS_RO)
                goto out;
 
+       for_each_rw_member(ca, c, i)
+               bch2_dev_allocator_add(c, ca);
+       bch2_recalc_capacity(c);
+
        err = "error starting allocator thread";
        for_each_rw_member(ca, c, i)
                if (bch2_dev_allocator_start(ca)) {
@@ -484,6 +491,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        mutex_init(&c->state_lock);
        mutex_init(&c->sb_lock);
+       mutex_init(&c->replicas_gc_lock);
        mutex_init(&c->btree_cache_lock);
        mutex_init(&c->bucket_lock);
        mutex_init(&c->btree_root_lock);
@@ -603,7 +611,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        mi = bch2_sb_get_members(c->disk_sb);
        for (i = 0; i < c->sb.nr_devices; i++)
-               if (!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
+               if (bch2_dev_exists(c->disk_sb, mi, i) &&
                    bch2_dev_alloc(c, i))
                        goto err;
 
@@ -681,12 +689,16 @@ static const char *__bch2_fs_start(struct bch_fs *c)
        const char *err = "cannot allocate memory";
        struct bch_sb_field_members *mi;
        struct bch_dev *ca;
-       unsigned i, id;
-       time64_t now;
        LIST_HEAD(journal);
        struct jset *j;
+       struct closure cl;
+       u64 journal_seq = 0;
+       time64_t now;
+       unsigned i;
        int ret = -EINVAL;
 
+       closure_init_stack(&cl);
+
        BUG_ON(c->state != BCH_FS_STARTING);
 
        mutex_lock(&c->sb_lock);
@@ -694,6 +706,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                bch2_sb_from_fs(c, ca);
        mutex_unlock(&c->sb_lock);
 
+       for_each_rw_member(ca, c, i)
+               bch2_dev_allocator_add(c, ca);
+       bch2_recalc_capacity(c);
+
        if (BCH_SB_INITIALIZED(c->disk_sb)) {
                ret = bch2_journal_read(c, &journal);
                if (ret)
@@ -704,44 +720,45 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
                c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
 
-               err = "error reading priorities";
-               for_each_readable_member(ca, c, i) {
-                       ret = bch2_prio_read(ca);
-                       if (ret) {
-                               percpu_ref_put(&ca->io_ref);
-                               goto err;
-                       }
-               }
-
-               for (id = 0; id < BTREE_ID_NR; id++) {
+               for (i = 0; i < BTREE_ID_NR; i++) {
                        unsigned level;
                        struct bkey_i *k;
 
-                       err = "bad btree root";
-                       k = bch2_journal_find_btree_root(c, j, id, &level);
-                       if (!k && id == BTREE_ID_EXTENTS)
+                       err = "missing btree root";
+                       k = bch2_journal_find_btree_root(c, j, i, &level);
+                       if (!k && i < BTREE_ID_ALLOC)
                                goto err;
-                       if (!k) {
-                               pr_debug("missing btree root: %d", id);
+
+                       if (!k)
                                continue;
-                       }
 
                        err = "error reading btree root";
-                       if (bch2_btree_root_read(c, id, k, level))
+                       if (bch2_btree_root_read(c, i, k, level))
                                goto err;
                }
 
-               bch_verbose(c, "starting mark and sweep:");
+               err = "error reading allocation information";
+               ret = bch2_alloc_read(c, &journal);
+               if (ret)
+                       goto err;
 
+               bch_verbose(c, "starting mark and sweep:");
                err = "error in recovery";
                ret = bch2_initial_gc(c, &journal);
                if (ret)
                        goto err;
+               bch_verbose(c, "mark and sweep done");
 
                if (c->opts.noreplay)
                        goto recovery_done;
 
-               bch_verbose(c, "mark and sweep done");
+               err = "cannot allocate new btree root";
+               for (i = 0; i < BTREE_ID_NR; i++)
+                       if (!c->btree_roots[i].b &&
+                           bch2_btree_root_alloc(c, i, &cl))
+                               goto err;
+
+               closure_sync(&cl);
 
                /*
                 * bch2_journal_start() can't happen sooner, or btree_gc_finish()
@@ -758,12 +775,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                        }
 
                bch_verbose(c, "starting journal replay:");
-
                err = "journal replay failed";
                ret = bch2_journal_replay(c, &journal);
                if (ret)
                        goto err;
-
                bch_verbose(c, "journal replay done");
 
                if (c->opts.norecovery)
@@ -774,23 +789,21 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                ret = bch2_fsck(c, !c->opts.nofsck);
                if (ret)
                        goto err;
+               bch_verbose(c, "fsck done");
 
                for_each_rw_member(ca, c, i)
-                       if (ca->need_prio_write) {
-                               ret = bch2_prio_write(ca);
+                       if (ca->need_alloc_write) {
+                               ret = bch2_alloc_write(c, ca, &journal_seq);
                                if (ret) {
                                        percpu_ref_put(&ca->io_ref);
                                        goto err;
                                }
                        }
 
-               bch_verbose(c, "fsck done");
+               bch2_journal_flush_seq(&c->journal, journal_seq);
        } else {
                struct bch_inode_unpacked inode;
                struct bkey_inode_buf packed_inode;
-               struct closure cl;
-
-               closure_init_stack(&cl);
 
                bch_notice(c, "initializing new filesystem");
 
@@ -805,6 +818,11 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                                goto err;
                        }
 
+               err = "cannot allocate new btree root";
+               for (i = 0; i < BTREE_ID_NR; i++)
+                       if (bch2_btree_root_alloc(c, i, &cl))
+                               goto err;
+
                /*
                 * journal_res_get() will crash if called before this has
                 * set up the journal.pin FIFO and journal.cur pointer:
@@ -819,13 +837,6 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                                goto err;
                        }
 
-               err = "cannot allocate new btree root";
-               for (id = 0; id < BTREE_ID_NR; id++)
-                       if (bch2_btree_root_alloc(c, id, &cl)) {
-                               closure_sync(&cl);
-                               goto err;
-                       }
-
                /* Wait for new btree roots to be written: */
                closure_sync(&cl);
 
@@ -877,6 +888,8 @@ out:
        bch2_journal_entries_free(&journal);
        return err;
 err:
+       closure_sync(&cl);
+
        switch (ret) {
        case BCH_FSCK_ERRORS_NOT_FIXED:
                bch_err(c, "filesystem contains errors: please report this to the developers");
@@ -940,10 +953,7 @@ static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
        if (uuid_le_cmp(fs->uuid, sb->uuid))
                return "device not a member of filesystem";
 
-       if (sb->dev_idx >= newest->nr_devices)
-               return "device has invalid dev_idx";
-
-       if (bch2_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
+       if (!bch2_dev_exists(newest, mi, sb->dev_idx))
                return "device has been removed";
 
        if (fs->block_size != sb->block_size)
@@ -981,9 +991,6 @@ static void bch2_dev_free(struct bch_dev *ca)
        free_percpu(ca->sectors_written);
        bioset_exit(&ca->replica_set);
        free_percpu(ca->usage_percpu);
-       kvpfree(ca->disk_buckets, bucket_bytes(ca));
-       kfree(ca->prio_buckets);
-       kfree(ca->bio_prio);
        kvpfree(ca->buckets,     ca->mi.nbuckets * sizeof(struct bucket));
        kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
        free_heap(&ca->copygc_heap);
@@ -1011,7 +1018,7 @@ static void __bch2_dev_offline(struct bch_dev *ca)
 
        lockdep_assert_held(&c->state_lock);
 
-       __bch2_dev_read_only(ca->fs, ca);
+       __bch2_dev_read_only(c, ca);
 
        reinit_completion(&ca->offline_complete);
        percpu_ref_kill(&ca->io_ref);
@@ -1061,7 +1068,7 @@ static int bch2_dev_sysfs_online(struct bch_dev *ca)
                return 0;
 
        if (!ca->kobj.state_in_sysfs) {
-               ret = kobject_add(&ca->kobj, &ca->fs->kobj,
+               ret = kobject_add(&ca->kobj, &c->kobj,
                                  "dev-%u", ca->dev_idx);
                if (ret)
                        return ret;
@@ -1087,7 +1094,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
        struct bch_member *member;
        size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
        size_t heap_size;
-       unsigned i;
+       unsigned i, btree_node_reserve_buckets;
        struct bch_dev *ca;
 
        if (bch2_fs_init_fault("dev_alloc"))
@@ -1107,8 +1114,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
        ca->dev_idx = dev_idx;
 
        spin_lock_init(&ca->freelist_lock);
-       spin_lock_init(&ca->prio_buckets_lock);
-       mutex_init(&ca->prio_write_lock);
        bch2_dev_moving_gc_init(ca);
 
        INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work);
@@ -1134,12 +1139,16 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
        free_inc_reserve = movinggc_reserve / 2;
        heap_size = movinggc_reserve * 8;
 
+       btree_node_reserve_buckets =
+               DIV_ROUND_UP(BTREE_NODE_RESERVE,
+                            ca->mi.bucket_size / c->sb.btree_node_size);
+
        if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
                            0, GFP_KERNEL) ||
            percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
-           !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
-           !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
+           !init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
+                      GFP_KERNEL) ||
            !init_fifo(&ca->free[RESERVE_MOVINGGC],
                       movinggc_reserve, GFP_KERNEL) ||
            !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
@@ -1152,18 +1161,12 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
            !(ca->buckets       = kvpmalloc(ca->mi.nbuckets *
                                            sizeof(struct bucket),
                                            GFP_KERNEL|__GFP_ZERO)) ||
-           !(ca->prio_buckets  = kzalloc(sizeof(u64) * prio_buckets(ca) *
-                                         2, GFP_KERNEL)) ||
-           !(ca->disk_buckets  = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
            !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
-           !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
            bioset_init(&ca->replica_set, 4,
                        offsetof(struct bch_write_bio, bio)) ||
            !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
                goto err;
 
-       ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
-
        total_reserve = ca->free_inc.size;
        for (i = 0; i < RESERVE_NR; i++)
                total_reserve += ca->free[i].size;
@@ -1232,53 +1235,48 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
 
        lg_local_lock(&c->usage_lock);
        if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
-               bch2_mark_dev_metadata(ca->fs, ca);
+               bch2_mark_dev_metadata(c, ca);
        lg_local_unlock(&c->usage_lock);
 
+       if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+               struct bch_sb_field_journal *journal_buckets =
+                       bch2_sb_get_journal(ca->disk_sb.sb);
+               bool has_journal =
+                       bch2_nr_journal_buckets(journal_buckets) >=
+                       BCH_JOURNAL_BUCKETS_MIN;
+
+               bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
+               bch2_dev_group_add(&c->all_devs, ca);
+
+               if (has_journal)
+                       bch2_dev_group_add(&c->journal.devs, ca);
+       }
+
        percpu_ref_reinit(&ca->io_ref);
        return 0;
 }
 
 /* Device management: */
 
-bool bch2_fs_may_start(struct bch_fs *c, int flags)
+static bool have_enough_devs(struct bch_fs *c,
+                            struct replicas_status s,
+                            unsigned flags)
 {
-       struct bch_sb_field_members *mi;
-       unsigned meta_missing = 0;
-       unsigned data_missing = 0;
-       bool degraded = false;
-       unsigned i;
-
-       mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb);
-
-       for (i = 0; i < c->disk_sb->nr_devices; i++)
-               if (!c->devs[i] &&
-                   !bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
-                       degraded = true;
-                       if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
-                               meta_missing++;
-                       if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
-                               data_missing++;
-               }
-       mutex_unlock(&c->sb_lock);
-
-       if (degraded &&
-           !(flags & BCH_FORCE_IF_DEGRADED))
-               return false;
-
-       if (meta_missing &&
+       if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
+            s.replicas[BCH_DATA_BTREE].nr_offline) &&
            !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
                return false;
 
-       if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
+       if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
+            !s.replicas[BCH_DATA_BTREE].nr_online) &&
            !(flags & BCH_FORCE_IF_METADATA_LOST))
                return false;
 
-       if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+       if (s.replicas[BCH_DATA_USER].nr_offline &&
+           !(flags & BCH_FORCE_IF_DATA_DEGRADED))
                return false;
 
-       if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
+       if (!s.replicas[BCH_DATA_USER].nr_online &&
            !(flags & BCH_FORCE_IF_DATA_LOST))
                return false;
 
@@ -1297,40 +1295,80 @@ bool bch2_fs_may_start(struct bch_fs *c, int flags)
 bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
                            enum bch_member_state new_state, int flags)
 {
-       lockdep_assert_held(&c->state_lock);
-
-       if (new_state == BCH_MEMBER_STATE_RW)
-               return true;
+       struct replicas_status s;
+       struct bch_dev *ca2;
+       int i, nr_rw = 0, required;
 
-       if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-               return true;
+       lockdep_assert_held(&c->state_lock);
 
-       /*
-        * If the device is already offline - whatever is going on with it can't
-        * possible make the FS need to go RO:
-        */
-       if (!bch2_dev_is_online(ca))
+       switch (new_state) {
+       case BCH_MEMBER_STATE_RW:
                return true;
+       case BCH_MEMBER_STATE_RO:
+               if (ca->mi.state != BCH_MEMBER_STATE_RW)
+                       return true;
+
+               /* do we have enough devices to write to?  */
+               for_each_member_device(ca2, c, i)
+                       nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+
+               required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
+                              ? c->opts.metadata_replicas
+                              : c->opts.metadata_replicas_required,
+                              !(flags & BCH_FORCE_IF_DATA_DEGRADED)
+                              ? c->opts.data_replicas
+                              : c->opts.data_replicas_required);
+
+               return nr_rw - 1 >= required;
+       case BCH_MEMBER_STATE_FAILED:
+       case BCH_MEMBER_STATE_SPARE:
+               if (ca->mi.state != BCH_MEMBER_STATE_RW &&
+                   ca->mi.state != BCH_MEMBER_STATE_RO)
+                       return true;
+
+               /* do we have enough devices to read from?  */
+               s = __bch2_replicas_status(c, ca);
+
+               pr_info("replicas: j %u %u b %u %u d %u %u",
+                       s.replicas[BCH_DATA_JOURNAL].nr_online,
+                       s.replicas[BCH_DATA_JOURNAL].nr_offline,
+
+                       s.replicas[BCH_DATA_BTREE].nr_online,
+                       s.replicas[BCH_DATA_BTREE].nr_offline,
+
+                       s.replicas[BCH_DATA_USER].nr_online,
+                       s.replicas[BCH_DATA_USER].nr_offline);
+
+               return have_enough_devs(c, s, flags);
+       default:
+               BUG();
+       }
+}
 
-       if (ca->mi.has_data &&
-           !(flags & BCH_FORCE_IF_DATA_DEGRADED))
-               return false;
-
-       if (ca->mi.has_data &&
-           c->sb.data_replicas_have <= 1 &&
-           !(flags & BCH_FORCE_IF_DATA_LOST))
-               return false;
+static bool bch2_fs_may_start(struct bch_fs *c, int flags)
+{
+       struct replicas_status s;
+       struct bch_sb_field_members *mi;
+       unsigned i;
 
-       if (ca->mi.has_metadata &&
-           !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
-               return false;
+       if (!c->opts.degraded) {
+               mutex_lock(&c->sb_lock);
+               mi = bch2_sb_get_members(c->disk_sb);
+
+               for (i = 0; i < c->disk_sb->nr_devices; i++)
+                       if (bch2_dev_exists(c->disk_sb, mi, i) &&
+                           !bch2_dev_is_online(c->devs[i]) &&
+                           (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
+                            c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
+                               mutex_unlock(&c->sb_lock);
+                               return false;
+                       }
+               mutex_unlock(&c->sb_lock);
+       }
 
-       if (ca->mi.has_metadata &&
-           c->sb.meta_replicas_have <= 1 &&
-           !(flags & BCH_FORCE_IF_METADATA_LOST))
-               return false;
+       s = bch2_replicas_status(c);
 
-       return true;
+       return have_enough_devs(c, s, flags);
 }
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
@@ -1343,8 +1381,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
         * complete.
         */
        bch2_dev_allocator_stop(ca);
-
-       bch2_dev_group_remove(&c->journal.devs, ca);
+       bch2_dev_allocator_remove(c, ca);
 }
 
 static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
@@ -1353,6 +1390,9 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 
        BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
 
+       bch2_dev_allocator_add(c, ca);
+       bch2_recalc_capacity(c);
+
        if (bch2_dev_allocator_start(ca))
                return "error starting allocator thread";
 
@@ -1411,7 +1451,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 {
        struct bch_sb_field_members *mi;
-       unsigned dev_idx = ca->dev_idx;
+       unsigned dev_idx = ca->dev_idx, data;
        int ret = -EINVAL;
 
        mutex_lock(&c->state_lock);
@@ -1439,19 +1479,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
                goto err;
        }
 
-       if (ca->mi.has_data || ca->mi.has_metadata) {
-               bch_err(ca, "Remove failed, still has data");
+       data = bch2_dev_has_data(c, ca);
+       if (data) {
+               bch_err(ca, "Remove failed, still has data (%x)", data);
                goto err;
        }
 
-       /*
-        * Ok, really doing the remove:
-        * Drop device's prio pointer before removing it from superblock:
-        */
-       spin_lock(&c->journal.lock);
-       c->journal.prio_buckets[dev_idx] = 0;
-       spin_unlock(&c->journal.lock);
-
        bch2_journal_meta(&c->journal);
 
        __bch2_dev_offline(ca);
@@ -1476,6 +1509,7 @@ err:
        return ret;
 }
 
+/* Add new device to running filesystem: */
 int bch2_dev_add(struct bch_fs *c, const char *path)
 {
        struct bcache_superblock sb;
@@ -1490,7 +1524,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        if (err)
                return -EINVAL;
 
-       err = bch2_validate_cache_super(&sb);
+       err = bch2_sb_validate(&sb);
        if (err)
                return -EINVAL;
 
@@ -1514,9 +1548,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
        mi = bch2_sb_get_members(c->disk_sb);
        for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
-               if (dev_idx >= c->sb.nr_devices ||
-                   bch2_is_zero(mi->members[dev_idx].uuid.b,
-                                sizeof(uuid_le)))
+               if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
                        goto have_slot;
 no_slot:
        err = "no slots available in superblock";
@@ -1587,13 +1619,13 @@ err:
        return ret ?: -EINVAL;
 }
 
+/* Hot add existing device to running filesystem: */
 int bch2_dev_online(struct bch_fs *c, const char *path)
 {
        struct bcache_superblock sb = { 0 };
        struct bch_dev *ca;
        unsigned dev_idx;
        const char *err;
-       int ret;
 
        mutex_lock(&c->state_lock);
 
@@ -1616,12 +1648,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
        mutex_unlock(&c->sb_lock);
 
        ca = c->devs[dev_idx];
-       ret = bch2_prio_read(ca);
-       if (ret) {
-               err = "error reading priorities";
-               goto err;
-       }
-
        if (ca->mi.state == BCH_MEMBER_STATE_RW) {
                err = __bch2_dev_read_write(c, ca);
                if (err)
@@ -1656,6 +1682,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
 
 int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
 {
+       unsigned data;
        int ret;
 
        mutex_lock(&c->state_lock);
@@ -1680,8 +1707,9 @@ int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
                return ret;
        }
 
-       if (ca->mi.has_data || ca->mi.has_metadata) {
-               bch_err(ca, "Migrate error: data still present");
+       data = bch2_dev_has_data(c, ca);
+       if (data) {
+               bch_err(ca, "Migrate error: data still present (%x)", data);
                return -EINVAL;
        }
 
@@ -1714,11 +1742,7 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
                if (err)
                        goto err;
 
-               err = "attempting to register backing device";
-               if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
-                       goto err;
-
-               err = bch2_validate_cache_super(&sb[i]);
+               err = bch2_sb_validate(&sb[i]);
                if (err)
                        goto err;
        }
@@ -1790,7 +1814,7 @@ static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb,
        struct bch_fs *c;
        bool allocated_fs = false;
 
-       err = bch2_validate_cache_super(sb);
+       err = bch2_sb_validate(sb);
        if (err)
                return err;
 
@@ -1855,11 +1879,7 @@ const char *bch2_fs_open_incremental(const char *path)
        if (err)
                return err;
 
-       if (!__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
-               err = __bch2_fs_open_incremental(&sb, opts);
-       else
-               err = "not a bcachefs superblock";
-
+       err = __bch2_fs_open_incremental(&sb, opts);
        bch2_free_super(&sb);
 
        return err;
index edfa85b0561461cbeb89edbf088c4a55b3afb768..1986fdd573cb0a180c8e4a2b6951ba98cec4e5f1 100644 (file)
@@ -337,8 +337,8 @@ SHOW(bch2_fs)
 
        sysfs_pd_controller_show(tiering,       &c->tiers[1].pd); /* XXX */
 
-       sysfs_printf(meta_replicas_have, "%u",  c->sb.meta_replicas_have);
-       sysfs_printf(data_replicas_have, "%u",  c->sb.data_replicas_have);
+       sysfs_printf(meta_replicas_have, "%u",  bch2_replicas_online(c, true));
+       sysfs_printf(data_replicas_have, "%u",  bch2_replicas_online(c, false));
 
        /* Debugging: */
 
@@ -693,7 +693,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 
        return scnprintf(buf, PAGE_SIZE,
                "free_inc:               %zu/%zu\n"
-               "free[RESERVE_PRIO]:     %zu/%zu\n"
                "free[RESERVE_BTREE]:    %zu/%zu\n"
                "free[RESERVE_MOVINGGC]: %zu/%zu\n"
                "free[RESERVE_NONE]:     %zu/%zu\n"
@@ -705,7 +704,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
                "open buckets:           %u/%u (reserved %u)\n"
                "open_buckets_wait:      %s\n",
                fifo_used(&ca->free_inc),               ca->free_inc.size,
-               fifo_used(&ca->free[RESERVE_PRIO]),     ca->free[RESERVE_PRIO].size,
                fifo_used(&ca->free[RESERVE_BTREE]),    ca->free[RESERVE_BTREE].size,
                fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
                fifo_used(&ca->free[RESERVE_NONE]),     ca->free[RESERVE_NONE].size,
@@ -759,8 +757,11 @@ SHOW(bch2_dev)
        sysfs_print(alloc_buckets,      stats.buckets_alloc);
        sysfs_print(available_buckets,  dev_buckets_available(ca));
        sysfs_print(free_buckets,       dev_buckets_free(ca));
-       sysfs_print(has_data,           ca->mi.has_data);
-       sysfs_print(has_metadata,       ca->mi.has_metadata);
+       sysfs_print(has_data,           bch2_dev_has_data(c, ca) &
+                   (1 << BCH_DATA_USER));
+       sysfs_print(has_metadata,       bch2_dev_has_data(c, ca) &
+                   ((1 << BCH_DATA_JOURNAL)|
+                    (1 << BCH_DATA_BTREE)));
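
With the per-member HAS_DATA/HAS_METADATA flags gone, both attributes are now derived from the replicas table: bch2_dev_has_data() returns a mask with one bit per data type, and the same mask is what the remove and migrate error paths print as %x. Decoding it, as a sketch assuming a struct bch_fs *c and struct bch_dev *ca in scope:

unsigned data = bch2_dev_has_data(c, ca);

if (data & (1 << BCH_DATA_USER))
	pr_info("device still has user data");
if (data & ((1 << BCH_DATA_JOURNAL)|(1 << BCH_DATA_BTREE)))
	pr_info("device still has metadata");
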
 
        sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
 
index 5400dec5ed5083c5e4217c75f7930b301a53d6ea..906e7a6b9fedd022332f733f27ff49cef39a1dfc 100644 (file)
@@ -533,3 +533,47 @@ void eytzinger0_sort(void *base, size_t n, size_t size,
                }
        }
 }
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+         int (*cmp_func)(const void *, const void *, size_t),
+         void (*swap_func)(void *, void *, size_t size))
+{
+       /* pre-scale counters for performance */
+       int i = (num/2 - 1) * size, n = num * size, c, r;
+
+       if (!swap_func) {
+               if (size == 4 && alignment_ok(base, 4))
+                       swap_func = u32_swap;
+               else if (size == 8 && alignment_ok(base, 8))
+                       swap_func = u64_swap;
+               else
+                       swap_func = generic_swap;
+       }
+
+       /* heapify */
+       for ( ; i >= 0; i -= size) {
+               for (r = i; r * 2 + size < n; r  = c) {
+                       c = r * 2 + size;
+                       if (c < n - size &&
+                           cmp_func(base + c, base + c + size, size) < 0)
+                               c += size;
+                       if (cmp_func(base + r, base + c, size) >= 0)
+                               break;
+                       swap_func(base + r, base + c, size);
+               }
+       }
+
+       /* sort */
+       for (i = n - size; i > 0; i -= size) {
+               swap_func(base, base + i, size);
+               for (r = 0; r * 2 + size < i; r = c) {
+                       c = r * 2 + size;
+                       if (c < i - size &&
+                           cmp_func(base + c, base + c + size, size) < 0)
+                               c += size;
+                       if (cmp_func(base + r, base + c, size) >= 0)
+                               break;
+                       swap_func(base + r, base + c, size);
+               }
+       }
+}
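
sort_cmp_size() is a bottom-up heapsort in the mold of the kernel's lib/sort.c; the point of carrying a copy is that its comparator takes an element size, so memcmp can be passed straight through for records compared byte-wise. A minimal use, assuming linkage against util.c:

#include <stdio.h>
#include <string.h>

/* prototype as declared in util.h below */
void sort_cmp_size(void *, size_t, size_t,
		   int (*)(const void *, const void *, size_t),
		   void (*)(void *, void *, size_t));

struct rec { unsigned char bytes[4]; };

int main(void)
{
	struct rec v[3] = { {{3, 0, 0, 0}}, {{1, 0, 0, 0}}, {{2, 0, 0, 0}} };
	unsigned i;

	sort_cmp_size(v, 3, sizeof(v[0]), memcmp, NULL);

	for (i = 0; i < 3; i++)
		printf("%u\n", v[i].bytes[0]);	/* prints 1 2 3 */
	return 0;
}
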
index 927aa3a9dab103f4a867eb4e1a77d3f1fdd769f3..68d9a861d98513a24922116e5414a124aaebaf54 100644 (file)
@@ -763,4 +763,8 @@ static inline struct bio_vec next_contig_bvec(struct bio *bio,
 
 size_t bch_scnmemcpy(char *, size_t, const char *, size_t);
 
+void sort_cmp_size(void *base, size_t num, size_t size,
+         int (*cmp_func)(const void *, const void *, size_t),
+         void (*swap_func)(void *, void *, size_t));
+
 #endif /* _BCACHE_UTIL_H */