Update bcachefs sources to ca97ee3577 bcachefs: bch2_btree_iter_peek_and_restart_outlined()
author Kent Overstreet <kent.overstreet@linux.dev>
Mon, 27 Feb 2023 02:36:39 +0000 (21:36 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Tue, 28 Feb 2023 02:36:36 +0000 (21:36 -0500)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
25 files changed:
.bcachefs_revision
include/linux/blkdev.h
include/trace/events/bcachefs.h
libbcachefs/alloc_background.c
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/alloc_types.h
libbcachefs/backpointers.c
libbcachefs/bcachefs.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_write_buffer.c
libbcachefs/data_update.c
libbcachefs/ec.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/keylist.c
libbcachefs/keylist.h
libbcachefs/lru.c
libbcachefs/move.c
libbcachefs/opts.h
libbcachefs/sysfs.c
linux/blkdev.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 93724d8b4ef1b8393549d6650aa075aec3a5f72a..44599a02e1a54ffab4f356f224337e40095944d9 100644
@@ -1 +1 @@
-8e1519ccb62b76736d5b9ca97e58b41ed9a11274
+ca97ee357774427208e4c251bfaa5957ae7f8c2c
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 01b3d4adda15f92c9b1484b7057ba2f4b9509278..f78621d8a3d8730ef560b458f635cc2ea460c0b7 100644
@@ -70,6 +70,7 @@ static inline void submit_bio(struct bio *bio)
 }
 
 int blkdev_issue_discard(struct block_device *, sector_t, sector_t, gfp_t);
+int blkdev_issue_zeroout(struct block_device *, sector_t, sector_t, gfp_t, unsigned);
 
 #define bdev_get_queue(bdev)           (&((bdev)->queue))
 
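The new prototype is backed by the userspace shim in linux/blkdev.c, which this commit also touches (its hunk falls below the cutoff of this diff). As illustration only, here is a minimal sketch of what such a shim could look like — the bd_fd field name and the zero-buffer loop are assumptions, not the actual implementation, and the shim's usual headers (unistd.h for pwrite(), linux/kernel.h for min_t()) are taken as given:

	/*
	 * Hypothetical userspace fallback: write zeroes with pwrite().
	 * Sectors are 512 bytes; gfp_mask and flags are ignored here.
	 */
	int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
				 sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
	{
		static const char zeroes[4096];

		while (nr_sects) {
			/* write at most one buffer of zeroes per iteration */
			size_t len = min_t(size_t, nr_sects << 9, sizeof(zeroes));

			if (pwrite(bdev->bd_fd, zeroes, len, sector << 9) != (ssize_t) len)
				return -EIO;
			sector   += len >> 9;
			nr_sects -= len >> 9;
		}
		return 0;
	}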
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index d1e2f97913487e81594c65b0b5814e986fa703d5..ae1842201b87e46b3846b1c7ac9044c5afc29510 100644
@@ -516,7 +516,6 @@ DEFINE_EVENT(bch_fs, gc_gens_end,
 
 DECLARE_EVENT_CLASS(bucket_alloc,
        TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
-                bool user,
                 u64 bucket,
                 u64 free,
                 u64 avail,
@@ -525,14 +524,13 @@ DECLARE_EVENT_CLASS(bucket_alloc,
                 struct bucket_alloc_state *s,
                 bool nonblocking,
                 const char *err),
-       TP_ARGS(ca, alloc_reserve, user, bucket, free, avail,
+       TP_ARGS(ca, alloc_reserve, bucket, free, avail,
                copygc_wait_amount, copygc_waiting_for,
                s, nonblocking, err),
 
        TP_STRUCT__entry(
-               __field(dev_t,                  dev                     )
+               __field(u8,                     dev                     )
                __array(char,   reserve,        16                      )
-               __field(bool,                   user    )
                __field(u64,                    bucket  )
                __field(u64,                    free                    )
                __field(u64,                    avail                   )
@@ -548,9 +546,8 @@ DECLARE_EVENT_CLASS(bucket_alloc,
        ),
 
        TP_fast_assign(
-               __entry->dev            = ca->dev;
+               __entry->dev            = ca->dev_idx;
                strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
-               __entry->user           = user;
                __entry->bucket         = bucket;
                __entry->free           = free;
                __entry->avail          = avail;
@@ -565,10 +562,9 @@ DECLARE_EVENT_CLASS(bucket_alloc,
                strscpy(__entry->err, err, sizeof(__entry->err));
        ),
 
-       TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
+       TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
                  __entry->reserve,
-                 __entry->user,
+                 __entry->dev,
                  __entry->bucket,
                  __entry->free,
                  __entry->avail,
@@ -585,7 +581,6 @@ DECLARE_EVENT_CLASS(bucket_alloc,
 
 DEFINE_EVENT(bucket_alloc, bucket_alloc,
        TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
-                bool user,
                 u64 bucket,
                 u64 free,
                 u64 avail,
@@ -594,14 +589,13 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc,
                 struct bucket_alloc_state *s,
                 bool nonblocking,
                 const char *err),
-       TP_ARGS(ca, alloc_reserve, user, bucket, free, avail,
+       TP_ARGS(ca, alloc_reserve, bucket, free, avail,
                copygc_wait_amount, copygc_waiting_for,
                s, nonblocking, err)
 );
 
 DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
        TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
-                bool user,
                 u64 bucket,
                 u64 free,
                 u64 avail,
@@ -610,7 +604,7 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
                 struct bucket_alloc_state *s,
                 bool nonblocking,
                 const char *err),
-       TP_ARGS(ca, alloc_reserve, user, bucket, free, avail,
+       TP_ARGS(ca, alloc_reserve, bucket, free, avail,
                copygc_wait_amount, copygc_waiting_for,
                s, nonblocking, err)
 );
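The bucket_alloc events now identify the device by its index (dev_idx, a u8) rather than dev_t, print the bucket as dev:bucket, and drop the user flag. An illustrative line in the new format — all values invented:

	bucket_alloc: reserve none bucket 0:1042 free 88 avail 81 copygc_wait 0/0 seen 3 open 2 need_journal_commit 0 nouse 0 nocow 0 nonblocking 1 err (no error)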
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index b39a4533a5a431e82d0ef1c52868ede78a19f81e..5f4bb82c35ea103e5368ff8a1e3c3231824fd46c 100644
@@ -2175,21 +2175,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
        }
        mutex_unlock(&c->btree_reserve_cache_lock);
 
-       while (1) {
-               struct open_bucket *ob;
-
-               spin_lock(&c->freelist_lock);
-               if (!ca->open_buckets_partial_nr) {
+       spin_lock(&c->freelist_lock);
+       i = 0;
+       while (i < c->open_buckets_partial_nr) {
+               struct open_bucket *ob =
+                       c->open_buckets + c->open_buckets_partial[i];
+
+               if (ob->dev == ca->dev_idx) {
+                       swap(c->open_buckets_partial[i],
+                            c->open_buckets_partial[--c->open_buckets_partial_nr]);
+                       ob->on_partial_list = false;
                        spin_unlock(&c->freelist_lock);
-                       break;
+                       bch2_open_bucket_put(c, ob);
+                       spin_lock(&c->freelist_lock);
+               } else {
+                       i++;
                }
-               ob = c->open_buckets +
-                       ca->open_buckets_partial[--ca->open_buckets_partial_nr];
-               ob->on_partial_list = false;
-               spin_unlock(&c->freelist_lock);
-
-               bch2_open_bucket_put(c, ob);
        }
+       spin_unlock(&c->freelist_lock);
 
        bch2_ec_stop_dev(c, ca);
 
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index affddf1f03fff0cb3121ab5f767df65beb1d19b9..023b62c5a1c4de836cbad8b5c55dc1a7f2d1f181 100644
@@ -154,26 +154,17 @@ static void open_bucket_free_unused(struct bch_fs *c,
                                    struct write_point *wp,
                                    struct open_bucket *ob)
 {
-       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
-       bool may_realloc = wp->data_type == BCH_DATA_user;
-
-       BUG_ON(ca->open_buckets_partial_nr >
-              ARRAY_SIZE(ca->open_buckets_partial));
-
-       if (ca->open_buckets_partial_nr <
-           ARRAY_SIZE(ca->open_buckets_partial) &&
-           may_realloc) {
-               spin_lock(&c->freelist_lock);
-               ob->on_partial_list = true;
-               ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
-                       ob - c->open_buckets;
-               spin_unlock(&c->freelist_lock);
+       BUG_ON(c->open_buckets_partial_nr >=
+              ARRAY_SIZE(c->open_buckets_partial));
 
-               closure_wake_up(&c->open_buckets_wait);
-               closure_wake_up(&c->freelist_wait);
-       } else {
-               bch2_open_bucket_put(c, ob);
-       }
+       spin_lock(&c->freelist_lock);
+       ob->on_partial_list = true;
+       c->open_buckets_partial[c->open_buckets_partial_nr++] =
+               ob - c->open_buckets;
+       spin_unlock(&c->freelist_lock);
+
+       closure_wake_up(&c->open_buckets_wait);
+       closure_wake_up(&c->freelist_wait);
 }
 
 /* _only_ for allocating the journal on a new device: */
@@ -259,7 +250,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
 
        ob->valid       = true;
        ob->sectors_free = ca->mi.bucket_size;
-       ob->alloc_reserve = reserve;
        ob->dev         = ca->dev_idx;
        ob->gen         = a->gen;
        ob->bucket      = bucket;
@@ -386,32 +376,6 @@ err:
        return ob;
 }
 
-static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                                   enum alloc_reserve reserve)
-{
-       struct open_bucket *ob;
-       int i;
-
-       spin_lock(&c->freelist_lock);
-
-       for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
-               ob = c->open_buckets + ca->open_buckets_partial[i];
-
-               if (reserve <= ob->alloc_reserve) {
-                       array_remove_item(ca->open_buckets_partial,
-                                         ca->open_buckets_partial_nr,
-                                         i);
-                       ob->on_partial_list = false;
-                       ob->alloc_reserve = reserve;
-                       spin_unlock(&c->freelist_lock);
-                       return ob;
-               }
-       }
-
-       spin_unlock(&c->freelist_lock);
-       return NULL;
-}
-
 /*
  * This path is for before the freespace btree is initialized:
  *
@@ -535,7 +499,6 @@ again:
 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
                                      struct bch_dev *ca,
                                      enum alloc_reserve reserve,
-                                     bool may_alloc_partial,
                                      struct closure *cl,
                                      struct bch_dev_usage *usage)
 {
@@ -574,12 +537,6 @@ again:
 
        if (waiting)
                closure_wake_up(&c->freelist_wait);
-
-       if (may_alloc_partial) {
-               ob = try_alloc_partial_bucket(c, ca, reserve);
-               if (ob)
-                       return ob;
-       }
 alloc:
        ob = likely(freespace)
                ? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl)
@@ -599,7 +556,6 @@ err:
        if (!IS_ERR(ob))
                trace_and_count(c, bucket_alloc, ca,
                                bch2_alloc_reserves[reserve],
-                               may_alloc_partial,
                                ob->bucket,
                                usage->d[BCH_DATA_free].buckets,
                                avail,
@@ -611,7 +567,6 @@ err:
        else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
                trace_and_count(c, bucket_alloc_fail, ca,
                                bch2_alloc_reserves[reserve],
-                               may_alloc_partial,
                                0,
                                usage->d[BCH_DATA_free].buckets,
                                avail,
@@ -626,7 +581,6 @@ err:
 
 struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
                                      enum alloc_reserve reserve,
-                                     bool may_alloc_partial,
                                      struct closure *cl)
 {
        struct bch_dev_usage usage;
@@ -634,7 +588,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 
        bch2_trans_do(c, NULL, NULL, 0,
                      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
-                                                       may_alloc_partial, cl, &usage)));
+                                                       cl, &usage)));
        return ob;
 }
 
@@ -691,12 +645,10 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
        bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
 }
 
-#define BUCKET_MAY_ALLOC_PARTIAL       (1 << 0)
-#define BUCKET_ALLOC_USE_DURABILITY    (1 << 1)
-
-static void add_new_bucket(struct bch_fs *c,
+static int add_new_bucket(struct bch_fs *c,
                           struct open_buckets *ptrs,
                           struct bch_devs_mask *devs_may_alloc,
+                          unsigned nr_replicas,
                           unsigned *nr_effective,
                           bool *have_cache,
                           unsigned flags,
@@ -705,12 +657,19 @@ static void add_new_bucket(struct bch_fs *c,
        unsigned durability =
                bch_dev_bkey_exists(c, ob->dev)->mi.durability;
 
+       BUG_ON(*nr_effective >= nr_replicas);
+
        __clear_bit(ob->dev, devs_may_alloc->d);
-       *nr_effective   += (flags & BUCKET_ALLOC_USE_DURABILITY)
-               ? durability : 1;
+       *nr_effective   += durability;
        *have_cache     |= !durability;
 
        ob_push(c, ptrs, ob);
+
+       if (*nr_effective >= nr_replicas)
+               return 1;
+       if (ob->ec)
+               return 1;
+       return 0;
 }
 
 int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
@@ -720,8 +679,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
                      unsigned nr_replicas,
                      unsigned *nr_effective,
                      bool *have_cache,
+                     enum bch_data_type data_type,
                      enum alloc_reserve reserve,
-                     unsigned flags,
                      struct closure *cl)
 {
        struct bch_fs *c = trans->c;
@@ -754,8 +713,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
                        continue;
                }
 
-               ob = bch2_bucket_alloc_trans(trans, ca, reserve,
-                               flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage);
+               ob = bch2_bucket_alloc_trans(trans, ca, reserve, cl, &usage);
                if (!IS_ERR(ob))
                        bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
                percpu_ref_put(&ca->ref);
@@ -767,10 +725,11 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
                        continue;
                }
 
-               add_new_bucket(c, ptrs, devs_may_alloc,
-                              nr_effective, have_cache, flags, ob);
+               ob->data_type = data_type;
 
-               if (*nr_effective >= nr_replicas) {
+               if (add_new_bucket(c, ptrs, devs_may_alloc,
+                                  nr_replicas, nr_effective,
+                                  have_cache, 0, ob)) {
                        ret = 0;
                        break;
                }
@@ -792,7 +751,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
                         struct write_point *wp,
                         struct bch_devs_mask *devs_may_alloc,
                         u16 target,
-                        unsigned erasure_code,
                         unsigned nr_replicas,
                         unsigned *nr_effective,
                         bool *have_cache,
@@ -805,9 +763,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
        struct open_bucket *ob;
        struct bch_dev *ca;
        unsigned i, ec_idx;
-
-       if (!erasure_code)
-               return 0;
+       int ret = 0;
 
        if (nr_replicas < 2)
                return 0;
@@ -842,54 +798,122 @@ got_bucket:
        ob->ec_idx      = ec_idx;
        ob->ec          = h->s;
 
-       add_new_bucket(c, ptrs, devs_may_alloc,
-                      nr_effective, have_cache, flags, ob);
+       ret = add_new_bucket(c, ptrs, devs_may_alloc,
+                            nr_replicas, nr_effective,
+                            have_cache, flags, ob);
        atomic_inc(&h->s->pin);
 out_put_head:
        bch2_ec_stripe_head_put(c, h);
-       return 0;
+       return ret;
 }
 
 /* Sector allocator */
 
-static void get_buckets_from_writepoint(struct bch_fs *c,
-                                       struct open_buckets *ptrs,
-                                       struct write_point *wp,
-                                       struct bch_devs_mask *devs_may_alloc,
-                                       unsigned nr_replicas,
-                                       unsigned *nr_effective,
-                                       bool *have_cache,
-                                       unsigned flags,
-                                       bool need_ec)
+static bool want_bucket(struct bch_fs *c,
+                       struct write_point *wp,
+                       struct bch_devs_mask *devs_may_alloc,
+                       bool *have_cache, bool ec,
+                       struct open_bucket *ob)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+       if (!test_bit(ob->dev, devs_may_alloc->d))
+               return false;
+
+       if (ob->data_type != wp->data_type)
+               return false;
+
+       if (!ca->mi.durability &&
+           (wp->data_type != BCH_DATA_user || !*have_cache))
+               return false;
+
+       if (ec != (ob->ec != NULL))
+               return false;
+
+       return true;
+}
+
+static int bucket_alloc_set_writepoint(struct bch_fs *c,
+                                      struct open_buckets *ptrs,
+                                      struct write_point *wp,
+                                      struct bch_devs_mask *devs_may_alloc,
+                                      unsigned nr_replicas,
+                                      unsigned *nr_effective,
+                                      bool *have_cache,
+                                      bool ec, unsigned flags)
 {
        struct open_buckets ptrs_skip = { .nr = 0 };
        struct open_bucket *ob;
        unsigned i;
+       int ret = 0;
 
        open_bucket_for_each(c, &wp->ptrs, ob, i) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
-
-               if (*nr_effective < nr_replicas &&
-                   test_bit(ob->dev, devs_may_alloc->d) &&
-                   (ca->mi.durability ||
-                    (wp->data_type == BCH_DATA_user && !*have_cache)) &&
-                   (ob->ec || !need_ec)) {
-                       add_new_bucket(c, ptrs, devs_may_alloc,
-                                      nr_effective, have_cache,
-                                      flags, ob);
-               } else {
+               if (!ret && want_bucket(c, wp, devs_may_alloc,
+                                       have_cache, ec, ob))
+                       ret = add_new_bucket(c, ptrs, devs_may_alloc,
+                                      nr_replicas, nr_effective,
+                                      have_cache, flags, ob);
+               else
                        ob_push(c, &ptrs_skip, ob);
-               }
        }
        wp->ptrs = ptrs_skip;
+
+       return ret;
 }
 
-static int open_bucket_add_buckets(struct btree_trans *trans,
+static int bucket_alloc_set_partial(struct bch_fs *c,
+                                   struct open_buckets *ptrs,
+                                   struct write_point *wp,
+                                   struct bch_devs_mask *devs_may_alloc,
+                                   unsigned nr_replicas,
+                                   unsigned *nr_effective,
+                                   bool *have_cache, bool ec,
+                                   enum alloc_reserve reserve,
+                                   unsigned flags)
+{
+       int i, ret = 0;
+
+       if (!c->open_buckets_partial_nr)
+               return 0;
+
+       spin_lock(&c->freelist_lock);
+
+       for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
+               struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
+
+               if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+                       struct bch_dev_usage usage;
+                       u64 avail;
+
+                       bch2_dev_usage_read_fast(ca, &usage);
+                       avail = dev_buckets_free(ca, usage, reserve);
+                       if (!avail)
+                               continue;
+
+                       array_remove_item(c->open_buckets_partial,
+                                         c->open_buckets_partial_nr,
+                                         i);
+                       ob->on_partial_list = false;
+
+                       ret = add_new_bucket(c, ptrs, devs_may_alloc,
+                                            nr_replicas, nr_effective,
+                                            have_cache, flags, ob);
+                       if (ret)
+                               break;
+               }
+       }
+
+       spin_unlock(&c->freelist_lock);
+       return ret;
+}
+
+static int __open_bucket_add_buckets(struct btree_trans *trans,
                        struct open_buckets *ptrs,
                        struct write_point *wp,
                        struct bch_devs_list *devs_have,
                        u16 target,
-                       unsigned erasure_code,
+                       bool erasure_code,
                        unsigned nr_replicas,
                        unsigned *nr_effective,
                        bool *have_cache,
@@ -901,8 +925,8 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
        struct bch_devs_mask devs;
        struct open_bucket *ob;
        struct closure *cl = NULL;
-       int ret;
        unsigned i;
+       int ret;
 
        rcu_read_lock();
        devs = target_rw_devs(c, wp->data_type, target);
@@ -915,52 +939,82 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
        open_bucket_for_each(c, ptrs, ob, i)
                __clear_bit(ob->dev, devs.d);
 
+       if (erasure_code && ec_open_bucket(c, ptrs))
+               return 0;
+
+       ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
+                                nr_replicas, nr_effective,
+                                have_cache, erasure_code, flags);
+       if (ret)
+               return ret;
+
+       ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
+                                nr_replicas, nr_effective,
+                                have_cache, erasure_code, reserve, flags);
+       if (ret)
+               return ret;
+
        if (erasure_code) {
-               if (!ec_open_bucket(c, ptrs)) {
-                       get_buckets_from_writepoint(c, ptrs, wp, &devs,
-                                                   nr_replicas, nr_effective,
-                                                   have_cache, flags, true);
-                       if (*nr_effective >= nr_replicas)
-                               return 0;
+               ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
+                                        target,
+                                        nr_replicas, nr_effective,
+                                        have_cache, flags, _cl);
+       } else {
+retry_blocking:
+               /*
+                * Try nonblocking first, so that if one device is full we'll try from
+                * other devices:
+                */
+               ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
+                                       nr_replicas, nr_effective, have_cache,
+                                       wp->data_type, reserve, cl);
+               if (ret &&
+                   !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+                   !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
+                   !cl && _cl) {
+                       cl = _cl;
+                       goto retry_blocking;
                }
 
-               if (!ec_open_bucket(c, ptrs)) {
-                       ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
-                                                target, erasure_code,
-                                                nr_replicas, nr_effective,
-                                                have_cache, flags, _cl);
-                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-                           bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
-                           bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
-                               return ret;
-                       if (*nr_effective >= nr_replicas)
-                               return 0;
-               }
        }
 
-       get_buckets_from_writepoint(c, ptrs, wp, &devs,
-                                   nr_replicas, nr_effective,
-                                   have_cache, flags, false);
-       if (*nr_effective >= nr_replicas)
-               return 0;
+       return ret;
+}
 
-retry_blocking:
-       /*
-        * Try nonblocking first, so that if one device is full we'll try from
-        * other devices:
-        */
-       ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
+static int open_bucket_add_buckets(struct btree_trans *trans,
+                       struct open_buckets *ptrs,
+                       struct write_point *wp,
+                       struct bch_devs_list *devs_have,
+                       u16 target,
+                       unsigned erasure_code,
+                       unsigned nr_replicas,
+                       unsigned *nr_effective,
+                       bool *have_cache,
+                       enum alloc_reserve reserve,
+                       unsigned flags,
+                       struct closure *cl)
+{
+       int ret;
+
+       if (erasure_code) {
+               ret = __open_bucket_add_buckets(trans, ptrs, wp,
+                               devs_have, target, erasure_code,
                                nr_replicas, nr_effective, have_cache,
                                reserve, flags, cl);
-       if (ret &&
-           !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-           !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
-           !cl && _cl) {
-               cl = _cl;
-               goto retry_blocking;
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+                   bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
+                   bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
+                   bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+                       return ret;
+               if (*nr_effective >= nr_replicas)
+                       return 0;
        }
 
-       return ret;
+       ret = __open_bucket_add_buckets(trans, ptrs, wp,
+                       devs_have, target, false,
+                       nr_replicas, nr_effective, have_cache,
+                       reserve, flags, cl);
+       return ret < 0 ? ret : 0;
 }
 
 void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
@@ -1159,14 +1213,10 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
        struct open_bucket *ob;
        struct open_buckets ptrs;
        unsigned nr_effective, write_points_nr;
-       unsigned ob_flags = 0;
        bool have_cache;
        int ret;
        int i;
 
-       if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
-               ob_flags |= BUCKET_ALLOC_USE_DURABILITY;
-
        BUG_ON(!nr_replicas || !nr_replicas_required);
 retry:
        ptrs.nr         = 0;
@@ -1176,9 +1226,6 @@ retry:
 
        *wp_ret = wp = writepoint_find(trans, write_point.v);
 
-       if (wp->data_type == BCH_DATA_user)
-               ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
-
        /* metadata may not allocate on cache devices: */
        if (wp->data_type != BCH_DATA_user)
                have_cache = true;
@@ -1188,13 +1235,13 @@ retry:
                                              target, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve,
-                                             ob_flags, cl);
+                                             flags, cl);
        } else {
                ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
                                              target, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve,
-                                             ob_flags, NULL);
+                                             flags, NULL);
                if (!ret ||
                    bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto alloc_done;
@@ -1203,7 +1250,7 @@ retry:
                                              0, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve,
-                                             ob_flags, cl);
+                                             flags, cl);
        }
 alloc_done:
        BUG_ON(!ret && nr_effective < nr_replicas);
@@ -1350,6 +1397,24 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
        }
 }
 
+void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       unsigned i;
+
+       spin_lock(&c->freelist_lock);
+       for (i = 0; i < c->open_buckets_partial_nr; i++) {
+               struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
+
+               prt_printf(out, "%zu ref %u type %s ec %u %u:%llu:%u\n",
+                          ob - c->open_buckets,
+                          atomic_read(&ob->pin),
+                          bch2_data_types[ob->data_type],
+                          ob->ec != NULL,
+                          ob->dev, ob->bucket, ob->gen);
+       }
+       spin_unlock(&c->freelist_lock);
+}
+
 static const char * const bch2_write_point_states[] = {
 #define x(n)   #n,
        WRITE_POINT_STATES()
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index ba7a87afda0e237b5e59a32c31c49706ae3c8133..e9b3b142d14d7df90718202c694aae07d41ac6fc 100644
@@ -31,8 +31,7 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
 long bch2_bucket_alloc_new_fs(struct bch_dev *);
 
 struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
-                                     enum alloc_reserve, bool,
-                                     struct closure *);
+                                     enum alloc_reserve, struct closure *);
 
 static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
                           struct open_bucket *ob)
@@ -152,8 +151,9 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
 
 int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
                      struct dev_stripe_state *, struct bch_devs_mask *,
-                     unsigned, unsigned *, bool *, enum alloc_reserve,
-                     unsigned, struct closure *);
+                     unsigned, unsigned *, bool *,
+                     enum bch_data_type, enum alloc_reserve,
+                     struct closure *);
 
 int bch2_alloc_sectors_start_trans(struct btree_trans *,
                                   unsigned, unsigned,
@@ -221,6 +221,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
 void bch2_fs_allocator_foreground_init(struct bch_fs *);
 
 void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
 
 void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
 
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 2e6f4806925871bf513f17235e6680e067e4471f..0739bf925318d57ca5ffd84470d73647eea4a1d9 100644
@@ -51,10 +51,9 @@ struct open_bucket {
         * the block in the stripe this open_bucket corresponds to:
         */
        u8                      ec_idx;
-       enum bch_data_type      data_type:8;
+       enum bch_data_type      data_type:6;
        unsigned                valid:1;
        unsigned                on_partial_list:1;
-       unsigned                alloc_reserve:3;
 
        u8                      dev;
        u8                      gen;
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index e001f41916713d44df03e944c1dc0ab23e4d2ed5..a40c26125d2aca45eacf6f71931ddbff307dcd1f 100644
@@ -932,11 +932,14 @@ static int check_one_backpointer(struct btree_trans *trans,
                                 struct bpos bucket,
                                 u64 *bp_offset,
                                 struct bbpos start,
-                                struct bbpos end)
+                                struct bbpos end,
+                                struct bpos *last_flushed_pos)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bch_backpointer bp;
        struct bbpos pos;
+       struct bpos bp_pos;
        struct bkey_s_c k;
        struct printbuf buf = PRINTBUF;
        int ret;
@@ -957,17 +960,31 @@ static int check_one_backpointer(struct btree_trans *trans,
        if (ret)
                return ret;
 
-       if (fsck_err_on(!k.k, trans->c,
+       bp_pos = bucket_pos_to_bp(c, bucket,
+                       max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX);
+
+       if (!k.k && !bpos_eq(*last_flushed_pos, bp_pos)) {
+               *last_flushed_pos = bp_pos;
+               pr_info("flushing at %llu:%llu",
+                       last_flushed_pos->inode,
+                       last_flushed_pos->offset);
+
+               ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+                       -BCH_ERR_transaction_restart_write_buffer_flush;
+               goto out;
+       }
+
+       if (fsck_err_on(!k.k, c,
                        "%s backpointer points to missing extent\n%s",
                        *bp_offset < BACKPOINTER_OFFSET_MAX ? "alloc" : "btree",
                        (bch2_backpointer_to_text(&buf, &bp), buf.buf))) {
                ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp);
                if (ret == -ENOENT)
-                       bch_err(trans->c, "backpointer at %llu not found", *bp_offset);
+                       bch_err(c, "backpointer at %llu not found", *bp_offset);
        }
-
-       bch2_trans_iter_exit(trans, &iter);
+out:
 fsck_err:
+       bch2_trans_iter_exit(trans, &iter);
        printbuf_exit(&buf);
        return ret;
 }
@@ -978,6 +995,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
 {
        struct btree_iter iter;
        struct bkey_s_c k;
+       struct bpos last_flushed_pos = SPOS_MAX;
        int ret = 0;
 
        for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
@@ -987,7 +1005,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
                while (!(ret = commit_do(trans, NULL, NULL,
                                         BTREE_INSERT_LAZY_RW|
                                         BTREE_INSERT_NOFAIL,
-                               check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) &&
+                               check_one_backpointer(trans, iter.pos, &bp_offset,
+                                                     start, end, &last_flushed_pos))) &&
                       bp_offset < U64_MAX)
                        bp_offset++;
 
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 7f9c1087f168ffab421bd04b195c08f10a42350e..3f88e7eac17c13e7e9b41e55c458763b948fa336 100644
@@ -516,9 +516,6 @@ struct bch_dev {
        unsigned                nr_open_buckets;
        unsigned                nr_btree_reserve;
 
-       open_bucket_idx_t       open_buckets_partial[OPEN_BUCKETS_COUNT];
-       open_bucket_idx_t       open_buckets_partial_nr;
-
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
        size_t                  buckets_waiting_on_journal;
@@ -859,6 +856,9 @@ struct bch_fs {
        struct open_bucket      open_buckets[OPEN_BUCKETS_COUNT];
        open_bucket_idx_t       open_buckets_hash[OPEN_BUCKETS_COUNT];
 
+       open_bucket_idx_t       open_buckets_partial[OPEN_BUCKETS_COUNT];
+       open_bucket_idx_t       open_buckets_partial_nr;
+
        struct write_point      btree_write_point;
        struct write_point      rebalance_write_point;
 
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index d5a9cfb0e0b724419fb7cf079e3ac689a83fd6c9..c8b0cf5ed7914206f3c00c26ab93375b9a5991df 100644
@@ -2568,6 +2568,18 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
        return bch2_btree_iter_peek_slot(iter);
 }
 
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+{
+       struct bkey_s_c k;
+
+       while (btree_trans_too_many_iters(iter->trans) ||
+              (k = bch2_btree_iter_peek_type(iter, iter->flags),
+               bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+               bch2_trans_begin(iter->trans);
+
+       return k;
+}
+
 /* new transactional stuff: */
 
 #ifdef CONFIG_BCACHEFS_DEBUG
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 1225c4dd6205c16580e4162967b2a3cc5a98a34d..448be089956968f1ad2440764ff768f3ce493c55 100644
@@ -596,6 +596,8 @@ static inline int btree_trans_too_many_iters(struct btree_trans *trans)
        return 0;
 }
 
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
+
 static inline struct bkey_s_c
 __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
                                   struct btree_iter *iter, unsigned flags)
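
The outlined variant gives callers the same peek-and-restart loop as the inline helper above without expanding it at every call site, presumably to cut code size. A hypothetical caller — assuming iter has already been initialized on the transaction:

	struct bkey_s_c k = bch2_btree_iter_peek_and_restart_outlined(&iter);
	int ret = bkey_err(k);

	if (ret)
		return ret;	/* non-restart error; restarts are retried internally */
	if (!k.k)
		return 0;	/* no more keys */
	/* use k ... */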
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index 6285532e77904f3e834afe317d0b6fac98b36aa5..026c249a3f441c9073aaa2641a9fec9290b50baf 100644
@@ -64,6 +64,15 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
 
        bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
        (*fast)++;
+
+       if (path->ref > 1) {
+               /*
+                * We can't clone a path that has write locks: if the path is
+                * shared, unlock before set_pos(), traverse():
+                */
+               bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+               *write_locked = false;
+       }
        return 0;
 trans_commit:
        return  bch2_trans_update(trans, iter, &wb->k, 0) ?:
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index de0575f61cfbcb7920086cca2690c9f46298eab4..e1467e11b046fe3173fa9ef01228ac5b350b26be 100644
@@ -98,8 +98,10 @@ static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev)
        struct bch_extent_ptr *ptr;
 
        bkey_for_each_ptr(ptrs, ptr)
-               if (ptr->dev == dev)
-                       ptr->cached = true;
+               if (ptr->dev == dev) {
+                       bch2_extent_ptr_set_cached(k, ptr);
+                       return;
+               }
 }
 
 static int __bch2_data_update_index_update(struct btree_trans *trans,
@@ -295,15 +297,7 @@ out:
 
 int bch2_data_update_index_update(struct bch_write_op *op)
 {
-       struct bch_fs *c = op->c;
-       struct btree_trans trans;
-       int ret;
-
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       ret = __bch2_data_update_index_update(&trans, op);
-       bch2_trans_exit(&trans);
-
-       return ret;
+       return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op));
 }
 
 void bch2_data_update_read_done(struct data_update *m,
@@ -326,8 +320,9 @@ void bch2_data_update_exit(struct data_update *update)
        const struct bch_extent_ptr *ptr;
 
        bkey_for_each_ptr(ptrs, ptr) {
-               bch2_bucket_nocow_unlock(&c->nocow_locks,
-                                        PTR_BUCKET_POS(c, ptr), 0);
+               if (c->opts.nocow_enabled)
+                       bch2_bucket_nocow_unlock(&c->nocow_locks,
+                                                PTR_BUCKET_POS(c, ptr), 0);
                percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref);
        }
 
@@ -487,23 +482,26 @@ int bch2_data_update_init(struct btree_trans *trans,
                if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
                        m->op.incompressible = true;
 
-               if (ctxt) {
-                       move_ctxt_wait_event(ctxt, trans,
-                                       (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
-                                                                 PTR_BUCKET_POS(c, &p.ptr), 0)) ||
-                                       !atomic_read(&ctxt->read_sectors));
-
-                       if (!locked)
-                               bch2_bucket_nocow_lock(&c->nocow_locks,
-                                                      PTR_BUCKET_POS(c, &p.ptr), 0);
-               } else {
-                       if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
-                                                      PTR_BUCKET_POS(c, &p.ptr), 0)) {
-                               ret = -BCH_ERR_nocow_lock_blocked;
-                               goto err;
+               if (c->opts.nocow_enabled) {
+                       if (ctxt) {
+                               move_ctxt_wait_event(ctxt, trans,
+                                               (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
+                                                                         PTR_BUCKET_POS(c, &p.ptr), 0)) ||
+                                               !atomic_read(&ctxt->read_sectors));
+
+                               if (!locked)
+                                       bch2_bucket_nocow_lock(&c->nocow_locks,
+                                                              PTR_BUCKET_POS(c, &p.ptr), 0);
+                       } else {
+                               if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
+                                                              PTR_BUCKET_POS(c, &p.ptr), 0)) {
+                                       ret = -BCH_ERR_nocow_lock_blocked;
+                                       goto err;
+                               }
                        }
+                       ptrs_locked |= (1U << i);
                }
-               ptrs_locked |= (1U << i);
+
                i++;
        }
 
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 84d2a0c4bcaaaa3e20bd1584cad26308f24f3b74..7bd68880995f28bebb9ac3efc1391b87920ae53f 100644
@@ -138,20 +138,28 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
                         struct bkey_s_c k)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-       unsigned i;
+       unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
 
        prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
               s->algorithm,
               le16_to_cpu(s->sectors),
-              s->nr_blocks - s->nr_redundant,
+              nr_data,
               s->nr_redundant,
               s->csum_type,
               1U << s->csum_granularity_bits);
 
-       for (i = 0; i < s->nr_blocks; i++)
-               prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev,
-                      (u64) s->ptrs[i].offset,
-                      stripe_blockcount_get(s, i));
+       for (i = 0; i < s->nr_blocks; i++) {
+               const struct bch_extent_ptr *ptr = s->ptrs + i;
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+               u32 offset;
+               u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+               prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
+               if (i < nr_data)
+                       prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+               if (ptr_stale(ca, ptr))
+                       prt_printf(out, " stale");
+       }
 }
 
 /* returns blocknr in stripe that we matched: */
@@ -442,15 +450,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
        percpu_ref_put(&ca->io_ref);
 }
 
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
+static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
+                               struct ec_stripe_buf *stripe)
 {
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes,
                             POS(0, idx), BTREE_ITER_SLOTS);
        k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
@@ -462,11 +469,15 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip
        }
        bkey_reassemble(&stripe->key.k_i, k);
 err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
+{
+       return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe));
+}
+
 /* recovery read path: */
 int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
 {
@@ -865,25 +876,6 @@ err:
        return ret;
 }
 
-static void extent_stripe_ptr_add(struct bkey_s_extent e,
-                                 struct ec_stripe_buf *s,
-                                 struct bch_extent_ptr *ptr,
-                                 unsigned block)
-{
-       struct bch_extent_stripe_ptr *dst = (void *) ptr;
-       union bch_extent_entry *end = extent_entry_last(e);
-
-       memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
-       e.k->u64s += sizeof(*dst) / sizeof(u64);
-
-       *dst = (struct bch_extent_stripe_ptr) {
-               .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
-               .block          = block,
-               .redundancy     = s->key.v.nr_redundant,
-               .idx            = s->key.k.p.offset,
-       };
-}
-
 static int ec_stripe_update_extent(struct btree_trans *trans,
                                   struct bpos bucket, u8 gen,
                                   struct ec_stripe_buf *s,
@@ -895,6 +887,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
        struct bkey_s_c k;
        const struct bch_extent_ptr *ptr_c;
        struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+       struct bch_extent_stripe_ptr stripe_ptr;
        struct bkey_i *n;
        int ret, dev, block;
 
@@ -933,16 +926,27 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
 
        dev = s->key.v.ptrs[block].dev;
 
-       n = bch2_bkey_make_mut(trans, k);
+       n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
        ret = PTR_ERR_OR_ZERO(n);
        if (ret)
                goto out;
 
+       bkey_reassemble(n, k);
+
        bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
        ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev);
        BUG_ON(!ec_ptr);
 
-       extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block);
+       stripe_ptr = (struct bch_extent_stripe_ptr) {
+               .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
+               .block          = block,
+               .redundancy     = s->key.v.nr_redundant,
+               .idx            = s->key.k.p.offset,
+       };
+
+       __extent_entry_insert(n,
+                       (union bch_extent_entry *) ec_ptr,
+                       (union bch_extent_entry *) &stripe_ptr);
 
        ret = bch2_trans_update(trans, &iter, n, 0);
 out:
@@ -999,6 +1003,35 @@ err:
        return ret;
 }
 
+static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
+                                      struct ec_stripe_new *s,
+                                      unsigned block,
+                                      struct open_bucket *ob)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+       unsigned offset = ca->mi.bucket_size - ob->sectors_free;
+       int ret;
+
+       if (!bch2_dev_get_ioref(ca, WRITE)) {
+               s->err = -EROFS;
+               return;
+       }
+
+       memset(s->new_stripe.data[block] + (offset << 9),
+              0,
+              ob->sectors_free << 9);
+
+       ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
+                       ob->bucket * ca->mi.bucket_size + offset,
+                       ob->sectors_free,
+                       GFP_KERNEL, 0);
+
+       percpu_ref_put(&ca->io_ref);
+
+       if (ret)
+               s->err = ret;
+}
+
 /*
  * data buckets of new stripe all written: create the stripe
  */
@@ -1014,6 +1047,14 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
        closure_sync(&s->iodone);
 
+       for (i = 0; i < nr_data; i++)
+               if (s->blocks[i]) {
+                       ob = c->open_buckets + s->blocks[i];
+
+                       if (ob->sectors_free)
+                               zero_out_rest_of_ec_bucket(c, s, i, ob);
+               }
+
        if (s->err) {
                if (!bch2_err_matches(s->err, EROFS))
                        bch_err(c, "error creating stripe: error writing data buckets");
@@ -1155,9 +1196,6 @@ void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
 {
        struct ec_stripe_new *s = ob->ec;
 
-       if (ob->sectors_free)
-               s->err = -1;
-
        ec_stripe_new_put(c, s);
 }
 
@@ -1398,10 +1436,10 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
                                            h->s->nr_parity,
                                            &nr_have_parity,
                                            &have_cache,
+                                           BCH_DATA_parity,
                                            h->copygc
                                            ? RESERVE_movinggc
                                            : RESERVE_none,
-                                           0,
                                            cl);
 
                open_bucket_for_each(c, &buckets, ob, i) {
@@ -1427,10 +1465,10 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
                                            h->s->nr_data,
                                            &nr_have_data,
                                            &have_cache,
+                                           BCH_DATA_user,
                                            h->copygc
                                            ? RESERVE_movinggc
                                            : RESERVE_none,
-                                           0,
                                            cl);
 
                open_bucket_for_each(c, &buckets, ob, i) {
@@ -1486,8 +1524,9 @@ static s64 get_existing_stripe(struct bch_fs *c,
        return ret;
 }
 
-static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, struct ec_stripe_head *h)
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
 {
+       struct bch_fs *c = trans->c;
        unsigned i;
        s64 idx;
        int ret;
@@ -1497,7 +1536,7 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, struct ec_stripe_head *
                return -BCH_ERR_ENOSPC_stripe_reuse;
 
        h->s->have_existing_stripe = true;
-       ret = get_stripe_key(c, idx, &h->s->existing_stripe);
+       ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
        if (ret) {
                bch2_fs_fatal_error(c, "error reading stripe key: %i", ret);
                return ret;
@@ -1626,7 +1665,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
                goto err;
 
        if (ret && needs_stripe_new)
-               ret = __bch2_ec_stripe_head_reuse(c, h);
+               ret = __bch2_ec_stripe_head_reuse(trans, h);
        if (ret) {
                bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret));
                goto err;
@@ -1771,6 +1810,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
 void bch2_fs_ec_exit(struct bch_fs *c)
 {
        struct ec_stripe_head *h;
+       unsigned i;
 
        while (1) {
                mutex_lock(&c->ec_stripe_head_lock);
@@ -1782,7 +1822,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)
                if (!h)
                        break;
 
-               BUG_ON(h->s);
+               if (h->s) {
+                       for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++)
+                               BUG_ON(h->s->blocks[i]);
+
+                       kfree(h->s);
+               }
                kfree(h);
        }
 
@@ -1801,6 +1846,8 @@ void bch2_fs_ec_init_early(struct bch_fs *c)
 
 int bch2_fs_ec_init(struct bch_fs *c)
 {
+       spin_lock_init(&c->ec_stripes_new_lock);
+
        return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
                           BIOSET_NEED_BVECS);
 }
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index d01cec89603bfbaa1963d2995e6349caf5f158bd..4fc581be7aaf545ab8ebd8b5d1f018bb768f62c0 100644
@@ -706,18 +706,6 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry
        k->k.u64s -= extent_entry_u64s(entry);
 }
 
-static inline void __extent_entry_insert(struct bkey_i *k,
-                                        union bch_extent_entry *dst,
-                                        union bch_extent_entry *new)
-{
-       union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
-       memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
-                             dst, (u64 *) end - (u64 *) dst);
-       k->k.u64s += extent_entry_u64s(new);
-       memcpy_u64s_small(dst, new, extent_entry_u64s(new));
-}
-
 void bch2_extent_ptr_decoded_append(struct bkey_i *k,
                                    struct extent_ptr_decoded *p)
 {
@@ -951,6 +939,29 @@ bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
        return false;
 }
 
+void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+{
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+       union bch_extent_entry *entry;
+       union bch_extent_entry *ec = NULL;
+
+       bkey_extent_entry_for_each(ptrs, entry) {
+               if (&entry->ptr == ptr) {
+                       ptr->cached = true;
+                       if (ec)
+                               extent_entry_drop(k, ec);
+                       return;
+               }
+
+               if (extent_entry_is_stripe_ptr(entry))
+                       ec = entry;
+               else if (extent_entry_is_ptr(entry))
+                       ec = NULL;
+       }
+
+       BUG();
+}
+
 /*
  * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
  *
@@ -1094,7 +1105,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
        unsigned size_ondisk = k.k->size;
        unsigned nonce = UINT_MAX;
        unsigned nr_ptrs = 0;
-       bool unwritten = false;
+       bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
        int ret;
 
        if (bkey_is_btree_ptr(k.k))
@@ -1130,7 +1141,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
                                return -BCH_ERR_invalid_bkey;
                        }
 
+                       if (entry->ptr.cached && have_ec) {
+                               prt_printf(err, "cached, erasure coded ptr");
+                               return -BCH_ERR_invalid_bkey;
+                       }
+
                        unwritten = entry->ptr.unwritten;
+                       have_ec = false;
+                       crc_since_last_ptr = false;
                        nr_ptrs++;
                        break;
                case BCH_EXTENT_ENTRY_crc32:
@@ -1164,17 +1182,43 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
                                        return -BCH_ERR_invalid_bkey;
                                }
                        }
+
+                       if (crc_since_last_ptr) {
+                               prt_printf(err, "redundant crc entry");
+                               return -BCH_ERR_invalid_bkey;
+                       }
+                       crc_since_last_ptr = true;
                        break;
                case BCH_EXTENT_ENTRY_stripe_ptr:
+                       if (have_ec) {
+                               prt_printf(err, "redundant stripe entry");
+                               return -BCH_ERR_invalid_bkey;
+                       }
+                       have_ec = true;
                        break;
                }
        }
 
+       if (!nr_ptrs) {
+               prt_str(err, "no ptrs");
+               return -BCH_ERR_invalid_bkey;
+       }
+
        if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
                prt_str(err, "too many ptrs");
                return -BCH_ERR_invalid_bkey;
        }
 
+       if (crc_since_last_ptr) {
+               prt_printf(err, "redundant crc entry");
+               return -BCH_ERR_invalid_bkey;
+       }
+
+       if (have_ec) {
+               prt_printf(err, "redundant stripe entry");
+               return -BCH_ERR_invalid_bkey;
+       }
+
        return 0;
 }
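
Taken together, the new have_ec and crc_since_last_ptr state makes bch2_bkey_ptrs_invalid() enforce an ordering grammar on extent entries. An informal reading (ours, not spelled out in the commit):

    /*
     * - a key must contain at least one ptr entry;
     * - between consecutive ptr entries there is at most one crc entry
     *   and at most one stripe_ptr entry, in either order;
     * - a trailing crc or stripe_ptr entry not followed by a ptr is
     *   invalid (hence the two checks after the loop);
     * - a cached ptr may not be preceded by a stripe_ptr, i.e. cached
     *   pointers are never erasure coded.
     */
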
 
index 1d8f3b309b074ba40394e2fd954744a658b4336f..2e37543a62291a8335106abefed72da047e57753 100644 (file)
@@ -76,6 +76,18 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
        return extent_entry_bytes(entry) / sizeof(u64);
 }
 
+static inline void __extent_entry_insert(struct bkey_i *k,
+                                        union bch_extent_entry *dst,
+                                        union bch_extent_entry *new)
+{
+       union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+
+       memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
+                             dst, (u64 *) end - (u64 *) dst);
+       k->k.u64s += extent_entry_u64s(new);
+       memcpy_u64s_small(dst, new, extent_entry_u64s(new));
+}
+
 static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
 {
        return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
@@ -655,6 +667,8 @@ bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
 bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
 bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c);
 
+void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
+
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
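
__extent_entry_insert() moves verbatim from extents.c (removed in the first hunk above) into the header, so other translation units can splice entries into an extent in place. A sketch of the kind of use this enables, modeled on how the erasure-coding code attaches stripe pointers; field values here are illustrative:

    static void stripe_ptr_add(struct bkey_i *k, union bch_extent_entry *dst,
                               u64 stripe_idx, unsigned block)
    {
            union bch_extent_entry new = {
                    .stripe_ptr = {
                            .type  = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
                            .block = block,
                            .idx   = stripe_idx,
                    },
            };

            /* Shifts [dst, end) up and copies the new entry into the gap */
            __extent_entry_insert(k, dst, &new);
    }
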
index 64925db22cdc937419881e5ed1ae78c4a5a1f3ad..15ce0657c37b5830d2c5aeb6438e384a9b6ec75d 100644 (file)
@@ -1650,7 +1650,7 @@ static void __bch2_write(struct bch_write_op *op)
 
        nofs_flags = memalloc_nofs_save();
 
-       if (unlikely(op->opts.nocow)) {
+       if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
                bch2_nocow_write(op);
                if (op->flags & BCH_WRITE_DONE)
                        goto out_nofs_restore;
index 957eeece4d98f56c827f6063d074998dd5401e4c..e0c4f51a1fde4862165dab8b53348e32fd2de26c 100644 (file)
@@ -789,8 +789,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                                break;
                        }
                } else {
-                       ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none,
-                                              false, cl);
+                       ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, cl);
                        ret = PTR_ERR_OR_ZERO(ob[nr_got]);
                        if (ret)
                                break;
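
bch2_bucket_alloc() loses its bool argument (may_alloc_partial, as far as we can tell): partially filled open buckets are now kept on an allocator-owned list rather than requested per call, which is also what the new open_buckets_partial sysfs attribute further down exposes. The updated calling convention, sketched:

    /* Returns an ERR_PTR-style pointer; 0/-errno via PTR_ERR_OR_ZERO(): */
    struct open_bucket *ob = bch2_bucket_alloc(c, ca, RESERVE_none, cl);
    int ret = PTR_ERR_OR_ZERO(ob);

    if (!ret)
            bch2_open_bucket_put(c, ob);    /* release when done */
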
index 29e51bde8313419cd44dcc83d4b339d84abd6935..cf5998e519e722e9f9c580717a7095ee64aad1e1 100644 (file)
@@ -31,22 +31,6 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
        return 0;
 }
 
-void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
-{
-       struct bkey_i *where;
-
-       for_each_keylist_key(l, where)
-               if (bpos_lt(insert->k.p, where->k.p))
-                       break;
-
-       memmove_u64s_up((u64 *) where + insert->k.u64s,
-                       where,
-                       ((u64 *) l->top) - ((u64 *) where));
-
-       l->top_p += insert->k.u64s;
-       bkey_copy(where, insert);
-}
-
 void bch2_keylist_pop_front(struct keylist *l)
 {
        l->top_p -= bch2_keylist_front(l)->k.u64s;
index 635efb7e8228b96fae848a917214590306ae1538..fe759c7031e0403a0fe0da6d61b2c8432f819451 100644 (file)
@@ -5,7 +5,6 @@
 #include "keylist_types.h"
 
 int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
-void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
 void bch2_keylist_pop_front(struct keylist *);
 
 static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
index 9eec12a99535e84675e5ad3d5bd63b3348611f1b..e913b90f37b7b7a562949a644a38d5e65e57e1d7 100644 (file)
@@ -4,6 +4,7 @@
 #include "alloc_background.h"
 #include "btree_iter.h"
 #include "btree_update.h"
+#include "btree_write_buffer.h"
 #include "error.h"
 #include "lru.h"
 #include "recovery.h"
@@ -101,7 +102,8 @@ static const char * const bch2_lru_types[] = {
 
 static int bch2_check_lru_key(struct btree_trans *trans,
                              struct btree_iter *lru_iter,
-                             struct bkey_s_c lru_k)
+                             struct bkey_s_c lru_k,
+                             struct bpos *last_flushed_pos)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
@@ -137,19 +139,25 @@ static int bch2_check_lru_key(struct btree_trans *trans,
                break;
        }
 
-       if (fsck_err_on(lru_k.k->type != KEY_TYPE_set ||
-                       lru_pos_time(lru_k.k->p) != idx, c,
-                       "incorrect lru entry: lru %s time %llu\n"
-                       "  %s\n"
-                       "  for %s",
-                       bch2_lru_types[type],
-                       lru_pos_time(lru_k.k->p),
-                       (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
-                       (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
-               ret = bch2_btree_delete_at(trans, lru_iter, 0);
-               if (ret)
-                       goto err;
+       if (lru_k.k->type != KEY_TYPE_set ||
+           lru_pos_time(lru_k.k->p) != idx) {
+               if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
+                       *last_flushed_pos = lru_k.k->p;
+                       ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+                               -BCH_ERR_transaction_restart_write_buffer_flush;
+                       goto out;
+               }
+
+               if (fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
+                            "  %s\n"
+                            "  for %s",
+                            bch2_lru_types[type],
+                            lru_pos_time(lru_k.k->p),
+                            (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+                            (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
+                       ret = bch2_btree_delete_at(trans, lru_iter, 0);
        }
+out:
 err:
 fsck_err:
        bch2_trans_iter_exit(trans, &iter);
@@ -163,6 +171,7 @@ int bch2_check_lrus(struct bch_fs *c)
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
+       struct bpos last_flushed_pos = POS_MIN;
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -170,7 +179,7 @@ int bch2_check_lrus(struct bch_fs *c)
        ret = for_each_btree_key_commit(&trans, iter,
                        BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-               bch2_check_lru_key(&trans, &iter, k));
+               bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos));
 
        bch2_trans_exit(&trans);
        return ret;
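
bch2_check_lru_key() now has to assume lru updates may still be buffered in the btree write buffer, so an apparently bad entry is only repaired after one flush-and-restart cycle per position; only if it still looks wrong afterwards is it a real inconsistency. The control flow, reduced to a minimal sketch:

    if (lru_entry_looks_wrong) {
            if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
                    /* First sighting: flush buffered updates and restart */
                    *last_flushed_pos = lru_k.k->p;
                    return bch2_btree_write_buffer_flush_sync(trans) ?:
                            -BCH_ERR_transaction_restart_write_buffer_flush;
            }
            /* Still wrong after a flush: report and repair via fsck_err() */
    }
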
index 7dac9264304e4188f43d591790900e4fa3bed518..4ef7595fa6a2d5f43c098b40d10a5ca73ae029cf 100644 (file)
@@ -227,7 +227,8 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
        if (bkey_deleted(&n->k))
                n->k.size = 0;
 
-       return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+       return bch2_trans_relock(trans) ?:
+               bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
 }
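
bch2_extent_drop_ptrs() now relocks the transaction first, presumably because callers may reach it with the transaction unlocked. The GNU '?:' chain (a ?: b yields a if a is nonzero, else b) stops at the first nonzero error code and is equivalent to:

    ret = bch2_trans_relock(trans);
    if (!ret)
            ret = bch2_trans_update(trans, iter, n,
                                    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
    if (!ret)
            ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
    return ret;
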
 
index 304718a0b599b50b60d80631d98f515bbc7b6d77..76c2691aa8560ea0757a5b68df2747cb4b384d3b 100644 (file)
@@ -404,6 +404,12 @@ enum opt_type {
          NULL,         "Nocow mode: Writes will be done in place when possible.\n"\
                        "Snapshots and reflink will still caused writes to be COW\n"\
                        "Implicitly disables data checksumming, compression and encryption")\
+       x(nocow_enabled,                u8,                             \
+         OPT_FS|OPT_MOUNT,                                             \
+         OPT_BOOL(),                                                   \
+         BCH2_NO_SB_OPT,                       true,                   \
+         NULL,         "Enable nocow mode: enables the runtime locking in the\n"\
+                       "data move path needed if nocow will ever be in use\n")\
        x(no_data_io,                   u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
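
The new nocow_enabled option is the switch the io.c hunk above tests: nocow writes are taken only when both the nocow option and this global toggle are set, so filesystems that never use nocow can skip the runtime locking in the data move path altogether. It defaults to true and, being OPT_MOUNT with BCH2_NO_SB_OPT, is a mount-time-only setting not persisted in the superblock; presumably disabled with something like the following (exact mount syntax assumed):

    mount -t bcachefs -o nocow_enabled=0 /dev/sda1 /mnt
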
index b981c87edd1f7f316c5f820b56d33f9427997fb4..a7582dd42b243205ef92cb92ee001c95b17042e5 100644 (file)
@@ -194,6 +194,7 @@ read_attribute(btree_cache);
 read_attribute(btree_key_cache);
 read_attribute(stripes_heap);
 read_attribute(open_buckets);
+read_attribute(open_buckets_partial);
 read_attribute(write_points);
 read_attribute(nocow_lock_table);
 
@@ -455,6 +456,9 @@ SHOW(bch2_fs)
        if (attr == &sysfs_open_buckets)
                bch2_open_buckets_to_text(out, c);
 
+       if (attr == &sysfs_open_buckets_partial)
+               bch2_open_buckets_partial_to_text(out, c);
+
        if (attr == &sysfs_write_points)
                bch2_write_points_to_text(out, c);
 
@@ -663,6 +667,7 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_new_stripes,
        &sysfs_stripes_heap,
        &sysfs_open_buckets,
+       &sysfs_open_buckets_partial,
        &sysfs_write_points,
 #ifdef BCH_WRITE_REF_DEBUG
        &sysfs_write_refs,
index 0a5cedfea3c037140fcf4040dc66cf435c1cee73..805d55dbf7411324a8aa133ff46594be10cd51b0 100644 (file)
@@ -118,6 +118,14 @@ int blkdev_issue_discard(struct block_device *bdev,
        return 0;
 }
 
+int blkdev_issue_zeroout(struct block_device *bdev,
+                        sector_t sector, sector_t nr_sects,
+                        gfp_t gfp_mask, unsigned flags)
+{
+       /* Not yet implemented: */
+       BUG();
+}
+
 unsigned bdev_logical_block_size(struct block_device *bdev)
 {
        struct stat statbuf;
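
The userspace shim satisfies the new blkdev_issue_zeroout() declaration in blkdev.h but deliberately BUG()s, as nothing in the tools calls it yet. If a real body is ever needed, a minimal sketch along these lines might do, assuming the block_device wrapper exposes its file descriptor as bd_fd (field name assumed) and ignoring gfp_mask and flags:

    int blkdev_issue_zeroout(struct block_device *bdev,
                             sector_t sector, sector_t nr_sects,
                             gfp_t gfp_mask, unsigned flags)
    {
            static const char zeroes[4096];
            u64 offset = (u64) sector << 9, len = (u64) nr_sects << 9;

            /* pwrite() zero-filled buffers until the range is covered */
            while (len) {
                    ssize_t ret = pwrite(bdev->bd_fd, zeroes,
                                         min_t(u64, len, sizeof(zeroes)),
                                         offset);
                    if (ret <= 0)
                            return ret ? -errno : -EIO;
                    offset += ret;
                    len    -= ret;
            }
            return 0;
    }
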