git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 72405e7ff8 bcachefs: Fix bch2_check_extents_to_backpointers()
author	Kent Overstreet <kent.overstreet@linux.dev>
	Tue, 14 Mar 2023 16:56:38 +0000 (12:56 -0400)
committer	Kent Overstreet <kent.overstreet@linux.dev>
	Tue, 14 Mar 2023 16:56:38 +0000 (12:56 -0400)
40 files changed:
.bcachefs_revision
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/backpointers.c
libbcachefs/bcachefs.h
libbcachefs/btree_iter.c
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache.h
libbcachefs/btree_locking.c
libbcachefs/btree_locking.h
libbcachefs/btree_update.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/data_update.c
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fsck.c
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/io_types.h
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_sb.c
libbcachefs/journal_sb.h
libbcachefs/journal_types.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/opts.h
libbcachefs/reflink.c
libbcachefs/subvolume.c
libbcachefs/super.c
libbcachefs/sysfs.c
linux/six.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 2845be6830c7c935f92ba55579d82278f04c41df..d8d138657af4c2d10a014528d685e450ee1b101a 100644
@@ -1 +1 @@
-3856459b1b9f37cebee2bca3c9edcafaf393aa98
+72405e7ff8c5fb569b74b046d19866ee480f29b7
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 5f4bb82c35ea103e5368ff8a1e3c3231824fd46c..009a85bccc53f9abc4c47bb2ed182bd55b098273 100644
@@ -1006,7 +1006,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
        iter = bucket->inode;
        ca = __bch2_next_dev(c, &iter, NULL);
        if (ca)
-               bucket->offset = ca->mi.first_bucket;
+               *bucket = POS(ca->dev_idx, ca->mi.first_bucket);
        rcu_read_unlock();
 
        return ca != NULL;
@@ -2158,43 +2158,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
         */
        bch2_recalc_capacity(c);
 
-       /* Next, close write points that point to this device... */
-       for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
-               bch2_writepoint_stop(c, ca, &c->write_points[i]);
-
-       bch2_writepoint_stop(c, ca, &c->copygc_write_point);
-       bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
-       bch2_writepoint_stop(c, ca, &c->btree_write_point);
-
-       mutex_lock(&c->btree_reserve_cache_lock);
-       while (c->btree_reserve_cache_nr) {
-               struct btree_alloc *a =
-                       &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
-               bch2_open_buckets_put(c, &a->ob);
-       }
-       mutex_unlock(&c->btree_reserve_cache_lock);
-
-       spin_lock(&c->freelist_lock);
-       i = 0;
-       while (i < c->open_buckets_partial_nr) {
-               struct open_bucket *ob =
-                       c->open_buckets + c->open_buckets_partial[i];
-
-               if (ob->dev == ca->dev_idx) {
-                       swap(c->open_buckets_partial[i],
-                            c->open_buckets_partial[--c->open_buckets_partial_nr]);
-                       ob->on_partial_list = false;
-                       spin_unlock(&c->freelist_lock);
-                       bch2_open_bucket_put(c, ob);
-                       spin_lock(&c->freelist_lock);
-               } else {
-                       i++;
-               }
-       }
-       spin_unlock(&c->freelist_lock);
-
-       bch2_ec_stop_dev(c, ca);
+       bch2_open_buckets_stop(c, ca, false);
 
        /*
         * Wake up threads that were blocked on allocation, so they can notice
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index c9ff590ef978f7f17445c90055cb7f349ebf6501..324798396fc6b667f608e035a78033e2c035f456 100644
@@ -216,7 +216,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
        u64 free = max_t(s64, 0,
                           u.d[BCH_DATA_free].buckets
                         + u.d[BCH_DATA_need_discard].buckets
-                        - bch2_dev_buckets_reserved(ca, RESERVE_none));
+                        - bch2_dev_buckets_reserved(ca, RESERVE_stripe));
 
        return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
 }
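
For reference, a minimal standalone model of the heuristic in should_invalidate_buckets() above, now subtracting the RESERVE_stripe reservation before deciding how many cached buckets to invalidate. The types and numbers are illustrative stand-ins, not bcachefs's real API:

#include <stdint.h>
#include <stdio.h>

static int64_t clamp_s64(int64_t v, int64_t lo, int64_t hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

static uint64_t should_invalidate(int64_t free_buckets,
				  int64_t need_discard,
				  int64_t reserved,	/* RESERVE_stripe watermark */
				  int64_t cached,
				  int64_t want_free)
{
	int64_t free = free_buckets + need_discard - reserved;

	if (free < 0)
		free = 0;

	/* Invalidate just enough cached buckets to reach want_free: */
	return clamp_s64(want_free - free, 0, cached);
}

int main(void)
{
	/*
	 * 100 wanted; 60 free + 20 discardable - 10 reserved = 70 usable,
	 * so invalidate 30 of the 50 cached buckets:
	 */
	printf("%llu\n", (unsigned long long)
	       should_invalidate(60, 20, 10, 50, 100));	/* -> 30 */
	return 0;
}
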
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 3a67ac0d913512f89e2ff61d2934af27155076cd..d52f30ac66bad2f536df6a99cac1f45af2de48d3 100644
@@ -97,7 +97,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
        struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
 
        if (ob->ec) {
-               ec_stripe_new_put(c, ob->ec);
+               ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
                return;
        }
 
@@ -658,9 +658,11 @@ static int add_new_bucket(struct bch_fs *c,
                bch_dev_bkey_exists(c, ob->dev)->mi.durability;
 
        BUG_ON(*nr_effective >= nr_replicas);
+       BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
 
        __clear_bit(ob->dev, devs_may_alloc->d);
-       *nr_effective   += durability;
+       *nr_effective   += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+               ? durability : 1;
        *have_cache     |= !durability;
 
        ob_push(c, ptrs, ob);
@@ -679,6 +681,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
                      unsigned nr_replicas,
                      unsigned *nr_effective,
                      bool *have_cache,
+                     unsigned flags,
                      enum bch_data_type data_type,
                      enum alloc_reserve reserve,
                      struct closure *cl)
@@ -729,7 +732,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 
                if (add_new_bucket(c, ptrs, devs_may_alloc,
                                   nr_replicas, nr_effective,
-                                  have_cache, 0, ob)) {
+                                  have_cache, flags, ob)) {
                        ret = 0;
                        break;
                }
@@ -796,7 +799,7 @@ got_bucket:
 
        ob->ec_idx      = ec_idx;
        ob->ec          = h->s;
-       ec_stripe_new_get(h->s);
+       ec_stripe_new_get(h->s, STRIPE_REF_io);
 
        ret = add_new_bucket(c, ptrs, devs_may_alloc,
                             nr_replicas, nr_effective,
@@ -823,7 +826,7 @@ static bool want_bucket(struct bch_fs *c,
                return false;
 
        if (!ca->mi.durability &&
-           (wp->data_type != BCH_DATA_user || !*have_cache))
+           (wp->data_type == BCH_DATA_btree || ec || *have_cache))
                return false;
 
        if (ec != (ob->ec != NULL))
@@ -877,6 +880,9 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
 
        spin_lock(&c->freelist_lock);
 
+       if (!c->open_buckets_partial_nr)
+               goto unlock;
+
        for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
                struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
 
@@ -902,7 +908,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
                                break;
                }
        }
-
+unlock:
        spin_unlock(&c->freelist_lock);
        return ret;
 }
@@ -967,7 +973,7 @@ retry_blocking:
                 */
                ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
                                        nr_replicas, nr_effective, have_cache,
-                                       wp->data_type, reserve, cl);
+                                       flags, wp->data_type, reserve, cl);
                if (ret &&
                    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
                    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
@@ -1017,45 +1023,96 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
        return ret < 0 ? ret : 0;
 }
 
-void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
-                               struct open_buckets *obs)
+static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
+                              struct bch_dev *ca, bool ec)
 {
-       struct open_buckets ptrs = { .nr = 0 };
-       struct open_bucket *ob, *ob2;
-       unsigned i, j;
-
-       open_bucket_for_each(c, obs, ob, i) {
-               bool drop = !ca || ob->dev == ca->dev_idx;
+       if (ec) {
+               return ob->ec != NULL;
+       } else if (ca) {
+               bool drop = ob->dev == ca->dev_idx;
+               struct open_bucket *ob2;
+               unsigned i;
 
                if (!drop && ob->ec) {
                        mutex_lock(&ob->ec->lock);
-                       for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
-                               if (!ob->ec->blocks[j])
+                       for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) {
+                               if (!ob->ec->blocks[i])
                                        continue;
 
-                               ob2 = c->open_buckets + ob->ec->blocks[j];
+                               ob2 = c->open_buckets + ob->ec->blocks[i];
                                drop |= ob2->dev == ca->dev_idx;
                        }
                        mutex_unlock(&ob->ec->lock);
                }
 
-               if (drop)
-                       bch2_open_bucket_put(c, ob);
-               else
-                       ob_push(c, &ptrs, ob);
+               return drop;
+       } else {
+               return true;
        }
-
-       *obs = ptrs;
 }
 
-void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
-                         struct write_point *wp)
+static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+                                bool ec, struct write_point *wp)
 {
+       struct open_buckets ptrs = { .nr = 0 };
+       struct open_bucket *ob;
+       unsigned i;
+
        mutex_lock(&wp->lock);
-       bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);
+       open_bucket_for_each(c, &wp->ptrs, ob, i)
+               if (should_drop_bucket(ob, c, ca, ec))
+                       bch2_open_bucket_put(c, ob);
+               else
+                       ob_push(c, &ptrs, ob);
+       wp->ptrs = ptrs;
        mutex_unlock(&wp->lock);
 }
 
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
+                           bool ec)
+{
+       unsigned i;
+
+       /* Next, close write points that point to this device... */
+       for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+               bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
+
+       bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
+       bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
+       bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
+
+       mutex_lock(&c->btree_reserve_cache_lock);
+       while (c->btree_reserve_cache_nr) {
+               struct btree_alloc *a =
+                       &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+               bch2_open_buckets_put(c, &a->ob);
+       }
+       mutex_unlock(&c->btree_reserve_cache_lock);
+
+       spin_lock(&c->freelist_lock);
+       i = 0;
+       while (i < c->open_buckets_partial_nr) {
+               struct open_bucket *ob =
+                       c->open_buckets + c->open_buckets_partial[i];
+
+               if (should_drop_bucket(ob, c, ca, ec)) {
+                       --c->open_buckets_partial_nr;
+                       swap(c->open_buckets_partial[i],
+                            c->open_buckets_partial[c->open_buckets_partial_nr]);
+                       ob->on_partial_list = false;
+                       spin_unlock(&c->freelist_lock);
+                       bch2_open_bucket_put(c, ob);
+                       spin_lock(&c->freelist_lock);
+               } else {
+                       i++;
+               }
+       }
+       spin_unlock(&c->freelist_lock);
+
+       bch2_ec_stop_dev(c, ca);
+}
+
 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
                                                 unsigned long write_point)
 {
@@ -1101,8 +1158,7 @@ static bool try_increase_writepoints(struct bch_fs *c)
        return true;
 }
 
-static bool try_decrease_writepoints(struct bch_fs *c,
-                                    unsigned old_nr)
+static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr)
 {
        struct write_point *wp;
 
@@ -1123,7 +1179,7 @@ static bool try_decrease_writepoints(struct bch_fs *c,
        hlist_del_rcu(&wp->node);
        mutex_unlock(&c->write_points_hash_lock);
 
-       bch2_writepoint_stop(c, NULL, wp);
+       bch2_writepoint_stop(c, NULL, false, wp);
        return true;
 }
 
@@ -1217,6 +1273,8 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
        int ret;
        int i;
 
+       BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
+
        BUG_ON(!nr_replicas || !nr_replicas_required);
 retry:
        ptrs.nr         = 0;
@@ -1230,13 +1288,7 @@ retry:
        if (wp->data_type != BCH_DATA_user)
                have_cache = true;
 
-       if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
-               ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
-                                             target, erasure_code,
-                                             nr_replicas, &nr_effective,
-                                             &have_cache, reserve,
-                                             flags, cl);
-       } else {
+       if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
                ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
                                              target, erasure_code,
                                              nr_replicas, &nr_effective,
@@ -1246,11 +1298,28 @@ retry:
                    bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto alloc_done;
 
+               /* Don't retry from all devices if we're out of open buckets: */
+               if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+                       goto allocate_blocking;
+
+               /*
+                * Only try to allocate cache (durability = 0 devices) from the
+                * specified target:
+                */
+               have_cache = true;
+
                ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
                                              0, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve,
                                              flags, cl);
+       } else {
+allocate_blocking:
+               ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+                                             target, erasure_code,
+                                             nr_replicas, &nr_effective,
+                                             &have_cache, reserve,
+                                             flags, cl);
        }
 alloc_done:
        BUG_ON(!ret && nr_effective < nr_replicas);
@@ -1380,14 +1449,16 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
 
 static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
 {
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
        unsigned data_type = ob->data_type;
        barrier(); /* READ_ONCE() doesn't work on bitfields */
 
-       prt_printf(out, "%zu ref %u %s %u:%llu gen %u",
+       prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
                   ob - c->open_buckets,
                   atomic_read(&ob->pin),
                   data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
-                  ob->dev, ob->bucket, ob->gen);
+                  ob->dev, ob->bucket, ob->gen,
+                  ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
        if (ob->ec)
                prt_printf(out, " ec idx %llu", ob->ec->idx);
        if (ob->on_partial_list)
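
The restructured bch2_alloc_sectors_start_trans() path above tries the specified target first, declines to widen when open buckets are exhausted, and otherwise retries from all devices while only allowing cache (durability 0) allocations from the target. A condensed sketch of that control flow, with alloc_from() as a hypothetical stand-in for open_bucket_add_buckets() and the error codes matching the real ones by name only:

#include <stdbool.h>
#include <stdio.h>

enum { OK = 0, ERR_RESTART, ERR_OPEN_BUCKETS_EMPTY, ERR_NO_DEVS };

/* Hypothetical stand-in for open_bucket_add_buckets(): */
static int alloc_from(unsigned target, bool *have_cache)
{
	(void) have_cache;
	return target ? ERR_NO_DEVS : OK;	/* pretend the target is exhausted */
}

static int start_alloc(unsigned target, bool only_specified_devs)
{
	bool have_cache = false;
	int ret;

	if (target && !only_specified_devs) {
		ret = alloc_from(target, &have_cache);
		if (ret == OK || ret == ERR_RESTART)
			return ret;

		/* Out of open buckets: don't retry from all devices */
		if (ret == ERR_OPEN_BUCKETS_EMPTY)
			return alloc_from(target, &have_cache);

		/* Only allocate cache (durability 0) from the target: */
		have_cache = true;
		return alloc_from(0, &have_cache);
	}

	return alloc_from(target, &have_cache);
}

int main(void)
{
	printf("%d\n", start_alloc(1, false));	/* 0: fell back to all devices */
	return 0;
}
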
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index e9b3b142d14d7df90718202c694aae07d41ac6fc..8a1cf425091b8aacd8438f1222b688074c60bffb 100644
@@ -151,7 +151,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
 
 int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
                      struct dev_stripe_state *, struct bch_devs_mask *,
-                     unsigned, unsigned *, bool *,
+                     unsigned, unsigned *, bool *, unsigned,
                      enum bch_data_type, enum alloc_reserve,
                      struct closure *);
 
@@ -202,11 +202,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
                                    struct bkey_i *, unsigned, bool);
 void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
 
-void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
-                               struct open_buckets *);
-
-void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
-                         struct write_point *);
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
 
 static inline struct write_point_specifier writepoint_hashed(unsigned long v)
 {
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index a40c26125d2aca45eacf6f71931ddbff307dcd1f..8517c56352264451315103848521ad9a09f9d26a 100644
@@ -549,13 +549,18 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
                  bch2_check_btree_backpointer(&trans, &iter, k)));
 }
 
+struct bpos_level {
+       unsigned        level;
+       struct bpos     pos;
+};
+
 static int check_bp_exists(struct btree_trans *trans,
                           struct bpos bucket_pos,
                           struct bch_backpointer bp,
                           struct bkey_s_c orig_k,
                           struct bpos bucket_start,
                           struct bpos bucket_end,
-                          struct bpos *last_flushed_pos)
+                          struct bpos_level *last_flushed)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter alloc_iter, bp_iter = { NULL };
@@ -600,8 +605,11 @@ static int check_bp_exists(struct btree_trans *trans,
 
        if (bp_k.k->type != KEY_TYPE_backpointer ||
            memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
-               if (!bpos_eq(*last_flushed_pos, orig_k.k->p)) {
-                       *last_flushed_pos = orig_k.k->p;
+               if (last_flushed->level != bp.level ||
+                   !bpos_eq(last_flushed->pos, orig_k.k->p)) {
+                       last_flushed->level = bp.level;
+                       last_flushed->pos = orig_k.k->p;
+
                        ret = bch2_btree_write_buffer_flush_sync(trans) ?:
                                -BCH_ERR_transaction_restart_write_buffer_flush;
                        goto out;
@@ -639,7 +647,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
                                        struct btree_iter *iter,
                                        struct bpos bucket_start,
                                        struct bpos bucket_end,
-                                       struct bpos *last_flushed_pos)
+                                       struct bpos_level *last_flushed)
 {
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs;
@@ -668,7 +676,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
 
                ret = check_bp_exists(trans, bucket_pos, bp, k,
                                      bucket_start, bucket_end,
-                                     last_flushed_pos);
+                                     last_flushed);
                if (ret)
                        return ret;
        }
@@ -680,7 +688,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
                                            enum btree_id btree_id,
                                            struct bpos bucket_start,
                                            struct bpos bucket_end,
-                                           struct bpos *last_flushed_pos)
+                                           struct bpos_level *last_flushed)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
@@ -709,12 +717,12 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
                if (p.ptr.cached)
                        continue;
 
-               bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1,
+               bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
                                      k, p, &bucket_pos, &bp);
 
                ret = check_bp_exists(trans, bucket_pos, bp, k,
                                      bucket_start, bucket_end,
-                                     last_flushed_pos);
+                                     last_flushed);
                if (ret)
                        goto err;
        }
@@ -794,7 +802,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 {
        struct btree_iter iter;
        enum btree_id btree_id;
-       struct bpos last_flushed_pos = SPOS_MAX;
+       struct bpos_level last_flushed = { UINT_MAX };
        int ret = 0;
 
        for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
@@ -811,7 +819,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
                                        BTREE_INSERT_NOFAIL,
                                        check_extent_to_backpointers(trans, &iter,
                                                                bucket_start, bucket_end,
-                                                               &last_flushed_pos));
+                                                               &last_flushed));
                        if (ret)
                                break;
                } while (!bch2_btree_iter_advance(&iter));
@@ -826,7 +834,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
                                BTREE_INSERT_NOFAIL,
                                check_btree_root_to_backpointers(trans, btree_id,
                                                        bucket_start, bucket_end,
-                                                       &last_flushed_pos));
+                                                       &last_flushed));
                if (ret)
                        break;
        }
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 25a32fd6c8f2a594b7fca3cd8dc0eaba4d003a8f..348ee8e8c01521a49b6d18c6e9cdf9726e01c486 100644
 #define BCH_WRITE_REF_DEBUG
 #endif
 
+#ifndef dynamic_fault
 #define dynamic_fault(...)             0
-#define race_fault(...)                        0
+#endif
+
+#define race_fault(...)                        dynamic_fault("bcachefs:race")
 
 #define trace_and_count(_c, _name, ...)                                        \
 do {                                                                   \
@@ -652,7 +655,6 @@ typedef struct {
        x(fallocate)                                                    \
        x(discard)                                                      \
        x(invalidate)                                                   \
-       x(move)                                                         \
        x(delete_dead_snapshots)                                        \
        x(snapshot_delete_pagecache)                                    \
        x(sysfs)
@@ -922,6 +924,13 @@ struct bch_fs {
 
        mempool_t               large_bkey_pool;
 
+       /* MOVE.C */
+       struct list_head        moving_context_list;
+       struct mutex            moving_context_lock;
+
+       struct list_head        data_progress_list;
+       struct mutex            data_progress_lock;
+
        /* REBALANCE */
        struct bch_fs_rebalance rebalance;
 
@@ -932,10 +941,6 @@ struct bch_fs {
        bool                    copygc_running;
        wait_queue_head_t       copygc_running_wq;
 
-       /* DATA PROGRESS STATS */
-       struct list_head        data_progress_list;
-       struct mutex            data_progress_lock;
-
        /* STRIPES: */
        GENRADIX(struct stripe) stripes;
        GENRADIX(struct gc_stripe) gc_stripes;
@@ -952,14 +957,14 @@ struct bch_fs {
 
        struct list_head        ec_stripe_new_list;
        struct mutex            ec_stripe_new_lock;
+       wait_queue_head_t       ec_stripe_new_wait;
 
        struct work_struct      ec_stripe_create_work;
        u64                     ec_stripe_hint;
 
-       struct bio_set          ec_bioset;
-
        struct work_struct      ec_stripe_delete_work;
-       struct llist_head       ec_stripe_delete_list;
+
+       struct bio_set          ec_bioset;
 
        /* REFLINK */
        u64                     reflink_hint;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 2d344993674980851625796832382aeb417fe751..0a3e560597f7ebe90778ed16d14986b1b042f85c 100644
@@ -16,7 +16,7 @@
 #include "replicas.h"
 #include "subvolume.h"
 
-#include <linux/prandom.h>
+#include <linux/random.h>
 #include <linux/prefetch.h>
 #include <trace/events/bcachefs.h>
 
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 298a674dbfd6e9963a624a410fe7d5a814954a80..27a73933878a2b7618f506f05ac39e745aa058c7 100644
@@ -770,11 +770,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
 
 bool bch2_btree_insert_key_cached(struct btree_trans *trans,
                                  unsigned flags,
-                                 struct btree_path *path,
-                                 struct bkey_i *insert)
+                                 struct btree_insert_entry *insert_entry)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_cached *ck = (void *) path->l[0].b;
+       struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
+       struct bkey_i *insert = insert_entry->k;
        bool kick_reclaim = false;
 
        BUG_ON(insert->k.u64s > ck->u64s);
@@ -802,9 +802,24 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
                        kick_reclaim = true;
        }
 
+       /*
+        * To minimize lock contention, we only add the journal pin here and
+        * defer pin updates to the flush callback via ->seq. Be careful not to
+        * update ->seq on nojournal commits because we don't want to update the
+        * pin to a seq that doesn't include journal updates on disk. Otherwise
+        * we risk losing the update after a crash.
+        *
+        * The only exception is if the pin is not active in the first place. We
+        * have to add the pin because journal reclaim drives key cache
+        * flushing. The flush callback will not proceed unless ->seq matches
+        * the latest pin, so make sure it starts with a consistent value.
+        */
+       if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
+           !journal_pin_active(&ck->journal)) {
+               ck->seq = trans->journal_res.seq;
+       }
        bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
                             &ck->journal, bch2_btree_key_cache_journal_flush);
-       ck->seq = trans->journal_res.seq;
 
        if (kick_reclaim)
                journal_reclaim_kick(&c->journal);
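
A condensed model of the ->seq rule described in the comment above: record the journal sequence only for journalled updates, or when no pin exists yet. The struct and helper are illustrative stand-ins, not the real key cache API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cached_key {
	bool		pin_active;
	uint64_t	seq;
};

static void insert_key_cached(struct cached_key *ck, uint64_t journal_seq,
			      bool nojournal)
{
	/* Don't point ->seq at a seq that may not include this update on disk: */
	if (!nojournal || !ck->pin_active)
		ck->seq = journal_seq;

	ck->pin_active = true;	/* bch2_journal_pin_add() in the real code */
}

int main(void)
{
	struct cached_key ck = { 0 };

	insert_key_cached(&ck, 10, false);	/* journalled: seq = 10 */
	insert_key_cached(&ck, 11, true);	/* nojournal: seq stays 10 */
	printf("%llu\n", (unsigned long long) ck.seq);	/* 10 */
	return 0;
}
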
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index c86d5e48f6e33fa9691362804f5537e5af436cbb..be3acde2caa09d65ec8746e20583c6ad840711b3 100644
@@ -30,7 +30,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
                                    unsigned);
 
 bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
-                       struct btree_path *, struct bkey_i *);
+                       struct btree_insert_entry *);
 int bch2_btree_key_cache_flush(struct btree_trans *,
                               enum btree_id, struct bpos);
 void bch2_btree_key_cache_drop(struct btree_trans *,
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index 0032d0eb05a59c15a520467b86b7d971a2005785..b99986653adefe24ac4a312ab8d960194307d958 100644
@@ -388,6 +388,40 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *p
        return ret;
 }
 
+void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+                                      struct btree_path *path,
+                                      struct btree_bkey_cached_common *b)
+{
+       struct btree_path *linked;
+       unsigned i;
+       int ret;
+
+       /*
+        * XXX BIG FAT NOTICE
+        *
+        * Drop all read locks before taking a write lock:
+        *
+        * This is a hack, because bch2_btree_node_lock_write_nofail() is a
+        * hack - but by dropping read locks first, this should never fail, and
+        * we only use this in code paths where whatever read locks we've
+        * already taken are no longer needed:
+        */
+
+       trans_for_each_path(trans, linked) {
+               if (!linked->nodes_locked)
+                       continue;
+
+               for (i = 0; i < BTREE_MAX_DEPTH; i++)
+                       if (btree_node_read_locked(linked, i)) {
+                               btree_node_unlock(trans, linked, i);
+                               btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
+                       }
+       }
+
+       ret = __btree_node_lock_write(trans, path, b, true);
+       BUG_ON(ret);
+}
+
 /* relock */
 
 static inline bool btree_path_get_locks(struct btree_trans *trans,
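
A toy model of the reasoning in the comment above: a writer cannot take the lock while read holds (including our own) exist, so dropping our own read locks first makes the nofail write acquisition safe to assert on. This is a deliberately simplified rwlock-style count, not the real six lock:

#include <assert.h>
#include <stdbool.h>

struct toy_lock { int readers; bool writer; };

static bool toy_trylock_write(struct toy_lock *l)
{
	if (l->readers || l->writer)
		return false;
	l->writer = true;
	return true;
}

int main(void)
{
	struct toy_lock l = { .readers = 1 };	/* our own read lock */

	assert(!toy_trylock_write(&l));	/* would deadlock against ourselves */
	l.readers--;			/* the btree_node_unlock() step above */
	assert(toy_trylock_write(&l));	/* now it can't fail */
	return 0;
}
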
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index bd658e5c865f88d7e5583f7b50ebe350ddd725b0..327780ce8e9ae0772e325dedd4b331684962d3ad 100644
@@ -299,15 +299,6 @@ static inline int __btree_node_lock_write(struct btree_trans *trans,
                : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
 }
 
-static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
-                                             struct btree_path *path,
-                                             struct btree_bkey_cached_common *b)
-{
-       int ret = __btree_node_lock_write(trans, path, b, true);
-
-       BUG_ON(ret);
-}
-
 static inline int __must_check
 bch2_btree_node_lock_write(struct btree_trans *trans,
                           struct btree_path *path,
@@ -316,6 +307,10 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
        return __btree_node_lock_write(trans, path, b, false);
 }
 
+void bch2_btree_node_lock_write_nofail(struct btree_trans *,
+                                      struct btree_path *,
+                                      struct btree_bkey_cached_common *);
+
 /* relock: */
 
 bool bch2_btree_path_relock_norestart(struct btree_trans *,
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index ee1d15931022f42a4331d48e70d0f1ffaffee501..46fb4a9ed29584c82d6fe1fb923925252daa5847 100644
@@ -13,6 +13,9 @@ void bch2_btree_node_prep_for_write(struct btree_trans *,
 bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
                                struct btree *, struct btree_node_iter *,
                                struct bkey_i *);
+
+int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
+int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
 void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
 
 void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index c93c132dd815d963360dfe5060dd74925e3ba915..629e528899d99777a7090fa820ab3ab066ac9bf4 100644
@@ -227,12 +227,12 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        return 0;
 }
 
-static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
        return __btree_node_flush(j, pin, 0, seq);
 }
 
-static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
        return __btree_node_flush(j, pin, 1, seq);
 }
@@ -244,8 +244,8 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
 
        bch2_journal_pin_add(&c->journal, seq, &w->journal,
                             btree_node_write_idx(b) == 0
-                            ? btree_node_flush0
-                            : btree_node_flush1);
+                            ? bch2_btree_node_flush0
+                            : bch2_btree_node_flush1);
 }
 
 /**
@@ -765,7 +765,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                if (!i->cached)
                        btree_insert_key_leaf(trans, i);
                else if (!i->key_cache_already_flushed)
-                       bch2_btree_insert_key_cached(trans, flags, i->path, i->k);
+                       bch2_btree_insert_key_cached(trans, flags, i);
                else {
                        bch2_btree_key_cache_drop(trans, i->path);
                        btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 6805f2c0f08a23fde10324fec3f56ef481b18df3..1bcef419cfabedd8b8f777df3a51105200dbdc45 100644
@@ -1855,7 +1855,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
        if (IS_ERR(a))
                return PTR_ERR(a);
 
-       if (a->v.data_type && a->v.data_type != type) {
+       if (a->v.data_type && type && a->v.data_type != type) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index eb91e24c3157c8bb8c25aa98f699afb9849f43c8..e414d1afd434a148bb80e0d511c1e3b84af54002 100644
@@ -92,18 +92,6 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans,
        return ret;
 }
 
-static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev)
-{
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-       struct bch_extent_ptr *ptr;
-
-       bkey_for_each_ptr(ptrs, ptr)
-               if (ptr->dev == dev) {
-                       bch2_extent_ptr_set_cached(k, ptr);
-                       return;
-               }
-}
-
 static int __bch2_data_update_index_update(struct btree_trans *trans,
                                           struct bch_write_op *op)
 {
@@ -126,15 +114,17 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
        while (1) {
                struct bkey_s_c k;
                struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
-               struct bkey_i *insert;
+               struct bkey_i *insert = NULL;
                struct bkey_i_extent *new;
-               const union bch_extent_entry *entry;
+               const union bch_extent_entry *entry_c;
+               union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
+               struct bch_extent_ptr *ptr;
+               const struct bch_extent_ptr *ptr_c;
                struct bpos next_pos;
-               bool did_work = false;
                bool should_check_enospc;
                s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-               unsigned i;
+               unsigned rewrites_found = 0, durability, i;
 
                bch2_trans_begin(trans);
 
@@ -146,7 +136,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
                new = bkey_i_to_extent(bch2_keylist_front(keys));
 
                if (!bch2_extents_match(k, old))
-                       goto nomatch;
+                       goto nowork;
 
                bkey_reassemble(_insert.k, k);
                insert = _insert.k;
@@ -169,50 +159,60 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
                 * First, drop rewrite_ptrs from @new:
                 */
                i = 0;
-               bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+               bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
                        if (((1U << i) & m->data_opts.rewrite_ptrs) &&
-                           bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) {
-                               /*
-                                * If we're going to be adding a pointer to the
-                                * same device, we have to drop the old one -
-                                * otherwise, we can just mark it cached:
-                                */
-                               if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev))
-                                       bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev);
-                               else
-                                       bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev);
+                           (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+                           !ptr->cached) {
+                               bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+                               rewrites_found |= 1U << i;
                        }
                        i++;
                }
 
+               if (m->data_opts.rewrite_ptrs &&
+                   !rewrites_found &&
+                   bch2_bkey_durability(c, k) >= m->op.opts.data_replicas)
+                       goto nowork;
 
-               /* Add new ptrs: */
-               extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
-                       const struct bch_extent_ptr *existing_ptr =
-                               bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev);
-
-                       if (existing_ptr && existing_ptr->cached) {
-                               /*
-                                * We're replacing a cached pointer with a non
-                                * cached pointer:
-                                */
-                               bch2_bkey_drop_device_noerror(bkey_i_to_s(insert),
-                                                             existing_ptr->dev);
-                       } else if (existing_ptr) {
-                               /*
-                                * raced with another move op? extent already
-                                * has a pointer to the device we just wrote
-                                * data to
-                                */
-                               continue;
+               /*
+                * A replica that we just wrote might conflict with a replica
+                * that we want to keep, due to racing with another move:
+                */
+restart_drop_conflicting_replicas:
+               extent_for_each_ptr(extent_i_to_s(new), ptr)
+                       if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
+                           !ptr_c->cached) {
+                               bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
+                               goto restart_drop_conflicting_replicas;
                        }
 
-                       bch2_extent_ptr_decoded_append(insert, &p);
-                       did_work = true;
+               if (!bkey_val_u64s(&new->k))
+                       goto nowork;
+
+               /* Now, drop pointers that conflict with what we just wrote: */
+               extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+                       if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
+                               bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+
+               durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
+                       bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
+
+               /* Now, drop excess replicas: */
+restart_drop_extra_replicas:
+               bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+                       unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+
+                       if (!p.ptr.cached &&
+                           durability - ptr_durability >= m->op.opts.data_replicas) {
+                               durability -= ptr_durability;
+                               bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+                               goto restart_drop_extra_replicas;
+                       }
                }
 
-               if (!did_work)
-                       goto nomatch;
+               /* Finally, add the pointers we just wrote: */
+               extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+                       bch2_extent_ptr_decoded_append(insert, &p);
 
                bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
                bch2_extent_normalize(c, bkey_i_to_s(insert));
@@ -253,6 +253,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(trans, &op->res,
                                NULL,
+                               BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_NOFAIL|
                                m->data_opts.btree_insert_flags);
                if (!ret) {
@@ -273,7 +274,7 @@ next:
                                goto out;
                }
                continue;
-nomatch:
+nowork:
                if (m->ctxt && m->ctxt->stats) {
                        BUG_ON(k.k->p.offset <= iter.pos.offset);
                        atomic64_inc(&m->ctxt->stats->keys_raced);
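
A toy version of the "drop excess replicas" pass above: keep marking pointers cached while the remaining durability still meets data_replicas, restarting the scan after each drop just as the goto does. Plain arrays replace the real extent and pointer machinery:

#include <stdbool.h>
#include <stdio.h>

#define NR_PTRS 4

int main(void)
{
	unsigned ptr_durability[NR_PTRS] = { 1, 1, 1, 1 };
	bool cached[NR_PTRS] = { false };
	unsigned data_replicas = 2;
	unsigned durability = 4, i;
	bool again;

	do {
		again = false;
		for (i = 0; i < NR_PTRS; i++) {
			if (!cached[i] &&
			    durability - ptr_durability[i] >= data_replicas) {
				durability -= ptr_durability[i];
				cached[i] = true;
				again = true;	/* restart, like the goto above */
				break;
			}
		}
	} while (again);

	printf("durability %u, cached ptrs: ", durability);
	for (i = 0; i < NR_PTRS; i++)
		printf("%d", cached[i]);
	printf("\n");	/* durability 2, cached ptrs: 1100 */
	return 0;
}
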
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 7d43fd4a6bb79a2727ba35c4883ba4b278f5a0cd..09c6f93c123494fdaf891581cb05d2ee25872d2a 100644
@@ -659,14 +659,13 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
 static u64 stripe_idx_to_delete(struct bch_fs *c)
 {
        ec_stripes_heap *h = &c->ec_stripes_heap;
-       size_t heap_idx;
 
        lockdep_assert_held(&c->ec_stripes_heap_lock);
 
-       for (heap_idx = 0; heap_idx < h->used; heap_idx++)
-               if (h->data[heap_idx].blocks_nonempty == 0 &&
-                   !bch2_stripe_is_open(c, h->data[heap_idx].idx))
-                       return h->data[heap_idx].idx;
+       if (h->used &&
+           h->data[0].blocks_nonempty == 0 &&
+           !bch2_stripe_is_open(c, h->data[0].idx))
+               return h->data[0].idx;
 
        return 0;
 }
@@ -959,7 +958,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
        bkey_reassemble(n, k);
 
        bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
-       ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev);
+       ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
        BUG_ON(!ec_ptr);
 
        stripe_ptr = (struct bch_extent_stripe_ptr) {
@@ -990,6 +989,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 
        while (1) {
                ret = commit_do(trans, NULL, NULL,
+                               BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_NOFAIL,
                        ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
                                                s, &bp_offset));
@@ -1057,6 +1057,13 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
                s->err = ret;
 }
 
+void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
+{
+       if (s->idx)
+               bch2_stripe_close(c, s);
+       kfree(s);
+}
+
 /*
  * data buckets of new stripe all written: create the stripe
  */
@@ -1072,13 +1079,15 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
        closure_sync(&s->iodone);
 
-       for (i = 0; i < nr_data; i++)
-               if (s->blocks[i]) {
-                       ob = c->open_buckets + s->blocks[i];
+       if (!s->err) {
+               for (i = 0; i < nr_data; i++)
+                       if (s->blocks[i]) {
+                               ob = c->open_buckets + s->blocks[i];
 
-                       if (ob->sectors_free)
-                               zero_out_rest_of_ec_bucket(c, s, i, ob);
-               }
+                               if (ob->sectors_free)
+                                       zero_out_rest_of_ec_bucket(c, s, i, ob);
+                       }
+       }
 
        if (s->err) {
                if (!bch2_err_matches(s->err, EROFS))
@@ -1119,7 +1128,9 @@ static void ec_stripe_create(struct ec_stripe_new *s)
                goto err;
        }
 
-       ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
+       ret = bch2_trans_do(c, &s->res, NULL,
+                           BTREE_INSERT_NOCHECK_RW|
+                           BTREE_INSERT_NOFAIL,
                            ec_stripe_key_update(&trans, &s->new_stripe.key,
                                                 !s->have_existing_stripe));
        if (ret) {
@@ -1152,13 +1163,11 @@ err:
        list_del(&s->list);
        mutex_unlock(&c->ec_stripe_new_lock);
 
-       if (s->idx)
-               bch2_stripe_close(c, s);
-
        ec_stripe_buf_exit(&s->existing_stripe);
        ec_stripe_buf_exit(&s->new_stripe);
        closure_debug_destroy(&s->iodone);
-       kfree(s);
+
+       ec_stripe_new_put(c, s, STRIPE_REF_stripe);
 }
 
 static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
@@ -1167,7 +1176,7 @@ static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
 
        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry(s, &c->ec_stripe_new_list, list)
-               if (!atomic_read(&s->pin))
+               if (!atomic_read(&s->ref[STRIPE_REF_io]))
                        goto out;
        s = NULL;
 out:
@@ -1209,7 +1218,7 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
        list_add(&s->list, &c->ec_stripe_new_list);
        mutex_unlock(&c->ec_stripe_new_lock);
 
-       ec_stripe_new_put(c, s);
+       ec_stripe_new_put(c, s, STRIPE_REF_io);
 }
 
 void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
@@ -1321,7 +1330,8 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
 
        mutex_init(&s->lock);
        closure_init(&s->iodone, NULL);
-       atomic_set(&s->pin, 1);
+       atomic_set(&s->ref[STRIPE_REF_stripe], 1);
+       atomic_set(&s->ref[STRIPE_REF_io], 1);
        s->c            = c;
        s->h            = h;
        s->nr_data      = min_t(unsigned, h->nr_active_devs,
@@ -1402,6 +1412,11 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans,
        if (ret)
                return ERR_PTR(ret);
 
+       if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+               h = ERR_PTR(-EROFS);
+               goto found;
+       }
+
        list_for_each_entry(h, &c->ec_stripe_head_list, list)
                if (h->target           == target &&
                    h->algo             == algo &&
@@ -1451,7 +1466,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
                                            &devs,
                                            h->s->nr_parity,
                                            &nr_have_parity,
-                                           &have_cache,
+                                           &have_cache, 0,
                                            BCH_DATA_parity,
                                            reserve,
                                            cl);
@@ -1478,7 +1493,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
                                            &devs,
                                            h->s->nr_data,
                                            &nr_have_data,
-                                           &have_cache,
+                                           &have_cache, 0,
                                            BCH_DATA_user,
                                            reserve,
                                            cl);
@@ -1706,6 +1721,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
                if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
                        goto err;
 
+               if (reserve == RESERVE_movinggc) {
+                       ret =   new_stripe_alloc_buckets(trans, h, reserve, NULL) ?:
+                               __bch2_ec_stripe_head_reserve(trans, h);
+                       if (ret)
+                               goto err;
+                       goto allocate_buf;
+               }
+
                /* XXX freelist_wait? */
                closure_wait(&c->freelist_wait, cl);
                waiting = true;
@@ -1738,7 +1761,7 @@ err:
        return ERR_PTR(ret);
 }
 
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
 {
        struct ec_stripe_head *h;
        struct open_bucket *ob;
@@ -1746,11 +1769,13 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
 
        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-
                mutex_lock(&h->lock);
                if (!h->s)
                        goto unlock;
 
+               if (!ca)
+                       goto found;
+
                for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
                        if (!h->s->blocks[i])
                                continue;
@@ -1769,6 +1794,32 @@ unlock:
        mutex_unlock(&c->ec_stripe_head_lock);
 }
 
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+       __bch2_ec_stop(c, ca);
+}
+
+void bch2_fs_ec_stop(struct bch_fs *c)
+{
+       __bch2_ec_stop(c, NULL);
+}
+
+static bool bch2_fs_ec_flush_done(struct bch_fs *c)
+{
+       bool ret;
+
+       mutex_lock(&c->ec_stripe_new_lock);
+       ret = list_empty(&c->ec_stripe_new_list);
+       mutex_unlock(&c->ec_stripe_new_lock);
+
+       return ret;
+}
+
+void bch2_fs_ec_flush(struct bch_fs *c)
+{
+       wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
+}
+
 int bch2_stripes_read(struct bch_fs *c)
 {
        struct btree_trans trans;
@@ -1821,13 +1872,16 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
        size_t i;
 
        mutex_lock(&c->ec_stripes_heap_lock);
-       for (i = 0; i < min_t(size_t, h->used, 20); i++) {
+       for (i = 0; i < min_t(size_t, h->used, 50); i++) {
                m = genradix_ptr(&c->stripes, h->data[i].idx);
 
-               prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx,
+               prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
                       h->data[i].blocks_nonempty,
                       m->nr_blocks - m->nr_redundant,
                       m->nr_redundant);
+               if (bch2_stripe_is_open(c, h->data[i].idx))
+                       prt_str(out, " open");
+               prt_newline(out);
        }
        mutex_unlock(&c->ec_stripes_heap_lock);
 }
@@ -1839,22 +1893,27 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
 
        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-               prt_printf(out, "target %u algo %u redundancy %u:\n",
-                      h->target, h->algo, h->redundancy);
+               prt_printf(out, "target %u algo %u redundancy %u %s:\n",
+                      h->target, h->algo, h->redundancy,
+                      bch2_alloc_reserves[h->reserve]);
 
                if (h->s)
-                       prt_printf(out, "\tpending: idx %llu blocks %u+%u allocated %u\n",
+                       prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
                               h->s->idx, h->s->nr_data, h->s->nr_parity,
                               bitmap_weight(h->s->blocks_allocated,
                                             h->s->nr_data));
        }
        mutex_unlock(&c->ec_stripe_head_lock);
 
+       prt_printf(out, "in flight:\n");
+
        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry(s, &c->ec_stripe_new_list, list) {
-               prt_printf(out, "\tin flight: idx %llu blocks %u+%u pin %u\n",
+               prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
                           s->idx, s->nr_data, s->nr_parity,
-                          atomic_read(&s->pin));
+                          atomic_read(&s->ref[STRIPE_REF_io]),
+                          atomic_read(&s->ref[STRIPE_REF_stripe]),
+                          bch2_alloc_reserves[s->h->reserve]);
        }
        mutex_unlock(&c->ec_stripe_new_lock);
 }
@@ -1892,14 +1951,22 @@ void bch2_fs_ec_exit(struct bch_fs *c)
 
 void bch2_fs_ec_init_early(struct bch_fs *c)
 {
+       spin_lock_init(&c->ec_stripes_new_lock);
+       mutex_init(&c->ec_stripes_heap_lock);
+
+       INIT_LIST_HEAD(&c->ec_stripe_head_list);
+       mutex_init(&c->ec_stripe_head_lock);
+
+       INIT_LIST_HEAD(&c->ec_stripe_new_list);
+       mutex_init(&c->ec_stripe_new_lock);
+       init_waitqueue_head(&c->ec_stripe_new_wait);
+
        INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
        INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
 }
 
 int bch2_fs_ec_init(struct bch_fs *c)
 {
-       spin_lock_init(&c->ec_stripes_new_lock);
-
        return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
                           BIOSET_NEED_BVECS);
 }
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index d112aea9ec5632f0339532fd39c3e2407fceedc2..7c08a49d741956534a229e92e2d787ceacb3b407 100644
@@ -143,6 +143,12 @@ struct ec_stripe_buf {
 
 struct ec_stripe_head;
 
+enum ec_stripe_ref {
+       STRIPE_REF_io,
+       STRIPE_REF_stripe,
+       STRIPE_REF_NR
+};
+
 struct ec_stripe_new {
        struct bch_fs           *c;
        struct ec_stripe_head   *h;
@@ -154,8 +160,7 @@ struct ec_stripe_new {
 
        struct closure          iodone;
 
-       /* counts in flight writes, stripe is created when pin == 0 */
-       atomic_t                pin;
+       atomic_t                ref[STRIPE_REF_NR];
 
        int                     err;
 
@@ -213,24 +218,35 @@ void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
 
 void bch2_do_stripe_deletes(struct bch_fs *);
 void bch2_ec_do_stripe_creates(struct bch_fs *);
+void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
 
-static inline void ec_stripe_new_get(struct ec_stripe_new *s)
+static inline void ec_stripe_new_get(struct ec_stripe_new *s,
+                                    enum ec_stripe_ref ref)
 {
-       atomic_inc(&s->pin);
+       atomic_inc(&s->ref[ref]);
 }
 
-static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s)
+static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
+                                    enum ec_stripe_ref ref)
 {
-       BUG_ON(atomic_read(&s->pin) <= 0);
-       BUG_ON(!s->err && !s->idx);
-
-       if (atomic_dec_and_test(&s->pin))
-               bch2_ec_do_stripe_creates(c);
+       BUG_ON(atomic_read(&s->ref[ref]) <= 0);
+
+       if (atomic_dec_and_test(&s->ref[ref]))
+               switch (ref) {
+               case STRIPE_REF_stripe:
+                       bch2_ec_stripe_new_free(c, s);
+                       break;
+               case STRIPE_REF_io:
+                       bch2_ec_do_stripe_creates(c);
+                       break;
+               default:
+                       unreachable();
+               }
 }
 
 void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
-
-void bch2_ec_flush_new_stripes(struct bch_fs *);
+void bch2_fs_ec_stop(struct bch_fs *);
+void bch2_fs_ec_flush(struct bch_fs *);
 
 int bch2_stripes_read(struct bch_fs *);
 
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 4fc581be7aaf545ab8ebd8b5d1f018bb768f62c0..e2c09ea4a3e013bd9bb55c2518da017e59dd69c7 100644
@@ -26,8 +26,6 @@
 
 #include <trace/events/bcachefs.h>
 
-static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
-
 static unsigned bch2_crc_field_size_max[] = {
        [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
        [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -512,7 +510,7 @@ restart_narrow_pointers:
 
        bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
                if (can_narrow_crc(p.crc, n)) {
-                       __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+                       bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
                        p.ptr.offset += p.crc.offset;
                        p.crc = n;
                        bch2_extent_ptr_decoded_append(k, &p);
@@ -765,8 +763,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
 /*
  * Returns pointer to the next entry after the one being dropped:
  */
-static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
-                                          struct bch_extent_ptr *ptr)
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
+                                                  struct bch_extent_ptr *ptr)
 {
        struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
        union bch_extent_entry *entry = to_entry(ptr), *next;
@@ -809,7 +807,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
 {
        bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
        union bch_extent_entry *ret =
-               __bch2_bkey_drop_ptr(k, ptr);
+               bch2_bkey_drop_ptr_noerror(k, ptr);
 
        /*
         * If we deleted all the dirty pointers and there's still cached
@@ -840,14 +838,13 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
 
 void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
 {
-       struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev);
+       struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
 
        if (ptr)
-               __bch2_bkey_drop_ptr(k, ptr);
+               bch2_bkey_drop_ptr_noerror(k, ptr);
 }
 
-const struct bch_extent_ptr *
-bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr;
@@ -922,11 +919,11 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
        }
 }
 
-bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
-                        struct bkey_s_c k2)
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
 {
-       struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
-       const union bch_extent_entry *entry2;
+       struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
+       union bch_extent_entry *entry2;
        struct extent_ptr_decoded p2;
 
        bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
@@ -934,9 +931,9 @@ bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
                    p1.ptr.gen          == p2.ptr.gen &&
                    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
                    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
-                       return true;
+                       return &entry2->ptr;
 
-       return false;
+       return NULL;
 }
 
 void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
@@ -992,6 +989,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        struct bch_dev *ca;
        bool first = true;
 
+       if (c)
+               prt_printf(out, "durability: %u ", bch2_bkey_durability(c, k));
+
        bkey_extent_entry_for_each(ptrs, entry) {
                if (!first)
                        prt_printf(out, " ");
index bac6a1ed2c599a945e915c23b0772c2221887012..9b026ae95932f382d5b6c7aacd34b38d628c059f 100644 (file)
@@ -613,14 +613,21 @@ unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
 
 void bch2_bkey_drop_device(struct bkey_s, unsigned);
 void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
+
+static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
+{
+       return (void *) bch2_bkey_has_device_c(k.s_c, dev);
+}
+
 bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
 
 void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
 
 static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
 {
-       EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
+       EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
 
        switch (k->k.type) {
        case KEY_TYPE_btree_ptr:
@@ -642,6 +649,8 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr
 
 void bch2_extent_ptr_decoded_append(struct bkey_i *,
                                    struct extent_ptr_decoded *);
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
+                                                  struct bch_extent_ptr *);
 union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
                                           struct bch_extent_ptr *);
 
@@ -665,7 +674,8 @@ do {                                                                        \
 bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
                           struct bch_extent_ptr, u64);
 bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
-bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c);
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
 
 void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
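
Note on the `bch2_bkey_has_device_c()` / `bch2_bkey_has_device()` pair above: this is the usual C idiom for keeping one const-correct implementation plus a thin non-const wrapper that casts the result back when the caller holds a mutable key. A self-contained sketch of the idiom, with made-up types (`keylist`, `find_dev_c`) rather than the bkey machinery:

#include <stdio.h>

struct item { unsigned dev; };
struct keylist { struct item items[4]; unsigned nr; };

/* One real implementation, written against const: */
static const struct item *find_dev_c(const struct keylist *k, unsigned dev)
{
	for (unsigned i = 0; i < k->nr; i++)
		if (k->items[i].dev == dev)
			return &k->items[i];
	return NULL;
}

/* Non-const wrapper: the cast is safe because the caller passed a mutable list: */
static struct item *find_dev(struct keylist *k, unsigned dev)
{
	return (struct item *) find_dev_c(k, dev);
}

int main(void)
{
	struct keylist k = { .items = { { 1 }, { 3 } }, .nr = 2 };
	struct item *i = find_dev(&k, 3);

	if (i)
		i->dev = 4;	/* mutation is legal here */
	printf("%u\n", k.items[1].dev);
	return 0;
}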
 
index e232f331ae9a40f0cf65d034df6e53bf374ab9db..5e6dc6c316d12052d0bdaf85d8d7bab2a130efbe 100644 (file)
@@ -954,11 +954,11 @@ static int check_inode(struct btree_trans *trans,
                                     iter->pos.snapshot),
                                POS(u.bi_inum, U64_MAX),
                                0, NULL);
-               if (ret) {
+               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        bch_err(c, "error in fsck: error truncating inode: %s",
                                bch2_err_str(ret));
+               if (ret)
                        return ret;
-               }
 
                /*
                 * We truncated without our normal sector accounting hook, just
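
Note on the check_inode() hunk: the change narrows the log message rather than the error path. Transaction restarts are expected and retried by the caller, so they still propagate but are no longer reported as fsck errors. A toy version of that filter; the error constants here are invented for illustration, not the real bch2_err_matches() classes:

#include <stdio.h>
#include <stdbool.h>

#define ERR_transaction_restart	1
#define ERR_io			2

static bool err_matches(int err, int class) { return err == class; }

static int truncate_step(int simulated_err)
{
	int ret = simulated_err;

	if (ret && !err_matches(ret, ERR_transaction_restart))
		fprintf(stderr, "error truncating inode: %d\n", ret);
	if (ret)
		return ret;	/* restarts propagate, but silently */
	return 0;
}

int main(void)
{
	truncate_step(ERR_transaction_restart);	/* silent */
	truncate_step(ERR_io);			/* logged */
	return 0;
}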
index ea0fd6310b6e09353fd0eef999abffaa90889cf5..76856bfd6dc5d17b9ee37477112e0d9ab080b510 100644 (file)
@@ -218,7 +218,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
 
        bch2_trans_copy_iter(&iter, extent_iter);
 
-       for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
+       for_each_btree_key_upto_continue_norestart(iter,
+                               new->k.p, BTREE_ITER_SLOTS, old, ret) {
                s64 sectors = min(new->k.p.offset, old.k->p.offset) -
                        max(bkey_start_offset(&new->k),
                            bkey_start_offset(old.k));
@@ -705,7 +706,8 @@ static void bch2_write_done(struct closure *cl)
        struct bch_fs *c = op->c;
 
        bch2_disk_reservation_put(c, &op->res);
-       bch2_write_ref_put(c, BCH_WRITE_REF_write);
+       if (!(op->flags & BCH_WRITE_MOVE))
+               bch2_write_ref_put(c, BCH_WRITE_REF_write);
        bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
        bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
@@ -834,36 +836,30 @@ static void bch2_write_index(struct closure *cl)
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct write_point *wp = op->wp;
        struct workqueue_struct *wq = index_update_wq(op);
+       unsigned long flags;
 
        if ((op->flags & BCH_WRITE_DONE) &&
            (op->flags & BCH_WRITE_MOVE))
                bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
 
-       barrier();
-
-       /*
-        * We're not using wp->writes_lock here, so this is racy: that's ok,
-        * because this is just for diagnostic purposes, and we're running out
-        * of interrupt context here so if we were to take the lock we'd have to
-        * switch to spin_lock_irq()/irqsave(), which is not free:
-        */
+       spin_lock_irqsave(&wp->writes_lock, flags);
        if (wp->state == WRITE_POINT_waiting_io)
                __wp_update_state(wp, WRITE_POINT_waiting_work);
+       list_add_tail(&op->wp_list, &wp->writes);
+       spin_unlock_irqrestore(&wp->writes_lock, flags);
 
-       op->btree_update_ready = true;
        queue_work(wq, &wp->index_update_work);
 }
 
 static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
 {
-       op->btree_update_ready = false;
        op->wp = wp;
 
-       spin_lock(&wp->writes_lock);
-       list_add_tail(&op->wp_list, &wp->writes);
-       if (wp->state == WRITE_POINT_stopped)
+       if (wp->state == WRITE_POINT_stopped) {
+               spin_lock_irq(&wp->writes_lock);
                __wp_update_state(wp, WRITE_POINT_waiting_io);
-       spin_unlock(&wp->writes_lock);
+               spin_unlock_irq(&wp->writes_lock);
+       }
 }
 
 void bch2_write_point_do_index_updates(struct work_struct *work)
@@ -873,16 +869,12 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
        struct bch_write_op *op;
 
        while (1) {
-               spin_lock(&wp->writes_lock);
-               list_for_each_entry(op, &wp->writes, wp_list)
-                       if (op->btree_update_ready) {
-                               list_del(&op->wp_list);
-                               goto unlock;
-                       }
-               op = NULL;
-unlock:
+               spin_lock_irq(&wp->writes_lock);
+               op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+               if (op)
+                       list_del(&op->wp_list);
                wp_update_state(wp, op != NULL);
-               spin_unlock(&wp->writes_lock);
+               spin_unlock_irq(&wp->writes_lock);
 
                if (!op)
                        break;
@@ -1673,7 +1665,6 @@ static void __bch2_write(struct bch_write_op *op)
        }
 again:
        memset(&op->failed, 0, sizeof(op->failed));
-       op->btree_update_ready = false;
 
        do {
                struct bkey_i *key_to_write;
@@ -1853,7 +1844,12 @@ void bch2_write(struct closure *cl)
                goto err;
        }
 
-       if (c->opts.nochanges ||
+       if (c->opts.nochanges) {
+               op->error = -BCH_ERR_erofs_no_writes;
+               goto err;
+       }
+
+       if (!(op->flags & BCH_WRITE_MOVE) &&
            !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
                op->error = -BCH_ERR_erofs_no_writes;
                goto err;
@@ -1881,6 +1877,28 @@ err:
                op->end_io(op);
 }
 
+const char * const bch2_write_flags[] = {
+#define x(f)   #f,
+       BCH_WRITE_FLAGS()
+#undef x
+       NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+       prt_str(out, "pos: ");
+       bch2_bpos_to_text(out, op->pos);
+       prt_newline(out);
+
+       prt_str(out, "started: ");
+       bch2_pr_time_units(out, local_clock() - op->start_time);
+       prt_newline(out);
+
+       prt_str(out, "flags: ");
+       prt_bitflags(out, bch2_write_flags, op->flags);
+       prt_newline(out);
+}
+
 /* Cache promotion on read */
 
 struct promote_op {
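
Note on the new bch2_write_op_to_text(): it leans on the NULL-terminated bch2_write_flags[] table to render op->flags symbolically. Below is a standalone approximation of that bit-name printer; prt_bitflags() and the printbuf API are bcachefs-internal, so plain stdio stands in, and the flag names shown are just a subset:

#include <stdio.h>

static const char * const flag_names[] = {
	"ALLOC_NOWAIT", "CACHED", "SYNC", "MOVE",
	NULL
};

static void print_bitflags(const char * const names[], unsigned flags)
{
	int first = 1;

	for (unsigned i = 0; names[i]; i++)
		if (flags & (1U << i)) {
			printf("%s%s", first ? "" : ",", names[i]);
			first = 0;
		}
	putchar('\n');
}

int main(void)
{
	print_bitflags(flag_names, (1U << 1) | (1U << 3));	/* CACHED,MOVE */
	return 0;
}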
index 166ad68177400176b9fa04ae8a23d94934b12941..90948bb0aabd62b8be5e69ce7d2add70f0d56ec1 100644 (file)
@@ -28,41 +28,34 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
 
 const char *bch2_blk_status_to_str(blk_status_t);
 
-enum bch_write_flags {
-       __BCH_WRITE_ALLOC_NOWAIT,
-       __BCH_WRITE_CACHED,
-       __BCH_WRITE_DATA_ENCODED,
-       __BCH_WRITE_PAGES_STABLE,
-       __BCH_WRITE_PAGES_OWNED,
-       __BCH_WRITE_ONLY_SPECIFIED_DEVS,
-       __BCH_WRITE_WROTE_DATA_INLINE,
-       __BCH_WRITE_FROM_INTERNAL,
-       __BCH_WRITE_CHECK_ENOSPC,
-       __BCH_WRITE_SYNC,
-       __BCH_WRITE_MOVE,
-       __BCH_WRITE_IN_WORKER,
-       __BCH_WRITE_DONE,
-       __BCH_WRITE_IO_ERROR,
-       __BCH_WRITE_CONVERT_UNWRITTEN,
+#define BCH_WRITE_FLAGS()              \
+       x(ALLOC_NOWAIT)                 \
+       x(CACHED)                       \
+       x(DATA_ENCODED)                 \
+       x(PAGES_STABLE)                 \
+       x(PAGES_OWNED)                  \
+       x(ONLY_SPECIFIED_DEVS)          \
+       x(WROTE_DATA_INLINE)            \
+       x(FROM_INTERNAL)                \
+       x(CHECK_ENOSPC)                 \
+       x(SYNC)                         \
+       x(MOVE)                         \
+       x(IN_WORKER)                    \
+       x(DONE)                         \
+       x(IO_ERROR)                     \
+       x(CONVERT_UNWRITTEN)
+
+enum __bch_write_flags {
+#define x(f)   __BCH_WRITE_##f,
+       BCH_WRITE_FLAGS()
+#undef x
 };
 
-#define BCH_WRITE_ALLOC_NOWAIT         (1U << __BCH_WRITE_ALLOC_NOWAIT)
-#define BCH_WRITE_CACHED               (1U << __BCH_WRITE_CACHED)
-#define BCH_WRITE_DATA_ENCODED         (1U << __BCH_WRITE_DATA_ENCODED)
-#define BCH_WRITE_PAGES_STABLE         (1U << __BCH_WRITE_PAGES_STABLE)
-#define BCH_WRITE_PAGES_OWNED          (1U << __BCH_WRITE_PAGES_OWNED)
-#define BCH_WRITE_ONLY_SPECIFIED_DEVS  (1U << __BCH_WRITE_ONLY_SPECIFIED_DEVS)
-#define BCH_WRITE_WROTE_DATA_INLINE    (1U << __BCH_WRITE_WROTE_DATA_INLINE)
-#define BCH_WRITE_FROM_INTERNAL                (1U << __BCH_WRITE_FROM_INTERNAL)
-#define BCH_WRITE_CHECK_ENOSPC         (1U << __BCH_WRITE_CHECK_ENOSPC)
-#define BCH_WRITE_SYNC                 (1U << __BCH_WRITE_SYNC)
-#define BCH_WRITE_MOVE                 (1U << __BCH_WRITE_MOVE)
-
-/* Internal: */
-#define BCH_WRITE_IN_WORKER            (1U << __BCH_WRITE_IN_WORKER)
-#define BCH_WRITE_DONE                 (1U << __BCH_WRITE_DONE)
-#define BCH_WRITE_IO_ERROR             (1U << __BCH_WRITE_IO_ERROR)
-#define BCH_WRITE_CONVERT_UNWRITTEN    (1U << __BCH_WRITE_CONVERT_UNWRITTEN)
+enum bch_write_flags {
+#define x(f)   BCH_WRITE_##f = 1U << __BCH_WRITE_##f,
+       BCH_WRITE_FLAGS()
+#undef x
+};
 
 static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 {
@@ -124,6 +117,8 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio)
        return wbio;
 }
 
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
 struct bch_devs_mask;
 struct cache_promote_op;
 struct extent_ptr_decoded;
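
Note on the io.h rewrite above: this is the classic x-macro technique. One flag list, BCH_WRITE_FLAGS(), is expanded three ways (bit index, bit mask, and, in io.c, the name table), so a new flag can never go missing from one of the parallel definitions. A compilable miniature with invented names:

#include <stdio.h>

#define MY_FLAGS()	\
	x(CACHED)	\
	x(SYNC)		\
	x(MOVE)

enum __my_flags {
#define x(f)	__MY_##f,
	MY_FLAGS()
#undef x
};

enum my_flags {
#define x(f)	MY_##f = 1U << __MY_##f,
	MY_FLAGS()
#undef x
};

static const char * const my_flag_names[] = {
#define x(f)	#f,
	MY_FLAGS()
#undef x
	NULL
};

int main(void)
{
	printf("MY_SYNC = %d, name[1] = %s\n", MY_SYNC, my_flag_names[1]);
	return 0;
}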
index 4e5d31060b5760a5bc4b8dbc511bcd54d6070030..3b2ed0fa583a05b50fb92ad9adff7f37a10d7ce0 100644 (file)
@@ -119,7 +119,7 @@ struct bch_write_op {
        unsigned                nr_replicas_required:4;
        unsigned                alloc_reserve:3;
        unsigned                incompressible:1;
-       unsigned                btree_update_ready:1;
+       unsigned                stripe_waited:1;
 
        struct bch_devs_list    devs_have;
        u16                     target;
index e0c4f51a1fde4862165dab8b53348e32fd2de26c..5699a9d8d9f957aa747765fc38e28f6dc37a0352 100644 (file)
@@ -68,8 +68,9 @@ journal_seq_to_buf(struct journal *j, u64 seq)
 
 static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
 {
-       INIT_LIST_HEAD(&p->list);
-       INIT_LIST_HEAD(&p->key_cache_list);
+       unsigned i;
+       for (i = 0; i < ARRAY_SIZE(p->list); i++)
+               INIT_LIST_HEAD(&p->list[i]);
        INIT_LIST_HEAD(&p->flushed);
        atomic_set(&p->count, count);
        p->devs.nr = 0;
@@ -758,19 +759,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        u64 *new_bucket_seq = NULL, *new_buckets = NULL;
        struct open_bucket **ob = NULL;
        long *bu = NULL;
-       unsigned i, nr_got = 0, nr_want = nr - ja->nr;
-       unsigned old_nr                 = ja->nr;
-       unsigned old_discard_idx        = ja->discard_idx;
-       unsigned old_dirty_idx_ondisk   = ja->dirty_idx_ondisk;
-       unsigned old_dirty_idx          = ja->dirty_idx;
-       unsigned old_cur_idx            = ja->cur_idx;
+       unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
        int ret = 0;
 
-       if (c) {
-               bch2_journal_flush_all_pins(&c->journal);
-               bch2_journal_block(&c->journal);
-               mutex_lock(&c->sb_lock);
-       }
+       BUG_ON(nr <= ja->nr);
 
        bu              = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
        ob              = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
@@ -778,7 +770,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        new_bucket_seq  = kcalloc(nr, sizeof(u64), GFP_KERNEL);
        if (!bu || !ob || !new_buckets || !new_bucket_seq) {
                ret = -ENOMEM;
-               goto err_unblock;
+               goto err_free;
        }
 
        for (nr_got = 0; nr_got < nr_want; nr_got++) {
@@ -794,87 +786,92 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                        if (ret)
                                break;
 
+                       ret = bch2_trans_run(c,
+                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                                               ob[nr_got]->bucket, BCH_DATA_journal,
+                                               ca->mi.bucket_size));
+                       if (ret) {
+                               bch2_open_bucket_put(c, ob[nr_got]);
+                               bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret));
+                               break;
+                       }
+
                        bu[nr_got] = ob[nr_got]->bucket;
                }
        }
 
        if (!nr_got)
-               goto err_unblock;
+               goto err_free;
 
-       /*
-        * We may be called from the device add path, before the new device has
-        * actually been added to the running filesystem:
-        */
-       if (!new_fs)
-               spin_lock(&c->journal.lock);
+       /* Don't return an error if we successfully allocated some buckets: */
+       ret = 0;
+
+       if (c) {
+               bch2_journal_flush_all_pins(&c->journal);
+               bch2_journal_block(&c->journal);
+               mutex_lock(&c->sb_lock);
+       }
 
        memcpy(new_buckets,     ja->buckets,    ja->nr * sizeof(u64));
        memcpy(new_bucket_seq,  ja->bucket_seq, ja->nr * sizeof(u64));
-       swap(new_buckets,       ja->buckets);
-       swap(new_bucket_seq,    ja->bucket_seq);
+
+       BUG_ON(ja->discard_idx > ja->nr);
+
+       pos = ja->discard_idx ?: ja->nr;
+
+       memmove(new_buckets + pos + nr_got,
+               new_buckets + pos,
+               sizeof(new_buckets[0]) * (ja->nr - pos));
+       memmove(new_bucket_seq + pos + nr_got,
+               new_bucket_seq + pos,
+               sizeof(new_bucket_seq[0]) * (ja->nr - pos));
 
        for (i = 0; i < nr_got; i++) {
-               unsigned pos = ja->discard_idx ?: ja->nr;
-               long b = bu[i];
-
-               __array_insert_item(ja->buckets,                ja->nr, pos);
-               __array_insert_item(ja->bucket_seq,             ja->nr, pos);
-               ja->nr++;
-
-               ja->buckets[pos] = b;
-               ja->bucket_seq[pos] = 0;
-
-               if (pos <= ja->discard_idx)
-                       ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
-               if (pos <= ja->dirty_idx_ondisk)
-                       ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
-               if (pos <= ja->dirty_idx)
-                       ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
-               if (pos <= ja->cur_idx)
-                       ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+               new_buckets[pos + i] = bu[i];
+               new_bucket_seq[pos + i] = 0;
        }
 
-       ret = bch2_journal_buckets_to_sb(c, ca);
-       if (ret) {
-               /* Revert: */
-               swap(new_buckets,       ja->buckets);
-               swap(new_bucket_seq,    ja->bucket_seq);
-               ja->nr                  = old_nr;
-               ja->discard_idx         = old_discard_idx;
-               ja->dirty_idx_ondisk    = old_dirty_idx_ondisk;
-               ja->dirty_idx           = old_dirty_idx;
-               ja->cur_idx             = old_cur_idx;
-       }
+       nr = ja->nr + nr_got;
 
-       if (!new_fs)
-               spin_unlock(&c->journal.lock);
+       ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
+       if (ret)
+               goto err_unblock;
 
-       if (ja->nr != old_nr && !new_fs)
+       if (!new_fs)
                bch2_write_super(c);
 
+       /* Commit: */
        if (c)
-               bch2_journal_unblock(&c->journal);
+               spin_lock(&c->journal.lock);
 
-       if (ret)
-               goto err;
+       swap(new_buckets,       ja->buckets);
+       swap(new_bucket_seq,    ja->bucket_seq);
+       ja->nr = nr;
+
+       if (pos <= ja->discard_idx)
+               ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
+       if (pos <= ja->dirty_idx_ondisk)
+               ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
+       if (pos <= ja->dirty_idx)
+               ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
+       if (pos <= ja->cur_idx)
+               ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
 
-       if (!new_fs) {
-               for (i = 0; i < nr_got; i++) {
-                       ret = bch2_trans_run(c,
-                               bch2_trans_mark_metadata_bucket(&trans, ca,
-                                               bu[i], BCH_DATA_journal,
-                                               ca->mi.bucket_size));
-                       if (ret) {
-                               bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret);
-                               goto err;
-                       }
-               }
-       }
-err:
        if (c)
+               spin_unlock(&c->journal.lock);
+err_unblock:
+       if (c) {
+               bch2_journal_unblock(&c->journal);
                mutex_unlock(&c->sb_lock);
+       }
 
-       if (ob && !new_fs)
+       if (ret && !new_fs)
+               for (i = 0; i < nr_got; i++)
+                       bch2_trans_run(c,
+                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                                               bu[i], BCH_DATA_free, 0));
+err_free:
+       if (!new_fs)
                for (i = 0; i < nr_got; i++)
                        bch2_open_bucket_put(c, ob[i]);
 
@@ -882,12 +879,7 @@ err:
        kfree(new_buckets);
        kfree(ob);
        kfree(bu);
-
        return ret;
-err_unblock:
-       if (c)
-               bch2_journal_unblock(&c->journal);
-       goto err;
 }
 
 /*
@@ -901,13 +893,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
        struct closure cl;
        int ret = 0;
 
+       closure_init_stack(&cl);
+
+       down_write(&c->state_lock);
+
        /* don't handle reducing nr of buckets yet: */
        if (nr < ja->nr)
-               return 0;
-
-       closure_init_stack(&cl);
+               goto unlock;
 
-       while (ja->nr != nr) {
+       while (ja->nr < nr) {
                struct disk_reservation disk_res = { 0, 0 };
 
                /*
@@ -938,7 +932,8 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
 
        if (ret)
                bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
-
+unlock:
+       up_write(&c->state_lock);
        return ret;
 }
 
@@ -977,7 +972,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
             seq++) {
                struct journal_buf *buf = journal_seq_to_buf(j, seq);
 
-               if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx))
+               if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
                        ret = true;
        }
        spin_unlock(&j->lock);
@@ -1353,6 +1348,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *pin;
+       unsigned i;
 
        spin_lock(&j->lock);
        *seq = max(*seq, j->pin.front);
@@ -1370,15 +1366,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
        prt_newline(out);
        printbuf_indent_add(out, 2);
 
-       list_for_each_entry(pin, &pin_list->list, list) {
-               prt_printf(out, "\t%px %ps", pin, pin->flush);
-               prt_newline(out);
-       }
-
-       list_for_each_entry(pin, &pin_list->key_cache_list, list) {
-               prt_printf(out, "\t%px %ps", pin, pin->flush);
-               prt_newline(out);
-       }
+       for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+               list_for_each_entry(pin, &pin_list->list[i], list) {
+                       prt_printf(out, "\t%px %ps", pin, pin->flush);
+                       prt_newline(out);
+               }
 
        if (!list_empty(&pin_list->flushed)) {
                prt_printf(out, "flushed:");
index 8d3878bde1d1039f610b26fd84d8b39221f130ef..cfd92d8b44382d224296df1404c77eab1a1cd031 100644 (file)
@@ -1339,8 +1339,7 @@ static void __journal_write_alloc(struct journal *j,
                if (!ca->mi.durability ||
                    ca->mi.state != BCH_MEMBER_STATE_rw ||
                    !ja->nr ||
-                   bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
-                                        ca->dev_idx) ||
+                   bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
                    sectors > ja->sectors_free)
                        continue;
 
index 8744581dfda7de7bad38aa65ad66b9aca27562a1..8c88884c74a5d3593a5d931a8440391cdf123b45 100644 (file)
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "btree_key_cache.h"
+#include "btree_update.h"
 #include "errcode.h"
 #include "error.h"
 #include "journal.h"
@@ -318,9 +319,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
-               BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
-               BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
-               BUG_ON(!fifo_pop(&j->pin, temp));
+               fifo_pop(&j->pin, temp);
                popped = true;
        }
 
@@ -379,6 +378,17 @@ void bch2_journal_pin_drop(struct journal *j,
        spin_unlock(&j->lock);
 }
 
+enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+{
+       if (fn == bch2_btree_node_flush0 ||
+           fn == bch2_btree_node_flush1)
+               return JOURNAL_PIN_btree;
+       else if (fn == bch2_btree_key_cache_journal_flush)
+               return JOURNAL_PIN_key_cache;
+       else
+               return JOURNAL_PIN_other;
+}
+
 void bch2_journal_pin_set(struct journal *j, u64 seq,
                          struct journal_entry_pin *pin,
                          journal_pin_flush_fn flush_fn)
@@ -407,10 +417,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
        pin->seq        = seq;
        pin->flush      = flush_fn;
 
-       if (flush_fn == bch2_btree_key_cache_journal_flush)
-               list_add(&pin->list, &pin_list->key_cache_list);
-       else if (flush_fn)
-               list_add(&pin->list, &pin_list->list);
+       if (flush_fn)
+               list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
        else
                list_add(&pin->list, &pin_list->flushed);
 
@@ -446,37 +454,37 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
 
 static struct journal_entry_pin *
 journal_get_next_pin(struct journal *j,
-                    bool get_any,
-                    bool get_key_cache,
-                    u64 max_seq, u64 *seq)
+                    u64 seq_to_flush,
+                    unsigned allowed_below_seq,
+                    unsigned allowed_above_seq,
+                    u64 *seq)
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;
+       unsigned i;
 
        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
-               if (*seq > max_seq && !get_any && !get_key_cache)
+               if (*seq > seq_to_flush && !allowed_above_seq)
                        break;
 
-               if (*seq <= max_seq || get_any) {
-                       ret = list_first_entry_or_null(&pin_list->list,
-                               struct journal_entry_pin, list);
-                       if (ret)
-                               return ret;
-               }
-
-               if (*seq <= max_seq || get_any || get_key_cache) {
-                       ret = list_first_entry_or_null(&pin_list->key_cache_list,
-                               struct journal_entry_pin, list);
-                       if (ret)
-                               return ret;
-               }
+               for (i = 0; i < JOURNAL_PIN_NR; i++)
+                       if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+                           ((1U << i) & allowed_above_seq)) {
+                               ret = list_first_entry_or_null(&pin_list->list[i],
+                                       struct journal_entry_pin, list);
+                               if (ret)
+                                       return ret;
+                       }
        }
 
        return NULL;
 }
 
 /* returns true if we did work */
-static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
+static size_t journal_flush_pins(struct journal *j,
+                                u64 seq_to_flush,
+                                unsigned allowed_below_seq,
+                                unsigned allowed_above_seq,
                                 unsigned min_any,
                                 unsigned min_key_cache)
 {
@@ -489,15 +497,25 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
        lockdep_assert_held(&j->reclaim_lock);
 
        while (1) {
+               unsigned allowed_above = allowed_above_seq;
+               unsigned allowed_below = allowed_below_seq;
+
+               if (min_any) {
+                       allowed_above |= ~0;
+                       allowed_below |= ~0;
+               }
+
+               if (min_key_cache) {
+                       allowed_above |= 1U << JOURNAL_PIN_key_cache;
+                       allowed_below |= 1U << JOURNAL_PIN_key_cache;
+               }
+
                cond_resched();
 
                j->last_flushed = jiffies;
 
                spin_lock(&j->lock);
-               pin = journal_get_next_pin(j,
-                                          min_any != 0,
-                                          min_key_cache != 0,
-                                          seq_to_flush, &seq);
+               pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
                if (pin) {
                        BUG_ON(j->flush_in_progress);
                        j->flush_in_progress = pin;
@@ -656,6 +674,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
                                atomic_long_read(&c->btree_key_cache.nr_keys));
 
                nr_flushed = journal_flush_pins(j, seq_to_flush,
+                                               ~0, 0,
                                                min_nr, min_key_cache);
 
                if (direct)
@@ -776,7 +795,11 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
        mutex_lock(&j->reclaim_lock);
 
-       if (journal_flush_pins(j, seq_to_flush, 0, 0))
+       if (journal_flush_pins(j, seq_to_flush,
+                              (1U << JOURNAL_PIN_key_cache)|
+                              (1U << JOURNAL_PIN_other), 0, 0, 0) ||
+           journal_flush_pins(j, seq_to_flush,
+                              (1U << JOURNAL_PIN_btree), 0, 0, 0))
                *did_work = true;
 
        spin_lock(&j->lock);
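
Note on journal_get_next_pin(): it now filters by pin class with two bitmasks over enum journal_pin_type. Classes set in allowed_below_seq may flush only at or below seq_to_flush; classes in allowed_above_seq may flush regardless. That is how journal_flush_done() above flushes key-cache and "other" pins before btree pins. The selection predicate, reduced to a sketch:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

enum pin_type { PIN_btree, PIN_key_cache, PIN_other, PIN_NR };

static bool may_flush(enum pin_type type, uint64_t seq,
		      uint64_t seq_to_flush,
		      unsigned allowed_below, unsigned allowed_above)
{
	return (((1U << type) & allowed_below) && seq <= seq_to_flush) ||
		((1U << type) & allowed_above);
}

int main(void)
{
	/* journal_flush_done(): key cache + other first, btree in a second pass: */
	unsigned first = (1U << PIN_key_cache) | (1U << PIN_other);

	printf("%d\n", may_flush(PIN_btree, 5, 10, first, 0));		/* 0 */
	printf("%d\n", may_flush(PIN_key_cache, 5, 10, first, 0));	/* 1 */
	return 0;
}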
index 9b933330a4c374e0aa9b00fd08a0ee178c582a41..5be7882342e0f1d2b26e0ab80c9c08d4d77e9a39 100644 (file)
@@ -175,46 +175,45 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
        .to_text        = bch2_sb_journal_v2_to_text,
 };
 
-int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca)
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
+                              u64 *buckets, unsigned nr)
 {
-       struct journal_device *ja = &ca->journal;
        struct bch_sb_field_journal_v2 *j;
-       unsigned i, dst = 0, nr = 1;
+       unsigned i, dst = 0, nr_compacted = 1;
 
        if (c)
                lockdep_assert_held(&c->sb_lock);
 
-       if (!ja->nr) {
+       if (!nr) {
                bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
                bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
                return 0;
        }
 
-       for (i = 0; i + 1 < ja->nr; i++)
-               if (ja->buckets[i] + 1 != ja->buckets[i + 1])
-                       nr++;
+       for (i = 0; i + 1 < nr; i++)
+               if (buckets[i] + 1 != buckets[i + 1])
+                       nr_compacted++;
 
        j = bch2_sb_resize_journal_v2(&ca->disk_sb,
-                                (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64));
+                        (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
        if (!j)
                return -BCH_ERR_ENOSPC_sb_journal;
 
        bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
 
-       j->d[dst].start = le64_to_cpu(ja->buckets[0]);
+       j->d[dst].start = le64_to_cpu(buckets[0]);
        j->d[dst].nr    = le64_to_cpu(1);
 
-       for (i = 1; i < ja->nr; i++) {
-               if (ja->buckets[i] == ja->buckets[i - 1] + 1) {
+       for (i = 1; i < nr; i++) {
+               if (buckets[i] == buckets[i - 1] + 1) {
                        le64_add_cpu(&j->d[dst].nr, 1);
                } else {
                        dst++;
-                       j->d[dst].start = le64_to_cpu(ja->buckets[i]);
+                       j->d[dst].start = le64_to_cpu(buckets[i]);
                        j->d[dst].nr    = le64_to_cpu(1);
                }
        }
 
-       BUG_ON(dst + 1 != nr);
-
+       BUG_ON(dst + 1 != nr_compacted);
        return 0;
 }
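
Note on bch2_journal_buckets_to_sb(): the (typically contiguous) bucket list is stored as (start, nr) runs. A first pass counts the runs so the superblock field can be sized; a second pass emits them. The same two-pass run-length encoding as a standalone program, with printf standing in for the bch_sb_field_journal_v2 entries:

#include <stdio.h>
#include <stdint.h>

static void emit_runs(const uint64_t *buckets, unsigned nr)
{
	unsigned i, runs = 1;

	for (i = 0; i + 1 < nr; i++)
		if (buckets[i] + 1 != buckets[i + 1])
			runs++;
	printf("%u buckets -> %u runs:\n", nr, runs);

	uint64_t start = buckets[0], len = 1;
	for (i = 1; i < nr; i++) {
		if (buckets[i] == buckets[i - 1] + 1) {
			len++;
		} else {
			printf("  start %llu nr %llu\n",
			       (unsigned long long) start,
			       (unsigned long long) len);
			start = buckets[i];
			len = 1;
		}
	}
	printf("  start %llu nr %llu\n",
	       (unsigned long long) start, (unsigned long long) len);
}

int main(void)
{
	uint64_t b[] = { 7, 8, 9, 20, 31, 32 };

	emit_runs(b, 6);	/* (7,3) (20,1) (31,2) */
	return 0;
}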
index a39192e9f6f4c5a0ff251366ce1e2268fae38172..ba40a7e8d90a32d391a3d99dd2b4a8ca1418fb23 100644 (file)
@@ -21,4 +21,4 @@ static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_j
 extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
 extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
 
-int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *);
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
index 0e6bde669b3e6577b542a4243a11a061693ffc8c..8d8c0b3d5a30e7e86dbef1f26db55f3b9a3fd5f1 100644 (file)
@@ -43,9 +43,15 @@ struct journal_buf {
  * flushed:
  */
 
+enum journal_pin_type {
+       JOURNAL_PIN_btree,
+       JOURNAL_PIN_key_cache,
+       JOURNAL_PIN_other,
+       JOURNAL_PIN_NR,
+};
+
 struct journal_entry_pin_list {
-       struct list_head                list;
-       struct list_head                key_cache_list;
+       struct list_head                list[JOURNAL_PIN_NR];
        struct list_head                flushed;
        atomic_t                        count;
        struct bch_devs_list            devs;
index e3e39127b40a0aa4708735bcc3f2d2d91bf367f3..d93db07f0c8781d9ad35afeeeebb10938e94428d 100644 (file)
@@ -46,7 +46,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
        struct bkey_i *n;
        int ret;
 
-       if (!bch2_bkey_has_device(k, dev_idx))
+       if (!bch2_bkey_has_device_c(k, dev_idx))
                return 0;
 
        n = bch2_bkey_make_mut(trans, k);
@@ -130,8 +130,7 @@ retry:
                while (bch2_trans_begin(&trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
-                       if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
-                                                 dev_idx))
+                       if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
                                goto next;
 
                        bch2_bkey_buf_copy(&k, c, &b->key);
index 5e952d6c0944a193a5fa70ce5df6240c206d01a2..bb5061bc24d738c2156abe62358d36633e12b640 100644 (file)
@@ -41,7 +41,8 @@ static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
 }
 
 struct moving_io {
-       struct list_head                list;
+       struct list_head                read_list;
+       struct list_head                io_list;
        struct move_bucket_in_flight    *b;
        struct closure                  cl;
        bool                            read_completed;
@@ -65,8 +66,12 @@ static void move_free(struct moving_io *io)
                atomic_dec(&io->b->count);
 
        bch2_data_update_exit(&io->write);
+
+       mutex_lock(&ctxt->lock);
+       list_del(&io->io_list);
        wake_up(&ctxt->wait);
-       bch2_write_ref_put(c, BCH_WRITE_REF_move);
+       mutex_unlock(&ctxt->lock);
+
        kfree(io);
 }
 
@@ -101,7 +106,7 @@ static void move_write(struct moving_io *io)
 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
 {
        struct moving_io *io =
-               list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
+               list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
 
        return io && io->read_completed ? io : NULL;
 }
@@ -128,7 +133,7 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
                bch2_trans_unlock(trans);
 
        while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
-               list_del(&io->list);
+               list_del(&io->read_list);
                move_write(io);
        }
 }
@@ -145,6 +150,8 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
 
 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
 {
+       struct bch_fs *c = ctxt->c;
+
        move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
        closure_sync(&ctxt->cl);
 
@@ -154,12 +161,15 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
        EBUG_ON(atomic_read(&ctxt->read_ios));
 
        if (ctxt->stats) {
-               progress_list_del(ctxt->c, ctxt->stats);
-
-               trace_move_data(ctxt->c,
+               progress_list_del(c, ctxt->stats);
+               trace_move_data(c,
                                atomic64_read(&ctxt->stats->sectors_moved),
                                atomic64_read(&ctxt->stats->keys_moved));
        }
+
+       mutex_lock(&c->moving_context_lock);
+       list_del(&ctxt->list);
+       mutex_unlock(&c->moving_context_lock);
 }
 
 void bch2_moving_ctxt_init(struct moving_context *ctxt,
@@ -172,15 +182,23 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
        memset(ctxt, 0, sizeof(*ctxt));
 
        ctxt->c         = c;
+       ctxt->fn        = (void *) _RET_IP_;
        ctxt->rate      = rate;
        ctxt->stats     = stats;
        ctxt->wp        = wp;
        ctxt->wait_on_copygc = wait_on_copygc;
 
        closure_init_stack(&ctxt->cl);
+
+       mutex_init(&ctxt->lock);
        INIT_LIST_HEAD(&ctxt->reads);
+       INIT_LIST_HEAD(&ctxt->ios);
        init_waitqueue_head(&ctxt->wait);
 
+       mutex_lock(&c->moving_context_lock);
+       list_add(&ctxt->list, &c->moving_context_list);
+       mutex_unlock(&c->moving_context_lock);
+
        if (stats) {
                progress_list_add(c, stats);
                stats->data_type = BCH_DATA_user;
@@ -262,9 +280,6 @@ static int bch2_move_extent(struct btree_trans *trans,
                return 0;
        }
 
-       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move))
-               return -BCH_ERR_erofs_no_writes;
-
        /*
         * Before memory allocations & taking nocow locks in
         * bch2_data_update_init():
@@ -334,9 +349,14 @@ static int bch2_move_extent(struct btree_trans *trans,
        this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
        trace_move_extent_read(k.k);
 
+
+       mutex_lock(&ctxt->lock);
        atomic_add(io->read_sectors, &ctxt->read_sectors);
        atomic_inc(&ctxt->read_ios);
-       list_add_tail(&io->list, &ctxt->reads);
+
+       list_add_tail(&io->read_list, &ctxt->reads);
+       list_add_tail(&io->io_list, &ctxt->ios);
+       mutex_unlock(&ctxt->lock);
 
        /*
         * dropped by move_read_endio() - guards against use after free of
@@ -354,7 +374,6 @@ err_free_pages:
 err_free:
        kfree(io);
 err:
-       bch2_write_ref_put(c, BCH_WRITE_REF_move);
        trace_and_count(c, move_extent_alloc_mem_fail, k.k);
        return ret;
 }
@@ -759,8 +778,13 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
                        data_opts.rewrite_ptrs = 0;
 
                        bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
-                               if (ptr->dev == bucket.inode)
+                               if (ptr->dev == bucket.inode) {
                                        data_opts.rewrite_ptrs |= 1U << i;
+                                       if (ptr->cached) {
+                                               bch2_trans_iter_exit(trans, &iter);
+                                               goto next;
+                                       }
+                               }
                                i++;
                        }
 
@@ -819,14 +843,6 @@ next:
        }
 
        trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
-
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
-               bch2_trans_unlock(trans);
-               move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
-               closure_sync(&ctxt->cl);
-               if (!ctxt->write_error)
-                       bch2_verify_bucket_evacuated(trans, bucket, gen);
-       }
 err:
        bch2_bkey_buf_exit(&sk, c);
        return ret;
@@ -1111,3 +1127,67 @@ int bch2_data_job(struct bch_fs *c,
 
        return ret;
 }
+
+void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       struct bch_move_stats *stats;
+
+       mutex_lock(&c->data_progress_lock);
+       list_for_each_entry(stats, &c->data_progress_list, list) {
+               prt_printf(out, "%s: data type %s btree_id %s position: ",
+                      stats->name,
+                      bch2_data_types[stats->data_type],
+                      bch2_btree_ids[stats->btree_id]);
+               bch2_bpos_to_text(out, stats->pos);
+               prt_printf(out, "%s", "\n");
+       }
+       mutex_unlock(&c->data_progress_lock);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt)
+{
+       struct moving_io *io;
+
+       prt_printf(out, "%ps:", ctxt->fn);
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       prt_printf(out, "reads: %u sectors %u",
+                  atomic_read(&ctxt->read_ios),
+                  atomic_read(&ctxt->read_sectors));
+       prt_newline(out);
+
+       prt_printf(out, "writes: %u sectors %u",
+                  atomic_read(&ctxt->write_ios),
+                  atomic_read(&ctxt->write_sectors));
+       prt_newline(out);
+
+       printbuf_indent_add(out, 2);
+
+       mutex_lock(&ctxt->lock);
+       list_for_each_entry(io, &ctxt->ios, io_list) {
+               bch2_write_op_to_text(out, &io->write.op);
+       }
+       mutex_unlock(&ctxt->lock);
+
+       printbuf_indent_sub(out, 4);
+}
+
+void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       struct moving_context *ctxt;
+
+       mutex_lock(&c->moving_context_lock);
+       list_for_each_entry(ctxt, &c->moving_context_list, list)
+               bch2_moving_ctxt_to_text(out, ctxt);
+       mutex_unlock(&c->moving_context_lock);
+}
+
+void bch2_fs_move_init(struct bch_fs *c)
+{
+       INIT_LIST_HEAD(&c->moving_context_list);
+       mutex_init(&c->moving_context_lock);
+
+       INIT_LIST_HEAD(&c->data_progress_list);
+       mutex_init(&c->data_progress_lock);
+}
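
Note on the move.c changes: struct moving_io now lives on two lists at once. read_list preserves read-completion order for the write path, while io_list, protected by ctxt->lock, lets bch2_moving_ctxt_to_text() enumerate every op in flight. One object on two intrusive lists, sketched with a toy singly-linked list rather than the kernel's list_head:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct link { struct link *next; };

struct io {
	int id;
	struct link read_link;	/* completion ordering */
	struct link dbg_link;	/* "all in-flight ops", for introspection */
};

static void push(struct link **head, struct link *l)
{
	l->next = *head;
	*head = l;
}

int main(void)
{
	struct io a = { .id = 1 }, b = { .id = 2 };
	struct link *reads = NULL, *dbg = NULL;

	push(&reads, &a.read_link); push(&dbg, &a.dbg_link);
	push(&reads, &b.read_link); push(&dbg, &b.dbg_link);

	for (struct link *l = dbg; l; l = l->next)
		printf("in flight: io %d\n",
		       container_of(l, struct io, dbg_link)->id);
	for (struct link *l = reads; l; l = l->next)
		printf("pending read: io %d\n",
		       container_of(l, struct io, read_link)->id);
	return 0;
}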
index 4c0013872347ff4d767998e223110e1d88d3c85a..50a6f7d7a292dcbe1b0f5e4b2c6959df701f9b60 100644 (file)
@@ -11,6 +11,9 @@ struct bch_read_bio;
 
 struct moving_context {
        struct bch_fs           *c;
+       struct list_head        list;
+       void                    *fn;
+
        struct bch_ratelimit    *rate;
        struct bch_move_stats   *stats;
        struct write_point_specifier wp;
@@ -19,7 +22,10 @@ struct moving_context {
 
        /* For waiting on outstanding reads and writes: */
        struct closure          cl;
+
+       struct mutex            lock;
        struct list_head        reads;
+       struct list_head        ios;
 
        /* in flight sectors: */
        atomic_t                read_sectors;
@@ -84,6 +90,9 @@ int bch2_data_job(struct bch_fs *,
                  struct bch_ioctl_data);
 
 void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
+void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *);
+void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
 
+void bch2_fs_move_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_MOVE_H */
index 79aaa45f5348663b0bd7177ef7676fd0667c4bc9..178f96a64804bb07b8e8293874bbb7960ae63624 100644 (file)
@@ -46,7 +46,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
        if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset))
                return 0;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, 0);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED);
        k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        bch2_trans_iter_exit(trans, &iter);
@@ -85,7 +85,7 @@ static int move_bucket_cmp(const void *_l, const void *_r)
        const struct move_bucket *l = _l;
        const struct move_bucket *r = _r;
 
-       return bpos_cmp(l->bucket, r->bucket) ?: cmp_int(l->gen, r->gen);
+       return bkey_cmp(l->bucket, r->bucket);
 }
 
 static bool bucket_in_flight(move_buckets *buckets_sorted, struct move_bucket b)
@@ -178,13 +178,13 @@ static int bch2_copygc(struct btree_trans *trans,
                       move_buckets_in_flight *buckets_in_flight)
 {
        struct bch_fs *c = trans->c;
-       struct bch_move_stats move_stats;
        struct data_update_opts data_opts = {
                .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
        };
        move_buckets buckets = { 0 };
        struct move_bucket_in_flight *f;
        struct move_bucket *i;
+       u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
        int ret = 0;
 
        ret = bch2_btree_write_buffer_flush(trans);
@@ -192,9 +192,6 @@ static int bch2_copygc(struct btree_trans *trans,
                                 __func__, bch2_err_str(ret)))
                return ret;
 
-       bch2_move_stats_init(&move_stats, "copygc");
-       ctxt->stats = &move_stats;
-
        ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
        if (ret)
                goto err;
@@ -222,8 +219,8 @@ err:
        if (ret < 0 && !bch2_err_matches(ret, EROFS))
                bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
 
-       trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
-       ctxt->stats = NULL;
+       moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
+       trace_and_count(c, copygc, c, moved, 0, 0, 0);
        return ret;
 }
 
@@ -282,6 +279,7 @@ static int bch2_copygc_thread(void *arg)
        struct bch_fs *c = arg;
        struct btree_trans trans;
        struct moving_context ctxt;
+       struct bch_move_stats move_stats;
        struct io_clock *clock = &c->io_clock[WRITE];
        move_buckets_in_flight move_buckets;
        u64 last, wait;
@@ -294,7 +292,9 @@ static int bch2_copygc_thread(void *arg)
 
        set_freezable();
        bch2_trans_init(&trans, c, 0, 0);
-       bch2_moving_ctxt_init(&ctxt, c, NULL, NULL,
+
+       bch2_move_stats_init(&move_stats, "copygc");
+       bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
                              writepoint_ptr(&c->copygc_write_point),
                              false);
 
@@ -334,8 +334,8 @@ static int bch2_copygc_thread(void *arg)
                wake_up(&c->copygc_running_wq);
        }
 
-       bch2_moving_ctxt_exit(&ctxt);
        bch2_trans_exit(&trans);
+       bch2_moving_ctxt_exit(&ctxt);
        free_fifo(&move_buckets);
 
        return 0;
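
Note on the copygc changes: with one long-lived bch_move_stats per thread, each pass now measures its own work as a before/after delta of the shared sectors_moved counter instead of zeroing per-pass stats. The delta idiom in miniature:

#include <stdatomic.h>
#include <stdio.h>
#include <stdint.h>

static atomic_uint_least64_t sectors_moved;

static void do_work(unsigned n) { atomic_fetch_add(&sectors_moved, n); }

int main(void)
{
	/* Per-pass accounting against a long-lived counter: */
	uint64_t moved = atomic_load(&sectors_moved);

	do_work(128);
	do_work(64);

	moved = atomic_load(&sectors_moved) - moved;
	printf("this pass moved %llu sectors\n", (unsigned long long) moved);
	return 0;
}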
index afbf82d629779c4bd1013a12072e7c9761552da8..719693b333da231577d0fe09c38809954cbdfca2 100644 (file)
@@ -92,6 +92,12 @@ enum opt_type {
 #define RATELIMIT_ERRORS_DEFAULT false
 #endif
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCACHEFS_VERBOSE_DEFAULT       true
+#else
+#define BCACHEFS_VERBOSE_DEFAULT       false
+#endif
+
 #define BCH_OPTS()                                                     \
        x(block_size,                   u16,                            \
          OPT_FS|OPT_FORMAT|                                            \
@@ -276,7 +282,7 @@ enum opt_type {
        x(verbose,                      u8,                             \
          OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
          OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
+         BCH2_NO_SB_OPT,               BCACHEFS_VERBOSE_DEFAULT,       \
          NULL,         "Extra debugging information during mount/recovery")\
        x(journal_flush_delay,          u32,                            \
          OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
index d2e6adc13fb17c519162cd55ddcd3e7c64e046c8..d8426e754cdf0cd9cef8ed9c71a26c367294987b 100644 (file)
@@ -189,7 +189,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 
        for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink,
                           POS(0, c->reflink_hint),
-                          BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
+                          BTREE_ITER_SLOTS, k, ret) {
                if (reflink_iter.pos.inode) {
                        bch2_btree_iter_set_pos(&reflink_iter, POS_MIN);
                        continue;
index d76239654a8912ff5fc8fdaa0d28d9935cb41a91..bcc67c0f5dfc95992c05827ade2b759fd20e913a 100644 (file)
@@ -513,7 +513,9 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
                n->v.pad        = 0;
                SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
 
-               ret   = bch2_trans_update(trans, &iter, &n->k_i, 0);
+               ret   = bch2_trans_update(trans, &iter, &n->k_i, 0) ?:
+                       bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+                                          bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
                if (ret)
                        goto err;
 
@@ -540,7 +542,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
                n->v.children[1] = cpu_to_le32(new_snapids[1]);
                n->v.subvol = 0;
                SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
-               ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+               ret   = bch2_trans_update(trans, &iter, &n->k_i, 0);
                if (ret)
                        goto err;
        }
index 359ca164e0889764d03cd930449debc89327b59a..3a7f4e295cbd2840ca27fddcf9f8ccb983f15778 100644 (file)
@@ -206,11 +206,15 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        unsigned i, clean_passes = 0;
        u64 seq = 0;
 
+       bch2_fs_ec_stop(c);
+       bch2_open_buckets_stop(c, NULL, true);
        bch2_rebalance_stop(c);
        bch2_copygc_stop(c);
        bch2_gc_thread_stop(c);
+       bch2_fs_ec_flush(c);
 
-       bch_verbose(c, "flushing journal and stopping allocators");
+       bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
+                   journal_cur_seq(&c->journal));
 
        do {
                clean_passes++;
@@ -224,7 +228,8 @@ static void __bch2_fs_read_only(struct bch_fs *c)
                }
        } while (clean_passes < 2);
 
-       bch_verbose(c, "flushing journal and stopping allocators complete");
+       bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
+                   journal_cur_seq(&c->journal));
 
        if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
            !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
@@ -679,6 +684,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        bch2_fs_rebalance_init(c);
        bch2_fs_quota_init(c);
        bch2_fs_ec_init_early(c);
+       bch2_fs_move_init(c);
 
        INIT_LIST_HEAD(&c->list);
 
@@ -697,17 +703,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        INIT_LIST_HEAD(&c->fsck_errors);
        mutex_init(&c->fsck_error_lock);
 
-       INIT_LIST_HEAD(&c->ec_stripe_head_list);
-       mutex_init(&c->ec_stripe_head_lock);
-
-       INIT_LIST_HEAD(&c->ec_stripe_new_list);
-       mutex_init(&c->ec_stripe_new_lock);
-
-       INIT_LIST_HEAD(&c->data_progress_list);
-       mutex_init(&c->data_progress_lock);
-
-       mutex_init(&c->ec_stripes_heap_lock);
-
        seqcount_init(&c->gc_pos_lock);
 
        seqcount_init(&c->usage_lock);
index ed17b27fba4ff593e46f0f277854c7a56a391dc5..1344ae4cb28f6657c7caf0f8c925fc956524c56c 100644 (file)
@@ -248,6 +248,7 @@ read_attribute(io_timers_read);
 read_attribute(io_timers_write);
 
 read_attribute(data_jobs);
+read_attribute(moving_ctxts);
 
 #ifdef CONFIG_BCACHEFS_TESTS
 write_attribute(perf_test);
@@ -277,25 +278,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
        return ret;
 }
 
-static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       long ret = 0;
-       struct bch_move_stats *stats;
-
-       mutex_lock(&c->data_progress_lock);
-       list_for_each_entry(stats, &c->data_progress_list, list) {
-               prt_printf(out, "%s: data type %s btree_id %s position: ",
-                      stats->name,
-                      bch2_data_types[stats->data_type],
-                      bch2_btree_ids[stats->btree_id]);
-               bch2_bpos_to_text(out, stats->pos);
-               prt_printf(out, "%s", "\n");
-       }
-
-       mutex_unlock(&c->data_progress_lock);
-       return ret;
-}
-
 static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 {
        struct btree_trans trans;
@@ -476,7 +458,10 @@ SHOW(bch2_fs)
                bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
 
        if (attr == &sysfs_data_jobs)
-               data_progress_to_text(out, c);
+               bch2_data_jobs_to_text(out, c);
+
+       if (attr == &sysfs_moving_ctxts)
+               bch2_fs_moving_ctxts_to_text(out, c);
 
 #ifdef BCH_WRITE_REF_DEBUG
        if (attr == &sysfs_write_refs)
@@ -693,6 +678,7 @@ struct attribute *bch2_fs_internal_files[] = {
        sysfs_pd_controller_files(rebalance),
 
        &sysfs_data_jobs,
+       &sysfs_moving_ctxts,
 
        &sysfs_internal_uuid,
        NULL
index 5a6eadc0e8405459ce10bf9b72cbd0db42b07eaa..3d366a843eb55b243c0f0c07b9245fcc76c40ae6 100644 (file)
@@ -143,8 +143,17 @@ static int __do_six_trylock_type(struct six_lock *lock,
                 * lock, issue a wakeup because we might have caused a
                 * spurious trylock failure:
                 */
+#if 0
+               /*
+                * This code should be sufficient, but we're seeing unexplained
+                * lost wakeups:
+                */
                if (old.write_locking)
                        ret = -1 - SIX_LOCK_write;
+#else
+               if (!ret)
+                       ret = -1 - SIX_LOCK_write;
+#endif
        } else if (type == SIX_LOCK_write && lock->readers) {
                if (try) {
                        atomic64_add(__SIX_VAL(write_locking, 1),
@@ -320,11 +329,10 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
                 * Similar to the lock path, we may have caused a spurious write
                 * lock fail and need to issue a wakeup:
                 */
-               if (old.write_locking)
-                       six_lock_wakeup(lock, old, SIX_LOCK_write);
-
                if (ret)
                        six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
+               else
+                       six_lock_wakeup(lock, old, SIX_LOCK_write);
 
                return ret;
        }
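
Note on the six.c hunks: both trade precision for safety in the wakeup path. Rather than proving a writer was mid-spin (old.write_locking), a failed read trylock or relock now signals unconditionally, since a spurious wakeup only costs the waiter a re-check of its condition while a lost wakeup hangs it. The wait-loop discipline that makes spurious wakeups harmless, shown in portable pthreads rather than the six-lock internals:

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  c = PTHREAD_COND_INITIALIZER;
static bool can_proceed;

static void *waiter(void *arg)
{
	pthread_mutex_lock(&m);
	while (!can_proceed)		/* re-check: spurious wakeups are fine */
		pthread_cond_wait(&c, &m);
	pthread_mutex_unlock(&m);
	puts("waiter woke");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);

	pthread_mutex_lock(&m);
	can_proceed = true;
	/*
	 * Signal unconditionally, as six.c's #else branch does: a spurious
	 * wakeup costs a re-check, a lost wakeup costs a hang.
	 */
	pthread_cond_signal(&c);
	pthread_mutex_unlock(&m);

	pthread_join(t, NULL);
	return 0;
}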