New upstream release

diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 1f4c5b38562d9cabb4b886b794c9750d7e64b1a8..b85c7765272f6e4ae5e8aceb5a4bbaa89c535912 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -25,7 +25,7 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "movinggc.h"
 #include "nocow_locking.h"
@@ -399,12 +399,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
                        struct bucket_alloc_state *s,
                        struct closure *cl)
 {
-       struct btree_iter iter;
-       struct bkey_s_c k;
+       struct btree_iter iter, citer;
+       struct bkey_s_c k, ck;
        struct open_bucket *ob = NULL;
-       u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
-       u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+       u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+       u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+       u64 alloc_cursor = alloc_start;
        int ret;
+
+       /*
+        * Scan with an uncached iterator to avoid polluting the key cache. An
+        * uncached iter will return a cached key if one exists, but if not
+        * there is no other underlying protection for the associated key cache
+        * slot. To avoid racing bucket allocations, look up the cached key slot
+        * of any likely allocation candidate before attempting to proceed with
+        * the allocation. This provides proper exclusion on the associated
+        * bucket.
+        */
 again:
        for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
                           BTREE_ITER_SLOTS, k, ret) {
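The comment added above describes a classic optimistic-check-then-recheck
pattern: consult a shared, unserialized view first, and only take the
serializing object (here, the key cache slot) for promising candidates. A
stand-alone sketch of that shape, with a plain mutex standing in for the
key cache - none of these names are bcachefs API:

#include <pthread.h>
#include <stdbool.h>

struct bucket {
	pthread_mutex_t	lock;	/* stands in for the cached key's lock */
	bool		free;
};

static bool try_claim(struct bucket *b)
{
	if (!b->free)			/* optimistic, unserialized check */
		return false;

	pthread_mutex_lock(&b->lock);	/* take real exclusion... */
	bool claimed = b->free;		/* ...and re-check: we may have raced */
	if (claimed)
		b->free = false;
	pthread_mutex_unlock(&b->lock);
	return claimed;
}
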
@@ -419,25 +430,38 @@ again:
                        continue;
 
                a = bch2_alloc_to_v4(k, &a_convert);
-
                if (a->data_type != BCH_DATA_free)
                        continue;
 
+               /* now check the cached key to serialize concurrent allocs of the bucket */
+               ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+               ret = bkey_err(ck);
+               if (ret)
+                       break;
+
+               a = bch2_alloc_to_v4(ck, &a_convert);
+               if (a->data_type != BCH_DATA_free)
+                       goto next;
+
                s->buckets_seen++;
 
                ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+next:
+               citer.path->preserve = false;
+               bch2_trans_iter_exit(trans, &citer);
                if (ob)
                        break;
        }
        bch2_trans_iter_exit(trans, &iter);
 
+       alloc_cursor = iter.pos.offset;
        ca->alloc_cursor = alloc_cursor;
 
        if (!ob && ret)
                ob = ERR_PTR(ret);
 
-       if (!ob && alloc_cursor > alloc_start) {
-               alloc_cursor = alloc_start;
+       if (!ob && alloc_start > first_bucket) {
+               alloc_cursor = alloc_start = first_bucket;
                goto again;
        }
 
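The rewritten restart condition wraps the scan at most once: only if the
first pass began past the device's first bucket (i.e. a cursor was saved
by an earlier allocation) is a second pass made over the skipped range. A
minimal stand-alone sketch of the same two-pass shape (hypothetical
helper, not from this file):

static long find_free(const bool *is_free, long first, long nbuckets,
		      long cursor)
{
	long start = cursor > first ? cursor : first;

	/* first pass: from the saved cursor to the end of the device */
	for (long b = start; b < nbuckets; b++)
		if (is_free[b])
			return b;

	/* wrap once: cover only the buckets the cursor skipped; this
	 * loop is empty when we already started at the first bucket */
	for (long b = first; b < start; b++)
		if (is_free[b])
			return b;

	return -1;
}
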
@@ -502,9 +526,14 @@ again:
 }
 
 /**
- * bch_bucket_alloc - allocate a single bucket from a specific device
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans:     transaction object
+ * @ca:                device to allocate from
+ * @watermark: how important is this allocation?
+ * @cl:		if not NULL, closure used to wait if no buckets are available
+ * @usage:	for additionally returning the current device usage
  *
- * Returns index of bucket on success, 0 on failure
+ * Returns:    an open_bucket on success, or an ERR_PTR() on failure.
  */
 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
                                      struct bch_dev *ca,
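With the return convention now documented as an open_bucket or an
ERR_PTR(), callers use the usual kernel idiom. A hypothetical caller, for
illustration only:

/* hypothetical caller - names and context are illustrative */
static int alloc_one_bucket(struct btree_trans *trans, struct bch_dev *ca,
			    enum bch_watermark watermark, struct closure *cl,
			    struct bch_dev_usage *usage)
{
	struct open_bucket *ob =
		bch2_bucket_alloc_trans(trans, ca, watermark, cl, usage);

	if (IS_ERR(ob))
		return PTR_ERR(ob);	/* the ERR_PTR() carries the errno */

	/* ... use ob ... */
	return 0;
}
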
@@ -597,7 +626,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
        struct open_bucket *ob;
 
        bch2_trans_do(c, NULL, NULL, 0,
-                     PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark,
+                     PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
                                                        cl, &usage)));
        return ob;
 }
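PTR_ERR_OR_ZERO() is what lets this pointer-returning allocation sit
inside bch2_trans_do(), which wants an int result it can test for
transaction restarts; the pointer itself escapes through the assignment.
For reference, the kernel helper (include/linux/err.h) is essentially:

static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);	/* decode the errno from the pointer */
	return 0;			/* valid pointer: report success */
}
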
@@ -775,7 +804,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
        struct dev_alloc_list devs_sorted;
        struct ec_stripe_head *h;
        struct open_bucket *ob;
-       struct bch_dev *ca;
        unsigned i, ec_idx;
        int ret = 0;
 
@@ -805,8 +833,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
                }
        goto out_put_head;
 got_bucket:
-       ca = bch_dev_bkey_exists(c, ob->dev);
-
        ob->ec_idx      = ec_idx;
        ob->ec          = h->s;
        ec_stripe_new_get(h->s, STRIPE_REF_io);
@@ -989,7 +1015,6 @@ retry_blocking:
                        cl = _cl;
                        goto retry_blocking;
                }
-
        }
 
        return ret;
@@ -1031,6 +1056,19 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
        return ret < 0 ? ret : 0;
 }
 
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob:                open_bucket to predicate on
+ * @c:         filesystem handle
+ * @ca:                if set, we're killing buckets for a particular device
+ * @ec:                if true, we're shutting down erasure coding and killing all ec
+ *		open_buckets; if neither is set, every open_bucket matches
+ *
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem - check if this open_bucket matches:
+ */
 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
                               struct bch_dev *ca, bool ec)
 {
@@ -1516,25 +1554,47 @@ static const char * const bch2_write_point_states[] = {
        NULL
 };
 
+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
+                                    struct write_point *wp)
+{
+       struct open_bucket *ob;
+       unsigned i;
+
+       prt_printf(out, "%lu: ", wp->write_point);
+       prt_human_readable_u64(out, wp->sectors_allocated);
+
+       prt_printf(out, " last wrote: ");
+       bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+       for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+               prt_printf(out, " %s: ", bch2_write_point_states[i]);
+               bch2_pr_time_units(out, wp->time[i]);
+       }
+
+       prt_newline(out);
+
+       printbuf_indent_add(out, 2);
+       open_bucket_for_each(c, &wp->ptrs, ob, i)
+               bch2_open_bucket_to_text(out, c, ob);
+       printbuf_indent_sub(out, 2);
+}
+
 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
 {
        struct write_point *wp;
-       unsigned i;
 
+       prt_str(out, "Foreground write points\n");
        for (wp = c->write_points;
             wp < c->write_points + ARRAY_SIZE(c->write_points);
-            wp++) {
-               prt_printf(out, "%lu: ", wp->write_point);
-               prt_human_readable_u64(out, wp->sectors_allocated);
+            wp++)
+               bch2_write_point_to_text(out, c, wp);
 
-               prt_printf(out, " last wrote: ");
-               bch2_pr_time_units(out, sched_clock() - wp->last_used);
+       prt_str(out, "Copygc write point\n");
+       bch2_write_point_to_text(out, c, &c->copygc_write_point);
 
-               for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
-                       prt_printf(out, " %s: ", bch2_write_point_states[i]);
-                       bch2_pr_time_units(out, wp->time[i]);
-               }
+       prt_str(out, "Rebalance write point\n");
+       bch2_write_point_to_text(out, c, &c->rebalance_write_point);
 
-               prt_newline(out);
-       }
+       prt_str(out, "Btree write point\n");
+       bch2_write_point_to_text(out, c, &c->btree_write_point);
 }
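
The refactor makes the per-write-point output reusable across the
foreground array and the three singleton write points. A typical way to
render it, using the printbuf helpers the prt_*() calls above belong to
(illustrative usage, not part of this patch):

struct printbuf buf = PRINTBUF;

bch2_write_points_to_text(&buf, c);
pr_info("%s", buf.buf);
printbuf_exit(&buf);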