New upstream release

diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 1f4c5b38562d9cabb4b886b794c9750d7e64b1a8..b85c7765272f6e4ae5e8aceb5a4bbaa89c535912 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -25,7 +25,7 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "movinggc.h"
 #include "nocow_locking.h"
@@ -399,12 +399,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
                        struct bucket_alloc_state *s,
                        struct closure *cl)
 {
-       struct btree_iter iter;
-       struct bkey_s_c k;
+       struct btree_iter iter, citer;
+       struct bkey_s_c k, ck;
        struct open_bucket *ob = NULL;
-       u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
-       u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+       u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+       u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+       u64 alloc_cursor = alloc_start;
        int ret;
+
+       /*
+        * Scan with an uncached iterator to avoid polluting the key cache. An
+        * uncached iter will return a cached key if one exists, but if not
+        * there is no other underlying protection for the associated key cache
+        * slot. To avoid racing bucket allocations, look up the cached key slot
+        * of any likely allocation candidate before attempting to proceed with
+        * the allocation. This provides proper exclusion on the associated
+        * bucket.
+        */
 again:
        for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
                           BTREE_ITER_SLOTS, k, ret) {
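The comment added above describes a classic optimistic-check-then-recheck
pattern: consult a shared, unserialized view first, and only take the
serializing object (here, the key cache slot) for promising candidates. A
stand-alone sketch of that shape, with a plain mutex standing in for the
key cache - none of these names are bcachefs API:

#include <pthread.h>
#include <stdbool.h>

struct bucket {
	pthread_mutex_t	lock;	/* stands in for the cached key's lock */
	bool		free;
};

static bool try_claim(struct bucket *b)
{
	if (!b->free)			/* optimistic, unserialized check */
		return false;

	pthread_mutex_lock(&b->lock);	/* take real exclusion... */
	bool claimed = b->free;		/* ...and re-check: we may have raced */
	if (claimed)
		b->free = false;
	pthread_mutex_unlock(&b->lock);
	return claimed;
}
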
@@ -419,25 +430,38 @@ again:
                        continue;
 
                a = bch2_alloc_to_v4(k, &a_convert);
-
                if (a->data_type != BCH_DATA_free)
                        continue;
 
+               /* now check the cached key to serialize concurrent allocs of the bucket */
+               ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+               ret = bkey_err(ck);
+               if (ret)
+                       break;
+
+               a = bch2_alloc_to_v4(ck, &a_convert);
+               if (a->data_type != BCH_DATA_free)
+                       goto next;
+
                s->buckets_seen++;
 
                ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+next:
+               citer.path->preserve = false;
+               bch2_trans_iter_exit(trans, &citer);
                if (ob)
                        break;
        }
        bch2_trans_iter_exit(trans, &iter);
 
+       alloc_cursor = iter.pos.offset;
        ca->alloc_cursor = alloc_cursor;
 
        if (!ob && ret)
                ob = ERR_PTR(ret);
 
-       if (!ob && alloc_cursor > alloc_start) {
-               alloc_cursor = alloc_start;
+       if (!ob && alloc_start > first_bucket) {
+               alloc_cursor = alloc_start = first_bucket;
                goto again;
        }
 
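The rewritten restart condition wraps the scan at most once: only if the
first pass began past the device's first bucket (i.e. a cursor was saved
by an earlier allocation) is a second pass made over the skipped range. A
minimal stand-alone sketch of the same two-pass shape (hypothetical
helper, not from this file):

static long find_free(const bool *is_free, long first, long nbuckets,
		      long cursor)
{
	long start = cursor > first ? cursor : first;

	/* first pass: from the saved cursor to the end of the device */
	for (long b = start; b < nbuckets; b++)
		if (is_free[b])
			return b;

	/* wrap once: cover only the buckets the cursor skipped; this
	 * loop is empty when we already started at the first bucket */
	for (long b = first; b < start; b++)
		if (is_free[b])
			return b;

	return -1;
}
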
@@ -502,9 +526,14 @@ again:
 }
 
 /**
- * bch_bucket_alloc - allocate a single bucket from a specific device
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans:     transaction object
+ * @ca:                device to allocate from
+ * @watermark: how important is this allocation?
+ * @cl:		if not NULL, closure used to wait if no buckets are available
+ * @usage:	for additionally returning the current device usage
  *
- * Returns index of bucket on success, 0 on failure
+ * Returns:    an open_bucket on success, or an ERR_PTR() on failure.
  */
 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
                                      struct bch_dev *ca,
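With the return convention now documented as an open_bucket or an
ERR_PTR(), callers use the usual kernel idiom. A hypothetical caller, for
illustration only:

/* hypothetical caller - names and context are illustrative */
static int alloc_one_bucket(struct btree_trans *trans, struct bch_dev *ca,
			    enum bch_watermark watermark, struct closure *cl,
			    struct bch_dev_usage *usage)
{
	struct open_bucket *ob =
		bch2_bucket_alloc_trans(trans, ca, watermark, cl, usage);

	if (IS_ERR(ob))
		return PTR_ERR(ob);	/* the ERR_PTR() carries the errno */

	/* ... use ob ... */
	return 0;
}
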
@@ -597,7 +626,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
        struct open_bucket *ob;
 
        bch2_trans_do(c, NULL, NULL, 0,
-                     PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark,
+                     PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
                                                        cl, &usage)));
        return ob;
 }
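PTR_ERR_OR_ZERO() is what lets this pointer-returning allocation sit
inside bch2_trans_do(), which wants an int result it can test for
transaction restarts; the pointer itself escapes through the assignment.
For reference, the kernel helper (include/linux/err.h) is essentially:

static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);	/* decode the errno from the pointer */
	return 0;			/* valid pointer: report success */
}
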
@@ -775,7 +804,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
        struct dev_alloc_list devs_sorted;
        struct ec_stripe_head *h;
        struct open_bucket *ob;
-       struct bch_dev *ca;
        unsigned i, ec_idx;
        int ret = 0;
 
@@ -805,8 +833,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
                }
        goto out_put_head;
 got_bucket:
-       ca = bch_dev_bkey_exists(c, ob->dev);
-
        ob->ec_idx      = ec_idx;
        ob->ec          = h->s;
        ec_stripe_new_get(h->s, STRIPE_REF_io);
@@ -989,7 +1015,6 @@ retry_blocking:
                        cl = _cl;
                        goto retry_blocking;
                }
-
        }
 
        return ret;
@@ -1031,6 +1056,19 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
        return ret < 0 ? ret : 0;
 }
 
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob:                open_bucket to predicate on
+ * @c:         filesystem handle
+ * @ca:                if set, we're killing buckets for a particular device
+ * @ec:                if true, we're shutting down erasure coding and killing all ec
+ *		open_buckets; if neither is set, every open_bucket matches
+ *
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem - check if this open_bucket matches:
+ */
 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
                               struct bch_dev *ca, bool ec)
 {
@@ -1516,25 +1554,47 @@ static const char * const bch2_write_point_states[] = {
        NULL
 };
 
+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
+                                    struct write_point *wp)
+{
+       struct open_bucket *ob;
+       unsigned i;
+
+       prt_printf(out, "%lu: ", wp->write_point);
+       prt_human_readable_u64(out, wp->sectors_allocated);
+
+       prt_printf(out, " last wrote: ");
+       bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+       for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+               prt_printf(out, " %s: ", bch2_write_point_states[i]);
+               bch2_pr_time_units(out, wp->time[i]);
+       }
+
+       prt_newline(out);
+
+       printbuf_indent_add(out, 2);
+       open_bucket_for_each(c, &wp->ptrs, ob, i)
+               bch2_open_bucket_to_text(out, c, ob);
+       printbuf_indent_sub(out, 2);
+}
+
 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
 {
        struct write_point *wp;
-       unsigned i;
 
+       prt_str(out, "Foreground write points\n");
        for (wp = c->write_points;
             wp < c->write_points + ARRAY_SIZE(c->write_points);
-            wp++) {
-               prt_printf(out, "%lu: ", wp->write_point);
-               prt_human_readable_u64(out, wp->sectors_allocated);
+            wp++)
+               bch2_write_point_to_text(out, c, wp);
 
-               prt_printf(out, " last wrote: ");
-               bch2_pr_time_units(out, sched_clock() - wp->last_used);
+       prt_str(out, "Copygc write point\n");
+       bch2_write_point_to_text(out, c, &c->copygc_write_point);
 
-               for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
-                       prt_printf(out, " %s: ", bch2_write_point_states[i]);
-                       bch2_pr_time_units(out, wp->time[i]);
-               }
+       prt_str(out, "Rebalance write point\n");
+       bch2_write_point_to_text(out, c, &c->rebalance_write_point);
 
-               prt_newline(out);
-       }
+       prt_str(out, "Btree write point\n");
+       bch2_write_point_to_text(out, c, &c->btree_write_point);
 }
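
The refactor makes the per-write-point output reusable across the
foreground array and the three singleton write points. A typical way to
render it, using the printbuf helpers the prt_*() calls above belong to
(illustrative usage, not part of this patch):

struct printbuf buf = PRINTBUF;

bch2_write_points_to_text(&buf, c);
pr_info("%s", buf.buf);
printbuf_exit(&buf);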