#include "disk_groups.h"
#include "ec.h"
#include "error.h"
-#include "io.h"
+#include "io_write.h"
#include "journal.h"
#include "movinggc.h"
#include "nocow_locking.h"
struct bucket_alloc_state *s,
struct closure *cl)
{
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct btree_iter iter, citer;
+ struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
- u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
- u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+ u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+ u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 alloc_cursor = alloc_start;
int ret;
+
+ /*
+ * Scan with an uncached iterator to avoid polluting the key cache. An
+ * uncached iter will return a cached key if one exists, but if not
+ * there is no other underlying protection for the associated key cache
+ * slot. To avoid racing bucket allocations, look up the cached key slot
+ * of any likely allocation candidate before attempting to proceed with
+ * the allocation. This provides proper exclusion on the associated
+ * bucket.
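+ *
+ * For example, two racing threads may both see the same bucket as free
+ * via their uncached iterators; the cached key lookup in the loop below
+ * serializes them, and the loser rechecks the key and skips the bucket.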
+ */
again:
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
BTREE_ITER_SLOTS, k, ret) {
continue;
a = bch2_alloc_to_v4(k, &a_convert);
-
if (a->data_type != BCH_DATA_free)
continue;
+ /* now check the cached key to serialize concurrent allocs of the bucket */
+ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+ ret = bkey_err(ck);
+ if (ret)
+ break;
+
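+ /*
+ * Recheck from the cached key: the bucket may have been allocated since
+ * the uncached scan saw it.
+ */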
+ a = bch2_alloc_to_v4(ck, &a_convert);
+ if (a->data_type != BCH_DATA_free)
+ goto next;
+
s->buckets_seen++;
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+next:
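+ /*
+ * Done with this candidate: drop the cached iterator, and clear
+ * ->preserve so the btree code needn't keep the path around for reuse;
+ * a long scan shouldn't accumulate cached-key paths.
+ */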
+ citer.path->preserve = false;
+ bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
}
bch2_trans_iter_exit(trans, &iter);
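+
+ /* remember where the scan stopped, so the next allocation resumes there */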
+ alloc_cursor = iter.pos.offset;
ca->alloc_cursor = alloc_cursor;
if (!ob && ret)
ob = ERR_PTR(ret);
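+
+ /*
+ * If we scanned from a saved cursor and found nothing, wrap around and
+ * retry once from the first allocatable bucket.
+ */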
- if (!ob && alloc_cursor > alloc_start) {
- alloc_cursor = alloc_start;
+ if (!ob && alloc_start > first_bucket) {
+ alloc_cursor = alloc_start = first_bucket;
goto again;
}
}
/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans: transaction object
+ * @ca: device to allocate from
+ * @watermark: how important is this allocation?
+ * @cl: if not NULL, closure to be used to wait if buckets not available
+ * @usage: for also returning the current device usage
*
- * Returns index of bucket on success, 0 on failure
+ * Returns: an open_bucket on success, or an ERR_PTR() on failure.
*/
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bch_dev *ca,
struct open_bucket *ob;
bch2_trans_do(c, NULL, NULL, 0,
- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
cl, &usage)));
return ob;
}
struct dev_alloc_list devs_sorted;
struct ec_stripe_head *h;
struct open_bucket *ob;
- struct bch_dev *ca;
unsigned i, ec_idx;
int ret = 0;
}
goto out_put_head;
got_bucket:
- ca = bch_dev_bkey_exists(c, ob->dev);
-
ob->ec_idx = ec_idx;
ob->ec = h->s;
ec_stripe_new_get(h->s, STRIPE_REF_io);
cl = _cl;
goto retry_blocking;
}
-
}
return ret;
return ret < 0 ? ret : 0;
}
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob: open_bucket to predicate on
+ * @c: filesystem handle
+ * @ca: if set, we're killing buckets for a particular device
+ * @ec: if true, we're shutting down erasure coding and killing all ec
+ * open_buckets
+ * if neither @ca nor @ec is set, every open_bucket matches
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem - check if this open_bucket matches.
+ */
static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
struct bch_dev *ca, bool ec)
{
NULL
};
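+/*
+ * Print one write point: id, sectors allocated, time since it last
+ * wrote, per-state timings, and the open_buckets it currently holds.
+ */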
+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
+ struct write_point *wp)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ prt_printf(out, "%lu: ", wp->write_point);
+ prt_human_readable_u64(out, wp->sectors_allocated);
+
+ prt_printf(out, " last wrote: ");
+ bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+ for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+ prt_printf(out, " %s: ", bch2_write_point_states[i]);
+ bch2_pr_time_units(out, wp->time[i]);
+ }
+
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ bch2_open_bucket_to_text(out, c, ob);
+ printbuf_indent_sub(out, 2);
+}
+
void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
{
struct write_point *wp;
- unsigned i;
+ prt_str(out, "Foreground write points\n");
for (wp = c->write_points;
wp < c->write_points + ARRAY_SIZE(c->write_points);
- wp++) {
- prt_printf(out, "%lu: ", wp->write_point);
- prt_human_readable_u64(out, wp->sectors_allocated);
+ wp++)
+ bch2_write_point_to_text(out, c, wp);
- prt_printf(out, " last wrote: ");
- bch2_pr_time_units(out, sched_clock() - wp->last_used);
+ prt_str(out, "Copygc write point\n");
+ bch2_write_point_to_text(out, c, &c->copygc_write_point);
- for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
- prt_printf(out, " %s: ", bch2_write_point_states[i]);
- bch2_pr_time_units(out, wp->time[i]);
- }
+ prt_str(out, "Rebalance write point\n");
+ bch2_write_point_to_text(out, c, &c->rebalance_write_point);
- prt_newline(out);
- }
+ prt_str(out, "Btree write point\n");
+ bch2_write_point_to_text(out, c, &c->btree_write_point);
}