diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index fcb7311..a961df7 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -25,7 +25,7 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "movinggc.h"
 #include "nocow_locking.h"
@@ -399,12 +399,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
 			struct bucket_alloc_state *s,
 			struct closure *cl)
 {
-	struct btree_iter iter;
-	struct bkey_s_c k;
+	struct btree_iter iter, citer;
+	struct bkey_s_c k, ck;
 	struct open_bucket *ob = NULL;
-	u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
-	u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+	u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+	u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+	u64 alloc_cursor = alloc_start;
 	int ret;
+
+	/*
+	 * Scan with an uncached iterator to avoid polluting the key cache. An
+	 * uncached iter will return a cached key if one exists, but if not
+	 * there is no other underlying protection for the associated key cache
+	 * slot. To avoid racing bucket allocations, look up the cached key slot
+	 * of any likely allocation candidate before attempting to proceed with
+	 * the allocation. This provides proper exclusion on the associated
+	 * bucket.
+	 */
again:
 	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
			   BTREE_ITER_SLOTS, k, ret) {
@@ -419,25 +430,38 @@ again:
 			continue;
 
 		a = bch2_alloc_to_v4(k, &a_convert);
-
 		if (a->data_type != BCH_DATA_free)
 			continue;
 
+		/* now check the cached key to serialize concurrent allocs of the bucket */
+		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+		ret = bkey_err(ck);
+		if (ret)
+			break;
+
+		a = bch2_alloc_to_v4(ck, &a_convert);
+		if (a->data_type != BCH_DATA_free)
+			goto next;
+
 		s->buckets_seen++;
 
 		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+next:
+		citer.path->preserve = false;
+		bch2_trans_iter_exit(trans, &citer);
 		if (ob)
 			break;
 	}
 	bch2_trans_iter_exit(trans, &iter);
 
+	alloc_cursor = iter.pos.offset;
 	ca->alloc_cursor = alloc_cursor;
 
 	if (!ob && ret)
 		ob = ERR_PTR(ret);
 
-	if (!ob && alloc_cursor > alloc_start) {
-		alloc_cursor = alloc_start;
+	if (!ob && alloc_start > first_bucket) {
+		alloc_cursor = alloc_start = first_bucket;
 		goto again;
 	}
 
@@ -502,9 +526,14 @@ again:
 }
 
 /**
- * bch_bucket_alloc - allocate a single bucket from a specific device
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans:	transaction object
+ * @ca:		device to allocate from
+ * @watermark:	how important is this allocation?
+ * @cl:		if not NULL, closure to be used to wait if buckets are not available
+ * @usage:	for also returning the current device usage
 *
- * Returns index of bucket on success, 0 on failure
+ * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
 */
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
				      struct bch_dev *ca,
@@ -597,7 +626,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 	struct open_bucket *ob;
 
 	bch2_trans_do(c, NULL, NULL, 0,
-		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark,
+		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
							cl, &usage)));
 	return ob;
 }
@@ -668,11 +697,9 @@ static int add_new_bucket(struct bch_fs *c,
 			bch_dev_bkey_exists(c, ob->dev)->mi.durability;
 
 	BUG_ON(*nr_effective >= nr_replicas);
-	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
 
 	__clear_bit(ob->dev, devs_may_alloc->d);
-	*nr_effective	+= (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
-		? durability : 1;
+	*nr_effective	+= durability;
 	*have_cache	|= !durability;
 
 	ob_push(c, ptrs, ob);
@@ -775,7 +802,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
 	struct dev_alloc_list devs_sorted;
 	struct ec_stripe_head *h;
 	struct open_bucket *ob;
-	struct bch_dev *ca;
 	unsigned i, ec_idx;
 	int ret = 0;
 
@@ -805,8 +831,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
 	}
 	goto out_put_head;
got_bucket:
-	ca	= bch_dev_bkey_exists(c, ob->dev);
-
 	ob->ec_idx	= ec_idx;
 	ob->ec		= h->s;
 	ec_stripe_new_get(h->s, STRIPE_REF_io);
@@ -989,7 +1013,6 @@ retry_blocking:
 		cl = _cl;
 		goto retry_blocking;
 	}
-
 	}
 
 	return ret;
@@ -1031,6 +1054,19 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
 	return ret < 0 ? ret : 0;
 }
 
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob:	open_bucket to predicate on
+ * @c:	filesystem handle
+ * @ca:	if set, we're killing buckets for a particular device
+ * @ec:	if true, we're shutting down erasure coding and killing all ec
+ *	open_buckets; if neither @ca nor @ec is given, kill every
+ *	open_bucket
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem - check if this open_bucket matches:
+ */
 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
			       struct bch_dev *ca, bool ec)
 {
@@ -1042,8 +1078,12 @@ static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
 	unsigned i;
 
 	if (!drop && ob->ec) {
+		unsigned nr_blocks;
+
 		mutex_lock(&ob->ec->lock);
-		for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) {
+		nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
+
+		for (i = 0; i < nr_blocks; i++) {
 			if (!ob->ec->blocks[i])
 				continue;
@@ -1255,6 +1295,30 @@ out:
 	return wp;
 }
 
+static noinline void
+deallocate_extra_replicas(struct bch_fs *c,
+			  struct open_buckets *ptrs,
+			  struct open_buckets *ptrs_no_use,
+			  unsigned extra_replicas)
+{
+	struct open_buckets ptrs2 = { 0 };
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, ptrs, ob, i) {
+		unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+		if (d && d <= extra_replicas) {
+			extra_replicas -= d;
+			ob_push(c, ptrs_no_use, ob);
+		} else {
+			ob_push(c, &ptrs2, ob);
+		}
+	}
+
+	*ptrs = ptrs2;
+}
+
 /*
  * Get us an open_bucket we can allocate from, return with it locked:
  */
@@ -1279,6 +1343,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 	int ret;
 	int i;
 
+	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
+		erasure_code = false;
+
 	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
 
 	BUG_ON(!nr_replicas || !nr_replicas_required);
@@ -1340,6 +1407,9 @@ alloc_done:
 	if (ret)
 		goto err;
 
+	if (nr_effective > nr_replicas)
+		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+
/* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) open_bucket_free_unused(c, ob); @@ -1512,25 +1582,47 @@ static const char * const bch2_write_point_states[] = { NULL }; +static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, + struct write_point *wp) +{ + struct open_bucket *ob; + unsigned i; + + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated); + + prt_printf(out, " last wrote: "); + bch2_pr_time_units(out, sched_clock() - wp->last_used); + + for (i = 0; i < WRITE_POINT_STATE_NR; i++) { + prt_printf(out, " %s: ", bch2_write_point_states[i]); + bch2_pr_time_units(out, wp->time[i]); + } + + prt_newline(out); + + printbuf_indent_add(out, 2); + open_bucket_for_each(c, &wp->ptrs, ob, i) + bch2_open_bucket_to_text(out, c, ob); + printbuf_indent_sub(out, 2); +} + void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) { struct write_point *wp; - unsigned i; + prt_str(out, "Foreground write points\n"); for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) { - prt_printf(out, "%lu: ", wp->write_point); - prt_human_readable_u64(out, wp->sectors_allocated); + wp++) + bch2_write_point_to_text(out, c, wp); - prt_printf(out, " last wrote: "); - bch2_pr_time_units(out, sched_clock() - wp->last_used); + prt_str(out, "Copygc write point\n"); + bch2_write_point_to_text(out, c, &c->copygc_write_point); - for (i = 0; i < WRITE_POINT_STATE_NR; i++) { - prt_printf(out, " %s: ", bch2_write_point_states[i]); - bch2_pr_time_units(out, wp->time[i]); - } + prt_str(out, "Rebalance write point\n"); + bch2_write_point_to_text(out, c, &c->rebalance_write_point); - prt_newline(out); - } + prt_str(out, "Btree write point\n"); + bch2_write_point_to_text(out, c, &c->btree_write_point); }
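
A note on the bch2_bucket_alloc_early() hunks above: the allocation is now a two-phase claim. The uncached iterator scans the alloc btree cheaply for a candidate bucket, but that view can race with other allocators; the bucket is only claimed after looking up the cached key for the same position and rechecking that it is still BCH_DATA_free. The stand-alone C sketch below illustrates the same optimistic-scan-then-recheck pattern; it is an analogy, not bcachefs code: the pthread mutex stands in for the bucket's key cache slot, and struct bucket, snapshot_free and alloc_bucket() are illustrative names.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NBUCKETS 8

struct bucket {
	pthread_mutex_t lock;	/* stands in for the bucket's key cache slot */
	bool		free;	/* authoritative state, read/written under lock */
};

static struct bucket buckets[NBUCKETS];
static bool snapshot_free[NBUCKETS];	/* cheap, possibly stale view (the uncached scan) */

static int alloc_bucket(void)
{
	for (int i = 0; i < NBUCKETS; i++) {
		if (!snapshot_free[i])		/* phase 1: optimistic scan, no locking */
			continue;

		pthread_mutex_lock(&buckets[i].lock);
		if (buckets[i].free) {		/* phase 2: recheck under the lock */
			buckets[i].free = false;
			snapshot_free[i] = false;
			pthread_mutex_unlock(&buckets[i].lock);
			return i;
		}
		/* lost the race: skip this bucket and keep scanning */
		pthread_mutex_unlock(&buckets[i].lock);
	}
	return -1;
}

int main(void)
{
	for (int i = 0; i < NBUCKETS; i++) {
		pthread_mutex_init(&buckets[i].lock, NULL);
		buckets[i].free = snapshot_free[i] = true;
	}

	printf("allocated bucket %d\n", alloc_bucket());
	printf("allocated bucket %d\n", alloc_bucket());
	return 0;
}

As in the patch, a candidate that fails the authoritative recheck is simply skipped (the goto next path) and the scan continues from the next slot.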
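
A note on deallocate_extra_replicas(): when the open_buckets gathered for a write carry more total durability than nr_replicas asks for, the new helper sheds whole buckets, but only while a bucket's durability fits entirely within the surplus, so the kept set never drops below the requested replication. A minimal userspace rendering of that partitioning follows; struct bucket, drop_extra_replicas() and the fixed-size arrays are hypothetical stand-ins for struct open_bucket / struct open_buckets, not the real types.

#include <stdio.h>

struct bucket { int dev; unsigned durability; };

static void drop_extra_replicas(struct bucket *kept, unsigned *nr_kept,
				struct bucket *dropped, unsigned *nr_dropped,
				unsigned extra)
{
	unsigned i, out = 0;

	for (i = 0; i < *nr_kept; i++) {
		unsigned d = kept[i].durability;

		/* drop a bucket only if its whole durability fits in the surplus */
		if (d && d <= extra) {
			extra -= d;
			dropped[(*nr_dropped)++] = kept[i];
		} else {
			kept[out++] = kept[i];
		}
	}
	*nr_kept = out;
}

int main(void)
{
	struct bucket ptrs[] = { { 0, 1 }, { 1, 2 }, { 2, 1 } };
	struct bucket unused[3];
	unsigned nr = 3, nr_unused = 0, i;

	/* total durability 4, three replicas wanted: one extra to shed */
	drop_extra_replicas(ptrs, &nr, unused, &nr_unused, 1);

	for (i = 0; i < nr; i++)
		printf("kept dev %d (durability %u)\n", ptrs[i].dev, ptrs[i].durability);
	for (i = 0; i < nr_unused; i++)
		printf("freed dev %d (durability %u)\n", unused[i].dev, unused[i].durability);
	return 0;
}

With durabilities { 1, 2, 1 } and a surplus of one, only the first durability-1 bucket is shed; the durability-2 bucket is kept because dropping it would undershoot nr_replicas. In the patch the shed buckets land in wp->ptrs, where the existing "Free buckets we didn't use" loop releases them.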