X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Falloc_foreground.c;h=633d3223b353f83e83501601024dd262952236c6;hb=f154e6ed41e6080d7afe76d5d1209c5cae0acc12;hp=fcb7311b184463313b965b01ef1785c1bdc09c43;hpb=5d507f795b0b679a67e972a48cbd0854c4ad0f02;p=bcachefs-tools-debian diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index fcb7311..633d322 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -25,7 +25,7 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "movinggc.h" #include "nocow_locking.h" @@ -69,11 +69,8 @@ const char * const bch2_watermarks[] = { void bch2_reset_alloc_cursors(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) ca->alloc_cursor = 0; rcu_read_unlock(); } @@ -239,9 +236,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - if (!c->blocked_allocate_open_bucket) - c->blocked_allocate_open_bucket = local_clock(); - + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -267,19 +263,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - if (c->blocked_allocate_open_bucket) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate_open_bucket], - c->blocked_allocate_open_bucket); - c->blocked_allocate_open_bucket = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, false); - if (c->blocked_allocate) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate], - c->blocked_allocate); - c->blocked_allocate = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, false); spin_unlock(&c->freelist_lock); return ob; @@ -377,9 +365,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); err: - if (iter.trans && iter.path) + if (iter.path) set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -399,12 +387,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { - struct btree_iter iter; - struct bkey_s_c k; + struct btree_iter iter, citer; + struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); - u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); + u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; int ret; + + /* + * Scan with an uncached iterator to avoid polluting the key cache. An + * uncached iter will return a cached key if one exists, but if not + * there is no other underlying protection for the associated key cache + * slot. To avoid racing bucket allocations, look up the cached key slot + * of any likely allocation candidate before attempting to proceed with + * the allocation. 
This provides proper exclusion on the associated + * bucket. + */ again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), BTREE_ITER_SLOTS, k, ret) { @@ -419,25 +418,38 @@ again: continue; a = bch2_alloc_to_v4(k, &a_convert); - if (a->data_type != BCH_DATA_free) continue; + /* now check the cached key to serialize concurrent allocs of the bucket */ + ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); + ret = bkey_err(ck); + if (ret) + break; + + a = bch2_alloc_to_v4(ck, &a_convert); + if (a->data_type != BCH_DATA_free) + goto next; + s->buckets_seen++; ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); +next: + set_btree_iter_dontneed(&citer); + bch2_trans_iter_exit(trans, &citer); if (ob) break; } bch2_trans_iter_exit(trans, &iter); + alloc_cursor = iter.pos.offset; ca->alloc_cursor = alloc_cursor; if (!ob && ret) ob = ERR_PTR(ret); - if (!ob && alloc_cursor > alloc_start) { - alloc_cursor = alloc_start; + if (!ob && alloc_start > first_bucket) { + alloc_cursor = alloc_start = first_bucket; goto again; } @@ -478,7 +490,7 @@ again: ob = try_alloc_bucket(trans, ca, watermark, alloc_cursor, s, k, cl); if (ob) { - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); break; } } @@ -502,9 +514,14 @@ again: } /** - * bch_bucket_alloc - allocate a single bucket from a specific device + * bch2_bucket_alloc_trans - allocate a single bucket from a specific device + * @trans: transaction object + * @ca: device to allocate from + * @watermark: how important is this allocation? + * @cl: if not NULL, closure to be used to wait if buckets not available + * @usage: for secondarily also returning the current device usage * - * Returns index of bucket on success, 0 on failure + * Returns: an open_bucket on success, or an ERR_PTR() on failure. */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, @@ -538,8 +555,8 @@ again: goto again; } - if (!c->blocked_allocate) - c->blocked_allocate = local_clock(); + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; @@ -597,7 +614,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, struct open_bucket *ob; bch2_trans_do(c, NULL, NULL, 0, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage))); return ob; } @@ -668,11 +685,9 @@ static int add_new_bucket(struct bch_fs *c, bch_dev_bkey_exists(c, ob->dev)->mi.durability; BUG_ON(*nr_effective >= nr_replicas); - BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - ? 
durability : 1; + *nr_effective += durability; *have_cache |= !durability; ob_push(c, ptrs, ob); @@ -775,7 +790,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; struct open_bucket *ob; - struct bch_dev *ca; unsigned i, ec_idx; int ret = 0; @@ -805,8 +819,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, } goto out_put_head; got_bucket: - ca = bch_dev_bkey_exists(c, ob->dev); - ob->ec_idx = ec_idx; ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); @@ -946,8 +958,8 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, devs = target_rw_devs(c, wp->data_type, target); /* Don't allocate from devices we already have pointers to: */ - for (i = 0; i < devs_have->nr; i++) - __clear_bit(devs_have->devs[i], devs.d); + darray_for_each(*devs_have, i) + __clear_bit(*i, devs.d); open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); @@ -989,7 +1001,6 @@ retry_blocking: cl = _cl; goto retry_blocking; } - } return ret; @@ -1031,6 +1042,19 @@ static int open_bucket_add_buckets(struct btree_trans *trans, return ret < 0 ? ret : 0; } +/** + * should_drop_bucket - check if this is open_bucket should go away + * @ob: open_bucket to predicate on + * @c: filesystem handle + * @ca: if set, we're killing buckets for a particular device + * @ec: if true, we're shutting down erasure coding and killing all ec + * open_buckets + * otherwise, return true + * Returns: true if we should kill this open_bucket + * + * We're killing open_buckets because we're shutting down a device, erasure + * coding, or the entire filesystem - check if this open_bucket matches: + */ static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, struct bch_dev *ca, bool ec) { @@ -1042,8 +1066,12 @@ static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, unsigned i; if (!drop && ob->ec) { + unsigned nr_blocks; + mutex_lock(&ob->ec->lock); - for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) { + nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; + + for (i = 0; i < nr_blocks; i++) { if (!ob->ec->blocks[i]) continue; @@ -1255,6 +1283,30 @@ out: return wp; } +static noinline void +deallocate_extra_replicas(struct bch_fs *c, + struct open_buckets *ptrs, + struct open_buckets *ptrs_no_use, + unsigned extra_replicas) +{ + struct open_buckets ptrs2 = { 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, ptrs, ob, i) { + unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + + if (d && d <= extra_replicas) { + extra_replicas -= d; + ob_push(c, ptrs_no_use, ob); + } else { + ob_push(c, &ptrs2, ob); + } + } + + *ptrs = ptrs2; +} + /* * Get us an open_bucket we can allocate from, return with it locked: */ @@ -1279,6 +1331,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, int ret; int i; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) + erasure_code = false; + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); BUG_ON(!nr_replicas || !nr_replicas_required); @@ -1305,8 +1360,17 @@ retry: goto alloc_done; /* Don't retry from all devices if we're out of open buckets: */ - if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - goto allocate_blocking; + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { + int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, cl); + if (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart) || + 
bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + goto alloc_done; + } /* * Only try to allocate cache (durability = 0 devices) from the @@ -1320,7 +1384,6 @@ retry: &have_cache, watermark, flags, cl); } else { -allocate_blocking: ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, @@ -1340,6 +1403,9 @@ alloc_done: if (ret) goto err; + if (nr_effective > nr_replicas) + deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); + /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) open_bucket_free_unused(c, ob); @@ -1459,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ - prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + prt_printf(out, "%zu ref %u ", ob - c->open_buckets, - atomic_read(&ob->pin), - data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", + atomic_read(&ob->pin)); + bch2_prt_data_type(out, data_type); + prt_printf(out, " %u:%llu gen %u allocated %u/%u", ob->dev, ob->bucket, ob->gen, ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); if (ob->ec) @@ -1512,25 +1579,47 @@ static const char * const bch2_write_point_states[] = { NULL }; +static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, + struct write_point *wp) +{ + struct open_bucket *ob; + unsigned i; + + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated); + + prt_printf(out, " last wrote: "); + bch2_pr_time_units(out, sched_clock() - wp->last_used); + + for (i = 0; i < WRITE_POINT_STATE_NR; i++) { + prt_printf(out, " %s: ", bch2_write_point_states[i]); + bch2_pr_time_units(out, wp->time[i]); + } + + prt_newline(out); + + printbuf_indent_add(out, 2); + open_bucket_for_each(c, &wp->ptrs, ob, i) + bch2_open_bucket_to_text(out, c, ob); + printbuf_indent_sub(out, 2); +} + void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) { struct write_point *wp; - unsigned i; + prt_str(out, "Foreground write points\n"); for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) { - prt_printf(out, "%lu: ", wp->write_point); - prt_human_readable_u64(out, wp->sectors_allocated); + wp++) + bch2_write_point_to_text(out, c, wp); - prt_printf(out, " last wrote: "); - bch2_pr_time_units(out, sched_clock() - wp->last_used); + prt_str(out, "Copygc write point\n"); + bch2_write_point_to_text(out, c, &c->copygc_write_point); - for (i = 0; i < WRITE_POINT_STATE_NR; i++) { - prt_printf(out, " %s: ", bch2_write_point_states[i]); - bch2_pr_time_units(out, wp->time[i]); - } + prt_str(out, "Rebalance write point\n"); + bch2_write_point_to_text(out, c, &c->rebalance_write_point); - prt_newline(out); - } + prt_str(out, "Btree write point\n"); + bch2_write_point_to_text(out, c, &c->btree_write_point); }
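
Note on the blocked-time accounting change above: the removed open-coded logic recorded local_clock() the first time an allocation blocked (on an empty freelist or empty open-bucket pool) and fed the elapsed time to bch2_time_stats_update() once the allocation succeeded; the diff folds both halves into calls to track_event_change(stats, &start, active). The helper's body does not appear in this diff, so the sketch below is a reconstruction of that behaviour from the removed call sites, not the bcachefs implementation. It is a standalone userspace example: clock_ns(), record_duration() and struct time_stats are stand-ins invented for the sketch (in the kernel the roles are played by local_clock(), bch2_time_stats_update() and struct bch2_time_stats).

/* Sketch of the "track when a boolean condition flips, time how long it
 * stayed true" pattern introduced by the track_event_change() calls above.
 * Userspace stand-ins are used so the example compiles on its own. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t clock_ns(void)			/* stand-in for local_clock() */
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t) ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

struct time_stats {				/* stand-in for bch2_time_stats */
	uint64_t	total_ns;
	uint64_t	nr;
};

static void record_duration(struct time_stats *stats, uint64_t start_ns)
{						/* stand-in for bch2_time_stats_update() */
	stats->total_ns += clock_ns() - start_ns;
	stats->nr++;
}

/*
 * *start doubles as the "event currently active" flag: zero means inactive,
 * non-zero is the timestamp of when the event began.  Reporting the same
 * state twice is a no-op, so callers may say "still blocked" on every retry,
 * as the allocator does in __try_alloc_bucket() and bch2_bucket_alloc_trans().
 */
static void track_event_change(struct time_stats *stats, uint64_t *start, bool active)
{
	if (active && !*start) {
		*start = clock_ns();
		if (!*start)			/* never store 0: it means "inactive" */
			*start = 1;
	} else if (!active && *start) {
		record_duration(stats, *start);
		*start = 0;
	}
}

int main(void)
{
	struct time_stats blocked_allocate = {0};
	uint64_t blocked_since = 0;

	track_event_change(&blocked_allocate, &blocked_since, true);	/* freelist empty */
	track_event_change(&blocked_allocate, &blocked_since, true);	/* still empty: no-op */
	track_event_change(&blocked_allocate, &blocked_since, false);	/* allocation succeeded */

	printf("blocked %llu times, %llu ns total\n",
	       (unsigned long long) blocked_allocate.nr,
	       (unsigned long long) blocked_allocate.total_ns);
	return 0;
}

The practical effect in the diff is that the "start of blocking" and "end of blocking" sites no longer need to agree on the bookkeeping details; each call site only states whether the allocator is currently blocked, and the elapsed-time accounting lives in one place.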