#include "buckets.h"
#include "clock.h"
#include "debug.h"
+#include "ec.h"
#include "error.h"
#include "journal_io.h"
const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
if (k.k->p.inode >= c->sb.nr_devices ||
!c->devs[k.k->p.inode])
return "invalid device";
- switch (k.k->type) {
- case BCH_ALLOC: {
- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
-
- if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
- return "incorrect value size";
- break;
- }
- default:
- return "invalid type";
- }
+ /* allow for unknown fields */
+ if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
+ return "incorrect value size";
return NULL;
}
-int bch2_alloc_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- buf[0] = '\0';
+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
- switch (k.k->type) {
- case BCH_ALLOC:
- break;
- }
-
- return 0;
+ pr_buf(out, "gen %u", a.v->gen);
}
static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
struct bucket *g;
const u8 *d;
- if (k.k->type != BCH_ALLOC)
+ if (k.k->type != KEY_TYPE_alloc)
return;
a = bkey_s_c_to_alloc(k);
__BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
struct bucket *g;
struct bkey_i_alloc *a;
+ int ret;
u8 *d;
percpu_down_read_preempt_disable(&c->usage_lock);
bch2_btree_iter_set_pos(iter, a->k.p);
- return bch2_btree_insert_at(c, NULL, journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
- flags,
- BTREE_INSERT_ENTRY(iter, &a->k_i));
+ ret = bch2_btree_insert_at(c, NULL, journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE|
+ flags,
+ BTREE_INSERT_ENTRY(iter, &a->k_i));
+
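+ /*
+  * Remember that this bucket's alloc key hit the btree: journal replay
+  * checks buckets_written so it can skip keys that were already written
+  * out (see bch2_alloc_replay_key()):
+  */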
+ if (!ret && ca->buckets_written)
+ set_bit(b, ca->buckets_written);
+
+ return ret;
}
-int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
+int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
{
struct bch_dev *ca;
struct btree_iter iter;
int ret;
- if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
+ if (k->k.p.inode >= c->sb.nr_devices ||
+ !c->devs[k->k.p.inode])
return 0;
- ca = bch_dev_bkey_exists(c, pos.inode);
+ ca = bch_dev_bkey_exists(c, k->k.p.inode);
- if (pos.offset >= ca->mi.nbuckets)
+ if (k->k.p.offset >= ca->mi.nbuckets)
return 0;
- bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p,
+ BTREE_ITER_INTENT);
- ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0);
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto err;
+
+ /* check buckets_written with btree node locked: */
+
+ ret = test_bit(k->k.p.offset, ca->buckets_written)
+ ? 0
+ : bch2_btree_insert_at(c, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_REPLAY,
+ BTREE_INSERT_ENTRY(&iter, k));
+err:
bch2_btree_iter_unlock(&iter);
return ret;
}
}
}
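+/*
+ * The rescale timer fires once per 1/1024th of total capacity, with a floor
+ * so that very small devices don't rescale excessively often:
+ */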
+static inline u64 bucket_clock_freq(u64 capacity)
+{
+ return max(capacity >> 10, 2048ULL);
+}
+
static void bch2_inc_clock_hand(struct io_timer *timer)
{
struct bucket_clock *clock = container_of(timer,
* RW mode (that will be 0 when we're RO, yet we can still service
* reads)
*/
- timer->expire += capacity >> 10;
+ timer->expire += bucket_clock_freq(capacity);
bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}
clock->hand = 1;
clock->rw = rw;
clock->rescale.fn = bch2_inc_clock_hand;
- clock->rescale.expire = c->capacity >> 10;
+ clock->rescale.expire = bucket_clock_freq(c->capacity);
mutex_init(&clock->lock);
}
pr_debug("free_inc now empty");
do {
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
- up_read(&c->gc_lock);
- bch_err(ca, "gc failure");
- goto stop;
- }
-
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
{
struct bch_dev *ca;
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+ unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
unsigned i, j;
for (j = 0; j < RESERVE_NONE; j++)
dev_reserve += ca->free[j].size;
- dev_reserve += ca->free_inc.size;
-
- dev_reserve += ARRAY_SIZE(c->write_points);
-
dev_reserve += 1; /* btree write point */
dev_reserve += 1; /* copygc write point */
dev_reserve += 1; /* rebalance write point */
- dev_reserve += WRITE_POINT_COUNT;
dev_reserve *= ca->mi.bucket_size;
ca->mi.first_bucket);
reserved_sectors += dev_reserve * 2;
+
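+ /* track the largest bucket size of any device: */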
+ bucket_size_max = max_t(unsigned, bucket_size_max,
+ ca->mi.bucket_size);
}
gc_reserve = c->opts.gc_reserve_bytes
c->capacity = capacity - reserved_sectors;
+ c->bucket_size_max = bucket_size_max;
+
if (c->capacity) {
bch2_io_timer_add(&c->io_clock[READ],
&c->bucket_clock[READ].rescale);
}
mutex_unlock(&c->btree_reserve_cache_lock);
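+ /* Release any open buckets still on this device's partial list: */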
+ while (1) {
+ struct open_bucket *ob;
+
+ spin_lock(&c->freelist_lock);
+ if (!ca->open_buckets_partial_nr) {
+ spin_unlock(&c->freelist_lock);
+ break;
+ }
+ ob = c->open_buckets +
+ ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+ ob->on_partial_list = false;
+ spin_unlock(&c->freelist_lock);
+
+ bch2_open_bucket_put(c, ob);
+ }
+
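+ /* stop using this device for erasure coding: */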
+ bch2_ec_stop_dev(c, ca);
+
/*
* Wake up threads that were blocked on allocation, so they can notice
* the device can no longer be removed and the capacity has changed:
bool invalidating_data = false;
int ret = 0;
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
- return -1;
-
if (test_alloc_startup(c)) {
invalidating_data = true;
goto not_enough;
/* Scan for buckets that are already invalidated: */
for_each_rw_member(ca, c, dev_iter) {
- struct btree_iter iter;
+ struct bucket_array *buckets;
struct bucket_mark m;
- struct bkey_s_c k;
- for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) {
- if (k.k->type != BCH_ALLOC)
- continue;
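+ /*
+  * Walk the in-memory bucket array under bucket_lock rather than
+  * iterating over the alloc btree:
+  */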
+ down_read(&ca->bucket_lock);
+ percpu_down_read_preempt_disable(&c->usage_lock);
- bu = k.k->p.offset;
- m = READ_ONCE(bucket(ca, bu)->mark);
+ buckets = bucket_array(ca);
- if (!is_available_bucket(m) || m.cached_sectors)
+ for (bu = buckets->first_bucket;
+ bu < buckets->nbuckets; bu++) {
+ m = READ_ONCE(buckets->b[bu].mark);
+
+ if (!m.gen_valid ||
+ !is_available_bucket(m) ||
+ m.cached_sectors)
continue;
- percpu_down_read_preempt_disable(&c->usage_lock);
bch2_mark_alloc_bucket(c, ca, bu, true,
- gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- percpu_up_read_preempt_enable(&c->usage_lock);
+ gc_pos_alloc(c, NULL), 0);
fifo_push(&ca->free_inc, bu);
- if (fifo_full(&ca->free_inc))
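+ /*
+  * Drain free_inc onto the free lists as we go, stopping once the
+  * btree reserve is full:
+  */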
+ discard_invalidated_buckets(c, ca);
+
+ if (fifo_full(&ca->free[RESERVE_BTREE]))
break;
}
- bch2_btree_iter_unlock(&iter);
+ percpu_up_read_preempt_enable(&c->usage_lock);
+ up_read(&ca->bucket_lock);
}
/* did we find enough buckets? */
for_each_rw_member(ca, c, dev_iter)
- if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
+ if (!fifo_full(&ca->free[RESERVE_BTREE])) {
percpu_ref_put(&ca->io_ref);
goto not_enough;
}
return 0;
not_enough:
- pr_debug("did not find enough empty buckets; issuing discards");
-
- /* clear out free_inc, we'll be using it again below: */
- for_each_rw_member(ca, c, dev_iter)
- discard_invalidated_buckets(c, ca);
-
- pr_debug("scanning for reclaimable buckets");
+ pr_debug("not enough empty buckets; scanning for reclaimable buckets");
for_each_rw_member(ca, c, dev_iter) {
find_reclaimable_buckets(c, ca);
* invalidated on disk:
*/
if (invalidating_data) {
- BUG();
- pr_info("holding writes");
pr_debug("invalidating existing data");
set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
} else {
return bch2_alloc_write(c);
}
-void bch2_fs_allocator_init(struct bch_fs *c)
+void bch2_fs_allocator_background_init(struct bch_fs *c)
{
- struct open_bucket *ob;
- struct write_point *wp;
-
- mutex_init(&c->write_points_hash_lock);
spin_lock_init(&c->freelist_lock);
bch2_bucket_clock_init(c, READ);
bch2_bucket_clock_init(c, WRITE);
- /* open bucket 0 is a sentinal NULL: */
- spin_lock_init(&c->open_buckets[0].lock);
-
- for (ob = c->open_buckets + 1;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
- spin_lock_init(&ob->lock);
- c->open_buckets_nr_free++;
-
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
- }
-
- writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
- writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
-
- for (wp = c->write_points;
- wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
- writepoint_init(wp, BCH_DATA_USER);
-
- wp->last_used = sched_clock();
- wp->write_point = (unsigned long) wp;
- hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
- }
-
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
}