From: Kent Overstreet Date: Thu, 28 Dec 2017 01:32:40 +0000 (-0500) Subject: Update bcachefs sources to f4b290345a bcachefs: device resize X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=d77921a153b83e576c01386d38a1e457bb84a009;p=bcachefs-tools-debian Update bcachefs sources to f4b290345a bcachefs: device resize --- diff --git a/.bcachefs_revision b/.bcachefs_revision index 09d45b2..f908a3f 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -2afdc642c2ab4d629993c7f064765ecf25ee483f +f4b290345a983c534879e603fa5bf4d7465c9e2e diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index 4b6038d..ec02adc 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -154,6 +154,8 @@ static void pd_controllers_update(struct work_struct *work) c->pd_controllers_update_seconds * HZ); } +/* Persistent alloc info: */ + static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) { unsigned bytes = offsetof(struct bch_alloc, data); @@ -262,7 +264,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) if (a.k->p.offset >= ca->mi.nbuckets) return; - g = ca->buckets + a.k->p.offset; + lg_local_lock(&c->usage_lock); + + g = bucket(ca, a.k->p.offset); bucket_cmpxchg(g, new, ({ new.gen = a.v->gen; new.gen_valid = 1; @@ -273,6 +277,8 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) g->prio[READ] = get_alloc_field(&d, 2); if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) g->prio[WRITE] = get_alloc_field(&d, 2); + + lg_local_unlock(&c->usage_lock); } int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) @@ -305,33 +311,46 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) bch2_alloc_read_key(c, bkey_i_to_s_c(k)); } - mutex_lock(&c->bucket_lock); + mutex_lock(&c->prio_clock[READ].lock); for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); bch2_recalc_min_prio(c, ca, READ); + up_read(&ca->bucket_lock); + } + mutex_unlock(&c->prio_clock[READ].lock); + + mutex_lock(&c->prio_clock[WRITE].lock); + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); bch2_recalc_min_prio(c, ca, WRITE); + up_read(&ca->bucket_lock); } - mutex_unlock(&c->bucket_lock); + mutex_unlock(&c->prio_clock[WRITE].lock); return 0; } static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, - struct bucket *g, struct btree_iter *iter, + size_t b, struct btree_iter *iter, u64 *journal_seq) { struct bucket_mark m; __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; + struct bucket *g; struct bkey_i_alloc *a; u8 *d; int ret; - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, g - ca->buckets)); + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); do { ret = bch2_btree_iter_traverse(iter); if (ret) break; + lg_local_lock(&c->usage_lock); + g = bucket(ca, b); + /* read mark under btree node lock: */ m = READ_ONCE(g->mark); a = bkey_alloc_init(&alloc_key.k); @@ -345,8 +364,8 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, put_alloc_field(&d, 2, g->prio[READ]); if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) put_alloc_field(&d, 2, g->prio[WRITE]); + lg_local_unlock(&c->usage_lock); - bch2_btree_iter_set_pos(iter, a->k.p); ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -363,7 +382,6 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) { struct bch_dev *ca; - struct bucket *g; struct btree_iter iter; int ret; @@ 
-375,12 +393,10 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) if (pos.offset >= ca->mi.nbuckets) return 0; - g = ca->buckets + pos.offset; - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_INTENT); - ret = __bch2_alloc_write_key(c, ca, g, &iter, NULL); + ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL); bch2_btree_iter_unlock(&iter); return ret; } @@ -394,86 +410,34 @@ static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_s bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_INTENT); - for_each_set_bit(bucket, ca->bucket_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, ca->buckets + bucket, - &iter, journal_seq); + down_read(&ca->bucket_lock); + for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { + ret = __bch2_alloc_write_key(c, ca, bucket, &iter, journal_seq); if (ret) break; - clear_bit(bucket, ca->bucket_dirty); + clear_bit(bucket, ca->buckets_dirty); } + up_read(&ca->bucket_lock); bch2_btree_iter_unlock(&iter); return ret; } -#define BUCKET_GC_GEN_MAX 96U - -/** - * wait_buckets_available - wait on reclaimable buckets - * - * If there aren't enough available buckets to fill up free_inc, wait until - * there are. - */ -static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned long gc_count = c->gc_count; - int ret = 0; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - ret = -1; - break; - } - - if (gc_count != c->gc_count) - ca->inc_gen_really_needs_gc = 0; - - if ((ssize_t) (dev_buckets_available(c, ca) - - ca->inc_gen_really_needs_gc) >= - (ssize_t) fifo_free(&ca->free_inc)) - break; - - up_read(&c->gc_lock); - schedule(); - try_to_freeze(); - down_read(&c->gc_lock); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - if (expensive_debug_checks(c)) { - size_t iter; - long i; - unsigned j; - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } -} - -/* Bucket heap / gen */ +/* Bucket IO clocks: */ static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw) { struct prio_clock *clock = &c->prio_clock[rw]; + struct bucket_array *buckets = bucket_array(ca); struct bucket *g; u16 max_delta = 1; unsigned i; - lockdep_assert_held(&c->bucket_lock); + lockdep_assert_held(&c->prio_clock[rw].lock); /* Determine min prio for this particular device */ - for_each_bucket(g, ca) + for_each_bucket(g, buckets) max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw])); ca->min_prio[rw] = clock->hand - max_delta; @@ -494,6 +458,7 @@ static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw) static void bch2_rescale_prios(struct bch_fs *c, int rw) { struct prio_clock *clock = &c->prio_clock[rw]; + struct bucket_array *buckets; struct bch_dev *ca; struct bucket *g; unsigned i; @@ -501,23 +466,28 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw) trace_rescale_prios(c); for_each_member_device(ca, c, i) { - for_each_bucket(g, ca) + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) g->prio[rw] = clock->hand - - (clock->hand - g->prio[rw]) / 2; + (clock->hand - g->prio[rw]) / 2; bch2_recalc_min_prio(c, ca, rw); + + up_read(&ca->bucket_lock); } } static void bch2_inc_clock_hand(struct io_timer *timer) { struct 
prio_clock *clock = container_of(timer, - struct prio_clock, rescale); + struct prio_clock, rescale); struct bch_fs *c = container_of(clock, - struct bch_fs, prio_clock[clock->rw]); + struct bch_fs, prio_clock[clock->rw]); u64 capacity; - mutex_lock(&c->bucket_lock); + mutex_lock(&clock->lock); clock->hand++; @@ -525,7 +495,7 @@ static void bch2_inc_clock_hand(struct io_timer *timer) if (clock->hand == (u16) (clock->min_prio - 1)) bch2_rescale_prios(c, clock->rw); - mutex_unlock(&c->bucket_lock); + mutex_unlock(&clock->lock); capacity = READ_ONCE(c->capacity); @@ -548,57 +518,114 @@ static void bch2_inc_clock_hand(struct io_timer *timer) static void bch2_prio_timer_init(struct bch_fs *c, int rw) { struct prio_clock *clock = &c->prio_clock[rw]; - struct io_timer *timer = &clock->rescale; - clock->rw = rw; - timer->fn = bch2_inc_clock_hand; - timer->expire = c->capacity >> 10; + clock->hand = 1; + clock->rw = rw; + clock->rescale.fn = bch2_inc_clock_hand; + clock->rescale.expire = c->capacity >> 10; + mutex_init(&clock->lock); } +/* Background allocator thread: */ + /* - * Background allocation thread: scans for buckets to be invalidated, - * invalidates them, rewrites prios/gens (marking them as invalidated on disk), - * then optionally issues discard commands to the newly free buckets, then puts - * them on the various freelists. + * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens + * (marking them as invalidated on disk), then optionally issues discard + * commands to the newly free buckets, then puts them on the various freelists. */ -static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g) +static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) +{ + if (expensive_debug_checks(c)) { + size_t iter; + long i; + unsigned j; + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each_entry(i, &ca->free[j], iter) + BUG_ON(i == bucket); + fifo_for_each_entry(i, &ca->free_inc, iter) + BUG_ON(i == bucket); + } +} + +#define BUCKET_GC_GEN_MAX 96U + +/** + * wait_buckets_available - wait on reclaimable buckets + * + * If there aren't enough available buckets to fill up free_inc, wait until + * there are. 
+ */ +static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) { - return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX; + unsigned long gc_count = c->gc_count; + int ret = 0; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) { + ret = -1; + break; + } + + if (gc_count != c->gc_count) + ca->inc_gen_really_needs_gc = 0; + + if ((ssize_t) (dev_buckets_available(c, ca) - + ca->inc_gen_really_needs_gc) >= + (ssize_t) fifo_free(&ca->free_inc)) + break; + + up_read(&c->gc_lock); + schedule(); + try_to_freeze(); + down_read(&c->gc_lock); + } + + __set_current_state(TASK_RUNNING); + return ret; } -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g, +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, + size_t bucket, struct bucket_mark mark) { + u8 gc_gen; + if (!is_available_bucket(mark)) return false; - if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX / 2) + gc_gen = bucket_gc_gen(ca, bucket); + + if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ca->inc_gen_needs_gc++; - if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX) + if (gc_gen >= BUCKET_GC_GEN_MAX) ca->inc_gen_really_needs_gc++; - return can_inc_bucket_gen(ca, g); + return gc_gen < BUCKET_GC_GEN_MAX; } static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - struct bucket *g) + size_t bucket) { struct bucket_mark m; spin_lock(&c->freelist_lock); - if (!bch2_invalidate_bucket(c, ca, g, &m)) { + if (!bch2_invalidate_bucket(c, ca, bucket, &m)) { spin_unlock(&c->freelist_lock); return; } - verify_not_on_freelist(c, ca, g - ca->buckets); - BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); + verify_not_on_freelist(c, ca, bucket); + BUG_ON(!fifo_push(&ca->free_inc, bucket)); spin_unlock(&c->freelist_lock); - g->prio[READ] = c->prio_clock[READ].hand; - g->prio[WRITE] = c->prio_clock[WRITE].hand; + /* gc lock held: */ + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); if (m.cached_sectors) { ca->allocator_invalidating_data = true; @@ -636,14 +663,14 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, */ static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, - struct bucket *g, struct bucket_mark m) + size_t b, struct bucket_mark m) { /* * Time since last read, scaled to [0, 8) where larger value indicates * more recently read data: */ unsigned long hotness = - (g->prio[READ] - ca->min_prio[READ]) * 7 / + (bucket(ca, b)->prio[READ] - ca->min_prio[READ]) * 7 / (c->prio_clock[READ].hand - ca->min_prio[READ]); /* How much we want to keep the data in this bucket: */ @@ -651,11 +678,11 @@ static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, (hotness + 1) * bucket_sectors_used(m); unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); + bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); return (data_wantness << 9) | (needs_journal_commit << 8) | - bucket_gc_gen(ca, g); + bucket_gc_gen(ca, b); } static inline int bucket_alloc_cmp(alloc_heap *h, @@ -667,34 +694,41 @@ static inline int bucket_alloc_cmp(alloc_heap *h, static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { + struct bucket_array *buckets; struct alloc_heap_entry e; - struct bucket *g; + size_t b; ca->alloc_heap.used = 0; - mutex_lock(&c->bucket_lock); + mutex_lock(&c->prio_clock[READ].lock); + down_read(&ca->bucket_lock); + + buckets = bucket_array(ca); + bch2_recalc_min_prio(c, ca, READ); - bch2_recalc_min_prio(c, ca, WRITE); /* 
* Find buckets with lowest read priority, by building a maxheap sorted * by read priority and repeatedly replacing the maximum element until * all buckets have been visited. */ - for_each_bucket(g, ca) { - struct bucket_mark m = READ_ONCE(g->mark); + for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - if (!bch2_can_invalidate_bucket(ca, g, m)) + if (!bch2_can_invalidate_bucket(ca, b, m)) continue; e = (struct alloc_heap_entry) { - .bucket = g - ca->buckets, - .key = bucket_sort_key(c, ca, g, m) + .bucket = b, + .key = bucket_sort_key(c, ca, b, m) }; heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); } + up_read(&ca->bucket_lock); + mutex_unlock(&c->prio_clock[READ].lock); + heap_resort(&ca->alloc_heap, bucket_alloc_cmp); /* @@ -703,52 +737,48 @@ static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca) */ while (!fifo_full(&ca->free_inc) && heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) - bch2_invalidate_one_bucket(c, ca, &ca->buckets[e.bucket]); - - mutex_unlock(&c->bucket_lock); + bch2_invalidate_one_bucket(c, ca, e.bucket); } static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) { + struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; - struct bucket *g; - size_t checked = 0; + size_t b, checked; - while (!fifo_full(&ca->free_inc)) { + for (checked = 0; + checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc); + checked++) { if (ca->fifo_last_bucket < ca->mi.first_bucket || ca->fifo_last_bucket >= ca->mi.nbuckets) ca->fifo_last_bucket = ca->mi.first_bucket; - g = ca->buckets + ca->fifo_last_bucket++; - m = READ_ONCE(g->mark); + b = ca->fifo_last_bucket++; - if (bch2_can_invalidate_bucket(ca, g, m)) - bch2_invalidate_one_bucket(c, ca, g); + m = READ_ONCE(buckets->b[b].mark); - if (++checked >= ca->mi.nbuckets) - return; + if (bch2_can_invalidate_bucket(ca, b, m)) + bch2_invalidate_one_bucket(c, ca, b); } } static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca) { + struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; - struct bucket *g; - size_t checked = 0; + size_t checked; - while (!fifo_full(&ca->free_inc)) { - size_t n = bch2_rand_range(ca->mi.nbuckets - - ca->mi.first_bucket) + + for (checked = 0; + checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc); + checked++) { + size_t b = bch2_rand_range(ca->mi.nbuckets - + ca->mi.first_bucket) + ca->mi.first_bucket; - g = ca->buckets + n; - m = READ_ONCE(g->mark); - - if (bch2_can_invalidate_bucket(ca, g, m)) - bch2_invalidate_one_bucket(c, ca, g); + m = READ_ONCE(buckets->b[b].mark); - if (++checked >= ca->mi.nbuckets / 2) - return; + if (bch2_can_invalidate_bucket(ca, b, m)) + bch2_invalidate_one_bucket(c, ca, b); } } @@ -758,15 +788,15 @@ static void invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: - invalidate_buckets_lru(c, ca); - break; - case CACHE_REPLACEMENT_FIFO: - invalidate_buckets_fifo(c, ca); - break; - case CACHE_REPLACEMENT_RANDOM: - invalidate_buckets_random(c, ca); - break; + case CACHE_REPLACEMENT_LRU: + invalidate_buckets_lru(c, ca); + break; + case CACHE_REPLACEMENT_FIFO: + invalidate_buckets_fifo(c, ca); + break; + case CACHE_REPLACEMENT_RANDOM: + invalidate_buckets_random(c, ca); + break; } } @@ -789,8 +819,7 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, BTREE_ITER_INTENT); fifo_for_each_entry(b, &ca->free_inc, i) { - 
ret = __bch2_alloc_write_key(c, ca, ca->buckets + b, - &iter, journal_seq); + ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq); if (ret) break; @@ -980,8 +1009,8 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false, - gc_pos_alloc(c, ob), 0); + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), + false, gc_pos_alloc(c, ob), 0); ob->valid = false; spin_unlock(&ob->lock); @@ -1025,26 +1054,35 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) */ static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) { - struct bucket *g; - long r = -1; + struct bucket_array *buckets; + ssize_t b; if (!down_read_trylock(&c->gc_lock)) - return r; + return -1; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - goto out; + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { + up_read(&c->gc_lock); + return -1; + } - for_each_bucket(g, ca) - if (!g->mark.touched_this_mount && - is_available_bucket(g->mark) && - bch2_mark_alloc_bucket_startup(c, ca, g)) { - r = g - ca->buckets; - set_bit(r, ca->bucket_dirty); - break; + spin_unlock(&c->freelist_lock); + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + spin_lock(&c->freelist_lock); + + for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) + if (is_startup_available_bucket(buckets->b[b].mark) && + bch2_mark_alloc_bucket_startup(c, ca, b)) { + set_bit(b, ca->buckets_dirty); + goto success; } -out: + b = -1; +success: + up_read(&ca->bucket_lock); up_read(&c->gc_lock); - return r; + return b; } static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) @@ -1069,6 +1107,7 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bool may_alloc_partial, struct closure *cl) { + struct bucket_array *buckets; struct open_bucket *ob; long bucket; @@ -1126,22 +1165,27 @@ out: ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); + lg_local_lock(&c->usage_lock); + buckets = bucket_array(ca); + ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->ptr = (struct bch_extent_ptr) { - .gen = ca->buckets[bucket].mark.gen, + .gen = buckets->b[bucket].mark.gen, .offset = bucket_to_sector(ca, bucket), .dev = ca->dev_idx, }; + + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + + lg_local_unlock(&c->usage_lock); spin_unlock(&ob->lock); spin_unlock(&c->freelist_lock); bch2_wake_allocator(ca); - ca->buckets[bucket].prio[READ] = c->prio_clock[READ].hand; - ca->buckets[bucket].prio[WRITE] = c->prio_clock[WRITE].hand; - trace_bucket_alloc(ca, reserve); return ob - c->open_buckets; } diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index 8dffb86..ee771ee 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -5,7 +5,6 @@ #include "alloc_types.h" struct bkey; -struct bucket; struct bch_dev; struct bch_fs; struct bch_devs_List; diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 90123ff..6b08104 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -5,12 +5,9 @@ #include #include "clock_types.h" +#include "fifo.h" -/* - * There's two of these clocks, one for reads and one for writes: - * - * All fields protected by bucket_lock - */ +/* There's two of these clocks, one for reads and one for writes: */ struct prio_clock { /* * "now" in (read/write) IO time - incremented whenever we do X amount @@ -31,6 +28,7 @@ struct prio_clock { int rw; struct 
io_timer rescale; + struct mutex lock; }; /* There is one reserve for each type of btree, one for prios and gens @@ -43,6 +41,8 @@ enum alloc_reserve { RESERVE_NR = 3, }; +typedef FIFO(long) alloc_fifo; + /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ #define OPEN_BUCKETS_COUNT 256 #define WRITE_POINT_COUNT 32 diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index e25baf5..02e3841 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -349,6 +349,22 @@ struct bch_dev { /* biosets used in cloned bios for writing multiple replicas */ struct bio_set replica_set; + /* + * Buckets: + * Per-bucket arrays are protected by c->usage_lock, bucket_lock and + * gc_lock, for device resize - holding any is sufficient for access: + * Or rcu_read_lock(), but only for ptr_stale(): + */ + struct bucket_array __rcu *buckets; + unsigned long *buckets_dirty; + /* most out of date gen in the btree */ + u8 *oldest_gens; + struct rw_semaphore bucket_lock; + + struct bch_dev_usage __percpu *usage_percpu; + struct bch_dev_usage usage_cached; + + /* Allocator: */ struct task_struct *alloc_thread; /* @@ -360,8 +376,8 @@ struct bch_dev { * gens/prios, they'll be moved to the free list (and possibly discarded * in the process) */ - DECLARE_FIFO(long, free)[RESERVE_NR]; - DECLARE_FIFO(long, free_inc); + alloc_fifo free[RESERVE_NR]; + alloc_fifo free_inc; spinlock_t freelist_lock; unsigned nr_invalidated; bool alloc_thread_started; @@ -371,24 +387,9 @@ struct bch_dev { size_t fifo_last_bucket; - /* Allocation stuff: */ - - /* most out of date gen in the btree */ - u8 *oldest_gens; - struct bucket *buckets; - unsigned long *bucket_dirty; - /* last calculated minimum prio */ u16 min_prio[2]; - /* - * Bucket book keeping. The first element is updated by GC, the - * second contains a saved copy of the stats from the beginning - * of GC. 
- */ - struct bch_dev_usage __percpu *usage_percpu; - struct bch_dev_usage usage_cached; - atomic_long_t saturated_count; size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; @@ -575,8 +576,6 @@ struct bch_fs { struct bch_fs_usage usage_cached; struct lglock usage_lock; - struct mutex bucket_lock; - struct closure_waitlist freelist_wait; /* diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index 079e2b5..aa2a205 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -2,6 +2,7 @@ #define _BCACHEFS_IOCTL_H #include +#include #include "bcachefs_format.h" #define BCH_FORCE_IF_DATA_LOST (1 << 0) @@ -15,6 +16,8 @@ #define BCH_BY_INDEX (1 << 4) +#define BCH_READ_DEV (1 << 5) + /* global control dev: */ #define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) @@ -46,6 +49,9 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DISK_EVACUATE _IOW(0xbc, 9, struct bch_ioctl_disk) #define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) #define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage) +#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) +#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) +#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize) struct bch_ioctl_query_uuid { uuid_le uuid; @@ -123,4 +129,23 @@ struct bch_ioctl_usage { struct bch_ioctl_dev_usage devs[0]; }; +struct bch_ioctl_read_super { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 size; + __u64 sb; +}; + +struct bch_ioctl_disk_get_idx { + __u64 dev; +}; + +struct bch_ioctl_disk_resize { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 nbuckets; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index e8e4f6d..7d1be86 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -167,6 +167,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, extent_for_each_ptr(e, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t b = PTR_BUCKET_NR(ca, ptr); struct bucket *g = PTR_BUCKET(ca, ptr); if (mustfix_fsck_err_on(!g->mark.gen_valid, c, @@ -176,7 +177,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, ptr->gen)) { g->_mark.gen = ptr->gen; g->_mark.gen_valid = 1; - set_bit(g - ca->buckets, ca->bucket_dirty); + set_bit(b, ca->buckets_dirty); } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, @@ -185,7 +186,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, ptr->gen, g->mark.gen)) { g->_mark.gen = ptr->gen; g->_mark.gen_valid = 1; - set_bit(g - ca->buckets, ca->bucket_dirty); + set_bit(b, ca->buckets_dirty); set_bit(BCH_FS_FIXED_GENS, &c->flags); } @@ -194,7 +195,6 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, } } - atomic64_set(&c->key_version, max_t(u64, k.k->version.lo, atomic64_read(&c->key_version))); @@ -302,8 +302,7 @@ static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, unsigned sectors = min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - bch2_mark_metadata_bucket(c, ca, ca->buckets + b, - type, sectors, + bch2_mark_metadata_bucket(c, ca, b, type, sectors, gc_phase(GC_PHASE_SB), flags); b++; start += sectors; @@ -335,8 +334,7 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(c, ca, ca->buckets + b, - BCH_DATA_JOURNAL, + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, 
ca->mi.bucket_size, gc_phase(GC_PHASE_SB), flags); } @@ -397,7 +395,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) for_each_member_device(ca, c, ci) { fifo_for_each_entry(i, &ca->free_inc, iter) - bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true, + bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); @@ -406,7 +404,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) for (j = 0; j < RESERVE_NR; j++) fifo_for_each_entry(i, &ca->free[j], iter) - bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true, + bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); @@ -421,7 +419,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) if (ob->valid) { gc_pos_set(c, gc_pos_alloc(c, ob)); ca = bch_dev_bkey_exists(c, ob->ptr.dev); - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true, + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, gc_pos_alloc(c, ob), BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); @@ -433,9 +431,10 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) static void bch2_gc_start(struct bch_fs *c) { struct bch_dev *ca; - struct bucket *g; + struct bucket_array *buckets; struct bucket_mark new; unsigned i; + size_t b; int cpu; lg_global_lock(&c->usage_lock); @@ -467,16 +466,21 @@ static void bch2_gc_start(struct bch_fs *c) lg_global_unlock(&c->usage_lock); /* Clear bucket marks: */ - for_each_member_device(ca, c, i) - for_each_bucket(g, ca) { - bucket_cmpxchg(g, new, ({ + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { + bucket_cmpxchg(buckets->b + b, new, ({ new.owned_by_allocator = 0; new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; })); - ca->oldest_gens[g - ca->buckets] = new.gen; + ca->oldest_gens[b] = new.gen; } + up_read(&ca->bucket_lock); + } } /** @@ -1020,7 +1024,7 @@ err: return bch2_btree_iter_unlock(&iter) ?: ret; } -int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) +static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal) { unsigned iter = 0; enum btree_id id; @@ -1044,7 +1048,7 @@ again: ret = bch2_journal_mark(c, journal); if (ret) - return ret; + return ret; if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) { if (iter++ > 2) { @@ -1071,3 +1075,14 @@ again: return 0; } + +int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) +{ + int ret; + + down_write(&c->gc_lock); + ret = __bch2_initial_gc(c, journal); + up_write(&c->gc_lock); + + return ret; +} diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 8899e3c..2dbe7d3 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -67,6 +67,7 @@ #include "btree_gc.h" #include "buckets.h" #include "error.h" +#include "movinggc.h" #include #include @@ -147,12 +148,16 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) { u16 last_seq_ondisk = c->journal.last_seq_ondisk; struct bch_dev *ca; + struct bucket_array *buckets; struct bucket *g; struct bucket_mark m; unsigned i; - for_each_member_device(ca, c, i) - for_each_bucket(g, ca) { + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) { bucket_cmpxchg(g, m, ({ if (!m.journal_seq_valid || bucket_needs_journal_commit(m, last_seq_ondisk)) @@ -161,6 +166,8 @@ void 
bch2_bucket_seq_cleanup(struct bch_fs *c) m.journal_seq_valid = 0; })); } + up_read(&ca->bucket_lock); + } } #define bch2_usage_add(_acc, _stats) \ @@ -319,20 +326,17 @@ void bch2_fs_usage_apply(struct bch_fs *c, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bucket *g, struct bucket_mark old, - struct bucket_mark new) + struct bucket_mark old, struct bucket_mark new) { struct bch_dev_usage *dev_usage; - BUG_ON((g - ca->buckets) < ca->mi.first_bucket || - (g - ca->buckets) >= ca->mi.nbuckets); + lockdep_assert_held(&c->usage_lock); bch2_fs_inconsistent_on(old.data_type && new.data_type && old.data_type != new.data_type, c, "different types of data in same bucket: %u, %u", old.data_type, new.data_type); - preempt_disable(); dev_usage = this_cpu_ptr(ca->usage_percpu); dev_usage->buckets[bucket_type(old)]--; @@ -347,7 +351,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, dev_usage->sectors[new.data_type] += new.dirty_sectors; dev_usage->sectors[BCH_DATA_CACHED] += (int) new.cached_sectors - (int) old.cached_sectors; - preempt_enable(); if (!is_available_bucket(old) && is_available_bucket(new)) bch2_wake_allocator(ca); @@ -359,16 +362,19 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(c, ca, g, _old, new); \ + bch2_dev_usage_update(c, ca, _old, new); \ _old; \ }) bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - struct bucket *g, struct bucket_mark *old) + size_t b, struct bucket_mark *old) { + struct bucket *g; struct bucket_mark new; lg_local_lock(&c->usage_lock); + g = bucket(ca, b); + *old = bucket_data_cmpxchg(c, ca, g, new, ({ if (!is_available_bucket(new)) { lg_local_unlock(&c->usage_lock); @@ -385,20 +391,22 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, lg_local_unlock(&c->usage_lock); if (!old->owned_by_allocator && old->cached_sectors) - trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets), + trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); return true; } bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca, - struct bucket *g) + size_t b) { + struct bucket *g; struct bucket_mark new, old; lg_local_lock(&c->usage_lock); + g = bucket(ca, b); + old = bucket_data_cmpxchg(c, ca, g, new, ({ - if (new.touched_this_mount || - !is_available_bucket(new)) { + if (!is_startup_available_bucket(new)) { lg_local_unlock(&c->usage_lock); return false; } @@ -412,12 +420,15 @@ bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca, } void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - struct bucket *g, bool owned_by_allocator, + size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) { + struct bucket *g; struct bucket_mark old, new; lg_local_lock(&c->usage_lock); + g = bucket(ca, b); + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && gc_will_visit(c, pos)) { lg_local_unlock(&c->usage_lock); @@ -448,15 +459,18 @@ do { \ } while (0) void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - struct bucket *g, enum bch_data_type type, + size_t b, enum bch_data_type type, unsigned sectors, struct gc_pos pos, unsigned flags) { + struct bucket *g; struct bucket_mark old, new; BUG_ON(!type); lg_local_lock(&c->usage_lock); + g = bucket(ca, b); + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && gc_will_visit(c, pos)) { lg_local_unlock(&c->usage_lock); @@ -502,7 +516,7 @@ static void 
bch2_mark_pointer(struct bch_fs *c, struct bucket_mark old, new; unsigned saturated; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr); + struct bucket *g = PTR_BUCKET(ca, ptr); enum bch_data_type data_type = type == S_META ? BCH_DATA_BTREE : BCH_DATA_USER; u64 v; @@ -584,7 +598,7 @@ static void bch2_mark_pointer(struct bch_fs *c, old.counter, new.counter)) != old.counter); - bch2_dev_usage_update(c, ca, g, old, new); + bch2_dev_usage_update(c, ca, old, new); BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); @@ -810,3 +824,158 @@ int bch2_disk_reservation_get(struct bch_fs *c, return bch2_disk_reservation_add(c, res, sectors, flags); } + +/* Startup/shutdown: */ + +static void buckets_free_rcu(struct rcu_head *rcu) +{ + struct bucket_array *buckets = + container_of(rcu, struct bucket_array, rcu); + + kvpfree(buckets, + sizeof(struct bucket_array) + + buckets->nbuckets * sizeof(struct bucket)); +} + +int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +{ + struct bucket_array *buckets = NULL, *old_buckets; + unsigned long *buckets_dirty = NULL; + u8 *oldest_gens = NULL; + alloc_fifo free[RESERVE_NR]; + alloc_fifo free_inc; + alloc_heap alloc_heap; + copygc_heap copygc_heap; + + size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / c->opts.btree_node_size); + /* XXX: these should be tunable */ + size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); + size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); + size_t free_inc_reserve = copygc_reserve / 2; + bool resize = ca->buckets != NULL, + start_copygc = ca->copygc_thread != NULL; + int ret = -ENOMEM; + unsigned i; + + memset(&free, 0, sizeof(free)); + memset(&free_inc, 0, sizeof(free_inc)); + memset(&alloc_heap, 0, sizeof(alloc_heap)); + memset(©gc_heap, 0, sizeof(copygc_heap)); + + if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + + nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO)) || + !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8), + GFP_KERNEL|__GFP_ZERO)) || + !(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)) || + !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || + !init_fifo(&free[RESERVE_MOVINGGC], + copygc_reserve, GFP_KERNEL) || + !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || + !init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) || + !init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) || + !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) + goto err; + + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = nbuckets; + + bch2_copygc_stop(ca); + + down_write(&c->gc_lock); + down_write(&ca->bucket_lock); + lg_global_lock(&c->usage_lock); + + old_buckets = bucket_array(ca); + + if (resize) { + size_t n = min(buckets->nbuckets, old_buckets->nbuckets); + + memcpy(buckets->b, + old_buckets->b, + n * sizeof(struct bucket)); + memcpy(oldest_gens, + ca->oldest_gens, + n * sizeof(u8)); + memcpy(buckets_dirty, + ca->buckets_dirty, + BITS_TO_LONGS(n) * sizeof(unsigned long)); + } + + rcu_assign_pointer(ca->buckets, buckets); + buckets = old_buckets; + + swap(ca->oldest_gens, oldest_gens); + swap(ca->buckets_dirty, buckets_dirty); + + lg_global_unlock(&c->usage_lock); + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) { + fifo_move(&free[i], &ca->free[i]); + swap(ca->free[i], free[i]); + } + fifo_move(&free_inc, &ca->free_inc); + 
swap(ca->free_inc, free_inc); + spin_unlock(&c->freelist_lock); + + /* with gc lock held, alloc_heap can't be in use: */ + swap(ca->alloc_heap, alloc_heap); + + /* and we shut down copygc: */ + swap(ca->copygc_heap, copygc_heap); + + nbuckets = ca->mi.nbuckets; + + up_write(&ca->bucket_lock); + up_write(&c->gc_lock); + + if (start_copygc && + bch2_copygc_start(c, ca)) + bch_err(ca, "error restarting copygc thread"); + + ret = 0; +err: + free_heap(©gc_heap); + free_heap(&alloc_heap); + free_fifo(&free_inc); + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&free[i]); + kvpfree(buckets_dirty, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvpfree(oldest_gens, + nbuckets * sizeof(u8)); + if (buckets) + call_rcu(&old_buckets->rcu, buckets_free_rcu); + + return ret; +} + +void bch2_dev_buckets_free(struct bch_dev *ca) +{ + unsigned i; + + free_heap(&ca->copygc_heap); + free_heap(&ca->alloc_heap); + free_fifo(&ca->free_inc); + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&ca->free[i]); + kvpfree(ca->buckets_dirty, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); + kvpfree(ca->buckets, sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); + + free_percpu(ca->usage_percpu); +} + +int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage))) + return -ENOMEM; + + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; +} diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index d0a9ec0..7824312 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -10,9 +10,9 @@ #include "buckets_types.h" #include "super.h" -#define for_each_bucket(b, ca) \ - for (b = (ca)->buckets + (ca)->mi.first_bucket; \ - b < (ca)->buckets + (ca)->mi.nbuckets; b++) +#define for_each_bucket(_b, _buckets) \ + for (_b = (_buckets)->b + (_buckets)->first_bucket; \ + _b < (_buckets)->b + (_buckets)->nbuckets; _b++) #define bucket_cmpxchg(g, new, expr) \ ({ \ @@ -28,15 +28,36 @@ _old; \ }) +static inline struct bucket_array *bucket_array(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->buckets, + lockdep_is_held(&ca->fs->usage_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); +} + +static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +{ + struct bucket_array *buckets = bucket_array(ca); + + BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + return buckets->b + b; +} + +static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, + size_t b, int rw) +{ + bucket(ca, b)->prio[rw] = c->prio_clock[rw].hand; +} + /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. 
*/ -static inline u8 bucket_gc_gen(struct bch_dev *ca, struct bucket *g) +static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) { - unsigned long r = g - ca->buckets; - return g->mark.gen - ca->oldest_gens[r]; + return bucket(ca, b)->mark.gen - ca->oldest_gens[b]; } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -45,10 +66,22 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, return sector_to_bucket(ca, ptr->offset); } -static inline struct bucket *PTR_BUCKET(const struct bch_dev *ca, +static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - return ca->buckets + PTR_BUCKET_NR(ca, ptr); + return bucket(ca, PTR_BUCKET_NR(ca, ptr)); +} + +static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + struct bucket_mark m; + + rcu_read_lock(); + m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark); + rcu_read_unlock(); + + return m; } static inline int gen_cmp(u8 a, u8 b) @@ -67,10 +100,10 @@ static inline int gen_after(u8 a, u8 b) * ptr_stale() - check if a pointer points into a bucket that has been * invalidated. */ -static inline u8 ptr_stale(const struct bch_dev *ca, +static inline u8 ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen); + return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); } /* bucket gc marks */ @@ -159,6 +192,11 @@ static inline bool is_available_bucket(struct bucket_mark mark) !mark.nouse); } +static inline bool is_startup_available_bucket(struct bucket_mark mark) +{ + return !mark.touched_this_mount && is_available_bucket(mark); +} + static inline bool bucket_needs_journal_commit(struct bucket_mark m, u16 last_seq_ondisk) { @@ -169,15 +207,14 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, void bch2_bucket_seq_cleanup(struct bch_fs *); bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, - struct bucket *, struct bucket_mark *); + size_t, struct bucket_mark *); bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *, - struct bucket *); + size_t); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, - struct bucket *, bool, - struct gc_pos, unsigned); + size_t, bool, struct gc_pos, unsigned); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, - struct bucket *, enum bch_data_type, - unsigned, struct gc_pos, unsigned); + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); #define BCH_BUCKET_MARK_NOATOMIC (1 << 0) #define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1) @@ -210,4 +247,8 @@ int bch2_disk_reservation_get(struct bch_fs *, struct disk_reservation *, unsigned, int); +int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); +void bch2_dev_buckets_free(struct bch_dev *); +int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); + #endif /* _BUCKETS_H */ diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 8a3c8c3..7cd8439 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -40,6 +40,13 @@ struct bucket { }; }; +struct bucket_array { + struct rcu_head rcu; + u16 first_bucket; + size_t nbuckets; + struct bucket b[]; +}; + struct bch_dev_usage { u64 buckets[BCH_DATA_NR]; u64 buckets_alloc; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 1ab36ac..1618ffe 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -363,6 +363,82 @@ static long bch2_ioctl_usage(struct bch_fs *c, return 0; } 
+static long bch2_ioctl_read_super(struct bch_fs *c, + struct bch_ioctl_read_super arg) +{ + struct bch_dev *ca = NULL; + struct bch_sb *sb; + int ret = 0; + + if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || + arg.pad) + return -EINVAL; + + mutex_lock(&c->sb_lock); + + if (arg.flags & BCH_READ_DEV) { + ca = bch2_device_lookup(c, arg.dev, arg.flags); + + if (IS_ERR(ca)) { + ret = PTR_ERR(ca); + goto err; + } + + sb = ca->disk_sb.sb; + } else { + sb = c->disk_sb; + } + + if (vstruct_bytes(sb) > arg.size) { + ret = -ERANGE; + goto err; + } + + ret = copy_to_user((void __user *)(unsigned long)arg.sb, + sb, vstruct_bytes(sb)); +err: + if (ca) + percpu_ref_put(&ca->ref); + mutex_unlock(&c->sb_lock); + return ret; +} + +static long bch2_ioctl_disk_get_idx(struct bch_fs *c, + struct bch_ioctl_disk_get_idx arg) +{ + dev_t dev = huge_decode_dev(arg.dev); + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) + if (ca->disk_sb.bdev->bd_dev == dev) { + percpu_ref_put(&ca->io_ref); + return i; + } + + return -ENOENT; +} + +static long bch2_ioctl_disk_resize(struct bch_fs *c, + struct bch_ioctl_disk_resize arg) +{ + struct bch_dev *ca; + int ret; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_resize(c, ca, arg.nbuckets); + + percpu_ref_put(&ca->ref); + return ret; +} + #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ @@ -404,6 +480,12 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); case BCH_IOCTL_DISK_EVACUATE: BCH_IOCTL(disk_evacuate, struct bch_ioctl_disk); + case BCH_IOCTL_READ_SUPER: + BCH_IOCTL(read_super, struct bch_ioctl_read_super); + case BCH_IOCTL_DISK_GET_IDX: + BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); + case BCH_IOCTL_DISK_RESIZE: + BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); default: return -ENOTTY; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 51262d6..2b4a2dc 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -123,6 +123,22 @@ bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) return NULL; } +bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) +{ + struct bch_extent_ptr *ptr; + bool dropped = false; + + extent_for_each_ptr_backwards(e, ptr) + if (ptr->dev == dev) { + __bch2_extent_drop_ptr(e, ptr); + dropped = true; + } + + if (dropped) + bch2_extent_drop_redundant_crcs(e); + return dropped; +} + unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e) { const struct bch_extent_ptr *ptr; @@ -225,20 +241,6 @@ void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) bch2_extent_drop_redundant_crcs(e); } -void bch2_extent_drop_ptr_idx(struct bkey_s_extent e, unsigned idx) -{ - struct bch_extent_ptr *ptr; - unsigned i = 0; - - extent_for_each_ptr(e, ptr) - if (i++ == idx) - goto found; - - BUG(); -found: - bch2_extent_drop_ptr(e, ptr); -} - static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, struct bch_extent_crc_unpacked n) { @@ -634,14 +636,13 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, unsigned seq; const char *err; char buf[160]; - struct bucket *g; + struct bucket_mark mark; struct bch_dev *ca; unsigned replicas = 0; bool bad; extent_for_each_ptr(e, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); - g = PTR_BUCKET(ca, ptr); replicas++; if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)) @@ -653,9 +654,11 @@ 
static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, do { seq = read_seqcount_begin(&c->gc_pos_lock); + mark = ptr_bucket_mark(ca, ptr); + bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (g->mark.data_type != BCH_DATA_BTREE || - g->mark.dirty_sectors < c->opts.btree_node_size); + (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size); } while (read_seqcount_retry(&c->gc_pos_lock, seq)); err = "inconsistent"; @@ -676,11 +679,9 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, err: bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); bch2_fs_bug(c, "%s btree pointer %s: bucket %zi " - "gen %i last_gc %i mark %08x", + "gen %i mark %08x", err, buf, PTR_BUCKET_NR(ca, ptr), - PTR_BUCKET(ca, ptr)->mark.gen, - ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], - (unsigned) g->mark.counter); + mark.gen, (unsigned) mark.counter); } static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, @@ -1730,7 +1731,6 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, { const struct bch_extent_ptr *ptr; struct bch_dev *ca; - struct bucket *g; struct bucket_mark mark; unsigned seq, stale; char buf[160]; @@ -1751,7 +1751,6 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, extent_for_each_ptr(e, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); - g = PTR_BUCKET(ca, ptr); replicas++; ptrs_per_tier[ca->mi.tier]++; @@ -1766,7 +1765,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, do { seq = read_seqcount_begin(&c->gc_pos_lock); - mark = READ_ONCE(g->mark); + mark = ptr_bucket_mark(ca, ptr); /* between mark and bucket gen */ smp_rmb(); @@ -1819,10 +1818,8 @@ bad_ptr: bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " - "gen %i last_gc %i type %u", - buf, PTR_BUCKET_NR(ca, ptr), mark.gen, - ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)], - mark.data_type); + "gen %i type %u", buf, + PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); return; } diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index eb81b74..aeae361 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -42,6 +42,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent); const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); +bool bch2_extent_drop_device(struct bkey_s_extent, unsigned); unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); @@ -432,7 +433,6 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); -void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned); bool bch2_cut_front(struct bpos, struct bkey_i *); bool bch2_cut_back(struct bpos, struct bkey *); diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h index 0a9c0c9..98f22f6 100644 --- a/libbcachefs/fifo.h +++ b/libbcachefs/fifo.h @@ -3,11 +3,13 @@ #include "util.h" -#define DECLARE_FIFO(type, name) \ - struct { \ - size_t front, back, size, mask; \ - type *data; \ - } name +#define FIFO(type) \ +struct { \ + size_t front, back, size, mask; \ + type *data; \ +} + +#define DECLARE_FIFO(type, name) FIFO(type) name #define fifo_buf_size(fifo) \ (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0])) diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 
3369a2f..e045eb2 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -262,16 +262,17 @@ static void bch2_write_index(struct closure *cl) if (test_bit(ptr->dev, op->failed.d)) bch2_extent_drop_ptr(e, ptr); - ret = bch2_extent_nr_ptrs(e.c) - ? bch2_check_mark_super(c, e.c, BCH_DATA_USER) - : -EIO; - if (ret) { - keys->top = keys->keys; - op->error = ret; - op->flags |= BCH_WRITE_DONE; + if (!bch2_extent_nr_ptrs(e.c)) { + ret = -EIO; goto err; } + if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) { + ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER); + if (ret) + goto err; + } + dst = bkey_next(dst); } @@ -290,7 +291,7 @@ static void bch2_write_index(struct closure *cl) op->error = ret; } } -err: +out: bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); if (!(op->flags & BCH_WRITE_DONE)) @@ -304,6 +305,12 @@ err: } else { continue_at_nobarrier(cl, bch2_write_done, NULL); } + return; +err: + keys->top = keys->keys; + op->error = ret; + op->flags |= BCH_WRITE_DONE; + goto out; } static void bch2_write_endio(struct bio *bio) @@ -351,7 +358,6 @@ static void init_append_extent(struct bch_write_op *op, bch2_extent_crc_append(e, crc); bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); - bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED)); bch2_keylist_push(&op->insert_keys); } @@ -1298,7 +1304,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(e.k); int ret = 0; - PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand; + lg_local_lock(&c->usage_lock); + bucket_io_clock_reset(c, pick->ca, + PTR_BUCKET_NR(pick->ca, &pick->ptr), READ); + lg_local_unlock(&c->usage_lock); narrow_crcs = should_narrow_crcs(e, pick, flags); diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 0c145eb..71eee4f 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -32,11 +32,12 @@ enum bch_write_flags { BCH_WRITE_PAGES_OWNED = (1 << 5), BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_NOPUT_RESERVATION = (1 << 7), + BCH_WRITE_NOMARK_REPLICAS = (1 << 8), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), - BCH_WRITE_DONE = (1 << 9), - BCH_WRITE_LOOPED = (1 << 10), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), + BCH_WRITE_DONE = (1 << 10), + BCH_WRITE_LOOPED = (1 << 11), }; static inline u64 *op_journal_seq(struct bch_write_op *op) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index ecae9b0..829e064 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1629,8 +1629,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, ja->nr++; spin_unlock(&j->lock); - bch2_mark_metadata_bucket(c, ca, &ca->buckets[bucket], - BCH_DATA_JOURNAL, + bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), 0); @@ -2021,10 +2020,11 @@ static void journal_reclaim_work(struct work_struct *work) /** * journal_next_bucket - move on to the next journal bucket if possible */ -static int journal_write_alloc(struct journal *j, unsigned sectors) +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned sectors) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); + struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct journal_device *ja; struct bch_dev *ca; @@ -2033,6 +2033,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) READ_ONCE(c->opts.metadata_replicas); spin_lock(&j->lock); + e = bkey_i_to_s_extent(&j->key); /* * 
Drop any pointers to devices that have been removed, are no longer @@ -2098,6 +2099,8 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) rcu_read_unlock(); j->prev_buf_sectors = 0; + + bkey_copy(&w->key, &j->key); spin_unlock(&j->lock); if (replicas < c->opts.metadata_replicas_required) @@ -2173,13 +2176,26 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) static void journal_write_done(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_prev_buf(j); + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&w->key); + + if (!bch2_extent_nr_ptrs(e)) { + bch_err(c, "unable to write journal to sufficient devices"); + goto err; + } + if (bch2_check_mark_super(c, e, BCH_DATA_JOURNAL)) + goto err; +out: __bch2_time_stats_update(j->write_time, j->write_start_time); spin_lock(&j->lock); j->last_seq_ondisk = le64_to_cpu(w->data->last_seq); + journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = + bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); + /* * Updating last_seq_ondisk may let journal_reclaim_work() discard more * buckets: @@ -2202,31 +2218,6 @@ static void journal_write_done(struct closure *cl) if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) mod_delayed_work(system_freezable_wq, &j->write_work, 0); spin_unlock(&j->lock); -} - -static void journal_write_error(struct closure *cl) -{ - struct journal *j = container_of(cl, struct journal, io); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); - - while (j->replicas_failed) { - unsigned idx = __fls(j->replicas_failed); - - bch2_extent_drop_ptr_idx(e, idx); - j->replicas_failed ^= 1 << idx; - } - - if (!bch2_extent_nr_ptrs(e.c)) { - bch_err(c, "unable to write journal to sufficient devices"); - goto err; - } - - if (bch2_check_mark_super(c, e.c, BCH_DATA_JOURNAL)) - goto err; - -out: - journal_write_done(cl); return; err: bch2_fatal_error(c); @@ -2241,12 +2232,12 @@ static void journal_write_endio(struct bio *bio) if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || bch2_meta_write_fault("journal")) { - /* Was this a flush or an actual journal write? 
*/ - if (ca->journal.ptr_idx != U8_MAX) { - set_bit(ca->journal.ptr_idx, &j->replicas_failed); - set_closure_fn(&j->io, journal_write_error, - system_highpri_wq); - } + struct journal_buf *w = journal_prev_buf(j); + unsigned long flags; + + spin_lock_irqsave(&j->err_lock, flags); + bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx); + spin_unlock_irqrestore(&j->err_lock, flags); } closure_put(&j->io); @@ -2262,7 +2253,7 @@ static void journal_write(struct closure *cl) struct jset *jset; struct bio *bio; struct bch_extent_ptr *ptr; - unsigned i, sectors, bytes, ptr_idx = 0; + unsigned i, sectors, bytes; journal_buf_realloc(j, w); jset = w->data; @@ -2309,20 +2300,13 @@ static void journal_write(struct closure *cl) bytes = vstruct_bytes(w->data); memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); - if (journal_write_alloc(j, sectors)) { + if (journal_write_alloc(j, w, sectors)) { bch2_journal_halt(j); bch_err(c, "Unable to allocate journal write"); bch2_fatal_error(c); continue_at(cl, journal_write_done, system_highpri_wq); } - if (bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key), - BCH_DATA_JOURNAL)) - goto err; - - journal_seq_pin(j, le64_to_cpu(jset->seq))->devs = - bch2_extent_devs(bkey_i_to_s_c_extent(&j->key)); - /* * XXX: we really should just disable the entire journal in nochanges * mode @@ -2330,7 +2314,7 @@ static void journal_write(struct closure *cl) if (c->opts.nochanges) goto no_io; - extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) { + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); if (!percpu_ref_tryget(&ca->io_ref)) { /* XXX: fix this */ @@ -2341,7 +2325,6 @@ static void journal_write(struct closure *cl) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], sectors); - ca->journal.ptr_idx = ptr_idx++; bio = ca->journal.bio; bio_reset(bio); bio->bi_iter.bi_sector = ptr->offset; @@ -2361,10 +2344,9 @@ static void journal_write(struct closure *cl) for_each_rw_member(ca, c, i) if (journal_flushes_device(ca) && - !bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) { + !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { percpu_ref_get(&ca->io_ref); - ca->journal.ptr_idx = U8_MAX; bio = ca->journal.bio; bio_reset(bio); bio->bi_bdev = ca->disk_sb.bdev; @@ -2375,7 +2357,7 @@ static void journal_write(struct closure *cl) } no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) ptr->offset += sectors; continue_at(cl, journal_write_done, system_highpri_wq); @@ -2779,163 +2761,32 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx) return ret; } -ssize_t bch2_journal_print_debug(struct journal *j, char *buf) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union journal_res_state *s = &j->reservations; - struct bch_dev *ca; - unsigned iter; - ssize_t ret = 0; - - rcu_read_lock(); - spin_lock(&j->lock); - - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "active journal entries:\t%zu\n" - "seq:\t\t\t%llu\n" - "last_seq:\t\t%llu\n" - "last_seq_ondisk:\t%llu\n" - "reservation count:\t%u\n" - "reservation offset:\t%u\n" - "current entry u64s:\t%u\n" - "io in flight:\t\t%i\n" - "need write:\t\t%i\n" - "dirty:\t\t\t%i\n" - "replay done:\t\t%i\n", - fifo_used(&j->pin), - (u64) atomic64_read(&j->seq), - last_seq(j), - j->last_seq_ondisk, - journal_state_count(*s, s->idx), - s->cur_entry_offset, - j->cur_entry_u64s, - s->prev_buf_unwritten, - test_bit(JOURNAL_NEED_WRITE, &j->flags), - 
journal_entry_is_open(j), - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - - for_each_member_device_rcu(ca, c, iter, - &c->rw_devs[BCH_DATA_JOURNAL]) { - struct journal_device *ja = &ca->journal; - - if (!ja->nr) - continue; - - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "dev %u:\n" - "\tnr\t\t%u\n" - "\tcur_idx\t\t%u (seq %llu)\n" - "\tlast_idx\t%u (seq %llu)\n", - iter, ja->nr, - ja->cur_idx, ja->bucket_seq[ja->cur_idx], - ja->last_idx, ja->bucket_seq[ja->last_idx]); - } - - spin_unlock(&j->lock); - rcu_read_unlock(); - - return ret; -} - -ssize_t bch2_journal_print_pins(struct journal *j, char *buf) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *pin; - ssize_t ret = 0; - unsigned i; - - spin_lock_irq(&j->pin_lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, i) { - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "%llu: count %u\n", - journal_pin_seq(j, pin_list), - atomic_read(&pin_list->count)); - - list_for_each_entry(pin, &pin_list->list, list) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "\t%p %pf\n", - pin, pin->flush); - - if (!list_empty(&pin_list->flushed)) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "flushed:\n"); - - list_for_each_entry(pin, &pin_list->flushed, list) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "\t%p %pf\n", - pin, pin->flush); - } - spin_unlock_irq(&j->pin_lock); - - return ret; -} +/* startup/shutdown: */ -static bool bch2_journal_writing_to_device(struct bch_dev *ca) +static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) { - struct journal *j = &ca->fs->journal; + union journal_res_state state; + struct journal_buf *w; bool ret; spin_lock(&j->lock); - ret = bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key), - ca->dev_idx); + state = READ_ONCE(j->reservations); + w = j->buf + !state.idx; + + ret = state.prev_buf_unwritten && + bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx); spin_unlock(&j->lock); return ret; } -/* - * This asumes that ca has already been marked read-only so that - * journal_next_bucket won't pick buckets out of ca any more. - * Hence, if the journal is not currently pointing to ca, there - * will be no new writes to journal entries in ca after all the - * pending ones have been flushed to disk. - * - * If the journal is being written to ca, write a new record, and - * journal_next_bucket will notice that the device is no longer - * writeable and pick a new set of devices to write to. - */ - -int bch2_journal_move(struct bch_dev *ca) +void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) { - struct journal_device *ja = &ca->journal; - struct journal *j = &ca->fs->journal; - u64 seq_to_flush = 0; - unsigned i; - int ret; - - if (bch2_journal_writing_to_device(ca)) { - /* - * bch_journal_meta will write a record and we'll wait - * for the write to complete. - * Actually writing the journal (journal_write_locked) - * will call journal_next_bucket which notices that the - * device is no longer writeable, and picks a new one. - */ - bch2_journal_meta(j); - BUG_ON(bch2_journal_writing_to_device(ca)); - } - - for (i = 0; i < ja->nr; i++) - seq_to_flush = max(seq_to_flush, ja->bucket_seq[i]); - - bch2_journal_flush_pins(j, seq_to_flush); - - /* - * Force a meta-data journal entry to be written so that - * we have newer journal entries in devices other than ca, - * and wait for the meta data write to complete. 
- */ - bch2_journal_meta(j); - - /* - * Verify that we no longer need any of the journal entries in - * the device - */ spin_lock(&j->lock); - ret = j->last_seq_ondisk > seq_to_flush ? 0 : -EIO; + bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx); spin_unlock(&j->lock); - return ret; + wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); } void bch2_fs_journal_stop(struct journal *j) @@ -3006,6 +2857,7 @@ int bch2_fs_journal_init(struct journal *j) spin_lock_init(&j->lock); spin_lock_init(&j->pin_lock); + spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); @@ -3035,3 +2887,96 @@ int bch2_fs_journal_init(struct journal *j) return 0; } + +/* debug: */ + +ssize_t bch2_journal_print_debug(struct journal *j, char *buf) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state *s = &j->reservations; + struct bch_dev *ca; + unsigned iter; + ssize_t ret = 0; + + rcu_read_lock(); + spin_lock(&j->lock); + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "active journal entries:\t%zu\n" + "seq:\t\t\t%llu\n" + "last_seq:\t\t%llu\n" + "last_seq_ondisk:\t%llu\n" + "reservation count:\t%u\n" + "reservation offset:\t%u\n" + "current entry u64s:\t%u\n" + "io in flight:\t\t%i\n" + "need write:\t\t%i\n" + "dirty:\t\t\t%i\n" + "replay done:\t\t%i\n", + fifo_used(&j->pin), + (u64) atomic64_read(&j->seq), + last_seq(j), + j->last_seq_ondisk, + journal_state_count(*s, s->idx), + s->cur_entry_offset, + j->cur_entry_u64s, + s->prev_buf_unwritten, + test_bit(JOURNAL_NEED_WRITE, &j->flags), + journal_entry_is_open(j), + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + + for_each_member_device_rcu(ca, c, iter, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "dev %u:\n" + "\tnr\t\t%u\n" + "\tcur_idx\t\t%u (seq %llu)\n" + "\tlast_idx\t%u (seq %llu)\n", + iter, ja->nr, + ja->cur_idx, ja->bucket_seq[ja->cur_idx], + ja->last_idx, ja->bucket_seq[ja->last_idx]); + } + + spin_unlock(&j->lock); + rcu_read_unlock(); + + return ret; +} + +ssize_t bch2_journal_print_pins(struct journal *j, char *buf) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + ssize_t ret = 0; + unsigned i; + + spin_lock_irq(&j->pin_lock); + fifo_for_each_entry_ptr(pin_list, &j->pin, i) { + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "%llu: count %u\n", + journal_pin_seq(j, pin_list), + atomic_read(&pin_list->count)); + + list_for_each_entry(pin, &pin_list->list, list) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "\t%p %pf\n", + pin, pin->flush); + + if (!list_empty(&pin_list->flushed)) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "flushed:\n"); + + list_for_each_entry(pin, &pin_list->flushed, list) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "\t%p %pf\n", + pin, pin->flush); + } + spin_unlock_irq(&j->pin_lock); + + return ret; +} diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 5f3ece0..b3e6b2b 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -398,8 +398,7 @@ static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) : 0; } -int bch2_journal_move(struct bch_dev *); - +void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, 
struct bch_sb *); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 87f378a..66923cf 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -17,6 +17,8 @@ struct journal_res; struct journal_buf { struct jset *data; + BKEY_PADDED(key); + struct closure_waitlist wait; unsigned size; @@ -141,7 +143,6 @@ struct journal { struct closure io; struct delayed_work write_work; - unsigned long replicas_failed; /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; @@ -179,6 +180,7 @@ struct journal { BKEY_PADDED(key); struct write_point wp; + spinlock_t err_lock; struct delayed_work reclaim_work; unsigned long last_flushed; @@ -230,7 +232,6 @@ struct journal_device { /* Bio for journal reads/writes to this device */ struct bio *bio; - u8 ptr_idx; /* for bch_journal_read_device */ struct closure read; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index e11ee95..328316a 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -196,26 +196,13 @@ static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca, return 0; mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, - (1 << BCH_DATA_JOURNAL)| - (1 << BCH_DATA_BTREE)); - - /* 1st, Move the btree nodes off the device */ + bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); for (i = 0; i < BTREE_ID_NR; i++) { ret = bch2_move_btree_off(c, ca, i); if (ret) goto err; } - - /* There are no prios/gens to move -- they are already in the device. */ - - /* 2nd. Move the journal off the device */ - - ret = bch2_journal_move(ca); - if (ret) - goto err; - err: bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); @@ -231,15 +218,12 @@ int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags) static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, unsigned dev_idx, int flags, bool metadata) { - struct bch_extent_ptr *ptr; unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; unsigned degraded = metadata ? 
BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; unsigned nr_good; - extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == dev_idx) - bch2_extent_drop_ptr(e, ptr); + bch2_extent_drop_device(e, dev_idx); nr_good = bch2_extent_nr_good_ptrs(c, e.c); if ((!nr_good && !(flags & lost)) || diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 8ce63d6..a3de3b0 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -116,6 +116,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_extent_normalize(c, extent_i_to_s(insert).s); bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert)); + ret = bch2_check_mark_super(c, extent_i_to_s_c(insert), + BCH_DATA_USER); + if (ret) + break; + ret = bch2_btree_insert_at(c, &op->res, NULL, op_journal_seq(op), BTREE_INSERT_ATOMIC| @@ -178,7 +183,8 @@ void bch2_migrate_write_init(struct migrate_write *m, m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED; + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_NOMARK_REPLICAS; m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; m->op.nr_replicas = 1; diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index c6a9ac2..90eb4ca 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -99,10 +99,11 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) { copygc_heap *h = &ca->copygc_heap; struct copygc_heap_entry e, *i; - struct bucket *g; + struct bucket_array *buckets; u64 keys_moved, sectors_moved; u64 sectors_to_move = 0, sectors_not_moved = 0; u64 buckets_to_move, buckets_not_moved = 0; + size_t b; int ret; closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); @@ -113,15 +114,18 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) * and repeatedly replacing the maximum element until all * buckets have been visited. 
*/ + h->used = 0; /* * We need bucket marks to be up to date - gc can't be recalculating * them: */ down_read(&c->gc_lock); - h->used = 0; - for_each_bucket(g, ca) { - struct bucket_mark m = READ_ONCE(g->mark); + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); struct copygc_heap_entry e; if (m.owned_by_allocator || @@ -131,11 +135,12 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) continue; e = (struct copygc_heap_entry) { - .offset = bucket_to_sector(ca, g - ca->buckets), + .offset = bucket_to_sector(ca, b), .mark = m }; heap_add_or_replace(h, e, -sectors_used_cmp); } + up_read(&ca->bucket_lock); up_read(&c->gc_lock); for (i = h->data; i < h->data + h->used; i++) @@ -165,15 +170,18 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) &keys_moved, &sectors_moved); + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); for (i = h->data; i < h->data + h->used; i++) { - size_t bucket = sector_to_bucket(ca, i->offset); - struct bucket_mark m = READ_ONCE(ca->buckets[bucket].mark); + size_t b = sector_to_bucket(ca, i->offset); + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); if (i->mark.gen == m.gen && bucket_sectors_used(m)) { sectors_not_moved += bucket_sectors_used(m); buckets_not_moved++; } } + up_read(&ca->bucket_lock); if (sectors_not_moved && !ret) bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 60a2d83..c928307 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -88,7 +88,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); -static int bch2_dev_sysfs_online(struct bch_dev *); +static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev) @@ -480,7 +480,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->state_lock); mutex_init(&c->sb_lock); mutex_init(&c->replicas_gc_lock); - mutex_init(&c->bucket_lock); mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); @@ -512,11 +511,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->gc_pos_lock); - c->prio_clock[READ].hand = 1; - c->prio_clock[READ].min_prio = 0; - c->prio_clock[WRITE].hand = 1; - c->prio_clock[WRITE].min_prio = 0; - init_waitqueue_head(&c->writeback_wait); c->writeback_pages_max = (256 << 10) / PAGE_SIZE; @@ -649,7 +643,7 @@ static const char *__bch2_fs_online(struct bch_fs *c) err = "error creating sysfs objects"; __for_each_member_device(ca, c, i, NULL) - if (bch2_dev_sysfs_online(ca)) + if (bch2_dev_sysfs_online(c, ca)) goto err; list_add(&c->list, &bch_fs_list); @@ -958,8 +952,6 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - unsigned i; - cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -975,25 +967,15 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->io_done); bioset_exit(&ca->replica_set); - free_percpu(ca->usage_percpu); - kvpfree(ca->bucket_dirty, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket)); - kvpfree(ca->oldest_gens,
ca->mi.nbuckets * sizeof(u8)); - free_heap(&ca->copygc_heap); - free_heap(&ca->alloc_heap); - free_fifo(&ca->free_inc); - - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&ca->free[i]); + bch2_dev_buckets_free(ca); percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); kobject_put(&ca->kobj); } -static void __bch2_dev_offline(struct bch_dev *ca) +static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; lockdep_assert_held(&c->state_lock); @@ -1032,9 +1014,8 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref) complete(&ca->io_ref_completion); } -static int bch2_dev_sysfs_online(struct bch_dev *ca) +static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; int ret; if (!c->kobj.state_in_sysfs) @@ -1065,9 +1046,6 @@ static int bch2_dev_sysfs_online(struct bch_dev *ca) static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) { struct bch_member *member; - size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve; - size_t heap_size; - unsigned i, btree_node_reserve_buckets; struct bch_dev *ca; if (bch2_fs_init_fault("dev_alloc")) @@ -1084,6 +1062,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); + init_rwsem(&ca->bucket_lock); + writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); spin_lock_init(&ca->freelist_lock); @@ -1100,56 +1080,20 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->uuid = member->uuid; scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); - /* XXX: tune these */ - movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); - reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); - /* - * free_inc must be smaller than the copygc reserve: if it was bigger, - * one copygc iteration might not make enough buckets available to fill - * up free_inc and allow the allocator to make forward progress - */ - free_inc_reserve = movinggc_reserve / 2; - heap_size = movinggc_reserve * 8; - - btree_node_reserve_buckets = - DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / c->opts.btree_node_size); - if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets, - GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_MOVINGGC], - movinggc_reserve, GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) || - !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) || - !init_heap(&ca->alloc_heap, free_inc_reserve, GFP_KERNEL) || - !init_heap(&ca->copygc_heap,heap_size, GFP_KERNEL) || - !(ca->oldest_gens = kvpmalloc(ca->mi.nbuckets * - sizeof(u8), - GFP_KERNEL|__GFP_ZERO)) || - !(ca->buckets = kvpmalloc(ca->mi.nbuckets * - sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO)) || - !(ca->bucket_dirty = kvpmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)) || - !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) || + bch2_dev_buckets_alloc(c, ca) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio), 0) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; - total_reserve = ca->free_inc.size; - for (i = 0; i < RESERVE_NR; i++) - total_reserve += ca->free[i].size; - ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); - if (bch2_dev_sysfs_online(ca)) + if (bch2_dev_sysfs_online(c, ca)) pr_warn("error creating sysfs objects"); 
return 0; @@ -1202,9 +1146,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) bdevname(ca->disk_sb.bdev, c->name); bdevname(ca->disk_sb.bdev, ca->name); - if (bch2_dev_sysfs_online(ca)) - pr_warn("error creating sysfs objects"); - bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); if (ca->mi.state == BCH_MEMBER_STATE_RW) @@ -1311,12 +1252,11 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) bch2_copygc_stop(ca); /* - * This stops new data writes (e.g. to existing open data - * buckets) and then waits for all existing writes to - * complete. + * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_stop(ca); bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); } static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) @@ -1393,16 +1333,13 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) percpu_ref_put(&ca->ref); /* XXX */ - if (ca->mi.state == BCH_MEMBER_STATE_RW) { - bch_err(ca, "Cannot remove RW device"); - goto err; - } - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { bch_err(ca, "Cannot remove without losing data"); goto err; } + __bch2_dev_read_only(c, ca); + /* * XXX: verify that dev_idx is really not in use anymore, anywhere * @@ -1452,7 +1389,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) goto err; } - __bch2_dev_offline(ca); + __bch2_dev_offline(c, ca); mutex_lock(&c->sb_lock); rcu_assign_pointer(c->devs[ca->dev_idx], NULL); @@ -1477,6 +1414,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_unlock(&c->state_lock); return 0; err: + if (ca->mi.state == BCH_MEMBER_STATE_RW) + __bch2_dev_read_write(c, ca); mutex_unlock(&c->state_lock); return ret; } @@ -1645,7 +1584,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) return -EINVAL; } - __bch2_dev_offline(ca); + __bch2_dev_offline(c, ca); mutex_unlock(&c->state_lock); return 0; @@ -1658,7 +1597,8 @@ int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca) mutex_lock(&c->state_lock); - if (ca->mi.state == BCH_MEMBER_STATE_RW) { + if (ca->mi.state == BCH_MEMBER_STATE_RW && + bch2_dev_is_online(ca)) { bch_err(ca, "Cannot migrate data off RW device"); ret = -EINVAL; goto err; @@ -1681,6 +1621,46 @@ err: return ret; } +int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +{ + struct bch_member *mi; + int ret = 0; + + mutex_lock(&c->state_lock); + + if (nbuckets < ca->mi.nbuckets) { + bch_err(ca, "Cannot shrink yet"); + ret = -EINVAL; + goto err; + } + + if (bch2_dev_is_online(ca) && + get_capacity(ca->disk_sb.bdev->bd_disk) < + ca->mi.bucket_size * nbuckets) { + bch_err(ca, "New size larger than device"); + ret = -EINVAL; + goto err; + } + + ret = bch2_dev_buckets_resize(c, ca, nbuckets); + if (ret) { + bch_err(ca, "Resize error: %i", ret); + goto err; + } + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + mi->nbuckets = cpu_to_le64(nbuckets); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + bch2_recalc_capacity(c); +err: + mutex_unlock(&c->state_lock); + return ret; +} + /* Filesystem open: */ const char *bch2_fs_open(char * const *devices, unsigned nr_devices, diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 7ebe598..3189da6 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -189,6 +189,7 @@ int bch2_dev_add(struct bch_fs *, const char *); int bch2_dev_online(struct 
bch_fs *, const char *); int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); int bch2_dev_evacuate(struct bch_fs *, struct bch_dev *); +int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); bool bch2_fs_emergency_read_only(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 11112dc..dc70fb0 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -606,26 +606,28 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -typedef unsigned (bucket_map_fn)(struct bch_dev *, struct bucket *, void *); +typedef unsigned (bucket_map_fn)(struct bch_dev *, size_t, void *); -static unsigned bucket_priority_fn(struct bch_dev *ca, struct bucket *g, +static unsigned bucket_priority_fn(struct bch_dev *ca, size_t b, void *private) { + struct bucket *g = bucket(ca, b); int rw = (private ? 1 : 0); return ca->fs->prio_clock[rw].hand - g->prio[rw]; } -static unsigned bucket_sectors_used_fn(struct bch_dev *ca, struct bucket *g, +static unsigned bucket_sectors_used_fn(struct bch_dev *ca, size_t b, void *private) { + struct bucket *g = bucket(ca, b); return bucket_sectors_used(g->mark); } -static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, struct bucket *g, +static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, size_t b, void *private) { - return bucket_gc_gen(ca, g); + return bucket_gc_gen(ca, b); } static ssize_t show_quantiles(struct bch_dev *ca, char *buf, @@ -634,19 +636,25 @@ static ssize_t show_quantiles(struct bch_dev *ca, char *buf, int cmp(const void *l, const void *r) { return *((unsigned *) r) - *((unsigned *) l); } - size_t n = ca->mi.nbuckets, i; + size_t i, n; /* Compute 31 quantiles */ unsigned q[31], *p; ssize_t ret = 0; - p = vzalloc(ca->mi.nbuckets * sizeof(unsigned)); - if (!p) + down_read(&ca->bucket_lock); + n = ca->mi.nbuckets; + + p = vzalloc(n * sizeof(unsigned)); + if (!p) { + up_read(&ca->bucket_lock); return -ENOMEM; + } for (i = ca->mi.first_bucket; i < n; i++) - p[i] = fn(ca, &ca->buckets[i], private); + p[i] = fn(ca, i, private); sort(p, n, sizeof(unsigned), cmp, NULL); + up_read(&ca->bucket_lock); while (n && !p[n - 1])