From 99b12cd3a4cdd19985624b79a8c54716cad649bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Nov 2021 15:27:31 -0500 Subject: [PATCH] Update bcachefs sources to 50d6a25d9c bcachefs: Erasure coding fixes --- .bcachefs_revision | 2 +- cmd_migrate.c | 2 - libbcachefs/alloc_background.c | 3 + libbcachefs/alloc_background.h | 2 + libbcachefs/alloc_foreground.c | 35 ++- libbcachefs/alloc_foreground.h | 10 +- libbcachefs/bcachefs.h | 1 + libbcachefs/btree_gc.c | 35 +-- libbcachefs/btree_iter.c | 60 +++-- libbcachefs/btree_iter.h | 14 +- libbcachefs/btree_types.h | 3 + libbcachefs/btree_update_interior.c | 7 +- libbcachefs/btree_update_leaf.c | 24 +- libbcachefs/buckets.c | 284 ++++++++++++++------- libbcachefs/buckets.h | 4 +- libbcachefs/dirent.c | 5 +- libbcachefs/ec.c | 46 +++- libbcachefs/errcode.h | 12 + libbcachefs/fs-io.c | 382 +++++++++++++++++++++------- libbcachefs/fs.c | 28 +- libbcachefs/fs.h | 6 + libbcachefs/io.c | 4 + libbcachefs/opts.h | 12 +- libbcachefs/quota.c | 69 +++-- libbcachefs/reflink.c | 3 +- libbcachefs/replicas.c | 66 ----- libbcachefs/replicas.h | 3 - libbcachefs/subvolume.c | 9 + libbcachefs/subvolume.h | 2 + libbcachefs/util.h | 2 - 30 files changed, 735 insertions(+), 400 deletions(-) create mode 100644 libbcachefs/errcode.h diff --git a/.bcachefs_revision b/.bcachefs_revision index e4c9f20..236c0c1 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -4c24a1cf56583a3da1e14eb1bce2c3240d860b06 +50d6a25d9c0090d84ad9aadd29f76bc0abff5423 diff --git a/cmd_migrate.c b/cmd_migrate.c index 41cfe5d..fa46730 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -331,8 +331,6 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, die("error reserving space in new filesystem: %s", strerror(-ret)); - bch2_mark_bkey_replicas(c, extent_i_to_s_c(e).s_c); - ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, NULL, 0); if (ret) diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index b2735c8..bf3611e 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -336,6 +336,9 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) g->_mark.data_type = u.data_type; g->_mark.dirty_sectors = u.dirty_sectors; g->_mark.cached_sectors = u.cached_sectors; + g->_mark.stripe = u.stripe != 0; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; g->oldest_gen = u.oldest_gen; diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 370573f..b1efc14 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -65,6 +65,8 @@ alloc_mem_to_key(struct btree_iter *iter, .cached_sectors = m.cached_sectors, .read_time = g->io_time[READ], .write_time = g->io_time[WRITE], + .stripe = g->stripe, + .stripe_redundancy = g->stripe_redundancy, }; } diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 412fed4..2bb107b 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -348,8 +348,7 @@ static void add_new_bucket(struct bch_fs *c, ob_push(c, ptrs, ob); } -enum bucket_alloc_ret -bch2_bucket_alloc_set(struct bch_fs *c, +int bch2_bucket_alloc_set(struct bch_fs *c, struct open_buckets *ptrs, struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, @@ -363,7 +362,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); 
struct bch_dev *ca; - enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; + int ret = -INSUFFICIENT_DEVICES; unsigned i; BUG_ON(*nr_effective >= nr_replicas); @@ -381,7 +380,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, ob = bch2_bucket_alloc(c, ca, reserve, flags & BUCKET_MAY_ALLOC_PARTIAL, cl); if (IS_ERR(ob)) { - ret = -PTR_ERR(ob); + ret = PTR_ERR(ob); if (cl) return ret; @@ -394,7 +393,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, bch2_dev_stripe_increment(ca, stripe); if (*nr_effective >= nr_replicas) - return ALLOC_SUCCESS; + return 0; } return ret; @@ -408,8 +407,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, * it's to a device we don't want: */ -static enum bucket_alloc_ret -bucket_alloc_from_stripe(struct bch_fs *c, +static int bucket_alloc_from_stripe(struct bch_fs *c, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_mask *devs_may_alloc, @@ -505,8 +503,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, wp->ptrs = ptrs_skip; } -static enum bucket_alloc_ret -open_bucket_add_buckets(struct bch_fs *c, +static int open_bucket_add_buckets(struct bch_fs *c, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_list *devs_have, @@ -522,7 +519,7 @@ open_bucket_add_buckets(struct bch_fs *c, struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; - enum bucket_alloc_ret ret; + int ret; unsigned i; rcu_read_lock(); @@ -550,8 +547,8 @@ open_bucket_add_buckets(struct bch_fs *c, target, erasure_code, nr_replicas, nr_effective, have_cache, flags, _cl); - if (ret == FREELIST_EMPTY || - ret == OPEN_BUCKETS_EMPTY) + if (ret == -FREELIST_EMPTY || + ret == -OPEN_BUCKETS_EMPTY) return ret; if (*nr_effective >= nr_replicas) return 0; @@ -575,7 +572,7 @@ retry_blocking: ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, reserve, flags, cl); - if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { + if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { cl = _cl; goto retry_blocking; } @@ -772,7 +769,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned nr_effective, write_points_nr; unsigned ob_flags = 0; bool have_cache; - enum bucket_alloc_ret ret; + int ret; int i; if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) @@ -821,7 +818,7 @@ alloc_done: if (erasure_code && !ec_open_bucket(c, &ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); - if (ret == INSUFFICIENT_DEVICES && + if (ret == -INSUFFICIENT_DEVICES && nr_effective >= nr_replicas_required) ret = 0; @@ -854,15 +851,15 @@ err: mutex_unlock(&wp->lock); - if (ret == FREELIST_EMPTY && + if (ret == -FREELIST_EMPTY && try_decrease_writepoints(c, write_points_nr)) goto retry; switch (ret) { - case OPEN_BUCKETS_EMPTY: - case FREELIST_EMPTY: + case -OPEN_BUCKETS_EMPTY: + case -FREELIST_EMPTY: return cl ? 
ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); - case INSUFFICIENT_DEVICES: + case -INSUFFICIENT_DEVICES: return ERR_PTR(-EROFS); default: BUG(); diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index c658295..2e81712 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -12,13 +12,6 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; -enum bucket_alloc_ret { - ALLOC_SUCCESS, - OPEN_BUCKETS_EMPTY, - FREELIST_EMPTY, /* Allocator thread not keeping up */ - INSUFFICIENT_DEVICES, -}; - struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; @@ -98,8 +91,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } } -enum bucket_alloc_ret -bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, +int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, unsigned, struct closure *); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index fdf3a77..0439f3e 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -200,6 +200,7 @@ #include #include "bcachefs_format.h" +#include "errcode.h" #include "fifo.h" #include "opts.h" #include "util.h" diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 091bdde..4deb87f 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -710,12 +710,15 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; const struct bch_extent_ptr *ptr; + struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; unsigned flags = BTREE_TRIGGER_GC| (initial ? BTREE_TRIGGER_NOATOMIC : 0); - char buf[200]; int ret = 0; + deleted.p = k->k->p; + if (initial) { BUG_ON(bch2_journal_seq_verify && k->k->version.lo > journal_cur_seq(&c->journal)); @@ -729,18 +732,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, k->k->version.lo, atomic64_read(&c->key_version))) atomic64_set(&c->key_version, k->k->version.lo); - - if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, - "superblock not marked as containing replicas\n" - " while marking %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - ret = bch2_mark_bkey_replicas(c, *k); - if (ret) { - bch_err(c, "error marking bkey replicas: %i", ret); - goto err; - } - } } ptrs = bch2_bkey_ptrs_c(*k); @@ -754,7 +745,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - ret = bch2_mark_key(trans, *k, flags); + ret = bch2_mark_key(trans, old, *k, flags); fsck_err: err: if (ret) @@ -1185,14 +1176,14 @@ static int bch2_gc_done(struct bch_fs *c, set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_bucket_field(_f) \ - if (dst->b[b].mark._f != src->b[b].mark._f) { \ + if (dst->b[b]._f != src->b[b]._f) { \ if (verify) \ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", dev, b, \ dst->b[b].mark.gen, \ bch2_data_types[dst->b[b].mark.data_type],\ - dst->b[b].mark._f, src->b[b].mark._f); \ - dst->b[b]._mark._f = src->b[b].mark._f; \ + dst->b[b]._f, src->b[b]._f); \ + dst->b[b]._f = src->b[b]._f; \ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) 
\ @@ -1238,11 +1229,13 @@ static int bch2_gc_done(struct bch_fs *c, size_t b; for (b = 0; b < src->nbuckets; b++) { - copy_bucket_field(gen); - copy_bucket_field(data_type); + copy_bucket_field(_mark.gen); + copy_bucket_field(_mark.data_type); + copy_bucket_field(_mark.stripe); + copy_bucket_field(_mark.dirty_sectors); + copy_bucket_field(_mark.cached_sectors); + copy_bucket_field(stripe_redundancy); copy_bucket_field(stripe); - copy_bucket_field(dirty_sectors); - copy_bucket_field(cached_sectors); dst->b[b].oldest_gen = src->b[b].oldest_gen; } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index f43044e..fc9d5ba 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -25,6 +25,15 @@ static inline void btree_path_list_remove(struct btree_trans *, struct btree_pat static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, struct btree_path *); +static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + return iter->ip_allocated; +#else + return 0; +#endif +} + static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); /* @@ -1601,14 +1610,15 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr inline struct btree_path * __must_check bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent) + struct btree_path *path, bool intent, + unsigned long ip) { if (path->ref > 1 || path->preserve) { __btree_path_put(path, intent); path = btree_path_clone(trans, path, intent); path->preserve = false; #ifdef CONFIG_BCACHEFS_DEBUG - path->ip_allocated = _RET_IP_; + path->ip_allocated = ip; #endif btree_trans_verify_sorted(trans); } @@ -1619,7 +1629,7 @@ bch2_btree_path_make_mut(struct btree_trans *trans, static struct btree_path * __must_check btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, - bool intent) + bool intent, unsigned long ip) { int cmp = bpos_cmp(new_pos, path->pos); unsigned l = path->level; @@ -1630,7 +1640,7 @@ btree_path_set_pos(struct btree_trans *trans, if (!cmp) return path; - path = bch2_btree_path_make_mut(trans, path, intent); + path = bch2_btree_path_make_mut(trans, path, intent, ip); path->pos = new_pos; path->should_be_locked = false; @@ -1806,7 +1816,7 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans, struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, enum btree_id btree_id, struct bpos pos, unsigned locks_want, unsigned level, - bool intent) + bool intent, unsigned long ip) { struct btree_path *path, *path_pos = NULL; int i; @@ -1829,7 +1839,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, path_pos->btree_id == btree_id && path_pos->level == level) { __btree_path_get(path_pos, intent); - path = btree_path_set_pos(trans, path_pos, pos, intent); + path = btree_path_set_pos(trans, path_pos, pos, intent, ip); path->preserve = true; } else { path = btree_path_alloc(trans, path_pos); @@ -1849,7 +1859,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, for (i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = BTREE_ITER_NO_NODE_INIT; #ifdef CONFIG_BCACHEFS_DEBUG - path->ip_allocated = _RET_IP_; + path->ip_allocated = ip; #endif btree_trans_verify_sorted(trans); } @@ -1927,7 +1937,8 @@ bch2_btree_iter_traverse(struct btree_iter *iter) iter->path = btree_path_set_pos(iter->trans, iter->path, btree_iter_search_key(iter), - iter->flags & 
BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); if (ret) @@ -1962,7 +1973,8 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); iter->path->should_be_locked = true; BUG_ON(iter->path->uptodate); out: @@ -2021,7 +2033,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) */ path = iter->path = btree_path_set_pos(trans, path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); path->level = iter->min_depth; @@ -2043,7 +2056,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); iter->path->should_be_locked = true; BUG_ON(iter->path->uptodate); out: @@ -2102,7 +2116,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) while (1) { iter->path = btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { @@ -2178,7 +2193,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) cmp = bpos_cmp(k.k->p, iter->path->pos); if (cmp) { iter->path = bch2_btree_path_make_mut(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); iter->path->pos = k.k->p; btree_path_check_sort(trans, iter->path, cmp); } @@ -2230,7 +2246,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) while (1) { iter->path = btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { @@ -2360,7 +2377,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) search_key = btree_iter_search_key(iter); iter->path = btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) @@ -2571,7 +2589,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned locks_want, unsigned depth, - unsigned flags) + unsigned flags, + unsigned long ip) { EBUG_ON(trans->restarted); @@ -2597,6 +2616,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, iter->k.type = KEY_TYPE_deleted; iter->k.p = pos; iter->k.size = 0; +#ifdef CONFIG_BCACHEFS_DEBUG + iter->ip_allocated = ip; +#endif iter->path = bch2_path_get(trans, flags & BTREE_ITER_CACHED, @@ -2604,7 +2626,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, iter->pos, locks_want, depth, - flags & BTREE_ITER_INTENT); + flags & BTREE_ITER_INTENT, ip); } void bch2_trans_iter_init(struct btree_trans *trans, @@ -2613,7 +2635,7 @@ void bch2_trans_iter_init(struct btree_trans *trans, unsigned flags) { __bch2_trans_iter_init(trans, iter, btree_id, pos, 
- 0, 0, flags); + 0, 0, flags, _RET_IP_); } void bch2_trans_node_iter_init(struct btree_trans *trans, @@ -2628,7 +2650,7 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, BTREE_ITER_NOT_EXTENTS| __BTREE_ITER_ALL_SNAPSHOTS| BTREE_ITER_ALL_SNAPSHOTS| - flags); + flags, _RET_IP_); BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); BUG_ON(iter->path->level != depth); BUG_ON(iter->min_depth != depth); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 33a703c..26eb90a 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -130,11 +130,13 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, (_path)->idx + 1)) struct btree_path * __must_check -bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool); +bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, - struct bpos, unsigned, unsigned, bool); + struct bpos, unsigned, unsigned, bool, + unsigned long); inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); #ifdef CONFIG_BCACHEFS_DEBUG @@ -302,13 +304,19 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, : bch2_btree_iter_peek(iter); } +static inline int btree_trans_too_many_iters(struct btree_trans *trans) +{ + return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 + ? -EINTR : 0; +} + static inline struct bkey_s_c __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, struct btree_iter *iter, unsigned flags) { struct bkey_s_c k; - while ((hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) || + while (btree_trans_too_many_iters(trans) || (k = __bch2_btree_iter_peek(iter, flags), bkey_err(k) == -EINTR)) bch2_trans_begin(trans); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 0d0a719..2c2e2f7 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -291,6 +291,9 @@ struct btree_iter { * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; +#endif }; struct btree_key_cache { diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 61c7757..dfff972 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -1590,8 +1590,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, ? 
bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_path = bch2_path_get(trans, false, path->btree_id, - sib_pos, U8_MAX, level, true); + sib_path = bch2_path_get(trans, false, path->btree_id, sib_pos, + U8_MAX, level, true, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; @@ -1888,7 +1888,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bch2_trans_copy_iter(&iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_INTENT); + iter2.flags & BTREE_ITER_INTENT, + _THIS_IP_); BUG_ON(iter2.path->level != b->c.level); BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 112ac7c..131fd4c 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -437,17 +437,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, marking = true; } - if (marking) { - percpu_down_read(&c->mark_lock); - } - - /* Must be called under mark_lock: */ - if (marking && trans->fs_usage_deltas && - !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto err; - } - /* * Don't get journal reservation until after we know insert will * succeed: @@ -456,7 +445,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); if (ret) - goto err; + return ret; } else { trans->journal_res.seq = c->journal.replay_journal_seq; } @@ -484,22 +473,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } + if (trans->fs_usage_deltas && + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) + return BTREE_INSERT_NEED_MARK_REPLICAS; + trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) bch2_mark_update(trans, i->path, i->k, i->flags); - if (marking && trans->fs_usage_deltas) - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); - if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); trans_for_each_update(trans, i) do_btree_insert_one(trans, i); -err: - if (marking) { - percpu_up_read(&c->mark_lock); - } return ret; } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 6fc93b5..4d55ef5 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -144,6 +144,7 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, unsigned journal_seq, bool gc) { + percpu_rwsem_assert_held(&c->mark_lock); BUG_ON(!gc && !journal_seq); return this_cpu_ptr(gc @@ -371,8 +372,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) journal_seq = 1; - percpu_rwsem_assert_held(&c->mark_lock); - preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); u = dev_usage_ptr(ca, journal_seq, gc); @@ -418,25 +417,48 @@ static inline int __update_replicas(struct bch_fs *c, return 0; } -static inline int update_replicas(struct bch_fs *c, +static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, struct bch_replicas_entry *r, s64 sectors, unsigned journal_seq, bool gc) { struct bch_fs_usage __percpu *fs_usage; - int idx = bch2_replicas_entry_idx(c, r); + int idx, ret = 0; + char buf[200]; - if (idx < 0) - return -1; + percpu_down_read(&c->mark_lock); + + idx = bch2_replicas_entry_idx(c, r); + if (idx < 0 && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err(c, "no replicas entry\n" + 
" while marking %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) { + percpu_up_read(&c->mark_lock); + ret = bch2_mark_replicas(c, r); + if (ret) + return ret; + + percpu_down_read(&c->mark_lock); + idx = bch2_replicas_entry_idx(c, r); + } + if (idx < 0) { + ret = -1; + goto err; + } preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; preempt_enable(); - return 0; +err: +fsck_err: + percpu_up_read(&c->mark_lock); + return ret; } static inline int update_cached_sectors(struct bch_fs *c, + struct bkey_s_c k, unsigned dev, s64 sectors, unsigned journal_seq, bool gc) { @@ -444,7 +466,7 @@ static inline int update_cached_sectors(struct bch_fs *c, bch2_replicas_entry_cached(&r.e, dev); - return update_replicas(c, &r.e, sectors, journal_seq, gc); + return update_replicas(c, k, &r.e, sectors, journal_seq, gc); } static struct replicas_delta_list * @@ -547,6 +569,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, struct bch_dev *ca; struct bucket *g; struct bucket_mark old_m, m; + int ret = 0; /* We don't do anything for deletions - do we?: */ if (!bkey_is_alloc(new.k)) @@ -573,6 +596,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, if (new.k->p.offset >= ca->mi.nbuckets) return 0; + percpu_down_read(&c->mark_lock); g = __bucket(ca, new.k->p.offset, gc); u = bch2_alloc_unpack(new); @@ -597,6 +621,7 @@ static int bch2_mark_alloc(struct btree_trans *trans, g->gen_valid = 1; g->stripe = u.stripe; g->stripe_redundancy = u.stripe_redundancy; + percpu_up_read(&c->mark_lock); /* * need to know if we're getting called from the invalidate path or @@ -605,10 +630,12 @@ static int bch2_mark_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_m.cached_sectors) { - if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, - journal_seq, gc)) { + ret = update_cached_sectors(c, new, ca->dev_idx, + -old_m.cached_sectors, + journal_seq, gc); + if (ret) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); - return -1; + return ret; } trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), @@ -779,43 +806,57 @@ static int mark_stripe_bucket(struct btree_trans *trans, const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; + s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; bool gc = flags & BTREE_TRIGGER_GC; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket *g; struct bucket_mark new, old; char buf[200]; - int ret; + int ret = 0; - if (g->stripe && g->stripe != k.k->p.offset) { + /* * XXX doesn't handle deletion */ + + percpu_down_read(&c->mark_lock); + g = PTR_BUCKET(ca, ptr, gc); + + if (g->mark.dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EINVAL; + ret = -EINVAL; + goto err; } old = bucket_cmpxchg(g, new, ({ - ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, + ret = check_bucket_ref(c, k, ptr, sectors, data_type, + new.gen, new.data_type, new.dirty_sectors, new.cached_sectors); if (ret) - return ret; + goto err; - if (parity) { - new.data_type = BCH_DATA_parity; - new.dirty_sectors = le16_to_cpu(s->sectors); - } + new.dirty_sectors += sectors; + if (data_type) + new.data_type = data_type; if (journal_seq) { new.journal_seq_valid = 1; new.journal_seq = journal_seq; } + + new.stripe = true; })); g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); +err: + percpu_up_read(&c->mark_lock); + return 0; } @@ -856,7 +897,10 @@ static int bch2_mark_pointer(struct btree_trans *trans, struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); u8 bucket_data_type; u64 v; - int ret; + int ret = 0; + + percpu_down_read(&c->mark_lock); + g = PTR_BUCKET(ca, &p.ptr, gc); v = atomic64_read(&g->_mark.v); do { @@ -869,7 +913,7 @@ static int bch2_mark_pointer(struct btree_trans *trans, &new.dirty_sectors, &new.cached_sectors); if (ret) - return ret; + goto err; new.data_type = bucket_data_type; @@ -889,11 +933,14 @@ static int bch2_mark_pointer(struct btree_trans *trans, bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); +err: + percpu_up_read(&c->mark_lock); - return 0; + return ret; } static int bch2_mark_stripe_ptr(struct btree_trans *trans, + struct bkey_s_c k, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, s64 sectors, @@ -933,7 +980,7 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, spin_unlock(&c->ec_stripes_heap_lock); r.e.data_type = data_type; - update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc); + update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc); return 0; } @@ -978,18 +1025,19 @@ static int bch2_mark_extent(struct btree_trans *trans, stale = ret > 0; if (p.ptr.cached) { - if (!stale) - if (update_cached_sectors(c, p.ptr.dev, disk_sectors, - journal_seq, gc)) { + if (!stale) { + ret = update_cached_sectors(c, k, p.ptr.dev, + disk_sectors, journal_seq, gc); + if (ret) { bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); - return -1; - + return ret; } + } } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_mark_stripe_ptr(trans, p.ec, data_type, + ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, disk_sectors, flags); if (ret) return ret; @@ -1004,12 +1052,13 @@ static int bch2_mark_extent(struct btree_trans *trans, } if (r.e.nr_devs) { - if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) { + ret = 
update_replicas(c, k, &r.e, dirty_sectors, journal_seq, gc); + if (ret) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, k); bch2_fs_fatal_error(c, "no replicas entry for %s", buf); - return -1; + return ret; } } @@ -1078,6 +1127,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, } if (gc) { + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + */ + /* * gc recalculates this field from stripe ptr * references: @@ -1091,14 +1145,15 @@ static int bch2_mark_stripe(struct btree_trans *trans, return ret; } - if (update_replicas(c, &m->r.e, - ((s64) m->sectors * m->nr_redundant), - journal_seq, gc)) { + ret = update_replicas(c, new, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + journal_seq, gc); + if (ret) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, new); bch2_fs_fatal_error(c, "no replicas entry for %s", buf); - return -1; + return ret; } } @@ -1123,11 +1178,15 @@ static int bch2_mark_inode(struct btree_trans *trans, } if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); fs_usage->nr_inodes += bkey_is_inode(new.k); fs_usage->nr_inodes -= bkey_is_inode(old.k); + preempt_enable(); + percpu_up_read(&c->mark_lock); } return 0; } @@ -1146,14 +1205,18 @@ static int bch2_mark_reservation(struct btree_trans *trans, sectors = -sectors; sectors *= replicas; + percpu_down_read(&c->mark_lock); preempt_disable(); + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(fs_usage->persistent_reserved)); fs_usage->reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; + preempt_enable(); + percpu_up_read(&c->mark_lock); return 0; } @@ -1241,10 +1304,10 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, return ret; } -static int bch2_mark_key_locked(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned flags) +int bch2_mark_key(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, + unsigned flags) { struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; @@ -1274,22 +1337,6 @@ static int bch2_mark_key_locked(struct btree_trans *trans, } } -int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey deleted = KEY(0, 0, 0); - struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - int ret; - - deleted.p = new.k->p; - - percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(trans, old, new, flags); - percpu_up_read(&c->mark_lock); - - return ret; -} - int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, struct bkey_i *new, unsigned flags) { @@ -1311,12 +1358,12 @@ int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, if (old.k->type == new->k.type && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new), + ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { - ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new), + ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key_locked(trans, old, deleted, + bch2_mark_key(trans, old, deleted, BTREE_TRIGGER_OVERWRITE|flags); } @@ -1359,21 +1406,20 @@ void fs_usage_apply_warn(struct btree_trans *trans, __WARN(); } -void bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) { struct bch_fs *c = trans->c; static int warned_disk_usage = 0; bool warn = false; unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - struct replicas_delta *d = deltas->d; + struct replicas_delta *d = deltas->d, *d2; struct replicas_delta *top = (void *) deltas->d + deltas->used; struct bch_fs_usage *dst; s64 added = 0, should_not_have_added; unsigned i; - percpu_rwsem_assert_held(&c->mark_lock); - + percpu_down_read(&c->mark_lock); preempt_disable(); dst = fs_usage_ptr(c, trans->journal_res.seq, false); @@ -1385,7 +1431,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, added += d->delta; } - BUG_ON(__update_replicas(c, dst, &d->r, d->delta)); + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; } dst->nr_inodes += deltas->nr_inodes; @@ -1420,9 +1467,19 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } preempt_enable(); + percpu_up_read(&c->mark_lock); if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); + return 0; +need_mark: + /* revert changes: */ + for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) + BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); + + preempt_enable(); + percpu_up_read(&c->mark_lock); + return -1; } /* trans_mark: */ @@ -1606,50 +1663,75 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } -static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) { struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; struct bkey_alloc_buf *a; struct btree_iter iter; struct bkey_alloc_unpacked u; - bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; int ret = 0; + if (deleting) + sectors = -sectors; + a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); if (IS_ERR(a)) return PTR_ERR(a); - if (parity) { - s64 sectors = le16_to_cpu(s.v->sectors); - - if (deleting) - sectors = -sectors; - - u.dirty_sectors += sectors; - u.data_type = u.dirty_sectors - ? BCH_DATA_parity - : 0; - } + ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, + u.gen, u.data_type, + u.dirty_sectors, u.cached_sectors); + if (ret) + goto err; if (!deleting) { - if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, - "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", + if (bch2_fs_inconsistent_on(u.stripe || + u.stripe_redundancy, c, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", iter.pos.inode, iter.pos.offset, u.gen, + bch2_data_types[u.data_type], + u.dirty_sectors, u.stripe, s.k->p.offset)) { ret = -EIO; goto err; } + if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", + iter.pos.inode, iter.pos.offset, u.gen, + bch2_data_types[u.data_type], + u.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + u.stripe = s.k->p.offset; u.stripe_redundancy = s.v->nr_redundant; } else { + if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset || + u.stripe_redundancy != s.v->nr_redundant, c, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", + iter.pos.inode, iter.pos.offset, u.gen, + s.k->p.offset, u.stripe)) { + ret = -EIO; + goto err; + } + u.stripe = 0; u.stripe_redundancy = 0; } + u.dirty_sectors += sectors; + if (data_type) + u.data_type = !deleting ? data_type : 0; + bch2_alloc_pack(c, a, u); bch2_trans_update(trans, &iter, &a->k, 0); err: @@ -1664,7 +1746,7 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, struct bkey_s_c_stripe old_s = { .k = NULL }; struct bkey_s_c_stripe new_s = { .k = NULL }; struct bch_replicas_padded r; - unsigned i; + unsigned i, nr_blocks; int ret = 0; if (old.k->type == KEY_TYPE_stripe) @@ -1682,18 +1764,17 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; + BUG_ON(new_s.k && old_s.k && + (new_s.v->nr_blocks != old_s.v->nr_blocks || + new_s.v->nr_redundant != old_s.v->nr_redundant)); + + nr_blocks = new_s.k ? 
new_s.v->nr_blocks : old_s.v->nr_blocks; + if (new_s.k) { s64 sectors = le16_to_cpu(new_s.v->sectors); bch2_bkey_to_replicas(&r.e, new); update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); - - for (i = 0; i < new_s.v->nr_blocks; i++) { - ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, - i, false); - if (ret) - return ret; - } } if (old_s.k) { @@ -1701,12 +1782,25 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, bch2_bkey_to_replicas(&r.e, old); update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); + } - for (i = 0; i < old_s.v->nr_blocks; i++) { - ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, - i, true); + for (i = 0; i < nr_blocks; i++) { + if (new_s.k && old_s.k && + !memcmp(&new_s.v->ptrs[i], + &old_s.v->ptrs[i], + sizeof(new_s.v->ptrs[i]))) + continue; + + if (new_s.k) { + ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false); if (ret) - return ret; + break; + } + + if (old_s.k) { + ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true); + if (ret) + break; } } diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 5ed9441..ac9b554 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -226,14 +226,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned); +int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_mark_update(struct btree_trans *, struct btree_path *, struct bkey_i *, unsigned); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); +int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, size_t, enum bch_data_type, unsigned); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 4dfcc95..fe4a85a 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -531,10 +531,9 @@ retry: * read_target looks up subvolumes, we can overflow paths if the * directory has many subvolumes in it */ - if (hweight64(trans.paths_allocated) > BTREE_ITER_MAX / 2) { - ret = -EINTR; + ret = btree_trans_too_many_iters(&trans); + if (ret) break; - } } bch2_trans_iter_exit(&trans, &iter); err: diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index bca1b8a..71d85c9 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -15,6 +15,7 @@ #include "io.h" #include "keylist.h" #include "recovery.h" +#include "replicas.h" #include "super-io.h" #include "util.h" @@ -1272,16 +1273,15 @@ found: return h; } -static enum bucket_alloc_ret -new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, - struct closure *cl) +static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + struct closure *cl) { struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; - enum bucket_alloc_ret ret = ALLOC_SUCCESS; + int ret = 0; for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { if (test_bit(i, h->s->blocks_gotten)) { @@ -1516,7 +1516,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, err: bch2_ec_stripe_head_put(c, h); - return ERR_PTR(-ret); + return ERR_PTR(ret); } void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) @@ -1636,13 +1636,41 @@ int 
bch2_stripes_write(struct bch_fs *c, unsigned flags) static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) { + const struct bch_stripe *s; struct bch_fs *c = trans->c; + struct stripe *m; + unsigned i; int ret = 0; - if (k.k->type == KEY_TYPE_stripe) - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: - bch2_mark_key(trans, k, - BTREE_TRIGGER_NOATOMIC); + if (k.k->type != KEY_TYPE_stripe) + return 0; + + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + return ret; + + s = bkey_s_c_to_stripe(k).v; + + m = genradix_ptr(&c->stripes[0], k.k->p.offset); + m->alive = true; + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; + + for (i = 0; i < s->nr_blocks; i++) { + m->block_sectors[i] = + stripe_blockcount_get(s, i); + m->blocks_nonempty += !!m->block_sectors[i]; + m->ptrs[i] = s->ptrs[i]; + } + + bch2_bkey_to_replicas(&m->r.e, k); + + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, k.k->p.offset); + spin_unlock(&c->ec_stripes_heap_lock); return ret; } diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h new file mode 100644 index 0000000..f7d1291 --- /dev/null +++ b/libbcachefs/errcode.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H + +enum { + /* Bucket allocator: */ + OPEN_BUCKETS_EMPTY = 2048, + FREELIST_EMPTY, /* Allocator thread not keeping up */ + INSUFFICIENT_DEVICES, +}; + +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index d3d48a5..5bcdfe3 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -223,6 +223,9 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, return; mutex_lock(&inode->ei_quota_lock); + BUG_ON((s64) inode->v.i_blocks + sectors < 0); + inode->v.i_blocks += sectors; + #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && sectors > 0) { BUG_ON(sectors > quota_res->sectors); @@ -234,7 +237,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } #endif - inode->v.i_blocks += sectors; mutex_unlock(&inode->ei_quota_lock); } @@ -243,24 +245,26 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* stored in page->private: */ struct bch_page_sector { - /* Uncompressed, fully allocated replicas: */ - unsigned nr_replicas:3; + /* Uncompressed, fully allocated replicas (or on disk reservation): */ + unsigned nr_replicas:4; - /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ - unsigned replicas_reserved:3; + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ + unsigned replicas_reserved:4; /* i_sectors: */ enum { SECTOR_UNALLOCATED, SECTOR_RESERVED, SECTOR_DIRTY, + SECTOR_DIRTY_RESERVED, SECTOR_ALLOCATED, - } state:2; + } state:8; }; struct bch_page_state { spinlock_t lock; atomic_t write_count; + bool uptodate; struct bch_page_sector s[PAGE_SECTORS]; }; @@ -311,6 +315,212 @@ static struct bch_page_state *bch2_page_state_create(struct page *page, return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); } +static unsigned bkey_to_sector_state(const struct bkey *k) +{ + if (k->type == KEY_TYPE_reservation) + return SECTOR_RESERVED; + if (bkey_extent_is_allocation(k)) + return SECTOR_ALLOCATED; + return SECTOR_UNALLOCATED; +} + +static void __bch2_page_state_set(struct page *page, + 
unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) +{ + struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); + unsigned i; + + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + spin_lock(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; + s->s[i].state = state; + } + + if (i == PAGE_SECTORS) + s->uptodate = true; + + spin_unlock(&s->lock); +} + +static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, + struct page **pages, unsigned nr_pages) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; + unsigned pg_idx = 0; + u32 snapshot; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, ret) { + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k.k); + + while (pg_idx < nr_pages) { + struct page *page = pages[pg_idx]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; + unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; + + BUG_ON(k.k->p.offset < pg_start); + BUG_ON(bkey_start_offset(k.k) > pg_end); + + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) + __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); + + if (k.k->p.offset < pg_end) + break; + pg_idx++; + } + + if (pg_idx == nr_pages) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return ret; +} + +static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 
0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k.k); + + bio_for_each_segment(bv, bio, iter) + __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, + bv.bv_len >> 9, nr_ptrs, state); +} + +static void mark_pagecache_unallocated(struct bch_inode_info *inode, + u64 start, u64 end) +{ + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct pagevec pvec; + + if (end <= start) + return; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i, j; + + nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, + &index, end_index); + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(start, pg_start) - pg_start; + unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + struct bch_page_state *s; + + BUG_ON(end <= pg_start); + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + lock_page(page); + s = bch2_page_state(page); + + if (s) { + spin_lock(&s->lock); + for (j = pg_offset; j < pg_offset + pg_len; j++) + s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } + + unlock_page(page); + } + pagevec_release(&pvec); + } while (index <= end_index); +} + +static void mark_pagecache_reserved(struct bch_inode_info *inode, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct pagevec pvec; + s64 i_sectors_delta = 0; + + if (end <= start) + return; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i, j; + + nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, + &index, end_index); + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(start, pg_start) - pg_start; + unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + struct bch_page_state *s; + + BUG_ON(end <= pg_start); + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + lock_page(page); + s = bch2_page_state(page); + + if (s) { + spin_lock(&s->lock); + for (j = pg_offset; j < pg_offset + pg_len; j++) + switch (s->s[j].state) { + case SECTOR_UNALLOCATED: + s->s[j].state = SECTOR_RESERVED; + break; + case SECTOR_DIRTY: + s->s[j].state = SECTOR_DIRTY_RESERVED; + i_sectors_delta--; + break; + default: + break; + } + spin_unlock(&s->lock); + } + + unlock_page(page); + } + pagevec_release(&pvec); + } while (index <= end_index); + + i_sectors_acct(c, inode, NULL, i_sectors_delta); +} + static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) { /* XXX: this should not be open coded */ @@ -395,6 +605,8 @@ static int bch2_page_reservation_get(struct bch_fs *c, if (!s) return -ENOMEM; + BUG_ON(!s->uptodate); + for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { @@ -449,16 +661,22 @@ static void bch2_clear_page_bits(struct page *page) disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; - if (s->s[i].state == SECTOR_DIRTY) { - dirty_sectors++; + switch (s->s[i].state) { + case SECTOR_DIRTY: s->s[i].state = SECTOR_UNALLOCATED; + --dirty_sectors; + break; + case SECTOR_DIRTY_RESERVED: + s->s[i].state = SECTOR_RESERVED; + break; + default: + break; } } 
bch2_disk_reservation_put(c, &disk_res); - if (dirty_sectors) - i_sectors_acct(c, inode, NULL, -dirty_sectors); + i_sectors_acct(c, inode, NULL, dirty_sectors); bch2_page_state_release(page); } @@ -491,16 +709,22 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; - if (s->s[i].state == SECTOR_UNALLOCATED) + switch (s->s[i].state) { + case SECTOR_UNALLOCATED: + s->s[i].state = SECTOR_DIRTY; dirty_sectors++; - - s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); + break; + case SECTOR_RESERVED: + s->s[i].state = SECTOR_DIRTY_RESERVED; + break; + default: + break; + } } spin_unlock(&s->lock); - if (dirty_sectors) - i_sectors_acct(c, inode, &res->quota, dirty_sectors); + i_sectors_acct(c, inode, &res->quota, dirty_sectors); if (!PageDirty(page)) __set_page_dirty_nobuffers(page); @@ -554,7 +778,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct bch2_page_reservation res; unsigned len; loff_t isize; - int ret = VM_FAULT_LOCKED; + int ret; bch2_page_reservation_init(c, inode, &res); @@ -580,6 +804,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + } + if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { unlock_page(page); ret = VM_FAULT_SIGBUS; @@ -590,6 +822,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) bch2_page_reservation_put(c, inode, &res); wait_for_stable_page(page); + ret = VM_FAULT_LOCKED; out: bch2_pagecache_add_put(&inode->ei_pagecache_lock); sb_end_pagefault(inode->v.i_sb); @@ -703,29 +936,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter) return iter->pages[iter->idx]; } -static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct bio_vec bv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = k.k->type == KEY_TYPE_reservation - ? 
SECTOR_RESERVED - : SECTOR_ALLOCATED; - - bio_for_each_segment(bv, bio, iter) { - struct bch_page_state *s = bch2_page_state(bv.bv_page); - unsigned i; - - for (i = bv.bv_offset >> 9; - i < (bv.bv_offset + bv.bv_len) >> 9; - i++) { - s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = state; - } - } -} - static bool extent_partial_reads_expensive(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -745,7 +955,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, { while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { - pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; + pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; struct page *page = readpage_iter_next(iter); int ret; @@ -864,8 +1074,7 @@ retry: if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - if (bkey_extent_is_allocation(k.k)) - bch2_add_page_sectors(&rbio->bio, k); + bch2_bio_page_state_set(&rbio->bio, k); bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); @@ -875,6 +1084,10 @@ retry: swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; } err: bch2_trans_iter_exit(trans, &iter); @@ -922,7 +1135,7 @@ void bch2_readahead(struct readahead_control *ractl) readpages_iter.idx++; bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); - rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; + rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); @@ -945,7 +1158,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); rbio->bio.bi_iter.bi_sector = - (sector_t) page->index << PAGE_SECTOR_SHIFT; + (sector_t) page->index << PAGE_SECTORS_SHIFT; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); @@ -1232,7 +1445,7 @@ do_io: } BUG_ON(!sectors); - sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; + sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || @@ -1349,6 +1562,12 @@ readpage: if (ret) goto err; out: + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); + if (ret) + goto out; + } + ret = bch2_page_reservation_get(c, inode, page, res, offset, len, true); if (ret) { @@ -1478,20 +1697,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } while (reserved < len) { - struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; + unsigned i = (offset + reserved) >> PAGE_SHIFT; + struct page *page = pages[i]; unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); unsigned pg_len = min_t(unsigned, len - reserved, PAGE_SIZE - pg_offset); -retry_reservation: - ret = bch2_page_reservation_get(c, inode, page, &res, - pg_offset, pg_len, true); - if (ret && !PageUptodate(page)) { - ret = bch2_read_single_page(page, mapping); - if (!ret) - goto retry_reservation; + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), + pages + i, nr_pages - i); + if (ret) + goto out; } + ret = bch2_page_reservation_get(c, inode, page, &res, + pg_offset, pg_len, true); if (ret) goto out; @@ -2245,6 +2465,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, unsigned 
end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; unsigned i; struct page *page; + s64 i_sectors_delta = 0; int ret = 0; /* Page boundary? Nothing to do */ @@ -2263,8 +2484,8 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * page */ ret = range_has_data(c, inode->ei_subvol, - POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), - POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); + POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT), + POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT)); if (ret <= 0) return ret; @@ -2296,9 +2517,13 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, i < round_down(end_offset, block_bytes(c)) >> 9; i++) { s->s[i].nr_replicas = 0; + if (s->s[i].state == SECTOR_DIRTY) + i_sectors_delta--; s->s[i].state = SECTOR_UNALLOCATED; } + i_sectors_acct(c, inode, NULL, i_sectors_delta); + /* * Caller needs to know whether this page will be written out by * writeback - doing an i_size update if necessary - or whether it will @@ -2480,6 +2705,8 @@ int bch2_truncate(struct user_namespace *mnt_userns, U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); + BUG_ON(!inode->v.i_size && inode->v.i_blocks); + if (unlikely(ret)) goto err; @@ -2810,6 +3037,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, &reservation.k_i, &disk_res, NULL, 0, &i_sectors_delta, true); + if (ret) + goto bkey_err; i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); @@ -2818,6 +3047,9 @@ bkey_err: ret = 0; } + bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ + mark_pagecache_reserved(inode, start_sector, iter.pos.offset); + if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; @@ -2923,43 +3155,6 @@ long bch2_fallocate_dispatch(struct file *file, int mode, return ret; } -static void mark_range_unallocated(struct bch_inode_info *inode, - loff_t start, loff_t end) -{ - pgoff_t index = start >> PAGE_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SHIFT; - struct pagevec pvec; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i, j; - - nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, - &index, end_index); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct bch_page_state *s; - - lock_page(page); - s = bch2_page_state(page); - - if (s) { - spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - unlock_page(page); - } - pagevec_release(&pvec); - } while (index <= end_index); -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3005,7 +3200,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret) goto err; - mark_range_unallocated(src, pos_src, pos_src + aligned_len); + mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); ret = bch2_remap_range(c, inode_inum(dst), pos_dst >> 9, diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index fc29e6c..7eb33da 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -38,7 +38,8 @@ static struct kmem_cache *bch2_inode_cache; static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, - struct bch_inode_unpacked *); + struct bch_inode_unpacked *, + struct bch_subvolume *); static void __pagecache_lock_put(struct pagecache_lock *lock, long i) 
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index fc29e6c..7eb33da 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -38,7 +38,8 @@ static struct kmem_cache *bch2_inode_cache;
 
 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
 				struct bch_inode_info *,
-				struct bch_inode_unpacked *);
+				struct bch_inode_unpacked *,
+				struct bch_subvolume *);
 
 static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
 {
@@ -224,6 +225,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 	struct bch_inode_unpacked inode_u;
 	struct bch_inode_info *inode;
 	struct btree_trans trans;
+	struct bch_subvolume subvol;
 	int ret;
 
 	inode = to_bch_ei(iget5_locked(c->vfs_sb,
@@ -238,10 +240,11 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 	bch2_trans_init(&trans, c, 8, 0);
 	ret = lockrestart_do(&trans,
+		bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
 		bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
 
 	if (!ret)
-		bch2_vfs_inode_init(&trans, inum, inode, &inode_u);
+		bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
 	bch2_trans_exit(&trans);
 
 	if (ret) {
@@ -267,6 +270,7 @@ __bch2_create(struct user_namespace *mnt_userns,
 	struct bch_inode_unpacked inode_u;
 	struct posix_acl *default_acl = NULL, *acl = NULL;
 	subvol_inum inum;
+	struct bch_subvolume subvol;
 	u64 journal_seq = 0;
 	int ret;
 
@@ -309,7 +313,12 @@ retry:
 	if (unlikely(ret))
 		goto err_before_quota;
 
-	ret   = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
+	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+	inum.inum = inode_u.bi_inum;
+
+	ret   = bch2_subvolume_get(&trans, inum.subvol, true,
+				   BTREE_ITER_WITH_UPDATES, &subvol) ?:
+		bch2_trans_commit(&trans, NULL, &journal_seq, 0);
 	if (unlikely(ret)) {
 		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
 				KEY_TYPE_QUOTA_WARN);
@@ -325,11 +334,8 @@ err_before_quota:
 		mutex_unlock(&dir->ei_update_lock);
 	}
 
-	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
-	inum.inum = inode_u.bi_inum;
-
 	bch2_iget5_set(&inode->v, &inum);
-	bch2_vfs_inode_init(&trans, inum, inode, &inode_u);
+	bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
 
 	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
 	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -1350,10 +1356,16 @@ static const struct export_operations bch_export_ops = {
 
 static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
 				struct bch_inode_info *inode,
-				struct bch_inode_unpacked *bi)
+				struct bch_inode_unpacked *bi,
+				struct bch_subvolume *subvol)
 {
 	bch2_inode_update_after_write(trans, inode, bi, ~0);
 
+	if (BCH_SUBVOLUME_SNAP(subvol))
+		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+	else
+		clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+
 	inode->v.i_blocks	= bi->bi_sectors;
 	inode->v.i_ino		= bi->bi_inum;
 	inode->v.i_rdev		= bi->bi_dev;
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 27aacd7..b2211ec 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -64,6 +64,12 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
  */
 #define EI_INODE_ERROR			0
 
+/*
+ * Set if the inode is in a snapshot subvolume - we don't do quota accounting
+ * in those:
+ */
+#define EI_INODE_SNAPSHOT		1
+
 #define to_bch_ei(_inode)					\
 	container_of_or_null(_inode, struct bch_inode_info, v)
 
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 3a6b444..5a3c9ef 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -2323,6 +2323,10 @@ retry:
 
 		swap(bvec_iter.bi_size, bytes);
 		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+		ret = btree_trans_too_many_iters(&trans);
+		if (ret)
+			break;
 	}
 err:
 	bch2_trans_iter_exit(&trans, &iter);
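The new EI_INODE_SNAPSHOT flag is only set and cleared in this patch; per the fs.h comment, inodes in snapshot subvolumes are exempt from quota accounting. A minimal sketch of how a quota-accounting call site could consult it - the wrapper name bch2_quota_acct_checked() is hypothetical, while everything it calls (test_bit(), bch_qid(), bch2_quota_acct()) is real:

static int bch2_quota_acct_checked(struct bch_fs *c,
				   struct bch_inode_info *inode,
				   enum quota_counters counter, s64 v,
				   enum quota_acct_mode mode)
{
	/* Inodes in snapshot subvolumes aren't quota-accounted: */
	if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
		return 0;

	return bch2_quota_acct(c, bch_qid(&inode->ei_inode), counter, v, mode);
}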
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 5d9c00a..afb1bb2 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -223,19 +223,19 @@ enum opt_type {
 	  BCH_SB_POSIX_ACL,		true,				\
 	  NULL,		"Enable POSIX acls")				\
 	x(usrquota,			u8,				\
-	  0,								\
+	  OPT_FORMAT|OPT_MOUNT,						\
 	  OPT_BOOL(),							\
-	  NO_SB_OPT,			false,				\
+	  BCH_SB_USRQUOTA,		false,				\
 	  NULL,		"Enable user quotas")				\
 	x(grpquota,			u8,				\
-	  0,								\
+	  OPT_FORMAT|OPT_MOUNT,						\
 	  OPT_BOOL(),							\
-	  NO_SB_OPT,			false,				\
+	  BCH_SB_GRPQUOTA,		false,				\
 	  NULL,		"Enable group quotas")				\
 	x(prjquota,			u8,				\
-	  0,								\
+	  OPT_FORMAT|OPT_MOUNT,						\
 	  OPT_BOOL(),							\
-	  NO_SB_OPT,			false,				\
+	  BCH_SB_PRJQUOTA,		false,				\
 	  NULL,		"Enable project quotas")			\
 	x(degraded,			u8,				\
 	  OPT_MOUNT,							\
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index 5f1216d..8f8f4b0 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -3,6 +3,7 @@
 #include "btree_update.h"
 #include "inode.h"
 #include "quota.h"
+#include "subvolume.h"
 #include "super-io.h"
 
 static const char *bch2_sb_validate_quota(struct bch_sb *sb,
@@ -415,14 +416,55 @@ static void bch2_sb_quota_read(struct bch_fs *c)
 	}
 }
 
+static int bch2_fs_quota_read_inode(struct btree_trans *trans,
+				    struct btree_iter *iter)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked u;
+	struct bch_subvolume subvolume;
+	struct bkey_s_c k;
+	int ret;
+
+	k = bch2_btree_iter_peek(iter);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (!k.k)
+		return 1;
+
+	ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume);
+	if (ret)
+		return ret;
+
+	/*
+	 * We don't do quota accounting in snapshots:
+	 */
+	if (BCH_SUBVOLUME_SNAP(&subvolume))
+		goto advance;
+
+	if (!bkey_is_inode(k.k))
+		goto advance;
+
+	ret = bch2_inode_unpack(k, &u);
+	if (ret)
+		return ret;
+
+	bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+			KEY_TYPE_QUOTA_NOCHECK);
+	bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+			KEY_TYPE_QUOTA_NOCHECK);
+advance:
+	bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1));
+	return 0;
+}
+
 int bch2_fs_quota_read(struct bch_fs *c)
 {
 	unsigned i, qtypes = enabled_qtypes(c);
 	struct bch_memquota_type *q;
 	struct btree_trans trans;
 	struct btree_iter iter;
-	struct bch_inode_unpacked u;
-	struct bkey_s_c k;
 	int ret;
 
 	mutex_lock(&c->sb_lock);
@@ -437,23 +479,18 @@ int bch2_fs_quota_read(struct bch_fs *c)
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
-			   BTREE_ITER_PREFETCH, k, ret) {
-		if (bkey_is_inode(k.k)) {
-			ret = bch2_inode_unpack(k, &u);
-			if (ret)
-				return ret;
-
-			bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
-					KEY_TYPE_QUOTA_NOCHECK);
-			bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
-					KEY_TYPE_QUOTA_NOCHECK);
-		}
-	}
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
+			     BTREE_ITER_INTENT|
+			     BTREE_ITER_PREFETCH|
+			     BTREE_ITER_ALL_SNAPSHOTS);
+	do {
+		ret = lockrestart_do(&trans,
+				     bch2_fs_quota_read_inode(&trans, &iter));
+	} while (!ret);
 	bch2_trans_iter_exit(&trans, &iter);
 
 	bch2_trans_exit(&trans);
-	return ret;
+
+	return ret < 0 ? ret : 0;
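The rewritten bch2_fs_quota_read() above is an instance of a restartable-scan pattern: the per-key helper returns 0 to continue, a positive value for clean end-of-btree, or a negative error, and lockrestart_do() re-runs it whenever the btree transaction is restarted. The generic shape, with visit_one() standing in (hypothetically) for bch2_fs_quota_read_inode():

static int scan_btree(struct btree_trans *trans, struct btree_iter *iter)
{
	int ret;

	/*
	 * visit_one() must advance the iterator itself and return
	 * 0 = continue, >0 = done, <0 = error:
	 */
	do {
		ret = lockrestart_do(trans, visit_one(trans, iter));
	} while (!ret);

	return ret < 0 ? ret : 0;	/* positive just means "finished" */
}

Note that the helper, not the loop, advances the iterator (via bch2_btree_iter_set_pos()), so a transaction restart retries the same position instead of skipping a key.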
 }
 
 /* Enable/disable/delete quotas for an entire filesystem: */
 
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 8dcac78..c8d6d73 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -184,7 +184,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 
 	r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
 
-	ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
+	ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 err:
 	c->reflink_hint = reflink_iter.pos.offset;
 	bch2_trans_iter_exit(trans, &reflink_iter);
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 0020065..6c5ea78 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -427,61 +427,8 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
 	return __bch2_mark_replicas(c, r, false);
 }
 
-static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
-				     bool check)
-{
-	struct bch_replicas_padded search;
-	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-	unsigned i;
-	int ret;
-
-	memset(&search, 0, sizeof(search));
-
-	for (i = 0; i < cached.nr; i++) {
-		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
-
-		ret = __bch2_mark_replicas(c, &search.e, check);
-		if (ret)
-			return ret;
-	}
-
-	bch2_bkey_to_replicas(&search.e, k);
-
-	ret = __bch2_mark_replicas(c, &search.e, check);
-	if (ret)
-		return ret;
-
-	if (search.e.data_type == BCH_DATA_parity) {
-		search.e.data_type = BCH_DATA_cached;
-		ret = __bch2_mark_replicas(c, &search.e, check);
-		if (ret)
-			return ret;
-
-		search.e.data_type = BCH_DATA_user;
-		ret = __bch2_mark_replicas(c, &search.e, check);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
 /* replicas delta list: */
 
-bool bch2_replicas_delta_list_marked(struct bch_fs *c,
-				     struct replicas_delta_list *r)
-{
-	struct replicas_delta *d = r->d;
-	struct replicas_delta *top = (void *) r->d + r->used;
-
-	percpu_rwsem_assert_held(&c->mark_lock);
-
-	for (d = r->d; d != top; d = replicas_delta_next(d))
-		if (bch2_replicas_entry_idx(c, &d->r) < 0)
-			return false;
-	return true;
-}
-
 int bch2_replicas_delta_list_mark(struct bch_fs *c,
 				  struct replicas_delta_list *r)
 {
@@ -494,19 +441,6 @@ int bch2_replicas_delta_list_mark(struct bch_fs *c,
 	return ret;
 }
 
-/* bkey replicas: */
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
-			       struct bkey_s_c k)
-{
-	return __bch2_mark_bkey_replicas(c, k, true) == 0;
-}
-
-int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
-{
-	return __bch2_mark_bkey_replicas(c, k, false);
-}
-
 /*
  * Old replicas_gc mechanism: only used for journal replicas entries now, should
  * die at some point:
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index 72ac544..d237d7c 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -48,12 +48,9 @@ replicas_delta_next(struct replicas_delta *d)
 	return (void *) d + replicas_entry_bytes(&d->r) + 8;
 }
 
-bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *);
 int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
 
 void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
-bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c);
-int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 
 static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
 					      unsigned dev)
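For context on the replicas.c deletions: entries in a replicas_delta_list are variable length and packed back to back for r->used bytes, which is why traversal uses replicas_delta_next() (kept in replicas.h above) rather than array indexing. The walk the removed bch2_replicas_delta_list_marked() performed, kept here as a sketch of the idiom (the name delta_list_all_known() is ours):

static bool delta_list_all_known(struct bch_fs *c,
				 struct replicas_delta_list *r)
{
	struct replicas_delta *d;
	struct replicas_delta *top = (void *) r->d + r->used;

	/* Entries are variable length: step by each entry's own size: */
	for (d = r->d; d != top; d = replicas_delta_next(d))
		if (bch2_replicas_entry_idx(c, &d->r) < 0)
			return false;	/* not yet in the superblock table */

	return true;
}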
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index 0ef625d..7e909a1 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -789,6 +789,15 @@ int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
 	return ret;
 }
 
+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
+			     struct bch_subvolume *subvol)
+{
+	struct bch_snapshot snap;
+
+	return snapshot_lookup(trans, snapshot, &snap) ?:
+		bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+}
+
 int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
 				u32 *snapid)
 {
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index dde755b..e4c3fdc 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -118,6 +118,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c)
 
 int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int,
 		       struct bch_subvolume *);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+			     struct bch_subvolume *);
 int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
 
 int bch2_subvolume_delete(struct btree_trans *, u32);
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index bec84d8..80402b3 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -18,8 +18,6 @@
 #include
 #include
 
-#define PAGE_SECTOR_SHIFT	(PAGE_SHIFT - 9)
-
 struct closure;
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-- 
2.39.2
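One idiom worth spelling out from the subvolume.c hunk: bch2_snapshot_get_subvol() chains its two lookups with the GNU C "a ?: b" extension, so the subvolume lookup only runs if the snapshot lookup returned 0. The equivalent long form (the function name here is ours; the calls match the patch):

int snapshot_to_subvol_expanded(struct btree_trans *trans, u32 snapshot,
				struct bch_subvolume *subvol)
{
	struct bch_snapshot snap;
	int ret;

	ret = snapshot_lookup(trans, snapshot, &snap);
	if (ret)
		return ret;

	/* snap.subvol is stored little-endian on disk: */
	return bch2_subvolume_get(trans, le32_to_cpu(snap.subvol),
				  true, 0, subvol);
}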