From: Kent Overstreet
Date: Mon, 27 Jun 2022 19:58:48 +0000 (-0400)
Subject: Update bcachefs sources to 95ff72a6c1 fixup! mm: Centralize & improve oom reporting...
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=bad0c8c50758b4447d529f61017c1a8c85976a3e;p=bcachefs-tools-debian

Update bcachefs sources to 95ff72a6c1 fixup! mm: Centralize & improve oom reporting in show_mem.c
---

diff --git a/.bcachefs_revision b/.bcachefs_revision
index b5a8d6b..ac80285 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-2f4e24d85692600a698d78938a213f27593bda25
+95ff72a6c1291f6838e5cfa81a7426aaff482cde

diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index b96b257..66ad356 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -534,24 +534,27 @@ TRACE_EVENT(discard_buckets,
 );
 
 TRACE_EVENT(invalidate_bucket,
-	TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket),
-	TP_ARGS(c, dev, bucket),
+	TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
+	TP_ARGS(c, dev, bucket, sectors),
 
 	TP_STRUCT__entry(
 		__field(dev_t,	dev	)
 		__field(u32,	dev_idx	)
+		__field(u32,	sectors	)
 		__field(u64,	bucket	)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= c->dev;
 		__entry->dev_idx	= dev;
+		__entry->sectors	= sectors;
 		__entry->bucket		= bucket;
 	),
 
-	TP_printk("%d:%d invalidated %u:%llu",
+	TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->dev_idx, __entry->bucket)
+		  __entry->dev_idx, __entry->bucket,
+		  __entry->sectors)
 );
 
 /* Moving IO */

diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 359cb23..7385671 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -685,21 +685,23 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
 }
 
 static int bch2_check_alloc_key(struct btree_trans *trans,
-				struct btree_iter *alloc_iter)
+				struct btree_iter *alloc_iter,
+				struct btree_iter *discard_iter,
+				struct btree_iter *freespace_iter)
 {
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca;
-	struct btree_iter discard_iter, freespace_iter;
 	struct bch_alloc_v4 a;
 	unsigned discard_key_type, freespace_key_type;
 	struct bkey_s_c alloc_k, k;
 	struct printbuf buf = PRINTBUF;
-	struct printbuf buf2 = PRINTBUF;
 	int ret;
 
-	alloc_k = bch2_btree_iter_peek(alloc_iter);
+	alloc_k = bch2_dev_bucket_exists(c, alloc_iter->pos)
+		? bch2_btree_iter_peek_slot(alloc_iter)
+		: bch2_btree_iter_peek(alloc_iter);
 	if (!alloc_k.k)
-		return 0;
+		return 1;
 
 	ret = bkey_err(alloc_k);
 	if (ret)
@@ -721,12 +723,10 @@ static int bch2_check_alloc_key(struct btree_trans *trans,
 	freespace_key_type = a.data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
 
-	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard,
-			     alloc_k.k->p, 0);
-	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace,
-			     alloc_freespace_pos(alloc_k.k->p, a), 0);
+	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
+	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, a));
 
-	k = bch2_btree_iter_peek_slot(&discard_iter);
+	k = bch2_btree_iter_peek_slot(discard_iter);
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
@@ -746,14 +746,14 @@ static int bch2_check_alloc_key(struct btree_trans *trans,
 
 		bkey_init(&update->k);
 		update->k.type	= discard_key_type;
-		update->k.p	= discard_iter.pos;
+		update->k.p	= discard_iter->pos;
 
-		ret = bch2_trans_update(trans, &discard_iter, update, 0);
+		ret = bch2_trans_update(trans, discard_iter, update, 0);
 		if (ret)
 			goto err;
 	}
 
-	k = bch2_btree_iter_peek_slot(&freespace_iter);
+	k = bch2_btree_iter_peek_slot(freespace_iter);
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
@@ -774,18 +774,15 @@ static int bch2_check_alloc_key(struct btree_trans *trans,
 
 		bkey_init(&update->k);
 		update->k.type	= freespace_key_type;
-		update->k.p	= freespace_iter.pos;
+		update->k.p	= freespace_iter->pos;
 		bch2_key_resize(&update->k, 1);
 
-		ret = bch2_trans_update(trans, &freespace_iter, update, 0);
+		ret = bch2_trans_update(trans, freespace_iter, update, 0);
 		if (ret)
 			goto err;
 	}
 err:
 fsck_err:
-	bch2_trans_iter_exit(trans, &freespace_iter);
-	bch2_trans_iter_exit(trans, &discard_iter);
-	printbuf_exit(&buf2);
 	printbuf_exit(&buf);
 	return ret;
 }
@@ -855,48 +852,64 @@ delete:
 int bch2_check_alloc_info(struct bch_fs *c)
 {
 	struct btree_trans trans;
-	struct btree_iter iter;
-	struct bkey_s_c k;
+	struct btree_iter iter, discard_iter, freespace_iter;
 	int ret = 0;
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
-			   BTREE_ITER_PREFETCH, k, ret) {
-		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-			bch2_check_alloc_key(&trans, &iter));
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+			     BTREE_ITER_PREFETCH);
+	bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+			     BTREE_ITER_PREFETCH);
+	bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+			     BTREE_ITER_PREFETCH);
+	while (1) {
+		ret = __bch2_trans_do(&trans, NULL, NULL,
+				      BTREE_INSERT_NOFAIL|
+				      BTREE_INSERT_LAZY_RW,
+			bch2_check_alloc_key(&trans, &iter,
+					     &discard_iter,
+					     &freespace_iter));
 		if (ret)
 			break;
+
+		bch2_btree_iter_advance(&iter);
 	}
+	bch2_trans_iter_exit(&trans, &freespace_iter);
+	bch2_trans_iter_exit(&trans, &discard_iter);
 	bch2_trans_iter_exit(&trans, &iter);
 
-	if (ret)
+	if (ret < 0)
 		goto err;
 
 	bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN,
 			     BTREE_ITER_PREFETCH);
 	while (1) {
-		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+		ret = __bch2_trans_do(&trans, NULL, NULL,
+				      BTREE_INSERT_NOFAIL|
+				      BTREE_INSERT_LAZY_RW,
			bch2_check_discard_freespace_key(&trans, &iter));
 		if (ret)
 			break;
 
-		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+		bch2_btree_iter_advance(&iter);
 	}
 	bch2_trans_iter_exit(&trans, &iter);
 
-	if (ret)
+	if (ret < 0)
 		goto err;
 
 	bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN,
 			     BTREE_ITER_PREFETCH);
 	while (1) {
-		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+		ret = __bch2_trans_do(&trans, NULL, NULL,
+				      BTREE_INSERT_NOFAIL|
+				      BTREE_INSERT_LAZY_RW,
			bch2_check_discard_freespace_key(&trans, &iter));
 		if (ret)
 			break;
 
-		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+		bch2_btree_iter_advance(&iter);
 	}
 	bch2_trans_iter_exit(&trans, &iter);
err:
@@ -1151,12 +1164,13 @@ static void bch2_do_discards_work(struct work_struct *work)
 
 void bch2_do_discards(struct bch_fs *c)
 {
-	if (percpu_ref_tryget(&c->writes) &&
+	if (percpu_ref_tryget_live(&c->writes) &&
 	    !queue_work(system_long_wq, &c->discard_work))
 		percpu_ref_put(&c->writes);
 }
 
-static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
+static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca,
+				 struct bpos *bucket_pos, unsigned *cached_sectors)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter lru_iter, alloc_iter = { NULL };
@@ -1174,8 +1188,10 @@ next_lru:
 	if (ret)
 		goto out;
 
-	if (!k.k || k.k->p.inode != ca->dev_idx)
+	if (!k.k || k.k->p.inode != ca->dev_idx) {
+		ret = 1;
 		goto out;
+	}
 
 	if (k.k->type != KEY_TYPE_lru) {
 		prt_printf(&buf, "non lru key in lru btree:\n  ");
@@ -1195,8 +1211,9 @@ next_lru:
 	idx	= k.k->p.offset;
 	bucket	= le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
 
-	a = bch2_trans_start_alloc_update(trans, &alloc_iter,
-					  POS(ca->dev_idx, bucket));
+	*bucket_pos = POS(ca->dev_idx, bucket);
+
+	a = bch2_trans_start_alloc_update(trans, &alloc_iter, *bucket_pos);
 	ret = PTR_ERR_OR_ZERO(a);
 	if (ret)
 		goto out;
@@ -1218,6 +1235,11 @@ next_lru:
 		}
 	}
 
+	if (!a->v.cached_sectors)
+		bch_err(c, "invalidating empty bucket, confused");
+
+	*cached_sectors = a->v.cached_sectors;
+
 	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
 	a->v.gen++;
 	a->v.data_type		= 0;
@@ -1230,8 +1252,6 @@ next_lru:
 			     BTREE_TRIGGER_BUCKET_INVALIDATE);
 	if (ret)
 		goto out;
-
-	trace_invalidate_bucket(c, a->k.p.inode, a->k.p.offset);
out:
 	bch2_trans_iter_exit(trans, &alloc_iter);
 	bch2_trans_iter_exit(trans, &lru_iter);
@@ -1244,7 +1264,8 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
 	struct bch_dev *ca;
 	struct btree_trans trans;
-	unsigned i;
+	struct bpos bucket;
+	unsigned i, sectors;
 	int ret = 0;
 
 	bch2_trans_init(&trans, c, 0, 0);
@@ -1257,10 +1278,12 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 			ret = __bch2_trans_do(&trans, NULL, NULL,
 					      BTREE_INSERT_USE_RESERVE|
 					      BTREE_INSERT_NOFAIL,
-					      invalidate_one_bucket(&trans, ca));
+					      invalidate_one_bucket(&trans, ca, &bucket,
+								    &sectors));
 			if (ret)
 				break;
 
+			trace_invalidate_bucket(c, bucket.inode, bucket.offset, sectors);
 			this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]);
 		}
 	}
@@ -1271,8 +1294,9 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 
 void bch2_do_invalidates(struct bch_fs *c)
 {
-	if (percpu_ref_tryget(&c->writes))
-		queue_work(system_long_wq, &c->invalidate_work);
+	if (percpu_ref_tryget_live(&c->writes) &&
+	    !queue_work(system_long_wq, &c->invalidate_work))
+		percpu_ref_put(&c->writes);
 }
 
 static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter)

diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 1f0484a..8b4d0eb 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -826,6 +826,8 @@ struct bch_fs {
 	copygc_heap		copygc_heap;
 	struct write_point	copygc_write_point;
 	s64			copygc_wait;
+	bool			copygc_running;
+	wait_queue_head_t	copygc_running_wq;
 
 	/* DATA PROGRESS STATS */
 	struct list_head	data_progress_list;
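A recurring change in this patch: percpu_ref_tryget(&c->writes) becomes percpu_ref_tryget_live(), and bch2_do_invalidates() now drops its ref when queue_work() reports the work was already queued, matching bch2_do_discards(). The distinction is that tryget() still succeeds while the filesystem is going read-only as long as refs remain, whereas tryget_live() refuses once the ref has been killed. A minimal userspace sketch of that semantic, using a plain atomic count plus a dying flag to stand in for percpu_ref internals:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_ref {
	atomic_long	count;
	atomic_bool	dying;
};

static bool fake_tryget(struct fake_ref *ref)
{
	/* succeeds while any refs remain, even after shutdown has begun */
	long v = atomic_load(&ref->count);

	while (v > 0)
		if (atomic_compare_exchange_weak(&ref->count, &v, v + 1))
			return true;
	return false;
}

static bool fake_tryget_live(struct fake_ref *ref)
{
	/* additionally refuses once the ref has been killed */
	return !atomic_load(&ref->dying) && fake_tryget(ref);
}

int main(void)
{
	struct fake_ref writes;

	atomic_init(&writes.count, 1);
	atomic_init(&writes.dying, true);	/* fs going read-only */

	printf("tryget:      %d\n", fake_tryget(&writes));	/* 1: sneaks in */
	printf("tryget_live: %d\n", fake_tryget_live(&writes));	/* 0: refused */
	return 0;
}

With tryget_live(), new background work (discards, invalidates, stripe creation, promotes) can no longer start once read-only shutdown has begun, which is presumably why the BCH_WRITE_FROM_INTERNAL/percpu_ref_is_dying() check in io.c below becomes unnecessary.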
prt_printf(err, "invalid key type for this btree (%s)", - bch2_bkey_types[type]); + prt_printf(err, "invalid key type for btree %s (%s)", + bch2_btree_ids[type], bch2_bkey_types[type]); return -EINVAL; } diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 0bc7896..0e2c874 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -395,6 +395,7 @@ again: bch2_btree_node_evict(c, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); + cur = NULL; if (ret) break; continue; @@ -413,6 +414,7 @@ again: bch2_btree_node_evict(c, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); + cur = NULL; if (ret) break; continue; @@ -849,10 +851,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct btree_iter iter; struct btree *b; - unsigned depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 - : 0; + unsigned depth = metadata_only ? 1 : 0; int ret = 0; gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -995,10 +994,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree *b; - unsigned target_depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 - : 0; + unsigned target_depth = metadata_only ? 1 : 0; struct printbuf buf = PRINTBUF; int ret = 0; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 5c5e14d..a1512eb 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -3273,11 +3273,14 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn) __acquires(&c->btree_trans_barrier) { + struct btree_trans *pos; + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn; + trans->task = current; bch2_trans_alloc_paths(trans, c); @@ -3293,9 +3296,15 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->pid = current->pid; mutex_lock(&c->btree_trans_lock); - list_add(&trans->list, &c->btree_trans_list); + list_for_each_entry(pos, &c->btree_trans_list, list) { + if (trans->task->pid < pos->task->pid) { + list_add_tail(&trans->list, &pos->list); + goto list_add_done; + } + } + list_add_tail(&trans->list, &c->btree_trans_list); +list_add_done: mutex_unlock(&c->btree_trans_lock); } @@ -3383,73 +3392,55 @@ bch2_btree_path_node_to_text(struct printbuf *out, bch2_bpos_to_text(out, btree_node_pos(_b, cached)); } -static bool trans_has_locks(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - if (path->nodes_locked) - return true; - return false; -} - -void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) { - struct btree_trans *trans; struct btree_path *path; struct btree *b; static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; - mutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - if (!trans_has_locks(trans)) - continue; - - prt_printf(out, "%i %s\n", trans->pid, trans->fn); + prt_printf(out, "%i %s\n", trans->task->pid, trans->fn); - trans_for_each_path(trans, path) { - if (!path->nodes_locked) - continue; + trans_for_each_path(trans, path) { + if (!path->nodes_locked) + continue; - prt_printf(out, " path %u %c l=%u %s:", - 
-				   path->idx,
-				   path->cached ? 'c' : 'b',
-				   path->level,
-				   bch2_btree_ids[path->btree_id]);
-			bch2_bpos_to_text(out, path->pos);
-			prt_printf(out, "\n");
-
-			for (l = 0; l < BTREE_MAX_DEPTH; l++) {
-				if (btree_node_locked(path, l)) {
-					prt_printf(out, "    %s l=%u ",
-						   btree_node_intent_locked(path, l) ? "i" : "r", l);
-					bch2_btree_path_node_to_text(out,
-							(void *) path->l[l].b,
-							path->cached);
-					prt_printf(out, "\n");
-				}
+		prt_printf(out, "  path %u %c l=%u %s:",
+			   path->idx,
+			   path->cached ? 'c' : 'b',
+			   path->level,
+			   bch2_btree_ids[path->btree_id]);
+		bch2_bpos_to_text(out, path->pos);
+		prt_printf(out, "\n");
+
+		for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+			if (btree_node_locked(path, l)) {
+				prt_printf(out, "    %s l=%u ",
+					   btree_node_intent_locked(path, l) ? "i" : "r", l);
+				bch2_btree_path_node_to_text(out,
+						(void *) path->l[l].b,
+						path->cached);
+				prt_printf(out, "\n");
 			}
 		}
+	}
 
-		b = READ_ONCE(trans->locking);
-		if (b) {
-			path = &trans->paths[trans->locking_path_idx];
-			prt_printf(out, "  locking path %u %c l=%u %c %s:",
-				   trans->locking_path_idx,
-				   path->cached ? 'c' : 'b',
-				   trans->locking_level,
-				   lock_types[trans->locking_lock_type],
-				   bch2_btree_ids[trans->locking_btree_id]);
-			bch2_bpos_to_text(out, trans->locking_pos);
-
-			prt_printf(out, " node ");
-			bch2_btree_path_node_to_text(out,
-					(void *) b, path->cached);
-			prt_printf(out, "\n");
-		}
+	b = READ_ONCE(trans->locking);
+	if (b) {
+		path = &trans->paths[trans->locking_path_idx];
+		prt_printf(out, "  locking path %u %c l=%u %c %s:",
+			   trans->locking_path_idx,
+			   path->cached ? 'c' : 'b',
+			   trans->locking_level,
+			   lock_types[trans->locking_lock_type],
+			   bch2_btree_ids[trans->locking_btree_id]);
+		bch2_bpos_to_text(out, trans->locking_pos);
+
+		prt_printf(out, " node ");
+		bch2_btree_path_node_to_text(out,
+				(void *) b, path->cached);
+		prt_printf(out, "\n");
 	}
-	mutex_unlock(&c->btree_trans_lock);
 }
 
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
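Note that __bch2_trans_init() now keeps c->btree_trans_list sorted by pid rather than using plain list_add(): that is what lets the new debugfs reader (added in debug.c below) resume iteration from the last pid it printed. A userspace sketch of the same insert-sorted pattern — singly linked for brevity, where the kernel code walks the list with list_for_each_entry() and uses list_add_tail() to insert before the first larger entry:

#include <stdio.h>
#include <stdlib.h>

struct trans {
	int		pid;
	struct trans	*next;
};

static void trans_list_add_sorted(struct trans **head, struct trans *new)
{
	struct trans **pos = head;

	/* walk to the first entry with a larger pid, insert before it */
	while (*pos && (*pos)->pid <= new->pid)
		pos = &(*pos)->next;
	new->next = *pos;
	*pos = new;
}

int main(void)
{
	int pids[] = { 31, 7, 112, 42 };
	struct trans *head = NULL, *t;
	unsigned i;

	for (i = 0; i < 4; i++) {
		t = calloc(1, sizeof(*t));
		t->pid = pids[i];
		trans_list_add_sorted(&head, t);
	}

	for (t = head; t; t = t->next)
		printf("%d\n", t->pid);	/* 7 31 42 112 */
	return 0;
}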
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index f2b302c..9da0a41 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -403,7 +403,7 @@ void bch2_trans_exit(struct btree_trans *);
 #define bch2_trans_init(...)	__bch2_trans_init(__VA_ARGS__, __func__)
 
-void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *);
+void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
 
 void bch2_fs_btree_iter_exit(struct bch_fs *);
 int bch2_fs_btree_iter_init(struct bch_fs *);

diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 0750951..a5b0a95 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -84,7 +84,7 @@ static void bkey_cached_free(struct btree_key_cache *bc,
 	start_poll_synchronize_srcu(&c->btree_trans_barrier);
 
 	list_move_tail(&ck->list, &bc->freed);
-	bc->nr_freed++;
+	atomic_long_inc(&bc->nr_freed);
 
 	kfree(ck->k);
 	ck->k		= NULL;
@@ -94,10 +94,88 @@ static void bkey_cached_free(struct btree_key_cache *bc,
 	six_unlock_intent(&ck->c.lock);
 }
 
+static void bkey_cached_free_fast(struct btree_key_cache *bc,
+				  struct bkey_cached *ck)
+{
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+	struct btree_key_cache_freelist *f;
+	bool freed = false;
+
+	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+	ck->btree_trans_barrier_seq =
+		start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+	list_del_init(&ck->list);
+	atomic_long_inc(&bc->nr_freed);
+
+	kfree(ck->k);
+	ck->k		= NULL;
+	ck->u64s	= 0;
+
+	preempt_disable();
+	f = this_cpu_ptr(bc->pcpu_freed);
+
+	if (f->nr < ARRAY_SIZE(f->objs)) {
+		f->objs[f->nr++] = ck;
+		freed = true;
+	}
+	preempt_enable();
+
+	if (!freed) {
+		mutex_lock(&bc->lock);
+		preempt_disable();
+		f = this_cpu_ptr(bc->pcpu_freed);
+
+		while (f->nr > ARRAY_SIZE(f->objs) / 2) {
+			struct bkey_cached *ck2 = f->objs[--f->nr];
+
+			list_move_tail(&ck2->list, &bc->freed);
+		}
+		preempt_enable();
+
+		list_move_tail(&ck->list, &bc->freed);
+		mutex_unlock(&bc->lock);
+	}
+
+	six_unlock_write(&ck->c.lock);
+	six_unlock_intent(&ck->c.lock);
+}
+
 static struct bkey_cached *
 bkey_cached_alloc(struct btree_key_cache *c)
 {
-	struct bkey_cached *ck;
+	struct bkey_cached *ck = NULL;
+	struct btree_key_cache_freelist *f;
+
+	preempt_disable();
+	f = this_cpu_ptr(c->pcpu_freed);
+	if (f->nr)
+		ck = f->objs[--f->nr];
+	preempt_enable();
+
+	if (!ck) {
+		mutex_lock(&c->lock);
+		preempt_disable();
+		f = this_cpu_ptr(c->pcpu_freed);
+
+		while (!list_empty(&c->freed) &&
+		       f->nr < ARRAY_SIZE(f->objs) / 2) {
+			ck = list_last_entry(&c->freed, struct bkey_cached, list);
+			list_del_init(&ck->list);
+			f->objs[f->nr++] = ck;
+		}
+
+		ck = f->nr ? f->objs[--f->nr] : NULL;
+		preempt_enable();
+		mutex_unlock(&c->lock);
+	}
+
+	if (ck) {
+		six_lock_intent(&ck->c.lock, NULL, NULL);
+		six_lock_write(&ck->c.lock, NULL, NULL);
+		return ck;
+	}
 
 	ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
 	if (likely(ck)) {
@@ -119,16 +197,6 @@ bkey_cached_reuse(struct btree_key_cache *c)
 	struct bkey_cached *ck;
 	unsigned i;
 
-	mutex_lock(&c->lock);
-	list_for_each_entry_reverse(ck, &c->freed, list)
-		if (bkey_cached_lock_for_evict(ck)) {
-			c->nr_freed--;
-			list_del(&ck->list);
-			mutex_unlock(&c->lock);
-			return ck;
-		}
-	mutex_unlock(&c->lock);
-
 	rcu_read_lock();
 	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
 	for (i = 0; i < tbl->size; i++)
@@ -189,9 +257,7 @@ btree_key_cache_create(struct bch_fs *c,
 			six_unlock_intent(&ck->c.lock);
 			kfree(ck);
 		} else {
-			mutex_lock(&bc->lock);
-			bkey_cached_free(bc, ck);
-			mutex_unlock(&bc->lock);
+			bkey_cached_free_fast(bc, ck);
 		}
 
 		return NULL;
@@ -465,9 +531,7 @@ evict:
 
 		bkey_cached_evict(&c->btree_key_cache, ck);
 
-		mutex_lock(&c->btree_key_cache.lock);
-		bkey_cached_free(&c->btree_key_cache, ck);
-		mutex_unlock(&c->btree_key_cache.lock);
+		bkey_cached_free_fast(&c->btree_key_cache, ck);
 	}
out:
 	bch2_trans_iter_exit(trans, &b_iter);
@@ -604,7 +668,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 		list_del(&ck->list);
 		kmem_cache_free(bch2_key_cache, ck);
-		bc->nr_freed--;
+		atomic_long_dec(&bc->nr_freed);
 		scanned++;
 		freed++;
 	}
@@ -677,6 +741,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 	struct bkey_cached *ck, *n;
 	struct rhash_head *pos;
 	unsigned i;
+	int cpu;
 
 	if (bc->shrink.list.next)
 		unregister_shrinker(&bc->shrink);
@@ -693,6 +758,16 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 	}
 	rcu_read_unlock();
 
+	for_each_possible_cpu(cpu) {
+		struct btree_key_cache_freelist *f =
+			per_cpu_ptr(bc->pcpu_freed, cpu);
+
+		for (i = 0; i < f->nr; i++) {
+			ck = f->objs[i];
+			list_add(&ck->list, &bc->freed);
+		}
+	}
+
 	list_for_each_entry_safe(ck, n, &bc->freed, list) {
 		cond_resched();
 
@@ -713,6 +788,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 
 	if (bc->table_init_done)
 		rhashtable_destroy(&bc->table);
+
+	free_percpu(bc->pcpu_freed);
 }
 
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
@@ -733,6 +810,10 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
 {
 	int ret;
 
+	c->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
+	if (!c->pcpu_freed)
+		return -ENOMEM;
+
 	ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
 	if (ret)
 		return ret;
@@ -748,7 +829,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 {
-	prt_printf(out, "nr_freed:\t%zu\n",	c->nr_freed);
+	prt_printf(out, "nr_freed:\t%zu\n",	atomic_long_read(&c->nr_freed));
 	prt_printf(out, "nr_keys:\t%lu\n",	atomic_long_read(&c->nr_keys));
 	prt_printf(out, "nr_dirty:\t%lu\n",	atomic_long_read(&c->nr_dirty));
 }
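The key cache now allocates and frees through a small per-CPU array (struct btree_key_cache_freelist, added to btree_types.h below), falling back to the mutex-protected bc->freed list only when the array runs empty or full; the slow paths move half the array at a time so the lock cost is amortized over several objects. A single-"CPU" userspace sketch of that batching scheme, with a hypothetical obj type standing in for struct bkey_cached:

#include <stdio.h>
#include <stdlib.h>

#define PCPU_CACHE	16

struct obj { struct obj *next; };

static struct obj *global_freed;	/* stands in for bc->freed + bc->lock */
static struct obj *pcpu[PCPU_CACHE];	/* stands in for one CPU's freelist */
static unsigned pcpu_nr;

static struct obj *obj_alloc(void)
{
	if (pcpu_nr)
		return pcpu[--pcpu_nr];	/* fast path, lock-free */

	/* slow path: refill to half capacity under the (elided) global lock */
	while (global_freed && pcpu_nr < PCPU_CACHE / 2) {
		pcpu[pcpu_nr++] = global_freed;
		global_freed = global_freed->next;
	}
	return pcpu_nr ? pcpu[--pcpu_nr] : calloc(1, sizeof(struct obj));
}

static void obj_free(struct obj *o)
{
	if (pcpu_nr < PCPU_CACHE) {
		pcpu[pcpu_nr++] = o;	/* fast path, lock-free */
		return;
	}

	/* slow path: spill half the per-cpu array back to the global list */
	while (pcpu_nr > PCPU_CACHE / 2) {
		struct obj *o2 = pcpu[--pcpu_nr];

		o2->next = global_freed;
		global_freed = o2;
	}
	o->next = global_freed;
	global_freed = o;
}

int main(void)
{
	struct obj *a = obj_alloc(), *b = obj_alloc();

	obj_free(a);
	obj_free(b);
	printf("cached: %u\n", pcpu_nr);	/* 2: freed without the lock */
	return 0;
}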
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 5382f2b..1e4d1fe 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -301,6 +301,11 @@ struct btree_iter {
#endif
 };
 
+struct btree_key_cache_freelist {
+	struct bkey_cached	*objs[16];
+	unsigned		nr;
+};
+
 struct btree_key_cache {
 	struct mutex		lock;
 	struct rhashtable	table;
@@ -308,8 +313,9 @@ struct btree_key_cache {
 	struct list_head	freed;
 	struct shrinker		shrink;
 	unsigned		shrink_iter;
+	struct btree_key_cache_freelist __percpu *pcpu_freed;
 
-	size_t			nr_freed;
+	atomic_long_t		nr_freed;
 	atomic_long_t		nr_keys;
 	atomic_long_t		nr_dirty;
 };
@@ -388,7 +394,7 @@ struct btree_trans {
 	u8			locking_btree_id;
 	u8			locking_level;
 	u8			locking_lock_type;
-	pid_t			pid;
+	struct task_struct	*task;
 	int			srcu_idx;
 
 	u8			nr_sorted;

diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index ceb8484..965fdfb 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1883,7 +1883,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 {
 	struct async_btree_rewrite *a;
 
-	if (!percpu_ref_tryget(&c->writes))
+	if (!percpu_ref_tryget_live(&c->writes))
 		return;
 
 	a = kmalloc(sizeof(*a), GFP_NOFS);

diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 0ab0dc5..aed26b5 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -1102,7 +1102,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
 	}
 
 	if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
-	    unlikely(!percpu_ref_tryget(&c->writes))) {
+	    unlikely(!percpu_ref_tryget_live(&c->writes))) {
 		ret = bch2_trans_commit_get_rw_cold(trans);
 		if (ret)
 			goto out_reset;

diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 670b95b..6881502 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -178,12 +178,12 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
 					  enum alloc_reserve reserve)
 {
 	return max_t(s64, 0,
-		     usage.d[BCH_DATA_free].buckets -
-		     usage.d[BCH_DATA_cached].buckets -
-		     usage.d[BCH_DATA_need_gc_gens].buckets -
-		     usage.d[BCH_DATA_need_discard].buckets -
-		     ca->nr_open_buckets -
-		     bch2_dev_buckets_reserved(ca, reserve));
+		     usage.d[BCH_DATA_free].buckets +
+		     usage.d[BCH_DATA_cached].buckets +
+		     usage.d[BCH_DATA_need_gc_gens].buckets +
+		     usage.d[BCH_DATA_need_discard].buckets -
+		     ca->nr_open_buckets -
+		     bch2_dev_buckets_reserved(ca, reserve));
 }
 
 static inline u64 dev_buckets_available(struct bch_dev *ca,

diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index e23b221..7c2af67 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -425,8 +425,17 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
 				merged = bch2_checksum_bio(c, crc_old.csum_type,
 						extent_nonce(version, crc_old), bio);
 
-	if (bch2_crc_cmp(merged, crc_old.csum))
+	if (bch2_crc_cmp(merged, crc_old.csum)) {
+		bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n"
+			"expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
+			crc_old.csum.hi,
+			crc_old.csum.lo,
+			merged.hi,
+			merged.lo,
+			bch2_csum_types[crc_old.csum_type],
+			bch2_csum_types[new_csum_type]);
 		return -EIO;
+	}
 
 	for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
 		if (i->crc)
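Note the sign changes in the buckets.h hunk above: cached, need_gc_gens and need_discard buckets used to be subtracted from the free count and are now added to it — plausibly because all three are reclaimable and should count toward availability (my reading of the patch; the commit doesn't say). The arithmetic, with made-up numbers:

#include <stdio.h>

static long long dev_buckets_available(long long free, long long cached,
				       long long need_gc_gens,
				       long long need_discard,
				       long long nr_open, long long reserved)
{
	/* the clamp mirrors max_t(s64, 0, ...) in __dev_buckets_available() */
	long long v = free + cached + need_gc_gens + need_discard
		- nr_open - reserved;

	return v > 0 ? v : 0;
}

int main(void)
{
	/* old formula: 100 - 50 - 0 - 8 - 4 - 16 = 22; new formula: 138 */
	printf("%lld\n", dev_buckets_available(100, 50, 0, 8, 4, 16));
	return 0;
}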
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
new file mode 100644
index 0000000..cc9ae6d
--- /dev/null
+++ b/libbcachefs/data_update.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "ec.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "subvolume.h"
+
+#include <trace/events/bcachefs.h>
+
+static int insert_snapshot_whiteouts(struct btree_trans *trans,
+				     enum btree_id id,
+				     struct bpos old_pos,
+				     struct bpos new_pos)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter, update_iter;
+	struct bkey_s_c k;
+	struct snapshots_seen s;
+	int ret;
+
+	if (!btree_type_has_snapshots(id))
+		return 0;
+
+	snapshots_seen_init(&s);
+
+	if (!bkey_cmp(old_pos, new_pos))
+		return 0;
+
+	if (!snapshot_t(c, old_pos.snapshot)->children[0])
+		return 0;
+
+	bch2_trans_iter_init(trans, &iter, id, old_pos,
+			     BTREE_ITER_NOT_EXTENTS|
+			     BTREE_ITER_ALL_SNAPSHOTS);
+	while (1) {
+next:
+		k = bch2_btree_iter_prev(&iter);
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		if (bkey_cmp(old_pos, k.k->p))
+			break;
+
+		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
+			struct bkey_i *update;
+			u32 *i;
+
+			darray_for_each(s.ids, i)
+				if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i))
+					goto next;
+
+			update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+
+			ret = PTR_ERR_OR_ZERO(update);
+			if (ret)
+				break;
+
+			bkey_init(&update->k);
+			update->k.p = new_pos;
+			update->k.p.snapshot = k.k->p.snapshot;
+
+			bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
+					     BTREE_ITER_NOT_EXTENTS|
+					     BTREE_ITER_ALL_SNAPSHOTS|
+					     BTREE_ITER_INTENT);
+			ret   = bch2_btree_iter_traverse(&update_iter) ?:
+				bch2_trans_update(trans, &update_iter, update,
+					  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+			bch2_trans_iter_exit(trans, &update_iter);
+			if (ret)
+				break;
+
+			ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
+			if (ret)
+				break;
+		}
+	}
+	bch2_trans_iter_exit(trans, &iter);
+	darray_exit(&s.ids);
+
+	return ret;
+}
+
+static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+	struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(ptrs, ptr)
+		if (ptr->dev == dev)
+			ptr->cached = true;
+}
+
+static int bch2_data_update_index_update(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct btree_trans trans;
+	struct btree_iter iter;
+	struct data_update *m =
+		container_of(op, struct data_update, op);
+	struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
+	struct keylist *keys = &op->insert_keys;
+	struct bkey_buf _new, _insert;
+	int ret = 0;
+
+	bch2_bkey_buf_init(&_new);
+	bch2_bkey_buf_init(&_insert);
+	bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+
+	bch2_trans_iter_init(&trans, &iter, m->btree_id,
+			     bkey_start_pos(&bch2_keylist_front(keys)->k),
+			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	while (1) {
+		struct bkey_s_c k;
+		struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+		struct bkey_i *insert;
+		struct bkey_i_extent *new;
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+		struct bpos next_pos;
+		bool did_work = false;
+		bool should_check_enospc;
+		s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+		unsigned i;
+
+		bch2_trans_begin(&trans);
+
+		k = bch2_btree_iter_peek_slot(&iter);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		new = bkey_i_to_extent(bch2_keylist_front(keys));
+
+		if (!bch2_extents_match(k, old))
+			goto nomatch;
+
+		bkey_reassemble(_insert.k, k);
+		insert = _insert.k;
+
+		bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+		new = bkey_i_to_extent(_new.k);
+		bch2_cut_front(iter.pos, &new->k_i);
+
+		bch2_cut_front(iter.pos,	insert);
+		bch2_cut_back(new->k.p,		insert);
+		bch2_cut_back(insert->k.p,	&new->k_i);
+
+		/*
+		 * @old: extent that we read from
+		 * @insert: key that we're going to update, initialized from
+		 * extent currently in btree - same as @old unless we raced with
+		 * other updates
+		 * @new: extent with new pointers that we'll be adding to @insert
+		 *
+		 * First, drop rewrite_ptrs from @new:
+		 */
+		i = 0;
+		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+			    bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) {
+				/*
+				 * If we're going to be adding a pointer to the
+				 * same device, we have to drop the old one -
+				 * otherwise, we can just mark it cached:
+				 */
+				if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev))
+					bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev);
+				else
+					bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev);
+			}
+			i++;
+		}
+
+
+		/* Add new ptrs: */
+		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
+			if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
+				/*
+				 * raced with another move op? extent already
+				 * has a pointer to the device we just wrote
+				 * data to
+				 */
+				continue;
+			}
+
+			bch2_extent_ptr_decoded_append(insert, &p);
+			did_work = true;
+		}
+
+		if (!did_work)
+			goto nomatch;
+
+		bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
+		bch2_extent_normalize(c, bkey_i_to_s(insert));
+
+		ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
+						 &should_check_enospc,
+						 &i_sectors_delta,
+						 &disk_sectors_delta);
+		if (ret)
+			goto err;
+
+		if (disk_sectors_delta > (s64) op->res.sectors) {
+			ret = bch2_disk_reservation_add(c, &op->res,
+						disk_sectors_delta - op->res.sectors,
+						!should_check_enospc
+						? BCH_DISK_RESERVATION_NOFAIL : 0);
+			if (ret)
+				goto out;
+		}
+
+		next_pos = insert->k.p;
+
+		ret   = insert_snapshot_whiteouts(&trans, m->btree_id,
+						  k.k->p, insert->k.p) ?:
+			bch2_trans_update(&trans, &iter, insert,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+			bch2_trans_commit(&trans, &op->res,
+				op_journal_seq(op),
+				BTREE_INSERT_NOFAIL|
+				m->data_opts.btree_insert_flags);
+		if (!ret) {
+			bch2_btree_iter_set_pos(&iter, next_pos);
+			atomic_long_inc(&c->extent_migrate_done);
+			if (ec_ob)
+				bch2_ob_add_backpointer(c, ec_ob, &insert->k);
+		}
+err:
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			break;
+next:
+		while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
+			bch2_keylist_pop_front(keys);
+			if (bch2_keylist_empty(keys))
+				goto out;
+		}
+		continue;
+nomatch:
+		if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+			struct printbuf buf = PRINTBUF;
+
+			bch2_bkey_val_to_text(&buf, c, old);
+			bch_info(c, "no match for %s", buf.buf);
+			printbuf_exit(&buf);
+		}
+
+		if (m->ctxt) {
+			BUG_ON(k.k->p.offset <= iter.pos.offset);
+			atomic64_inc(&m->ctxt->stats->keys_raced);
+			atomic64_add(k.k->p.offset - iter.pos.offset,
+				     &m->ctxt->stats->sectors_raced);
+		}
+		atomic_long_inc(&c->extent_migrate_raced);
+		trace_move_race(&new->k);
+		bch2_btree_iter_advance(&iter);
+		goto next;
+	}
+out:
+	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_exit(&trans);
+	bch2_bkey_buf_exit(&_insert, c);
+	bch2_bkey_buf_exit(&_new, c);
+	BUG_ON(ret == -EINTR);
+	return ret;
+}
+
+void bch2_data_update_read_done(struct data_update *m,
+				struct bch_extent_crc_unpacked crc,
+				struct closure *cl)
+{
+	/* write bio must own pages: */
+	BUG_ON(!m->op.wbio.bio.bi_vcnt);
+
+	m->op.crc = crc;
+	m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+
+	closure_call(&m->op.cl, bch2_write, NULL, cl);
+}
+
+void bch2_data_update_exit(struct data_update *update)
+{
+	struct bch_fs *c = update->op.c;
+
+	bch2_bkey_buf_exit(&update->k, c);
+	bch2_disk_reservation_put(c, &update->op.res);
+	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+}
+
+int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
+			  struct write_point_specifier wp,
+			  struct bch_io_opts io_opts,
+			  struct data_update_opts data_opts,
+			  enum btree_id btree_id,
+			  struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+	int ret;
+
+	bch2_bkey_buf_init(&m->k);
+	bch2_bkey_buf_reassemble(&m->k, c, k);
+	m->btree_id	= btree_id;
+	m->data_opts	= data_opts;
+
+	bch2_write_op_init(&m->op, c, io_opts);
+	m->op.pos	= bkey_start_pos(k.k);
+	m->op.version	= k.k->version;
+	m->op.target	= data_opts.target,
+	m->op.write_point = wp;
+	m->op.flags	|= BCH_WRITE_PAGES_STABLE|
+		BCH_WRITE_PAGES_OWNED|
+		BCH_WRITE_DATA_ENCODED|
+		BCH_WRITE_FROM_INTERNAL|
+		m->data_opts.write_flags;
+	m->op.compression_type =
+		bch2_compression_opt_to_type[io_opts.background_compression ?:
+					     io_opts.compression];
+	if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+		m->op.alloc_reserve = RESERVE_movinggc;
+	m->op.index_update_fn = bch2_data_update_index_update;
+
+	i = 0;
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		if (p.ptr.cached)
+			m->data_opts.rewrite_ptrs &= ~(1U << i);
+
+		if (!((1U << i) & m->data_opts.rewrite_ptrs))
+			bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+
+		if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+		    crc_is_compressed(p.crc))
+			reserve_sectors += k.k->size;
+
+		/*
+		 * op->csum_type is normally initialized from the fs/file's
+		 * current options - but if an extent is encrypted, we require
+		 * that it stays encrypted:
+		 */
+		if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
+			m->op.nonce	= p.crc.nonce + p.crc.offset;
+			m->op.csum_type = p.crc.csum_type;
+		}
+
+		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+			m->op.incompressible = true;
+
+		i++;
+	}
+
+	if (reserve_sectors) {
+		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+				m->data_opts.extra_replicas
+				? 0
+				: BCH_DISK_RESERVATION_NOFAIL);
+		if (ret)
+			return ret;
+	}
+
+	m->op.nr_replicas = m->op.nr_replicas_required =
+		hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas;
+	return 0;
+}

diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h
new file mode 100644
index 0000000..e645054
--- /dev/null
+++ b/libbcachefs/data_update.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BCACHEFS_DATA_UPDATE_H
+#define _BCACHEFS_DATA_UPDATE_H
+
+#include "bkey_buf.h"
+#include "io_types.h"
+
+struct moving_context;
+
+struct data_update_opts {
+	unsigned	rewrite_ptrs;
+	u16		target;
+	u8		extra_replicas;
+	unsigned	btree_insert_flags;
+	unsigned	write_flags;
+};
+
+struct data_update {
+	/* extent being updated: */
+	enum btree_id		btree_id;
+	struct bkey_buf		k;
+	struct data_update_opts	data_opts;
+	struct moving_context	*ctxt;
+	struct bch_write_op	op;
+};
+
+void bch2_data_update_read_done(struct data_update *,
+				struct bch_extent_crc_unpacked,
+				struct closure *);
+
+void bch2_data_update_exit(struct data_update *);
+int bch2_data_update_init(struct bch_fs *, struct data_update *,
+			  struct write_point_specifier,
+			  struct bch_io_opts, struct data_update_opts,
+			  enum btree_id, struct bkey_s_c);
+
+#endif /* _BCACHEFS_DATA_UPDATE_H */
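struct data_update_opts replaces the old enum data_cmd/struct data_opts pair (removed from move.c below): instead of a single rewrite_dev, rewrite_ptrs is a bitmask indexed by pointer position within the extent. bch2_data_update_init() clears the bit for cached pointers, and any pointer whose bit stays clear is added to devs_have so the allocator won't place the new copy on a device we're keeping. A sketch of that mask walk over a hypothetical three-pointer extent:

#include <stdio.h>

struct ptr { unsigned dev; int cached; };

int main(void)
{
	struct ptr ptrs[] = { { 0, 0 }, { 1, 1 }, { 2, 0 } };
	unsigned rewrite_ptrs = 0x3;	/* caller asked to rewrite ptrs 0 and 1 */
	unsigned i;

	for (i = 0; i < 3; i++) {
		if (ptrs[i].cached)
			rewrite_ptrs &= ~(1U << i);	/* cached copies aren't rewritten */

		if (!(rewrite_ptrs & (1U << i)))
			printf("dev %u: kept, allocator must avoid it\n", ptrs[i].dev);
		else
			printf("dev %u: being rewritten\n", ptrs[i].dev);
	}
	return 0;
}

The final replica count then falls out of the mask: nr_replicas = hweight32(rewrite_ptrs) + extra_replicas, as at the end of bch2_data_update_init() above.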
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index bdc50d5..05cae0e 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -529,6 +529,76 @@ static const struct file_operations cached_btree_nodes_ops = {
 	.read		= bch2_cached_btree_nodes_read,
 };
 
+static int prt_backtrace(struct printbuf *out, struct task_struct *task)
+{
+	unsigned long entries[32];
+	unsigned i, nr_entries;
+	int ret;
+
+	ret = down_read_killable(&task->signal->exec_update_lock);
+	if (ret)
+		return ret;
+
+	nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0);
+	for (i = 0; i < nr_entries; i++) {
+		prt_printf(out, "[<0>] %pB", (void *)entries[i]);
+		prt_newline(out);
+	}
+
+	up_read(&task->signal->exec_update_lock);
+	return 0;
+}
+
+static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
+					    size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct bch_fs *c = i->c;
+	struct btree_trans *trans;
+	int err;
+
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+
+	mutex_lock(&c->btree_trans_lock);
+	list_for_each_entry(trans, &c->btree_trans_list, list) {
+		if (trans->task->pid <= i->iter)
+			continue;
+
+		err = flush_buf(i);
+		if (err)
+			return err;
+
+		if (!i->size)
+			break;
+
+		bch2_btree_trans_to_text(&i->buf, trans);
+
+		prt_printf(&i->buf, "backtrace:");
+		prt_newline(&i->buf);
+		printbuf_indent_add(&i->buf, 2);
+		prt_backtrace(&i->buf, trans->task);
+		printbuf_indent_sub(&i->buf, 2);
+		prt_newline(&i->buf);
+
+		i->iter = trans->task->pid;
+	}
+	mutex_unlock(&c->btree_trans_lock);
+
+	if (i->buf.allocation_failure)
+		return -ENOMEM;
+
+	return i->ret;
+}
+
+static const struct file_operations btree_transactions_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_btree_transactions_read,
+};
+
 static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
 				      size_t size, loff_t *ppos)
 {
@@ -588,6 +658,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
 	debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
 			    c->btree_debug, &cached_btree_nodes_ops);
 
+	debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
+			    c->btree_debug, &btree_transactions_ops);
+
 	debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
 			    c->btree_debug, &journal_pins_ops);
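The new debugfs file walks the pid-sorted transaction list and prints each transaction followed by its stack via stack_trace_save_tsk(). A rough userspace analogue of the output format using glibc's backtrace() — which, unlike the kernel API, can only capture the calling thread, so this sketches the formatting rather than the locking (compile with -rdynamic to get symbol names):

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>

static void print_backtrace(void)
{
	void *entries[32];
	int i, nr_entries = backtrace(entries, 32);
	char **syms = backtrace_symbols(entries, nr_entries);

	for (i = 0; i < nr_entries; i++)
		printf("[<0>] %s\n", syms ? syms[i] : "?");
	free(syms);
}

int main(void)
{
	print_backtrace();
	return 0;
}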
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index faabaa6..6ce352c 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -939,7 +939,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
 	BUG_ON(!s->allocated);
 
-	if (!percpu_ref_tryget(&c->writes))
+	if (!percpu_ref_tryget_live(&c->writes))
 		goto err;
 
 	ec_generate_ec(&s->new_stripe);

diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 73d756a..2ca1301 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -26,6 +26,8 @@
 
 #include <trace/events/bcachefs.h>
 
+static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
+
 static unsigned bch2_crc_field_size_max[] = {
 	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
 	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -688,37 +690,6 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
 	return durability;
 }
 
-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
-				    unsigned target,
-				    unsigned nr_desired_replicas)
-{
-	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-	union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
-
-	if (target && extra > 0)
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-			int n = bch2_extent_ptr_durability(c, p);
-
-			if (n && n <= extra &&
-			    !bch2_dev_in_target(c, p.ptr.dev, target)) {
-				entry->ptr.cached = true;
-				extra -= n;
-			}
-		}
-
-	if (extra > 0)
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-			int n = bch2_extent_ptr_durability(c, p);
-
-			if (n && n <= extra) {
-				entry->ptr.cached = true;
-				extra -= n;
-			}
-		}
-}
-
 void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
 {
 	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
@@ -822,8 +793,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
 /*
  * Returns pointer to the next entry after the one being dropped:
  */
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
-					     struct bch_extent_ptr *ptr)
+static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
+						    struct bch_extent_ptr *ptr)
 {
 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
 	union bch_extent_entry *entry = to_entry(ptr), *next;
@@ -895,6 +866,14 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
 	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
 }
 
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
+{
+	struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev);
+
+	if (ptr)
+		__bch2_bkey_drop_ptr(k, ptr);
+}
+
 const struct bch_extent_ptr *
 bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
 {
@@ -939,6 +918,44 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
 	return false;
 }
 
+/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+	struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+	struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+	const union bch_extent_entry *entry1, *entry2;
+	struct extent_ptr_decoded p1, p2;
+
+	bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+		bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+			if (p1.ptr.dev		== p2.ptr.dev &&
+			    p1.ptr.gen		== p2.ptr.gen &&
+			    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+			    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+				return true;
+
+	return false;
+}
+
+bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
+			 struct bkey_s_c k2)
+{
+	struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+	const union bch_extent_entry *entry2;
+	struct extent_ptr_decoded p2;
+
+	bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+		if (p1.ptr.dev		== p2.ptr.dev &&
+		    p1.ptr.gen		== p2.ptr.gen &&
+		    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+		    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+			return true;
+
+	return false;
+}
+
 /*
  * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
 *
@@ -1079,6 +1096,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
 	struct bch_extent_crc_unpacked crc;
 	unsigned size_ondisk = k.k->size;
 	unsigned nonce = UINT_MAX;
+	unsigned nr_ptrs = 0;
 	int ret;
 
 	if (bkey_is_btree_ptr(k.k))
@@ -1103,6 +1121,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
 					       false, err);
 			if (ret)
 				return ret;
+			nr_ptrs++;
 			break;
 		case BCH_EXTENT_ENTRY_crc32:
 		case BCH_EXTENT_ENTRY_crc64:
@@ -1141,6 +1160,11 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
 		}
 	}
 
+	if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
+		prt_str(err, "too many ptrs");
+		return -EINVAL;
+	}
+
 	return 0;
 }

diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 4f41f0f..3c17b81 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -577,15 +577,10 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
 unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
 
-void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
-				    unsigned, unsigned);
-
 void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
 void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
 void bch2_extent_ptr_decoded_append(struct bkey_i *,
 				    struct extent_ptr_decoded *);
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
-					     struct bch_extent_ptr *);
 union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
 					   struct bch_extent_ptr *);
 
@@ -607,11 +602,14 @@ do {									\
 } while (0)
 
 void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
 const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
 bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
 
 bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
 			   struct bch_extent_ptr, u64);
+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
+bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c);
 
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
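bch2_extents_match() (added above, and now used by the data update path in place of version/pointer matching) declares two extents the same data when any pointer pair agrees on device and generation and on the disk offset after normalizing each pointer back to the start of its own key — so a key that was trimmed since the read still matches. A worked example with made-up offsets, mirroring the (s64) p.ptr.offset + p.crc.offset - bkey_start_offset() expression:

#include <stdio.h>

struct eptr {
	unsigned	dev, gen;
	long long	ptr_offset, crc_offset;
};

/* disk offset the key's first sector maps to */
static long long normalized(struct eptr p, long long key_start)
{
	return p.ptr_offset + p.crc_offset - key_start;
}

int main(void)
{
	/* data read at [100,108), later found trimmed to [104,108):
	   the crc offset advanced by 4 along with the key start */
	struct eptr a = { 1, 7, 2000, 0 };
	struct eptr b = { 1, 7, 2000, 4 };

	printf("%s\n", a.dev == b.dev && a.gen == b.gen &&
		       normalized(a, 100) == normalized(b, 104)
		       ? "match" : "no match");
	return 0;
}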
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index d543480..53ffc68 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -204,7 +204,9 @@ int bch2_link_trans(struct btree_trans *trans,
 		goto err;
 
 	inode_u->bi_ctime = now;
-	bch2_inode_nlink_inc(inode_u);
+	ret = bch2_inode_nlink_inc(inode_u);
+	if (ret)
+		return ret;
 
 	ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
 	if (ret)
@@ -297,7 +299,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
 		if (ret)
 			goto err;
 	} else {
-		bch2_inode_nlink_dec(inode_u);
+		bch2_inode_nlink_dec(trans, inode_u);
 	}
 
 	if (inode_u->bi_dir		== dirent_iter.pos.inode &&
@@ -462,7 +464,7 @@ int bch2_rename_trans(struct btree_trans *trans,
 	}
 
 	if (mode == BCH_RENAME_OVERWRITE)
-		bch2_inode_nlink_dec(dst_inode_u);
+		bch2_inode_nlink_dec(trans, dst_inode_u);
 
 	src_dir_u->bi_mtime		= now;
 	src_dir_u->bi_ctime		= now;

diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 06f3e26..bcfd9e5 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -3127,7 +3127,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	long ret;
 
-	if (!percpu_ref_tryget(&c->writes))
+	if (!percpu_ref_tryget_live(&c->writes))
 		return -EROFS;
 
 	inode_lock(&inode->v);

diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 88d83d9..6a2b949 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -736,3 +736,36 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
 	return bch2_trans_do(c, NULL, NULL, 0,
 		bch2_inode_find_by_inum_trans(&trans, inum, inode));
 }
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+	if (bi->bi_flags & BCH_INODE_UNLINKED)
+		bi->bi_flags &= ~BCH_INODE_UNLINKED;
+	else {
+		if (bi->bi_nlink == U32_MAX)
+			return -EINVAL;
+
+		bi->bi_nlink++;
+	}
+
+	return 0;
+}
+
+void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
+{
+	if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) {
+		bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
+					bi->bi_inum);
+		return;
+	}
+
+	if (bi->bi_flags & BCH_INODE_UNLINKED) {
+		bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
+		return;
+	}
+
+	if (bi->bi_nlink)
+		bi->bi_nlink--;
+	else
+		bi->bi_flags |= BCH_INODE_UNLINKED;
+}

diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 9442600..2ac2fc1 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -164,23 +164,6 @@ static inline unsigned nlink_bias(umode_t mode)
 	return S_ISDIR(mode) ? 2 : 1;
 }
 
-static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
-{
-	if (bi->bi_flags & BCH_INODE_UNLINKED)
-		bi->bi_flags &= ~BCH_INODE_UNLINKED;
-	else
-		bi->bi_nlink++;
-}
-
-static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi)
-{
-	BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED);
-	if (bi->bi_nlink)
-		bi->bi_nlink--;
-	else
-		bi->bi_flags |= BCH_INODE_UNLINKED;
-}
-
 static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
 {
 	return bi->bi_flags & BCH_INODE_UNLINKED
@@ -200,4 +183,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
 	}
 }
 
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
+void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+
 #endif /* _BCACHEFS_INODE_H */
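bch2_inode_nlink_inc()/dec() move out of line here: inc can now fail with -EINVAL at U32_MAX instead of silently wrapping, and dec reports corruption through bch2_trans_inconsistent() rather than BUG_ON(). The underlying encoding is unchanged — bi_nlink counts links above the implicit first one, and BCH_INODE_UNLINKED means the count hit zero. A simplified sketch of that encoding, ignoring the directory bias from nlink_bias():

#include <stdio.h>
#include <stdint.h>

#define INODE_UNLINKED	(1u << 0)

struct inode { uint32_t nlink; unsigned flags; };

static unsigned nlink_get(struct inode *i)
{
	return i->flags & INODE_UNLINKED ? 0 : i->nlink + 1;
}

static int nlink_inc(struct inode *i)
{
	if (i->flags & INODE_UNLINKED)
		i->flags &= ~INODE_UNLINKED;
	else if (i->nlink == UINT32_MAX)
		return -1;		/* would overflow: fail, don't wrap */
	else
		i->nlink++;
	return 0;
}

static void nlink_dec(struct inode *i)
{
	if (i->nlink)
		i->nlink--;
	else
		i->flags |= INODE_UNLINKED;	/* link count hit zero */
}

int main(void)
{
	struct inode i = { 0, 0 };	/* new inode: one link */

	nlink_inc(&i);			/* hard link: nlink_get == 2 */
	nlink_dec(&i);
	nlink_dec(&i);			/* unlinked */
	printf("%u\n", nlink_get(&i));	/* 0 */
	return 0;
}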
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 8217199..50fa572 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1043,8 +1043,7 @@ do_write:
 	*_dst = dst;
 	return more;
csum_err:
-	bch_err(c, "error verifying existing checksum while "
-		"rewriting existing data (memory corruption?)");
+	bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
 	ret = -EIO;
err:
 	if (to_wbio(dst)->bounce)
@@ -1085,12 +1084,6 @@ again:
 					BKEY_EXTENT_U64s_MAX))
 			goto flush_io;
 
-		if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
-		    percpu_ref_is_dying(&c->writes)) {
-			ret = -EROFS;
-			goto err;
-		}
-
 		/*
 		 * The copygc thread is now global, which means it's no longer
 		 * freeing up space on specific disks, which means that
@@ -1284,7 +1277,7 @@ void bch2_write(struct closure *cl)
 	}
 
 	if (c->opts.nochanges ||
-	    !percpu_ref_tryget(&c->writes)) {
+	    !percpu_ref_tryget_live(&c->writes)) {
 		op->error = -EROFS;
 		goto err;
 	}
@@ -1325,7 +1318,7 @@ struct promote_op {
 	struct rhash_head	hash;
 	struct bpos		pos;
 
-	struct migrate_write	write;
+	struct data_update	write;
 	struct bio_vec		bi_inline_vecs[0]; /* must be last */
 };
 
@@ -1381,13 +1374,12 @@ static void promote_done(struct closure *cl)
 	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
 			       op->start_time);
 
-	bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
+	bch2_data_update_exit(&op->write);
 	promote_free(c, op);
 }
 
 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 {
-	struct bch_fs *c = rbio->c;
 	struct closure *cl = &op->cl;
 	struct bio *bio = &op->write.op.wbio.bio;
 
@@ -1401,10 +1393,8 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
 	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
 
-	bch2_migrate_read_done(&op->write, rbio);
-
 	closure_init(cl, NULL);
-	closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl);
+	bch2_data_update_read_done(&op->write, rbio->pick.crc, cl);
 	closure_return_with_destructor(cl, promote_done);
 }
 
@@ -1422,7 +1412,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
 	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
 	int ret;
 
-	if (!percpu_ref_tryget(&c->writes))
+	if (!percpu_ref_tryget_live(&c->writes))
 		return NULL;
 
 	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
@@ -1460,13 +1450,13 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
 	bio = &op->write.op.wbio.bio;
 	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
 
-	ret = bch2_migrate_write_init(c, &op->write,
+	ret = bch2_data_update_init(c, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
-			DATA_PROMOTE,
-			(struct data_opts) {
+			(struct data_update_opts) {
				.target		= opts.promote_target,
-				.nr_replicas	= 1,
+				.extra_replicas	= 1,
+				.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
			},
			btree_id, k);
	BUG_ON(ret);
@@ -1870,9 +1860,9 @@ csum_err:
 	}
 
 	bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
-		"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
+		"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
 		rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
-		csum.hi, csum.lo, crc.csum_type);
+		csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
 	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
 	goto out;
decompression_err:

diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 36d20dc..093efb0 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -1,14 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
-#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "backpointers.h"
 #include "bkey_buf.h"
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
-#include "buckets.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
@@ -17,7 +15,6 @@
 #include "journal_reclaim.h"
 #include "move.h"
 #include "replicas.h"
-#include "subvolume.h"
 #include "super-io.h"
 #include "keylist.h"
 
@@ -26,7 +23,19 @@
 
 #include <trace/events/bcachefs.h>
 
-#define SECTORS_IN_FLIGHT_PER_DEVICE	2048
+static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
+{
+	mutex_lock(&c->data_progress_lock);
+	list_add(&stats->list, &c->data_progress_list);
+	mutex_unlock(&c->data_progress_lock);
+}
+
+static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
+{
+	mutex_lock(&c->data_progress_lock);
+	list_del(&stats->list);
+	mutex_unlock(&c->data_progress_lock);
+}
 
 struct moving_io {
 	struct list_head	list;
@@ -38,414 +47,30 @@ struct moving_io {
 
 	struct bch_read_bio	rbio;
 
-	struct migrate_write	write;
+	struct data_update	write;
 	/* Must be last since it is variable size */
 	struct bio_vec		bi_inline_vecs[0];
 };
 
-struct moving_context {
-	/* Closure for waiting on all reads and writes to complete */
-	struct closure		cl;
-
-	struct bch_move_stats	*stats;
-
-	struct list_head	reads;
-
-	/* in flight sectors: */
-	atomic_t		read_sectors;
-	atomic_t		write_sectors;
-
-	wait_queue_head_t	wait;
-};
-
-static int insert_snapshot_whiteouts(struct btree_trans *trans,
-				     enum btree_id id,
-				     struct bpos old_pos,
-				     struct bpos new_pos)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter, update_iter;
-	struct bkey_s_c k;
-	struct snapshots_seen s;
-	int ret;
-
-	if (!btree_type_has_snapshots(id))
-		return 0;
-
-	snapshots_seen_init(&s);
-
-	if (!bkey_cmp(old_pos, new_pos))
-		return 0;
-
-	if (!snapshot_t(c, old_pos.snapshot)->children[0])
-		return 0;
-
-	bch2_trans_iter_init(trans, &iter, id, old_pos,
-			     BTREE_ITER_NOT_EXTENTS|
-			     BTREE_ITER_ALL_SNAPSHOTS);
-	while (1) {
-next:
-		k = bch2_btree_iter_prev(&iter);
-		ret = bkey_err(k);
-		if (ret)
-			break;
-
-		if (bkey_cmp(old_pos, k.k->p))
-			break;
-
-		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
-			struct bkey_i *update;
-			u32 *i;
-
-			darray_for_each(s.ids, i)
-				if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i))
-					goto next;
-
-			update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
-
-			ret = PTR_ERR_OR_ZERO(update);
-			if (ret)
-				break;
-
-			bkey_init(&update->k);
-			update->k.p = new_pos;
-			update->k.p.snapshot = k.k->p.snapshot;
-
-			bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
-					     BTREE_ITER_NOT_EXTENTS|
-					     BTREE_ITER_ALL_SNAPSHOTS|
-					     BTREE_ITER_INTENT);
-			ret   = bch2_btree_iter_traverse(&update_iter) ?:
-				bch2_trans_update(trans, &update_iter, update,
-					  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-			bch2_trans_iter_exit(trans, &update_iter);
-			if (ret)
-				break;
-
-			ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
-			if (ret)
-				break;
-		}
-	}
-	bch2_trans_iter_exit(trans, &iter);
-	darray_exit(&s.ids);
-
-	return ret;
-}
-
-static int bch2_migrate_index_update(struct bch_write_op *op)
-{
-	struct bch_fs *c = op->c;
-	struct btree_trans trans;
-	struct btree_iter iter;
-	struct migrate_write *m =
-		container_of(op, struct migrate_write, op);
-	struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
-	struct keylist *keys = &op->insert_keys;
-	struct bkey_buf _new, _insert;
-	int ret = 0;
-
-	bch2_bkey_buf_init(&_new);
-	bch2_bkey_buf_init(&_insert);
-	bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
-
-	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
-	bch2_trans_iter_init(&trans, &iter, m->btree_id,
-			     bkey_start_pos(&bch2_keylist_front(keys)->k),
-			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
-	while (1) {
-		struct bkey_s_c k;
-		struct bkey_i *insert;
-		struct bkey_i_extent *new;
-		const union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
-		struct bpos next_pos;
-		bool did_work = false;
-		bool should_check_enospc;
-		s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-
-		bch2_trans_begin(&trans);
-
-		k = bch2_btree_iter_peek_slot(&iter);
-		ret = bkey_err(k);
-		if (ret)
-			goto err;
-
-		new = bkey_i_to_extent(bch2_keylist_front(keys));
-
-		if (bversion_cmp(k.k->version, new->k.version) ||
-		    !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
-			goto nomatch;
-
-		bkey_reassemble(_insert.k, k);
-		insert = _insert.k;
-
-		bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
-		new = bkey_i_to_extent(_new.k);
-		bch2_cut_front(iter.pos, &new->k_i);
-
-		bch2_cut_front(iter.pos,	insert);
-		bch2_cut_back(new->k.p,		insert);
-		bch2_cut_back(insert->k.p,	&new->k_i);
-
-		if (m->data_cmd == DATA_REWRITE) {
-			struct bch_extent_ptr *new_ptr, *old_ptr = (void *)
-				bch2_bkey_has_device(bkey_i_to_s_c(insert),
-						     m->data_opts.rewrite_dev);
-			if (!old_ptr)
-				goto nomatch;
-
-			if (old_ptr->cached)
-				extent_for_each_ptr(extent_i_to_s(new), new_ptr)
-					new_ptr->cached = true;
-
-			__bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
-		}
-
-		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
-			if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
-				/*
-				 * raced with another move op? extent already
-				 * has a pointer to the device we just wrote
-				 * data to
-				 */
-				continue;
-			}
-
-			bch2_extent_ptr_decoded_append(insert, &p);
-			did_work = true;
-		}
-
-		if (!did_work)
-			goto nomatch;
-
-		bch2_bkey_narrow_crcs(insert,
-				(struct bch_extent_crc_unpacked) { 0 });
-		bch2_extent_normalize(c, bkey_i_to_s(insert));
-		bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
-					       op->opts.background_target,
-					       op->opts.data_replicas);
-
-		ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
-						 &should_check_enospc,
-						 &i_sectors_delta,
-						 &disk_sectors_delta);
-		if (ret)
-			goto err;
-
-		if (disk_sectors_delta > (s64) op->res.sectors) {
-			ret = bch2_disk_reservation_add(c, &op->res,
-						disk_sectors_delta - op->res.sectors,
-						!should_check_enospc
-						? BCH_DISK_RESERVATION_NOFAIL : 0);
-			if (ret)
-				goto out;
-		}
-
-		next_pos = insert->k.p;
-
-		ret   = insert_snapshot_whiteouts(&trans, m->btree_id,
-						  k.k->p, insert->k.p) ?:
-			bch2_trans_update(&trans, &iter, insert,
-				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
-			bch2_trans_commit(&trans, &op->res,
-				op_journal_seq(op),
-				BTREE_INSERT_NOFAIL|
-				m->data_opts.btree_insert_flags);
-		if (!ret) {
-			bch2_btree_iter_set_pos(&iter, next_pos);
-			atomic_long_inc(&c->extent_migrate_done);
-			if (ec_ob)
-				bch2_ob_add_backpointer(c, ec_ob, &insert->k);
-		}
-err:
-		if (ret == -EINTR)
-			ret = 0;
-		if (ret)
-			break;
-next:
-		while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
-			bch2_keylist_pop_front(keys);
-			if (bch2_keylist_empty(keys))
-				goto out;
-		}
-		continue;
-nomatch:
-		if (m->ctxt) {
-			BUG_ON(k.k->p.offset <= iter.pos.offset);
-			atomic64_inc(&m->ctxt->stats->keys_raced);
-			atomic64_add(k.k->p.offset - iter.pos.offset,
-				     &m->ctxt->stats->sectors_raced);
-		}
-		atomic_long_inc(&c->extent_migrate_raced);
-		trace_move_race(&new->k);
-		bch2_btree_iter_advance(&iter);
-		goto next;
-	}
-out:
-	bch2_trans_iter_exit(&trans, &iter);
-	bch2_trans_exit(&trans);
-	bch2_bkey_buf_exit(&_insert, c);
-	bch2_bkey_buf_exit(&_new, c);
-	BUG_ON(ret == -EINTR);
-	return ret;
-}
-
-void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
-{
-	/* write bio must own pages: */
-	BUG_ON(!m->op.wbio.bio.bi_vcnt);
-
-	m->ptr		= rbio->pick.ptr;
-	m->offset	= rbio->data_pos.offset - rbio->pick.crc.offset;
-	m->op.devs_have	= rbio->devs_have;
-	m->op.pos	= rbio->data_pos;
-	m->op.version	= rbio->version;
-	m->op.crc	= rbio->pick.crc;
-	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
-
-	if (m->data_cmd == DATA_REWRITE)
-		bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
-}
-
-int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
-			    struct write_point_specifier wp,
-			    struct bch_io_opts io_opts,
-			    enum data_cmd data_cmd,
-			    struct data_opts data_opts,
-			    enum btree_id btree_id,
-			    struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct bch_extent_crc_unpacked crc;
-	struct extent_ptr_decoded p;
-	int ret;
-
-	m->btree_id	= btree_id;
-	m->data_cmd	= data_cmd;
-	m->data_opts	= data_opts;
-	m->nr_ptrs_reserved = 0;
-
-	bch2_write_op_init(&m->op, c, io_opts);
-
-	if (!bch2_bkey_is_incompressible(k))
-		m->op.compression_type =
-			bch2_compression_opt_to_type[io_opts.background_compression ?:
-						     io_opts.compression];
-	else
-		m->op.incompressible = true;
-
-	m->op.target	= data_opts.target,
-	m->op.write_point = wp;
-
-	/*
-	 * op->csum_type is normally initialized from the fs/file's current
-	 * options - but if an extent is encrypted, we require that it stays
-	 * encrypted:
-	 */
-	bkey_for_each_crc(k.k, ptrs, crc, entry)
-		if (bch2_csum_type_is_encryption(crc.csum_type)) {
-			m->op.nonce	= crc.nonce + crc.offset;
-			m->op.csum_type = crc.csum_type;
-			break;
-		}
-
-	if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
-		m->op.alloc_reserve = RESERVE_movinggc;
-	} else {
-		/* XXX: this should probably be passed in */
-		m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
-	}
-
-	m->op.flags |= BCH_WRITE_PAGES_STABLE|
-		BCH_WRITE_PAGES_OWNED|
-		BCH_WRITE_DATA_ENCODED|
-		BCH_WRITE_FROM_INTERNAL;
-
-	m->op.nr_replicas	= data_opts.nr_replicas;
-	m->op.nr_replicas_required = data_opts.nr_replicas;
-	m->op.index_update_fn	= bch2_migrate_index_update;
-
-	switch (data_cmd) {
-	case DATA_ADD_REPLICAS: {
-		/*
-		 * DATA_ADD_REPLICAS is used for moving data to a different
-		 * device in the background, and due to compression the new copy
-		 * might take up more space than the old copy:
-		 */
-#if 0
-		int nr = (int) io_opts.data_replicas -
-			bch2_bkey_nr_ptrs_allocated(k);
-#endif
-		int nr = (int) io_opts.data_replicas;
-
-		if (nr > 0) {
-			m->op.nr_replicas = m->nr_ptrs_reserved = nr;
-
-			ret = bch2_disk_reservation_get(c, &m->op.res,
-					k.k->size, m->op.nr_replicas, 0);
-			if (ret)
-				return ret;
-		}
-		break;
-	}
-	case DATA_REWRITE: {
-		unsigned compressed_sectors = 0;
-
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-			if (p.ptr.dev == data_opts.rewrite_dev) {
-				if (p.ptr.cached)
-					m->op.flags |= BCH_WRITE_CACHED;
-
-				if (!p.ptr.cached &&
-				    crc_is_compressed(p.crc))
-					compressed_sectors += p.crc.compressed_size;
-			}
-
-		if (compressed_sectors) {
-			ret = bch2_disk_reservation_add(c, &m->op.res,
-					k.k->size * m->op.nr_replicas,
-					BCH_DISK_RESERVATION_NOFAIL);
-			if (ret)
-				return ret;
-		}
-		break;
-	}
-	case DATA_PROMOTE:
-		m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
-		m->op.flags |= BCH_WRITE_CACHED;
-		break;
-	default:
-		BUG();
-	}
-
-	return 0;
-}
-
 static void move_free(struct closure *cl)
 {
 	struct moving_io *io = container_of(cl, struct moving_io, cl);
 	struct moving_context *ctxt = io->write.ctxt;
-	struct bvec_iter_all iter;
-	struct bio_vec *bv;
-
-	bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
-
-	bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
-		if (bv->bv_page)
-			__free_page(bv->bv_page);
+	struct bch_fs *c = ctxt->c;
+	bch2_data_update_exit(&io->write);
 	wake_up(&ctxt->wait);
-
+	percpu_ref_put(&c->writes);
 	kfree(io);
 }
 
 static void move_write_done(struct closure *cl)
 {
 	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	struct moving_context *ctxt = io->write.ctxt;
+
+	if (io->write.op.error)
+		ctxt->write_error = true;
 
 	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
 	closure_return_with_destructor(cl, move_free);
@@ -460,10 +85,9 @@ static void move_write(struct closure *cl)
 		return;
 	}
 
-	bch2_migrate_read_done(&io->write, &io->rbio);
-
 	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
-	closure_call(&io->write.op.cl, bch2_write, NULL, cl);
+
+	bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl);
 	continue_at(cl, move_write_done, NULL);
 }
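The write path is now a closure chain: move_write() hands the pages to the data update machinery, move_write_done() records any write error in the moving_context, and move_free() releases the update's resources and the filesystem write ref before waking waiters. A minimal sketch of the error-propagation idea in plain C — job and ctx are illustrative stand-ins, not bcachefs types:

	struct ctx {
		bool	write_error;	/* sticky: any failed write sets it */
	};

	struct job {
		struct ctx	*ctxt;
		int		op_error;	/* result of this job's write op */
	};

	static void job_write_done(struct job *io)
	{
		/* surface per-io failure to the whole operation, as in move_write_done() */
		if (io->op_error)
			io->ctxt->write_error = true;
	}

__bch2_evacuate_bucket() later consults write_error to skip the evacuated-bucket check when a write failed, since the bucket may legitimately still hold data.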
@@ -520,14 +144,55 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
 		atomic_read(&ctxt->write_sectors) != sectors_pending);
 }
 
+void bch2_moving_ctxt_exit(struct moving_context *ctxt)
+{
+	move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+	closure_sync(&ctxt->cl);
+	progress_list_del(ctxt->c, ctxt->stats);
+
+	EBUG_ON(atomic_read(&ctxt->write_sectors));
+
+	trace_move_data(ctxt->c,
+			atomic64_read(&ctxt->stats->sectors_moved),
+			atomic64_read(&ctxt->stats->keys_moved));
+}
+
+void bch2_moving_ctxt_init(struct moving_context *ctxt,
+			   struct bch_fs *c,
+			   struct bch_ratelimit *rate,
+			   struct bch_move_stats *stats,
+			   struct write_point_specifier wp,
+			   bool wait_on_copygc)
+{
+	memset(ctxt, 0, sizeof(*ctxt));
+
+	ctxt->c			= c;
+	ctxt->rate		= rate;
+	ctxt->stats		= stats;
+	ctxt->wp		= wp;
+	ctxt->wait_on_copygc	= wait_on_copygc;
+
+	progress_list_add(c, stats);
+	closure_init_stack(&ctxt->cl);
+	INIT_LIST_HEAD(&ctxt->reads);
+	init_waitqueue_head(&ctxt->wait);
+
+	if (stats)
+		stats->data_type = BCH_DATA_user;
+}
+
+void bch_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+	memset(stats, 0, sizeof(*stats));
+	scnprintf(stats->name, sizeof(stats->name), "%s", name);
+}
+
 static int bch2_move_extent(struct btree_trans *trans,
 			    struct moving_context *ctxt,
-			    struct write_point_specifier wp,
 			    struct bch_io_opts io_opts,
 			    enum btree_id btree_id,
 			    struct bkey_s_c k,
-			    enum data_cmd data_cmd,
-			    struct data_opts data_opts)
+			    struct data_update_opts data_opts)
 {
 	struct bch_fs *c = trans->c;
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -537,6 +202,9 @@ static int bch2_move_extent(struct btree_trans *trans,
 	unsigned sectors = k.k->size, pages;
 	int ret = -ENOMEM;
 
+	if (!percpu_ref_tryget_live(&c->writes))
+		return -EROFS;
+
 	/* write path might have to decompress data: */
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
@@ -570,11 +238,13 @@ static int bch2_move_extent(struct btree_trans *trans,
 	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
 	io->rbio.bio.bi_end_io		= move_read_endio;
 
-	ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
-				      data_cmd, data_opts, btree_id, k);
+	ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts,
+				    data_opts, btree_id, k);
 	if (ret)
 		goto err_free_pages;
 
+	io->write.ctxt = ctxt;
+
 	atomic64_inc(&ctxt->stats->keys_moved);
 	atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
 	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
@@ -600,6 +270,7 @@ err_free_pages:
 err_free:
 	kfree(io);
 err:
+	percpu_ref_put(&c->writes);
 	trace_move_alloc_mem_fail(k.k);
 	return ret;
 }
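Each data update launched by bch2_move_extent() now pins c->writes with percpu_ref_tryget_live() and drops the ref on every error path; on success the ref is owned by the in-flight io and released in move_free() once the write completes. A sketch of the guard shape, with do_setup() as a hypothetical stand-in for the allocation steps:

	static int do_setup(void);	/* hypothetical: alloc pages, init rbio, ... */

	static int start_one_update(struct bch_fs *c)
	{
		if (!percpu_ref_tryget_live(&c->writes))
			return -EROFS;		/* filesystem is (going) read-only */

		if (do_setup())
			goto err;

		return 0;			/* ref now owned by the in-flight io */
	err:
		percpu_ref_put(&c->writes);	/* every early exit must drop the ref */
		return -ENOMEM;
	}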
@@ -636,13 +307,20 @@ err:
 }
 
 static int move_ratelimit(struct btree_trans *trans,
-			  struct moving_context *ctxt,
-			  struct bch_ratelimit *rate)
+			  struct moving_context *ctxt)
 {
+	struct bch_fs *c = trans->c;
 	u64 delay;
 
+	if (ctxt->wait_on_copygc) {
+		bch2_trans_unlock(trans);
+		wait_event_killable(c->copygc_running_wq,
+				    !c->copygc_running ||
+				    kthread_should_stop());
+	}
+
 	do {
-		delay = rate ? bch2_ratelimit_delay(rate) : 0;
+		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
 
 		if (delay) {
 			bch2_trans_unlock(trans);
@@ -665,11 +343,11 @@ static int move_ratelimit(struct btree_trans *trans,
 
 	move_ctxt_wait_event(ctxt, trans,
 		atomic_read(&ctxt->write_sectors) <
-		SECTORS_IN_FLIGHT_PER_DEVICE);
+		c->opts.move_bytes_in_flight >> 9);
 
 	move_ctxt_wait_event(ctxt, trans,
 		atomic_read(&ctxt->read_sectors) <
-		SECTORS_IN_FLIGHT_PER_DEVICE);
+		c->opts.move_bytes_in_flight >> 9);
 
 	return 0;
 }
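Note the units: move_bytes_in_flight is a byte count (default 1U << 20, i.e. 1 MiB), while read_sectors/write_sectors count 512-byte sectors, hence the >> 9 — with the default, the move path allows 2048 sectors of reads and 2048 sectors of writes in flight, replacing the old compile-time SECTORS_IN_FLIGHT_PER_DEVICE cap with a runtime-tunable option.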
@@ -699,41 +377,37 @@ static int move_get_io_opts(struct btree_trans *trans,
 	return 0;
 }
 
-static int __bch2_move_data(struct bch_fs *c,
-			    struct moving_context *ctxt,
-			    struct bch_ratelimit *rate,
-			    struct write_point_specifier wp,
-			    struct bpos start,
-			    struct bpos end,
-			    move_pred_fn pred, void *arg,
-			    struct bch_move_stats *stats,
-			    enum btree_id btree_id)
+static int __bch2_move_data(struct moving_context *ctxt,
+			    struct bpos start,
+			    struct bpos end,
+			    move_pred_fn pred, void *arg,
+			    enum btree_id btree_id)
 {
+	struct bch_fs *c = ctxt->c;
 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
 	struct bkey_buf sk;
 	struct btree_trans trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
-	struct data_opts data_opts;
-	enum data_cmd data_cmd;
+	struct data_update_opts data_opts;
 	u64 cur_inum = U64_MAX;
 	int ret = 0, ret2;
 
 	bch2_bkey_buf_init(&sk);
 	bch2_trans_init(&trans, c, 0, 0);
 
-	stats->data_type = BCH_DATA_user;
-	stats->btree_id	= btree_id;
-	stats->pos	= start;
+	ctxt->stats->data_type	= BCH_DATA_user;
+	ctxt->stats->btree_id	= btree_id;
+	ctxt->stats->pos	= start;
 
 	bch2_trans_iter_init(&trans, &iter, btree_id, start,
 			     BTREE_ITER_PREFETCH|
 			     BTREE_ITER_ALL_SNAPSHOTS);
 
-	if (rate)
-		bch2_ratelimit_reset(rate);
+	if (ctxt->rate)
+		bch2_ratelimit_reset(ctxt->rate);
 
-	while (!move_ratelimit(&trans, ctxt, rate)) {
+	while (!move_ratelimit(&trans, ctxt)) {
 		bch2_trans_begin(&trans);
 
 		k = bch2_btree_iter_peek(&iter);
@@ -749,7 +423,7 @@ static int __bch2_move_data(struct bch_fs *c,
 		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
 			break;
 
-		stats->pos = iter.pos;
+		ctxt->stats->pos = iter.pos;
 
 		if (!bkey_extent_is_direct_data(k.k))
 			goto next_nondata;
@@ -758,18 +432,9 @@ static int __bch2_move_data(struct bch_fs *c,
 		if (ret)
 			continue;
 
-		switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
-		case DATA_SKIP:
+		memset(&data_opts, 0, sizeof(data_opts));
+		if (!pred(c, arg, k, &io_opts, &data_opts))
 			goto next;
-		case DATA_SCRUB:
-			BUG();
-		case DATA_ADD_REPLICAS:
-		case DATA_REWRITE:
-		case DATA_PROMOTE:
-			break;
-		default:
-			BUG();
-		}
 
 		/*
 		 * The iterator gets unlocked by __bch2_read_extent - need to
@@ -778,8 +443,8 @@ static int __bch2_move_data(struct bch_fs *c,
 		bch2_bkey_buf_reassemble(&sk, c, k);
 		k = bkey_i_to_s_c(sk.k);
 
-		ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
-					data_cmd, data_opts);
+		ret2 = bch2_move_extent(&trans, ctxt, io_opts,
+					btree_id, k, data_opts);
 		if (ret2) {
 			if (ret2 == -EINTR)
 				continue;
@@ -794,10 +459,10 @@ static int __bch2_move_data(struct bch_fs *c,
 			goto next;
 		}
 
-		if (rate)
-			bch2_ratelimit_increment(rate, k.k->size);
+		if (ctxt->rate)
+			bch2_ratelimit_increment(ctxt->rate, k.k->size);
 next:
-		atomic64_add(k.k->size, &stats->sectors_seen);
+		atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
 next_nondata:
 		bch2_btree_iter_advance(&iter);
 	}
@@ -809,48 +474,20 @@ next_nondata:
 	return ret;
 }
 
-inline void bch_move_stats_init(struct bch_move_stats *stats, char *name)
-{
-	memset(stats, 0, sizeof(*stats));
-
-	scnprintf(stats->name, sizeof(stats->name),
-		  "%s", name);
-}
-
-static inline void progress_list_add(struct bch_fs *c,
-				     struct bch_move_stats *stats)
-{
-	mutex_lock(&c->data_progress_lock);
-	list_add(&stats->list, &c->data_progress_list);
-	mutex_unlock(&c->data_progress_lock);
-}
-
-static inline void progress_list_del(struct bch_fs *c,
-				     struct bch_move_stats *stats)
-{
-	mutex_lock(&c->data_progress_lock);
-	list_del(&stats->list);
-	mutex_unlock(&c->data_progress_lock);
-}
-
 int bch2_move_data(struct bch_fs *c,
 		   enum btree_id start_btree_id, struct bpos start_pos,
 		   enum btree_id end_btree_id, struct bpos end_pos,
 		   struct bch_ratelimit *rate,
+		   struct bch_move_stats *stats,
 		   struct write_point_specifier wp,
-		   move_pred_fn pred, void *arg,
-		   struct bch_move_stats *stats)
+		   bool wait_on_copygc,
+		   move_pred_fn pred, void *arg)
 {
-	struct moving_context ctxt = { .stats = stats };
+	struct moving_context ctxt;
 	enum btree_id id;
 	int ret;
 
-	progress_list_add(c, stats);
-	closure_init_stack(&ctxt.cl);
-	INIT_LIST_HEAD(&ctxt.reads);
-	init_waitqueue_head(&ctxt.wait);
-
-	stats->data_type = BCH_DATA_user;
+	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
 
 	for (id = start_btree_id;
 	     id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
@@ -861,24 +498,16 @@ int bch2_move_data(struct bch_fs *c,
 		    id != BTREE_ID_reflink)
 			continue;
 
-		ret = __bch2_move_data(c, &ctxt, rate, wp,
+		ret = __bch2_move_data(&ctxt,
 				       id == start_btree_id ? start_pos : POS_MIN,
 				       id == end_btree_id   ? end_pos   : POS_MAX,
-				       pred, arg, stats, id);
+				       pred, arg, id);
 		if (ret)
 			break;
 	}
 
-	move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));
-	closure_sync(&ctxt.cl);
-
-	EBUG_ON(atomic_read(&ctxt.write_sectors));
+	bch2_moving_ctxt_exit(&ctxt);
 
-	trace_move_data(c,
-		atomic64_read(&stats->sectors_moved),
-		atomic64_read(&stats->keys_moved));
-
-	progress_list_del(c, stats);
 	return ret;
 }
@@ -891,6 +520,7 @@ static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket
 
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
 			     bucket, BTREE_ITER_CACHED);
+again:
 	k = bch2_btree_iter_peek_slot(&iter);
 	ret = bkey_err(k);
 
@@ -901,10 +531,16 @@ static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket
 	    a.v->dirty_sectors) {
 		struct printbuf buf = PRINTBUF;
 
+		if (a.v->data_type == BCH_DATA_btree) {
+			bch2_trans_unlock(trans);
+			if (bch2_btree_interior_updates_flush(c))
+				goto again;
+		}
+
 		prt_str(&buf, "failed to evacuate bucket ");
 		bch2_bkey_val_to_text(&buf, c, k);
 
-		bch_err_ratelimited(c, "%s", buf.buf);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
 		printbuf_exit(&buf);
 	}
 }
@@ -913,33 +549,24 @@ static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket
 	return ret;
 }
 
-int bch2_evacuate_bucket(struct bch_fs *c,
-			 struct bpos bucket, int gen,
-			 struct bch_ratelimit *rate,
-			 struct write_point_specifier wp,
-			 enum data_cmd data_cmd,
-			 struct data_opts *data_opts,
-			 struct bch_move_stats *stats)
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
+			   struct bpos bucket, int gen,
+			   struct data_update_opts _data_opts)
 {
+	struct bch_fs *c = ctxt->c;
 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-	struct moving_context ctxt = { .stats = stats };
 	struct btree_trans trans;
 	struct btree_iter iter;
 	struct bkey_buf sk;
 	struct bch_backpointer bp;
+	struct data_update_opts data_opts;
 	u64 bp_offset = 0, cur_inum = U64_MAX;
 	int ret = 0;
 
 	bch2_bkey_buf_init(&sk);
 	bch2_trans_init(&trans, c, 0, 0);
 
-	progress_list_add(c, stats);
-	closure_init_stack(&ctxt.cl);
-	INIT_LIST_HEAD(&ctxt.reads);
-	init_waitqueue_head(&ctxt.wait);
-	stats->data_type = BCH_DATA_user;
-
-	while (!(ret = move_ratelimit(&trans, &ctxt, rate))) {
+	while (!(ret = move_ratelimit(&trans, ctxt))) {
 		bch2_trans_begin(&trans);
 
 		ret = bch2_get_next_backpointer(&trans, bucket, gen,
@@ -952,7 +579,9 @@ int bch2_evacuate_bucket(struct bch_fs *c,
 			break;
 
 		if (!bp.level) {
+			const struct bch_extent_ptr *ptr;
 			struct bkey_s_c k;
+			unsigned i = 0;
 
 			k = bch2_backpointer_get_key(&trans, &iter,
						     bucket, bp_offset, bp);
@@ -972,24 +601,31 @@ int bch2_evacuate_bucket(struct bch_fs *c,
 			if (ret)
 				continue;
 
-			data_opts->target	= io_opts.background_target;
-			data_opts->rewrite_dev	= bucket.inode;
+			data_opts = _data_opts;
+			data_opts.target	= io_opts.background_target;
+			data_opts.rewrite_ptrs	= 0;
+
+			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+				if (ptr->dev == bucket.inode)
+					data_opts.rewrite_ptrs |= 1U << i;
+				i++;
+			}
 
-			ret = bch2_move_extent(&trans, &ctxt, wp, io_opts, bp.btree_id, k,
-					       data_cmd, *data_opts);
+			ret = bch2_move_extent(&trans, ctxt, io_opts,
					       bp.btree_id, k, data_opts);
 			if (ret == -EINTR)
 				continue;
 			if (ret == -ENOMEM) {
 				/* memory allocation failure, wait for some IO to finish */
-				bch2_move_ctxt_wait_for_io(&ctxt, &trans);
+				bch2_move_ctxt_wait_for_io(ctxt, &trans);
 				continue;
 			}
 			if (ret)
 				goto err;
 
-			if (rate)
-				bch2_ratelimit_increment(rate, k.k->size);
-			atomic64_add(k.k->size, &stats->sectors_seen);
+			if (ctxt->rate)
+				bch2_ratelimit_increment(ctxt->rate, k.k->size);
+			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
 		} else {
 			struct btree *b;
 
@@ -1011,10 +647,11 @@ int bch2_evacuate_bucket(struct bch_fs *c,
 			if (ret)
 				goto err;
 
-			if (rate)
-				bch2_ratelimit_increment(rate, c->opts.btree_node_size >> 9);
-			atomic64_add(c->opts.btree_node_size >> 9, &stats->sectors_seen);
-			atomic64_add(c->opts.btree_node_size >> 9, &stats->sectors_moved);
+			if (ctxt->rate)
+				bch2_ratelimit_increment(ctxt->rate,
							 c->opts.btree_node_size >> 9);
+			atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
+			atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
 		}
 
 		bp_offset++;
@@ -1022,30 +659,38 @@ int bch2_evacuate_bucket(struct bch_fs *c,
 
 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
 		bch2_trans_unlock(&trans);
-		move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));
-		closure_sync(&ctxt.cl);
-		lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen));
+		move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+		closure_sync(&ctxt->cl);
+		if (!ctxt->write_error)
+			lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen));
 	}
 err:
 	bch2_trans_exit(&trans);
 	bch2_bkey_buf_exit(&sk, c);
+	return ret;
+}
 
-	move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));
-	closure_sync(&ctxt.cl);
-	progress_list_del(c, stats);
-
-	EBUG_ON(atomic_read(&ctxt.write_sectors));
+int bch2_evacuate_bucket(struct bch_fs *c,
+			 struct bpos bucket, int gen,
+			 struct data_update_opts data_opts,
+			 struct bch_ratelimit *rate,
+			 struct bch_move_stats *stats,
+			 struct write_point_specifier wp,
+			 bool wait_on_copygc)
+{
+	struct moving_context ctxt;
+	int ret;
 
-	trace_move_data(c,
-		atomic64_read(&stats->sectors_moved),
-		atomic64_read(&stats->keys_moved));
+	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+	ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts);
+	bch2_moving_ctxt_exit(&ctxt);
 
 	return ret;
 }
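Evacuation now names exactly which pointers to rewrite: bit i of data_opts.rewrite_ptrs selects the i'th pointer in bkey_for_each_ptr() order, set above for every pointer living on the device being evacuated. A minimal sketch of consuming such a mask (not the bcachefs implementation):

	/* Count how many of an extent's nr_ptrs pointers a rewrite mask selects. */
	static unsigned nr_ptrs_selected(unsigned rewrite_ptrs, unsigned nr_ptrs)
	{
		unsigned i, nr = 0;

		for (i = 0; i < nr_ptrs; i++)
			if (rewrite_ptrs & (1U << i))
				nr++;
		return nr;
	}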
-typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
-					 struct btree *, struct bch_io_opts *,
-					 struct data_opts *);
+typedef bool (*move_btree_pred)(struct bch_fs *, void *,
+				struct btree *, struct bch_io_opts *,
+				struct data_update_opts *);
 
 static int bch2_move_btree(struct bch_fs *c,
 			   enum btree_id start_btree_id, struct bpos start_pos,
@@ -1059,8 +704,7 @@ static int bch2_move_btree(struct bch_fs *c,
 	struct btree_iter iter;
 	struct btree *b;
 	enum btree_id id;
-	struct data_opts data_opts;
-	enum data_cmd cmd;
+	struct data_update_opts data_opts;
 	int ret = 0;
 
 	bch2_trans_init(&trans, c, 0, 0);
@@ -1089,17 +733,8 @@ retry:
 
 		stats->pos = iter.pos;
 
-		switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
-		case DATA_SKIP:
+		if (!pred(c, arg, b, &io_opts, &data_opts))
 			goto next;
-		case DATA_SCRUB:
-			BUG();
-		case DATA_ADD_REPLICAS:
-		case DATA_REWRITE:
-			break;
-		default:
-			BUG();
-		}
 
 		ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
 		if (ret == -EINTR)
@@ -1129,20 +764,10 @@ next:
 	return ret;
 }
 
-#if 0
-static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
-				struct bkey_s_c k,
-				struct bch_io_opts *io_opts,
-				struct data_opts *data_opts)
-{
-	return DATA_SCRUB;
-}
-#endif
-
-static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
-				      struct bkey_s_c k,
-				      struct bch_io_opts *io_opts,
-				      struct data_opts *data_opts)
+static bool rereplicate_pred(struct bch_fs *c, void *arg,
+			     struct bkey_s_c k,
+			     struct bch_io_opts *io_opts,
+			     struct data_update_opts *data_opts)
 {
 	unsigned nr_good = bch2_bkey_durability(c, k);
 	unsigned replicas = bkey_is_btree_ptr(k.k)
@@ -1150,43 +775,50 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
 		: io_opts->data_replicas;
 
 	if (!nr_good || nr_good >= replicas)
-		return DATA_SKIP;
+		return false;
 
 	data_opts->target		= 0;
-	data_opts->nr_replicas		= 1;
+	data_opts->extra_replicas	= replicas - nr_good;
 	data_opts->btree_insert_flags	= 0;
-	return DATA_ADD_REPLICAS;
+	return true;
 }
 
-static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
-				  struct bkey_s_c k,
-				  struct bch_io_opts *io_opts,
-				  struct data_opts *data_opts)
+static bool migrate_pred(struct bch_fs *c, void *arg,
+			 struct bkey_s_c k,
+			 struct bch_io_opts *io_opts,
+			 struct data_update_opts *data_opts)
 {
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
 	struct bch_ioctl_data *op = arg;
+	unsigned i = 0;
 
-	if (!bch2_bkey_has_device(k, op->migrate.dev))
-		return DATA_SKIP;
-
+	data_opts->rewrite_ptrs		= 0;
 	data_opts->target		= 0;
-	data_opts->nr_replicas		= 1;
+	data_opts->extra_replicas	= 0;
 	data_opts->btree_insert_flags	= 0;
-	data_opts->rewrite_dev		= op->migrate.dev;
-	return DATA_REWRITE;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		if (ptr->dev == op->migrate.dev)
+			data_opts->rewrite_ptrs |= 1U << i;
+		i++;
+	}
+
+	return data_opts->rewrite_ptrs != 0;
 }
 
-static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
-					    struct btree *b,
-					    struct bch_io_opts *io_opts,
-					    struct data_opts *data_opts)
+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
+				   struct btree *b,
+				   struct bch_io_opts *io_opts,
+				   struct data_update_opts *data_opts)
 {
 	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
 }
 
-static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
-					struct btree *b,
-					struct bch_io_opts *io_opts,
-					struct data_opts *data_opts)
+static bool migrate_btree_pred(struct bch_fs *c, void *arg,
			       struct btree *b,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
 {
 	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
 }
@@ -1215,21 +847,21 @@ static bool bformat_needs_redo(struct bkey_format *f)
 
 	return false;
 }
 
-static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
-					    struct btree *b,
-					    struct bch_io_opts *io_opts,
-					    struct data_opts *data_opts)
+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+				   struct btree *b,
+				   struct bch_io_opts *io_opts,
+				   struct data_update_opts *data_opts)
 {
 	if (b->version_ondisk != c->sb.version ||
 	    btree_node_need_rewrite(b) ||
 	    bformat_needs_redo(&b->format)) {
 		data_opts->target		= 0;
-		data_opts->nr_replicas		= 1;
+		data_opts->extra_replicas	= 0;
 		data_opts->btree_insert_flags	= 0;
-		return DATA_REWRITE;
+		return true;
 	}
 
-	return DATA_SKIP;
+	return false;
 }
 
 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
@@ -1273,8 +905,11 @@ int bch2_data_job(struct bch_fs *c,
 		ret = bch2_move_data(c,
 				     op.start_btree,	op.start_pos,
 				     op.end_btree,	op.end_pos,
-				     NULL, writepoint_hashed((unsigned long) current),
-				     rereplicate_pred, c, stats) ?: ret;
+				     NULL,
+				     stats,
+				     writepoint_hashed((unsigned long) current),
+				     true,
+				     rereplicate_pred, c) ?: ret;
 		ret = bch2_replicas_gc2(c) ?: ret;
 		break;
 	case BCH_DATA_OP_MIGRATE:
@@ -1294,8 +929,11 @@ int bch2_data_job(struct bch_fs *c,
 		ret = bch2_move_data(c,
 				     op.start_btree,	op.start_pos,
 				     op.end_btree,	op.end_pos,
-				     NULL, writepoint_hashed((unsigned long) current),
-				     migrate_pred, &op, stats) ?: ret;
+				     NULL,
+				     stats,
+				     writepoint_hashed((unsigned long) current),
+				     true,
+				     migrate_pred, &op) ?: ret;
 		ret = bch2_replicas_gc2(c) ?: ret;
 		break;
 	case BCH_DATA_OP_REWRITE_OLD_NODES:
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index c69b6b5..c0fec69 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -4,53 +4,37 @@
 
 #include "btree_iter.h"
 #include "buckets.h"
-#include "io_types.h"
+#include "data_update.h"
 #include "move_types.h"
 
 struct bch_read_bio;
-struct moving_context;
 
-enum data_cmd {
-	DATA_SKIP,
-	DATA_SCRUB,
-	DATA_ADD_REPLICAS,
-	DATA_REWRITE,
-	DATA_PROMOTE,
-};
-
-struct data_opts {
-	u16		target;
-	u8		rewrite_dev;
-	u8		nr_replicas;
-	int		btree_insert_flags;
-};
-
-struct migrate_write {
-	enum btree_id		btree_id;
-	enum data_cmd		data_cmd;
-	struct data_opts	data_opts;
-
-	unsigned		nr_ptrs_reserved;
+struct moving_context {
+	struct bch_fs		*c;
+	struct bch_ratelimit	*rate;
+	struct bch_move_stats	*stats;
+	struct write_point_specifier wp;
+	bool			wait_on_copygc;
+	bool			write_error;
 
-	struct moving_context	*ctxt;
+	/* For waiting on outstanding reads and writes: */
+	struct closure		cl;
+	struct list_head	reads;
 
-	/* what we read: */
-	struct bch_extent_ptr	ptr;
-	u64			offset;
+	/* in flight sectors: */
+	atomic_t		read_sectors;
+	atomic_t		write_sectors;
 
-	struct bch_write_op	op;
+	wait_queue_head_t	wait;
 };
 
-void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
-int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
-			    struct write_point_specifier,
-			    struct bch_io_opts,
-			    enum data_cmd, struct data_opts,
-			    enum btree_id, struct bkey_s_c);
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+			     struct bch_io_opts *, struct data_update_opts *);
 
-typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
-				      struct bkey_s_c,
-				      struct bch_io_opts *, struct data_opts *);
+void bch2_moving_ctxt_exit(struct moving_context *);
+void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
+			   struct bch_ratelimit *, struct bch_move_stats *,
+			   struct write_point_specifier, bool);
 
 int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
 
@@ -58,16 +42,20 @@
 int bch2_move_data(struct bch_fs *,
 		   enum btree_id, struct bpos,
 		   enum btree_id, struct bpos,
 		   struct bch_ratelimit *,
+		   struct bch_move_stats *,
 		   struct write_point_specifier,
-		   move_pred_fn, void *,
-		   struct bch_move_stats *);
+		   bool,
+		   move_pred_fn, void *);
+
+int __bch2_evacuate_bucket(struct moving_context *,
+			   struct bpos, int,
+			   struct data_update_opts);
 int bch2_evacuate_bucket(struct bch_fs *,
 			 struct bpos, int,
+			 struct data_update_opts,
 			 struct bch_ratelimit *,
+			 struct bch_move_stats *,
 			 struct write_point_specifier,
-			 enum data_cmd,
-			 struct data_opts *,
-			 struct bch_move_stats *);
+			 bool);
 
 int bch2_data_job(struct bch_fs *,
 		  struct bch_move_stats *,
 		  struct bch_ioctl_data);
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index efb09e1..f9ad4cb 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -95,11 +95,11 @@ static int bch2_copygc(struct bch_fs *c)
 	struct bch_dev *ca;
 	unsigned dev_idx;
 	size_t heap_size = 0;
-	struct data_opts data_opts = {
-		.nr_replicas		= 1,
-		.btree_insert_flags	= BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
+	struct moving_context ctxt;
+	struct data_update_opts data_opts = {
+		.btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
 	};
-	int ret;
+	int ret = 0;
 
 	bch_move_stats_init(&move_stats, "copygc");
 
@@ -121,26 +121,50 @@ static int bch2_copygc(struct bch_fs *c)
 	}
 
 	if (!h->used) {
-		bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!");
+		s64 wait = S64_MAX, dev_wait;
+		u64 dev_min_wait_fragmented = 0;
+		u64 dev_min_wait_allowed = 0;
+		int dev_min_wait = -1;
+
+		for_each_rw_member(ca, c, dev_idx) {
+			struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+			s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
+					ca->mi.bucket_size) >> 1);
+			s64 fragmented = usage.d[BCH_DATA_user].fragmented;
+
+			dev_wait = max(0LL, allowed - fragmented);
+
+			if (dev_min_wait < 0 || dev_wait < wait) {
+				dev_min_wait		= dev_idx;
+				dev_min_wait_fragmented	= fragmented;
+				dev_min_wait_allowed	= allowed;
+				wait			= dev_wait;
+			}
+		}
+
+		bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu",
+				    dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed);
 		return 0;
 	}
 
 	heap_resort(h, fragmentation_cmp, NULL);
 
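The new diagnostic reports how far the least-loaded device is from triggering copygc: copygc runs once fragmented user data exceeds half the free space, so the remaining wait is allowed - fragmented, clamped at zero. For example, a device with 1000 free buckets of 1024 sectors each has allowed = (1000 * 1024) >> 1 = 512000 sectors; with 100000 sectors currently fragmented, another 412000 sectors must be written before copygc would run there.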
-	while (h->used) {
+	bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
+			      writepoint_ptr(&c->copygc_write_point),
+			      false);
+
+	/* not correct w.r.t. device removal */
+	while (h->used && !ret) {
 		BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
 
-		/* not correct w.r.t. device removal */
-
-		ret = bch2_evacuate_bucket(c, POS(e.dev, e.bucket), e.gen, NULL,
-					   writepoint_ptr(&c->copygc_write_point),
-					   DATA_REWRITE, &data_opts,
-					   &move_stats);
-		if (ret < 0)
-			bch_err(c, "error %i from bch2_move_data() in copygc", ret);
-		if (ret)
-			return ret;
+		ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen,
+					     data_opts);
 	}
 
+	bch2_moving_ctxt_exit(&ctxt);
+
+	if (ret < 0)
+		bch_err(c, "error %i from __bch2_evacuate_bucket() in copygc", ret);
+
 	trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
 	return ret;
 }
@@ -183,10 +206,11 @@ static int bch2_copygc_thread(void *arg)
 	struct bch_fs *c = arg;
 	struct io_clock *clock = &c->io_clock[WRITE];
 	u64 last, wait;
+	int ret = 0;
 
 	set_freezable();
 
-	while (!kthread_should_stop()) {
+	while (!ret && !kthread_should_stop()) {
 		cond_resched();
 
 		if (kthread_wait_freezable(c->copy_gc_enabled))
@@ -205,8 +229,11 @@ static int bch2_copygc_thread(void *arg)
 
 		c->copygc_wait = 0;
 
-		if (bch2_copygc(c))
-			break;
+		c->copygc_running = true;
+		ret = bch2_copygc(c);
+		c->copygc_running = false;
+
+		wake_up(&c->copygc_running_wq);
 	}
 
 	return 0;
@@ -250,4 +277,6 @@ int bch2_copygc_start(struct bch_fs *c)
 
 void bch2_fs_copygc_init(struct bch_fs *c)
 {
+	init_waitqueue_head(&c->copygc_running_wq);
+	c->copygc_running = false;
 }
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index b07f3dc..407b221 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -35,6 +35,7 @@ const char * const bch2_sb_compat[] = {
 
 const char * const bch2_btree_ids[] = {
 	BCH_BTREE_IDS()
+	"interior btree node",
 	NULL
 };
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 54e3575..2f5f49c 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -269,7 +269,7 @@ enum opt_type {
 	  BCH2_NO_SB_OPT,		true,			\
 	  NULL,		"Enable discard/TRIM support")		\
 	x(verbose,			u8,			\
-	  OPT_FS|OPT_MOUNT,					\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,				\
 	  OPT_BOOL(),						\
 	  BCH2_NO_SB_OPT,		false,			\
 	  NULL,		"Extra debugging information during mount/recovery")\
@@ -290,6 +290,11 @@ enum opt_type {
 	  OPT_UINT(0, U32_MAX),					\
 	  BCH_SB_JOURNAL_RECLAIM_DELAY, 100,			\
 	  NULL,		"Delay in milliseconds before automatic journal reclaim")\
+	x(move_bytes_in_flight,		u32,			\
+	  OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME,	\
+	  OPT_UINT(1024, U32_MAX),				\
+	  BCH2_NO_SB_OPT,		1U << 20,		\
+	  NULL,		"Amount of IO to keep in flight by the move path")\
 	x(fsck,				u8,			\
 	  OPT_FS|OPT_MOUNT,					\
 	  OPT_BOOL(),						\
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 23cc46e..31da409 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -22,62 +22,70 @@
- * returns -1 if it should not be moved, or
- * device of pointer that should be moved, if known, or INT_MAX if unknown
+ * returns true if the key should be moved, setting the mask of pointers to
+ * rewrite in data_opts
  */
-static int __bch2_rebalance_pred(struct bch_fs *c,
-				 struct bkey_s_c k,
-				 struct bch_io_opts *io_opts)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+			   struct bkey_s_c k,
+			   struct bch_io_opts *io_opts,
+			   struct data_update_opts *data_opts)
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
+	unsigned i;
+
+	data_opts->rewrite_ptrs		= 0;
+	data_opts->target		= io_opts->background_target;
+	data_opts->extra_replicas	= 0;
+	data_opts->btree_insert_flags	= 0;
 
 	if (io_opts->background_compression &&
-	    !bch2_bkey_is_incompressible(k))
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+	    !bch2_bkey_is_incompressible(k)) {
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+
+		i = 0;
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 			if (!p.ptr.cached &&
 			    p.crc.compression_type !=
 			    bch2_compression_opt_to_type[io_opts->background_compression])
-				return p.ptr.dev;
+				data_opts->rewrite_ptrs |= 1U << i;
+			i++;
+		}
+	}
 
-	if (io_opts->background_target)
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-			if (!p.ptr.cached &&
-			    !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target))
-				return p.ptr.dev;
+	if (io_opts->background_target) {
+		const struct bch_extent_ptr *ptr;
 
-	return -1;
+		i = 0;
+		bkey_for_each_ptr(ptrs, ptr) {
+			if (!ptr->cached &&
+			    !bch2_dev_in_target(c, ptr->dev, io_opts->background_target))
+				data_opts->rewrite_ptrs |= 1U << i;
+			i++;
+		}
+	}
+
+	return data_opts->rewrite_ptrs != 0;
 }
 
 void bch2_rebalance_add_key(struct bch_fs *c,
 			    struct bkey_s_c k,
 			    struct bch_io_opts *io_opts)
 {
-	atomic64_t *counter;
-	int dev;
+	struct data_update_opts update_opts = { 0 };
+	struct bkey_ptrs_c ptrs;
+	const struct bch_extent_ptr *ptr;
+	unsigned i;
 
-	dev = __bch2_rebalance_pred(c, k, io_opts);
-	if (dev < 0)
+	if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
 		return;
 
-	counter = dev < INT_MAX
-		? &bch_dev_bkey_exists(c, dev)->rebalance_work
-		: &c->rebalance.work_unknown_dev;
-
-	if (atomic64_add_return(k.k->size, counter) == k.k->size)
-		rebalance_wakeup(c);
-}
-
-static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
-				    struct bkey_s_c k,
-				    struct bch_io_opts *io_opts,
-				    struct data_opts *data_opts)
-{
-	if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
-		data_opts->target		= io_opts->background_target;
-		data_opts->nr_replicas		= 1;
-		data_opts->btree_insert_flags	= 0;
-		return DATA_ADD_REPLICAS;
-	} else {
-		return DATA_SKIP;
+	i = 0;
+	ptrs = bch2_bkey_ptrs_c(k);
+	bkey_for_each_ptr(ptrs, ptr) {
+		if (update_opts.rewrite_ptrs & (1U << i))
+			if (atomic64_add_return(k.k->size,
+						&bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
+			    k.k->size)
+				rebalance_wakeup(c);
+		i++;
 	}
 }
 
@@ -245,9 +253,10 @@ static int bch2_rebalance_thread(void *arg)
 			       BTREE_ID_NR, POS_MAX,
 			       /* ratelimiting disabled for now */
 			       NULL, /*  &r->pd.rate, */
+			       &move_stats,
 			       writepoint_ptr(&c->rebalance_write_point),
-			       rebalance_pred, NULL,
-			       &move_stats);
+			       true,
+			       rebalance_pred, NULL);
 	}
 
 	return 0;
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 63e8c1c..eea025a 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -919,6 +919,19 @@ fsck_err:
 	return ERR_PTR(ret);
 }
 
+static bool btree_id_is_alloc(enum btree_id id)
+{
+	switch (id) {
+	case BTREE_ID_alloc:
+	case BTREE_ID_backpointers:
+	case BTREE_ID_need_discard:
+	case BTREE_ID_freespace:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int read_btree_roots(struct bch_fs *c)
 {
 	unsigned i;
@@ -930,14 +943,14 @@ static int read_btree_roots(struct bch_fs *c)
 		if (!r->alive)
 			continue;
 
-		if (i == BTREE_ID_alloc &&
+		if (btree_id_is_alloc(i) &&
 		    c->opts.reconstruct_alloc) {
 			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
 			continue;
 		}
 
 		if (r->error) {
-			__fsck_err(c, i == BTREE_ID_alloc
+			__fsck_err(c, btree_id_is_alloc(i)
 				   ? FSCK_CAN_IGNORE : 0,
 				   "invalid btree root %s",
 				   bch2_btree_ids[i]);
@@ -947,7 +960,8 @@ static int read_btree_roots(struct bch_fs *c)
 
 		ret = bch2_btree_root_read(c, i, &r->key, r->level);
 		if (ret) {
-			__fsck_err(c, i == BTREE_ID_alloc
+			__fsck_err(c,
+				   btree_id_is_alloc(i)
 				   ? FSCK_CAN_IGNORE : 0,
 				   "error reading btree root %s",
 				   bch2_btree_ids[i]);
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 4e589c0..2038e35 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -282,7 +282,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 	u32 dst_snapshot, src_snapshot;
 	int ret = 0, ret2 = 0;
 
-	if (!percpu_ref_tryget(&c->writes))
+	if (!percpu_ref_tryget_live(&c->writes))
 		return -EROFS;
 
 	bch2_check_set_feature(c, BCH_FEATURE_reflink);
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index 8f41a06..60b60de 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -729,7 +729,7 @@ err:
 
 static void bch2_delete_dead_snapshots(struct bch_fs *c)
 {
-	if (unlikely(!percpu_ref_tryget(&c->writes)))
+	if (unlikely(!percpu_ref_tryget_live(&c->writes)))
 		return;
 
 	if (!queue_work(system_long_wq, &c->snapshot_delete_work))
@@ -931,7 +931,7 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
-	if (unlikely(!percpu_ref_tryget(&c->writes)))
+	if (unlikely(!percpu_ref_tryget_live(&c->writes)))
 		return -EROFS;
 
 	if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 6d3efda..8501ada 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -89,7 +89,7 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
 					 unsigned dev)
 {
 	BUG_ON(bch2_dev_list_has_dev(*devs, dev));
-	BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+	BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
 	devs->devs[devs->nr++] = dev;
 }
 
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index d72ec06..2c65005 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -182,7 +182,6 @@ read_attribute(journal_debug);
 read_attribute(btree_updates);
 read_attribute(btree_cache);
 read_attribute(btree_key_cache);
-read_attribute(btree_transactions);
 read_attribute(stripes_heap);
 read_attribute(open_buckets);
 
@@ -420,9 +419,6 @@ SHOW(bch2_fs)
 	if (attr == &sysfs_btree_key_cache)
 		bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
 
-	if (attr == &sysfs_btree_transactions)
-		bch2_btree_trans_to_text(out, c);
-
 	if (attr == &sysfs_stripes_heap)
 		bch2_stripes_heap_to_text(out, c);
 
@@ -617,7 +613,6 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_btree_updates,
 	&sysfs_btree_cache,
 	&sysfs_btree_key_cache,
-	&sysfs_btree_transactions,
 	&sysfs_new_stripes,
 	&sysfs_stripes_heap,
 	&sysfs_open_buckets,
@@ -676,7 +671,7 @@ STORE(bch2_fs_opts_dir)
 	 * We don't need to take c->writes for correctness, but it eliminates an
 	 * unsightly error message in the dmesg log when we're RO:
 	 */
-	if (unlikely(!percpu_ref_tryget(&c->writes)))
+	if (unlikely(!percpu_ref_tryget_live(&c->writes)))
 		return -EROFS;
 
 	tmp = kstrdup(buf, GFP_KERNEL);
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 85b8f3d..8ef4b59 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -145,9 +145,10 @@ static int __bch2_strtou64_h(const char *cp, u64 *res)
 	if (f_n > div_u64(U64_MAX, b))
 		return -ERANGE;
 
-	if (v + (f_n * b) / f_d < v)
+	f_n = div_u64(f_n * b, f_d);
+	if (v + f_n < v)
 		return -ERANGE;
-	v += (f_n * b) / f_d;
+	v += f_n;
 
 	*res = v;
 	return cp - start;
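The util.c change computes the fractional contribution of a human-readable suffix once, via div_u64() (in-kernel 64-bit division goes through the div_u64() helpers, so 32-bit builds don't depend on libgcc's division routines), then reuses the result for both the wraparound check and the addition. A worked example, assuming an input like "1.5M" so that v = 1 << 20, f_n = 5, f_d = 10 and multiplier b = 1 << 20:

	f_n = div_u64(5 * (1 << 20), 10);	/* = 524288 */
	/* v + f_n < v would indicate u64 wraparound; 1048576 + 524288 doesn't wrap */
	v += f_n;				/* = 1572864, i.e. 1.5M */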