X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_cache.c;h=b6a716cd4b6d5e865e808d3a1df7ee8e4b792a3b;hb=05408b6f8fea54bf53e68a4ef24291214970f6d0;hp=5d3acba525c2e0ae4145bd9b62cbd96c6c08d4bf;hpb=1f7098c22213bbe66896f390a529223468a3986e;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 5d3acba..b6a716c 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -28,7 +28,7 @@ void bch2_recalc_btree_reserve(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if (c->btree_roots[i].b) reserve += min_t(unsigned, 1, - c->btree_roots[i].b->level) * 8; + c->btree_roots[i].b->c.level) * 8; c->btree_cache.reserve = reserve; } @@ -62,34 +62,43 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct btree *b = obj; const u64 *v = arg->key; - return PTR_HASH(&b->key) == *v ? 0 : 1; + return b->hash_val == *v ? 0 : 1; } static const struct rhashtable_params bch_btree_cache_params = { .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, key.v), - .key_len = sizeof(struct bch_extent_ptr), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), .obj_cmpfn = bch2_btree_cache_cmp_fn, }; -static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { - struct btree_cache *bc = &c->btree_cache; + BUG_ON(b->data || b->aux_data); b->data = kvpmalloc(btree_bytes(c), gfp); if (!b->data) - goto err; + return -ENOMEM; - if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) - goto err; + if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) { + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; + return -ENOMEM; + } - bc->used++; - list_move(&b->list, &bc->freeable); - return; -err: - kvpfree(b->data, btree_bytes(c)); - b->data = NULL; - list_move(&b->list, &bc->freed); + return 0; +} + +static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +{ + struct btree_cache *bc = &c->btree_cache; + + if (!__btree_node_data_alloc(c, b, gfp)) { + bc->used++; + list_move(&b->list, &bc->freeable); + } else { + list_move(&b->list, &bc->freed); + } } static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) @@ -99,7 +108,7 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) return NULL; bkey_btree_ptr_init(&b->key); - six_lock_init(&b->lock); + six_lock_init(&b->c.lock); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); @@ -114,11 +123,14 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); /* Cause future lookups for this node to fail: */ - PTR_HASH(&b->key) = 0; + b->hash_val = 0; } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { + BUG_ON(b->hash_val); + b->hash_val = btree_ptr_hash_val(&b->key); + return rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); } @@ -128,8 +140,8 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, { int ret; - b->level = level; - b->btree_id = id; + b->c.level = level; + b->c.btree_id = id; mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); @@ -144,8 +156,9 @@ __flatten static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) { - return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k), - bch_btree_cache_params); + u64 v = btree_ptr_hash_val(k); + + 
return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); } /* @@ -159,10 +172,10 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) lockdep_assert_held(&bc->lock); - if (!six_trylock_intent(&b->lock)) + if (!six_trylock_intent(&b->c.lock)) return -ENOMEM; - if (!six_trylock_write(&b->lock)) + if (!six_trylock_write(&b->c.lock)) goto out_unlock_intent; if (btree_node_noevict(b)) @@ -199,13 +212,13 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) btree_node_wait_on_io(b); } out: - if (PTR_HASH(&b->key) && !ret) + if (b->hash_val && !ret) trace_btree_node_reap(c, b); return ret; out_unlock: - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); out_unlock_intent: - six_unlock_intent(&b->lock); + six_unlock_intent(&b->c.lock); ret = -ENOMEM; goto out; } @@ -237,7 +250,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, return SHRINK_STOP; /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) mutex_lock(&bc->lock); else if (!mutex_trylock(&bc->lock)) return -1; @@ -263,8 +276,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (++i > 3 && !btree_node_reclaim(c, b)) { btree_node_data_free(c, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); freed++; } } @@ -290,8 +303,8 @@ restart: mutex_unlock(&bc->lock); bch2_btree_node_hash_remove(bc, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); if (freed >= nr) goto out; @@ -520,36 +533,47 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) */ list_for_each_entry(b, &bc->freeable, list) if (!btree_node_reclaim(c, b)) - goto out_unlock; + goto got_node; /* * We never free struct btree itself, just the memory that holds the on * disk node. 
Check the freed list before allocating a new one: */ list_for_each_entry(b, &bc->freed, list) - if (!btree_node_reclaim(c, b)) { - btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); - if (b->data) - goto out_unlock; + if (!btree_node_reclaim(c, b)) + goto got_node; - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + b = NULL; +got_node: + if (b) + list_del_init(&b->list); + mutex_unlock(&bc->lock); + + if (!b) { + b = kzalloc(sizeof(struct btree), GFP_KERNEL); + if (!b) goto err; - } - b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO); - if (!b) - goto err; + bkey_btree_ptr_init(&b->key); + six_lock_init(&b->c.lock); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); + + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); + } + + if (!b->data) { + if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + goto err; + + mutex_lock(&bc->lock); + bc->used++; + mutex_unlock(&bc->lock); + } - BUG_ON(!six_trylock_intent(&b->lock)); - BUG_ON(!six_trylock_write(&b->lock)); -out_unlock: BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_write_in_flight(b)); - - list_del_init(&b->list); - mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); out: b->flags = 0; b->written = 0; @@ -557,14 +581,22 @@ out: b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; - b->uncompacted_whiteout_u64s = 0; bch2_btree_keys_init(b, &c->expensive_debug_checks); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); + memalloc_nofs_restore(flags); return b; err: + mutex_lock(&bc->lock); + + if (b) { + list_add(&b->list, &bc->freed); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } + /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { b = btree_node_cannibalize(c); @@ -578,6 +610,7 @@ err: } mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); return ERR_PTR(-ENOMEM); } @@ -585,6 +618,7 @@ err: static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, + enum btree_id btree_id, unsigned level, enum six_lock_type lock_type, bool sync) @@ -592,60 +626,65 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, struct btree_cache *bc = &c->btree_cache; struct btree *b; + BUG_ON(level + 1 >= BTREE_MAX_DEPTH); /* * Parent node must be locked, else we could read in a btree node that's * been freed: */ - BUG_ON(!btree_node_locked(iter, level + 1)); - BUG_ON(level >= BTREE_MAX_DEPTH); + if (iter && !bch2_btree_node_relock(iter, level + 1)) + return ERR_PTR(-EINTR); b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) return b; bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) { + if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ /* mark as unhashed... */ - PTR_HASH(&b->key) = 0; + b->hash_val = 0; mutex_lock(&bc->lock); list_add(&b->list, &bc->freeable); mutex_unlock(&bc->lock); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); return NULL; } /* - * If the btree node wasn't cached, we can't drop our lock on - * the parent until after it's added to the cache - because - * otherwise we could race with a btree_split() freeing the node - * we're trying to lock. 
+ * Unlock before doing IO: * - * But the deadlock described below doesn't exist in this case, - * so it's safe to not drop the parent lock until here: + * XXX: ideally should be dropping all btree node locks here */ - if (btree_node_read_locked(iter, level + 1)) + if (iter && btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); bch2_btree_node_read(c, b, sync); - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); if (!sync) { - six_unlock_intent(&b->lock); + six_unlock_intent(&b->c.lock); return NULL; } if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->lock); + six_lock_downgrade(&b->c.lock); return b; } +static int lock_node_check_fn(struct six_lock *lock, void *p) +{ + struct btree *b = container_of(lock, struct btree, c.lock); + const struct bkey_i *k = p; + + return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; +} + /** * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. @@ -663,16 +702,11 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, struct btree *b; struct bset_tree *t; - /* - * XXX: locking optimization - * - * we can make the locking looser here - caller can drop lock on parent - * node before locking child node (and potentially blocking): we just - * have to have bch2_btree_node_fill() call relock on the parent and - * return -EINTR if that fails - */ - EBUG_ON(!btree_node_locked(iter, level + 1)); EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -681,7 +715,8 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, iter, k, level, lock_type, true); + b = bch2_btree_node_fill(c, iter, k, iter->btree_id, + level, lock_type, true); /* We raced and found the btree node in the cache */ if (!b) @@ -690,6 +725,7 @@ retry: if (IS_ERR(b)) return b; } else { +lock_node: /* * There's a potential deadlock with splits and insertions into * interior nodes we have to avoid: @@ -711,7 +747,7 @@ retry: * free it: * * To guard against this, btree nodes are evicted from the cache - * when they're freed - and PTR_HASH() is zeroed out, which we + * when they're freed - and b->hash_val is zeroed out, which we * check for after we lock the node. 
* * Then, bch2_btree_node_relock() on the parent will fail - because @@ -721,13 +757,17 @@ retry: if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) + if (!btree_node_lock(b, k->k.p, level, iter, lock_type, + lock_node_check_fn, (void *) k)) { + if (b->hash_val != btree_ptr_hash_val(k)) + goto retry; return ERR_PTR(-EINTR); + } - if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || - b->level != level || + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.level != level || race_fault())) { - six_unlock_type(&b->lock, lock_type); + six_unlock_type(&b->c.lock, lock_type); if (bch2_btree_node_relock(iter, level + 1)) goto retry; @@ -736,6 +776,78 @@ retry: } } + /* XXX: waiting on IO with btree locks held: */ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->c.btree_id != iter->btree_id || + BTREE_NODE_LEVEL(b->data) != level || + bkey_cmp(b->data->max_key, k->k.p)); + + return b; +} + +struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, + const struct bkey_i *k, + enum btree_id btree_id, + unsigned level) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { + b = bch2_btree_node_fill(c, NULL, k, btree_id, + level, SIX_LOCK_read, true); + + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; + + if (IS_ERR(b)) + return b; + } else { +lock_node: + ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); + if (ret) + goto retry; + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.btree_id != btree_id || + b->c.level != level)) { + six_unlock_read(&b->c.lock); + goto retry; + } + } + + /* XXX: waiting on IO with btree locks held: */ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); @@ -750,15 +862,15 @@ retry: } /* avoid atomic set bit if it's not needed: */ - if (btree_node_accessed(b)) + if (!btree_node_accessed(b)) set_btree_node_accessed(b); if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->lock, lock_type); + six_unlock_read(&b->c.lock); return ERR_PTR(-EIO); } - EBUG_ON(b->btree_id != iter->btree_id || + EBUG_ON(b->c.btree_id != btree_id || BTREE_NODE_LEVEL(b->data) != level || bkey_cmp(b->data->max_key, k->k.p)); @@ -776,18 +888,30 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct bkey_packed *k; BKEY_PADDED(k) tmp; struct btree *ret = NULL; - unsigned level = b->level; + unsigned level = b->c.level; parent = btree_iter_node(iter, level + 1); if (!parent) return NULL; + /* + * There's a corner case where a btree_iter might have a node locked + * that is just outside its current pos - when + * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. 
+ * + * But the lock ordering checks in __bch2_btree_node_lock() go off of + * iter->pos, not the node's key: so if the iterator is marked as + * needing to be traversed, we risk deadlock if we don't bail out here: + */ + if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) + return ERR_PTR(-EINTR); + if (!bch2_btree_node_relock(iter, level + 1)) { ret = ERR_PTR(-EINTR); goto out; } - node_iter = iter->l[parent->level].iter; + node_iter = iter->l[parent->c.level].iter; k = bch2_btree_node_iter_peek_all(&node_iter, parent); BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); @@ -834,7 +958,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); if (!IS_ERR(ret)) { - six_unlock_intent(&ret->lock); + six_unlock_intent(&ret->c.lock); ret = ERR_PTR(-EINTR); } } @@ -856,8 +980,7 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id, - n1->key.k.p), + BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), n2->data->min_key)); } @@ -879,7 +1002,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, if (b) return; - bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); + bch2_btree_node_fill(c, iter, k, iter->btree_id, + level, SIX_LOCK_read, false); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, @@ -895,7 +1019,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "l %u %llu:%llu - %llu:%llu:\n" " ptrs: ", - b->level, + b->c.level, b->data->min_key.inode, b->data->min_key.offset, b->data->max_key.inode,
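Taken together, the hunks above move the btree node cache away from keying its rhashtable on the raw bch_extent_ptr (via PTR_HASH()) and onto a 64-bit value precomputed once and stored in b->hash_val: the compare callback becomes a plain u64 comparison, and evicting a node only has to zero hash_val to make future lookups miss. A minimal userspace sketch of that scheme, assuming hypothetical node / node_hash_val / cache_find names and a linear scan in place of the rhashtable:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct node_key { uint64_t dev, offset; };

struct node {
        uint64_t        hash_val;       /* 0 means "not hashed" */
        struct node_key key;
};

/*
 * Stand-in for btree_ptr_hash_val(): any stable 64-bit hash of the lookup
 * key works for the illustration (FNV-1a here), as long as 0 stays
 * reserved for "unhashed".
 */
static uint64_t node_hash_val(const struct node_key *k)
{
        const unsigned char *p = (const unsigned char *) k;
        uint64_t h = 0xcbf29ce484222325ULL;
        size_t i;

        for (i = 0; i < sizeof(*k); i++)
                h = (h ^ p[i]) * 0x100000001b3ULL;
        return h ? h : 1;
}

/* Lookup compares only the u64, like bch2_btree_cache_cmp_fn() now does */
static struct node *cache_find(struct node *nodes, size_t n,
                               const struct node_key *k)
{
        uint64_t v = node_hash_val(k);
        size_t i;

        for (i = 0; i < n; i++)
                if (nodes[i].hash_val == v)
                        return &nodes[i];
        return NULL;
}

int main(void)
{
        struct node_key k = { .dev = 1, .offset = 4096 };
        struct node cache[1] = { { .hash_val = node_hash_val(&k), .key = k } };

        printf("found:       %p\n", (void *) cache_find(cache, 1, &k));

        cache[0].hash_val = 0;  /* evict: future lookups for this node fail */
        printf("after evict: %p\n", (void *) cache_find(cache, 1, &k));
        return 0;
}

In the real code the value lives in the rhashtable (key_offset/key_len now point at hash_val) and is compared in bch2_btree_cache_cmp_fn(); the linear scan above only keeps the sketch self-contained.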
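The new lock_node_check_fn(), together with the hash_val checks in bch2_btree_node_get() and bch2_btree_node_get_noiter(), form a lock-then-revalidate pattern: a node found in the cache can be evicted (hash_val zeroed) or reused while the caller blocks on its lock, so the identity is re-checked once the lock is held and the lookup is retried on a mismatch; in the hunks above a failed btree_node_lock() is likewise retried when b->hash_val no longer matches the key, and only otherwise returns -EINTR. A rough standalone sketch of the pattern, with a pthread mutex standing in for the six lock and a single-slot cache standing in for the hash table (all names hypothetical):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct node {
        uint64_t        hash_val;       /* 0 == unhashed/evicted */
        pthread_mutex_t lock;
};

/* single-slot stand-in for the btree node cache */
static struct node cache_slot = {
        .hash_val       = 42,
        .lock           = PTHREAD_MUTEX_INITIALIZER,
};

static struct node *cache_find(uint64_t hash_val)
{
        return cache_slot.hash_val == hash_val ? &cache_slot : NULL;
}

/* returns the node locked, or NULL if it is no longer cached */
static struct node *node_get(uint64_t hash_val)
{
        struct node *b;
retry:
        b = cache_find(hash_val);
        if (!b)
                return NULL;    /* caller would read the node in from disk */

        pthread_mutex_lock(&b->lock);

        /*
         * While we slept on the lock the node may have been evicted (its
         * hash_val zeroed, as bch2_btree_node_hash_remove() does) or even
         * reused for a different key: re-check before trusting it.
         */
        if (b->hash_val != hash_val) {
                pthread_mutex_unlock(&b->lock);
                goto retry;
        }
        return b;
}

int main(void)
{
        struct node *b = node_get(42);

        printf("got:         %p\n", (void *) b);
        if (b) {
                b->hash_val = 0;                /* evict while locked */
                pthread_mutex_unlock(&b->lock);
        }
        printf("after evict: %p\n", (void *) node_get(42));
        return 0;
}

The kernel code gets the same effect without a separate post-lock branch in the noiter path by passing lock_node_check_fn() into six_lock_read(), so the wait can be abandoned as soon as the node stops matching the key instead of sleeping on a lock that can no longer be useful.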
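__btree_node_data_alloc() is split out above so that the buffer and the auxiliary-key allocations either both succeed or are both undone, which lets bch2_btree_node_mem_alloc() attach data to a node taken off the freed list without going through the freeable-list bookkeeping. A minimal sketch of that two-step allocate-with-rollback shape, using malloc/free in place of kvpmalloc/kvpfree and a hypothetical node_data_alloc():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
        void *data;             /* on-disk node buffer */
        void *aux_data;         /* lookup tables built over it */
};

/*
 * Either both allocations succeed, or the node is left exactly as it was
 * (both pointers NULL) and -ENOMEM is returned, mirroring the error
 * handling in the new __btree_node_data_alloc().
 */
static int node_data_alloc(struct node *b, size_t data_bytes, size_t aux_bytes)
{
        b->data = malloc(data_bytes);
        if (!b->data)
                return -ENOMEM;

        b->aux_data = malloc(aux_bytes);
        if (!b->aux_data) {
                free(b->data);          /* roll back the partial allocation */
                b->data = NULL;
                return -ENOMEM;
        }
        return 0;
}

int main(void)
{
        struct node b = { NULL, NULL };

        printf("alloc: %d\n", node_data_alloc(&b, 4096, 4096));
        free(b.aux_data);
        free(b.data);
        return 0;
}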