-846600a41b7853588796a5403b07347d36c5a65c
+2e70771b8dc0d0f2d0356a5a7d16cab9430cd49e
struct bkey_s_c k;
char buf[512];
- for_each_btree_key(&iter, c, btree_id, start, k) {
+ for_each_btree_key(&iter, c, btree_id, start,
+ BTREE_ITER_PREFETCH, k) {
if (bkey_cmp(k.k->p, end) > 0)
break;
closure_init_stack(&cl);
- bio_init(&bio.bio);
- bio.bio.bi_max_vecs = 1;
- bio.bio.bi_io_vec = &bv;
+ bio_init(&bio.bio, &bv, 1);
bio.bio.bi_iter.bi_size = len;
bch2_bio_map(&bio.bio, buf);
};
struct backing_dev_info {
+ struct list_head bdi_list;
unsigned ra_pages;
unsigned capabilities;
return bio_clone_bioset(bio, gfp_mask, NULL);
}
-static inline void bio_init(struct bio *bio)
+static inline void bio_init(struct bio *bio, struct bio_vec *table,
+ unsigned short max_vecs)
{
memset(bio, 0, sizeof(*bio));
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
+
+ bio->bi_io_vec = table;
+ bio->bi_max_vecs = max_vecs;
}
#endif /* __LINUX_BIO_H */
__entry->dev = bio->bi_bdev->bd_dev;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
- bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
),
TP_printk("%d,%d %s %llu + %u",
__entry->inode = inode;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
- bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
__entry->delay = delay;
),
static int prio_io(struct bch_dev *ca, uint64_t bucket, int op)
{
- bio_init(ca->bio_prio);
- bio_set_op_attrs(ca->bio_prio, op, REQ_SYNC|REQ_META);
-
- ca->bio_prio->bi_max_vecs = bucket_pages(ca);
- ca->bio_prio->bi_io_vec = ca->bio_prio->bi_inline_vecs;
+ bio_init(ca->bio_prio, ca->bio_prio->bi_inline_vecs, bucket_pages(ca));
+ ca->bio_prio->bi_opf = op|REQ_SYNC|REQ_META;
ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size;
ca->bio_prio->bi_bdev = ca->disk_sb.bdev;
ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca);
return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX;
}
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
+ struct bucket_mark mark)
{
- if (!is_available_bucket(READ_ONCE(g->mark)))
+ if (!is_available_bucket(mark))
return false;
if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1)
* btree GC to rewrite nodes with stale pointers.
*/
-#define bucket_sort_key(g) \
-({ \
- unsigned long prio = g->read_prio - ca->min_prio[READ]; \
- prio = (prio * 7) / (ca->fs->prio_clock[READ].hand - \
- ca->min_prio[READ]); \
- \
- (((prio + 1) * bucket_sectors_used(g)) << 8) | bucket_gc_gen(ca, g);\
-})
+static unsigned long bucket_sort_key(bucket_heap *h,
+ struct bucket_heap_entry e)
+{
+ struct bch_dev *ca = container_of(h, struct bch_dev, alloc_heap);
+ struct bucket *g = ca->buckets + e.bucket;
+ unsigned long prio = g->read_prio - ca->min_prio[READ];
+ prio = (prio * 7) / (ca->fs->prio_clock[READ].hand -
+ ca->min_prio[READ]);
+
+ return (prio + 1) * bucket_sectors_used(e.mark);
+}
+
+static inline int bucket_alloc_cmp(bucket_heap *h,
+ struct bucket_heap_entry l,
+ struct bucket_heap_entry r)
+{
+ return bucket_sort_key(h, l) - bucket_sort_key(h, r);
+}
+
+static inline long bucket_idx_cmp(bucket_heap *h,
+ struct bucket_heap_entry l,
+ struct bucket_heap_entry r)
+{
+ return l.bucket - r.bucket;
+}
static void invalidate_buckets_lru(struct bch_dev *ca)
{
struct bucket_heap_entry e;
struct bucket *g;
- unsigned i;
-
- mutex_lock(&ca->heap_lock);
- ca->heap.used = 0;
+ ca->alloc_heap.used = 0;
mutex_lock(&ca->fs->bucket_lock);
bch2_recalc_min_prio(ca, READ);
* all buckets have been visited.
*/
for_each_bucket(g, ca) {
- if (!bch2_can_invalidate_bucket(ca, g))
+ struct bucket_mark m = READ_ONCE(g->mark);
+ struct bucket_heap_entry e = { g - ca->buckets, m };
+
+ if (!bch2_can_invalidate_bucket(ca, g, m))
continue;
- bucket_heap_push(ca, g, bucket_sort_key(g));
+ heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
}
/* Sort buckets by physical location on disk for better locality */
- for (i = 0; i < ca->heap.used; i++) {
- struct bucket_heap_entry *e = &ca->heap.data[i];
-
- e->val = e->g - ca->buckets;
- }
-
- heap_resort(&ca->heap, bucket_max_cmp);
+ heap_resort(&ca->alloc_heap, bucket_idx_cmp);
/*
* If we run out of buckets to invalidate, bch2_allocator_thread() will
* kick stuff and retry us
*/
while (!fifo_full(&ca->free_inc) &&
- heap_pop(&ca->heap, e, bucket_max_cmp)) {
- BUG_ON(!bch2_can_invalidate_bucket(ca, e.g));
- bch2_invalidate_one_bucket(ca, e.g);
- }
+ heap_pop(&ca->alloc_heap, e, bucket_idx_cmp))
+ bch2_invalidate_one_bucket(ca, &ca->buckets[e.bucket]);
mutex_unlock(&ca->fs->bucket_lock);
- mutex_unlock(&ca->heap_lock);
}
static void invalidate_buckets_fifo(struct bch_dev *ca)
{
+ struct bucket_mark m;
struct bucket *g;
size_t checked = 0;
ca->fifo_last_bucket = ca->mi.first_bucket;
g = ca->buckets + ca->fifo_last_bucket++;
+ m = READ_ONCE(g->mark);
- if (bch2_can_invalidate_bucket(ca, g))
+ if (bch2_can_invalidate_bucket(ca, g, m))
bch2_invalidate_one_bucket(ca, g);
if (++checked >= ca->mi.nbuckets)
static void invalidate_buckets_random(struct bch_dev *ca)
{
+ struct bucket_mark m;
struct bucket *g;
size_t checked = 0;
ca->mi.first_bucket;
g = ca->buckets + n;
+ m = READ_ONCE(g->mark);
- if (bch2_can_invalidate_bucket(ca, g))
+ if (bch2_can_invalidate_bucket(ca, g, m))
bch2_invalidate_one_bucket(ca, g);
if (++checked >= ca->mi.nbuckets / 2)
#ifndef _BCACHE_ALLOC_H
#define _BCACHE_ALLOC_H
+#include "bcachefs.h"
#include "alloc_types.h"
struct bkey;
-#ifndef _BCACHE_H
-#define _BCACHE_H
+#ifndef _BCACHEFS_H
+#define _BCACHEFS_H
/*
* SOME HIGH LEVEL CODE DOCUMENTATION:
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
- struct mutex heap_lock;
- DECLARE_HEAP(struct bucket_heap_entry, heap);
+ bucket_heap alloc_heap;
+ bucket_heap copygc_heap;
/* Moving GC: */
struct task_struct *moving_gc_read;
return c->sb.block_size << 9;
}
-#endif /* _BCACHE_H */
+#endif /* _BCACHEFS_H */
-#ifndef _LINUX_BCACHE_H
-#define _LINUX_BCACHE_H
+#ifndef _BCACHEFS_FORMAT_H
+#define _BCACHEFS_FORMAT_H
/*
* Bcache on disk data structures
*/
-#ifdef __cplusplus
-typedef bool _Bool;
-extern "C" {
-#endif
-
#include <asm/types.h>
#include <asm/byteorder.h>
#include <linux/uuid.h>
};
};
-#ifndef __cplusplus
-
#define KEY(_inode, _offset, _size) \
((struct bkey) { \
.u64s = BKEY_U64s, \
.size = _size, \
})
-#else
-
-static inline struct bkey KEY(__u64 inode, __u64 offset, __u64 size)
-{
- struct bkey ret;
-
- memset(&ret, 0, sizeof(ret));
- ret.u64s = BKEY_U64s;
- ret.format = KEY_FORMAT_CURRENT;
- ret.p.inode = inode;
- ret.p.offset = offset;
- ret.size = size;
-
- return ret;
-}
-
-#endif
-
static inline void bkey_init(struct bkey *k)
{
*k = KEY(0, 0, 0);
};
} __attribute__((packed, aligned(8)));
-#ifdef __cplusplus
-}
-#endif
-#endif /* _LINUX_BCACHE_H */
-
-/* vim: set foldnestmax=2: */
+#endif /* _BCACHEFS_FORMAT_H */
* in one cacheline in t->set (BSET_CACHELINE bytes).
*
* This means we don't have to store the full index of the key that a node in
- * the binary tree points to; eytzinger_to_inorder() gives us the cacheline, and
+ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
* then bkey_float->m gives us the offset within that cacheline, in units of 8
* bytes.
*
unsigned j)
{
return cacheline_to_bkey(b, t,
- __eytzinger_to_inorder(j, t->size, t->extra),
+ __eytzinger1_to_inorder(j, t->size, t->extra),
bkey_float(b, t, j)->key_offset);
}
t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
/* First we figure out where the first key in each cacheline is */
- eytzinger_for_each(j, t->size) {
+ eytzinger1_for_each(j, t->size) {
while (bkey_to_cacheline(b, t, k) < cacheline)
prev = k, k = bkey_next(k);
t->max_key = bkey_unpack_pos(b, k);
/* Then we build the tree */
- eytzinger_for_each(j, t->size)
+ eytzinger1_for_each(j, t->size)
make_bfloat(b, t, j, &min_key, &max_key);
}
do {
p = j ? tree_to_bkey(b, t,
- __inorder_to_eytzinger(j--,
+ __inorder_to_eytzinger1(j--,
t->size, t->extra))
: btree_bkey_first(b, t);
} while (p >= k);
if (inorder &&
inorder < t->size) {
- j = __inorder_to_eytzinger(inorder, t->size, t->extra);
+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
if (k == tree_to_bkey(b, t, j)) {
/* Fix the node this key corresponds to */
make_bfloat(b, t, j, &min_key, &max_key);
/* Children for which this key is the right boundary */
- for (j = eytzinger_left_child(j);
+ for (j = eytzinger1_left_child(j);
j < t->size;
- j = eytzinger_right_child(j))
+ j = eytzinger1_right_child(j))
make_bfloat(b, t, j, &min_key, &max_key);
}
}
if (inorder + 1 < t->size) {
- j = __inorder_to_eytzinger(inorder + 1, t->size, t->extra);
+ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
if (k == tree_to_prev_bkey(b, t, j)) {
make_bfloat(b, t, j, &min_key, &max_key);
/* Children for which this key is the left boundary */
- for (j = eytzinger_right_child(j);
+ for (j = eytzinger1_right_child(j);
j < t->size;
- j = eytzinger_left_child(j))
+ j = eytzinger1_left_child(j))
make_bfloat(b, t, j, &min_key, &max_key);
}
}
p = bkey_float_get(base, n << 4);
prefetch(p);
} else if (n << 3 < t->size) {
- inorder = __eytzinger_to_inorder(n, t->size, t->extra);
+ inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
p = bset_cacheline(b, t, inorder);
#ifdef CONFIG_X86_64
asm(".intel_syntax noprefix;"
&search, packed_search, n);
} while (n < t->size);
- inorder = __eytzinger_to_inorder(n >> 1, t->size, t->extra);
+ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
/*
* n would have been the node we recursed to - the low bit tells us if
return cacheline_to_bkey(b, t, inorder, f->key_offset);
} else {
if (--inorder) {
- n = eytzinger_prev(n >> 1, t->size);
+ n = eytzinger1_prev(n >> 1, t->size);
f = bkey_float_get(base, n);
return cacheline_to_bkey(b, t, inorder, f->key_offset);
} else
if (!bset_has_ro_aux_tree(t))
goto out;
- j = __inorder_to_eytzinger(bkey_to_cacheline(b, t, k), t->size, t->extra);
+ j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra);
if (j &&
j < t->size &&
k == tree_to_bkey(b, t, j))
goto out_unlock;
if (btree_node_dirty(b) ||
- btree_node_write_in_flight(b)) {
+ btree_node_write_in_flight(b) ||
+ btree_node_read_in_flight(b)) {
if (!flush)
goto out_unlock;
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
if (btree_node_read_locked(iter, level + 1))
btree_node_unlock(iter, level + 1);
- bch2_btree_node_read(c, b);
+ bch2_btree_node_read(c, b, true);
six_unlock_write(&b->lock);
if (lock_type == SIX_LOCK_read)
}
}
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+
prefetch(b->aux_data);
for_each_bset(b, t) {
return b;
}
+void bch2_btree_node_prefetch(struct btree_iter *iter,
+ const struct bkey_i *k, unsigned level)
+{
+ struct bch_fs *c = iter->c;
+ struct btree *b;
+
+ BUG_ON(level >= BTREE_MAX_DEPTH);
+
+ rcu_read_lock();
+ b = mca_find(c, k);
+ rcu_read_unlock();
+
+ if (b)
+ return;
+
+ b = bch2_btree_node_mem_alloc(c);
+ if (IS_ERR(b))
+ return;
+
+ bkey_copy(&b->key, k);
+ if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) {
+ /* raced with another fill: */
+
+ /* mark as unhashed... */
+ bkey_i_to_extent(&b->key)->v._data[0] = 0;
+
+ mutex_lock(&c->btree_cache_lock);
+ list_add(&b->list, &c->btree_cache_freeable);
+ mutex_unlock(&c->btree_cache_lock);
+ goto out;
+ }
+
+ bch2_btree_node_read(c, b, false);
+out:
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+}
+
int bch2_print_btree_node(struct bch_fs *c, struct btree *b,
char *buf, size_t len)
{
struct btree *bch2_btree_node_get(struct btree_iter *, const struct bkey_i *,
unsigned, enum six_lock_type);
+void bch2_btree_node_prefetch(struct btree_iter *, const struct bkey_i *,
+ unsigned);
+
void bch2_fs_btree_exit(struct bch_fs *);
int bch2_fs_btree_init(struct bch_fs *);
btree_node_range_checks_init(&r, depth);
- for_each_btree_node(&iter, c, btree_id, POS_MIN, depth, b) {
+ __for_each_btree_node(&iter, c, btree_id, POS_MIN,
+ 0, depth, BTREE_ITER_PREFETCH, b) {
btree_node_range_checks(c, b, &r);
bch2_verify_btree_nr_keys(b);
*/
memset(merge, 0, sizeof(merge));
- __for_each_btree_node(&iter, c, btree_id, POS_MIN, 0, b, U8_MAX) {
+ __for_each_btree_node(&iter, c, btree_id, POS_MIN,
+ U8_MAX, 0, BTREE_ITER_PREFETCH, b) {
memmove(merge + 1, merge,
sizeof(merge) - sizeof(merge[0]));
memmove(lock_seq + 1, lock_seq,
* We have to hit every btree node before starting journal replay, in
* order for the journal seq blacklist machinery to work:
*/
- for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
btree_node_range_checks(c, b, &r);
if (btree_node_has_ptrs(b)) {
btree_node_reset_sib_u64s(b);
out:
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
mempool_free(iter, &c->fill_iter);
return;
err:
goto out;
}
-void bch2_btree_node_read(struct bch_fs *c, struct btree *b)
+static void btree_node_read_work(struct work_struct *work)
+{
+ struct btree_read_bio *rb =
+ container_of(work, struct btree_read_bio, work);
+
+ bch2_btree_node_read_done(rb->c, rb->bio.bi_private,
+ rb->pick.ca, &rb->pick.ptr);
+
+ percpu_ref_put(&rb->pick.ca->io_ref);
+ bio_put(&rb->bio);
+}
+
+static void btree_node_read_endio(struct bio *bio)
+{
+ struct btree *b = bio->bi_private;
+ struct btree_read_bio *rb =
+ container_of(bio, struct btree_read_bio, bio);
+
+ if (bch2_dev_fatal_io_err_on(bio->bi_error,
+ rb->pick.ca, "IO error reading bucket %zu",
+ PTR_BUCKET_NR(rb->pick.ca, &rb->pick.ptr)) ||
+ bch2_meta_read_fault("btree")) {
+ set_btree_node_read_error(b);
+ percpu_ref_put(&rb->pick.ca->io_ref);
+ bio_put(bio);
+ return;
+ }
+
+ INIT_WORK(&rb->work, btree_node_read_work);
+ schedule_work(&rb->work);
+}
+
+void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+ bool sync)
{
uint64_t start_time = local_clock();
- struct bio *bio;
struct extent_pick_ptr pick;
+ struct btree_read_bio *rb;
+ struct bio *bio;
trace_btree_read(c, b);
+ set_btree_node_read_in_flight(b);
pick = bch2_btree_pick_ptr(c, b);
if (bch2_fs_fatal_err_on(!pick.ca, c,
}
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+ rb = container_of(bio, struct btree_read_bio, bio);
+ rb->c = c;
+ rb->pick = pick;
+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_bdev = pick.ca->disk_sb.bdev;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
bch2_bio_map(bio, b->data);
- submit_bio_wait(bio);
+ if (sync) {
+ submit_bio_wait(bio);
- if (bch2_dev_fatal_io_err_on(bio->bi_error,
- pick.ca, "IO error reading bucket %zu",
- PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
- bch2_meta_read_fault("btree")) {
- set_btree_node_read_error(b);
- goto out;
- }
+ if (bch2_dev_fatal_io_err_on(bio->bi_error,
+ pick.ca, "IO error reading bucket %zu",
+ PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
+ bch2_meta_read_fault("btree")) {
+ set_btree_node_read_error(b);
+ goto out;
+ }
- bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr);
- bch2_time_stats_update(&c->btree_read_time, start_time);
+ bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr);
+ bch2_time_stats_update(&c->btree_read_time, start_time);
out:
- bio_put(bio);
- percpu_ref_put(&pick.ca->io_ref);
+ bio_put(bio);
+ percpu_ref_put(&pick.ca->io_ref);
+ } else {
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = b;
+ submit_bio(bio);
+ }
}
int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
bkey_copy(&b->key, k);
BUG_ON(bch2_btree_node_hash_insert(c, b, level, id));
- bch2_btree_node_read(c, b);
+ bch2_btree_node_read(c, b, true);
six_unlock_write(&b->lock);
if (btree_node_read_error(b)) {
wbio->put_bio = true;
wbio->order = order;
wbio->used_mempool = used_mempool;
+ bio->bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
bio->bi_iter.bi_size = sectors_to_write << 9;
bio->bi_end_io = btree_node_write_endio;
bio->bi_private = b;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
if (parent)
closure_get(parent);
#ifndef _BCACHE_BTREE_IO_H
#define _BCACHE_BTREE_IO_H
+#include "extents.h"
+
struct bch_fs;
struct btree_write;
struct btree;
struct btree_iter;
+struct btree_read_bio {
+ struct bch_fs *c;
+ struct extent_pick_ptr pick;
+ struct work_struct work;
+ struct bio bio;
+};
+
static inline void btree_node_io_unlock(struct btree *b)
{
EBUG_ON(!btree_node_write_in_flight(b));
void bch2_btree_node_read_done(struct bch_fs *, struct btree *,
struct bch_dev *, const struct bch_extent_ptr *);
-void bch2_btree_node_read(struct bch_fs *, struct btree *);
+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
- linked->locks_want = max(linked->locks_want,
- iter->locks_want);
+ linked->locks_want = max_t(unsigned,
+ linked->locks_want,
+ iter->locks_want);
return false;
}
*/
if (linked->btree_id == iter->btree_id &&
level > __fls(linked->nodes_locked)) {
- linked->locks_want = max(linked->locks_want,
- iter->locks_want);
+ linked->locks_want = max_t(unsigned,
+ linked->locks_want,
+ iter->locks_want);
return false;
}
}
static int __bch2_btree_iter_unlock(struct btree_iter *iter)
{
- BUG_ON(iter->error == -EINTR);
-
while (iter->nodes_locked)
btree_node_unlock(iter, __ffs(iter->nodes_locked));
- return iter->error;
+ return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
}
int bch2_btree_iter_unlock(struct btree_iter *iter)
? bch2_btree_node_iter_prev(&tmp, b)
: bch2_btree_node_iter_prev_all(&tmp, b);
if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k,
- iter->is_extents)) {
+ iter->flags & BTREE_ITER_IS_EXTENTS)) {
char buf[100];
struct bkey uk = bkey_unpack_key(b, k);
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k,
- iter->is_extents)) {
+ iter->flags & BTREE_ITER_IS_EXTENTS)) {
char buf[100];
struct bkey uk = bkey_unpack_key(b, k);
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->is_extents))
+ iter->flags & BTREE_ITER_IS_EXTENTS))
bch2_btree_node_iter_push(node_iter, b, where, end);
return;
found:
if (new_u64s &&
btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->is_extents)) {
+ iter->flags & BTREE_ITER_IS_EXTENTS)) {
set->k = offset;
bch2_btree_node_iter_sort(node_iter, b);
} else if (set->k < offset + clobber_u64s) {
*/
if (b->level && new_u64s && !bkey_deleted(where) &&
btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->is_extents)) {
+ iter->flags & BTREE_ITER_IS_EXTENTS)) {
struct bset_tree *t;
struct bkey_packed *k;
static inline void __btree_iter_init(struct btree_iter *iter,
struct btree *b)
{
- bch2_btree_node_iter_init(&iter->node_iters[b->level], b,
- iter->pos, iter->is_extents,
- btree_node_is_extents(b));
+ bch2_btree_node_iter_init(&iter->node_iters[b->level], b, iter->pos,
+ iter->flags & BTREE_ITER_IS_EXTENTS,
+ btree_node_is_extents(b));
/* Skip to first non whiteout: */
if (b->level)
{
return iter->btree_id == b->btree_id &&
bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
- btree_iter_pos_cmp(iter->pos, &b->key.k, iter->is_extents);
+ btree_iter_pos_cmp(iter->pos, &b->key.k,
+ iter->flags & BTREE_ITER_IS_EXTENTS);
}
static inline void btree_iter_node_set(struct btree_iter *iter,
}
}
+noinline
+static void btree_iter_prefetch(struct btree_iter *iter)
+{
+ struct btree *b = iter->nodes[iter->level + 1];
+ struct btree_node_iter node_iter = iter->node_iters[iter->level + 1];
+ struct bkey_packed *k;
+ BKEY_PADDED(k) tmp;
+ unsigned nr = iter->level ? 1 : 8;
+
+	while (nr--) {
+ bch2_btree_node_iter_advance(&node_iter, b);
+ k = bch2_btree_node_iter_peek(&node_iter, b);
+ if (!k)
+ break;
+
+ bch2_bkey_unpack(b, &tmp.k, k);
+ bch2_btree_node_prefetch(iter, &tmp.k, iter->level);
+ }
+}
+
static inline int btree_iter_down(struct btree_iter *iter)
{
struct btree *b;
iter->level = level;
mark_btree_node_locked(iter, level, lock_type);
btree_iter_node_set(iter, b);
+
+ if (iter->flags & BTREE_ITER_PREFETCH)
+ btree_iter_prefetch(iter);
+
return 0;
}
io_error:
BUG_ON(ret != -EIO);
- iter->error = ret;
+ iter->flags |= BTREE_ITER_ERROR;
iter->nodes[iter->level] = NULL;
goto out;
}
bch2_btree_node_relock(iter, iter->level) &&
btree_iter_pos_cmp(iter->pos,
&iter->nodes[iter->level]->key.k,
- iter->is_extents)))
+ iter->flags & BTREE_ITER_IS_EXTENTS)))
btree_iter_up(iter);
/*
struct bkey_s_c k;
while ((k = __btree_iter_peek_all(iter)).k &&
- !btree_iter_pos_cmp(iter->pos, k.k, iter->is_extents))
+ !btree_iter_pos_cmp(iter->pos, k.k,
+ iter->flags & BTREE_ITER_IS_EXTENTS))
__btree_iter_advance(iter);
}
if (unlikely(!iter->nodes[iter->level]))
return 0;
- iter->at_end_of_leaf = false;
+ iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
ret = __bch2_btree_iter_traverse(iter);
if (unlikely(ret))
struct btree *b;
int ret;
- EBUG_ON(iter->is_extents);
+ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
ret = bch2_btree_iter_traverse(iter);
if (ret)
struct btree *b;
int ret;
- EBUG_ON(iter->is_extents);
+ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
btree_iter_up(iter);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
!btree_iter_pos_cmp_packed(b, &new_pos, k,
- iter->is_extents))
+ iter->flags & BTREE_ITER_IS_EXTENTS))
bch2_btree_node_iter_advance(node_iter, b);
if (!k &&
- !btree_iter_pos_cmp(new_pos, &b->key.k, iter->is_extents))
- iter->at_end_of_leaf = true;
+ !btree_iter_pos_cmp(new_pos, &b->key.k,
+ iter->flags & BTREE_ITER_IS_EXTENTS))
+ iter->flags |= BTREE_ITER_AT_END_OF_LEAF;
iter->pos = new_pos;
}
struct bkey_s_c k;
int ret;
+ EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
+ (iter->btree_id == BTREE_ID_EXTENTS));
+
while (1) {
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret)) {
* iter->pos should always be equal to the key we just
* returned - except extents can straddle iter->pos:
*/
- if (!iter->is_extents ||
+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
return k;
struct bkey n;
int ret;
+ EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
+ (iter->btree_id == BTREE_ID_EXTENTS));
+
while (1) {
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret)) {
bkey_init(&n);
n.p = iter->pos;
- if (iter->is_extents) {
+ if (iter->flags & BTREE_ITER_IS_EXTENTS) {
if (n.p.offset == KEY_OFFSET_MAX) {
iter->pos = bkey_successor(iter->pos);
goto recheck;
}
void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
- enum btree_id btree_id, struct bpos pos,
- unsigned locks_want, unsigned depth)
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned depth,
+ unsigned flags)
{
+ iter->c = c;
+ iter->pos = pos;
+ iter->flags = flags;
+ iter->btree_id = btree_id;
iter->level = depth;
- /* bch2_bkey_ops isn't used much, this would be a cache miss */
- /* iter->is_extents = bch2_bkey_ops[btree_id]->is_extents; */
- iter->is_extents = btree_id == BTREE_ID_EXTENTS;
+ iter->locks_want = min(locks_want, BTREE_MAX_DEPTH);
iter->nodes_locked = 0;
iter->nodes_intent_locked = 0;
- iter->locks_want = min(locks_want, BTREE_MAX_DEPTH);
- iter->btree_id = btree_id;
- iter->at_end_of_leaf = 0;
- iter->error = 0;
- iter->c = c;
- iter->pos = pos;
memset(iter->nodes, 0, sizeof(iter->nodes));
iter->nodes[iter->level] = BTREE_ITER_NOT_END;
iter->next = iter;
#include "btree_types.h"
-struct btree_iter {
- /* Current btree depth */
- u8 level;
-
- /*
- * Used in bch2_btree_iter_traverse(), to indicate whether we're
- * searching for @pos or the first key strictly greater than @pos
- */
- u8 is_extents;
- /* Bitmasks for read/intent locks held per level */
- u8 nodes_locked;
- u8 nodes_intent_locked;
+#define BTREE_ITER_INTENT (1 << 0)
+#define BTREE_ITER_WITH_HOLES (1 << 1)
+#define BTREE_ITER_PREFETCH (1 << 2)
+/*
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
+ * @pos or the first key strictly greater than @pos
+ */
+#define BTREE_ITER_IS_EXTENTS (1 << 3)
+/*
+ * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
+ */
+#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
+#define BTREE_ITER_ERROR (1 << 5)
- /* Btree level below which we start taking intent locks */
- u8 locks_want;
+/*
+ * @pos - iterator's current position
+ * @level - current btree depth
+ * @locks_want - btree level below which we start taking intent locks
+ * @nodes_locked - bitmask indicating which nodes in @nodes are locked
+ * @nodes_intent_locked - bitmask indicating which locks are intent locks
+ */
+struct btree_iter {
+ struct bch_fs *c;
+ struct bpos pos;
+ u8 flags;
enum btree_id btree_id:8;
-
- /*
- * indicates we need to call bch2_btree_iter_traverse() to revalidate
- * iterator:
- */
- u8 at_end_of_leaf;
-
- s8 error;
-
- struct bch_fs *c;
-
- /* Current position of the iterator */
- struct bpos pos;
+ unsigned level:4,
+ locks_want:4,
+ nodes_locked:4,
+ nodes_intent_locked:4;
u32 lock_seq[BTREE_MAX_DEPTH];
void bch2_btree_iter_rewind(struct btree_iter *, struct bpos);
void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *,
- enum btree_id, struct bpos, unsigned , unsigned);
+ enum btree_id, struct bpos,
+			unsigned, unsigned, unsigned);
static inline void bch2_btree_iter_init(struct btree_iter *iter,
- struct bch_fs *c,
- enum btree_id btree_id,
- struct bpos pos)
-{
- __bch2_btree_iter_init(iter, c, btree_id, pos, 0, 0);
-}
-
-static inline void bch2_btree_iter_init_intent(struct btree_iter *iter,
- struct bch_fs *c,
- enum btree_id btree_id,
- struct bpos pos)
+ struct bch_fs *c, enum btree_id btree_id,
+ struct bpos pos, unsigned flags)
{
- __bch2_btree_iter_init(iter, c, btree_id, pos, 1, 0);
+ __bch2_btree_iter_init(iter, c, btree_id, pos,
+ flags & BTREE_ITER_INTENT ? 1 : 0, 0,
+ btree_id == BTREE_ID_EXTENTS
+ ? BTREE_ITER_IS_EXTENTS : 0);
}
void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
return __btree_iter_cmp(l->btree_id, l->pos, r);
}
-#define __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, \
- _b, _locks_want) \
- for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), \
- _start, _locks_want, _depth), \
- (_iter)->is_extents = false, \
+#define __for_each_btree_node(_iter, _c, _btree_id, _start, \
+ _locks_want, _depth, _flags, _b) \
+ for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
+ _locks_want, _depth, _flags), \
_b = bch2_btree_iter_peek_node(_iter); \
(_b); \
(_b) = bch2_btree_iter_next_node(_iter, _depth))
-#define for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b) \
- __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b, 0)
+#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \
+ __for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b)
-#define __for_each_btree_key(_iter, _c, _btree_id, _start, \
- _k, _locks_want) \
- for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), \
- _start, _locks_want, 0); \
- !IS_ERR_OR_NULL(((_k) = bch2_btree_iter_peek(_iter)).k); \
+#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \
+ for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \
+ (_start), (_flags)); \
+ !IS_ERR_OR_NULL(((_k) = (((_flags) & BTREE_ITER_WITH_HOLES)\
+ ? bch2_btree_iter_peek_with_holes(_iter)\
+ : bch2_btree_iter_peek(_iter))).k); \
bch2_btree_iter_advance_pos(_iter))
-#define for_each_btree_key(_iter, _c, _btree_id, _start, _k) \
- __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 0)
-
-#define for_each_btree_key_intent(_iter, _c, _btree_id, _start, _k) \
- __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 1)
-
-#define __for_each_btree_key_with_holes(_iter, _c, _btree_id, \
- _start, _k, _locks_want) \
- for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), \
- _start, _locks_want, 0); \
- !IS_ERR_OR_NULL(((_k) = bch2_btree_iter_peek_with_holes(_iter)).k);\
- bch2_btree_iter_advance_pos(_iter))
-
-#define for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k) \
- __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 0)
-
-#define for_each_btree_key_with_holes_intent(_iter, _c, _btree_id, \
- _start, _k) \
- __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 1)
-
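For illustration (not part of the patch; the caller and inum are hypothetical), the removed _intent/_with_holes variants map directly onto the new flags argument:

	/* before */
	for_each_btree_key_with_holes_intent(&iter, c, BTREE_ID_INODES,
					     POS(inum, 0), k)
		...

	/* after */
	for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(inum, 0),
			   BTREE_ITER_WITH_HOLES|BTREE_ITER_INTENT, k)
		...

BTREE_ITER_WITH_HOLES selects bch2_btree_iter_peek_with_holes() inside the macro, and BTREE_ITER_INTENT is turned into locks_want = 1 by bch2_btree_iter_init() above.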
static inline int btree_iter_err(struct bkey_s_c k)
{
return IS_ERR(k.k) ? PTR_ERR(k.k) : 0;
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
enum btree_flags {
+ BTREE_NODE_read_in_flight,
BTREE_NODE_read_error,
BTREE_NODE_write_error,
BTREE_NODE_dirty,
BTREE_NODE_just_written,
};
+BTREE_FLAG(read_in_flight);
BTREE_FLAG(read_error);
BTREE_FLAG(write_error);
BTREE_FLAG(dirty);
* traversed again
*/
trans_for_each_entry(trans, i)
- if (i->iter->at_end_of_leaf)
+ if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF)
goto out;
trans_for_each_entry(trans, i)
struct btree_iter iter;
int ret, ret2;
- bch2_btree_iter_init_intent(&iter, c, id, bkey_start_pos(&k->k));
+ bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&iter);
if (unlikely(ret))
EBUG_ON(id == BTREE_ID_EXTENTS);
- bch2_btree_iter_init_intent(&iter, c, id, k->k.p);
+ bch2_btree_iter_init(&iter, c, id, k->k.p,
+ BTREE_ITER_INTENT);
u = bch2_btree_iter_peek_with_holes(&iter);
ret = btree_iter_err(u);
struct bkey_s_c k;
int ret = 0;
- bch2_btree_iter_init_intent(&iter, c, id, start);
+ bch2_btree_iter_init(&iter, c, id, start,
+ BTREE_ITER_INTENT);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = btree_iter_err(k))) {
delete.k.p = iter.pos;
delete.k.version = version;
- if (iter.is_extents) {
+ if (iter.flags & BTREE_ITER_IS_EXTENTS) {
/*
* The extents btree is special - KEY_TYPE_DISCARD is
* used for deletions, not KEY_TYPE_DELETED. This is an
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
- new.copygc = 0;
new.gen++;
}));
return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen);
}
-/* bucket heaps */
-
-static inline bool bucket_min_cmp(struct bucket_heap_entry l,
- struct bucket_heap_entry r)
-{
- return l.val < r.val;
-}
-
-static inline bool bucket_max_cmp(struct bucket_heap_entry l,
- struct bucket_heap_entry r)
-{
- return l.val > r.val;
-}
-
-static inline void bucket_heap_push(struct bch_dev *ca, struct bucket *g,
- unsigned long val)
-{
- struct bucket_heap_entry new = { g, val };
-
- if (!heap_full(&ca->heap))
- heap_add(&ca->heap, new, bucket_min_cmp);
- else if (bucket_min_cmp(new, heap_peek(&ca->heap))) {
- ca->heap.data[0] = new;
- heap_sift(&ca->heap, 0, bucket_min_cmp);
- }
-}
-
/* bucket gc marks */
/* The dirty and cached sector counts saturate. If this occurs,
* GC must be performed. */
#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
-static inline bool bucket_unused(struct bucket *g)
+static inline unsigned bucket_sectors_used(struct bucket_mark mark)
{
- return !g->mark.counter;
+ return mark.dirty_sectors + mark.cached_sectors;
}
-static inline unsigned bucket_sectors_used(struct bucket *g)
+static inline bool bucket_unused(struct bucket_mark mark)
{
- return g->mark.dirty_sectors + g->mark.cached_sectors;
+ return !mark.owned_by_allocator &&
+ !mark.data_type &&
+ !bucket_sectors_used(mark);
}
/* Per device stats: */
#ifndef _BUCKETS_TYPES_H
#define _BUCKETS_TYPES_H
+#include "util.h"
+
enum bucket_data_type {
BUCKET_DATA = 0,
BUCKET_BTREE,
struct {
u8 gen;
- /* generation copygc is going to move this bucket into */
- unsigned copygc:1;
-
unsigned journal_seq_valid:1;
/*
};
struct bucket_heap_entry {
- struct bucket *g;
- unsigned long val;
+ size_t bucket;
+ struct bucket_mark mark;
};
+typedef HEAP(struct bucket_heap_entry) bucket_heap;
+
/*
* A reservation for space on disk:
*/
#include <linux/kthread.h>
#include <linux/preempt.h>
-static inline bool io_timer_cmp(struct io_timer *l, struct io_timer *r)
+static inline long io_timer_cmp(io_timer_heap *h,
+ struct io_timer *l,
+ struct io_timer *r)
{
- return time_after(l->expire, r->expire);
+ return l->expire - r->expire;
}
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
/* Amount to buffer up on a percpu counter */
#define IO_CLOCK_PCPU_SECTORS 128
+typedef HEAP(struct io_timer *) io_timer_heap;
+
struct io_clock {
atomic_long_t now;
u16 __percpu *pcpu_buf;
spinlock_t timer_lock;
- DECLARE_HEAP(struct io_timer *, timers);
+ io_timer_heap timers;
};
#endif /* _BCACHE_CLOCK_TYPES_H */
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
bio->bi_bdev = pick.ca->disk_sb.bdev;
+ bio->bi_opf = REQ_OP_READ|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
bch2_bio_map(bio, n_sorted);
submit_bio_wait(bio);
if (!i->size)
return i->ret;
- bch2_btree_iter_init(&iter, i->c, i->id, i->from);
+ bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(err = btree_iter_err(k))) {
if (!i->size)
return i->ret;
- bch2_btree_iter_init(&iter, i->c, i->id, i->from);
+ bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(err = btree_iter_err(k))) {
bool need_whiteout;
int ret = -ENOMEM;
- bch2_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos);
- bch2_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos);
+ bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos,
+ BTREE_ITER_INTENT);
+ bch2_btree_iter_init(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos,
+ BTREE_ITER_INTENT);
bch2_btree_iter_link(&src_iter, &dst_iter);
- bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos);
+ bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos, 0);
bch2_btree_iter_link(&src_iter, &whiteout_iter);
if (mode == BCH_RENAME_EXCHANGE) {
struct bkey_s_c k;
int ret = 0;
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) {
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) {
if (k.k->p.inode > dir_inum)
break;
pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos);
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
- POS(inode->i_ino, ctx->pos), k) {
+ POS(inode->i_ino, ctx->pos), 0, k) {
if (k.k->type != BCH_DIRENT)
continue;
* Necessary for btree_sort_fixup() - if there are multiple keys that compare
* equal in different sets, we have to process them newest to oldest.
*/
-#define key_sort_cmp(l, r) \
+#define key_sort_cmp(h, l, r) \
({ \
- int _c = bkey_cmp_packed(b, \
- __btree_node_offset_to_key(b, (l).k), \
- __btree_node_offset_to_key(b, (r).k)); \
+ bkey_cmp_packed(b, \
+ __btree_node_offset_to_key(b, (l).k), \
+ __btree_node_offset_to_key(b, (r).k)) \
\
- _c ? _c > 0 : (l).k > (r).k; \
+ ?: (l).k - (r).k; \
})
static inline bool should_drop_next_key(struct btree_node_iter *iter,
return false;
if (iter->used > 2 &&
- key_sort_cmp(r[0], r[1]))
+ key_sort_cmp(iter, r[0], r[1]) >= 0)
r++;
/*
}
sort_key_next(iter, b, iter->data);
- heap_sift(iter, 0, key_sort_cmp);
+ heap_sift_down(iter, 0, key_sort_cmp);
}
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
}
/*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
+ * If keys compare equal, compare by pointer order:
*
* Necessary for sort_fix_overlapping() - if there are multiple keys that
* compare equal in different sets, we have to process them newest to oldest.
*/
-#define extent_sort_cmp(l, r) \
+#define extent_sort_cmp(h, l, r) \
({ \
struct bkey _ul = bkey_unpack_key(b, \
__btree_node_offset_to_key(b, (l).k)); \
struct bkey _ur = bkey_unpack_key(b, \
__btree_node_offset_to_key(b, (r).k)); \
\
- int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur)); \
- _c ? _c > 0 : (l).k < (r).k; \
+ bkey_cmp(bkey_start_pos(&_ul), \
+ bkey_start_pos(&_ur)) ?: (r).k - (l).k; \
})
static inline void extent_sort_sift(struct btree_node_iter *iter,
struct btree *b, size_t i)
{
- heap_sift(iter, i, extent_sort_cmp);
+ heap_sift_down(iter, i, extent_sort_cmp);
}
static inline void extent_sort_next(struct btree_node_iter *iter,
struct btree_node_iter_set *i)
{
sort_key_next(iter, b, i);
- heap_sift(iter, i - iter->data, extent_sort_cmp);
+ heap_sift_down(iter, i - iter->data, extent_sort_cmp);
}
static void extent_sort_append(struct bch_fs *c,
_r = iter->data + 1;
if (iter->used > 2 &&
- extent_sort_cmp(_r[0], _r[1]))
+ extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
_r++;
rk = __btree_node_offset_to_key(b, _r->k);
gc_pos_btree_node(b));
EBUG_ON(bkey_cmp(iter->pos, s->committed));
- EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+ EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
+ !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
bch2_cut_front(iter->pos, insert);
- if (insert->k.size && iter->at_end_of_leaf)
+ if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
ret = BTREE_INSERT_NEED_TRAVERSE;
EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
EBUG_ON(bkey_cmp(iter->pos, s.committed));
- EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+ EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
+ !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
- if (insert->k->k.size && iter->at_end_of_leaf)
+ if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
ret = BTREE_INSERT_NEED_TRAVERSE;
EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
/*
 * Traversal for trees in eytzinger layout - a full binary tree laid out in an
* array
+ */
+
+/*
+ * One based indexing version:
*
- * We used one based indexing, not zero based: with one based indexing, each
- * level of the tree starts at a power of two - leading to better alignment -
- * and it's what you want for implementing next/prev and to/from inorder.
- *
- * To/from inorder also uses 1 based indexing.
+ * With one based indexing each level of the tree starts at a power of two -
+ * good for cacheline alignment:
*
* Size parameter is treated as if we were using 0 based indexing, however:
- * valid nodes, and inorder indices, are in the range [1..size)
+ * valid nodes, and inorder indices, are in the range [1..size) - that is, there
+ * are actually size - 1 elements
*/
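A small worked example (illustrative only, not part of the patch): with size == 8 there are seven valid nodes at one-based indices 1..7, laid out level by level as

	            1
	        2       3
	      4   5   6   7

whose in-order traversal visits the indices 4 2 5 1 6 3 7.  Hence eytzinger1_first(8) == 4, eytzinger1_last(8) == 7, eytzinger1_next(4, 8) == 2, and eytzinger1_to_inorder(4, 8) == 1 while eytzinger1_to_inorder(1, 8) == 4 (inorder_to_eytzinger1() inverts the mapping).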
-static inline unsigned eytzinger_child(unsigned j, unsigned child)
+static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
EBUG_ON(child > 1);
- return (j << 1) + child;
+ return (i << 1) + child;
}
-static inline unsigned eytzinger_left_child(unsigned j)
+static inline unsigned eytzinger1_left_child(unsigned i)
{
- return eytzinger_child(j, 0);
+ return eytzinger1_child(i, 0);
}
-static inline unsigned eytzinger_right_child(unsigned j)
+static inline unsigned eytzinger1_right_child(unsigned i)
{
- return eytzinger_child(j, 1);
+ return eytzinger1_child(i, 1);
}
-static inline unsigned eytzinger_first(unsigned size)
+static inline unsigned eytzinger1_first(unsigned size)
{
return rounddown_pow_of_two(size - 1);
}
-static inline unsigned eytzinger_last(unsigned size)
+static inline unsigned eytzinger1_last(unsigned size)
{
return rounddown_pow_of_two(size) - 1;
}
/*
- * eytzinger_next() and eytzinger_prev() have the nice properties that
+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that
*
- * eytzinger_next(0) == eytzinger_first())
- * eytzinger_prev(0) == eytzinger_last())
+ * eytzinger1_next(0) == eytzinger1_first())
+ * eytzinger1_prev(0) == eytzinger1_last())
*
- * eytzinger_prev(eytzinger_first()) == 0
- * eytzinger_next(eytzinger_last()) == 0
+ * eytzinger1_prev(eytzinger1_first()) == 0
+ * eytzinger1_next(eytzinger1_last()) == 0
*/
-static inline unsigned eytzinger_next(unsigned j, unsigned size)
+static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(j >= size);
+ EBUG_ON(i >= size);
- if (eytzinger_right_child(j) < size) {
- j = eytzinger_right_child(j);
+ if (eytzinger1_right_child(i) < size) {
+ i = eytzinger1_right_child(i);
- j <<= __fls(size) - __fls(j);
- j >>= j >= size;
+ i <<= __fls(size) - __fls(i);
+ i >>= i >= size;
} else {
- j >>= ffz(j) + 1;
+ i >>= ffz(i) + 1;
}
- return j;
+ return i;
}
-static inline unsigned eytzinger_prev(unsigned j, unsigned size)
+static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(j >= size);
+ EBUG_ON(i >= size);
- if (eytzinger_left_child(j) < size) {
- j = eytzinger_left_child(j);
+ if (eytzinger1_left_child(i) < size) {
+ i = eytzinger1_left_child(i);
- j <<= __fls(size) - __fls(j);
- j -= 1;
- j >>= j >= size;
+ i <<= __fls(size) - __fls(i);
+ i -= 1;
+ i >>= i >= size;
} else {
- j >>= __ffs(j) + 1;
+ i >>= __ffs(i) + 1;
}
- return j;
+ return i;
}
-static inline unsigned eytzinger_extra(unsigned size)
+static inline unsigned eytzinger1_extra(unsigned size)
{
return (size - rounddown_pow_of_two(size - 1)) << 1;
}
-static inline unsigned __eytzinger_to_inorder(unsigned j, unsigned size,
+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned extra)
{
- unsigned b = __fls(j);
+ unsigned b = __fls(i);
unsigned shift = __fls(size - 1) - b;
int s;
- EBUG_ON(!j || j >= size);
+ EBUG_ON(!i || i >= size);
- j ^= 1U << b;
- j <<= 1;
- j |= 1;
- j <<= shift;
+ i ^= 1U << b;
+ i <<= 1;
+ i |= 1;
+ i <<= shift;
/*
* sign bit trick:
*
- * if (j > extra)
- * j -= (j - extra) >> 1;
+ * if (i > extra)
+ * i -= (i - extra) >> 1;
*/
- s = extra - j;
- j += (s >> 1) & (s >> 31);
+ s = extra - i;
+ i += (s >> 1) & (s >> 31);
- return j;
+ return i;
}
-static inline unsigned __inorder_to_eytzinger(unsigned j, unsigned size,
- unsigned extra)
+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
+ unsigned extra)
{
unsigned shift;
int s;
- EBUG_ON(!j || j >= size);
+ EBUG_ON(!i || i >= size);
/*
* sign bit trick:
*
- * if (j > extra)
- * j += j - extra;
+ * if (i > extra)
+ * i += i - extra;
*/
- s = extra - j;
- j -= s & (s >> 31);
+ s = extra - i;
+ i -= s & (s >> 31);
- shift = __ffs(j);
+ shift = __ffs(i);
- j >>= shift + 1;
- j |= 1U << (__fls(size - 1) - shift);
+ i >>= shift + 1;
+ i |= 1U << (__fls(size - 1) - shift);
- return j;
+ return i;
}
-static inline unsigned eytzinger_to_inorder(unsigned j, unsigned size)
+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
{
- return __eytzinger_to_inorder(j, size, eytzinger_extra(size));
+ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
}
-static inline unsigned inorder_to_eytzinger(unsigned j, unsigned size)
+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
{
- return __inorder_to_eytzinger(j, size, eytzinger_extra(size));
+ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
}
-#define eytzinger_for_each(_i, _size) \
- for ((_i) = eytzinger_first((_size)); \
+#define eytzinger1_for_each(_i, _size) \
+ for ((_i) = eytzinger1_first((_size)); \
(_i) != 0; \
- (_i) = eytzinger_next((_i), (_size)))
+ (_i) = eytzinger1_next((_i), (_size)))
#if 0
-void eytzinger_test(void)
+void eytzinger0_test(void)
{
unsigned i, j, size;
if (!(size % 4096))
printk(KERN_INFO "tree size %u\n", size);
- assert(eytzinger_prev(0, size) == eytzinger_last(size));
- assert(eytzinger_next(0, size) == eytzinger_first(size));
+ assert(eytzinger1_prev(0, size) == eytzinger1_last(size));
+ assert(eytzinger1_next(0, size) == eytzinger1_first(size));
- assert(eytzinger_prev(eytzinger_first(size), size) == 0);
- assert(eytzinger_next(eytzinger_last(size), size) == 0);
+ assert(eytzinger1_prev(eytzinger1_first(size), size) == 0);
+ assert(eytzinger1_next(eytzinger1_last(size), size) == 0);
- eytzinger_for_each(j, size) {
+ eytzinger1_for_each(j, size) {
assert(from_inorder(i, size) == j);
assert(to_inorder(j, size) == i);
- if (j != eytzinger_last(size)) {
- unsigned next = eytzinger_next(j, size);
+ if (j != eytzinger1_last(size)) {
+ unsigned next = eytzinger1_next(j, size);
- assert(eytzinger_prev(next, size) == j);
+ assert(eytzinger1_prev(next, size) == j);
}
}
}
}
#endif
+/* Zero based indexing version: */
+
+static inline unsigned eytzinger0_child(unsigned i, unsigned child)
+{
+ EBUG_ON(child > 1);
+
+ return (i << 1) + 1 + child;
+}
+
+static inline unsigned eytzinger0_left_child(unsigned i)
+{
+ return eytzinger0_child(i, 0);
+}
+
+static inline unsigned eytzinger0_right_child(unsigned i)
+{
+ return eytzinger0_child(i, 1);
+}
+
+#if 0
+static inline unsigned eytzinger0_first(unsigned size)
+{
+}
+
+static inline unsigned eytzinger0_last(unsigned size)
+{
+}
+
+static inline unsigned eytzinger0_next(unsigned i, unsigned size)
+{
+}
+
+static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
+{
+}
+#endif
+
+static inline unsigned eytzinger0_extra(unsigned size)
+{
+ return (size + 1 - rounddown_pow_of_two(size)) << 1;
+}
+
+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
+ unsigned extra)
+{
+ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
+}
+
+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
+ unsigned extra)
+{
+ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
+}
+
+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
+{
+ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
+{
+ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
+}
+
+#define eytzinger0_find(base, _nr, _size, _cmp, _search) \
+({ \
+ void *_base = base; \
+ size_t _i = 0; \
+ int _res; \
+ \
+ while (_i < (_nr) && \
+ (_res = _cmp(_search, _base + _i * (_size), _size))) \
+ _i = eytzinger0_child(_i, _res > 0); \
+ \
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { \
+ bool found1 = _i < _nr, found2 = false; \
+ unsigned _j; \
+ \
+ for (_j = 0; _j < _nr; _j++) \
+ if (!_cmp(_base + _j * (_size), _search, _size))\
+ found2 = true; \
+ \
+ BUG_ON(found1 != found2); \
+ } \
+ \
+ _i; \
+})
+
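A minimal usage sketch (not part of the patch; cmp_u32, example_lookup and the values are invented, and the usual kernel helpers u32/ARRAY_SIZE are assumed): the array handed to eytzinger0_find() must already be in zero-based eytzinger order, and a hit is signalled by the returned index being less than nr - the convention moving_pred() relies on further down.

	static int cmp_u32(const void *l, const void *r, size_t size)
	{
		return *(const u32 *) l - *(const u32 *) r;
	}

	static bool example_lookup(void)
	{
		/* the sorted values 1..7, laid out in eytzinger0 (BFS) order: */
		u32 tree[] = { 4, 2, 6, 1, 3, 5, 7 };
		u32 search = 5;
		size_t i = eytzinger0_find(tree, ARRAY_SIZE(tree),
					   sizeof(tree[0]), cmp_u32, &search);

		/* i == 5 and i < ARRAY_SIZE(tree), so 5 was found; searching
		 * for 8 would fall off the tree and return an index >= 7. */
		return i < ARRAY_SIZE(tree);
	}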
+void eytzinger0_sort(void *, size_t, size_t,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t));
+
#endif /* _EYTZINGER_H */
BUG_ON(k->k.p.inode != op->ei->vfs_inode.i_ino);
- bch2_btree_iter_init_intent(&extent_iter, wop->c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch2_keylist_front(keys)->k));
- bch2_btree_iter_init_intent(&inode_iter, wop->c, BTREE_ID_INODES,
- POS(extent_iter.pos.inode, 0));
+ bch2_btree_iter_init(&extent_iter, wop->c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_INTENT);
+ bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES,
+ POS(extent_iter.pos.inode, 0),
+ BTREE_ITER_INTENT);
hook.op = op;
hook.hook.fn = bchfs_extent_update_hook;
.mapping = mapping, .nr_pages = nr_pages
};
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0);
INIT_LIST_HEAD(&readpages_iter.pages);
list_add(&readpages_iter.pages, pages);
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
bio_add_page_contig(&rbio->bio, page);
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0);
bchfs_read(c, &iter, rbio, inode, NULL);
}
w->io->op.new_i_size = i_size;
if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->bio.bio.bi_opf |= WRITE_SYNC;
+ w->io->bio.bio.bi_opf |= REQ_SYNC;
/* Before unlocking the page, transfer reservation to w->io: */
old = page_state_cmpxchg(page_state(page), new, {
bio->bi_iter.bi_sector = offset >> 9;
bio->bi_private = dio;
- ret = bio_get_user_pages(bio, iter, 1);
+ ret = bio_iov_iter_get_pages(bio, iter);
if (ret < 0) {
/* XXX: fault inject this path */
bio->bi_error = ret;
bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9;
- ret = bio_get_user_pages(bio, &dio->iter, 0);
+ ret = bio_iov_iter_get_pages(bio, &dio->iter);
if (ret < 0) {
/*
* these didn't get initialized, but bch2_dio_write_done() will
*/
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
POS(inode->i_ino,
- index << (PAGE_SHIFT - 9)), k) {
+ index << (PAGE_SHIFT - 9)), 0, k) {
if (bkey_cmp(bkey_start_pos(k.k),
POS(inode->i_ino,
(index + 1) << (PAGE_SHIFT - 9))) >= 0)
if ((offset | len) & (PAGE_SIZE - 1))
return -EINVAL;
- bch2_btree_iter_init_intent(&dst, c, BTREE_ID_EXTENTS,
- POS(inode->i_ino, offset >> 9));
+ bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, offset >> 9),
+ BTREE_ITER_INTENT);
/* position will be set from dst iter's position: */
- bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN);
+ bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN, 0);
bch2_btree_iter_link(&src, &dst);
/*
unsigned replicas = READ_ONCE(c->opts.data_replicas);
int ret;
- bch2_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+ BTREE_ITER_INTENT);
inode_lock(inode);
inode_dio_wait(inode);
return -ENXIO;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode->i_ino, offset >> 9), k) {
+ POS(inode->i_ino, offset >> 9), 0, k) {
if (k.k->p.inode != inode->i_ino) {
break;
} else if (bkey_extent_is_data(k.k)) {
if (offset >= isize)
return -ENXIO;
- for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
- POS(inode->i_ino, offset >> 9), k) {
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, offset >> 9),
+ BTREE_ITER_WITH_HOLES, k) {
if (k.k->p.inode != inode->i_ino) {
next_hole = bch2_next_pagecache_hole(inode,
offset, MAX_LFS_FILESIZE);
lockdep_assert_held(&ei->update_lock);
- bch2_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0));
+ bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0),
+ BTREE_ITER_INTENT);
do {
struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
return -EINVAL;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode->i_ino, start >> 9), k)
+ POS(inode->i_ino, start >> 9), 0, k)
if (bkey_extent_is_data(k.k) ||
k.k->type == BCH_RESERVATION) {
if (bkey_cmp(bkey_start_pos(k.k),
};
static const struct inode_operations bch_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
static void hash_check_init(const struct bch_hash_desc desc,
struct hash_check *h, struct bch_fs *c)
{
- bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN);
- bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN);
+ bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN, 0);
+ bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN, 0);
}
static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c,
int ret = 0;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(BCACHE_ROOT_INO, 0), k) {
+ POS(BCACHE_ROOT_INO, 0), 0, k) {
if (k.k->type == KEY_TYPE_DISCARD)
continue;
hash_check_init(bch2_dirent_hash_desc, &h, c);
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
- POS(BCACHE_ROOT_INO, 0), k) {
+ POS(BCACHE_ROOT_INO, 0), 0, k) {
struct bkey_s_c_dirent d;
struct bch_inode_unpacked target;
bool have_target;
hash_check_init(bch2_xattr_hash_desc, &h, c);
for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
- POS(BCACHE_ROOT_INO, 0), k) {
+ POS(BCACHE_ROOT_INO, 0), 0, k) {
ret = walk_inode(c, &w, k.k->p.inode);
if (ret)
break;
goto up;
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
- POS(e->inum, e->offset + 1), k) {
+ POS(e->inum, e->offset + 1), 0, k) {
if (k.k->p.inode != e->inum)
break;
path.nr--;
}
- for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
+ for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
if (k.k->type != BCH_INODE_FS ||
!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->i_mode)))
continue;
inc_link(c, links, range_start, range_end, BCACHE_ROOT_INO, false);
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) {
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) {
switch (k.k->type) {
case BCH_DIRENT:
d = bkey_s_c_to_dirent(k);
struct bkey_s_c k;
u64 sectors = 0;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), k) {
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) {
if (k.k->p.inode != inum)
break;
int ret = 0, ret2 = 0;
u64 nlinks_pos;
- bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0));
+ bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0);
genradix_iter_init(&nlinks_iter);
while ((k = bch2_btree_iter_peek(&iter)).k &&
if (*hint == min)
searched_from_start = true;
again:
- bch2_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(*hint, 0));
+ bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(*hint, 0),
+ BTREE_ITER_INTENT);
while (1) {
struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
struct bkey_s_c k;
int ret = -ENOENT;
- for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES,
- POS(inode_nr, 0), k) {
+ for_each_btree_key(&iter, c, BTREE_ID_INODES,
+ POS(inode_nr, 0),
+ BTREE_ITER_WITH_HOLES, k) {
switch (k.k->type) {
case BCH_INODE_FS:
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
struct btree_iter iter;
struct bkey_s_c k;
- for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), k) {
+ for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), 0, k) {
if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
break;
struct btree_iter iter;
int ret;
- bch2_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch2_keylist_front(keys)->k));
+ bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_INTENT);
ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
NULL, op_journal_seq(op),
if (promote_op) {
struct bio *promote_bio = &promote_op->write.wbio.bio;
- bio_init(promote_bio);
- promote_bio->bi_max_vecs = pages;
- promote_bio->bi_io_vec = promote_bio->bi_inline_vecs;
+ bio_init(promote_bio,
+ promote_bio->bi_inline_vecs,
+ pages);
bounce = true;
/* could also set read_full */
}
struct bkey_s_c k;
int ret;
- for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector), k) {
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bvec_iter.bi_sector),
+ BTREE_ITER_WITH_HOLES, k) {
BKEY_PADDED(k) tmp;
struct extent_pick_ptr pick;
unsigned bytes, sectors;
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
- bch2_btree_iter_init(&iter, c, n.btree_id, n.pos);
- iter.is_extents = false;
+ __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
redo_peek:
b = bch2_btree_iter_peek_node(&iter);
struct journal_entry_pin *pin;
u64 pin_seq;
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
+ return;
+
while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq)))
pin->flush(j, pin, pin_seq);
bio = ca->journal.bio;
bio_reset(bio);
bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_opf = REQ_OP_FLUSH;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
closure_bio_submit(bio, cl);
}
atomic_set(&ctxt.error_count, 0);
atomic_set(&ctxt.error_flags, 0);
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+ BTREE_ITER_PREFETCH);
while (!bch2_move_ctxt_wait(&ctxt) &&
(k = bch2_btree_iter_peek(&iter)).k &&
closure_init_stack(&cl);
- for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
retry:
if (!bch2_extent_has_device(e, ca->dev_idx))
return ret; /* btree IO error */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
BUG_ON(bch2_extent_has_device(e, ca->dev_idx));
struct bkey_s_c_extent e;
struct btree_iter iter;
- bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS, POS_MIN);
+ bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS,
+ POS_MIN, BTREE_ITER_PREFETCH);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = btree_iter_err(k))) {
struct btree_iter iter;
int ret = 0;
- bch2_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch2_keylist_front(keys)->k));
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_INTENT);
while (1) {
struct bkey_s_extent insert =
static void migrate_bio_init(struct moving_io *io, struct bio *bio,
unsigned sectors)
{
- bio_init(bio);
+ bio_init(bio, io->bi_inline_vecs,
+ DIV_ROUND_UP(sectors, PAGE_SECTORS));
bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
bio->bi_iter.bi_size = sectors << 9;
- bio->bi_max_vecs = DIV_ROUND_UP(sectors, PAGE_SECTORS);
bio->bi_private = &io->cl;
- bio->bi_io_vec = io->bi_inline_vecs;
bch2_bio_map(bio, NULL);
}
#include "buckets.h"
#include "clock.h"
#include "extents.h"
+#include "eytzinger.h"
#include "io.h"
#include "keylist.h"
#include "move.h"
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
+#include <linux/sort.h>
#include <linux/wait.h>
/* Moving GC - IO loop */
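+/* compare heap entries by bucket index, for eytzinger0_sort() and eytzinger0_find() */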
+static int bucket_idx_cmp(const void *_l, const void *_r, size_t size)
+{
+ const struct bucket_heap_entry *l = _l;
+ const struct bucket_heap_entry *r = _r;
+
+ if (l->bucket < r->bucket)
+ return -1;
+ if (l->bucket > r->bucket)
+ return 1;
+ return 0;
+}
+
static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca,
struct bkey_s_c k)
{
+ bucket_heap *h = &ca->copygc_heap;
const struct bch_extent_ptr *ptr;
if (bkey_extent_is_data(k.k) &&
(ptr = bch2_extent_has_device(bkey_s_c_to_extent(k),
- ca->dev_idx)) &&
- PTR_BUCKET(ca, ptr)->mark.copygc)
- return ptr;
+ ca->dev_idx))) {
+ struct bucket_heap_entry search = {
+ .bucket = PTR_BUCKET_NR(ca, ptr)
+ };
+
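+ /* only move extents whose bucket was picked for copygc: */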
+ size_t i = eytzinger0_find(h->data, h->used,
+ sizeof(h->data[0]),
+ bucket_idx_cmp, &search);
+
+ if (i < h->used)
+ return ptr;
+ }
return NULL;
}
u64 sectors_to_move)
{
struct bch_fs *c = ca->fs;
- struct bucket *g;
+ bucket_heap *h = &ca->copygc_heap;
struct moving_context ctxt;
struct btree_iter iter;
struct bkey_s_c k;
u64 sectors_not_moved = 0;
size_t buckets_not_moved = 0;
+ struct bucket_heap_entry *i;
bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
SECTORS_IN_FLIGHT_PER_DEVICE);
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+ BTREE_ITER_PREFETCH);
while (1) {
if (kthread_should_stop())
buckets_to_move);
/* don't check this if we bailed out early: */
- for_each_bucket(g, ca)
- if (g->mark.copygc && bucket_sectors_used(g)) {
- sectors_not_moved += bucket_sectors_used(g);
+ for (i = h->data; i < h->data + h->used; i++) {
+ struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark);
+
+ if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
+ sectors_not_moved += bucket_sectors_used(m);
buckets_not_moved++;
}
+ }
if (sectors_not_moved)
bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
return ret;
}
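+/* order heap entries by how many sectors of the bucket are in use */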
+static inline int sectors_used_cmp(bucket_heap *heap,
+ struct bucket_heap_entry l,
+ struct bucket_heap_entry r)
+{
+ return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
+}
+
static void bch2_moving_gc(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
struct bucket *g;
- struct bucket_mark new;
- u64 sectors_to_move;
+ u64 sectors_to_move = 0;
size_t buckets_to_move, buckets_unused = 0;
- struct bucket_heap_entry e;
- unsigned sectors_used, i;
+ struct bucket_heap_entry e, *i;
int reserve_sectors;
if (!have_copygc_reserve(ca)) {
*/
/*
- * We need bucket marks to be up to date, so gc can't be recalculating
- * them, and we don't want the allocator invalidating a bucket after
- * we've decided to evacuate it but before we set copygc:
+ * We need bucket marks to be up to date - gc can't be recalculating
+ * them:
*/
down_read(&c->gc_lock);
- mutex_lock(&ca->heap_lock);
- mutex_lock(&ca->fs->bucket_lock);
-
- ca->heap.used = 0;
+ ca->copygc_heap.used = 0;
for_each_bucket(g, ca) {
- bucket_cmpxchg(g, new, new.copygc = 0);
+ struct bucket_mark m = READ_ONCE(g->mark);
+ struct bucket_heap_entry e = { g - ca->buckets, m };
- if (bucket_unused(g)) {
+ if (bucket_unused(m)) {
buckets_unused++;
continue;
}
- if (g->mark.owned_by_allocator ||
- g->mark.data_type != BUCKET_DATA)
+ if (m.owned_by_allocator ||
+ m.data_type != BUCKET_DATA)
continue;
- sectors_used = bucket_sectors_used(g);
-
- if (sectors_used >= ca->mi.bucket_size)
+ if (bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
- bucket_heap_push(ca, g, sectors_used);
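+ /*
+ * -sectors_used_cmp makes this a max-heap on sectors used, so when the
+ * heap is full the fullest bucket is evicted and the emptiest buckets
+ * are kept as copygc candidates:
+ */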
+ heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp);
}
+ up_read(&c->gc_lock);
- sectors_to_move = 0;
- for (i = 0; i < ca->heap.used; i++)
- sectors_to_move += ca->heap.data[i].val;
+ for (i = ca->copygc_heap.data;
+ i < ca->copygc_heap.data + ca->copygc_heap.used;
+ i++)
+ sectors_to_move += bucket_sectors_used(i->mark);
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
- BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp));
- sectors_to_move -= e.val;
+ BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp));
+ sectors_to_move -= bucket_sectors_used(e.mark);
}
- for (i = 0; i < ca->heap.used; i++)
- bucket_cmpxchg(ca->heap.data[i].g, new, new.copygc = 1);
+ buckets_to_move = ca->copygc_heap.used;
- buckets_to_move = ca->heap.used;
-
- mutex_unlock(&ca->fs->bucket_lock);
- mutex_unlock(&ca->heap_lock);
- up_read(&c->gc_lock);
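+ /*
+ * Sort the candidate buckets by index so moving_pred() can look them
+ * up with eytzinger0_find():
+ */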
+ eytzinger0_sort(ca->copygc_heap.data,
+ ca->copygc_heap.used,
+ sizeof(ca->copygc_heap.data[0]),
+ bucket_idx_cmp, NULL);
read_moving(ca, buckets_to_move, sectors_to_move);
}
break;
}
- cpu_relax_lowlatency();
+ cpu_relax();
}
rcu_read_unlock();
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- cpu_relax_lowlatency();
+ cpu_relax();
}
osq_unlock(&lock->osq);
struct btree_iter *iter, const void *key)
{
bch2_btree_iter_init(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
+ POS(inode, desc.hash_key(info, key)), 0);
return bch2_hash_lookup_at(desc, info, iter, key);
}
struct bch_fs *c, u64 inode,
struct btree_iter *iter, const void *key)
{
- bch2_btree_iter_init_intent(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
+ bch2_btree_iter_init(iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)),
+ BTREE_ITER_INTENT);
return bch2_hash_lookup_at(desc, info, iter, key);
}
struct btree_iter *iter,
const void *key)
{
- bch2_btree_iter_init_intent(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
+ bch2_btree_iter_init(iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)),
+ BTREE_ITER_INTENT);
return bch2_hash_hole_at(desc, iter);
}
struct bkey_s_c k;
int ret;
- bch2_btree_iter_init_intent(&hashed_slot, c, desc.btree_id,
- POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))));
- bch2_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos);
+ bch2_btree_iter_init(&hashed_slot, c, desc.btree_id,
+ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+ BTREE_ITER_INTENT);
+ bch2_btree_iter_init(&iter, c, desc.btree_id, hashed_slot.pos,
+ BTREE_ITER_INTENT);
bch2_btree_iter_link(&hashed_slot, &iter);
retry:
/*
int ret = -ENOENT;
bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id,
- iter->pos);
+ iter->pos, 0);
bch2_btree_iter_link(iter, &whiteout_iter);
ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter);
struct bkey_s_c k;
int ret = -ENOENT;
- bch2_btree_iter_init_intent(&iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
+ bch2_btree_iter_init(&iter, c, desc.btree_id,
+ POS(inode, desc.hash_key(info, key)),
+ BTREE_ITER_INTENT);
bch2_btree_iter_init(&whiteout_iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
+ POS(inode, desc.hash_key(info, key)), 0);
bch2_btree_iter_link(&iter, &whiteout_iter);
retry:
k = bch2_hash_lookup_at(desc, info, &iter, key);
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
- bdi_destroy(&c->bdi);
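+ /* skip teardown if the bdi was never initialized */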
+ if (c->bdi.bdi_list.next)
+ bdi_destroy(&c->bdi);
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_interior_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->btree_read_bio, 1, 0) ||
+ bioset_init(&c->btree_read_bio, 1,
+ offsetof(struct btree_read_bio, bio)) ||
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
kfree(ca->bio_prio);
kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
- free_heap(&ca->heap);
+ free_heap(&ca->copygc_heap);
+ free_heap(&ca->alloc_heap);
free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++)
spin_lock_init(&ca->freelist_lock);
spin_lock_init(&ca->prio_buckets_lock);
- mutex_init(&ca->heap_lock);
mutex_init(&ca->prio_write_lock);
bch2_dev_moving_gc_init(ca);
movinggc_reserve, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
!init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) ||
- !init_heap(&ca->heap, heap_size, GFP_KERNEL) ||
+ !init_heap(&ca->alloc_heap, heap_size, GFP_KERNEL) ||
+ !init_heap(&ca->copygc_heap, heap_size, GFP_KERNEL) ||
!(ca->oldest_gens = kvpmalloc(ca->mi.nbuckets *
sizeof(u8),
GFP_KERNEL|__GFP_ZERO)) ||
if (!bch2_fs_running(c))
return -EPERM;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k)
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k)
if (k.k->type == BCH_EXTENT) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
static unsigned bucket_sectors_used_fn(struct bch_dev *ca, struct bucket *g,
void *private)
{
- return bucket_sectors_used(g);
+ return bucket_sectors_used(g->mark);
}
static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, struct bucket *g,
bch2_move_ctxt_init(&ctxt, &tier->pd.rate,
nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+ BTREE_ITER_PREFETCH);
while (!kthread_should_stop() &&
!bch2_move_ctxt_wait(&ctxt) &&
return n;
}
+
+#include "eytzinger.h"
+
+static int alignment_ok(const void *base, size_t align)
+{
+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+ ((unsigned long)base & (align - 1)) == 0;
+}
+
+static void u32_swap(void *a, void *b, size_t size)
+{
+ u32 t = *(u32 *)a;
+ *(u32 *)a = *(u32 *)b;
+ *(u32 *)b = t;
+}
+
+static void u64_swap(void *a, void *b, size_t size)
+{
+ u64 t = *(u64 *)a;
+ *(u64 *)a = *(u64 *)b;
+ *(u64 *)b = t;
+}
+
+static void generic_swap(void *a, void *b, size_t size)
+{
+ char t;
+
+ do {
+ t = *(char *)a;
+ *(char *)a++ = *(char *)b;
+ *(char *)b++ = t;
+ } while (--size > 0);
+}
+
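+/*
+ * The array being sorted lives in eytzinger (BFS) order; the heapsort
+ * below works on logical in-order indices, and these helpers map them to
+ * the physical slots:
+ */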
+static inline int do_cmp(void *base, size_t n, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ size_t l, size_t r)
+{
+ return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size);
+}
+
+static inline void do_swap(void *base, size_t n, size_t size,
+ void (*swap_func)(void *, void *, size_t),
+ size_t l, size_t r)
+{
+ swap_func(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size);
+}
+
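+/*
+ * In-place heapsort: afterwards the eytzinger0 in-order traversal of the
+ * array is ascending per cmp_func:
+ */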
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t))
+{
+ int i, c, r;
+
+ if (!swap_func) {
+ if (size == 4 && alignment_ok(base, 4))
+ swap_func = u32_swap;
+ else if (size == 8 && alignment_ok(base, 8))
+ swap_func = u64_swap;
+ else
+ swap_func = generic_swap;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ for (r = i; r * 2 + 1 < n; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < n &&
+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+ c++;
+
+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+ break;
+
+ do_swap(base, n, size, swap_func, r, c);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ do_swap(base, n, size, swap_func, 0, i);
+
+ for (r = 0; r * 2 + 1 < i; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < i &&
+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+ c++;
+
+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+ break;
+
+ do_swap(base, n, size, swap_func, r, c);
+ }
+ }
+}
?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
}
-#define DECLARE_HEAP(type, name) \
- struct { \
- size_t size, used; \
- type *data; \
- } name
+#define HEAP(type) \
+struct { \
+ size_t size, used; \
+ type *data; \
+}
+
+#define DECLARE_HEAP(type, name) HEAP(type) name
#define init_heap(heap, _size, gfp) \
({ \
#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
-#define heap_sift(h, i, cmp) \
+#define heap_peek(h) \
+({ \
+ EBUG_ON(!(h)->used); \
+ (h)->data[0]; \
+})
+
+#define heap_full(h) ((h)->used == (h)->size)
+
+#define heap_sift_down(h, i, cmp) \
do { \
- size_t _r, _j = i; \
+ size_t _c, _j = i; \
\
- for (; _j * 2 + 1 < (h)->used; _j = _r) { \
- _r = _j * 2 + 1; \
- if (_r + 1 < (h)->used && \
- cmp((h)->data[_r], (h)->data[_r + 1])) \
- _r++; \
+ for (; _j * 2 + 1 < (h)->used; _j = _c) { \
+ _c = _j * 2 + 1; \
+ if (_c + 1 < (h)->used && \
+ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \
+ _c++; \
\
- if (cmp((h)->data[_r], (h)->data[_j])) \
+ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \
break; \
- heap_swap(h, _r, _j); \
+ heap_swap(h, _c, _j); \
} \
} while (0)
-#define heap_sift_down(h, i, cmp) \
+#define heap_sift_up(h, i, cmp) \
do { \
while (i) { \
size_t p = (i - 1) / 2; \
- if (cmp((h)->data[i], (h)->data[p])) \
+ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \
break; \
heap_swap(h, i, p); \
i = p; \
} \
} while (0)
-#define heap_add(h, d, cmp) \
+#define heap_add(h, new, cmp) \
({ \
bool _r = !heap_full(h); \
if (_r) { \
size_t _i = (h)->used++; \
- (h)->data[_i] = d; \
+ (h)->data[_i] = new; \
\
- heap_sift_down(h, _i, cmp); \
- heap_sift(h, _i, cmp); \
+ heap_sift_up(h, _i, cmp); \
} \
_r; \
})
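+/* add @new to the heap; if full, replace the root when cmp(@new, root) >= 0 */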
+#define heap_add_or_replace(h, new, cmp) \
+do { \
+ if (!heap_add(h, new, cmp) && \
+ cmp(h, new, heap_peek(h)) >= 0) { \
+ (h)->data[0] = new; \
+ heap_sift_down(h, 0, cmp); \
+ } \
+} while (0)
+
#define heap_del(h, i, cmp) \
do { \
size_t _i = (i); \
BUG_ON(_i >= (h)->used); \
(h)->used--; \
heap_swap(h, _i, (h)->used); \
+ heap_sift_up(h, _i, cmp); \
heap_sift_down(h, _i, cmp); \
- heap_sift(h, _i, cmp); \
} while (0)
#define heap_pop(h, d, cmp) \
_r; \
})
-#define heap_peek(h) \
-({ \
- EBUG_ON(!(h)->used); \
- (h)->data[0]; \
-})
-
-#define heap_full(h) ((h)->used == (h)->size)
-
#define heap_resort(heap, cmp) \
do { \
ssize_t _i; \
for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
- heap_sift(heap, _i, cmp); \
+ heap_sift_down(heap, _i, cmp); \
} while (0)
/*
ssize_t ret = 0;
size_t len;
- for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) {
+ for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), 0, k) {
BUG_ON(k.k->p.inode < inum);
if (k.k->p.inode > inum)
return NULL;
bio = p + front_pad;
- bio_init(bio);
- bio->bi_pool = bs;
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bio->bi_inline_vecs;
+ bio_init(bio, bio->bi_inline_vecs, nr_iovecs);
+ bio->bi_pool = bs;
return bio;
}