From 03bc9d71b13e6f8e879894f93ea16f1f4a8280c9 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 10 Apr 2017 21:19:15 -0800
Subject: [PATCH] Update bcachefs sources to 3b4024f944

---
 .bcachefs_revision          |   2 +-
 libbcachefs/bcachefs.h      |   2 +-
 libbcachefs/btree_io.c      |  77 +----------
 libbcachefs/btree_io.h      |   4 +-
 libbcachefs/btree_update.c  |  95 +++++++-------
 libbcachefs/buckets.h       |  15 ---
 libbcachefs/chardev.c       |   6 +-
 libbcachefs/io.c            |   2 +-
 libbcachefs/journal.c       | 129 ++++++++++++-------
 libbcachefs/journal.h       |   3 +-
 libbcachefs/journal_types.h |   1 +
 libbcachefs/super.c         |   9 +-
 libbcachefs/sysfs.c         | 249 ++++++++++--------------------
 13 files changed, 224 insertions(+), 370 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 35e8c14..9a3f687 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-da037866e669b09edc6b049ce09535d3456474cb
+3b4024f94489e4d8dc8eb7f1278754a2545f8026

diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index cf1c4bd..c170e85 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -754,7 +754,7 @@ struct bch_fs {
 	unsigned		bucket_journal_seq;
 
 	/* The rest of this all shows up in sysfs */
-	atomic_long_t		cache_read_races;
+	atomic_long_t		read_realloc_races;
 
 	unsigned		foreground_write_ratelimit_enabled:1;
 	unsigned		copy_gc_enabled:1;

diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index b56b173..8152dc4 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1630,82 +1630,19 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	}
 }
 
-/*
- * Write all dirty btree nodes to disk, including roots
- */
-void bch2_btree_flush(struct bch_fs *c)
+void bch2_btree_verify_flushed(struct bch_fs *c)
 {
-	struct closure cl;
-	struct btree *b;
 	struct bucket_table *tbl;
 	struct rhash_head *pos;
-	bool saw_dirty;
+	struct btree *b;
 	unsigned i;
 
-	closure_init_stack(&cl);
-
 	rcu_read_lock();
+	tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
+				  &c->btree_cache_table);
 
-	do {
-		saw_dirty = false;
-		i = 0;
-restart:
-		tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
-					  &c->btree_cache_table);
-
-		for (; i < tbl->size; i++)
-			rht_for_each_entry_rcu(b, pos, tbl, i, hash) {
-				saw_dirty |= btree_node_dirty(b);
-
-				if (btree_node_dirty(b) &&
-				    btree_node_may_write(b)) {
-					rcu_read_unlock();
-					six_lock_read(&b->lock);
-					bch2_btree_node_write_dirty(c, b, &cl, 1);
-					six_unlock_read(&b->lock);
-					rcu_read_lock();
-					goto restart;
-				}
-			}
-	} while (saw_dirty);
-
+	for (i = 0; i < tbl->size; i++)
+		rht_for_each_entry_rcu(b, pos, tbl, i, hash)
+			BUG_ON(btree_node_dirty(b));
 	rcu_read_unlock();
-
-	closure_sync(&cl);
-}
-
-/**
- * bch_btree_node_flush_journal - flush any journal entries that contain keys
- * from this node
- *
- * The bset's journal sequence number is used for preserving ordering of index
- * updates across unclean shutdowns - it's used to ignore bsets newer than the
- * most recent journal entry.
- *
- * But when rewriting btree nodes we compact all the bsets in a btree node - and
- * if we compacted a bset that should be ignored with bsets we do need, that
- * would be bad. So to avoid that, prior to making the new node visible ensure
- * that the journal has been flushed so that all the bsets we compacted should
- * be visible.
- */
-void bch2_btree_node_flush_journal_entries(struct bch_fs *c,
-					   struct btree *b,
-					   struct closure *cl)
-{
-	int i = b->nsets;
-
-	/*
-	 * Journal sequence numbers in the different bsets will always be in
-	 * ascending order, we only need to flush the highest - except that the
-	 * most recent bset might not have a journal sequence number yet, so we
-	 * need to loop:
-	 */
-	while (i--) {
-		u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
-
-		if (seq) {
-			bch2_journal_flush_seq_async(&c->journal, seq, cl);
-			break;
-		}
-	}
 }

diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 8473114..3014b5f 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -94,8 +94,6 @@ do {									\
 	}								\
 } while (0)
 
-void bch2_btree_flush(struct bch_fs *);
-void bch2_btree_node_flush_journal_entries(struct bch_fs *, struct btree *,
-					   struct closure *);
+void bch2_btree_verify_flushed(struct bch_fs *);
 
 #endif /* _BCACHE_BTREE_IO_H */

diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index cdbc0de..196b742 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -161,15 +161,14 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
 {
 	trace_btree_node_free(c, b);
 
+	BUG_ON(btree_node_dirty(b));
 	BUG_ON(b == btree_node_root(c, b));
 	BUG_ON(b->ob);
 	BUG_ON(!list_empty(&b->write_blocked));
 
-	six_lock_write(&b->lock);
+	clear_btree_node_noevict(b);
 
-	if (btree_node_dirty(b))
-		bch2_btree_complete_write(c, b, btree_current_write(b));
-	clear_btree_node_dirty(b);
+	six_lock_write(&b->lock);
 
 	bch2_btree_node_hash_remove(c, b);

@@ -192,6 +191,8 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 
 	b->ob = NULL;
 
+	clear_btree_node_dirty(b);
+
 	__btree_node_free(c, b, NULL);
 
 	bch2_open_bucket_put(c, ob);

@@ -890,7 +891,8 @@ bch2_btree_interior_update_alloc(struct bch_fs *c)
 
 static void btree_interior_update_free(struct closure *cl)
 {
-	struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl);
+	struct btree_interior_update *as =
+		container_of(cl, struct btree_interior_update, cl);
 
 	mempool_free(as, &as->c->btree_interior_update_pool);
 }

@@ -910,9 +912,6 @@ static void btree_interior_update_nodes_reachable(struct closure *cl)
 		bch2_btree_node_free_ondisk(c, &as->pending[i]);
 	as->nr_pending = 0;
 
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	mutex_lock(&c->btree_interior_update_lock);
 	list_del(&as->list);
 	mutex_unlock(&c->btree_interior_update_lock);

@@ -1039,6 +1038,15 @@ static void btree_interior_update_updated_btree(struct bch_fs *c,
 			     system_freezable_wq);
 }
 
+static void btree_interior_update_reparent(struct btree_interior_update *as,
+					   struct btree_interior_update *child)
+{
+	child->b = NULL;
+	child->mode = BTREE_INTERIOR_UPDATING_AS;
+	child->parent_as = as;
+	closure_get(&as->cl);
+}
+
 static void btree_interior_update_updated_root(struct bch_fs *c,
 					       struct btree_interior_update *as,
 					       enum btree_id btree_id)

@@ -1053,14 +1061,8 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
 	 * Old root might not be persistent yet - if so, redirect its
 	 * btree_interior_update operation to point to us:
 	 */
-	if (r->as) {
-		BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT);
-
-		r->as->b = NULL;
-		r->as->mode = BTREE_INTERIOR_UPDATING_AS;
-		r->as->parent_as = as;
-		closure_get(&as->cl);
-	}
+	if (r->as)
+		btree_interior_update_reparent(as, r->as);
 
 	as->mode = BTREE_INTERIOR_UPDATING_ROOT;
 	as->b = r->b;

@@ -1068,8 +1070,6 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
 	mutex_unlock(&c->btree_interior_update_lock);
 
-	bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-
 	continue_at(&as->cl, btree_interior_update_nodes_written,
 		    system_freezable_wq);
 }

@@ -1092,8 +1092,10 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
 					       struct btree_interior_update *as,
 					       struct btree *b)
 {
+	struct closure *cl, *cl_n;
 	struct btree_interior_update *p, *n;
 	struct pending_btree_node_free *d;
+	struct btree_write *w;
 	struct bset_tree *t;
 
 	/*

@@ -1107,23 +1109,18 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
 	for_each_bset(b, t)
 		as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
 
-	/*
-	 * Does this node have unwritten data that has a pin on the journal?
-	 *
-	 * If so, transfer that pin to the btree_interior_update operation -
-	 * note that if we're freeing multiple nodes, we only need to keep the
-	 * oldest pin of any of the nodes we're freeing. We'll release the pin
-	 * when the new nodes are persistent and reachable on disk:
-	 */
-	bch2_journal_pin_add_if_older(&c->journal,
-				      &b->writes[0].journal,
-				      &as->journal, interior_update_flush);
-	bch2_journal_pin_add_if_older(&c->journal,
-				      &b->writes[1].journal,
-				      &as->journal, interior_update_flush);
-
 	mutex_lock(&c->btree_interior_update_lock);
 
+	/* Add this node to the list of nodes being freed: */
+	BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+
+	d = &as->pending[as->nr_pending++];
+	d->index_update_done = false;
+	d->seq = b->data->keys.seq;
+	d->btree_id = b->btree_id;
+	d->level = b->level;
+	bkey_copy(&d->key, &b->key);
+
 	/*
 	 * Does this node have any btree_interior_update operations preventing

@@ -1133,24 +1130,28 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
 	 * operations complete
 	 */
 	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
-		BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE);
-
-		p->mode = BTREE_INTERIOR_UPDATING_AS;
 		list_del(&p->write_blocked_list);
-		p->b = NULL;
-		p->parent_as = as;
-		closure_get(&as->cl);
+		btree_interior_update_reparent(as, p);
 	}
 
-	/* Add this node to the list of nodes being freed: */
-	BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+	clear_btree_node_dirty(b);
+	w = btree_current_write(b);
+
+	llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
+		llist_add(&cl->list, &as->wait.list);
+
+	/*
+	 * Does this node have unwritten data that has a pin on the journal?
+	 *
+	 * If so, transfer that pin to the btree_interior_update operation -
+	 * note that if we're freeing multiple nodes, we only need to keep the
+	 * oldest pin of any of the nodes we're freeing. We'll release the pin
+	 * when the new nodes are persistent and reachable on disk:
+	 */
+	bch2_journal_pin_add_if_older(&c->journal, &w->journal,
+				      &as->journal, interior_update_flush);
+	bch2_journal_pin_drop(&c->journal, &w->journal);
 
-	d = &as->pending[as->nr_pending++];
-	d->index_update_done = false;
-	d->seq = b->data->keys.seq;
-	d->btree_id = b->btree_id;
-	d->level = b->level;
-	bkey_copy(&d->key, &b->key);
 	mutex_unlock(&c->btree_interior_update_lock);
 }

diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 5303cdc..3b82d7f 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -199,21 +199,6 @@ static inline u64 bch2_fs_sectors_used(struct bch_fs *c)
 	return min(c->capacity, __bch2_fs_sectors_used(c));
 }
 
-/* XXX: kill? */
-static inline u64 sectors_available(struct bch_fs *c)
-{
-	struct bch_dev *ca;
-	unsigned i;
-	u64 ret = 0;
-
-	rcu_read_lock();
-	for_each_member_device_rcu(ca, c, i)
-		ret += dev_buckets_available(ca) << ca->bucket_bits;
-	rcu_read_unlock();
-
-	return ret;
-}
-
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
 	return (!mark.owned_by_allocator &&

diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 2d20061..694fcd2 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -37,10 +37,10 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
 
 	path = strndup_user((const char __user *)
 			    (unsigned long) dev, PATH_MAX);
-	if (!path)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(path))
+		return ERR_CAST(path);
 
-	bdev = lookup_bdev(strim(path));
+	bdev = lookup_bdev(path);
 	kfree(path);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);

diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 0a64f35..039dd04 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1046,7 +1046,7 @@ static void bch2_read_endio(struct bio *bio)
 	if (rbio->ptr.cached &&
 	    (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
 	     ptr_stale(rbio->ca, &rbio->ptr))) {
-		atomic_long_inc(&c->cache_read_races);
+		atomic_long_inc(&c->read_realloc_races);
 
 		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
 			bch2_rbio_retry(c, rbio);

diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 9e29061..f6203f1 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -180,8 +180,10 @@ redo_peek:
 			    ret == -EINTR)
 				goto redo_peek;
 
-			/* -EROFS or perhaps -ENOSPC - bail out: */
-			/* XXX warn here */
+			bch2_fs_fatal_error(c,
+				"error %i rewriting btree node with blacklisted journal seq",
+				ret);
+			bch2_journal_halt(j);
 			return;
 		}
 	}

@@ -1018,6 +1020,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 
 	fifo_for_each_entry_ptr(p, &j->pin, iter) {
 		INIT_LIST_HEAD(&p->list);
+		INIT_LIST_HEAD(&p->flushed);
 		atomic_set(&p->count, 0);
 	}

@@ -1147,6 +1150,7 @@ static void __journal_entry_new(struct journal *j, int count)
 		      &fifo_peek_back(&j->pin));
 
 	INIT_LIST_HEAD(&p->list);
+	INIT_LIST_HEAD(&p->flushed);
 	atomic_set(&p->count, count);
 }

@@ -1516,7 +1520,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
 	j->replay_pin_list = NULL;
 
 	if (did_replay) {
-		bch2_btree_flush(c);
+		bch2_journal_flush_pins(&c->journal, U64_MAX);
 
 		/*
 		 * Write a new journal entry _before_ we start journalling new data -

@@ -1859,7 +1863,7 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
 				       struct journal_entry_pin, list);
 		if (ret) {
 			/* must be list_del_init(), see bch2_journal_pin_drop() */
-			list_del_init(&ret->list);
+			list_move(&ret->list, &pin_list->flushed);
 			*seq = journal_pin_seq(j, pin_list);
 			break;
 		}

@@ -1869,28 +1873,32 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
 	return ret;
 }
 
-static bool journal_has_pins(struct journal *j)
+static bool journal_flush_done(struct journal *j, u64 seq_to_flush)
 {
 	bool ret;
 
 	spin_lock(&j->lock);
 	journal_reclaim_fast(j);
-	ret = fifo_used(&j->pin) > 1 ||
-		atomic_read(&fifo_peek_front(&j->pin).count) > 1;
+
+	ret = (fifo_used(&j->pin) == 1 &&
+	       atomic_read(&fifo_peek_front(&j->pin).count) == 1) ||
+		last_seq(j) > seq_to_flush;
 	spin_unlock(&j->lock);
 
 	return ret;
 }
 
-void bch2_journal_flush_pins(struct journal *j)
+void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
 {
 	struct journal_entry_pin *pin;
-	u64 seq;
+	u64 pin_seq;
 
-	while ((pin = journal_get_next_pin(j, U64_MAX, &seq)))
-		pin->flush(j, pin, seq);
+	while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq)))
+		pin->flush(j, pin, pin_seq);
 
-	wait_event(j->wait, !journal_has_pins(j) || bch2_journal_error(j));
+	wait_event(j->wait,
+		   journal_flush_done(j, seq_to_flush) ||
+		   bch2_journal_error(j));
 }
 
 static bool should_discard_bucket(struct journal *j, struct journal_device *ja)

@@ -2174,9 +2182,18 @@ static void journal_write_done(struct closure *cl)
 	struct journal *j = container_of(cl, struct journal, io);
 	struct journal_buf *w = journal_prev_buf(j);
 
+	__bch2_time_stats_update(j->write_time, j->write_start_time);
+
 	j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
 
-	__bch2_time_stats_update(j->write_time, j->write_start_time);
+	/*
+	 * Updating last_seq_ondisk may let journal_reclaim_work() discard more
+	 * buckets:
+	 *
+	 * Must come before signaling write completion, for
+	 * bch2_fs_journal_stop():
+	 */
+	mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
 
 	BUG_ON(!j->reservations.prev_buf_unwritten);
 	atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,

@@ -2199,12 +2216,6 @@ static void journal_write_done(struct closure *cl)
 
 	closure_wake_up(&w->wait);
 	wake_up(&j->wait);
-
-	/*
-	 * Updating last_seq_ondisk may let journal_reclaim_work() discard more
-	 * buckets:
-	 */
-	mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
 }
 
 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)

@@ -2345,8 +2356,12 @@ static void journal_write_work(struct work_struct *work)
 	struct journal *j = container_of(to_delayed_work(work),
 					 struct journal, write_work);
 	spin_lock(&j->lock);
-	set_bit(JOURNAL_NEED_WRITE, &j->flags);
+	if (!journal_entry_is_open(j)) {
+		spin_unlock(&j->lock);
+		return;
+	}
 
+	set_bit(JOURNAL_NEED_WRITE, &j->flags);
 	if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED)
 		spin_unlock(&j->lock);
 }

@@ -2505,6 +2520,8 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent
 void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 				  struct closure *parent)
 {
+	struct journal_buf *buf;
+
 	spin_lock(&j->lock);
 
 	BUG_ON(seq > atomic64_read(&j->seq));

@@ -2517,8 +2534,9 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
 	if (seq == atomic64_read(&j->seq)) {
 		bool set_need_write = false;
 
-		if (parent &&
-		    !closure_wait(&journal_cur_buf(j)->wait, parent))
+		buf = journal_cur_buf(j);
+
+		if (parent && !closure_wait(&buf->wait, parent))
 			BUG();
 
 		if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {

@@ -2529,7 +2547,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
 		switch (journal_buf_switch(j, set_need_write)) {
 		case JOURNAL_ENTRY_ERROR:
 			if (parent)
-				closure_wake_up(&journal_cur_buf(j)->wait);
+				closure_wake_up(&buf->wait);
 			break;
 		case JOURNAL_ENTRY_CLOSED:
 			/*

@@ -2545,7 +2563,9 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
 	} else if (parent &&
 		   seq + 1 == atomic64_read(&j->seq) &&
 		   j->reservations.prev_buf_unwritten) {
-		if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+		buf = journal_prev_buf(j);
+
+		if (!closure_wait(&buf->wait, parent))
 			BUG();
 
 		smp_mb();

@@ -2553,7 +2573,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
 		/* check if raced with write completion (or failure) */
 		if (!j->reservations.prev_buf_unwritten ||
 		    bch2_journal_error(j))
-			closure_wake_up(&journal_prev_buf(j)->wait);
+			closure_wake_up(&buf->wait);
 	}
 
 	spin_unlock(&j->lock);

@@ -2698,6 +2718,39 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
 	return ret;
 }
 
+ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
+{
+	struct journal_entry_pin_list *pin_list;
+	struct journal_entry_pin *pin;
+	ssize_t ret = 0;
+	unsigned i;
+
+	spin_lock_irq(&j->pin_lock);
+	fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
+		ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+				 "%llu: count %u\n",
+				 journal_pin_seq(j, pin_list),
+				 atomic_read(&pin_list->count));
+
+		list_for_each_entry(pin, &pin_list->list, list)
+			ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+					 "\t%p %pf\n",
+					 pin, pin->flush);
+
+		if (!list_empty(&pin_list->flushed))
+			ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+					 "flushed:\n");
+
+		list_for_each_entry(pin, &pin_list->flushed, list)
+			ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+					 "\t%p %pf\n",
+					 pin, pin->flush);
+	}
+	spin_unlock_irq(&j->pin_lock);
+
+	return ret;
+}
+
 static bool bch2_journal_writing_to_device(struct bch_dev *ca)
 {
 	struct journal *j = &ca->fs->journal;

@@ -2725,12 +2778,11 @@ static bool bch2_journal_writing_to_device(struct bch_dev *ca)
 
 int bch2_journal_move(struct bch_dev *ca)
 {
-	u64 last_flushed_seq;
 	struct journal_device *ja = &ca->journal;
-	struct bch_fs *c = ca->fs;
-	struct journal *j = &c->journal;
+	struct journal *j = &ca->fs->journal;
+	u64 seq_to_flush = 0;
 	unsigned i;
-	int ret = 0;		/* Success */
+	int ret;
 
 	if (bch2_journal_writing_to_device(ca)) {
 		/*

@@ -2744,16 +2796,10 @@ int bch2_journal_move(struct bch_dev *ca)
 		BUG_ON(bch2_journal_writing_to_device(ca));
 	}
 
-	/*
-	 * Flush all btree updates to backing store so that any
-	 * journal entries written to ca become stale and are no
-	 * longer needed.
-	 */
+	for (i = 0; i < ja->nr; i++)
+		seq_to_flush = max(seq_to_flush, ja->bucket_seq[i]);
 
-	/*
-	 * XXX: switch to normal journal reclaim machinery
-	 */
-	bch2_btree_flush(c);
+	bch2_journal_flush_pins(j, seq_to_flush);
 
 	/*
 	 * Force a meta-data journal entry to be written so that

@@ -2767,12 +2813,9 @@ int bch2_journal_move(struct bch_dev *ca)
 	 * the device
 	 */
 	spin_lock(&j->lock);
-	last_flushed_seq = last_seq(j);
+	ret = j->last_seq_ondisk > seq_to_flush ? 0 : -EIO;
 	spin_unlock(&j->lock);
 
-	for (i = 0; i < ja->nr; i += 1)
-		BUG_ON(ja->bucket_seq[i] > last_flushed_seq);
-
 	return ret;
 }

@@ -2786,7 +2829,7 @@ void bch2_fs_journal_stop(struct journal *j)
 	 * journal entries, then force a brand new empty journal entry to be
 	 * written:
 	 */
-	bch2_journal_flush_pins(j);
+	bch2_journal_flush_pins(j, U64_MAX);
 	bch2_journal_flush_async(j, NULL);
 	bch2_journal_meta(j);

diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 9ad82c6..d0dd0d3 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -141,7 +141,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
 				   struct journal_entry_pin *,
 				   struct journal_entry_pin *,
 				   journal_pin_flush_fn);
-void bch2_journal_flush_pins(struct journal *);
+void bch2_journal_flush_pins(struct journal *, u64);
 
 struct closure;
 struct bch_fs;

@@ -354,6 +354,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
 }
 
 ssize_t bch2_journal_print_debug(struct journal *, char *);
+ssize_t bch2_journal_print_pins(struct journal *, char *);
 
 int bch2_dev_journal_alloc(struct bch_dev *);

diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 75712ae..4b01b14 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -38,6 +38,7 @@ struct journal_buf {
 
 struct journal_entry_pin_list {
 	struct list_head	list;
+	struct list_head	flushed;
 	atomic_t		count;
 };

diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 6cbfc80..f5ee2de 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -211,7 +211,14 @@ static void __bch2_fs_read_only(struct bch_fs *c)
 
 	bch2_gc_thread_stop(c);
 
-	bch2_btree_flush(c);
+	/*
+	 * Flush journal before stopping allocators, because flushing journal
+	 * blacklist entries involves allocating new btree nodes:
+	 */
+	bch2_journal_flush_pins(&c->journal, U64_MAX);
+
+	if (!bch2_journal_error(&c->journal))
+		bch2_btree_verify_flushed(c);
 
 	for_each_member_device(ca, c, i)
 		bch2_dev_allocator_stop(ca);

diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 808b308..ba04bba 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -120,6 +120,7 @@ do {									\
 	return strtoi_h(buf, &var) ?: (ssize_t) size;			\
 } while (0)
 
+write_attribute(trigger_journal_flush);
 write_attribute(trigger_btree_coalesce);
 write_attribute(trigger_gc);
 write_attribute(prune_cache);

@@ -127,35 +128,25 @@ write_attribute(prune_cache);
 
 read_attribute(uuid);
 read_attribute(minor);
 read_attribute(bucket_size);
-read_attribute(bucket_size_bytes);
 read_attribute(block_size);
-read_attribute(block_size_bytes);
 read_attribute(btree_node_size);
-read_attribute(btree_node_size_bytes);
 read_attribute(first_bucket);
 read_attribute(nbuckets);
-read_attribute(tree_depth);
-read_attribute(root_usage_percent);
 read_attribute(read_priority_stats);
 read_attribute(write_priority_stats);
 read_attribute(fragmentation_stats);
 read_attribute(oldest_gen_stats);
 read_attribute(reserve_stats);
 read_attribute(btree_cache_size);
-read_attribute(cache_available_percent);
 read_attribute(compression_stats);
 read_attribute(written);
 read_attribute(btree_written);
 read_attribute(metadata_written);
 read_attribute(journal_debug);
-write_attribute(journal_flush);
-read_attribute(internal_uuid);
+read_attribute(journal_pins);
 
-read_attribute(btree_gc_running);
+read_attribute(internal_uuid);
 
-read_attribute(btree_nodes);
-read_attribute(btree_used_percent);
-read_attribute(average_key_size);
 read_attribute(available_buckets);
 read_attribute(free_buckets);
 read_attribute(dirty_data);

@@ -168,10 +159,9 @@
 read_attribute(meta_buckets);
 read_attribute(alloc_buckets);
 read_attribute(has_data);
 read_attribute(has_metadata);
-read_attribute(bset_tree_stats);
 read_attribute(alloc_debug);
 
-read_attribute(cache_read_races);
+read_attribute(read_realloc_races);
 
 rw_attribute(journal_write_delay_ms);
 rw_attribute(journal_reclaim_delay_ms);

@@ -221,73 +211,6 @@ static struct attribute sysfs_state_rw = {
 	.mode = S_IRUGO
 };
 
-static int bch2_bset_print_stats(struct bch_fs *c, char *buf)
-{
-	struct bset_stats stats;
-	size_t nodes = 0;
-	struct btree *b;
-	struct bucket_table *tbl;
-	struct rhash_head *pos;
-	unsigned iter;
-
-	memset(&stats, 0, sizeof(stats));
-
-	rcu_read_lock();
-	for_each_cached_btree(b, c, tbl, iter, pos) {
-		bch2_btree_keys_stats(b, &stats);
-		nodes++;
-	}
-	rcu_read_unlock();
-
-	return snprintf(buf, PAGE_SIZE,
-			"btree nodes: %zu\n"
-			"written sets: %zu\n"
-			"written key bytes: %zu\n"
-			"unwritten sets: %zu\n"
-			"unwritten key bytes: %zu\n"
-			"no table sets: %zu\n"
-			"no table key bytes: %zu\n"
-			"floats: %zu\n"
-			"failed unpacked: %zu\n"
-			"failed prev: %zu\n"
-			"failed overflow: %zu\n",
-			nodes,
-			stats.sets[BSET_RO_AUX_TREE].nr,
-			stats.sets[BSET_RO_AUX_TREE].bytes,
-			stats.sets[BSET_RW_AUX_TREE].nr,
-			stats.sets[BSET_RW_AUX_TREE].bytes,
-			stats.sets[BSET_NO_AUX_TREE].nr,
-			stats.sets[BSET_NO_AUX_TREE].bytes,
-			stats.floats,
-			stats.failed_unpacked,
-			stats.failed_prev,
-			stats.failed_overflow);
-}
-
-static unsigned bch2_root_usage(struct bch_fs *c)
-{
-	unsigned bytes = 0;
-	struct bkey_packed *k;
-	struct btree *b;
-	struct btree_node_iter iter;
-
-	goto lock_root;
-
-	do {
-		six_unlock_read(&b->lock);
-lock_root:
-		b = c->btree_roots[BTREE_ID_EXTENTS].b;
-		six_lock_read(&b->lock);
-	} while (b != c->btree_roots[BTREE_ID_EXTENTS].b);
-
-	for_each_btree_node_key(b, k, &iter, btree_node_is_extents(b))
-		bytes += bkey_bytes(k);
-
-	six_unlock_read(&b->lock);
-
-	return (bytes * 100) / btree_bytes(c);
-}
-
 static size_t bch2_btree_cache_size(struct bch_fs *c)
 {
 	size_t ret = 0;

@@ -301,27 +224,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
 	return ret;
 }
 
-static unsigned bch2_fs_available_percent(struct bch_fs *c)
-{
-	return div64_u64((u64) sectors_available(c) * 100,
-			 c->capacity ?: 1);
-}
-
-#if 0
-static unsigned bch2_btree_used(struct bch_fs *c)
-{
-	return div64_u64(c->gc_stats.key_bytes * 100,
-			 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
-}
-
-static unsigned bch2_average_key_size(struct bch_fs *c)
-{
-	return c->gc_stats.nkeys
-		? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
-		: 0;
-}
-#endif
-
 static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 {
 	struct bch_fs_usage stats = bch2_fs_usage_read(c);

@@ -358,6 +260,9 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
 	       compressed_sectors_compressed = 0,
 	       compressed_sectors_uncompressed = 0;
 
+	if (!bch2_fs_running(c))
+		return -EPERM;
+
 	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k)
 		if (k.k->type == BCH_EXTENT) {
 			struct bkey_s_c_extent e = bkey_s_c_to_extent(k);

@@ -402,29 +307,17 @@ SHOW(bch2_fs)
 	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
 
 	sysfs_print(minor,			c->minor);
+	sysfs_printf(internal_uuid, "%pU",	c->sb.uuid.b);
 
 	sysfs_print(journal_write_delay_ms,	c->journal.write_delay_ms);
 	sysfs_print(journal_reclaim_delay_ms,	c->journal.reclaim_delay_ms);
 
-	sysfs_hprint(block_size,		block_bytes(c));
-	sysfs_print(block_size_bytes,		block_bytes(c));
-	sysfs_hprint(btree_node_size,		c->sb.btree_node_size << 9);
-	sysfs_print(btree_node_size_bytes,	c->sb.btree_node_size << 9);
-
+	sysfs_print(block_size,			block_bytes(c));
+	sysfs_print(btree_node_size,		btree_bytes(c));
 	sysfs_hprint(btree_cache_size,		bch2_btree_cache_size(c));
-	sysfs_print(cache_available_percent,	bch2_fs_available_percent(c));
-
-	sysfs_print(btree_gc_running,		c->gc_pos.phase != GC_PHASE_DONE);
-
-#if 0
-	/* XXX: reimplement */
-	sysfs_print(btree_used_percent,	bch2_btree_used(c));
-	sysfs_print(btree_nodes,	c->gc_stats.nodes);
-	sysfs_hprint(average_key_size,	bch2_average_key_size(c));
-#endif
 
-	sysfs_print(cache_read_races,
-		    atomic_long_read(&c->cache_read_races));
+	sysfs_print(read_realloc_races,
+		    atomic_long_read(&c->read_realloc_races));
 
 	sysfs_printf(foreground_write_ratelimit_enabled, "%i",
 		     c->foreground_write_ratelimit_enabled);

@@ -445,28 +338,21 @@ SHOW(bch2_fs)
 
 	/* Debugging: */
 
-	if (attr == &sysfs_journal_debug)
-		return bch2_journal_print_debug(&c->journal, buf);
-
-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
-	BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-	if (!bch2_fs_running(c))
-		return -EPERM;
-
-	if (attr == &sysfs_bset_tree_stats)
-		return bch2_bset_print_stats(c, buf);
-
 	if (attr == &sysfs_alloc_debug)
 		return show_fs_alloc_debug(c, buf);
 
-	sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level);
-	sysfs_print(root_usage_percent, bch2_root_usage(c));
+	if (attr == &sysfs_journal_debug)
+		return bch2_journal_print_debug(&c->journal, buf);
+
+	if (attr == &sysfs_journal_pins)
+		return bch2_journal_print_pins(&c->journal, buf);
 
 	if (attr == &sysfs_compression_stats)
 		return bch2_compression_stats(c, buf);
 
-	sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
+	BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
 
 	return 0;
 }

@@ -519,17 +405,14 @@ STORE(__bch2_fs)
 	if (!bch2_fs_running(c))
 		return -EPERM;
 
-	if (attr == &sysfs_journal_flush) {
-		bch2_journal_meta_async(&c->journal, NULL);
+	/* Debugging: */
 
-		return size;
-	}
+	if (attr == &sysfs_trigger_journal_flush)
+		bch2_journal_meta_async(&c->journal, NULL);
 
 	if (attr == &sysfs_trigger_btree_coalesce)
 		bch2_coalesce(c);
 
-	/* Debugging: */
-
 	if (attr == &sysfs_trigger_gc)
 		bch2_gc(c);

@@ -557,28 +440,21 @@ STORE(bch2_fs)
 SYSFS_OPS(bch2_fs);
 
 struct attribute *bch2_fs_files[] = {
-	&sysfs_journal_write_delay_ms,
-	&sysfs_journal_reclaim_delay_ms,
-
+	&sysfs_minor,
 	&sysfs_block_size,
-	&sysfs_block_size_bytes,
 	&sysfs_btree_node_size,
-	&sysfs_btree_node_size_bytes,
-	&sysfs_tree_depth,
-	&sysfs_root_usage_percent,
 	&sysfs_btree_cache_size,
-	&sysfs_cache_available_percent,
-	&sysfs_compression_stats,
-
-	&sysfs_average_key_size,
 
 	&sysfs_meta_replicas_have,
 	&sysfs_data_replicas_have,
 
+	&sysfs_journal_write_delay_ms,
+	&sysfs_journal_reclaim_delay_ms,
+
 	&sysfs_foreground_target_percent,
 	&sysfs_tiering_percent,
 
-	&sysfs_journal_flush,
+	&sysfs_compression_stats,
 	NULL
 };

@@ -598,21 +474,17 @@ STORE(bch2_fs_internal)
 SYSFS_OPS(bch2_fs_internal);
 
 struct attribute *bch2_fs_internal_files[] = {
-	&sysfs_journal_debug,
-
 	&sysfs_alloc_debug,
+	&sysfs_journal_debug,
+	&sysfs_journal_pins,
 
-	&sysfs_btree_gc_running,
-
-	&sysfs_btree_nodes,
-	&sysfs_btree_used_percent,
-
-	&sysfs_bset_tree_stats,
-	&sysfs_cache_read_races,
+	&sysfs_read_realloc_races,
 
+	&sysfs_trigger_journal_flush,
 	&sysfs_trigger_btree_coalesce,
 	&sysfs_trigger_gc,
 	&sysfs_prune_cache,
+
 	&sysfs_foreground_write_ratelimit_enabled,
 	&sysfs_copy_gc_enabled,
 	&sysfs_tiering_enabled,

@@ -853,10 +725,8 @@ SHOW(bch2_dev)
 
 	sysfs_printf(uuid,		"%pU\n", ca->uuid.b);
 
-	sysfs_hprint(bucket_size,	bucket_bytes(ca));
-	sysfs_print(bucket_size_bytes,	bucket_bytes(ca));
-	sysfs_hprint(block_size,	block_bytes(c));
-	sysfs_print(block_size_bytes,	block_bytes(c));
+	sysfs_print(bucket_size,	bucket_bytes(ca));
+	sysfs_print(block_size,		block_bytes(c));
 	sysfs_print(first_bucket,	ca->mi.first_bucket);
 	sysfs_print(nbuckets,		ca->mi.nbuckets);
 	sysfs_print(discard,		ca->mi.discard);

@@ -979,35 +849,46 @@ SYSFS_OPS(bch2_dev);
 
 struct attribute *bch2_dev_files[] = {
 	&sysfs_uuid,
 	&sysfs_bucket_size,
-	&sysfs_bucket_size_bytes,
 	&sysfs_block_size,
-	&sysfs_block_size_bytes,
 	&sysfs_first_bucket,
 	&sysfs_nbuckets,
-	&sysfs_read_priority_stats,
-	&sysfs_write_priority_stats,
-	&sysfs_fragmentation_stats,
-	&sysfs_oldest_gen_stats,
-	&sysfs_reserve_stats,
-	&sysfs_available_buckets,
-	&sysfs_free_buckets,
+
+	/* settings: */
+	&sysfs_discard,
+	&sysfs_cache_replacement_policy,
+	&sysfs_tier,
+	&sysfs_state_rw,
+
+	&sysfs_has_data,
+	&sysfs_has_metadata,
+
+	/* io stats: */
+	&sysfs_written,
+	&sysfs_btree_written,
+	&sysfs_metadata_written,
+
+	/* alloc info - data: */
 	&sysfs_dirty_data,
 	&sysfs_dirty_bytes,
-	&sysfs_dirty_buckets,
 	&sysfs_cached_data,
 	&sysfs_cached_bytes,
+
+	/* alloc info - buckets: */
+	&sysfs_available_buckets,
+	&sysfs_free_buckets,
+	&sysfs_dirty_buckets,
 	&sysfs_cached_buckets,
 	&sysfs_meta_buckets,
 	&sysfs_alloc_buckets,
-	&sysfs_has_data,
-	&sysfs_has_metadata,
-	&sysfs_discard,
-	&sysfs_written,
-	&sysfs_btree_written,
-	&sysfs_metadata_written,
-	&sysfs_cache_replacement_policy,
-	&sysfs_tier,
-	&sysfs_state_rw,
+
+	/* alloc info - other stats: */
+	&sysfs_read_priority_stats,
+	&sysfs_write_priority_stats,
+	&sysfs_fragmentation_stats,
+	&sysfs_oldest_gen_stats,
+	&sysfs_reserve_stats,
+
+	/* debug: */
 	&sysfs_alloc_debug,
 
 	sysfs_pd_controller_files(copy_gc),
-- 
2.39.2