#include "super.h"
#include "super-io.h"
#include "sysfs.h"
+#include "counters.h"
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);
/*
 * Release callback for the "counters" kobject; the body is empty, so no
 * work is done when the kobject is released.
 *
 * The kobject this serves is referenced as &c->counters_kobj, i.e. it is a
 * member of the filesystem object rather than a separate allocation.
 * NOTE(review): presumably that is why nothing needs freeing here, and
 * presumably KTYPE(bch2_fs_counters) is what wires this function up as the
 * release hook - confirm against the KTYPE() macro definition.
 */
+static void bch2_fs_counters_release(struct kobject *k)
+{
+}
/*
 * Release callback for the "internal" kobject; the body is empty, so no
 * work is done when the kobject is released.
 *
 * The kobject this serves is referenced as &c->internal, i.e. it is a member
 * of the filesystem object rather than a separate allocation.
 * NOTE(review): presumably KTYPE(bch2_fs_internal) is what wires this
 * function up as the release hook - confirm against the KTYPE() macro.
 */
static void bch2_fs_internal_release(struct kobject *k)
{
}
/*
 * kobject types for the filesystem object and for the kobjects registered
 * beneath it ("internal", "options", "time_stats", "counters").
 *
 * NOTE(review): KTYPE() is defined elsewhere; it presumably expands to a
 * <name>_ktype object tied to the matching <name>_release function (the
 * *_ktype names are what get passed to kobject_init()) - confirm against the
 * macro definition.
 */
static KTYPE(bch2_fs);
+static KTYPE(bch2_fs_counters);
static KTYPE(bch2_fs_internal);
static KTYPE(bch2_fs_opts_dir);
static KTYPE(bch2_fs_time_stats);
{
struct bch_dev *ca;
unsigned i, clean_passes = 0;
+ u64 seq = 0;
bch2_rebalance_stop(c);
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
- /*
- * Flush journal before stopping allocators, because flushing journal
- * blacklist entries involves allocating new btree nodes:
- */
- bch2_journal_flush_all_pins(&c->journal);
-
- /*
- * If the allocator threads didn't all start up, the btree updates to
- * write out alloc info aren't going to work:
- */
- if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
- goto nowrote_alloc;
-
bch_verbose(c, "flushing journal and stopping allocators");
- bch2_journal_flush_all_pins(&c->journal);
- set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
-
do {
clean_passes++;
- if (bch2_journal_flush_all_pins(&c->journal))
- clean_passes = 0;
-
- /*
- * In flight interior btree updates will generate more journal
- * updates and btree updates (alloc btree):
- */
- if (bch2_btree_interior_updates_nr_pending(c)) {
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+ if (bch2_btree_interior_updates_flush(c) ||
+ bch2_journal_flush_all_pins(&c->journal) ||
+ bch2_btree_flush_all_writes(c) ||
+ seq != atomic64_read(&c->journal.seq)) {
+ seq = atomic64_read(&c->journal.seq);
clean_passes = 0;
}
- flush_work(&c->btree_interior_update_work);
-
- if (bch2_journal_flush_all_pins(&c->journal))
- clean_passes = 0;
} while (clean_passes < 2);
- bch_verbose(c, "flushing journal and stopping allocators complete");
-
- set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
-nowrote_alloc:
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
- flush_work(&c->btree_interior_update_work);
-
- for_each_member_device(ca, c, i)
- bch2_dev_allocator_stop(ca);
- clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
- clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
+ bch_verbose(c, "flushing journal and stopping allocators complete");
+ if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
bch2_fs_journal_stop(&c->journal);
- /*
- * the journal kicks off btree writes via reclaim - wait for in flight
- * writes after stopping journal:
- */
- bch2_btree_flush_all_writes(c);
-
/*
* After stopping journal:
*/
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch2_dev_allocator_stop()).
*/
percpu_ref_kill(&c->writes);
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
test_bit(BCH_FS_STARTED, &c->flags) &&
- test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) &&
+ test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
!c->opts.norecovery) {
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
if (ret)
goto err;
- clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
+ clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- for_each_rw_member(ca, c, i) {
- ret = bch2_dev_allocator_start(ca);
- if (ret) {
- bch_err(c, "error starting allocator threads");
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
- }
-
- set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-
- for_each_rw_member(ca, c, i)
- bch2_wake_allocator(ca);
+ bch2_do_discards(c);
if (!early) {
ret = bch2_fs_read_write_late(c);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(&c->journal_entries);
+ bch2_journal_entries_free(c);
percpu_free_rwsem(&c->mark_lock);
if (c->btree_paths_bufs)
bch2_fs_debug_exit(c);
bch2_fs_chardev_exit(c);
+ kobject_put(&c->counters_kobj);
kobject_put(&c->time_stats);
kobject_put(&c->opts_dir);
kobject_put(&c->internal);
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+ kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) {
bch_err(c, "error creating sysfs objects");
kobject_init(&c->internal, &bch2_fs_internal_ktype);
kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+ kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
c->minor = -1;
c->disk_sb.fs_sb = true;
bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);
bch2_fs_quota_init(c);
+ bch2_fs_ec_init_early(c);
INIT_LIST_HEAD(&c->list);
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
- INIT_LIST_HEAD(&c->journal_entries);
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_errors);
bch2_fs_encryption_init(c) ?:
bch2_fs_compress_init(c) ?:
bch2_fs_ec_init(c) ?:
- bch2_fs_fsio_init(c);
+ bch2_fs_fsio_init(c) ?:
+ bch2_fs_counters_init(c);
if (ret)
goto err;
- if (c->opts.nochanges)
- set_bit(JOURNAL_NOCHANGES, &c->journal.flags);
-
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
/*
 * Log, at info level, the options filesystem @c was mounted with.
 *
 * The option string is accumulated in a printbuf: "ro" is written first when
 * c->opts.read_only is set, and a ',' is written before every entry after the
 * first; the remaining entries are produced by bch2_opt_to_text() in
 * OPT_SHOW_MOUNT_STYLE. If nothing was written (p.pos is still zero) the
 * string "(null)" is substituted. The result is logged together with the
 * bch2_metadata_versions[] entry for c->sb.version, and the printbuf is then
 * released with printbuf_exit().
 */
static void print_mount_opts(struct bch_fs *c)
{
enum bch_opt_id i;
- char buf[512];
- struct printbuf p = PBUF(buf);
+ struct printbuf p = PRINTBUF;
bool first = true;
- strcpy(buf, "(null)");
-
if (c->opts.read_only) {
pr_buf(&p, "ro");
first = false;
if (!first)
pr_buf(&p, ",");
first = false;
- bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE);
+ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
}
- bch_info(c, "mounted with opts: %s", buf);
+ if (!p.pos)
+ pr_buf(&p, "(null)");
+
+ bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf);
+ printbuf_exit(&p);
}
int bch2_fs_start(struct bch_fs *c)
set_bit(BCH_FS_STARTED, &c->flags);
- /*
- * Allocator threads don't start filling copygc reserve until after we
- * set BCH_FS_STARTED - wake them now:
- *
- * XXX ugly hack:
- * Need to set ca->allocator_state here instead of relying on the
- * allocator threads to do it to avoid racing with the copygc threads
- * checking it and thinking they have no alloc reserve:
- */
- for_each_online_member(ca, c, i) {
- ca->allocator_state = ALLOCATOR_running;
- bch2_wake_allocator(ca);
- }
-
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
static void bch2_dev_free(struct bch_dev *ca)
{
- bch2_dev_allocator_stop(ca);
-
cancel_work_sync(&ca->io_error_work);
if (ca->kobj.state_in_sysfs &&
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
- if (opt_defined(c->opts, discard))
- ca->mi.discard = opt_get(c->opts, discard);
+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / btree_sectors(c));
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
ca->fs = c;
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- bch2_dev_allocator_start(ca)) {
- bch2_dev_free(ca);
- goto err;
- }
-
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
- bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca);
bch2_dev_journal_stop(&c->journal, ca);
bch2_copygc_start(c);
}
/*
 * Bring device @ca into service on filesystem @c: register it through
 * bch2_dev_allocator_add() and then recompute capacity through
 * bch2_recalc_capacity().
 *
 * The caller must hold c->state_lock; this is checked with
 * lockdep_assert_held().
 *
 * The only fallible step was the bch2_dev_allocator_start() call, so with
 * that call gone the function returns void.
 */
-static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
-
- return bch2_dev_allocator_start(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
mutex_unlock(&c->sb_lock);
if (new_state == BCH_MEMBER_STATE_rw)
- ret = __bch2_dev_read_write(c, ca);
+ __bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
/*
 * Delete the per-device allocation metadata for @ca from filesystem @c.
 *
 * Calls bch2_btree_delete_range() over POS(ca->dev_idx, 0) ..
 * POS(ca->dev_idx, U64_MAX) on four btrees, in this order: lru,
 * need_discard, freespace, alloc. Each call is passed BTREE_TRIGGER_NORUN.
 * The calls are chained with the GNU "?:" operator, so the first non-zero
 * return stops the chain and the remaining btrees are left untouched.
 *
 * Returns 0 on success, otherwise the first non-zero value returned by
 * bch2_btree_delete_range(); that error is also logged with bch_err().
 */
static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- struct btree_trans trans;
- size_t i;
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
- for (i = 0; i < ca->mi.nbuckets; i++) {
- ret = lockrestart_do(&trans,
- bch2_btree_key_cache_flush(&trans,
- BTREE_ID_alloc, POS(ca->dev_idx, i)));
- if (ret)
- break;
- }
- bch2_trans_exit(&trans);
-
- if (ret) {
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_NORUN, NULL);
+ if (ret)
bch_err(c, "error %i removing dev alloc info", ret);
- return ret;
- }
- return bch2_btree_delete_range(c, BTREE_ID_alloc,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx + 1, 0),
- 0, NULL);
+ return ret;
}
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
goto err;
}
- ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
+ ret = bch2_dev_remove_alloc(c, ca);
if (ret) {
- bch_err(ca, "Remove failed: error %i flushing journal", ret);
+ bch_err(ca, "Remove failed, error deleting alloc info");
goto err;
}
- ret = bch2_dev_remove_alloc(c, ca);
+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
if (ret) {
- bch_err(ca, "Remove failed, error deleting alloc info");
+ bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;
}
- /*
- * must flush all existing journal entries, they might have
- * (overwritten) keys that point to the device we're removing:
- */
- bch2_journal_flush_all_pins(&c->journal);
- /*
- * hack to ensure bch2_replicas_gc2() clears out entries to this device
- */
- bch2_journal_meta(&c->journal);
- ret = bch2_journal_error(&c->journal);
+ ret = bch2_journal_flush(&c->journal);
if (ret) {
bch_err(ca, "Remove failed, journal error");
goto err;
data = bch2_dev_has_data(c, ca);
if (data) {
- char data_has_str[100];
+ struct printbuf data_has = PRINTBUF;
- bch2_flags_to_text(&PBUF(data_has_str),
- bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ bch2_flags_to_text(&data_has, bch2_data_types, data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+ printbuf_exit(&data_has);
ret = -EBUSY;
goto err;
}
struct bch_sb_field_members *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
- char *_errbuf;
- struct printbuf errbuf;
+ struct printbuf errbuf = PRINTBUF;
int ret;
- _errbuf = kmalloc(4096, GFP_KERNEL);
- if (!_errbuf)
- return -ENOMEM;
-
- errbuf = _PBUF(_errbuf, 4096);
-
ret = bch2_read_super(path, &opts, &sb);
if (ret) {
bch_err(c, "device add error: error reading super: %i", ret);
goto err;
}
+ bch2_dev_usage_init(ca);
+
ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret) {
bch2_dev_free(ca);
goto err_late;
}
+ ret = bch2_fs_freespace_init(c);
+ if (ret) {
+ bch_err(c, "device add error: error initializing free space: %i", ret);
+ goto err_late;
+ }
+
ca->new_fs_bucket_idx = 0;
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret) {
- bch_err(c, "device add error: error going RW on new device: %i", ret);
- goto err_late;
- }
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return 0;
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
- kfree(_errbuf);
+ printbuf_exit(&errbuf);
return ret;
err_late:
up_write(&c->state_lock);
goto err;
}
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret)
- goto err;
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
struct bch_sb_field_members *mi;
unsigned i, best_sb = 0;
const char *err;
- char *_errbuf = NULL;
- struct printbuf errbuf;
+ struct printbuf errbuf = PRINTBUF;
int ret = 0;
if (!try_module_get(THIS_MODULE))
goto err;
}
- _errbuf = kmalloc(4096, GFP_KERNEL);
- if (!_errbuf) {
- ret = -ENOMEM;
- goto err;
- }
-
- errbuf = _PBUF(_errbuf, 4096);
-
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
if (!sb) {
ret = -ENOMEM;
}
out:
kfree(sb);
- kfree(_errbuf);
+ printbuf_exit(&errbuf);
module_put(THIS_MODULE);
pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
return c;