#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
+#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "chardev.h"
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
- bool wrote;
+ bool wrote = false;
unsigned i, clean_passes = 0;
int ret;
*/
bch2_journal_flush_all_pins(&c->journal);
+ /*
+ * If the allocator threads didn't all start up, the btree updates to
+ * write out alloc info aren't going to work:
+ */
if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
- goto allocator_not_running;
+ goto nowrote_alloc;
- do {
- wrote = false;
+ bch_verbose(c, "writing alloc info");
+ /*
+ * This should normally just be writing the bucket read/write clocks:
+ */
+ ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
+ bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
+ bch_verbose(c, "writing alloc info complete");
- ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
- bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
+ if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
- if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
+ if (ret)
+ goto nowrote_alloc;
- if (ret)
- break;
+ bch_verbose(c, "flushing journal and stopping allocators");
- for_each_member_device(ca, c, i)
- bch2_dev_allocator_quiesce(c, ca);
+ bch2_journal_flush_all_pins(&c->journal);
+ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
- bch2_journal_flush_all_pins(&c->journal);
+ do {
+ clean_passes++;
+
+ if (bch2_journal_flush_all_pins(&c->journal))
+ clean_passes = 0;
/*
- * We need to explicitly wait on btree interior updates to complete
- * before stopping the journal, flushing all journal pins isn't
- * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
- * interior updates have to drop their journal pin before they're
- * fully complete:
+ * In flight interior btree updates will generate more journal
+ * updates and btree updates (alloc btree):
*/
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+ if (bch2_btree_interior_updates_nr_pending(c)) {
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
+ clean_passes = 0;
+ }
+ flush_work(&c->btree_interior_update_work);
- clean_passes = wrote ? 0 : clean_passes + 1;
+ if (bch2_journal_flush_all_pins(&c->journal))
+ clean_passes = 0;
} while (clean_passes < 2);
-allocator_not_running:
+ bch_verbose(c, "flushing journal and stopping allocators complete");
+
+ set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
+nowrote_alloc:
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
+ flush_work(&c->btree_interior_update_work);
+
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);
clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
bch2_fs_journal_stop(&c->journal);
- /* XXX: mark super that alloc info is persistent */
-
/*
* the journal kicks off btree writes via reclaim - wait for in flight
* writes after stopping journal:
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
test_bit(BCH_FS_STARTED, &c->flags) &&
- !c->opts.norecovery)
+ test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) &&
+ !c->opts.norecovery) {
+ bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
+ }
clear_bit(BCH_FS_RW, &c->flags);
}
struct bch_fs *c =
container_of(work, struct bch_fs, read_only_work);
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
bch2_fs_read_only(c);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
}
static void bch2_fs_read_only_async(struct bch_fs *c)
if (ret)
goto err;
+ clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
+
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- if (!test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) {
- ret = bch2_fs_allocator_start(c);
- if (ret) {
- bch_err(c, "error initializing allocator");
- goto err;
- }
-
- set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
- }
-
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_io_exit(c);
+ bch2_fs_btree_interior_update_exit(c);
bch2_fs_btree_iter_exit(c);
+ bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
- mempool_exit(&c->btree_interior_update_pool);
- mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
percpu_ref_exit(&c->writes);
kfree(c->replicas.entries);
cancel_work_sync(&c->journal_seq_blacklist_gc_work);
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
bch2_fs_read_only(c);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
bch2_opts_create_sysfs_files(&c->opts_dir))
return "error creating sysfs objects";
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
err = "error creating sysfs objects";
__for_each_member_device(ca, c, i, NULL)
list_add(&c->list, &bch_fs_list);
err = NULL;
err:
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return err;
}
c->minor = -1;
c->disk_sb.fs_sb = true;
- mutex_init(&c->state_lock);
+ init_rwsem(&c->state_lock);
mutex_init(&c->sb_lock);
mutex_init(&c->replicas_gc_lock);
mutex_init(&c->btree_root_lock);
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_allocator_background_init(c);
bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);
INIT_LIST_HEAD(&c->list);
- INIT_LIST_HEAD(&c->btree_interior_update_list);
- INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
- mutex_init(&c->btree_reserve_cache_lock);
- mutex_init(&c->btree_interior_update_lock);
-
mutex_init(&c->usage_scratch_lock);
mutex_init(&c->bio_bounce_pages_lock);
seqcount_init(&c->usage_lock);
+ sema_init(&c->io_in_flight, 64);
+
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
c->promote_whole_extents = true;
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
- mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
- sizeof(struct btree_reserve)) ||
- mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
- sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_bio, 1,
max(offsetof(struct btree_read_bio, bio),
bch2_fs_journal_init(&c->journal) ||
bch2_fs_replicas_init(c) ||
bch2_fs_btree_cache_init(c) ||
+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
bch2_fs_btree_iter_init(c) ||
+ bch2_fs_btree_interior_update_init(c) ||
bch2_fs_io_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
unsigned i;
int ret = -EINVAL;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
print_mount_opts(c);
ret = 0;
out:
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return ret;
err:
switch (ret) {
{
int ret;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
ret = __bch2_dev_set_state(c, ca, new_state, flags);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return ret;
}
/* Device add/removal: */
+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct btree_trans trans;
+ size_t i;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for (i = 0; i < ca->mi.nbuckets; i++) {
+ ret = bch2_btree_key_cache_flush(&trans,
+ BTREE_ID_ALLOC, POS(ca->dev_idx, i));
+ if (ret)
+ break;
+ }
+ bch2_trans_exit(&trans);
+
+ if (ret)
+ return ret;
+
+ return bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx + 1, 0),
+ NULL);
+}
+
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_sb_field_members *mi;
unsigned dev_idx = ca->dev_idx, data;
int ret = -EINVAL;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
/*
* We consume a reference to ca->ref, regardless of whether we succeed
goto err;
}
- ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx + 1, 0),
- NULL);
+ ret = bch2_dev_remove_alloc(c, ca);
if (ret) {
bch_err(ca, "Remove failed, error deleting alloc info");
goto err;
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_RW &&
!percpu_ref_is_zero(&ca->io_ref))
__bch2_dev_read_write(c, ca);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return ret;
}
dev_usage_clear(ca);
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
err = "insufficient space in new superblock";
goto err_late;
}
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
err_unlock:
mutex_unlock(&c->sb_lock);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
err:
if (ca)
bch2_dev_free(ca);
const char *err;
int ret;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
ret = bch2_read_super(path, &opts, &sb);
if (ret) {
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return ret;
}
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
err:
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
bch2_free_super(&sb);
bch_err(c, "error bringing %s online: %s", path, err);
return -EINVAL;
int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
{
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
if (!bch2_dev_is_online(ca)) {
bch_err(ca, "Already offline");
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
}
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
bch_err(ca, "Cannot offline required disk");
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return -EINVAL;
}
__bch2_dev_offline(c, ca);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
}
struct bch_member *mi;
int ret = 0;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
if (nbuckets < ca->mi.nbuckets) {
bch_err(ca, "Cannot shrink yet");
bch2_recalc_capacity(c);
err:
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return ret;
}
goto err;
err = "bch2_dev_online() error";
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
for (i = 0; i < nr_devices; i++)
if (bch2_dev_attach_bdev(c, &sb[i])) {
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
goto err_print;
}
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
err = "insufficient devices";
if (!bch2_fs_may_start(c))