#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
+#include "trace.h"
#include <linux/blkdev.h>
+#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
-#include <trace/events/bcachefs.h>
-
const char *bch2_blk_status_to_str(blk_status_t status)
{
	if (status == BLK_STS_REMOVED)
		return "device removed";
	return blk_status_to_str(status);
}
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	/* ... */
}

void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
{
	/* ... */
	__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
}
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ return false;
+}
+
+#endif
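/*
 * With CONFIG_BCACHEFS_NO_LATENCY_ACCT set, congestion tracking compiles
 * away entirely: bch2_target_congested() always reports false, so the IO
 * paths never reroute around a slow device.
 */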
+
/* Allocate, free from mempool: */
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
	/* ... */
}

static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
{
	struct page *page;

if (likely(!*using_mempool)) {
- page = alloc_page(GFP_NOIO);
+ page = alloc_page(GFP_NOFS);
if (unlikely(!page)) {
mutex_lock(&c->bio_bounce_pages_lock);
			*using_mempool = true;
			goto pool_alloc;
		}
} else {
pool_alloc:
- page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
}
	return page;
}
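/*
 * Allocation strategy above: try a plain alloc_page() first, and only
 * after a failure fall back to the bounce-page mempool, serialized by
 * bio_bounce_pages_lock.  GFP_NOIO becomes GFP_NOFS throughout this
 * patch: forbidding filesystem reclaim is sufficient here (note the
 * memalloc_nofs_save() in __bch2_write() below), and the weaker flag
 * gives the allocator more room to make progress.
 */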
bch2_trans_copy_iter(&iter, extent_iter);
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
+ for_each_btree_key_upto_continue_norestart(iter,
+ new->k.p, BTREE_ITER_SLOTS, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
bkey_start_offset(old.k));
(!new_compressed && bch2_bkey_sectors_compressed(old))))
*usage_increasing = true;
- if (bkey_cmp(old.k->p, new->k.p) >= 0)
+ if (bkey_ge(old.k->p, new->k.p))
break;
}
return ret;
}
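/*
 * The _upto iterator variants take an explicit end position, replacing
 * open-coded "peek, then compare against the end and break" loops; the
 * typed helpers (bkey_ge() etc.) likewise replace raw bkey_cmp()
 * comparisons, making the intent of each position check explicit.
 */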
-static int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- u64 new_i_size,
- s64 i_sectors_delta)
+static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ u64 new_i_size,
+ s64 i_sectors_delta)
{
struct btree_iter iter;
- struct bkey_s_c inode_k;
- struct bkey_s_c_inode_v3 inode;
- struct bkey_i_inode_v3 *new_inode;
+ struct bkey_i *k;
+ struct bkey_i_inode_v3 *inode;
+ unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
- SPOS(0,
- extent_iter->pos.inode,
- extent_iter->snapshot),
- BTREE_ITER_INTENT|BTREE_ITER_CACHED);
- inode_k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(inode_k);
+ k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+ SPOS(0,
+ extent_iter->pos.inode,
+ extent_iter->snapshot),
+ BTREE_ITER_CACHED);
+ ret = PTR_ERR_OR_ZERO(k);
if (unlikely(ret))
- goto err;
-
- ret = bkey_is_inode(inode_k.k) ? 0 : -ENOENT;
- if (unlikely(ret))
- goto err;
+ return ret;
- if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) {
- inode_k = bch2_inode_to_v3(trans, inode_k);
- ret = bkey_err(inode_k);
+ if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
+ k = bch2_inode_to_v3(trans, k);
+ ret = PTR_ERR_OR_ZERO(k);
if (unlikely(ret))
goto err;
}
- inode = bkey_s_c_to_inode_v3(inode_k);
-
- new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k));
- ret = PTR_ERR_OR_ZERO(new_inode);
- if (unlikely(ret))
- goto err;
-
- bkey_reassemble(&new_inode->k_i, inode.s_c);
+ inode = bkey_i_to_inode_v3(k);
- if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
- new_i_size > le64_to_cpu(inode.v->bi_size))
- new_inode->v.bi_size = cpu_to_le64(new_i_size);
+ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+ new_i_size > le64_to_cpu(inode->v.bi_size)) {
+ inode->v.bi_size = cpu_to_le64(new_i_size);
+ inode_update_flags = 0;
+ }
- le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
+ if (i_sectors_delta) {
+ le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+ inode_update_flags = 0;
+ }
- new_inode->k.p.snapshot = iter.snapshot;
+ if (inode->k.p.snapshot != iter.snapshot) {
+ inode->k.p.snapshot = iter.snapshot;
+ inode_update_flags = 0;
+ }
- ret = bch2_trans_update(trans, &iter, &new_inode->k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_trans_update(trans, &iter, &inode->k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ inode_update_flags);
err:
bch2_trans_iter_exit(trans, &iter);
	return ret;
}
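/*
 * inode_update_flags starts as BTREE_UPDATE_NOJOURNAL and is cleared the
 * moment anything real changes: i_size grows, bi_sectors moves, or the
 * key's snapshot needs fixing.  When nothing changed, the update is still
 * issued (keeping the cached inode key current) but isn't journalled, so
 * streaming writes stop burning journal space on identical inode updates.
 */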
struct open_buckets open_buckets;
struct bkey_s_c k;
struct bkey_buf old, new;
+ unsigned sectors_allocated;
bool have_reservation = false;
bool unwritten = opts.nocow &&
c->sb.version >= bcachefs_metadata_version_unwritten_extents;
closure_init_stack(&cl);
open_buckets.nr = 0;
retry:
+ sectors_allocated = 0;
+
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
&devs_have,
opts.data_replicas,
opts.data_replicas,
- RESERVE_none, 0, &cl, &wp);
- if (ret == -EAGAIN) {
+ BCH_WATERMARK_normal, 0, &cl, &wp);
+ if (ret) {
bch2_trans_unlock(trans);
closure_sync(&cl);
- goto retry;
- }
- if (ret)
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ goto retry;
return ret;
+ }
sectors = min(sectors, wp->sectors_free);
+ sectors_allocated = sectors;
bch2_key_resize(&e->k, sectors);
goto retry;
}
+ if (!ret && sectors_allocated)
+ bch2_increment_clock(c, sectors_allocated, WRITE);
+
bch2_open_buckets_put(c, &open_buckets);
bch2_disk_reservation_put(c, &disk_res);
bch2_bkey_buf_exit(&new, c);
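/*
 * sectors_allocated is reset on each retry, so only the attempt that
 * sticks is charged to the write IO clock via bch2_increment_clock().
 */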
bch2_btree_iter_set_snapshot(iter, snapshot);
- k = bch2_btree_iter_peek(iter);
- if (bkey_cmp(iter->pos, end_pos) >= 0) {
- bch2_btree_iter_set_pos(iter, end_pos);
+ /*
+ * peek_upto() doesn't have ideal semantics for extents:
+ */
+ k = bch2_btree_iter_peek_upto(iter, end_pos);
+ if (!k.k)
break;
- }
ret = bkey_err(k);
if (ret)
if (ret)
break;
- if (bkey_cmp(iter.pos, k->k.p) >= 0)
+ if (bkey_ge(iter.pos, k->k.p))
bch2_keylist_pop_front(&op->insert_keys);
else
bch2_cut_front(iter.pos, k);
if (to_entry(ptr + 1) < ptrs.end) {
n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
- GFP_NOIO, &ca->replica_set));
+ GFP_NOFS, &ca->replica_set));
n->bio.bi_end_io = wbio->bio.bi_end_io;
n->bio.bi_private = wbio->bio.bi_private;
bio_sectors(&n->bio));
bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
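	/*
	 * no_data_io is a testing mode: data bios are completed
	 * immediately instead of being submitted.  Btree IO is exempt,
	 * so the filesystem's own metadata still reaches disk.
	 */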
+ if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
+ bio_endio(&n->bio);
+ continue;
+ }
+
submit_bio(&n->bio);
} else {
n->bio.bi_status = BLK_STS_REMOVED;
struct bch_fs *c = op->c;
bch2_disk_reservation_put(c, &op->res);
- percpu_ref_put(&c->writes);
+ if (!(op->flags & BCH_WRITE_MOVE))
+ bch2_write_ref_put(c, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+ EBUG_ON(cl->parent);
closure_debug_destroy(cl);
if (op->end_io)
op->end_io(op);
}
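/*
 * c->writes is now a set of per-reason write refs (BCH_WRITE_REF_write,
 * BCH_WRITE_REF_promote, ...) rather than one percpu_ref, so it's
 * visible which path is still holding writes open if shutdown stalls.
 * Move writes never take BCH_WRITE_REF_write -- the tryget in
 * bch2_write() below is conditional in the same way -- so the put here
 * is skipped for them, keeping get/put balanced.
 */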
if (dst != src)
- memmove_u64s_down(dst, src, src->u64s);
+ memmove_u64s_down(dst, src, src->k.u64s);
dst = bkey_next(dst);
}
* particularly want to plumb io_opts all the way through the btree
* update stack right now
*/
- for_each_keylist_key(keys, k) {
+ for_each_keylist_key(keys, k)
bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
- if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k)))
- bch2_check_set_feature(op->c, BCH_FEATURE_incompressible);
-
- }
-
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
op->written += sectors_start - keylist_sectors(keys);
- if (ret) {
+ if (ret && !bch2_err_matches(ret, EROFS)) {
struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
bch_err_inum_offset_ratelimited(c,
k->k.p.inode, k->k.p.offset << 9,
"write error while doing btree update: %s",
bch2_err_str(ret));
- goto err;
}
+
+ if (ret)
+ goto err;
}
out:
	/* If a bucket wasn't written, we can't erasure code it: */
goto out;
}
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+ if (state != wp->state) {
+ u64 now = ktime_get_ns();
+
+ if (wp->last_state_change &&
+ time_after64(now, wp->last_state_change))
+ wp->time[wp->state] += now - wp->last_state_change;
+ wp->state = state;
+ wp->last_state_change = now;
+ }
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+ enum write_point_state state;
+
+ state = running ? WRITE_POINT_running :
+ !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+ : WRITE_POINT_stopped;
+
+ __wp_update_state(wp, state);
+}
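/*
 * Write points now track how long they spend in each state (running,
 * waiting on IO, waiting on the index-update worker, stopped) via the
 * two helpers above.  A minimal sketch of how the accumulated counters
 * could be consumed, e.g. for a sysfs report -- wp_state_time_ms() is
 * hypothetical, not part of this patch:
 */
static inline u64 wp_state_time_ms(struct write_point *wp,
				   enum write_point_state state)
{
	u64 t = wp->time[state];

	/* include time accrued in the current state since the last change */
	if (wp->state == state && wp->last_state_change)
		t += ktime_get_ns() - wp->last_state_change;

	return t / NSEC_PER_MSEC;
}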
+
static void bch2_write_index(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct write_point *wp = op->wp;
struct workqueue_struct *wq = index_update_wq(op);
+ unsigned long flags;
+
+ if ((op->flags & BCH_WRITE_DONE) &&
+ (op->flags & BCH_WRITE_MOVE))
+ bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
+
+ spin_lock_irqsave(&wp->writes_lock, flags);
+ if (wp->state == WRITE_POINT_waiting_io)
+ __wp_update_state(wp, WRITE_POINT_waiting_work);
+ list_add_tail(&op->wp_list, &wp->writes);
+	spin_unlock_irqrestore(&wp->writes_lock, flags);
- barrier();
- op->btree_update_ready = true;
queue_work(wq, &wp->index_update_work);
}
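/*
 * The old handoff -- a barrier() plus an op->btree_update_ready flag
 * polled by the worker -- is replaced by queueing the op on wp->writes
 * under writes_lock.  The irqsave variants are used because this can run
 * from bio completion (IRQ) context.
 */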
+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
+{
+ op->wp = wp;
+
+ if (wp->state == WRITE_POINT_stopped) {
+ spin_lock_irq(&wp->writes_lock);
+ __wp_update_state(wp, WRITE_POINT_waiting_io);
+ spin_unlock_irq(&wp->writes_lock);
+ }
+}
+
void bch2_write_point_do_index_updates(struct work_struct *work)
{
struct write_point *wp =
struct bch_write_op *op;
while (1) {
- spin_lock(&wp->writes_lock);
+ spin_lock_irq(&wp->writes_lock);
op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
- if (op && !op->btree_update_ready)
- op = NULL;
if (op)
list_del(&op->wp_list);
- spin_unlock(&wp->writes_lock);
+ wp_update_state(wp, op != NULL);
+ spin_unlock_irq(&wp->writes_lock);
if (!op)
break;
+ op->flags |= BCH_WRITE_IN_WORKER;
+
__bch2_write_index(op);
if (!(op->flags & BCH_WRITE_DONE))
if (wbio->put_bio)
bio_put(bio);
- if (parent) {
+ if (parent)
bio_endio(&parent->bio);
- return;
- }
-
- closure_put(cl);
+ else
+ closure_put(cl);
}
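/*
 * Replica bios are cloned off one write bio; each clone's endio
 * propagates completion to its parent, and only the topmost bio drops
 * the write op's closure ref.
 */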
static void init_append_extent(struct bch_write_op *op,
			       struct write_point *wp,
			       struct bversion version,
struct bch_extent_crc_unpacked crc)
{
- struct bch_fs *c = op->c;
struct bkey_i_extent *e;
op->pos.offset += crc.uncompressed_size;
crc.nonce)
bch2_extent_crc_append(&e->k_i, crc);
- bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size,
+ bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
op->flags & BCH_WRITE_CACHED);
bch2_keylist_push(&op->insert_keys);
pages = min(pages, BIO_MAX_VECS);
bio = bio_alloc_bioset(NULL, pages, 0,
- GFP_NOIO, &c->bio_write);
+ GFP_NOFS, &c->bio_write);
wbio = wbio_init(bio);
wbio->put_bio = true;
/* copy WRITE_SYNC flag */
/* Can we just write the entire extent as is? */
if (op->crc.uncompressed_size == op->crc.live_size &&
op->crc.compressed_size <= wp->sectors_free &&
- (op->crc.compression_type == op->compression_type ||
+ (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
if (!crc_is_compressed(op->crc) &&
op->csum_type != op->crc.csum_type &&
- bch2_write_rechecksum(c, op, op->csum_type))
+ bch2_write_rechecksum(c, op, op->csum_type) &&
+ !c->opts.no_data_io)
return PREP_ENCODED_CHECKSUM_ERR;
return PREP_ENCODED_DO_WRITE;
csum = bch2_checksum_bio(c, op->crc.csum_type,
extent_nonce(op->version, op->crc),
bio);
- if (bch2_crc_cmp(op->crc.csum, csum))
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
return PREP_ENCODED_CHECKSUM_ERR;
if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
*/
if ((op->crc.live_size != op->crc.uncompressed_size ||
op->crc.csum_type != op->csum_type) &&
- bch2_write_rechecksum(c, op, op->csum_type))
+ bch2_write_rechecksum(c, op, op->csum_type) &&
+ !c->opts.no_data_io)
return PREP_ENCODED_CHECKSUM_ERR;
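/*
 * The !c->opts.no_data_io tests above share one rationale: in that mode
 * no data was ever written, so reads return stale bytes and checksum or
 * rechecksum failures are expected rather than a sign of corruption.
 */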
/*
* If we want to compress the data, it has to be decrypted:
*/
- if ((op->compression_type ||
+ if ((op->compression_opt ||
bch2_csum_type_is_encryption(op->crc.csum_type) !=
bch2_csum_type_is_encryption(op->csum_type)) &&
bch2_write_decrypt(op))
}
if (ec_buf ||
- op->compression_type ||
+ op->compression_opt ||
(op->csum_type &&
!(op->flags & BCH_WRITE_PAGES_STABLE)) ||
(bch2_csum_type_is_encryption(op->csum_type) &&
dst->bi_iter.bi_size < c->opts.encoded_extent_max)
break;
- BUG_ON(op->compression_type &&
+ BUG_ON(op->compression_opt &&
(op->flags & BCH_WRITE_DATA_ENCODED) &&
bch2_csum_type_is_encryption(op->crc.csum_type));
- BUG_ON(op->compression_type && !bounce);
+ BUG_ON(op->compression_opt && !bounce);
crc.compression_type = op->incompressible
? BCH_COMPRESSION_TYPE_incompressible
- : op->compression_type
+ : op->compression_opt
? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
- op->compression_type)
+ op->compression_opt)
: 0;
if (!crc_is_compressed(crc)) {
dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
BUG_ON(total_output != total_input);
dst = bio_split(src, total_input >> 9,
- GFP_NOIO, &c->bio_write);
+ GFP_NOFS, &c->bio_write);
wbio_init(dst)->put_bio = true;
/* copy WRITE_SYNC flag */
dst->bi_opf = src->bi_opf;
return 0;
}
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
- bkey_reassemble(new, k);
-
bch2_cut_front(bkey_start_pos(&orig->k), new);
bch2_cut_back(orig->k.p, new);
bch2_trans_init(&trans, c, 0, 0);
for_each_keylist_key(&op->insert_keys, orig) {
- ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
- bkey_start_pos(&orig->k),
+ ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents,
+ bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_INTENT, k,
NULL, NULL, BTREE_INSERT_NOFAIL, ({
- if (bkey_cmp(bkey_start_pos(k.k), orig->k.p) >= 0)
- break;
-
bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
}));
- if (ret) {
+ if (ret && !bch2_err_matches(ret, EROFS)) {
struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
bch_err_inum_offset_ratelimited(c,
k->k.p.inode, k->k.p.offset << 9,
"write error while doing btree update: %s",
bch2_err_str(ret));
+ }
+
+ if (ret) {
op->error = ret;
break;
}
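/*
 * An EROFS-class error here means the filesystem went read-only mid
 * write (e.g. emergency shutdown); it still terminates the op via
 * op->error, but isn't worth logging as a write error.
 */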
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr, *ptr2;
+ const struct bch_extent_ptr *ptr;
+ struct {
+ struct bpos b;
+ unsigned gen;
+ struct nocow_lock_bucket *l;
+ } buckets[BCH_REPLICAS_MAX];
+ unsigned nr_buckets = 0;
u32 snapshot;
- int ret;
+ int ret, i;
if (op->flags & BCH_WRITE_MOVE)
return;
while (1) {
struct bio *bio = &op->wbio.bio;
+ nr_buckets = 0;
+
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
/* Get iorefs before dropping btree locks: */
ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr)
+ bkey_for_each_ptr(ptrs, ptr) {
+ buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
+ buckets[nr_buckets].gen = ptr->gen;
+ buckets[nr_buckets].l =
+ bucket_nocow_lock(&c->nocow_locks,
+ bucket_to_u64(buckets[nr_buckets].b));
+
+ prefetch(buckets[nr_buckets].l);
+
if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
goto err_get_ioref;
+ nr_buckets++;
+
+ if (ptr->unwritten)
+ op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ }
+
/* Unlock before taking nocow locks, doing IO: */
bkey_reassemble(op->insert_keys.top, k);
bch2_trans_unlock(&trans);
bch2_cut_front(op->pos, op->insert_keys.top);
- bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+ if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+ bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
- ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(op->insert_keys.top));
- bkey_for_each_ptr(ptrs, ptr) {
- bch2_bucket_nocow_lock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
- if (unlikely(ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
- goto err_bucket_stale;
+ for (i = 0; i < nr_buckets; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
+ struct nocow_lock_bucket *l = buckets[i].l;
+ bool stale;
- if (ptr->unwritten)
- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ __bch2_bucket_nocow_lock(&c->nocow_locks, l,
+ bucket_to_u64(buckets[i].b),
+ BUCKET_NOCOW_LOCK_UPDATE);
+
+ rcu_read_lock();
+ stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
+ rcu_read_unlock();
+
+ if (unlikely(stale))
+ goto err_bucket_stale;
}
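/*
 * Two-phase nocow locking: while btree locks are still held, record each
 * pointer's bucket, generation and nocow-lock address (prefetching the
 * lock cacheline) and take device iorefs; then, after dropping btree
 * locks, take the nocow locks and re-check bucket gens.  A gen that has
 * advanced means the bucket was reused since we saw the key, so the
 * write must be retried.
 */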
bio = &op->wbio.bio;
}
return;
err_get_ioref:
- bkey_for_each_ptr(ptrs, ptr2) {
- if (ptr2 == ptr)
- break;
-
- percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
- }
+ for (i = 0; i < nr_buckets; i++)
+ percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
/* Fall back to COW path: */
goto out;
err_bucket_stale:
- bkey_for_each_ptr(ptrs, ptr2) {
+ while (--i >= 0)
bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr2),
+ buckets[i].b,
BUCKET_NOCOW_LOCK_UPDATE);
- if (ptr2 == ptr)
- break;
- }
-
- bkey_for_each_ptr(ptrs, ptr2)
- percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
+ for (i = 0; i < nr_buckets; i++)
+ percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
/* We can retry this: */
- ret = BCH_ERR_transaction_restart;
+ ret = -BCH_ERR_transaction_restart;
goto out;
}
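/*
 * Unwind ordering matters in the error paths above: err_bucket_stale
 * unlocks only the buckets [0, i) that were actually locked before
 * dropping every ioref, and returns -BCH_ERR_transaction_restart
 * (bcachefs error codes are negative) so the write is retried from the
 * top rather than failed.
 */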
nofs_flags = memalloc_nofs_save();
- if (unlikely(op->opts.nocow)) {
+ if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
bch2_nocow_write(op);
if (op->flags & BCH_WRITE_DONE)
goto out_nofs_restore;
}
again:
memset(&op->failed, 0, sizeof(op->failed));
- op->btree_update_ready = false;
do {
struct bkey_i *key_to_write;
&op->devs_have,
op->nr_replicas,
op->nr_replicas_required,
- op->alloc_reserve,
+ op->watermark,
op->flags,
(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
BCH_WRITE_ONLY_SPECIFIED_DEVS))
? NULL : &op->cl, &wp));
if (unlikely(ret)) {
- if (unlikely(ret != -EAGAIN)) {
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
- }
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ break;
- break;
+ goto err;
}
+ EBUG_ON(!wp);
+
bch2_open_bucket_get(c, wp, &op->open_buckets);
ret = bch2_write_extent(op, wp, &bio);
- bch2_alloc_sectors_done(c, wp);
-
- if (ret < 0) {
- op->error = ret;
+ bch2_alloc_sectors_done_inlined(c, wp);
+err:
+ if (ret <= 0) {
op->flags |= BCH_WRITE_DONE;
- break;
- }
- if (!ret)
- op->flags |= BCH_WRITE_DONE;
+ if (ret < 0) {
+ op->error = ret;
+ break;
+ }
+ }
bio->bi_end_io = bch2_write_endio;
bio->bi_private = &op->cl;
* synchronously here if we weren't able to submit all of the IO at
* once, as that signals backpressure to the caller.
*/
- if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) {
+ if ((op->flags & BCH_WRITE_SYNC) ||
+ (!(op->flags & BCH_WRITE_DONE) &&
+ !(op->flags & BCH_WRITE_IN_WORKER))) {
closure_sync(&op->cl);
__bch2_write_index(op);
goto again;
bch2_write_done(&op->cl);
} else {
- spin_lock(&wp->writes_lock);
- op->wp = wp;
- list_add_tail(&op->wp_list, &wp->writes);
- spin_unlock(&wp->writes_lock);
-
+ bch2_write_queue(op, wp);
continue_at(&op->cl, bch2_write_index, NULL);
}
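/*
 * Completion strategy: BCH_WRITE_SYNC ops, and ops that couldn't submit
 * everything in one pass and aren't already running in the worker,
 * finish their index updates inline -- the synchronous wait is the
 * backpressure signal to the caller.  Everything else is queued on the
 * write point and indexed from bch2_write_point_do_index_updates().
 */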
out_nofs_restore:
unsigned sectors;
int ret;
+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+ op->flags |= BCH_WRITE_DONE;
+
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
set_bkey_val_bytes(&id->k, data_len);
bch2_keylist_push(&op->insert_keys);
- op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
- op->flags |= BCH_WRITE_DONE;
-
__bch2_write_index(op);
err:
bch2_write_done(&op->cl);
EBUG_ON(op->cl.parent);
BUG_ON(!op->nr_replicas);
BUG_ON(!op->write_point.v);
- BUG_ON(!bkey_cmp(op->pos, POS_MAX));
+ BUG_ON(bkey_eq(op->pos, POS_MAX));
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
goto err;
}
- if (c->opts.nochanges ||
- !percpu_ref_tryget_live(&c->writes)) {
- op->error = -EROFS;
+ if (c->opts.nochanges) {
+ op->error = -BCH_ERR_erofs_no_writes;
+ goto err;
+ }
+
+ if (!(op->flags & BCH_WRITE_MOVE) &&
+ !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
+ op->error = -BCH_ERR_erofs_no_writes;
goto err;
}
op->end_io(op);
}
+static const char * const bch2_write_flags[] = {
+#define x(f) #f,
+ BCH_WRITE_FLAGS()
+#undef x
+ NULL
+};
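/*
 * BCH_WRITE_FLAGS() is an x-macro list; stamping it out with x(f)
 * defined as #f yields a NULL-terminated array of flag names (e.g.
 * "ALLOC_NOWAIT", "CACHED") that prt_bitflags() below uses to decode
 * op->flags.
 */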
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+ prt_str(out, "pos: ");
+ bch2_bpos_to_text(out, op->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "started: ");
+ bch2_pr_time_units(out, local_clock() - op->start_time);
+ prt_newline(out);
+
+ prt_str(out, "flags: ");
+ prt_bitflags(out, bch2_write_flags, op->flags);
+ prt_newline(out);
+
+ prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
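/*
 * bch2_write_op_to_text() gives debug output a view of each in-flight
 * write: position, elapsed time, decoded flags, and the closure's
 * outstanding ref count.
 */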
+
/* Cache promotion on read */
struct promote_op {
	/* ... */
};

static void promote_free(struct bch_fs *c, struct promote_op *op)
{
int ret;
+ bch2_data_update_exit(&op->write);
+
ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params);
BUG_ON(ret);
- percpu_ref_put(&c->writes);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu);
}
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
-
- bch2_data_update_exit(&op->write);
promote_free(c, op);
}
bch2_data_update_read_done(&op->write, rbio->pick.crc);
}
-static struct promote_op *__promote_alloc(struct bch_fs *c,
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_s_c k,
struct bpos pos,
unsigned sectors,
struct bch_read_bio **rbio)
{
+ struct bch_fs *c = trans->c;
struct promote_op *op = NULL;
struct bio *bio;
unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
int ret;
- if (!percpu_ref_tryget_live(&c->writes))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return NULL;
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
if (!op)
goto err;
*/
*rbio = kzalloc(sizeof(struct bch_read_bio) +
sizeof(struct bio_vec) * pages,
- GFP_NOIO);
+ GFP_NOFS);
if (!*rbio)
goto err;
bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
- GFP_NOIO))
+ GFP_NOFS))
goto err;
(*rbio)->bounce = true;
bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
- ret = bch2_data_update_init(c, &op->write,
+ ret = bch2_data_update_init(trans, NULL, &op->write,
writepoint_hashed((unsigned long) current),
opts,
(struct data_update_opts) {
.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
},
btree_id, k);
- BUG_ON(ret);
+ /*
+ * possible errors: -BCH_ERR_nocow_lock_blocked,
+ * -BCH_ERR_ENOSPC_disk_reservation:
+ */
+ if (ret) {
+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+ goto err;
+ }
+
op->write.op.end_io = promote_done;
return op;
kfree(*rbio);
*rbio = NULL;
kfree(op);
- percpu_ref_put(&c->writes);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
return NULL;
}
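/*
 * Promotion is strictly best-effort: if bch2_data_update_init() can't
 * get a nocow lock or a disk reservation, the half-constructed op is
 * unhashed and freed, and the read simply proceeds without promoting.
 */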
noinline
-static struct promote_op *promote_alloc(struct bch_fs *c,
- struct bvec_iter iter,
- struct bkey_s_c k,
- struct extent_ptr_decoded *pick,
- struct bch_io_opts opts,
- unsigned flags,
- struct bch_read_bio **rbio,
- bool *bounce,
- bool *read_full)
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+ struct bvec_iter iter,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded *pick,
+ struct bch_io_opts opts,
+ unsigned flags,
+ struct bch_read_bio **rbio,
+ bool *bounce,
+ bool *read_full)
{
+ struct bch_fs *c = trans->c;
bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
if (!should_promote(c, k, pos, opts, flags))
return NULL;
- promote = __promote_alloc(c,
+ promote = __promote_alloc(trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
if (crc_is_compressed(rbio->pick.crc))
return 0;
- bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
if ((ret = bkey_err(k)))
goto out;
}
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
goto csum_err;
/*
if (ret)
goto decrypt_err;
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+ !c->opts.no_data_io)
goto decompression_err;
} else {
/* don't need to decrypt the entire bio: */
reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
*offset_into_extent;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
- POS(0, reflink_offset),
- BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+ POS(0, reflink_offset), 0);
ret = bkey_err(k);
if (ret)
goto err;
}
if (orig->opts.promote_target)
- promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+ promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
&rbio, &bounce, &read_full);
if (!read_full) {
rbio = rbio_init(bio_alloc_bioset(NULL,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
0,
- GFP_NOIO,
+ GFP_NOFS,
&c->bio_read_split),
orig->opts);
* from the whole bio, in which case we don't want to retry and
* lose the error)
*/
- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
+ rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
&c->bio_read_split),
orig->opts);
rbio->bio.bi_iter = iter;
bio_sectors(&rbio->bio));
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
- if (likely(!(flags & BCH_READ_IN_RETRY)))
- submit_bio(&rbio->bio);
- else
- submit_bio_wait(&rbio->bio);
+ if (unlikely(c->opts.no_data_io)) {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ bio_endio(&rbio->bio);
+ } else {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ submit_bio(&rbio->bio);
+ else
+ submit_bio_wait(&rbio->bio);
+ }
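	/*
	 * In no_data_io mode reads never touch the device: non-retry
	 * reads are completed immediately with whatever is in the
	 * (unwritten) buffer, matching the write-side shortcut earlier
	 * in this patch.
	 */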
+
+ /*
+ * We just submitted IO which may block, we expect relock fail
+ * events and shouldn't count them:
+ */
+ trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(c, rbio)) {
int bch2_fs_io_init(struct bch_fs *c)
{
- unsigned i;
+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_init;
- for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++)
- two_state_lock_init(&c->nocow_locks.l[i]);
+ if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_split_init;
- if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
- BIOSET_NEED_BVECS) ||
- mempool_init_page_pool(&c->bio_bounce_pages,
+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_write_init;
+
+ if (mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
c->opts.encoded_extent_max) /
- PAGE_SIZE, 0) ||
- rhashtable_init(&c->promote_table, &bch_promote_params))
- return -ENOMEM;
+ PAGE_SIZE, 0))
+ return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+ if (rhashtable_init(&c->promote_table, &bch_promote_params))
+ return -BCH_ERR_ENOMEM_promote_table_init;
return 0;
}
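/*
 * Each allocation in bch2_fs_io_init() now fails with its own
 * BCH_ERR_ENOMEM_* code rather than a shared -ENOMEM, so an init
 * failure identifies exactly which allocation fell over.
 */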