+ * this is a _really_ horrible hack just to avoid an atomic sub at the
+ * end:
+ */
+ if (!sync) {
+ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_RUNNING +
+ CLOSURE_DESTRUCTOR);
+ } else {
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER + 1);
+ }
+
+ dio->req = req;
+ dio->ret = ret;
+ /*
+ * This is one of the sketchier things I've encountered: we have to skip
+ * the dirtying of requests that are internal from the kernel (i.e. from
+ * loopback), because we'll deadlock on page_lock.
+ */
+ dio->should_dirty = iter_is_iovec(iter);
+
+ goto start;
+ while (iter->count) {
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
+ &c->bio_read);
+ bio->bi_end_io = bch2_direct_IO_read_split_endio;
+start:
+ bio->bi_opf = REQ_OP_READ|REQ_SYNC;
+ bio->bi_iter.bi_sector = offset >> 9;
+ bio->bi_private = dio;
+
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (ret < 0) {
+ /* XXX: fault inject this path */
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ break;
+ }
+
+ offset += bio->bi_iter.bi_size;
+
+ if (dio->should_dirty)
+ bio_set_pages_dirty(bio);
+
+ if (iter->count)
+ closure_get(&dio->cl);
+
+ bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+ }
+
+ iter->count += shorten;
+
+ if (sync) {
+ closure_sync(&dio->cl);
+ closure_debug_destroy(&dio->cl);
+ ret = dio->ret;
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+ return ret;
+ } else {
+ return -EIOCBQUEUED;
+ }
+}
+
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct address_space *mapping = file->f_mapping;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+
+ if (!count)
+ return 0; /* skip atime */
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct blk_plug plug;
+
+ if (unlikely(mapping->nrpages)) {
+ ret = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (ret < 0)
+ goto out;
+ }
+
+ file_accessed(file);
+
+ blk_start_plug(&plug);
+ ret = bch2_direct_IO_read(iocb, iter);
+ blk_finish_plug(&plug);
+
+ if (ret >= 0)
+ iocb->ki_pos += ret;
+ } else {
+ bch2_pagecache_add_get(inode);
+ ret = generic_file_read_iter(iocb, iter);
+ bch2_pagecache_add_put(inode);
+ }
+out:
+ return bch2_err_class(ret);
+}
+
+/* O_DIRECT writes */
+
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 size,
+ unsigned nr_replicas, bool compressed)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 end = offset + size;
+ u32 snapshot;
+ bool ret = true;
+ int err;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (err)
+ goto err;
+
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, err) {
+ if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
+ break;
+
+ if (k.k->p.snapshot != snapshot ||
+ nr_replicas > bch2_bkey_replicas(c, k) ||
+ (!compressed && bch2_bkey_sectors_compressed(k))) {
+ ret = false;
+ break;
+ }
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (bch2_err_matches(err, BCH_ERR_transaction_restart))
+ goto retry;
+ bch2_trans_exit(&trans);
+
+ return err ? false : ret;
+}
+
+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
+
+ return bch2_check_range_allocated(c, inode_inum(inode),
+ dio->op.pos.offset, bio_sectors(bio),
+ dio->op.opts.data_replicas,
+ dio->op.opts.compression != 0);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *);
+static __always_inline long bch2_dio_write_done(struct dio_write *dio);
+
+/*
+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
+ * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
+ * caller's stack, we're not guaranteed that it will live for the duration of
+ * the IO:
+ */
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+ struct iovec *iov = dio->inline_vecs;
+
+ /*
+ * iov_iter has a single embedded iovec - nothing to do:
+ */
+ if (iter_is_ubuf(&dio->iter))
+ return 0;
+
+ /*
+ * We don't currently handle non-iovec iov_iters here - return an error,
+ * and we'll fall back to doing the IO synchronously: