#include "keylist.h"
#include "quota.h"
#include "reflink.h"
+#include "trace.h"
#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/writeback.h>
-#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
+struct folio_vec { /* a byte range within a single folio */
+ struct folio *fv_folio; /* folio containing the range */
+ size_t fv_offset; /* start of the range, in bytes from the start of fv_folio */
+ size_t fv_len; /* length of the range in bytes */
+};
+
+/*
+ * Translate a bio_vec into a folio_vec describing the same memory: the folio
+ * that contains bv.bv_page, the byte offset of the segment from the start of
+ * that folio, and the segment length clamped so it stops at the folio's end.
+ */
+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
+{
+	struct folio_vec fv = {
+		.fv_folio	= page_folio(bv.bv_page),
+	};
+
+	/* index of bv_page within the folio, in bytes, plus the in-page offset */
+	fv.fv_offset = bv.bv_offset +
+		(folio_page_idx(fv.fv_folio, bv.bv_page) << PAGE_SHIFT);
+
+	/* never report more than what is left of this folio */
+	fv.fv_len = min_t(size_t, folio_size(fv.fv_folio) - fv.fv_offset,
+			  bv.bv_len);
+
+	return fv;
+}
+
+/*
+ * Folio-based counterpart of bio_iter_iovec(): fetch the bio_vec that @iter
+ * currently designates in @bio and return it converted to a folio_vec.
+ */
+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
+						    struct bvec_iter iter)
+{
+	struct bio_vec bv = bio_iter_iovec(bio, iter);
+
+	return biovec_to_foliovec(bv);
+}
+
+/*
+ * Iterate over @bio one folio_vec at a time, beginning at the iterator value
+ * @from. Before each pass of the loop body @fv is loaded with the folio_vec
+ * found at @iter; afterwards @iter is stepped forward by that folio_vec's
+ * length. The loop ends once @iter has no bytes remaining (bi_size == 0).
+ */
+#define __bio_for_each_folio(fv, bio, iter, from)			\
+	for (iter = (from);						\
+	     (iter).bi_size &&						\
+		((fv = bio_iter_iovec_folio((bio), (iter))), 1);	\
+	     bio_advance_iter_single((bio), &(iter), (fv).fv_len))
+
+/**
+ * bio_for_each_folio - iterate over folios within a bio
+ * @bvl: struct folio_vec that is assigned the current folio segment before
+ *	each pass through the loop body
+ * @bio: the bio to walk
+ * @iter: struct bvec_iter used as the loop cursor; it is initialized from
+ *	(bio)->bi_iter and advanced by the length of each segment
+ *
+ * Like other non-_all versions, this iterates over what bio->bi_iter currently
+ * points to. This version is for drivers, where the bio may have previously
+ * been split or cloned.
+ */
+#define bio_for_each_folio(bvl, bio, iter) \
+ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
+
/*
* Use u64 for the end pos and sector helpers because if the folio covers the
* max supported range of the mapping, the start offset of the next folio
break;
f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
- if (!f)
+ if (IS_ERR_OR_NULL(f))
break;
BUG_ON(folios->nr && folio_pos(f) != pos);
{
int ret;
+ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
+ return 0;
+
mutex_lock(&inode->ei_quota_lock);
ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
inode->v.i_blocks += sectors;
#ifdef CONFIG_BCACHEFS_QUOTA
- if (quota_res && sectors > 0) {
+ if (quota_res &&
+ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
+ sectors > 0) {
BUG_ON(sectors > quota_res->sectors);
BUG_ON(sectors > inode->ei_quota_reserved);
#undef x
};
-const char * const bch2_folio_sector_states[] = {
+static const char * const bch2_folio_sector_states[] = {
#define x(n) #n,
BCH_FOLIO_SECTOR_STATE()
#undef x
s->s[i].state = n;
}
+/*
+ * Map a file position to the index of the bch_folio_sector in @folio that
+ * covers it. @pos must lie inside @folio; anything else trips the BUG_ON().
+ */
+static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
+{
+	loff_t folio_start = folio_pos(folio);
+
+	BUG_ON(pos < folio_start || pos >= folio_end_pos(folio));
+
+	/* byte offset into the folio, converted to a sector count */
+	return (u64) (pos - folio_start) >> SECTOR_SHIFT;
+}
+
static inline struct bch_folio *__bch2_folio(struct folio *folio)
{
return folio_has_private(folio)
s = kzalloc(sizeof(*s) +
sizeof(struct bch_folio_sector) *
- folio_sectors(folio), GFP_NOFS|gfp);
+ folio_sectors(folio), gfp);
if (!s)
return NULL;
unsigned pg_offset, unsigned pg_len,
unsigned nr_ptrs, unsigned state)
{
- struct bch_folio *s = bch2_folio_create(folio, __GFP_NOFAIL);
+ struct bch_folio *s = bch2_folio(folio);
unsigned i, sectors = folio_sectors(folio);
BUG_ON(pg_offset >= sectors);
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
+ struct bch_folio *s;
u64 offset = folio_sector(folios[0]);
- unsigned folio_idx = 0;
+ unsigned folio_idx;
u32 snapshot;
+ bool need_set = false;
int ret;
+ for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+ s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ need_set |= !s->uptodate;
+ }
+
+ if (!need_set)
+ return 0;
+
+ folio_idx = 0;
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
BUG_ON(k.k->p.offset < folio_start);
BUG_ON(bkey_start_offset(k.k) > folio_end);
- if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate)
+ if (!bch2_folio(folio)->uptodate)
__bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
if (k.k->p.offset < folio_end)
struct address_space *mapping = file->f_mapping;
struct address_space *fdm = faults_disabled_mapping();
struct bch_inode_info *inode = file_bch_inode(file);
- int ret;
+ vm_fault_t ret;
if (fdm == mapping)
return VM_FAULT_SIGBUS;
struct bch2_folio_reservation res;
unsigned len;
loff_t isize;
- int ret;
+ vm_fault_t ret;
bch2_folio_reservation_init(c, inode, &res);
len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
- if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) {
- if (bch2_folio_set(c, inode_inum(inode), &folio, 1)) {
- folio_unlock(folio);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
- }
-
- if (bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+ if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
+ bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
folio_unlock(folio);
ret = VM_FAULT_SIGBUS;
goto out;
static void bch2_readpages_end_io(struct bio *bio)
{
- struct bvec_iter_all iter;
- struct folio_vec fv;
+ struct folio_iter fi;
- bio_for_each_folio_all(fv, bio, iter) {
+ bio_for_each_folio_all(fi, bio) {
if (!bio->bi_status) {
- folio_mark_uptodate(fv.fv_folio);
+ folio_mark_uptodate(fi.folio);
} else {
- folio_clear_uptodate(fv.fv_folio);
- folio_set_error(fv.fv_folio);
+ folio_clear_uptodate(fi.folio);
+ folio_set_error(fi.folio);
}
- folio_unlock(fv.fv_folio);
+ folio_unlock(fi.folio);
}
bio_put(bio);
darray_for_each(iter->folios, fi) {
ractl->_nr_pages -= 1U << folio_order(*fi);
- __bch2_folio_create(*fi, __GFP_NOFAIL);
+ __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
folio_put(*fi);
folio_put(*fi);
}
return false;
}
-static void readpage_bio_extend(struct readpages_iter *iter,
- struct bio *bio,
- unsigned sectors_this_extent,
- bool get_more)
+static int readpage_bio_extend(struct btree_trans *trans,
+ struct readpages_iter *iter,
+ struct bio *bio,
+ unsigned sectors_this_extent,
+ bool get_more)
{
+ /* Don't hold btree locks while allocating memory: */
+ bch2_trans_unlock(trans);
+
while (bio_sectors(bio) < sectors_this_extent &&
bio->bi_vcnt < bio->bi_max_vecs) {
struct folio *folio = readpage_iter_peek(iter);
if (!folio)
break;
- if (!__bch2_folio_create(folio, 0)) {
+ if (!__bch2_folio_create(folio, GFP_KERNEL)) {
folio_put(folio);
break;
}
- ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_NOFS);
+ ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
if (ret) {
__bch2_folio_release(folio);
folio_put(folio);
BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
}
+
+ return bch2_trans_relock(trans);
}
static void bchfs_read(struct btree_trans *trans,
sectors = min(sectors, k.k->size - offset_into_extent);
- if (readpages_iter)
- readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
- extent_partial_reads_expensive(k));
+ if (readpages_iter) {
+ ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
+ extent_partial_reads_expensive(k));
+ if (ret)
+ break;
+ }
bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
swap(rbio->bio.bi_iter.bi_size, bytes);
BIO_MAX_VECS);
struct bch_read_bio *rbio =
rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
- GFP_NOFS, &c->bio_read),
+ GFP_KERNEL, &c->bio_read),
opts);
readpage_iter_advance(&readpages_iter);
bchfs_read(&trans, rbio, inode_inum(inode),
&readpages_iter);
+ bch2_trans_unlock(&trans);
}
bch2_pagecache_add_put(inode);
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
- rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read),
+ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
opts);
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
container_of(op, struct bch_writepage_io, op);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
- struct bvec_iter_all iter;
- struct folio_vec fv;
+ struct folio_iter fi;
unsigned i;
if (io->op.error) {
set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
- bio_for_each_folio_all(fv, bio, iter) {
+ bio_for_each_folio_all(fi, bio) {
struct bch_folio *s;
- folio_set_error(fv.fv_folio);
- mapping_set_error(fv.fv_folio->mapping, -EIO);
+ folio_set_error(fi.folio);
+ mapping_set_error(fi.folio->mapping, -EIO);
- s = __bch2_folio(fv.fv_folio);
+ s = __bch2_folio(fi.folio);
spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fv.fv_folio); i++)
+ for (i = 0; i < folio_sectors(fi.folio); i++)
s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_folio_all(fv, bio, iter) {
+ bio_for_each_folio_all(fi, bio) {
struct bch_folio *s;
- s = __bch2_folio(fv.fv_folio);
+ s = __bch2_folio(fi.folio);
spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fv.fv_folio); i++)
+ for (i = 0; i < folio_sectors(fi.folio); i++)
s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_folio_all(fv, bio, iter) {
- struct bch_folio *s = __bch2_folio(fv.fv_folio);
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s = __bch2_folio(fi.folio);
if (atomic_dec_and_test(&s->write_count))
- folio_end_writeback(fv.fv_folio);
+ folio_end_writeback(fi.folio);
}
bio_put(&io->op.wbio.bio);
w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
REQ_OP_WRITE,
- GFP_NOFS,
+ GFP_KERNEL,
&c->writepage_bioset),
struct bch_writepage_io, op.wbio.bio);
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
}
-static int __bch2_writepage(struct page *_page,
+static int __bch2_writepage(struct folio *folio,
struct writeback_control *wbc,
void *data)
{
- struct folio *folio = page_folio(_page);
struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_writepage_state *w = data;
folio_size(folio));
do_io:
f_sectors = folio_sectors(folio);
- s = bch2_folio_create(folio, __GFP_NOFAIL);
+ s = bch2_folio(folio);
if (f_sectors > w->tmp_sectors) {
kfree(w->tmp);
folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
mapping_gfp_mask(mapping));
- if (!folio)
+ if (IS_ERR_OR_NULL(folio))
goto err_unlock;
if (folio_test_uptodate(folio))
if (ret)
goto err;
out:
- if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) {
- ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
- if (ret)
- goto err;
- }
+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+ if (ret)
+ goto err;
ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
if (ret) {
}
}
+ ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr);
+ if (ret)
+ goto out;
+
f_pos = pos;
f_offset = pos - folio_pos(darray_first(folios));
darray_for_each(folios, fi) {
struct folio *f = *fi;
u64 f_len = min(end, folio_end_pos(f)) - f_pos;
- if (!bch2_folio_create(f, __GFP_NOFAIL)->uptodate) {
- ret = bch2_folio_set(c, inode_inum(inode), fi,
- folios.data + folios.nr - fi);
- if (ret)
- goto out;
- }
-
/*
* XXX: per POSIX and fstests generic/275, on -ENOSPC we're
* supposed to write as much as we have disk space for.
static void bch2_dio_write_loop_async(struct bch_write_op *);
static __always_inline long bch2_dio_write_done(struct dio_write *dio);
+/*
+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
+ * iov_iter yet, so we need to stash a copy of the iovec. It might be on the
+ * caller's stack, and we're not guaranteed that it will live for the duration
+ * of the IO.
+ */
static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
{
struct iovec *iov = dio->inline_vecs;
+ /*
+ * iov_iter has a single embedded iovec - nothing to do:
+ */
+ if (iter_is_ubuf(&dio->iter))
+ return 0;
+
+ /*
+ * We don't currently handle non-iovec iov_iters here - return an error,
+ * and we'll fall back to doing the IO synchronously:
+ */
+ if (!iter_is_iovec(&dio->iter))
+ return -1;
+
if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
GFP_KERNEL);
dio->free_iov = true;
}
- memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
- dio->iter.iov = iov;
+ memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
+ dio->iter.__iov = iov;
return 0;
}
bch2_pagecache_block_put(inode);
if (dio->free_iov)
- kfree(dio->iter.iov);
+ kfree(dio->iter.__iov);
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
mutex_unlock(&inode->ei_quota_lock);
}
- if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) {
- struct bvec_iter_all iter;
- struct folio_vec fv;
-
- bio_for_each_folio_all(fv, bio, iter)
- folio_put(fv.fv_folio);
- }
+ bio_release_pages(bio, false);
if (unlikely(dio->op.error))
set_bit(EI_INODE_ERROR, &inode->ei_flags);
err:
dio->op.error = ret;
- if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
- struct bvec_iter_all iter;
- struct folio_vec fv;
-
- bio_for_each_folio_all(fv, bio, iter)
- folio_put(fv.fv_folio);
- }
+ bio_release_pages(bio, false);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
goto out;
u64 end_pos;
folio = filemap_lock_folio(mapping, index);
- if (!folio) {
+ if (IS_ERR_OR_NULL(folio)) {
/*
* XXX: we're doing two index lookups when we end up reading the
* folio
folio = __filemap_get_folio(mapping, index,
FGP_LOCK|FGP_CREAT, GFP_KERNEL);
- if (unlikely(!folio)) {
+ if (unlikely(IS_ERR_OR_NULL(folio))) {
ret = -ENOMEM;
goto out;
}
goto unlock;
}
- if (!s->uptodate) {
- ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
- if (ret)
- goto unlock;
- }
+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+ if (ret)
+ goto unlock;
for (i = round_up(start_offset, block_bytes(c)) >> 9;
i < round_down(end_offset, block_bytes(c)) >> 9;
end_pos = folio_end_pos(folio);
if (inode->v.i_size > folio_pos(folio))
end_pos = min_t(u64, inode->v.i_size, end_pos);
- ret = s->s[(end_pos - folio_pos(folio) - 1) >> 9].state >= SECTOR_dirty;
+ ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
folio_zero_segment(folio, start_offset, end_offset);
return ret;
}
-static int bch2_extend(struct user_namespace *mnt_userns,
+static int bch2_extend(struct mnt_idmap *idmap,
struct bch_inode_info *inode,
struct bch_inode_unpacked *inode_u,
struct iattr *iattr)
truncate_setsize(&inode->v, iattr->ia_size);
- return bch2_setattr_nonsize(mnt_userns, inode, iattr);
+ return bch2_setattr_nonsize(idmap, inode, iattr);
}
static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
return 0;
}
-int bch2_truncate(struct user_namespace *mnt_userns,
+int bch2_truncate(struct mnt_idmap *idmap,
struct bch_inode_info *inode, struct iattr *iattr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
(u64) inode->v.i_size, inode_u.bi_size);
if (iattr->ia_size > inode->v.i_size) {
- ret = bch2_extend(mnt_userns, inode, &inode_u, iattr);
+ ret = bch2_extend(idmap, inode, &inode_u, iattr);
goto err;
}
ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
mutex_unlock(&inode->ei_update_lock);
- ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
+ ret = bch2_setattr_nonsize(idmap, inode, iattr);
err:
bch2_pagecache_block_put(inode);
return bch2_err_class(ret);
/* fseek: */
-static int folio_data_offset(struct folio *folio, unsigned offset)
+static int folio_data_offset(struct folio *folio, loff_t pos)
{
struct bch_folio *s = bch2_folio(folio);
unsigned i, sectors = folio_sectors(folio);
if (s)
- for (i = offset >> 9; i < sectors; i++)
+ for (i = folio_pos_to_s(folio, pos); i < sectors; i++) /* @pos: file offset */
if (s->s[i].state >= SECTOR_dirty)
- return i << 9;
+ return i << SECTOR_SHIFT; /* byte offset from start of @folio */
return -1;
}
folio_lock(folio);
offset = folio_data_offset(folio,
- max(folio_pos(folio), start_offset) -
- folio_pos(folio));
+ max(folio_pos(folio), start_offset));
if (offset >= 0) {
ret = clamp(folio_pos(folio) + offset,
start_offset, end_offset);
{
struct folio *folio;
struct bch_folio *s;
- unsigned i, sectors, f_offset;
+ unsigned i, sectors;
bool ret = true;
folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT);
- if (!folio)
+ if (IS_ERR_OR_NULL(folio))
return true;
s = bch2_folio(folio);
goto unlock;
sectors = folio_sectors(folio);
- f_offset = *offset - folio_pos(folio);
-
- for (i = f_offset >> 9; i < sectors; i++)
+ for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
if (s->s[i].state < SECTOR_dirty) {
- *offset = max(*offset, folio_pos(folio) + (i << 9));
+ *offset = max(*offset,
+ folio_pos(folio) + (i << SECTOR_SHIFT));
goto unlock;
}
int bch2_fs_fsio_init(struct bch_fs *c)
{
- int ret = 0;
-
- pr_verbose_init(c->opts, "");
-
if (bioset_init(&c->writepage_bioset,
4, offsetof(struct bch_writepage_io, op.wbio.bio),
BIOSET_NEED_BVECS))
1, offsetof(struct nocow_flush, bio), 0))
return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
- pr_verbose_init(c->opts, "ret %i", ret);
- return ret;
+ return 0;
}
#endif /* NO_BCACHEFS_FS */