// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "btree_iter.h"
#include "extents.h"
#include "fs-io.h"
#include "fs-io-pagecache.h"
#include "subvolume.h"

#include <linux/pagevec.h>
#include <linux/writeback.h>
int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
				     loff_t start, u64 end,
				     int fgp_flags, gfp_t gfp,
				     folios *folios)
{
	struct folio *f;
	u64 pos = start;
	int ret = 0;

	while (pos < end) {
		if ((u64) pos >= (u64) start + (1ULL << 20))
			fgp_flags &= ~FGP_CREAT;

		ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
		if (ret)
			break;

		f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
		if (IS_ERR_OR_NULL(f))
			break;

		BUG_ON(folios->nr && folio_pos(f) != pos);

		pos = folio_end_pos(f);
		darray_push(folios, f);
	}

	if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
		ret = -ENOMEM;

	return folios->nr ? 0 : ret;
}
/* pagecache_block must be held */
int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
					    loff_t start, loff_t end)
{
	int ret;

	/*
	 * XXX: the way this is currently implemented, we can spin if a process
	 * is continually redirtying a specific page
	 */
	do {
		if (!mapping->nrpages)
			return 0;

		ret = filemap_write_and_wait_range(mapping, start, end);
		if (ret)
			break;

		if (!mapping->nrpages)
			return 0;

		ret = invalidate_inode_pages2_range(mapping,
				start >> PAGE_SHIFT,
				end >> PAGE_SHIFT);
	} while (ret == -EBUSY);

	return ret;
}
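
/*
 * Hedged usage sketch (not part of this file): a direct-write style caller is
 * expected to hold the inode's pagecache_block lock across the shootdown so
 * that faults cannot repopulate the range while it is being invalidated.
 * example_dio_shootdown() is a hypothetical name; the lock helpers and
 * bch2_write_invalidate_inode_pages_range() are the real interfaces used here.
 */
static inline int example_dio_shootdown(struct bch_inode_info *inode,
					loff_t start, loff_t end)
{
	int ret;

	bch2_pagecache_block_get(inode);
	ret = bch2_write_invalidate_inode_pages_range(inode->v.i_mapping,
						      start, end);
	bch2_pagecache_block_put(inode);

	return ret;
}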
static const char * const bch2_folio_sector_states[] = {
#define x(n)	#n,
	BCH_FOLIO_SECTOR_STATE()
#undef x
	NULL
};

static inline enum bch_folio_sector_state
folio_sector_dirty(enum bch_folio_sector_state state)
{
	switch (state) {
	case SECTOR_unallocated:
		return SECTOR_dirty;
	case SECTOR_reserved:
		return SECTOR_dirty_reserved;
	default:
		return state;
	}
}

static inline enum bch_folio_sector_state
folio_sector_undirty(enum bch_folio_sector_state state)
{
	switch (state) {
	case SECTOR_dirty:
		return SECTOR_unallocated;
	case SECTOR_dirty_reserved:
		return SECTOR_reserved;
	default:
		return state;
	}
}

static inline enum bch_folio_sector_state
folio_sector_reserve(enum bch_folio_sector_state state)
{
	switch (state) {
	case SECTOR_unallocated:
		return SECTOR_reserved;
	case SECTOR_dirty:
		return SECTOR_dirty_reserved;
	default:
		return state;
	}
}
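
/*
 * Summary of the transitions implemented by the helpers above (any state not
 * listed is returned unchanged):
 *
 *	dirty:   unallocated -> dirty		reserved -> dirty_reserved
 *	undirty: dirty -> unallocated		dirty_reserved -> reserved
 *	reserve: unallocated -> reserved	dirty -> dirty_reserved
 */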
/* for newly allocated folios: */
struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
{
	struct bch_folio *s;

	s = kzalloc(sizeof(*s) +
		    sizeof(struct bch_folio_sector) *
		    folio_sectors(folio), gfp);
	if (!s)
		return NULL;

	spin_lock_init(&s->lock);
	folio_attach_private(folio, s);
	return s;
}

struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
{
	return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
}
static unsigned bkey_to_sector_state(struct bkey_s_c k)
{
	if (bkey_extent_is_reservation(k))
		return SECTOR_reserved;
	if (bkey_extent_is_allocation(k.k))
		return SECTOR_allocated;
	return SECTOR_unallocated;
}
static void __bch2_folio_set(struct folio *folio,
			     unsigned pg_offset, unsigned pg_len,
			     unsigned nr_ptrs, unsigned state)
{
	struct bch_folio *s = bch2_folio(folio);
	unsigned i, sectors = folio_sectors(folio);

	BUG_ON(pg_offset >= sectors);
	BUG_ON(pg_offset + pg_len > sectors);

	spin_lock(&s->lock);

	for (i = pg_offset; i < pg_offset + pg_len; i++) {
		s->s[i].nr_replicas = nr_ptrs;
		bch2_folio_sector_set(folio, s, i, state);
	}

	if (i == sectors)
		s->uptodate = true;

	spin_unlock(&s->lock);
}
/*
 * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
 * extents btree:
 */
int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
		   struct folio **folios, unsigned nr_folios)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_folio *s;
	u64 offset = folio_sector(folios[0]);
	unsigned folio_idx;
	u32 snapshot;
	bool need_set = false;
	int ret;

	for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
		s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
		if (!s)
			return -ENOMEM;

		need_set |= !s->uptodate;
	}

	if (!need_set)
		return 0;

	folio_idx = 0;
	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
				     SPOS(inum.inum, offset, snapshot),
				     BTREE_ITER_SLOTS, k, ret) {
		unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
		unsigned state = bkey_to_sector_state(k);

		while (folio_idx < nr_folios) {
			struct folio *folio = folios[folio_idx];
			u64 folio_start = folio_sector(folio);
			u64 folio_end = folio_end_sector(folio);
			unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
				folio_start;
			unsigned folio_len = min(k.k->p.offset, folio_end) -
				folio_offset - folio_start;

			BUG_ON(k.k->p.offset < folio_start);
			BUG_ON(bkey_start_offset(k.k) > folio_end);

			if (!bch2_folio(folio)->uptodate)
				__bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);

			if (k.k->p.offset < folio_end)
				break;

			folio_idx++;
		}

		if (folio_idx == nr_folios)
			break;
	}

	offset = iter.pos.offset;
	bch2_trans_iter_exit(&trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
	bch2_trans_exit(&trans);

	return ret;
}
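
/*
 * Hedged usage sketch (illustrative only; it mirrors the sequence in
 * bch2_page_mkwrite() further down): a writer first makes sure the folio's
 * bch_folio sector state is initialized from the extents btree, then takes a
 * disk/quota reservation against that state before dirtying the folio. The
 * helper name is hypothetical.
 */
static inline vm_fault_t example_prepare_folio_write(struct bch_fs *c,
						     struct bch_inode_info *inode,
						     struct folio *folio,
						     struct bch2_folio_reservation *res,
						     unsigned len)
{
	if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
	    bch2_folio_reservation_get(c, inode, folio, res, 0, len))
		return VM_FAULT_SIGBUS;

	bch2_set_folio_dirty(c, inode, folio, res, 0, len);
	return VM_FAULT_LOCKED;
}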
void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
{
	struct bvec_iter iter;
	struct folio_vec fv;
	unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
		? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
	unsigned state = bkey_to_sector_state(k);

	bio_for_each_folio(fv, bio, iter)
		__bch2_folio_set(fv.fv_folio,
				 fv.fv_offset >> 9,
				 fv.fv_len >> 9,
				 nr_ptrs, state);
}
void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
				     u64 start, u64 end)
{
	pgoff_t index = start >> PAGE_SECTORS_SHIFT;
	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
	struct folio_batch fbatch;
	unsigned i, j;

	if (end <= start)
		return;

	folio_batch_init(&fbatch);

	while (filemap_get_folios(inode->v.i_mapping,
				  &index, end_index, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			u64 folio_start = folio_sector(folio);
			u64 folio_end = folio_end_sector(folio);
			unsigned folio_offset = max(start, folio_start) - folio_start;
			unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
			struct bch_folio *s;

			BUG_ON(end <= folio_start);

			folio_lock(folio);
			s = bch2_folio(folio);

			if (s) {
				spin_lock(&s->lock);
				for (j = folio_offset; j < folio_offset + folio_len; j++)
					s->s[j].nr_replicas = 0;
				spin_unlock(&s->lock);
			}

			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}
void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
				  u64 start, u64 end)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	pgoff_t index = start >> PAGE_SECTORS_SHIFT;
	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
	struct folio_batch fbatch;
	s64 i_sectors_delta = 0;
	unsigned i, j;

	if (end <= start)
		return;

	folio_batch_init(&fbatch);

	while (filemap_get_folios(inode->v.i_mapping,
				  &index, end_index, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			u64 folio_start = folio_sector(folio);
			u64 folio_end = folio_end_sector(folio);
			unsigned folio_offset = max(start, folio_start) - folio_start;
			unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
			struct bch_folio *s;

			BUG_ON(end <= folio_start);

			folio_lock(folio);
			s = bch2_folio(folio);

			if (s) {
				spin_lock(&s->lock);
				for (j = folio_offset; j < folio_offset + folio_len; j++) {
					i_sectors_delta -= s->s[j].state == SECTOR_dirty;
					bch2_folio_sector_set(folio, s, j,
						folio_sector_reserve(s->s[j].state));
				}
				spin_unlock(&s->lock);
			}

			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
}
static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
					  unsigned nr_replicas)
{
	return max(0, (int) nr_replicas -
		   s->nr_replicas -
		   s->replicas_reserved);
}
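
/*
 * Worked example (illustrative): with a target of nr_replicas = 2, a sector
 * that already has nr_replicas = 1 on disk and replicas_reserved = 0 needs
 * max(0, 2 - 1 - 0) = 1 more sector of reservation; a sector already at or
 * above the target needs 0.
 */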
int bch2_get_folio_disk_reservation(struct bch_fs *c,
				struct bch_inode_info *inode,
				struct folio *folio, bool check_enospc)
{
	struct bch_folio *s = bch2_folio_create(folio, 0);
	unsigned nr_replicas = inode_nr_replicas(c, inode);
	struct disk_reservation disk_res = { 0 };
	unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
	int ret;

	if (!s)
		return -ENOMEM;

	for (i = 0; i < sectors; i++)
		disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);

	if (!disk_res_sectors)
		return 0;

	ret = bch2_disk_reservation_get(c, &disk_res,
					disk_res_sectors, 1,
					!check_enospc
					? BCH_DISK_RESERVATION_NOFAIL
					: 0);
	if (unlikely(ret))
		return ret;

	for (i = 0; i < sectors; i++)
		s->s[i].replicas_reserved +=
			sectors_to_reserve(&s->s[i], nr_replicas);

	return 0;
}
void bch2_folio_reservation_put(struct bch_fs *c,
			struct bch_inode_info *inode,
			struct bch2_folio_reservation *res)
{
	bch2_disk_reservation_put(c, &res->disk);
	bch2_quota_reservation_put(c, inode, &res->quota);
}
int bch2_folio_reservation_get(struct bch_fs *c,
			struct bch_inode_info *inode,
			struct folio *folio,
			struct bch2_folio_reservation *res,
			unsigned offset, unsigned len)
{
	struct bch_folio *s = bch2_folio_create(folio, 0);
	unsigned i, disk_sectors = 0, quota_sectors = 0;
	int ret;

	if (!s)
		return -ENOMEM;

	BUG_ON(!s->uptodate);

	for (i = round_down(offset, block_bytes(c)) >> 9;
	     i < round_up(offset + len, block_bytes(c)) >> 9;
	     i++) {
		disk_sectors += sectors_to_reserve(&s->s[i],
						res->disk.nr_replicas);
		quota_sectors += s->s[i].state == SECTOR_unallocated;
	}

	if (disk_sectors) {
		ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
		if (unlikely(ret))
			return ret;
	}

	if (quota_sectors) {
		ret = bch2_quota_reservation_add(c, inode, &res->quota,
						 quota_sectors, true);
		if (unlikely(ret)) {
			/* undo the disk reservation taken just above: */
			struct disk_reservation tmp = {
				.sectors = disk_sectors
			};

			bch2_disk_reservation_put(c, &tmp);
			res->disk.sectors -= disk_sectors;
			return ret;
		}
	}

	return 0;
}
static void bch2_clear_folio_bits(struct folio *folio)
{
	struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_folio *s = bch2_folio(folio);
	struct disk_reservation disk_res = { 0 };
	int i, sectors = folio_sectors(folio), dirty_sectors = 0;

	if (!s)
		return;

	EBUG_ON(!folio_test_locked(folio));
	EBUG_ON(folio_test_writeback(folio));

	for (i = 0; i < sectors; i++) {
		disk_res.sectors += s->s[i].replicas_reserved;
		s->s[i].replicas_reserved = 0;

		dirty_sectors -= s->s[i].state == SECTOR_dirty;
		bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
	}

	bch2_disk_reservation_put(c, &disk_res);

	bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);

	bch2_folio_release(folio);
}
void bch2_set_folio_dirty(struct bch_fs *c,
			  struct bch_inode_info *inode,
			  struct folio *folio,
			  struct bch2_folio_reservation *res,
			  unsigned offset, unsigned len)
{
	struct bch_folio *s = bch2_folio(folio);
	unsigned i, dirty_sectors = 0;

	WARN_ON((u64) folio_pos(folio) + offset + len >
		round_up((u64) i_size_read(&inode->v), block_bytes(c)));

	BUG_ON(!s->uptodate);

	spin_lock(&s->lock);

	for (i = round_down(offset, block_bytes(c)) >> 9;
	     i < round_up(offset + len, block_bytes(c)) >> 9;
	     i++) {
		unsigned sectors = sectors_to_reserve(&s->s[i],
						res->disk.nr_replicas);

		/*
		 * This can happen if we race with the error path in
		 * bch2_writepage_io_done():
		 */
		sectors = min_t(unsigned, sectors, res->disk.sectors);

		s->s[i].replicas_reserved += sectors;
		res->disk.sectors -= sectors;

		dirty_sectors += s->s[i].state == SECTOR_unallocated;

		bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
	}

	spin_unlock(&s->lock);

	bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);

	if (!folio_test_dirty(folio))
		filemap_dirty_folio(inode->v.i_mapping, folio);
}
vm_fault_t bch2_page_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct address_space *fdm = faults_disabled_mapping();
	struct bch_inode_info *inode = file_bch_inode(file);
	vm_fault_t ret;

	if (fdm == mapping)
		return VM_FAULT_SIGBUS;

	/* Lock ordering: */
	if (fdm > mapping) {
		struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);

		if (bch2_pagecache_add_tryget(inode))
			goto got_lock;

		bch2_pagecache_block_put(fdm_host);

		bch2_pagecache_add_get(inode);
		bch2_pagecache_add_put(inode);

		bch2_pagecache_block_get(fdm_host);

		/* Signal that lock has been dropped: */
		set_fdm_dropped_locks();
		return VM_FAULT_SIGBUS;
	}

	bch2_pagecache_add_get(inode);
got_lock:
	ret = filemap_fault(vmf);
	bch2_pagecache_add_put(inode);

	return ret;
}
vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct file *file = vmf->vma->vm_file;
	struct bch_inode_info *inode = file_bch_inode(file);
	struct address_space *mapping = file->f_mapping;
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_folio_reservation res;
	unsigned len;
	loff_t isize;
	vm_fault_t ret;

	bch2_folio_reservation_init(c, inode, &res);

	sb_start_pagefault(inode->v.i_sb);
	file_update_time(file);

	/*
	 * Not strictly necessary, but helps avoid dio writes livelocking in
	 * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
	 * a bch2_write_invalidate_inode_pages_range() that works without dropping
	 * page lock before invalidating page
	 */
	bch2_pagecache_add_get(inode);

	folio_lock(folio);
	isize = i_size_read(&inode->v);

	if (folio->mapping != mapping || folio_pos(folio) >= isize) {
		folio_unlock(folio);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));

	if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
	    bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
		folio_unlock(folio);
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
	bch2_folio_reservation_put(c, inode, &res);

	folio_wait_stable(folio);
	ret = VM_FAULT_LOCKED;
out:
	bch2_pagecache_add_put(inode);
	sb_end_pagefault(inode->v.i_sb);

	return ret;
}
void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
	if (offset || length < folio_size(folio))
		return;

	bch2_clear_folio_bits(folio);
}

bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
{
	if (folio_test_dirty(folio) || folio_test_writeback(folio))
		return false;

	bch2_clear_folio_bits(folio);
	return true;
}
static int folio_data_offset(struct folio *folio, loff_t pos,
			     unsigned min_replicas)
{
	struct bch_folio *s = bch2_folio(folio);
	unsigned i, sectors = folio_sectors(folio);

	if (s)
		for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
			if (s->s[i].state >= SECTOR_dirty &&
			    s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
				return i << SECTOR_SHIFT;

	return -1;
}
loff_t bch2_seek_pagecache_data(struct inode *vinode,
				loff_t start_offset,
				loff_t end_offset,
				unsigned min_replicas,
				bool nonblock)
{
	struct folio_batch fbatch;
	pgoff_t start_index = start_offset >> PAGE_SHIFT;
	pgoff_t end_index = end_offset >> PAGE_SHIFT;
	pgoff_t index = start_index;
	unsigned i;
	loff_t ret;
	int offset;

	folio_batch_init(&fbatch);

	while (filemap_get_folios(vinode->i_mapping,
				  &index, end_index, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			if (!nonblock) {
				folio_lock(folio);
			} else if (!folio_trylock(folio)) {
				folio_batch_release(&fbatch);
				return -EAGAIN;
			}

			offset = folio_data_offset(folio,
					max(folio_pos(folio), start_offset),
					min_replicas);
			if (offset >= 0) {
				ret = clamp(folio_pos(folio) + offset,
					    start_offset, end_offset);
				folio_unlock(folio);
				folio_batch_release(&fbatch);
				return ret;
			}
			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

	return end_offset;
}
/*
 * Search for a hole in a folio.
 *
 * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
 * code to indicate a pagecache hole exists at the returned offset. Otherwise
 * return 0 if the folio is filled with data, or an error code. This function
 * can return -EAGAIN if nonblock is specified.
 */
static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
			     unsigned min_replicas, bool nonblock)
{
	struct folio *folio;
	struct bch_folio *s;
	unsigned i, sectors;
	int ret = -ENOENT;

	folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
				    FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	s = bch2_folio(folio);
	if (!s)
		goto unlock;

	sectors = folio_sectors(folio);
	for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
		if (s->s[i].state < SECTOR_dirty ||
		    s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
			*offset = max(*offset,
				      folio_pos(folio) + (i << SECTOR_SHIFT));
			goto unlock;
		}

	*offset = folio_end_pos(folio);
	ret = 0;
unlock:
	folio_unlock(folio);
	folio_put(folio);
	return ret;
}
loff_t bch2_seek_pagecache_hole(struct inode *vinode,
				loff_t start_offset,
				loff_t end_offset,
				unsigned min_replicas,
				bool nonblock)
{
	struct address_space *mapping = vinode->i_mapping;
	loff_t offset = start_offset;
	int ret = 0;

	while (!ret && offset < end_offset)
		ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);

	if (ret && ret != -ENOENT)
		return ret;

	return min(offset, end_offset);
}
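
/*
 * Hedged usage sketch (illustrative; in the tree the SEEK_HOLE implementation
 * is the natural caller): scan the pagecache from `offset` for the first hole,
 * stopping at i_size if none is found earlier. min_replicas = 0 and
 * nonblock = false are the simplest arguments.
 */
static inline loff_t example_seek_pagecache_hole(struct inode *inode,
						 loff_t offset)
{
	return bch2_seek_pagecache_hole(inode, offset,
					i_size_read(inode), 0, false);
}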
int bch2_clamp_data_hole(struct inode *inode,
			 u64 *hole_start,
			 u64 *hole_end,
			 unsigned min_replicas,
			 bool nonblock)
{
	loff_t ret;

	ret = bch2_seek_pagecache_hole(inode,
		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
	if (ret < 0)
		return ret;

	*hole_start = ret;

	if (*hole_start == *hole_end)
		return 0;

	ret = bch2_seek_pagecache_data(inode,
		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
	if (ret < 0)
		return ret;

	*hole_end = ret;
	return 0;
}

#endif /* NO_BCACHEFS_FS */