1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "alloc_foreground.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "clock.h"
10 #include "error.h"
11 #include "extents.h"
12 #include "extent_update.h"
13 #include "fs.h"
14 #include "fs-io.h"
15 #include "fsck.h"
16 #include "inode.h"
17 #include "journal.h"
18 #include "io.h"
19 #include "keylist.h"
20 #include "quota.h"
21 #include "reflink.h"
22 #include "trace.h"
23
24 #include <linux/aio.h>
25 #include <linux/backing-dev.h>
26 #include <linux/falloc.h>
27 #include <linux/migrate.h>
28 #include <linux/mmu_context.h>
29 #include <linux/pagevec.h>
30 #include <linux/rmap.h>
31 #include <linux/sched/signal.h>
32 #include <linux/task_io_accounting_ops.h>
33 #include <linux/uio.h>
34 #include <linux/writeback.h>
35
36 #include <trace/events/writeback.h>
37
38 /*
39  * Use u64 for the end pos and sector helpers because if the folio covers the
40  * max supported range of the mapping, the start offset of the next folio
41  * overflows loff_t. This breaks much of the range-based processing in the
42  * buffered write path.
43  */
44 static inline u64 folio_end_pos(struct folio *folio)
45 {
46         return folio_pos(folio) + folio_size(folio);
47 }
48
49 static inline size_t folio_sectors(struct folio *folio)
50 {
51         return PAGE_SECTORS << folio_order(folio);
52 }
53
54 static inline loff_t folio_sector(struct folio *folio)
55 {
56         return folio_pos(folio) >> 9;
57 }
58
59 static inline u64 folio_end_sector(struct folio *folio)
60 {
61         return folio_end_pos(folio) >> 9;
62 }
63
64 typedef DARRAY(struct folio *) folios;
65
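/*
 * Grab folios covering bytes [start, end) from @mapping into @folios,
 * stopping at the first hole or allocation failure; FGP_CREAT is dropped
 * once we're more than 1MB past @start so a single call doesn't create an
 * unbounded number of folios. Returns 0 if at least one folio was obtained.
 */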
66 static int filemap_get_contig_folios_d(struct address_space *mapping,
67                                        loff_t start, u64 end,
68                                        int fgp_flags, gfp_t gfp,
69                                        folios *folios)
70 {
71         struct folio *f;
72         u64 pos = start;
73         int ret = 0;
74
75         while (pos < end) {
76                 if ((u64) pos >= (u64) start + (1ULL << 20))
77                         fgp_flags &= ~FGP_CREAT;
78
79                 ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
80                 if (ret)
81                         break;
82
83                 f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
84                 if (!f)
85                         break;
86
87                 BUG_ON(folios->nr && folio_pos(f) != pos);
88
89                 pos = folio_end_pos(f);
90                 darray_push(folios, f);
91         }
92
93         if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
94                 ret = -ENOMEM;
95
96         return folios->nr ? 0 : ret;
97 }
98
99 struct nocow_flush {
100         struct closure  *cl;
101         struct bch_dev  *ca;
102         struct bio      bio;
103 };
104
105 static void nocow_flush_endio(struct bio *_bio)
106 {
107
108         struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
109
110         closure_put(bio->cl);
111         percpu_ref_put(&bio->ca->io_ref);
112         bio_put(&bio->bio);
113 }
114
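/*
 * Issue a cache flush to every device in the inode's ei_devs_need_flush mask
 * and clear the mask; each flush bio holds a ref on @cl, so the caller can
 * wait for all of them with closure_sync().
 */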
115 static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
116                                                 struct bch_inode_info *inode,
117                                                 struct closure *cl)
118 {
119         struct nocow_flush *bio;
120         struct bch_dev *ca;
121         struct bch_devs_mask devs;
122         unsigned dev;
123
124         dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
125         if (dev == BCH_SB_MEMBERS_MAX)
126                 return;
127
128         devs = inode->ei_devs_need_flush;
129         memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
130
131         for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
132                 rcu_read_lock();
133                 ca = rcu_dereference(c->devs[dev]);
134                 if (ca && !percpu_ref_tryget(&ca->io_ref))
135                         ca = NULL;
136                 rcu_read_unlock();
137
138                 if (!ca)
139                         continue;
140
141                 bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
142                                                     REQ_OP_FLUSH,
143                                                     GFP_KERNEL,
144                                                     &c->nocow_flush_bioset),
145                                    struct nocow_flush, bio);
146                 bio->cl                 = cl;
147                 bio->ca                 = ca;
148                 bio->bio.bi_end_io      = nocow_flush_endio;
149                 closure_bio_submit(&bio->bio, cl);
150         }
151 }
152
153 static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
154                                          struct bch_inode_info *inode)
155 {
156         struct closure cl;
157
158         closure_init_stack(&cl);
159         bch2_inode_flush_nocow_writes_async(c, inode, &cl);
160         closure_sync(&cl);
161
162         return 0;
163 }
164
165 static inline bool bio_full(struct bio *bio, unsigned len)
166 {
167         if (bio->bi_vcnt >= bio->bi_max_vecs)
168                 return true;
169         if (bio->bi_iter.bi_size > UINT_MAX - len)
170                 return true;
171         return false;
172 }
173
174 static inline struct address_space *faults_disabled_mapping(void)
175 {
176         return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
177 }
178
179 static inline void set_fdm_dropped_locks(void)
180 {
181         current->faults_disabled_mapping =
182                 (void *) (((unsigned long) current->faults_disabled_mapping)|1);
183 }
184
185 static inline bool fdm_dropped_locks(void)
186 {
187         return ((unsigned long) current->faults_disabled_mapping) & 1;
188 }
189
190 struct quota_res {
191         u64                             sectors;
192 };
193
194 struct bch_writepage_io {
195         struct bch_inode_info           *inode;
196
197         /* must be last: */
198         struct bch_write_op             op;
199 };
200
201 struct dio_write {
202         struct kiocb                    *req;
203         struct address_space            *mapping;
204         struct bch_inode_info           *inode;
205         struct mm_struct                *mm;
206         unsigned                        loop:1,
207                                         extending:1,
208                                         sync:1,
209                                         flush:1,
210                                         free_iov:1;
211         struct quota_res                quota_res;
212         u64                             written;
213
214         struct iov_iter                 iter;
215         struct iovec                    inline_vecs[2];
216
217         /* must be last: */
218         struct bch_write_op             op;
219 };
220
221 struct dio_read {
222         struct closure                  cl;
223         struct kiocb                    *req;
224         long                            ret;
225         bool                            should_dirty;
226         struct bch_read_bio             rbio;
227 };
228
229 /* pagecache_block must be held */
230 static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
231                                               loff_t start, loff_t end)
232 {
233         int ret;
234
235         /*
236          * XXX: the way this is currently implemented, we can spin if a process
237          * is continually redirtying a specific page
238          */
239         do {
240                 if (!mapping->nrpages)
241                         return 0;
242
243                 ret = filemap_write_and_wait_range(mapping, start, end);
244                 if (ret)
245                         break;
246
247                 if (!mapping->nrpages)
248                         return 0;
249
250                 ret = invalidate_inode_pages2_range(mapping,
251                                 start >> PAGE_SHIFT,
252                                 end >> PAGE_SHIFT);
253         } while (ret == -EBUSY);
254
255         return ret;
256 }
257
258 /* quotas */
259
260 #ifdef CONFIG_BCACHEFS_QUOTA
261
262 static void __bch2_quota_reservation_put(struct bch_fs *c,
263                                          struct bch_inode_info *inode,
264                                          struct quota_res *res)
265 {
266         BUG_ON(res->sectors > inode->ei_quota_reserved);
267
268         bch2_quota_acct(c, inode->ei_qid, Q_SPC,
269                         -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
270         inode->ei_quota_reserved -= res->sectors;
271         res->sectors = 0;
272 }
273
274 static void bch2_quota_reservation_put(struct bch_fs *c,
275                                        struct bch_inode_info *inode,
276                                        struct quota_res *res)
277 {
278         if (res->sectors) {
279                 mutex_lock(&inode->ei_quota_lock);
280                 __bch2_quota_reservation_put(c, inode, res);
281                 mutex_unlock(&inode->ei_quota_lock);
282         }
283 }
284
285 static int bch2_quota_reservation_add(struct bch_fs *c,
286                                       struct bch_inode_info *inode,
287                                       struct quota_res *res,
288                                       u64 sectors,
289                                       bool check_enospc)
290 {
291         int ret;
292
293         if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
294                 return 0;
295
296         mutex_lock(&inode->ei_quota_lock);
297         ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
298                               check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
299         if (likely(!ret)) {
300                 inode->ei_quota_reserved += sectors;
301                 res->sectors += sectors;
302         }
303         mutex_unlock(&inode->ei_quota_lock);
304
305         return ret;
306 }
307
308 #else
309
310 static void __bch2_quota_reservation_put(struct bch_fs *c,
311                                          struct bch_inode_info *inode,
312                                          struct quota_res *res) {}
313
314 static void bch2_quota_reservation_put(struct bch_fs *c,
315                                        struct bch_inode_info *inode,
316                                        struct quota_res *res) {}
317
318 static int bch2_quota_reservation_add(struct bch_fs *c,
319                                       struct bch_inode_info *inode,
320                                       struct quota_res *res,
321                                       unsigned sectors,
322                                       bool check_enospc)
323 {
324         return 0;
325 }
326
327 #endif
328
329 /* i_size updates: */
330
331 struct inode_new_size {
332         loff_t          new_size;
333         u64             now;
334         unsigned        fields;
335 };
336
337 static int inode_set_size(struct bch_inode_info *inode,
338                           struct bch_inode_unpacked *bi,
339                           void *p)
340 {
341         struct inode_new_size *s = p;
342
343         bi->bi_size = s->new_size;
344         if (s->fields & ATTR_ATIME)
345                 bi->bi_atime = s->now;
346         if (s->fields & ATTR_MTIME)
347                 bi->bi_mtime = s->now;
348         if (s->fields & ATTR_CTIME)
349                 bi->bi_ctime = s->now;
350
351         return 0;
352 }
353
354 int __must_check bch2_write_inode_size(struct bch_fs *c,
355                                        struct bch_inode_info *inode,
356                                        loff_t new_size, unsigned fields)
357 {
358         struct inode_new_size s = {
359                 .new_size       = new_size,
360                 .now            = bch2_current_time(c),
361                 .fields         = fields,
362         };
363
364         return bch2_write_inode(c, inode, inode_set_size, &s, fields);
365 }
366
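/*
 * i_blocks accounting: adjust the in-memory block count and either consume a
 * previously taken quota reservation or account the change directly against
 * quota. __i_sectors_acct() requires ei_quota_lock; i_sectors_acct() is the
 * locking wrapper.
 */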
367 static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
368                            struct quota_res *quota_res, s64 sectors)
369 {
370         bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
371                                 "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
372                                 inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
373                                 inode->ei_inode.bi_sectors);
374         inode->v.i_blocks += sectors;
375
376 #ifdef CONFIG_BCACHEFS_QUOTA
377         if (quota_res &&
378             !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
379             sectors > 0) {
380                 BUG_ON(sectors > quota_res->sectors);
381                 BUG_ON(sectors > inode->ei_quota_reserved);
382
383                 quota_res->sectors -= sectors;
384                 inode->ei_quota_reserved -= sectors;
385         } else {
386                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
387         }
388 #endif
389 }
390
391 static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
392                            struct quota_res *quota_res, s64 sectors)
393 {
394         if (sectors) {
395                 mutex_lock(&inode->ei_quota_lock);
396                 __i_sectors_acct(c, inode, quota_res, sectors);
397                 mutex_unlock(&inode->ei_quota_lock);
398         }
399 }
400
401 /* page state: */
402
403 /* stored in page->private: */
404
405 #define BCH_FOLIO_SECTOR_STATE()        \
406         x(unallocated)                  \
407         x(reserved)                     \
408         x(dirty)                        \
409         x(dirty_reserved)               \
410         x(allocated)
411
412 enum bch_folio_sector_state {
413 #define x(n)    SECTOR_##n,
414         BCH_FOLIO_SECTOR_STATE()
415 #undef x
416 };
417
418 const char * const bch2_folio_sector_states[] = {
419 #define x(n)    #n,
420         BCH_FOLIO_SECTOR_STATE()
421 #undef x
422         NULL
423 };
424
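/*
 * Sector state transitions, as implemented by the helpers below:
 *
 *   unallocated    --dirty-->   dirty
 *   reserved       --dirty-->   dirty_reserved
 *   dirty          --undirty--> unallocated
 *   dirty_reserved --undirty--> reserved
 *   unallocated    --reserve--> reserved
 *   dirty          --reserve--> dirty_reserved
 */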
425 static inline enum bch_folio_sector_state
426 folio_sector_dirty(enum bch_folio_sector_state state)
427 {
428         switch (state) {
429         case SECTOR_unallocated:
430                 return SECTOR_dirty;
431         case SECTOR_reserved:
432                 return SECTOR_dirty_reserved;
433         default:
434                 return state;
435         }
436 }
437
438 static inline enum bch_folio_sector_state
439 folio_sector_undirty(enum bch_folio_sector_state state)
440 {
441         switch (state) {
442         case SECTOR_dirty:
443                 return SECTOR_unallocated;
444         case SECTOR_dirty_reserved:
445                 return SECTOR_reserved;
446         default:
447                 return state;
448         }
449 }
450
451 static inline enum bch_folio_sector_state
452 folio_sector_reserve(enum bch_folio_sector_state state)
453 {
454         switch (state) {
455         case SECTOR_unallocated:
456                 return SECTOR_reserved;
457         case SECTOR_dirty:
458                 return SECTOR_dirty_reserved;
459         default:
460                 return state;
461         }
462 }
463
464 struct bch_folio_sector {
465         /* Uncompressed, fully allocated replicas (or on disk reservation): */
466         unsigned                nr_replicas:4;
467
468         /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
469         unsigned                replicas_reserved:4;
470
471         /* i_sectors: */
472         enum bch_folio_sector_state state:8;
473 };
474
475 struct bch_folio {
476         spinlock_t              lock;
477         atomic_t                write_count;
478         /*
479          * Is the sector state up to date with the btree?
480          * (Not the data itself)
481          */
482         bool                    uptodate;
483         struct bch_folio_sector s[];
484 };
485
486 static inline void folio_sector_set(struct folio *folio,
487                              struct bch_folio *s,
488                              unsigned i, unsigned n)
489 {
490         s->s[i].state = n;
491 }
492
493 /* file offset (to folio offset) to bch_folio_sector index */
494 static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
495 {
496         u64 f_offset = pos - folio_pos(folio);
497         BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
498         return f_offset >> SECTOR_SHIFT;
499 }
500
501 static inline struct bch_folio *__bch2_folio(struct folio *folio)
502 {
503         return folio_has_private(folio)
504                 ? (struct bch_folio *) folio_get_private(folio)
505                 : NULL;
506 }
507
508 static inline struct bch_folio *bch2_folio(struct folio *folio)
509 {
510         EBUG_ON(!folio_test_locked(folio));
511
512         return __bch2_folio(folio);
513 }
514
515 /* for newly allocated folios: */
516 static void __bch2_folio_release(struct folio *folio)
517 {
518         kfree(folio_detach_private(folio));
519 }
520
521 static void bch2_folio_release(struct folio *folio)
522 {
523         EBUG_ON(!folio_test_locked(folio));
524         __bch2_folio_release(folio);
525 }
526
527 /* for newly allocated folios: */
528 static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
529 {
530         struct bch_folio *s;
531
532         s = kzalloc(sizeof(*s) +
533                     sizeof(struct bch_folio_sector) *
534                     folio_sectors(folio), GFP_NOFS|gfp);
535         if (!s)
536                 return NULL;
537
538         spin_lock_init(&s->lock);
539         folio_attach_private(folio, s);
540         return s;
541 }
542
543 static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
544 {
545         return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
546 }
547
548 static unsigned bkey_to_sector_state(struct bkey_s_c k)
549 {
550         if (bkey_extent_is_reservation(k))
551                 return SECTOR_reserved;
552         if (bkey_extent_is_allocation(k.k))
553                 return SECTOR_allocated;
554         return SECTOR_unallocated;
555 }
556
557 static void __bch2_folio_set(struct folio *folio,
558                              unsigned pg_offset, unsigned pg_len,
559                              unsigned nr_ptrs, unsigned state)
560 {
561         struct bch_folio *s = bch2_folio_create(folio, __GFP_NOFAIL);
562         unsigned i, sectors = folio_sectors(folio);
563
564         BUG_ON(pg_offset >= sectors);
565         BUG_ON(pg_offset + pg_len > sectors);
566
567         spin_lock(&s->lock);
568
569         for (i = pg_offset; i < pg_offset + pg_len; i++) {
570                 s->s[i].nr_replicas     = nr_ptrs;
571                 folio_sector_set(folio, s, i, state);
572         }
573
574         if (i == sectors)
575                 s->uptodate = true;
576
577         spin_unlock(&s->lock);
578 }
579
580 /*
581  * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
582  * extents btree:
583  */
584 static int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
585                           struct folio **folios, unsigned nr_folios)
586 {
587         struct btree_trans trans;
588         struct btree_iter iter;
589         struct bkey_s_c k;
590         u64 offset = folio_sector(folios[0]);
591         unsigned folio_idx = 0;
592         u32 snapshot;
593         int ret;
594
595         bch2_trans_init(&trans, c, 0, 0);
596 retry:
597         bch2_trans_begin(&trans);
598
599         ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
600         if (ret)
601                 goto err;
602
603         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
604                            SPOS(inum.inum, offset, snapshot),
605                            BTREE_ITER_SLOTS, k, ret) {
606                 unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
607                 unsigned state = bkey_to_sector_state(k);
608
609                 while (folio_idx < nr_folios) {
610                         struct folio *folio = folios[folio_idx];
611                         u64 folio_start = folio_sector(folio);
612                         u64 folio_end   = folio_end_sector(folio);
613                         unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start;
614                         unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start;
615
616                         BUG_ON(k.k->p.offset < folio_start);
617                         BUG_ON(bkey_start_offset(k.k) > folio_end);
618
619                         if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate)
620                                 __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
621
622                         if (k.k->p.offset < folio_end)
623                                 break;
624                         folio_idx++;
625                 }
626
627                 if (folio_idx == nr_folios)
628                         break;
629         }
630
631         offset = iter.pos.offset;
632         bch2_trans_iter_exit(&trans, &iter);
633 err:
634         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
635                 goto retry;
636         bch2_trans_exit(&trans);
637
638         return ret;
639 }
640
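/*
 * Set bch_folio sector state for every folio in @bio from the extent key
 * being read; reflink_v keys are counted as having no fully allocated
 * replicas, so a later overwrite will take a fresh disk reservation.
 */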
641 static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
642 {
643         struct bvec_iter iter;
644         struct folio_vec fv;
645         unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
646                 ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
647         unsigned state = bkey_to_sector_state(k);
648
649         bio_for_each_folio(fv, bio, iter)
650                 __bch2_folio_set(fv.fv_folio,
651                                  fv.fv_offset >> 9,
652                                  fv.fv_len >> 9,
653                                  nr_ptrs, state);
654 }
655
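/*
 * Sectors [start, end) are no longer allocated on disk: clear nr_replicas in
 * the cached folio state so that future writes to this range take a new disk
 * reservation.
 */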
656 static void mark_pagecache_unallocated(struct bch_inode_info *inode,
657                                        u64 start, u64 end)
658 {
659         pgoff_t index = start >> PAGE_SECTORS_SHIFT;
660         pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
661         struct folio_batch fbatch;
662         unsigned i, j;
663
664         if (end <= start)
665                 return;
666
667         folio_batch_init(&fbatch);
668
669         while (filemap_get_folios(inode->v.i_mapping,
670                                   &index, end_index, &fbatch)) {
671                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
672                         struct folio *folio = fbatch.folios[i];
673                         u64 folio_start = folio_sector(folio);
674                         u64 folio_end = folio_end_sector(folio);
675                         unsigned folio_offset = max(start, folio_start) - folio_start;
676                         unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
677                         struct bch_folio *s;
678
679                         BUG_ON(end <= folio_start);
680
681                         folio_lock(folio);
682                         s = bch2_folio(folio);
683
684                         if (s) {
685                                 spin_lock(&s->lock);
686                                 for (j = folio_offset; j < folio_offset + folio_len; j++)
687                                         s->s[j].nr_replicas = 0;
688                                 spin_unlock(&s->lock);
689                         }
690
691                         folio_unlock(folio);
692                 }
693                 folio_batch_release(&fbatch);
694                 cond_resched();
695         }
696 }
697
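/*
 * Sectors [start, end) are now covered by an on disk reservation (e.g. from
 * fallocate): flag the cached sector state as reserved, and drop dirty
 * sectors from the i_blocks accounting now that the reservation covers them.
 */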
698 static void mark_pagecache_reserved(struct bch_inode_info *inode,
699                                     u64 start, u64 end)
700 {
701         struct bch_fs *c = inode->v.i_sb->s_fs_info;
702         pgoff_t index = start >> PAGE_SECTORS_SHIFT;
703         pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
704         struct folio_batch fbatch;
705         s64 i_sectors_delta = 0;
706         unsigned i, j;
707
708         if (end <= start)
709                 return;
710
711         folio_batch_init(&fbatch);
712
713         while (filemap_get_folios(inode->v.i_mapping,
714                                   &index, end_index, &fbatch)) {
715                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
716                         struct folio *folio = fbatch.folios[i];
717                         u64 folio_start = folio_sector(folio);
718                         u64 folio_end = folio_end_sector(folio);
719                         unsigned folio_offset = max(start, folio_start) - folio_start;
720                         unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
721                         struct bch_folio *s;
722
723                         BUG_ON(end <= folio_start);
724
725                         folio_lock(folio);
726                         s = bch2_folio(folio);
727
728                         if (s) {
729                                 spin_lock(&s->lock);
730                                 for (j = folio_offset; j < folio_offset + folio_len; j++) {
731                                         i_sectors_delta -= s->s[j].state == SECTOR_dirty;
732                                         folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state));
733                                 }
734                                 spin_unlock(&s->lock);
735                         }
736
737                         folio_unlock(folio);
738                 }
739                 folio_batch_release(&fbatch);
740                 cond_resched();
741         }
742
743         i_sectors_acct(c, inode, NULL, i_sectors_delta);
744 }
745
746 static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
747 {
748         /* XXX: this should not be open coded */
749         return inode->ei_inode.bi_data_replicas
750                 ? inode->ei_inode.bi_data_replicas - 1
751                 : c->opts.data_replicas;
752 }
753
754 static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
755                                           unsigned nr_replicas)
756 {
757         return max(0, (int) nr_replicas -
758                    s->nr_replicas -
759                    s->replicas_reserved);
760 }
761
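/*
 * Make sure every sector of @folio carries a disk reservation for the
 * inode's target number of replicas; when check_enospc is false the
 * reservation is taken with BCH_DISK_RESERVATION_NOFAIL.
 */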
762 static int bch2_get_folio_disk_reservation(struct bch_fs *c,
763                                 struct bch_inode_info *inode,
764                                 struct folio *folio, bool check_enospc)
765 {
766         struct bch_folio *s = bch2_folio_create(folio, 0);
767         unsigned nr_replicas = inode_nr_replicas(c, inode);
768         struct disk_reservation disk_res = { 0 };
769         unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
770         int ret;
771
772         if (!s)
773                 return -ENOMEM;
774
775         for (i = 0; i < sectors; i++)
776                 disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
777
778         if (!disk_res_sectors)
779                 return 0;
780
781         ret = bch2_disk_reservation_get(c, &disk_res,
782                                         disk_res_sectors, 1,
783                                         !check_enospc
784                                         ? BCH_DISK_RESERVATION_NOFAIL
785                                         : 0);
786         if (unlikely(ret))
787                 return ret;
788
789         for (i = 0; i < sectors; i++)
790                 s->s[i].replicas_reserved +=
791                         sectors_to_reserve(&s->s[i], nr_replicas);
792
793         return 0;
794 }
795
796 struct bch2_folio_reservation {
797         struct disk_reservation disk;
798         struct quota_res        quota;
799 };
800
801 static void bch2_folio_reservation_init(struct bch_fs *c,
802                         struct bch_inode_info *inode,
803                         struct bch2_folio_reservation *res)
804 {
805         memset(res, 0, sizeof(*res));
806
807         res->disk.nr_replicas = inode_nr_replicas(c, inode);
808 }
809
810 static void bch2_folio_reservation_put(struct bch_fs *c,
811                         struct bch_inode_info *inode,
812                         struct bch2_folio_reservation *res)
813 {
814         bch2_disk_reservation_put(c, &res->disk);
815         bch2_quota_reservation_put(c, inode, &res->quota);
816 }
817
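/*
 * Reserve disk space and quota for a write to bytes [offset, offset + len) of
 * @folio, based on the current per-sector state: disk space for sectors
 * lacking sufficient replicas, quota for sectors still unallocated. The
 * folio's bch_folio state must be up to date.
 */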
818 static int bch2_folio_reservation_get(struct bch_fs *c,
819                         struct bch_inode_info *inode,
820                         struct folio *folio,
821                         struct bch2_folio_reservation *res,
822                         unsigned offset, unsigned len)
823 {
824         struct bch_folio *s = bch2_folio_create(folio, 0);
825         unsigned i, disk_sectors = 0, quota_sectors = 0;
826         int ret;
827
828         if (!s)
829                 return -ENOMEM;
830
831         BUG_ON(!s->uptodate);
832
833         for (i = round_down(offset, block_bytes(c)) >> 9;
834              i < round_up(offset + len, block_bytes(c)) >> 9;
835              i++) {
836                 disk_sectors += sectors_to_reserve(&s->s[i],
837                                                 res->disk.nr_replicas);
838                 quota_sectors += s->s[i].state == SECTOR_unallocated;
839         }
840
841         if (disk_sectors) {
842                 ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
843                 if (unlikely(ret))
844                         return ret;
845         }
846
847         if (quota_sectors) {
848                 ret = bch2_quota_reservation_add(c, inode, &res->quota,
849                                                  quota_sectors, true);
850                 if (unlikely(ret)) {
851                         struct disk_reservation tmp = {
852                                 .sectors = disk_sectors
853                         };
854
855                         bch2_disk_reservation_put(c, &tmp);
856                         res->disk.sectors -= disk_sectors;
857                         return ret;
858                 }
859         }
860
861         return 0;
862 }
863
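/*
 * Tear down bch_folio state when a folio is dropped from the page cache:
 * release per-sector disk reservations, undirty the sector state (fixing up
 * the i_blocks accounting), and free the attached bch_folio.
 */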
864 static void bch2_clear_folio_bits(struct folio *folio)
865 {
866         struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
867         struct bch_fs *c = inode->v.i_sb->s_fs_info;
868         struct bch_folio *s = bch2_folio(folio);
869         struct disk_reservation disk_res = { 0 };
870         int i, sectors = folio_sectors(folio), dirty_sectors = 0;
871
872         if (!s)
873                 return;
874
875         EBUG_ON(!folio_test_locked(folio));
876         EBUG_ON(folio_test_writeback(folio));
877
878         for (i = 0; i < sectors; i++) {
879                 disk_res.sectors += s->s[i].replicas_reserved;
880                 s->s[i].replicas_reserved = 0;
881
882                 dirty_sectors -= s->s[i].state == SECTOR_dirty;
883                 folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
884         }
885
886         bch2_disk_reservation_put(c, &disk_res);
887
888         i_sectors_acct(c, inode, NULL, dirty_sectors);
889
890         bch2_folio_release(folio);
891 }
892
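/*
 * Mark bytes [offset, offset + len) of @folio dirty: move disk reservation
 * from @res into the per-sector state, account newly dirtied sectors to
 * i_blocks (consuming the quota reservation), and dirty the folio in the
 * page cache.
 */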
893 static void bch2_set_folio_dirty(struct bch_fs *c,
894                         struct bch_inode_info *inode,
895                         struct folio *folio,
896                         struct bch2_folio_reservation *res,
897                         unsigned offset, unsigned len)
898 {
899         struct bch_folio *s = bch2_folio(folio);
900         unsigned i, dirty_sectors = 0;
901
902         WARN_ON((u64) folio_pos(folio) + offset + len >
903                 round_up((u64) i_size_read(&inode->v), block_bytes(c)));
904
905         BUG_ON(!s->uptodate);
906
907         spin_lock(&s->lock);
908
909         for (i = round_down(offset, block_bytes(c)) >> 9;
910              i < round_up(offset + len, block_bytes(c)) >> 9;
911              i++) {
912                 unsigned sectors = sectors_to_reserve(&s->s[i],
913                                                 res->disk.nr_replicas);
914
915                 /*
916                  * This can happen if we race with the error path in
917                  * bch2_writepage_io_done():
918                  */
919                 sectors = min_t(unsigned, sectors, res->disk.sectors);
920
921                 s->s[i].replicas_reserved += sectors;
922                 res->disk.sectors -= sectors;
923
924                 dirty_sectors += s->s[i].state == SECTOR_unallocated;
925
926                 folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
927         }
928
929         spin_unlock(&s->lock);
930
931         i_sectors_acct(c, inode, &res->quota, dirty_sectors);
932
933         if (!folio_test_dirty(folio))
934                 filemap_dirty_folio(inode->v.i_mapping, folio);
935 }
936
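/*
 * Page fault handler: a fault on the mapping that a dio write is currently
 * targeting (faults_disabled_mapping()) returns SIGBUS so the dio path can
 * handle it; otherwise filemap_fault() runs under the pagecache add lock,
 * with lock ordering against the dio's pagecache block lock decided by
 * mapping address.
 */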
937 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
938 {
939         struct file *file = vmf->vma->vm_file;
940         struct address_space *mapping = file->f_mapping;
941         struct address_space *fdm = faults_disabled_mapping();
942         struct bch_inode_info *inode = file_bch_inode(file);
943         int ret;
944
945         if (fdm == mapping)
946                 return VM_FAULT_SIGBUS;
947
948         /* Lock ordering: */
949         if (fdm > mapping) {
950                 struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
951
952                 if (bch2_pagecache_add_tryget(inode))
953                         goto got_lock;
954
955                 bch2_pagecache_block_put(fdm_host);
956
957                 bch2_pagecache_add_get(inode);
958                 bch2_pagecache_add_put(inode);
959
960                 bch2_pagecache_block_get(fdm_host);
961
962                 /* Signal that lock has been dropped: */
963                 set_fdm_dropped_locks();
964                 return VM_FAULT_SIGBUS;
965         }
966
967         bch2_pagecache_add_get(inode);
968 got_lock:
969         ret = filemap_fault(vmf);
970         bch2_pagecache_add_put(inode);
971
972         return ret;
973 }
974
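/*
 * mkwrite: before allowing a writable mapping of @folio, make sure its sector
 * state is initialized from the btree, take disk/quota reservations for the
 * part of the folio within i_size and mark it dirty; return SIGBUS if the
 * reservations can't be taken.
 */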
975 vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
976 {
977         struct folio *folio = page_folio(vmf->page);
978         struct file *file = vmf->vma->vm_file;
979         struct bch_inode_info *inode = file_bch_inode(file);
980         struct address_space *mapping = file->f_mapping;
981         struct bch_fs *c = inode->v.i_sb->s_fs_info;
982         struct bch2_folio_reservation res;
983         unsigned len;
984         loff_t isize;
985         int ret;
986
987         bch2_folio_reservation_init(c, inode, &res);
988
989         sb_start_pagefault(inode->v.i_sb);
990         file_update_time(file);
991
992         /*
993          * Not strictly necessary, but helps avoid dio writes livelocking in
994          * write_invalidate_inode_pages_range() - can drop this if/when we get
995          * a write_invalidate_inode_pages_range() that works without dropping
996          * page lock before invalidating page
997          */
998         bch2_pagecache_add_get(inode);
999
1000         folio_lock(folio);
1001         isize = i_size_read(&inode->v);
1002
1003         if (folio->mapping != mapping || folio_pos(folio) >= isize) {
1004                 folio_unlock(folio);
1005                 ret = VM_FAULT_NOPAGE;
1006                 goto out;
1007         }
1008
1009         len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
1010
1011         if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) {
1012                 if (bch2_folio_set(c, inode_inum(inode), &folio, 1)) {
1013                         folio_unlock(folio);
1014                         ret = VM_FAULT_SIGBUS;
1015                         goto out;
1016                 }
1017         }
1018
1019         if (bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
1020                 folio_unlock(folio);
1021                 ret = VM_FAULT_SIGBUS;
1022                 goto out;
1023         }
1024
1025         bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
1026         bch2_folio_reservation_put(c, inode, &res);
1027
1028         folio_wait_stable(folio);
1029         ret = VM_FAULT_LOCKED;
1030 out:
1031         bch2_pagecache_add_put(inode);
1032         sb_end_pagefault(inode->v.i_sb);
1033
1034         return ret;
1035 }
1036
1037 void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1038 {
1039         if (offset || length < folio_size(folio))
1040                 return;
1041
1042         bch2_clear_folio_bits(folio);
1043 }
1044
1045 bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
1046 {
1047         if (folio_test_dirty(folio) || folio_test_writeback(folio))
1048                 return false;
1049
1050         bch2_clear_folio_bits(folio);
1051         return true;
1052 }
1053
1054 /* readpage(s): */
1055
1056 static void bch2_readpages_end_io(struct bio *bio)
1057 {
1058         struct bvec_iter_all iter;
1059         struct folio_vec fv;
1060
1061         bio_for_each_folio_all(fv, bio, iter) {
1062                 if (!bio->bi_status) {
1063                         folio_mark_uptodate(fv.fv_folio);
1064                 } else {
1065                         folio_clear_uptodate(fv.fv_folio);
1066                         folio_set_error(fv.fv_folio);
1067                 }
1068                 folio_unlock(fv.fv_folio);
1069         }
1070
1071         bio_put(bio);
1072 }
1073
1074 struct readpages_iter {
1075         struct address_space    *mapping;
1076         unsigned                idx;
1077         folios                  folios;
1078 };
1079
1080 static int readpages_iter_init(struct readpages_iter *iter,
1081                                struct readahead_control *ractl)
1082 {
1083         struct folio **fi;
1084         int ret;
1085
1086         memset(iter, 0, sizeof(*iter));
1087
1088         iter->mapping = ractl->mapping;
1089
1090         ret = filemap_get_contig_folios_d(iter->mapping,
1091                                 ractl->_index << PAGE_SHIFT,
1092                                 (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
1093                                 0, mapping_gfp_mask(iter->mapping),
1094                                 &iter->folios);
1095         if (ret)
1096                 return ret;
1097
1098         darray_for_each(iter->folios, fi) {
1099                 ractl->_nr_pages -= 1U << folio_order(*fi);
1100                 __bch2_folio_create(*fi, __GFP_NOFAIL);
1101                 folio_put(*fi);
1102                 folio_put(*fi);
1103         }
1104
1105         return 0;
1106 }
1107
1108 static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
1109 {
1110         if (iter->idx >= iter->folios.nr)
1111                 return NULL;
1112         return iter->folios.data[iter->idx];
1113 }
1114
1115 static inline void readpage_iter_advance(struct readpages_iter *iter)
1116 {
1117         iter->idx++;
1118 }
1119
1120 static bool extent_partial_reads_expensive(struct bkey_s_c k)
1121 {
1122         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1123         struct bch_extent_crc_unpacked crc;
1124         const union bch_extent_entry *i;
1125
1126         bkey_for_each_crc(k.k, ptrs, crc, i)
1127                 if (crc.csum_type || crc.compression_type)
1128                         return true;
1129         return false;
1130 }
1131
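/*
 * Try to grow a read bio to cover @sectors_this_extent: first consume folios
 * already collected by the readpages iterator, then, if @get_more, allocate
 * new folios and add them to the page cache, stopping at folios that are
 * already present.
 */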
1132 static void readpage_bio_extend(struct readpages_iter *iter,
1133                                 struct bio *bio,
1134                                 unsigned sectors_this_extent,
1135                                 bool get_more)
1136 {
1137         while (bio_sectors(bio) < sectors_this_extent &&
1138                bio->bi_vcnt < bio->bi_max_vecs) {
1139                 struct folio *folio = readpage_iter_peek(iter);
1140                 int ret;
1141
1142                 if (folio) {
1143                         readpage_iter_advance(iter);
1144                 } else {
1145                         pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
1146
1147                         if (!get_more)
1148                                 break;
1149
1150                         folio = xa_load(&iter->mapping->i_pages, folio_offset);
1151                         if (folio && !xa_is_value(folio))
1152                                 break;
1153
1154                         folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
1155                         if (!folio)
1156                                 break;
1157
1158                         if (!__bch2_folio_create(folio, 0)) {
1159                                 folio_put(folio);
1160                                 break;
1161                         }
1162
1163                         ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_NOFS);
1164                         if (ret) {
1165                                 __bch2_folio_release(folio);
1166                                 folio_put(folio);
1167                                 break;
1168                         }
1169
1170                         folio_put(folio);
1171                 }
1172
1173                 BUG_ON(folio_sector(folio) != bio_end_sector(bio));
1174
1175                 BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
1176         }
1177 }
1178
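/*
 * Main buffered read loop: walk the extents btree from the bio's current
 * sector, resolving indirect (reflink) extents, initializing folio sector
 * state from each key and issuing reads extent by extent until the bio has
 * been fully submitted.
 */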
1179 static void bchfs_read(struct btree_trans *trans,
1180                        struct bch_read_bio *rbio,
1181                        subvol_inum inum,
1182                        struct readpages_iter *readpages_iter)
1183 {
1184         struct bch_fs *c = trans->c;
1185         struct btree_iter iter;
1186         struct bkey_buf sk;
1187         int flags = BCH_READ_RETRY_IF_STALE|
1188                 BCH_READ_MAY_PROMOTE;
1189         u32 snapshot;
1190         int ret = 0;
1191
1192         rbio->c = c;
1193         rbio->start_time = local_clock();
1194         rbio->subvol = inum.subvol;
1195
1196         bch2_bkey_buf_init(&sk);
1197 retry:
1198         bch2_trans_begin(trans);
1199         iter = (struct btree_iter) { NULL };
1200
1201         ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
1202         if (ret)
1203                 goto err;
1204
1205         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
1206                              SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
1207                              BTREE_ITER_SLOTS);
1208         while (1) {
1209                 struct bkey_s_c k;
1210                 unsigned bytes, sectors, offset_into_extent;
1211                 enum btree_id data_btree = BTREE_ID_extents;
1212
1213                 /*
1214                  * read_extent -> io_time_reset may cause a transaction restart
1215                  * without returning an error, we need to check for that here:
1216                  */
1217                 ret = bch2_trans_relock(trans);
1218                 if (ret)
1219                         break;
1220
1221                 bch2_btree_iter_set_pos(&iter,
1222                                 POS(inum.inum, rbio->bio.bi_iter.bi_sector));
1223
1224                 k = bch2_btree_iter_peek_slot(&iter);
1225                 ret = bkey_err(k);
1226                 if (ret)
1227                         break;
1228
1229                 offset_into_extent = iter.pos.offset -
1230                         bkey_start_offset(k.k);
1231                 sectors = k.k->size - offset_into_extent;
1232
1233                 bch2_bkey_buf_reassemble(&sk, c, k);
1234
1235                 ret = bch2_read_indirect_extent(trans, &data_btree,
1236                                         &offset_into_extent, &sk);
1237                 if (ret)
1238                         break;
1239
1240                 k = bkey_i_to_s_c(sk.k);
1241
1242                 sectors = min(sectors, k.k->size - offset_into_extent);
1243
1244                 if (readpages_iter)
1245                         readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
1246                                             extent_partial_reads_expensive(k));
1247
1248                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
1249                 swap(rbio->bio.bi_iter.bi_size, bytes);
1250
1251                 if (rbio->bio.bi_iter.bi_size == bytes)
1252                         flags |= BCH_READ_LAST_FRAGMENT;
1253
1254                 bch2_bio_page_state_set(&rbio->bio, k);
1255
1256                 bch2_read_extent(trans, rbio, iter.pos,
1257                                  data_btree, k, offset_into_extent, flags);
1258
1259                 if (flags & BCH_READ_LAST_FRAGMENT)
1260                         break;
1261
1262                 swap(rbio->bio.bi_iter.bi_size, bytes);
1263                 bio_advance(&rbio->bio, bytes);
1264
1265                 ret = btree_trans_too_many_iters(trans);
1266                 if (ret)
1267                         break;
1268         }
1269 err:
1270         bch2_trans_iter_exit(trans, &iter);
1271
1272         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1273                 goto retry;
1274
1275         if (ret) {
1276                 bch_err_inum_offset_ratelimited(c,
1277                                 iter.pos.inode,
1278                                 iter.pos.offset << 9,
1279                                 "read error %i from btree lookup", ret);
1280                 rbio->bio.bi_status = BLK_STS_IOERR;
1281                 bio_endio(&rbio->bio);
1282         }
1283
1284         bch2_bkey_buf_exit(&sk, c);
1285 }
1286
1287 void bch2_readahead(struct readahead_control *ractl)
1288 {
1289         struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
1290         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1291         struct bch_io_opts opts;
1292         struct btree_trans trans;
1293         struct folio *folio;
1294         struct readpages_iter readpages_iter;
1295         int ret;
1296
1297         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
1298
1299         ret = readpages_iter_init(&readpages_iter, ractl);
1300         BUG_ON(ret);
1301
1302         bch2_trans_init(&trans, c, 0, 0);
1303
1304         bch2_pagecache_add_get(inode);
1305
1306         while ((folio = readpage_iter_peek(&readpages_iter))) {
1307                 unsigned n = min_t(unsigned,
1308                                    readpages_iter.folios.nr -
1309                                    readpages_iter.idx,
1310                                    BIO_MAX_VECS);
1311                 struct bch_read_bio *rbio =
1312                         rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
1313                                                    GFP_NOFS, &c->bio_read),
1314                                   opts);
1315
1316                 readpage_iter_advance(&readpages_iter);
1317
1318                 rbio->bio.bi_iter.bi_sector = folio_sector(folio);
1319                 rbio->bio.bi_end_io = bch2_readpages_end_io;
1320                 BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
1321
1322                 bchfs_read(&trans, rbio, inode_inum(inode),
1323                            &readpages_iter);
1324         }
1325
1326         bch2_pagecache_add_put(inode);
1327
1328         bch2_trans_exit(&trans);
1329         darray_exit(&readpages_iter.folios);
1330 }
1331
1332 static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
1333                              subvol_inum inum, struct folio *folio)
1334 {
1335         struct btree_trans trans;
1336
1337         bch2_folio_create(folio, __GFP_NOFAIL);
1338
1339         rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
1340         rbio->bio.bi_iter.bi_sector = folio_sector(folio);
1341         BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
1342
1343         bch2_trans_init(&trans, c, 0, 0);
1344         bchfs_read(&trans, rbio, inum, NULL);
1345         bch2_trans_exit(&trans);
1346 }
1347
1348 static void bch2_read_single_folio_end_io(struct bio *bio)
1349 {
1350         complete(bio->bi_private);
1351 }
1352
1353 static int bch2_read_single_folio(struct folio *folio,
1354                                   struct address_space *mapping)
1355 {
1356         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1357         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1358         struct bch_read_bio *rbio;
1359         struct bch_io_opts opts;
1360         int ret;
1361         DECLARE_COMPLETION_ONSTACK(done);
1362
1363         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
1364
1365         rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read),
1366                          opts);
1367         rbio->bio.bi_private = &done;
1368         rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
1369
1370         __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
1371         wait_for_completion(&done);
1372
1373         ret = blk_status_to_errno(rbio->bio.bi_status);
1374         bio_put(&rbio->bio);
1375
1376         if (ret < 0)
1377                 return ret;
1378
1379         folio_mark_uptodate(folio);
1380         return 0;
1381 }
1382
1383 int bch2_read_folio(struct file *file, struct folio *folio)
1384 {
1385         int ret;
1386
1387         ret = bch2_read_single_folio(folio, folio->mapping);
1388         folio_unlock(folio);
1389         return bch2_err_class(ret);
1390 }
1391
1392 /* writepages: */
1393
1394 struct bch_writepage_state {
1395         struct bch_writepage_io *io;
1396         struct bch_io_opts      opts;
1397         struct bch_folio_sector *tmp;
1398         unsigned                tmp_sectors;
1399 };
1400
1401 static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
1402                                                                   struct bch_inode_info *inode)
1403 {
1404         struct bch_writepage_state ret = { 0 };
1405
1406         bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
1407         return ret;
1408 }
1409
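/*
 * Writeback completion: on error (or when the data was written inline into
 * the btree) clear the cached nr_replicas so the sectors get re-reserved;
 * apply the write's i_sectors_delta to i_blocks, then end writeback on each
 * folio as its write_count drops to zero.
 */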
1410 static void bch2_writepage_io_done(struct bch_write_op *op)
1411 {
1412         struct bch_writepage_io *io =
1413                 container_of(op, struct bch_writepage_io, op);
1414         struct bch_fs *c = io->op.c;
1415         struct bio *bio = &io->op.wbio.bio;
1416         struct bvec_iter_all iter;
1417         struct folio_vec fv;
1418         unsigned i;
1419
1420         if (io->op.error) {
1421                 set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
1422
1423                 bio_for_each_folio_all(fv, bio, iter) {
1424                         struct bch_folio *s;
1425
1426                         folio_set_error(fv.fv_folio);
1427                         mapping_set_error(fv.fv_folio->mapping, -EIO);
1428
1429                         s = __bch2_folio(fv.fv_folio);
1430                         spin_lock(&s->lock);
1431                         for (i = 0; i < folio_sectors(fv.fv_folio); i++)
1432                                 s->s[i].nr_replicas = 0;
1433                         spin_unlock(&s->lock);
1434                 }
1435         }
1436
1437         if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
1438                 bio_for_each_folio_all(fv, bio, iter) {
1439                         struct bch_folio *s;
1440
1441                         s = __bch2_folio(fv.fv_folio);
1442                         spin_lock(&s->lock);
1443                         for (i = 0; i < folio_sectors(fv.fv_folio); i++)
1444                                 s->s[i].nr_replicas = 0;
1445                         spin_unlock(&s->lock);
1446                 }
1447         }
1448
1449         /*
1450          * racing with fallocate can cause us to add fewer sectors than
1451          * expected - but we shouldn't add more sectors than expected:
1452          */
1453         WARN_ON_ONCE(io->op.i_sectors_delta > 0);
1454
1455         /*
1456          * (error (due to going RO) halfway through a page can screw that up
1457          * slightly)
1458          * XXX wtf?
1459            BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
1460          */
1461
1462         /*
1463          * PageWriteback is effectively our ref on the inode - fixup i_blocks
1464          * before calling end_page_writeback:
1465          */
1466         i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
1467
1468         bio_for_each_folio_all(fv, bio, iter) {
1469                 struct bch_folio *s = __bch2_folio(fv.fv_folio);
1470
1471                 if (atomic_dec_and_test(&s->write_count))
1472                         folio_end_writeback(fv.fv_folio);
1473         }
1474
1475         bio_put(&io->op.wbio.bio);
1476 }
1477
1478 static void bch2_writepage_do_io(struct bch_writepage_state *w)
1479 {
1480         struct bch_writepage_io *io = w->io;
1481
1482         w->io = NULL;
1483         closure_call(&io->op.cl, bch2_write, NULL, NULL);
1484 }
1485
1486 /*
1487  * Get a bch_writepage_io and add @page to it - appending to an existing one if
1488  * possible, else allocating a new one:
1489  */
1490 static void bch2_writepage_io_alloc(struct bch_fs *c,
1491                                     struct writeback_control *wbc,
1492                                     struct bch_writepage_state *w,
1493                                     struct bch_inode_info *inode,
1494                                     u64 sector,
1495                                     unsigned nr_replicas)
1496 {
1497         struct bch_write_op *op;
1498
1499         w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
1500                                               REQ_OP_WRITE,
1501                                               GFP_NOFS,
1502                                               &c->writepage_bioset),
1503                              struct bch_writepage_io, op.wbio.bio);
1504
1505         w->io->inode            = inode;
1506         op                      = &w->io->op;
1507         bch2_write_op_init(op, c, w->opts);
1508         op->target              = w->opts.foreground_target;
1509         op->nr_replicas         = nr_replicas;
1510         op->res.nr_replicas     = nr_replicas;
1511         op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
1512         op->subvol              = inode->ei_subvol;
1513         op->pos                 = POS(inode->v.i_ino, sector);
1514         op->end_io              = bch2_writepage_io_done;
1515         op->devs_need_flush     = &inode->ei_devs_need_flush;
1516         op->wbio.bio.bi_iter.bi_sector = sector;
1517         op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
1518 }
1519
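/*
 * Write back one folio: zero the portion past i_size, ensure disk
 * reservations, snapshot the per-sector state, mark sectors allocated, then
 * submit each contiguous dirty range via a bch_writepage_io.
 */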
1520 static int __bch2_writepage(struct folio *folio,
1521                             struct writeback_control *wbc,
1522                             void *data)
1523 {
1524         struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
1525         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1526         struct bch_writepage_state *w = data;
1527         struct bch_folio *s;
1528         unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
1529         loff_t i_size = i_size_read(&inode->v);
1530         int ret;
1531
1532         EBUG_ON(!folio_test_uptodate(folio));
1533
1534         /* Is the folio fully inside i_size? */
1535         if (folio_end_pos(folio) <= i_size)
1536                 goto do_io;
1537
1538         /* Is the folio fully outside i_size? (truncate in progress) */
1539         if (folio_pos(folio) >= i_size) {
1540                 folio_unlock(folio);
1541                 return 0;
1542         }
1543
1544         /*
1545          * The folio straddles i_size.  It must be zeroed out on each and every
1546          * writepage invocation because it may be mmapped.  "A file is mapped
1547          * in multiples of the folio size.  For a file that is not a multiple of
1548          * the  folio size, the remaining memory is zeroed when mapped, and
1549          * writes to that region are not written out to the file."
1550          */
1551         folio_zero_segment(folio,
1552                            i_size - folio_pos(folio),
1553                            folio_size(folio));
1554 do_io:
1555         f_sectors = folio_sectors(folio);
1556         s = bch2_folio_create(folio, __GFP_NOFAIL);
1557
1558         if (f_sectors > w->tmp_sectors) {
1559                 kfree(w->tmp);
1560                 w->tmp = kzalloc(sizeof(struct bch_folio_sector) *
1561                                  f_sectors, __GFP_NOFAIL);
1562                 w->tmp_sectors = f_sectors;
1563         }
1564
1565         /*
1566          * Things get really hairy with errors during writeback:
1567          */
1568         ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
1569         BUG_ON(ret);
1570
1571         /* Before unlocking the folio, get a copy of the reservations: */
1572         spin_lock(&s->lock);
1573         memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
1574
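        /*
         * The folio is written with a single replication factor: the minimum
         * of what's already allocated or reserved across its dirty sectors:
         */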
1575         for (i = 0; i < f_sectors; i++) {
1576                 if (s->s[i].state < SECTOR_dirty)
1577                         continue;
1578
1579                 nr_replicas_this_write =
1580                         min_t(unsigned, nr_replicas_this_write,
1581                               s->s[i].nr_replicas +
1582                               s->s[i].replicas_reserved);
1583         }
1584
1585         for (i = 0; i < f_sectors; i++) {
1586                 if (s->s[i].state < SECTOR_dirty)
1587                         continue;
1588
1589                 s->s[i].nr_replicas = w->opts.compression
1590                         ? 0 : nr_replicas_this_write;
1591
1592                 s->s[i].replicas_reserved = 0;
1593                 folio_sector_set(folio, s, i, SECTOR_allocated);
1594         }
1595         spin_unlock(&s->lock);
1596
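        /*
         * write_count: one ref held by this function plus one per range
         * submitted below; folio_end_writeback() is called once the count
         * drops to zero:
         */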
1597         BUG_ON(atomic_read(&s->write_count));
1598         atomic_set(&s->write_count, 1);
1599
1600         BUG_ON(folio_test_writeback(folio));
1601         folio_start_writeback(folio);
1602
1603         folio_unlock(folio);
1604
1605         offset = 0;
1606         while (1) {
1607                 unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
1608                 u64 sector;
1609
1610                 while (offset < f_sectors &&
1611                        w->tmp[offset].state < SECTOR_dirty)
1612                         offset++;
1613
1614                 if (offset == f_sectors)
1615                         break;
1616
1617                 while (offset + sectors < f_sectors &&
1618                        w->tmp[offset + sectors].state >= SECTOR_dirty) {
1619                         reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
1620                         dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
1621                         sectors++;
1622                 }
1623                 BUG_ON(!sectors);
1624
1625                 sector = folio_sector(folio) + offset;
1626
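                /*
                 * Submit the current bch_writepage_io if this range can't be
                 * appended to it: different replication factor, a full bio, or
                 * a discontiguous sector:
                 */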
1627                 if (w->io &&
1628                     (w->io->op.res.nr_replicas != nr_replicas_this_write ||
1629                      bio_full(&w->io->op.wbio.bio, sectors << 9) ||
1630                      w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
1631                      (BIO_MAX_VECS * PAGE_SIZE) ||
1632                      bio_end_sector(&w->io->op.wbio.bio) != sector))
1633                         bch2_writepage_do_io(w);
1634
1635                 if (!w->io)
1636                         bch2_writepage_io_alloc(c, wbc, w, inode, sector,
1637                                                 nr_replicas_this_write);
1638
1639                 atomic_inc(&s->write_count);
1640
1641                 BUG_ON(inode != w->io->inode);
1642                 BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
1643                                      sectors << 9, offset << 9));
1644
1645                 /* Check for writing past i_size: */
1646                 WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
1647                           round_up(i_size, block_bytes(c)) &&
1648                           !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
1649                           "writing past i_size: %llu > %llu (unrounded %llu)\n",
1650                           bio_end_sector(&w->io->op.wbio.bio) << 9,
1651                           round_up(i_size, block_bytes(c)),
1652                           i_size);
1653
1654                 w->io->op.res.sectors += reserved_sectors;
1655                 w->io->op.i_sectors_delta -= dirty_sectors;
1656                 w->io->op.new_i_size = i_size;
1657
1658                 offset += sectors;
1659         }
1660
1661         if (atomic_dec_and_test(&s->write_count))
1662                 folio_end_writeback(folio);
1663
1664         return 0;
1665 }
1666
1667 int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
1668 {
1669         struct bch_fs *c = mapping->host->i_sb->s_fs_info;
1670         struct bch_writepage_state w =
1671                 bch_writepage_state_init(c, to_bch_ei(mapping->host));
1672         struct blk_plug plug;
1673         int ret;
1674
1675         blk_start_plug(&plug);
1676         ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
1677         if (w.io)
1678                 bch2_writepage_do_io(&w);
1679         blk_finish_plug(&plug);
1680         kfree(w.tmp);
1681         return bch2_err_class(ret);
1682 }
1683
1684 /* buffered writes: */
1685
1686 int bch2_write_begin(struct file *file, struct address_space *mapping,
1687                      loff_t pos, unsigned len,
1688                      struct page **pagep, void **fsdata)
1689 {
1690         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1691         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1692         struct bch2_folio_reservation *res;
1693         struct folio *folio;
1694         unsigned offset;
1695         int ret = -ENOMEM;
1696
1697         res = kmalloc(sizeof(*res), GFP_KERNEL);
1698         if (!res)
1699                 return -ENOMEM;
1700
1701         bch2_folio_reservation_init(c, inode, res);
1702         *fsdata = res;
1703
1704         bch2_pagecache_add_get(inode);
1705
1706         folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
1707                                 FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
1708                                 mapping_gfp_mask(mapping));
1709         if (!folio)
1710                 goto err_unlock;
1711
1712         if (folio_test_uptodate(folio))
1713                 goto out;
1714
1715         offset = pos - folio_pos(folio);
1716         len = min_t(size_t, len, folio_end_pos(folio) - pos);
1717
1718         /* If we're writing the entire folio, we don't need to read it in first: */
1719         if (!offset && len == folio_size(folio))
1720                 goto out;
1721
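        /*
         * If the write starts at the folio and runs to (or past) EOF, or the
         * folio lies entirely at or past EOF, there's nothing relevant on disk
         * to read - just zero the parts of the folio we won't be writing:
         */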
1722         if (!offset && pos + len >= inode->v.i_size) {
1723                 folio_zero_segment(folio, len, folio_size(folio));
1724                 flush_dcache_folio(folio);
1725                 goto out;
1726         }
1727
1728         if (folio_pos(folio) >= inode->v.i_size) {
1729                 folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
1730                 flush_dcache_folio(folio);
1731                 goto out;
1732         }
1733 readpage:
1734         ret = bch2_read_single_folio(folio, mapping);
1735         if (ret)
1736                 goto err;
1737 out:
1738         if (!bch2_folio_create(folio, __GFP_NOFAIL)->uptodate) {
1739                 ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
1740                 if (ret)
1741                         goto err;
1742         }
1743
1744         ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
1745         if (ret) {
1746                 if (!folio_test_uptodate(folio)) {
1747                         /*
1748                          * If the folio hasn't been read in, we won't know if we
1749                          * actually need a reservation - we don't actually need
1750                          * to read here, we just need to check if the folio is
1751                          * fully backed by uncompressed data:
1752                          */
1753                         goto readpage;
1754                 }
1755
1756                 goto err;
1757         }
1758
1759         *pagep = &folio->page;
1760         return 0;
1761 err:
1762         folio_unlock(folio);
1763         folio_put(folio);
1764         *pagep = NULL;
1765 err_unlock:
1766         bch2_pagecache_add_put(inode);
1767         kfree(res);
1768         *fsdata = NULL;
1769         return bch2_err_class(ret);
1770 }
1771
1772 int bch2_write_end(struct file *file, struct address_space *mapping,
1773                    loff_t pos, unsigned len, unsigned copied,
1774                    struct page *page, void *fsdata)
1775 {
1776         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1777         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1778         struct bch2_folio_reservation *res = fsdata;
1779         struct folio *folio = page_folio(page);
1780         unsigned offset = pos - folio_pos(folio);
1781
1782         lockdep_assert_held(&inode->v.i_rwsem);
1783         BUG_ON(offset + copied > folio_size(folio));
1784
1785         if (unlikely(copied < len && !folio_test_uptodate(folio))) {
1786                 /*
1787                  * The folio needs to be read in, but that would destroy
1788                  * our partial write - simplest thing is to just force
1789                  * userspace to redo the write:
1790                  */
1791                 folio_zero_range(folio, 0, folio_size(folio));
1792                 flush_dcache_folio(folio);
1793                 copied = 0;
1794         }
1795
1796         spin_lock(&inode->v.i_lock);
1797         if (pos + copied > inode->v.i_size)
1798                 i_size_write(&inode->v, pos + copied);
1799         spin_unlock(&inode->v.i_lock);
1800
1801         if (copied) {
1802                 if (!folio_test_uptodate(folio))
1803                         folio_mark_uptodate(folio);
1804
1805                 bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
1806
1807                 inode->ei_last_dirtied = (unsigned long) current;
1808         }
1809
1810         folio_unlock(folio);
1811         folio_put(folio);
1812         bch2_pagecache_add_put(inode);
1813
1814         bch2_folio_reservation_put(c, inode, res);
1815         kfree(res);
1816
1817         return copied;
1818 }
1819
1820 static noinline void folios_trunc(folios *folios, struct folio **fi)
1821 {
1822         while (folios->data + folios->nr > fi) {
1823                 struct folio *f = darray_pop(folios);
1824
1825                 folio_unlock(f);
1826                 folio_put(f);
1827         }
1828 }
1829
1830 static int __bch2_buffered_write(struct bch_inode_info *inode,
1831                                  struct address_space *mapping,
1832                                  struct iov_iter *iter,
1833                                  loff_t pos, unsigned len)
1834 {
1835         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1836         struct bch2_folio_reservation res;
1837         folios folios;
1838         struct folio **fi, *f;
1839         unsigned copied = 0, f_offset;
1840         u64 end = pos + len, f_pos;
1841         loff_t last_folio_pos = inode->v.i_size;
1842         int ret = 0;
1843
1844         BUG_ON(!len);
1845
1846         bch2_folio_reservation_init(c, inode, &res);
1847         darray_init(&folios);
1848
1849         ret = filemap_get_contig_folios_d(mapping, pos, end,
1850                                    FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
1851                                    mapping_gfp_mask(mapping),
1852                                    &folios);
1853         if (ret)
1854                 goto out;
1855
1856         BUG_ON(!folios.nr);
1857
1858         f = darray_first(folios);
1859         if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
1860                 ret = bch2_read_single_folio(f, mapping);
1861                 if (ret)
1862                         goto out;
1863         }
1864
1865         f = darray_last(folios);
1866         end = min(end, folio_end_pos(f));
1867         last_folio_pos = folio_pos(f);
1868         if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
1869                 if (end >= inode->v.i_size) {
1870                         folio_zero_range(f, 0, folio_size(f));
1871                 } else {
1872                         ret = bch2_read_single_folio(f, mapping);
1873                         if (ret)
1874                                 goto out;
1875                 }
1876         }
1877
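        /*
         * First pass: set up folio state and take disk reservations; if a
         * reservation fails, shrink the write to the folios we could reserve:
         */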
1878         f_pos = pos;
1879         f_offset = pos - folio_pos(darray_first(folios));
1880         darray_for_each(folios, fi) {
1881                 struct folio *f = *fi;
1882                 u64 f_len = min(end, folio_end_pos(f)) - f_pos;
1883
1884                 if (!bch2_folio_create(f, __GFP_NOFAIL)->uptodate) {
1885                         ret = bch2_folio_set(c, inode_inum(inode), fi,
1886                                              folios.data + folios.nr - fi);
1887                         if (ret)
1888                                 goto out;
1889                 }
1890
1891                 /*
1892                  * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
1893                  * supposed to write as much as we have disk space for.
1894                  *
1895                  * On failure here we should still write out a partial page if
1896                  * we aren't completely out of disk space - we don't do that
1897                  * yet:
1898                  */
1899                 ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
1900                 if (unlikely(ret)) {
1901                         folios_trunc(&folios, fi);
1902                         if (!folios.nr)
1903                                 goto out;
1904
1905                         end = min(end, folio_end_pos(darray_last(folios)));
1906                         break;
1907                 }
1908
1909                 f_pos = folio_end_pos(f);
1910                 f_offset = 0;
1911         }
1912
1913         if (mapping_writably_mapped(mapping))
1914                 darray_for_each(folios, fi)
1915                         flush_dcache_folio(*fi);
1916
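        /*
         * Second pass: copy the user data in; a short copy (e.g. a fault on
         * the source buffer while we hold the folios locked) shrinks the write
         * accordingly:
         */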
1917         f_pos = pos;
1918         f_offset = pos - folio_pos(darray_first(folios));
1919         darray_for_each(folios, fi) {
1920                 struct folio *f = *fi;
1921                 u64 f_len = min(end, folio_end_pos(f)) - f_pos;
1922                 unsigned f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
1923
1924                 if (!f_copied) {
1925                         folios_trunc(&folios, fi);
1926                         break;
1927                 }
1928
1929                 if (!folio_test_uptodate(f) &&
1930                     f_copied != folio_size(f) &&
1931                     pos + copied + f_copied < inode->v.i_size) {
1932                         folio_zero_range(f, 0, folio_size(f));
1933                         folios_trunc(&folios, fi);
1934                         break;
1935                 }
1936
1937                 flush_dcache_folio(f);
1938                 copied += f_copied;
1939
1940                 if (f_copied != f_len) {
1941                         folios_trunc(&folios, fi + 1);
1942                         break;
1943                 }
1944
1945                 f_pos = folio_end_pos(f);
1946                 f_offset = 0;
1947         }
1948
1949         if (!copied)
1950                 goto out;
1951
1952         end = pos + copied;
1953
1954         spin_lock(&inode->v.i_lock);
1955         if (end > inode->v.i_size)
1956                 i_size_write(&inode->v, end);
1957         spin_unlock(&inode->v.i_lock);
1958
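        /*
         * Third pass: now that i_size is updated, mark everything we copied
         * uptodate and dirty:
         */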
1959         f_pos = pos;
1960         f_offset = pos - folio_pos(darray_first(folios));
1961         darray_for_each(folios, fi) {
1962                 struct folio *f = *fi;
1963                 u64 f_len = min(end, folio_end_pos(f)) - f_pos;
1964
1965                 if (!folio_test_uptodate(f))
1966                         folio_mark_uptodate(f);
1967
1968                 bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
1969
1970                 f_pos = folio_end_pos(f);
1971                 f_offset = 0;
1972         }
1973
1974         inode->ei_last_dirtied = (unsigned long) current;
1975 out:
1976         darray_for_each(folios, fi) {
1977                 folio_unlock(*fi);
1978                 folio_put(*fi);
1979         }
1980
1981         /*
1982          * If the last folio added to the mapping starts beyond current EOF, we
1983          * performed a short write but left around at least one post-EOF folio.
1984          * Clean up the mapping before we return.
1985          */
1986         if (last_folio_pos >= inode->v.i_size)
1987                 truncate_pagecache(&inode->v, inode->v.i_size);
1988
1989         darray_exit(&folios);
1990         bch2_folio_reservation_put(c, inode, &res);
1991
1992         return copied ?: ret;
1993 }
1994
1995 static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1996 {
1997         struct file *file = iocb->ki_filp;
1998         struct address_space *mapping = file->f_mapping;
1999         struct bch_inode_info *inode = file_bch_inode(file);
2000         loff_t pos = iocb->ki_pos;
2001         ssize_t written = 0;
2002         int ret = 0;
2003
2004         bch2_pagecache_add_get(inode);
2005
2006         do {
2007                 unsigned offset = pos & (PAGE_SIZE - 1);
2008                 unsigned bytes = iov_iter_count(iter);
2009 again:
2010                 /*
2011                  * Bring in the user page that we will copy from _first_.
2012                  * Otherwise there's a nasty deadlock on copying from the
2013                  * same page as we're writing to, without it being marked
2014                  * up-to-date.
2015                  *
2016                  * Not only is this an optimisation, but it is also required
2017                  * to check that the address is actually valid, when atomic
2018                  * usercopies are used, below.
2019                  */
2020                 if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
2021                         bytes = min_t(unsigned long, iov_iter_count(iter),
2022                                       PAGE_SIZE - offset);
2023
2024                         if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
2025                                 ret = -EFAULT;
2026                                 break;
2027                         }
2028                 }
2029
2030                 if (unlikely(fatal_signal_pending(current))) {
2031                         ret = -EINTR;
2032                         break;
2033                 }
2034
2035                 ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
2036                 if (unlikely(ret < 0))
2037                         break;
2038
2039                 cond_resched();
2040
2041                 if (unlikely(ret == 0)) {
2042                         /*
2043                          * If we were unable to copy any data at all, we must
2044                          * fall back to a single segment length write.
2045                          *
2046                          * If we didn't fall back here, we could livelock
2047                          * because not all segments in the iov can be copied at
2048                          * once without a pagefault.
2049                          */
2050                         bytes = min_t(unsigned long, PAGE_SIZE - offset,
2051                                       iov_iter_single_seg_count(iter));
2052                         goto again;
2053                 }
2054                 pos += ret;
2055                 written += ret;
2056                 ret = 0;
2057
2058                 balance_dirty_pages_ratelimited(mapping);
2059         } while (iov_iter_count(iter));
2060
2061         bch2_pagecache_add_put(inode);
2062
2063         return written ? written : ret;
2064 }
2065
2066 /* O_DIRECT reads */
2067
2068 static void bio_check_or_release(struct bio *bio, bool check_dirty)
2069 {
2070         if (check_dirty) {
2071                 bio_check_pages_dirty(bio);
2072         } else {
2073                 bio_release_pages(bio, false);
2074                 bio_put(bio);
2075         }
2076 }
2077
2078 static void bch2_dio_read_complete(struct closure *cl)
2079 {
2080         struct dio_read *dio = container_of(cl, struct dio_read, cl);
2081
2082         dio->req->ki_complete(dio->req, dio->ret);
2083         bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
2084 }
2085
2086 static void bch2_direct_IO_read_endio(struct bio *bio)
2087 {
2088         struct dio_read *dio = bio->bi_private;
2089
2090         if (bio->bi_status)
2091                 dio->ret = blk_status_to_errno(bio->bi_status);
2092
2093         closure_put(&dio->cl);
2094 }
2095
2096 static void bch2_direct_IO_read_split_endio(struct bio *bio)
2097 {
2098         struct dio_read *dio = bio->bi_private;
2099         bool should_dirty = dio->should_dirty;
2100
2101         bch2_direct_IO_read_endio(bio);
2102         bio_check_or_release(bio, should_dirty);
2103 }
2104
2105 static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
2106 {
2107         struct file *file = req->ki_filp;
2108         struct bch_inode_info *inode = file_bch_inode(file);
2109         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2110         struct bch_io_opts opts;
2111         struct dio_read *dio;
2112         struct bio *bio;
2113         loff_t offset = req->ki_pos;
2114         bool sync = is_sync_kiocb(req);
2115         size_t shorten;
2116         ssize_t ret;
2117
2118         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
2119
2120         if ((offset|iter->count) & (block_bytes(c) - 1))
2121                 return -EINVAL;
2122
2123         ret = min_t(loff_t, iter->count,
2124                     max_t(loff_t, 0, i_size_read(&inode->v) - offset));
2125
2126         if (!ret)
2127                 return ret;
2128
2129         shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
2130         iter->count -= shorten;
2131
2132         bio = bio_alloc_bioset(NULL,
2133                                bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
2134                                REQ_OP_READ,
2135                                GFP_KERNEL,
2136                                &c->dio_read_bioset);
2137
2138         bio->bi_end_io = bch2_direct_IO_read_endio;
2139
2140         dio = container_of(bio, struct dio_read, rbio.bio);
2141         closure_init(&dio->cl, NULL);
2142
2143         /*
2144          * this is a _really_ horrible hack just to avoid an atomic sub at the
2145          * end:
2146          */
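        /*
         * In the async case the remaining count is primed so that the final
         * closure_put() from bio completion runs bch2_dio_read_complete()
         * directly; in the sync case closure_sync() below waits for the bios
         * instead:
         */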
2147         if (!sync) {
2148                 set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
2149                 atomic_set(&dio->cl.remaining,
2150                            CLOSURE_REMAINING_INITIALIZER -
2151                            CLOSURE_RUNNING +
2152                            CLOSURE_DESTRUCTOR);
2153         } else {
2154                 atomic_set(&dio->cl.remaining,
2155                            CLOSURE_REMAINING_INITIALIZER + 1);
2156         }
2157
2158         dio->req        = req;
2159         dio->ret        = ret;
2160         /*
2161          * This is one of the sketchier things I've encountered: we have to skip
2162          * the dirtying of requests that are internal from the kernel (i.e. from
2163          * loopback), because we'll deadlock on page_lock.
2164          */
2165         dio->should_dirty = iter_is_iovec(iter);
2166
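        /*
         * The first bio was allocated above from dio_read_bioset (it embeds
         * struct dio_read); only additional split bios are allocated from
         * c->bio_read inside the loop:
         */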
2167         goto start;
2168         while (iter->count) {
2169                 bio = bio_alloc_bioset(NULL,
2170                                        bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
2171                                        REQ_OP_READ,
2172                                        GFP_KERNEL,
2173                                        &c->bio_read);
2174                 bio->bi_end_io          = bch2_direct_IO_read_split_endio;
2175 start:
2176                 bio->bi_opf             = REQ_OP_READ|REQ_SYNC;
2177                 bio->bi_iter.bi_sector  = offset >> 9;
2178                 bio->bi_private         = dio;
2179
2180                 ret = bio_iov_iter_get_pages(bio, iter);
2181                 if (ret < 0) {
2182                         /* XXX: fault inject this path */
2183                         bio->bi_status = BLK_STS_RESOURCE;
2184                         bio_endio(bio);
2185                         break;
2186                 }
2187
2188                 offset += bio->bi_iter.bi_size;
2189
2190                 if (dio->should_dirty)
2191                         bio_set_pages_dirty(bio);
2192
2193                 if (iter->count)
2194                         closure_get(&dio->cl);
2195
2196                 bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
2197         }
2198
2199         iter->count += shorten;
2200
2201         if (sync) {
2202                 closure_sync(&dio->cl);
2203                 closure_debug_destroy(&dio->cl);
2204                 ret = dio->ret;
2205                 bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
2206                 return ret;
2207         } else {
2208                 return -EIOCBQUEUED;
2209         }
2210 }
2211
2212 ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
2213 {
2214         struct file *file = iocb->ki_filp;
2215         struct bch_inode_info *inode = file_bch_inode(file);
2216         struct address_space *mapping = file->f_mapping;
2217         size_t count = iov_iter_count(iter);
2218         ssize_t ret;
2219
2220         if (!count)
2221                 return 0; /* skip atime */
2222
2223         if (iocb->ki_flags & IOCB_DIRECT) {
2224                 struct blk_plug plug;
2225
2226                 if (unlikely(mapping->nrpages)) {
2227                         ret = filemap_write_and_wait_range(mapping,
2228                                                 iocb->ki_pos,
2229                                                 iocb->ki_pos + count - 1);
2230                         if (ret < 0)
2231                                 goto out;
2232                 }
2233
2234                 file_accessed(file);
2235
2236                 blk_start_plug(&plug);
2237                 ret = bch2_direct_IO_read(iocb, iter);
2238                 blk_finish_plug(&plug);
2239
2240                 if (ret >= 0)
2241                         iocb->ki_pos += ret;
2242         } else {
2243                 bch2_pagecache_add_get(inode);
2244                 ret = generic_file_read_iter(iocb, iter);
2245                 bch2_pagecache_add_put(inode);
2246         }
2247 out:
2248         return bch2_err_class(ret);
2249 }
2250
2251 /* O_DIRECT writes */
2252
2253 static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
2254                                        u64 offset, u64 size,
2255                                        unsigned nr_replicas, bool compressed)
2256 {
2257         struct btree_trans trans;
2258         struct btree_iter iter;
2259         struct bkey_s_c k;
2260         u64 end = offset + size;
2261         u32 snapshot;
2262         bool ret = true;
2263         int err;
2264
2265         bch2_trans_init(&trans, c, 0, 0);
2266 retry:
2267         bch2_trans_begin(&trans);
2268
2269         err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
2270         if (err)
2271                 goto err;
2272
2273         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
2274                            SPOS(inum.inum, offset, snapshot),
2275                            BTREE_ITER_SLOTS, k, err) {
2276                 if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
2277                         break;
2278
2279                 if (k.k->p.snapshot != snapshot ||
2280                     nr_replicas > bch2_bkey_replicas(c, k) ||
2281                     (!compressed && bch2_bkey_sectors_compressed(k))) {
2282                         ret = false;
2283                         break;
2284                 }
2285         }
2286
2287         offset = iter.pos.offset;
2288         bch2_trans_iter_exit(&trans, &iter);
2289 err:
2290         if (bch2_err_matches(err, BCH_ERR_transaction_restart))
2291                 goto retry;
2292         bch2_trans_exit(&trans);
2293
2294         return err ? false : ret;
2295 }
2296
2297 static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
2298 {
2299         struct bch_fs *c = dio->op.c;
2300         struct bch_inode_info *inode = dio->inode;
2301         struct bio *bio = &dio->op.wbio.bio;
2302
2303         return bch2_check_range_allocated(c, inode_inum(inode),
2304                                 dio->op.pos.offset, bio_sectors(bio),
2305                                 dio->op.opts.data_replicas,
2306                                 dio->op.opts.compression != 0);
2307 }
2308
2309 static void bch2_dio_write_loop_async(struct bch_write_op *);
2310 static __always_inline long bch2_dio_write_done(struct dio_write *dio);
2311
2312 static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
2313 {
2314         struct iovec *iov = dio->inline_vecs;
2315
2316         if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
2317                 iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
2318                                     GFP_KERNEL);
2319                 if (unlikely(!iov))
2320                         return -ENOMEM;
2321
2322                 dio->free_iov = true;
2323         }
2324
2325         memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
2326         dio->iter.iov = iov;
2327         return 0;
2328 }
2329
2330 static void bch2_dio_write_flush_done(struct closure *cl)
2331 {
2332         struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
2333         struct bch_fs *c = dio->op.c;
2334
2335         closure_debug_destroy(cl);
2336
2337         dio->op.error = bch2_journal_error(&c->journal);
2338
2339         bch2_dio_write_done(dio);
2340 }
2341
2342 static noinline void bch2_dio_write_flush(struct dio_write *dio)
2343 {
2344         struct bch_fs *c = dio->op.c;
2345         struct bch_inode_unpacked inode;
2346         int ret;
2347
2348         dio->flush = 0;
2349
2350         closure_init(&dio->op.cl, NULL);
2351
2352         if (!dio->op.error) {
2353                 ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
2354                 if (ret) {
2355                         dio->op.error = ret;
2356                 } else {
2357                         bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
2358                         bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
2359                 }
2360         }
2361
2362         if (dio->sync) {
2363                 closure_sync(&dio->op.cl);
2364                 closure_debug_destroy(&dio->op.cl);
2365         } else {
2366                 continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
2367         }
2368 }
2369
2370 static __always_inline long bch2_dio_write_done(struct dio_write *dio)
2371 {
2372         struct kiocb *req = dio->req;
2373         struct bch_inode_info *inode = dio->inode;
2374         bool sync = dio->sync;
2375         long ret;
2376
2377         if (unlikely(dio->flush)) {
2378                 bch2_dio_write_flush(dio);
2379                 if (!sync)
2380                         return -EIOCBQUEUED;
2381         }
2382
2383         bch2_pagecache_block_put(inode);
2384
2385         if (dio->free_iov)
2386                 kfree(dio->iter.iov);
2387
2388         ret = dio->op.error ?: ((long) dio->written << 9);
2389         bio_put(&dio->op.wbio.bio);
2390
2391         /* inode->i_dio_count is our ref on inode and thus bch_fs */
2392         inode_dio_end(&inode->v);
2393
2394         if (ret < 0)
2395                 ret = bch2_err_class(ret);
2396
2397         if (!sync) {
2398                 req->ki_complete(req, ret);
2399                 ret = -EIOCBQUEUED;
2400         }
2401         return ret;
2402 }
2403
2404 static __always_inline void bch2_dio_write_end(struct dio_write *dio)
2405 {
2406         struct bch_fs *c = dio->op.c;
2407         struct kiocb *req = dio->req;
2408         struct bch_inode_info *inode = dio->inode;
2409         struct bio *bio = &dio->op.wbio.bio;
2410
2411         req->ki_pos     += (u64) dio->op.written << 9;
2412         dio->written    += dio->op.written;
2413
2414         if (dio->extending) {
2415                 spin_lock(&inode->v.i_lock);
2416                 if (req->ki_pos > inode->v.i_size)
2417                         i_size_write(&inode->v, req->ki_pos);
2418                 spin_unlock(&inode->v.i_lock);
2419         }
2420
2421         if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
2422                 mutex_lock(&inode->ei_quota_lock);
2423                 __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
2424                 __bch2_quota_reservation_put(c, inode, &dio->quota_res);
2425                 mutex_unlock(&inode->ei_quota_lock);
2426         }
2427
2428         if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) {
2429                 struct bvec_iter_all iter;
2430                 struct folio_vec fv;
2431
2432                 bio_for_each_folio_all(fv, bio, iter)
2433                         folio_put(fv.fv_folio);
2434         }
2435
2436         if (unlikely(dio->op.error))
2437                 set_bit(EI_INODE_ERROR, &inode->ei_flags);
2438 }
2439
2440 static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
2441 {
2442         struct bch_fs *c = dio->op.c;
2443         struct kiocb *req = dio->req;
2444         struct address_space *mapping = dio->mapping;
2445         struct bch_inode_info *inode = dio->inode;
2446         struct bch_io_opts opts;
2447         struct bio *bio = &dio->op.wbio.bio;
2448         unsigned unaligned, iter_count;
2449         bool sync = dio->sync, dropped_locks;
2450         long ret;
2451
2452         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
2453
2454         while (1) {
2455                 iter_count = dio->iter.count;
2456
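                /*
                 * Pulling in user pages can fault on a mapping of the file
                 * we're writing to; faults_disabled_mapping lets the fault
                 * path detect the recursion rather than deadlock on the
                 * pagecache lock we hold (see the dropped_locks handling
                 * below):
                 */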
2457                 EBUG_ON(current->faults_disabled_mapping);
2458                 current->faults_disabled_mapping = mapping;
2459
2460                 ret = bio_iov_iter_get_pages(bio, &dio->iter);
2461
2462                 dropped_locks = fdm_dropped_locks();
2463
2464                 current->faults_disabled_mapping = NULL;
2465
2466                 /*
2467                  * If the fault handler returned an error but also signalled
2468                  * that it dropped & retook ei_pagecache_lock, we just need to
2469                  * re-shoot down the page cache and retry:
2470                  */
2471                 if (dropped_locks && ret)
2472                         ret = 0;
2473
2474                 if (unlikely(ret < 0))
2475                         goto err;
2476
2477                 if (unlikely(dropped_locks)) {
2478                         ret = write_invalidate_inode_pages_range(mapping,
2479                                         req->ki_pos,
2480                                         req->ki_pos + iter_count - 1);
2481                         if (unlikely(ret))
2482                                 goto err;
2483
2484                         if (!bio->bi_iter.bi_size)
2485                                 continue;
2486                 }
2487
2488                 unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
2489                 bio->bi_iter.bi_size -= unaligned;
2490                 iov_iter_revert(&dio->iter, unaligned);
2491
2492                 if (!bio->bi_iter.bi_size) {
2493                         /*
2494                          * bio_iov_iter_get_pages was only able to get <
2495                          * blocksize worth of pages:
2496                          */
2497                         ret = -EFAULT;
2498                         goto err;
2499                 }
2500
2501                 bch2_write_op_init(&dio->op, c, opts);
2502                 dio->op.end_io          = sync
2503                         ? NULL
2504                         : bch2_dio_write_loop_async;
2505                 dio->op.target          = dio->op.opts.foreground_target;
2506                 dio->op.write_point     = writepoint_hashed((unsigned long) current);
2507                 dio->op.nr_replicas     = dio->op.opts.data_replicas;
2508                 dio->op.subvol          = inode->ei_subvol;
2509                 dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
2510                 dio->op.devs_need_flush = &inode->ei_devs_need_flush;
2511
2512                 if (sync)
2513                         dio->op.flags |= BCH_WRITE_SYNC;
2514                 dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
2515
2516                 ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
2517                                                  bio_sectors(bio), true);
2518                 if (unlikely(ret))
2519                         goto err;
2520
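                /*
                 * If we couldn't reserve space, the write may still proceed if
                 * the range is already allocated with enough replicas and not
                 * compressed - then it shouldn't need net new space:
                 */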
2521                 ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
2522                                                 dio->op.opts.data_replicas, 0);
2523                 if (unlikely(ret) &&
2524                     !bch2_dio_write_check_allocated(dio))
2525                         goto err;
2526
2527                 task_io_account_write(bio->bi_iter.bi_size);
2528
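                /*
                 * An async write with more left to submit must not keep using
                 * the caller's iovec after returning - copy it; if the copy
                 * fails, fall back to a synchronous write:
                 */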
2529                 if (unlikely(dio->iter.count) &&
2530                     !dio->sync &&
2531                     !dio->loop &&
2532                     bch2_dio_write_copy_iov(dio))
2533                         dio->sync = sync = true;
2534
2535                 dio->loop = true;
2536                 closure_call(&dio->op.cl, bch2_write, NULL, NULL);
2537
2538                 if (!sync)
2539                         return -EIOCBQUEUED;
2540
2541                 bch2_dio_write_end(dio);
2542
2543                 if (likely(!dio->iter.count) || dio->op.error)
2544                         break;
2545
2546                 bio_reset(bio, NULL, REQ_OP_WRITE);
2547         }
2548 out:
2549         return bch2_dio_write_done(dio);
2550 err:
2551         dio->op.error = ret;
2552
2553         if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
2554                 struct bvec_iter_all iter;
2555                 struct folio_vec fv;
2556
2557                 bio_for_each_folio_all(fv, bio, iter)
2558                         folio_put(fv.fv_folio);
2559         }
2560
2561         bch2_quota_reservation_put(c, inode, &dio->quota_res);
2562         goto out;
2563 }
2564
2565 static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
2566 {
2567         struct mm_struct *mm = dio->mm;
2568
2569         bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
2570
2571         if (mm)
2572                 kthread_use_mm(mm);
2573         bch2_dio_write_loop(dio);
2574         if (mm)
2575                 kthread_unuse_mm(mm);
2576 }
2577
2578 static void bch2_dio_write_loop_async(struct bch_write_op *op)
2579 {
2580         struct dio_write *dio = container_of(op, struct dio_write, op);
2581
2582         bch2_dio_write_end(dio);
2583
2584         if (likely(!dio->iter.count) || dio->op.error)
2585                 bch2_dio_write_done(dio);
2586         else
2587                 bch2_dio_write_continue(dio);
2588 }
2589
2590 static noinline
2591 ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
2592 {
2593         struct file *file = req->ki_filp;
2594         struct address_space *mapping = file->f_mapping;
2595         struct bch_inode_info *inode = file_bch_inode(file);
2596         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2597         struct dio_write *dio;
2598         struct bio *bio;
2599         bool locked = true, extending;
2600         ssize_t ret;
2601
2602         prefetch(&c->opts);
2603         prefetch((void *) &c->opts + 64);
2604         prefetch(&inode->ei_inode);
2605         prefetch((void *) &inode->ei_inode + 64);
2606
2607         inode_lock(&inode->v);
2608
2609         ret = generic_write_checks(req, iter);
2610         if (unlikely(ret <= 0))
2611                 goto err;
2612
2613         ret = file_remove_privs(file);
2614         if (unlikely(ret))
2615                 goto err;
2616
2617         ret = file_update_time(file);
2618         if (unlikely(ret))
2619                 goto err;
2620
2621         if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
2622                 goto err;
2623
2624         inode_dio_begin(&inode->v);
2625         bch2_pagecache_block_get(inode);
2626
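        /*
         * Extending writes are forced synchronous (below) and keep the inode
         * lock held across the i_size update; non-extending writes can drop it
         * here and complete asynchronously:
         */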
2627         extending = req->ki_pos + iter->count > inode->v.i_size;
2628         if (!extending) {
2629                 inode_unlock(&inode->v);
2630                 locked = false;
2631         }
2632
2633         bio = bio_alloc_bioset(NULL,
2634                                bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
2635                                REQ_OP_WRITE,
2636                                GFP_KERNEL,
2637                                &c->dio_write_bioset);
2638         dio = container_of(bio, struct dio_write, op.wbio.bio);
2639         dio->req                = req;
2640         dio->mapping            = mapping;
2641         dio->inode              = inode;
2642         dio->mm                 = current->mm;
2643         dio->loop               = false;
2644         dio->extending          = extending;
2645         dio->sync               = is_sync_kiocb(req) || extending;
2646         dio->flush              = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
2647         dio->free_iov           = false;
2648         dio->quota_res.sectors  = 0;
2649         dio->written            = 0;
2650         dio->iter               = *iter;
2651         dio->op.c               = c;
2652
2653         if (unlikely(mapping->nrpages)) {
2654                 ret = write_invalidate_inode_pages_range(mapping,
2655                                                 req->ki_pos,
2656                                                 req->ki_pos + iter->count - 1);
2657                 if (unlikely(ret))
2658                         goto err_put_bio;
2659         }
2660
2661         ret = bch2_dio_write_loop(dio);
2662 err:
2663         if (locked)
2664                 inode_unlock(&inode->v);
2665         return ret;
2666 err_put_bio:
2667         bch2_pagecache_block_put(inode);
2668         bio_put(bio);
2669         inode_dio_end(&inode->v);
2670         goto err;
2671 }
2672
2673 ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
2674 {
2675         struct file *file = iocb->ki_filp;
2676         struct bch_inode_info *inode = file_bch_inode(file);
2677         ssize_t ret;
2678
2679         if (iocb->ki_flags & IOCB_DIRECT) {
2680                 ret = bch2_direct_write(iocb, from);
2681                 goto out;
2682         }
2683
2684         /* We can write back this queue in page reclaim */
2685         current->backing_dev_info = inode_to_bdi(&inode->v);
2686         inode_lock(&inode->v);
2687
2688         ret = generic_write_checks(iocb, from);
2689         if (ret <= 0)
2690                 goto unlock;
2691
2692         ret = file_remove_privs(file);
2693         if (ret)
2694                 goto unlock;
2695
2696         ret = file_update_time(file);
2697         if (ret)
2698                 goto unlock;
2699
2700         ret = bch2_buffered_write(iocb, from);
2701         if (likely(ret > 0))
2702                 iocb->ki_pos += ret;
2703 unlock:
2704         inode_unlock(&inode->v);
2705         current->backing_dev_info = NULL;
2706
2707         if (ret > 0)
2708                 ret = generic_write_sync(iocb, ret);
2709 out:
2710         return bch2_err_class(ret);
2711 }
2712
2713 /* fsync: */
2714
2715 /*
2716  * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
2717  * insert trigger: look up the btree inode instead
2718  */
2719 static int bch2_flush_inode(struct bch_fs *c,
2720                             struct bch_inode_info *inode)
2721 {
2722         struct bch_inode_unpacked u;
2723         int ret;
2724
2725         if (c->opts.journal_flush_disabled)
2726                 return 0;
2727
2728         ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
2729         if (ret)
2730                 return ret;
2731
2732         return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
2733                 bch2_inode_flush_nocow_writes(c, inode);
2734 }
2735
2736 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2737 {
2738         struct bch_inode_info *inode = file_bch_inode(file);
2739         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2740         int ret, ret2, ret3;
2741
2742         ret = file_write_and_wait_range(file, start, end);
2743         ret2 = sync_inode_metadata(&inode->v, 1);
2744         ret3 = bch2_flush_inode(c, inode);
2745
2746         return bch2_err_class(ret ?: ret2 ?: ret3);
2747 }
2748
2749 /* truncate: */
2750
2751 static inline int range_has_data(struct bch_fs *c, u32 subvol,
2752                                  struct bpos start,
2753                                  struct bpos end)
2754 {
2755         struct btree_trans trans;
2756         struct btree_iter iter;
2757         struct bkey_s_c k;
2758         int ret = 0;
2759
2760         bch2_trans_init(&trans, c, 0, 0);
2761 retry:
2762         bch2_trans_begin(&trans);
2763
2764         ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
2765         if (ret)
2766                 goto err;
2767
2768         for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
2769                 if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
2770                         ret = 1;
2771                         break;
2772                 }
2773         start = iter.pos;
2774         bch2_trans_iter_exit(&trans, &iter);
2775 err:
2776         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2777                 goto retry;
2778
2779         bch2_trans_exit(&trans);
2780         return ret;
2781 }
2782
2783 static int __bch2_truncate_folio(struct bch_inode_info *inode,
2784                                  pgoff_t index, loff_t start, loff_t end)
2785 {
2786         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2787         struct address_space *mapping = inode->v.i_mapping;
2788         struct bch_folio *s;
2789         unsigned start_offset = start & (PAGE_SIZE - 1);
2790         unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
2791         unsigned i;
2792         struct folio *folio;
2793         s64 i_sectors_delta = 0;
2794         int ret = 0;
2795         u64 end_pos;
2796
2797         folio = filemap_lock_folio(mapping, index);
2798         if (!folio) {
2799                 /*
2800                  * XXX: we're doing two index lookups when we end up reading the
2801                  * folio
2802                  */
2803                 ret = range_has_data(c, inode->ei_subvol,
2804                                 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
2805                                 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
2806                 if (ret <= 0)
2807                         return ret;
2808
2809                 folio = __filemap_get_folio(mapping, index,
2810                                             FGP_LOCK|FGP_CREAT, GFP_KERNEL);
2811                 if (unlikely(!folio)) {
2812                         ret = -ENOMEM;
2813                         goto out;
2814                 }
2815         }
2816
2817         BUG_ON(start    >= folio_end_pos(folio));
2818         BUG_ON(end      <= folio_pos(folio));
2819
2820         start_offset    = max(start, folio_pos(folio)) - folio_pos(folio);
2821         end_offset      = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
2822
2823         /* Folio boundary? Nothing to do */
2824         if (start_offset == 0 &&
2825             end_offset == folio_size(folio)) {
2826                 ret = 0;
2827                 goto unlock;
2828         }
2829
2830         s = bch2_folio_create(folio, 0);
2831         if (!s) {
2832                 ret = -ENOMEM;
2833                 goto unlock;
2834         }
2835
2836         if (!folio_test_uptodate(folio)) {
2837                 ret = bch2_read_single_folio(folio, mapping);
2838                 if (ret)
2839                         goto unlock;
2840         }
2841
2842         if (!s->uptodate) {
2843                 ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
2844                 if (ret)
2845                         goto unlock;
2846         }
2847
2848         for (i = round_up(start_offset, block_bytes(c)) >> 9;
2849              i < round_down(end_offset, block_bytes(c)) >> 9;
2850              i++) {
2851                 s->s[i].nr_replicas     = 0;
2852
2853                 i_sectors_delta -= s->s[i].state == SECTOR_dirty;
2854                 folio_sector_set(folio, s, i, SECTOR_unallocated);
2855         }
2856
2857         i_sectors_acct(c, inode, NULL, i_sectors_delta);
2858
2859         /*
2860          * Caller needs to know whether this folio will be written out by
2861          * writeback - doing an i_size update if necessary - or whether it will
2862          * writeback - doing an i_size update if necessary - or whether the
2863          * caller will be responsible for the i_size update.
2864          * Note that we shouldn't ever see a folio beyond EOF, but check and
2865          * warn if so. This has been observed by failure to clean up folios
2866          * after a short write and there's still a chance reclaim will fix
2867          * things up.
2868          */
2869         WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
2870         end_pos = folio_end_pos(folio);
2871         if (inode->v.i_size > folio_pos(folio))
2872                 end_pos = min_t(u64, inode->v.i_size, end_pos);
2873         ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
2874
2875         folio_zero_segment(folio, start_offset, end_offset);
2876
2877         /*
2878          * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
2879          *
2880          * XXX: because we aren't currently tracking whether the folio has actual
2881          * data in it (vs. just 0s, or only partially written) this is wrong. ick.
2882          */
2883         BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
2884
2885         /*
2886          * This removes any writeable userspace mappings; we need to force
2887          * .page_mkwrite to be called again before any mmapped writes, to
2888          * redirty the full page:
2889          */
2890         folio_mkclean(folio);
2891         filemap_dirty_folio(mapping, folio);
2892 unlock:
2893         folio_unlock(folio);
2894         folio_put(folio);
2895 out:
2896         return ret;
2897 }
2898
2899 static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
2900 {
2901         return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
2902                                      from, ANYSINT_MAX(loff_t));
2903 }
2904
2905 static int bch2_truncate_folios(struct bch_inode_info *inode,
2906                                 loff_t start, loff_t end)
2907 {
2908         int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
2909                                         start, end);
2910
2911         if (ret >= 0 &&
2912             start >> PAGE_SHIFT != end >> PAGE_SHIFT)
2913                 ret = __bch2_truncate_folio(inode,
2914                                         (end - 1) >> PAGE_SHIFT,
2915                                         start, end);
2916         return ret;
2917 }
2918
2919 static int bch2_extend(struct mnt_idmap *idmap,
2920                        struct bch_inode_info *inode,
2921                        struct bch_inode_unpacked *inode_u,
2922                        struct iattr *iattr)
2923 {
2924         struct address_space *mapping = inode->v.i_mapping;
2925         int ret;
2926
2927         /*
2928          * sync appends:
2929          *
2930          * this has to be done _before_ extending i_size:
2931          */
2932         ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
2933         if (ret)
2934                 return ret;
2935
2936         truncate_setsize(&inode->v, iattr->ia_size);
2937
2938         return bch2_setattr_nonsize(idmap, inode, iattr);
2939 }
2940
2941 static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
2942                                    struct bch_inode_unpacked *bi,
2943                                    void *p)
2944 {
2945         bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
2946         return 0;
2947 }
2948
2949 static int bch2_truncate_start_fn(struct bch_inode_info *inode,
2950                                   struct bch_inode_unpacked *bi, void *p)
2951 {
2952         u64 *new_i_size = p;
2953
2954         bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
2955         bi->bi_size = *new_i_size;
2956         return 0;
2957 }
2958
2959 int bch2_truncate(struct mnt_idmap *idmap,
2960                   struct bch_inode_info *inode, struct iattr *iattr)
2961 {
2962         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2963         struct address_space *mapping = inode->v.i_mapping;
2964         struct bch_inode_unpacked inode_u;
2965         u64 new_i_size = iattr->ia_size;
2966         s64 i_sectors_delta = 0;
2967         int ret = 0;
2968
2969         /*
2970          * If the truncate call will change the size of the file, the
2971          * cmtimes should be updated. If the size will not change, we
2972          * do not need to update the cmtimes.
2973          */
2974         if (iattr->ia_size != inode->v.i_size) {
2975                 if (!(iattr->ia_valid & ATTR_MTIME))
2976                         ktime_get_coarse_real_ts64(&iattr->ia_mtime);
2977                 if (!(iattr->ia_valid & ATTR_CTIME))
2978                         ktime_get_coarse_real_ts64(&iattr->ia_ctime);
2979                 iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
2980         }
2981
2982         inode_dio_wait(&inode->v);
2983         bch2_pagecache_block_get(inode);
2984
2985         ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
2986         if (ret)
2987                 goto err;
2988
2989         /*
2990          * check this before the next assertion; on filesystem error our normal
2991          * invariants are a bit broken (truncate has to truncate the page cache
2992          * before the inode).
2993          */
2994         ret = bch2_journal_error(&c->journal);
2995         if (ret)
2996                 goto err;
2997
2998         WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
2999                   inode->v.i_size < inode_u.bi_size,
3000                   "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
3001                   (u64) inode->v.i_size, inode_u.bi_size);
3002
3003         if (iattr->ia_size > inode->v.i_size) {
3004                 ret = bch2_extend(idmap, inode, &inode_u, iattr);
3005                 goto err;
3006         }
3007
3008         iattr->ia_valid &= ~ATTR_SIZE;
3009
3010         ret = bch2_truncate_folio(inode, iattr->ia_size);
3011         if (unlikely(ret < 0))
3012                 goto err;
3013
3014         /*
3015          * When extending, we're going to write the new i_size to disk
3016          * immediately so we need to flush anything above the current on disk
3017          * i_size first:
3018          *
3019          * Also, when extending we need to flush the page that i_size currently
3020          * straddles - if it's mapped to userspace, we need to ensure that
3021          * userspace has to redirty it and call .mkwrite -> set_page_dirty
3022          * again to allocate the part of the page that was extended.
3023          */
3024         if (iattr->ia_size > inode_u.bi_size)
3025                 ret = filemap_write_and_wait_range(mapping,
3026                                 inode_u.bi_size,
3027                                 iattr->ia_size - 1);
3028         else if (iattr->ia_size & (PAGE_SIZE - 1))
3029                 ret = filemap_write_and_wait_range(mapping,
3030                                 round_down(iattr->ia_size, PAGE_SIZE),
3031                                 iattr->ia_size - 1);
3032         if (ret)
3033                 goto err;
3034
3035         mutex_lock(&inode->ei_update_lock);
3036         ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
3037                                &new_i_size, 0);
3038         mutex_unlock(&inode->ei_update_lock);
3039
3040         if (unlikely(ret))
3041                 goto err;
3042
3043         truncate_setsize(&inode->v, iattr->ia_size);
3044
3045         ret = bch2_fpunch(c, inode_inum(inode),
3046                         round_up(iattr->ia_size, block_bytes(c)) >> 9,
3047                         U64_MAX, &i_sectors_delta);
3048         i_sectors_acct(c, inode, NULL, i_sectors_delta);
3049
3050         bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
3051                                 !bch2_journal_error(&c->journal), c,
3052                                 "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
3053                                 inode->v.i_ino, (u64) inode->v.i_blocks,
3054                                 inode->ei_inode.bi_sectors);
3055         if (unlikely(ret))
3056                 goto err;
3057
3058         mutex_lock(&inode->ei_update_lock);
3059         ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
3060         mutex_unlock(&inode->ei_update_lock);
3061
3062         ret = bch2_setattr_nonsize(idmap, inode, iattr);
3063 err:
3064         bch2_pagecache_block_put(inode);
3065         return bch2_err_class(ret);
3066 }
3067
3068 /* fallocate: */
3069
3070 static int inode_update_times_fn(struct bch_inode_info *inode,
3071                                  struct bch_inode_unpacked *bi, void *p)
3072 {
3073         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3074
3075         bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
3076         return 0;
3077 }
3078
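/*
 * FALLOC_FL_PUNCH_HOLE: zero out any partial folios at either end of
 * the range, drop the affected pagecache, then delete whole blocks from
 * the extents btree with bch2_fpunch(). i_size is left unchanged; only
 * the inode times (and i_blocks accounting) are updated.
 */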
3079 static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
3080 {
3081         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3082         u64 end         = offset + len;
3083         u64 block_start = round_up(offset, block_bytes(c));
3084         u64 block_end   = round_down(end, block_bytes(c));
3085         bool truncated_last_page;
3086         int ret = 0;
3087
3088         ret = bch2_truncate_folios(inode, offset, end);
3089         if (unlikely(ret < 0))
3090                 goto err;
3091
3092         truncated_last_page = ret;
3093
3094         truncate_pagecache_range(&inode->v, offset, end - 1);
3095
3096         if (block_start < block_end) {
3097                 s64 i_sectors_delta = 0;
3098
3099                 ret = bch2_fpunch(c, inode_inum(inode),
3100                                   block_start >> 9, block_end >> 9,
3101                                   &i_sectors_delta);
3102                 i_sectors_acct(c, inode, NULL, i_sectors_delta);
3103         }
3104
3105         mutex_lock(&inode->ei_update_lock);
3106         if (end >= inode->v.i_size && !truncated_last_page) {
3107                 ret = bch2_write_inode_size(c, inode, inode->v.i_size,
3108                                             ATTR_MTIME|ATTR_CTIME);
3109         } else {
3110                 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
3111                                        ATTR_MTIME|ATTR_CTIME);
3112         }
3113         mutex_unlock(&inode->ei_update_lock);
3114 err:
3115         return ret;
3116 }
3117
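/*
 * FALLOC_FL_COLLAPSE_RANGE/FALLOC_FL_INSERT_RANGE: shift every extent
 * past the affected range by -len (collapse) or +len (insert). Insert
 * walks extents backwards from the end of the file so keys can be moved
 * up without clobbering ones not yet copied; collapse first punches
 * [offset, offset + len) and then walks forwards, moving the following
 * extents down. Each extent is written at its new position and deleted
 * at its old one in a single transaction commit.
 */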
3118 static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
3119                                    loff_t offset, loff_t len,
3120                                    bool insert)
3121 {
3122         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3123         struct address_space *mapping = inode->v.i_mapping;
3124         struct bkey_buf copy;
3125         struct btree_trans trans;
3126         struct btree_iter src, dst, del;
3127         loff_t shift, new_size;
3128         u64 src_start;
3129         int ret = 0;
3130
3131         if ((offset | len) & (block_bytes(c) - 1))
3132                 return -EINVAL;
3133
3134         if (insert) {
3135                 if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
3136                         return -EFBIG;
3137
3138                 if (offset >= inode->v.i_size)
3139                         return -EINVAL;
3140
3141                 src_start       = U64_MAX;
3142                 shift           = len;
3143         } else {
3144                 if (offset + len >= inode->v.i_size)
3145                         return -EINVAL;
3146
3147                 src_start       = offset + len;
3148                 shift           = -len;
3149         }
3150
3151         new_size = inode->v.i_size + shift;
3152
3153         ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
3154         if (ret)
3155                 return ret;
3156
3157         if (insert) {
3158                 i_size_write(&inode->v, new_size);
3159                 mutex_lock(&inode->ei_update_lock);
3160                 ret = bch2_write_inode_size(c, inode, new_size,
3161                                             ATTR_MTIME|ATTR_CTIME);
3162                 mutex_unlock(&inode->ei_update_lock);
3163         } else {
3164                 s64 i_sectors_delta = 0;
3165
3166                 ret = bch2_fpunch(c, inode_inum(inode),
3167                                   offset >> 9, (offset + len) >> 9,
3168                                   &i_sectors_delta);
3169                 i_sectors_acct(c, inode, NULL, i_sectors_delta);
3170
3171                 if (ret)
3172                         return ret;
3173         }
3174
3175         bch2_bkey_buf_init(&copy);
3176         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
3177         bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
3178                         POS(inode->v.i_ino, src_start >> 9),
3179                         BTREE_ITER_INTENT);
3180         bch2_trans_copy_iter(&dst, &src);
3181         bch2_trans_copy_iter(&del, &src);
3182
3183         while (ret == 0 ||
3184                bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
3185                 struct disk_reservation disk_res =
3186                         bch2_disk_reservation_init(c, 0);
3187                 struct bkey_i delete;
3188                 struct bkey_s_c k;
3189                 struct bpos next_pos;
3190                 struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
3191                 struct bpos atomic_end;
3192                 unsigned trigger_flags = 0;
3193                 u32 snapshot;
3194
3195                 bch2_trans_begin(&trans);
3196
3197                 ret = bch2_subvolume_get_snapshot(&trans,
3198                                         inode->ei_subvol, &snapshot);
3199                 if (ret)
3200                         continue;
3201
3202                 bch2_btree_iter_set_snapshot(&src, snapshot);
3203                 bch2_btree_iter_set_snapshot(&dst, snapshot);
3204                 bch2_btree_iter_set_snapshot(&del, snapshot);
3205
3206                 bch2_trans_begin(&trans);
3207
3208                 k = insert
3209                         ? bch2_btree_iter_peek_prev(&src)
3210                         : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX));
3211                 if ((ret = bkey_err(k)))
3212                         continue;
3213
3214                 if (!k.k || k.k->p.inode != inode->v.i_ino)
3215                         break;
3216
3217                 if (insert &&
3218                     bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9)))
3219                         break;
3220 reassemble:
3221                 bch2_bkey_buf_reassemble(&copy, c, k);
3222
3223                 if (insert &&
3224                     bkey_lt(bkey_start_pos(k.k), move_pos))
3225                         bch2_cut_front(move_pos, copy.k);
3226
3227                 copy.k->k.p.offset += shift >> 9;
3228                 bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
3229
3230                 ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
3231                 if (ret)
3232                         continue;
3233
3234                 if (!bkey_eq(atomic_end, copy.k->k.p)) {
3235                         if (insert) {
3236                                 move_pos = atomic_end;
3237                                 move_pos.offset -= shift >> 9;
3238                                 goto reassemble;
3239                         } else {
3240                                 bch2_cut_back(atomic_end, copy.k);
3241                         }
3242                 }
3243
3244                 bkey_init(&delete.k);
3245                 delete.k.p = copy.k->k.p;
3246                 delete.k.size = copy.k->k.size;
3247                 delete.k.p.offset -= shift >> 9;
3248                 bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
3249
3250                 next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
3251
3252                 if (copy.k->k.size != k.k->size) {
3253                         /* We might end up splitting compressed extents: */
3254                         unsigned nr_ptrs =
3255                                 bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
3256
3257                         ret = bch2_disk_reservation_get(c, &disk_res,
3258                                         copy.k->k.size, nr_ptrs,
3259                                         BCH_DISK_RESERVATION_NOFAIL);
3260                         BUG_ON(ret);
3261                 }
3262
3263                 ret =   bch2_btree_iter_traverse(&del) ?:
3264                         bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
3265                         bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
3266                         bch2_trans_commit(&trans, &disk_res, NULL,
3267                                           BTREE_INSERT_NOFAIL);
3268                 bch2_disk_reservation_put(c, &disk_res);
3269
3270                 if (!ret)
3271                         bch2_btree_iter_set_pos(&src, next_pos);
3272         }
3273         bch2_trans_iter_exit(&trans, &del);
3274         bch2_trans_iter_exit(&trans, &dst);
3275         bch2_trans_iter_exit(&trans, &src);
3276         bch2_trans_exit(&trans);
3277         bch2_bkey_buf_exit(&copy, c);
3278
3279         if (ret)
3280                 return ret;
3281
3282         mutex_lock(&inode->ei_update_lock);
3283         if (!insert) {
3284                 i_size_write(&inode->v, new_size);
3285                 ret = bch2_write_inode_size(c, inode, new_size,
3286                                             ATTR_MTIME|ATTR_CTIME);
3287         } else {
3288                 /* We need an inode update to update bi_journal_seq for fsync: */
3289                 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
3290                                        ATTR_MTIME|ATTR_CTIME);
3291         }
3292         mutex_unlock(&inode->ei_update_lock);
3293         return ret;
3294 }
3295
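/*
 * Walk the extents btree over [start_sector, end_sector) and turn
 * anything that isn't already allocated (or already a reservation with
 * enough replicas) into a reservation via bch2_extent_fallocate(),
 * taking quota reservations for sectors that weren't previously
 * allocated. In ZERO_RANGE mode existing data extents are overwritten
 * as well.
 */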
3296 static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
3297                              u64 start_sector, u64 end_sector)
3298 {
3299         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3300         struct btree_trans trans;
3301         struct btree_iter iter;
3302         struct bpos end_pos = POS(inode->v.i_ino, end_sector);
3303         struct bch_io_opts opts;
3304         int ret = 0;
3305
3306         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
3307         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
3308
3309         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
3310                         POS(inode->v.i_ino, start_sector),
3311                         BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
3312
3313         while (!ret && bkey_lt(iter.pos, end_pos)) {
3314                 s64 i_sectors_delta = 0;
3315                 struct quota_res quota_res = { 0 };
3316                 struct bkey_s_c k;
3317                 unsigned sectors;
3318                 u32 snapshot;
3319
3320                 bch2_trans_begin(&trans);
3321
3322                 ret = bch2_subvolume_get_snapshot(&trans,
3323                                         inode->ei_subvol, &snapshot);
3324                 if (ret)
3325                         goto bkey_err;
3326
3327                 bch2_btree_iter_set_snapshot(&iter, snapshot);
3328
3329                 k = bch2_btree_iter_peek_slot(&iter);
3330                 if ((ret = bkey_err(k)))
3331                         goto bkey_err;
3332
3333                 /* already reserved */
3334                 if (bkey_extent_is_reservation(k) &&
3335                     bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
3336                         bch2_btree_iter_advance(&iter);
3337                         continue;
3338                 }
3339
3340                 if (bkey_extent_is_data(k.k) &&
3341                     !(mode & FALLOC_FL_ZERO_RANGE)) {
3342                         bch2_btree_iter_advance(&iter);
3343                         continue;
3344                 }
3345
3346                 /*
3347                  * XXX: for nocow mode, we should promote shared extents to
3348                  * unshared here
3349                  */
3350
3351                 sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset;
3352
3353                 if (!bkey_extent_is_allocation(k.k)) {
3354                         ret = bch2_quota_reservation_add(c, inode,
3355                                         &quota_res,
3356                                         sectors, true);
3357                         if (unlikely(ret))
3358                                 goto bkey_err;
3359                 }
3360
3361                 ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter,
3362                                             sectors, opts, &i_sectors_delta,
3363                                             writepoint_hashed((unsigned long) current));
3364                 if (ret)
3365                         goto bkey_err;
3366
3367                 i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
3368 bkey_err:
3369                 bch2_quota_reservation_put(c, inode, &quota_res);
3370                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3371                         ret = 0;
3372         }
3373
3374         bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
3375         mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
3376
3377         if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
3378                 struct quota_res quota_res = { 0 };
3379                 s64 i_sectors_delta = 0;
3380
3381                 bch2_fpunch_at(&trans, &iter, inode_inum(inode),
3382                                end_sector, &i_sectors_delta);
3383                 i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
3384                 bch2_quota_reservation_put(c, inode, &quota_res);
3385         }
3386
3387         bch2_trans_iter_exit(&trans, &iter);
3388         bch2_trans_exit(&trans);
3389         return ret;
3390 }
3391
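/*
 * Plain fallocate and FALLOC_FL_ZERO_RANGE. For ZERO_RANGE the partial
 * folios at the edges are zeroed and the pagecache for the range is
 * dropped before reserving/zeroing blocks with __bchfs_fallocate().
 * Unless FALLOC_FL_KEEP_SIZE is set, i_size is extended to the end of
 * the range.
 */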
3392 static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
3393                             loff_t offset, loff_t len)
3394 {
3395         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3396         u64 end         = offset + len;
3397         u64 block_start = round_down(offset,    block_bytes(c));
3398         u64 block_end   = round_up(end,         block_bytes(c));
3399         bool truncated_last_page = false;
3400         int ret, ret2 = 0;
3401
3402         if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
3403                 ret = inode_newsize_ok(&inode->v, end);
3404                 if (ret)
3405                         return ret;
3406         }
3407
3408         if (mode & FALLOC_FL_ZERO_RANGE) {
3409                 ret = bch2_truncate_folios(inode, offset, end);
3410                 if (unlikely(ret < 0))
3411                         return ret;
3412
3413                 truncated_last_page = ret;
3414
3415                 truncate_pagecache_range(&inode->v, offset, end - 1);
3416
3417                 block_start     = round_up(offset,      block_bytes(c));
3418                 block_end       = round_down(end,       block_bytes(c));
3419         }
3420
3421         ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
3422
3423         /*
3424          * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
3425          * so that the VFS cache i_size is consistent with the btree i_size:
3426          */
3427         if (ret &&
3428             !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
3429                 return ret;
3430
3431         if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
3432                 end = inode->v.i_size;
3433
3434         if (end >= inode->v.i_size &&
3435             (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
3436              !(mode & FALLOC_FL_KEEP_SIZE))) {
3437                 spin_lock(&inode->v.i_lock);
3438                 i_size_write(&inode->v, end);
3439                 spin_unlock(&inode->v.i_lock);
3440
3441                 mutex_lock(&inode->ei_update_lock);
3442                 ret2 = bch2_write_inode_size(c, inode, end, 0);
3443                 mutex_unlock(&inode->ei_update_lock);
3444         }
3445
3446         return ret ?: ret2;
3447 }
3448
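/*
 * ->fallocate() dispatch. Supported modes are 0, FALLOC_FL_KEEP_SIZE,
 * FALLOC_FL_ZERO_RANGE (optionally with KEEP_SIZE),
 * FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, FALLOC_FL_COLLAPSE_RANGE
 * and FALLOC_FL_INSERT_RANGE; anything else gets -EOPNOTSUPP. From
 * userspace this is reached via e.g.
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, off, len);
 *
 * The whole operation runs with the inode lock held, DIO drained and
 * new pagecache pages blocked, and holds a write ref so it can't race
 * with the filesystem going read-only.
 */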
3449 long bch2_fallocate_dispatch(struct file *file, int mode,
3450                              loff_t offset, loff_t len)
3451 {
3452         struct bch_inode_info *inode = file_bch_inode(file);
3453         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3454         long ret;
3455
3456         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
3457                 return -EROFS;
3458
3459         inode_lock(&inode->v);
3460         inode_dio_wait(&inode->v);
3461         bch2_pagecache_block_get(inode);
3462
3463         ret = file_modified(file);
3464         if (ret)
3465                 goto err;
3466
3467         if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
3468                 ret = bchfs_fallocate(inode, mode, offset, len);
3469         else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
3470                 ret = bchfs_fpunch(inode, offset, len);
3471         else if (mode == FALLOC_FL_INSERT_RANGE)
3472                 ret = bchfs_fcollapse_finsert(inode, offset, len, true);
3473         else if (mode == FALLOC_FL_COLLAPSE_RANGE)
3474                 ret = bchfs_fcollapse_finsert(inode, offset, len, false);
3475         else
3476                 ret = -EOPNOTSUPP;
3477 err:
3478         bch2_pagecache_block_put(inode);
3479         inode_unlock(&inode->v);
3480         bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
3481
3482         return bch2_err_class(ret);
3483 }
3484
3485 /*
3486  * Take a quota reservation for unallocated blocks in a given file range.
3487  * Does not check the pagecache.
3488  */
3489 static int quota_reserve_range(struct bch_inode_info *inode,
3490                                struct quota_res *res,
3491                                u64 start, u64 end)
3492 {
3493         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3494         struct btree_trans trans;
3495         struct btree_iter iter;
3496         struct bkey_s_c k;
3497         u32 snapshot;
3498         u64 sectors = end - start;
3499         u64 pos = start;
3500         int ret;
3501
3502         bch2_trans_init(&trans, c, 0, 0);
3503 retry:
3504         bch2_trans_begin(&trans);
3505
3506         ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
3507         if (ret)
3508                 goto err;
3509
3510         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
3511                              SPOS(inode->v.i_ino, pos, snapshot), 0);
3512
3513         while (!(ret = btree_trans_too_many_iters(&trans)) &&
3514                (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
3515                !(ret = bkey_err(k))) {
3516                 if (bkey_extent_is_allocation(k.k)) {
3517                         u64 s = min(end, k.k->p.offset) -
3518                                 max(start, bkey_start_offset(k.k));
3519                         BUG_ON(s > sectors);
3520                         sectors -= s;
3521                 }
3522                 bch2_btree_iter_advance(&iter);
3523         }
3524         pos = iter.pos.offset;
3525         bch2_trans_iter_exit(&trans, &iter);
3526 err:
3527         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3528                 goto retry;
3529
3530         bch2_trans_exit(&trans);
3531
3532         if (ret)
3533                 return ret;
3534
3535         return bch2_quota_reservation_add(c, inode, res, sectors, true);
3536 }
3537
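/*
 * ->remap_file_range() (FICLONE/FICLONERANGE): reflink a block aligned
 * range from @file_src into @file_dst via bch2_remap_range(). Dedup
 * mode (REMAP_FILE_DEDUP) is not supported. The destination pagecache
 * is written out and invalidated first, and the source pages are marked
 * no longer fully allocated, since the now-shared extents will need new
 * space the next time they're written.
 */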
3538 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
3539                              struct file *file_dst, loff_t pos_dst,
3540                              loff_t len, unsigned remap_flags)
3541 {
3542         struct bch_inode_info *src = file_bch_inode(file_src);
3543         struct bch_inode_info *dst = file_bch_inode(file_dst);
3544         struct bch_fs *c = src->v.i_sb->s_fs_info;
3545         struct quota_res quota_res = { 0 };
3546         s64 i_sectors_delta = 0;
3547         u64 aligned_len;
3548         loff_t ret = 0;
3549
3550         if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
3551                 return -EINVAL;
3552
3553         if (remap_flags & REMAP_FILE_DEDUP)
3554                 return -EOPNOTSUPP;
3555
3556         if ((pos_src & (block_bytes(c) - 1)) ||
3557             (pos_dst & (block_bytes(c) - 1)))
3558                 return -EINVAL;
3559
3560         if (src == dst &&
3561             abs(pos_src - pos_dst) < len)
3562                 return -EINVAL;
3563
3564         bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
3565
3566         inode_dio_wait(&src->v);
3567         inode_dio_wait(&dst->v);
3568
3569         ret = generic_remap_file_range_prep(file_src, pos_src,
3570                                             file_dst, pos_dst,
3571                                             &len, remap_flags);
3572         if (ret < 0 || len == 0)
3573                 goto err;
3574
3575         aligned_len = round_up((u64) len, block_bytes(c));
3576
3577         ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
3578                                 pos_dst, pos_dst + len - 1);
3579         if (ret)
3580                 goto err;
3581
3582         ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
3583                                   (pos_dst + aligned_len) >> 9);
3584         if (ret)
3585                 goto err;
3586
3587         file_update_time(file_dst);
3588
3589         mark_pagecache_unallocated(src, pos_src >> 9,
3590                                    (pos_src + aligned_len) >> 9);
3591
3592         ret = bch2_remap_range(c,
3593                                inode_inum(dst), pos_dst >> 9,
3594                                inode_inum(src), pos_src >> 9,
3595                                aligned_len >> 9,
3596                                pos_dst + len, &i_sectors_delta);
3597         if (ret < 0)
3598                 goto err;
3599
3600         /*
3601          * Due to alignment, we might have remapped slightly more than requested.
3602          */
3603         ret = min((u64) ret << 9, (u64) len);
3604
3605         i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
3606
3607         spin_lock(&dst->v.i_lock);
3608         if (pos_dst + ret > dst->v.i_size)
3609                 i_size_write(&dst->v, pos_dst + ret);
3610         spin_unlock(&dst->v.i_lock);
3611
3612         if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
3613             IS_SYNC(file_inode(file_dst)))
3614                 ret = bch2_flush_inode(c, dst);
3615 err:
3616         bch2_quota_reservation_put(c, dst, &quota_res);
3617         bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
3618
3619         return bch2_err_class(ret);
3620 }
3621
3622 /* fseek: */
3623
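/*
 * Return the byte offset within @folio of the first sector at or after
 * @pos that holds data (state >= SECTOR_dirty), or -1 if there's no
 * such sector (or no bch_folio state attached).
 */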
3624 static int folio_data_offset(struct folio *folio, loff_t pos)
3625 {
3626         struct bch_folio *s = bch2_folio(folio);
3627         unsigned i, sectors = folio_sectors(folio);
3628
3629         if (s)
3630                 for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
3631                         if (s->s[i].state >= SECTOR_dirty)
3632                                 return i << SECTOR_SHIFT;
3633
3634         return -1;
3635 }
3636
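/*
 * Scan the pagecache in [start_offset, end_offset) for data that may
 * not be visible in the extents btree yet (i.e. dirty folios); returns
 * the position of the first data found, or end_offset if none.
 */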
3637 static loff_t bch2_seek_pagecache_data(struct inode *vinode,
3638                                        loff_t start_offset,
3639                                        loff_t end_offset)
3640 {
3641         struct folio_batch fbatch;
3642         pgoff_t start_index     = start_offset >> PAGE_SHIFT;
3643         pgoff_t end_index       = end_offset >> PAGE_SHIFT;
3644         pgoff_t index           = start_index;
3645         unsigned i;
3646         loff_t ret;
3647         int offset;
3648
3649         folio_batch_init(&fbatch);
3650
3651         while (filemap_get_folios(vinode->i_mapping,
3652                                   &index, end_index, &fbatch)) {
3653                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
3654                         struct folio *folio = fbatch.folios[i];
3655
3656                         folio_lock(folio);
3657                         offset = folio_data_offset(folio,
3658                                         max(folio_pos(folio), start_offset));
3659                         if (offset >= 0) {
3660                                 ret = clamp(folio_pos(folio) + offset,
3661                                             start_offset, end_offset);
3662                                 folio_unlock(folio);
3663                                 folio_batch_release(&fbatch);
3664                                 return ret;
3665                         }
3666                         folio_unlock(folio);
3667                 }
3668                 folio_batch_release(&fbatch);
3669                 cond_resched();
3670         }
3671
3672         return end_offset;
3673 }
3674
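/*
 * SEEK_DATA: find the first data extent at or after @offset in the
 * btree, then check the pagecache for dirty data in between; whichever
 * comes first wins. Returns -ENXIO if that lands at or past EOF.
 */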
3675 static loff_t bch2_seek_data(struct file *file, u64 offset)
3676 {
3677         struct bch_inode_info *inode = file_bch_inode(file);
3678         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3679         struct btree_trans trans;
3680         struct btree_iter iter;
3681         struct bkey_s_c k;
3682         subvol_inum inum = inode_inum(inode);
3683         u64 isize, next_data = MAX_LFS_FILESIZE;
3684         u32 snapshot;
3685         int ret;
3686
3687         isize = i_size_read(&inode->v);
3688         if (offset >= isize)
3689                 return -ENXIO;
3690
3691         bch2_trans_init(&trans, c, 0, 0);
3692 retry:
3693         bch2_trans_begin(&trans);
3694
3695         ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
3696         if (ret)
3697                 goto err;
3698
3699         for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents,
3700                            SPOS(inode->v.i_ino, offset >> 9, snapshot),
3701                            POS(inode->v.i_ino, U64_MAX),
3702                            0, k, ret) {
3703                 if (bkey_extent_is_data(k.k)) {
3704                         next_data = max(offset, bkey_start_offset(k.k) << 9);
3705                         break;
3706                 } else if (k.k->p.offset >> 9 > isize)
3707                         break;
3708         }
3709         bch2_trans_iter_exit(&trans, &iter);
3710 err:
3711         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3712                 goto retry;
3713
3714         bch2_trans_exit(&trans);
3715         if (ret)
3716                 return ret;
3717
3718         if (next_data > offset)
3719                 next_data = bch2_seek_pagecache_data(&inode->v,
3720                                                      offset, next_data);
3721
3722         if (next_data >= isize)
3723                 return -ENXIO;
3724
3725         return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
3726 }
3727
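/*
 * If *offset points at a hole in the pagecache, return true (advancing
 * *offset to the start of the hole within the folio if needed); if the
 * folio is all data from *offset onwards, advance *offset past it and
 * return false so the caller keeps scanning.
 */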
3728 static bool folio_hole_offset(struct address_space *mapping, loff_t *offset)
3729 {
3730         struct folio *folio;
3731         struct bch_folio *s;
3732         unsigned i, sectors;
3733         bool ret = true;
3734
3735         folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT);
3736         if (!folio)
3737                 return true;
3738
3739         s = bch2_folio(folio);
3740         if (!s)
3741                 goto unlock;
3742
3743         sectors = folio_sectors(folio);
3744         for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
3745                 if (s->s[i].state < SECTOR_dirty) {
3746                         *offset = max(*offset,
3747                                       folio_pos(folio) + (i << SECTOR_SHIFT));
3748                         goto unlock;
3749                 }
3750
3751         *offset = folio_end_pos(folio);
3752         ret = false;
3753 unlock:
3754         folio_unlock(folio);
3755         return ret;
3756 }
3757
3758 static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
3759                                        loff_t start_offset,
3760                                        loff_t end_offset)
3761 {
3762         struct address_space *mapping = vinode->i_mapping;
3763         loff_t offset = start_offset;
3764
3765         while (offset < end_offset &&
3766                !folio_hole_offset(mapping, &offset))
3767                 ;
3768
3769         return min(offset, end_offset);
3770 }
3771
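/*
 * SEEK_HOLE: iterate extents with BTREE_ITER_SLOTS so holes show up as
 * empty slots; a hole in the btree only counts if the pagecache doesn't
 * have dirty data covering it, hence the bch2_seek_pagecache_hole()
 * checks.
 */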
3772 static loff_t bch2_seek_hole(struct file *file, u64 offset)
3773 {
3774         struct bch_inode_info *inode = file_bch_inode(file);
3775         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3776         struct btree_trans trans;
3777         struct btree_iter iter;
3778         struct bkey_s_c k;
3779         subvol_inum inum = inode_inum(inode);
3780         u64 isize, next_hole = MAX_LFS_FILESIZE;
3781         u32 snapshot;
3782         int ret;
3783
3784         isize = i_size_read(&inode->v);
3785         if (offset >= isize)
3786                 return -ENXIO;
3787
3788         bch2_trans_init(&trans, c, 0, 0);
3789 retry:
3790         bch2_trans_begin(&trans);
3791
3792         ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
3793         if (ret)
3794                 goto err;
3795
3796         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
3797                            SPOS(inode->v.i_ino, offset >> 9, snapshot),
3798                            BTREE_ITER_SLOTS, k, ret) {
3799                 if (k.k->p.inode != inode->v.i_ino) {
3800                         next_hole = bch2_seek_pagecache_hole(&inode->v,
3801                                         offset, MAX_LFS_FILESIZE);
3802                         break;
3803                 } else if (!bkey_extent_is_data(k.k)) {
3804                         next_hole = bch2_seek_pagecache_hole(&inode->v,
3805                                         max(offset, bkey_start_offset(k.k) << 9),
3806                                         k.k->p.offset << 9);
3807
3808                         if (next_hole < k.k->p.offset << 9)
3809                                 break;
3810                 } else {
3811                         offset = max(offset, bkey_start_offset(k.k) << 9);
3812                 }
3813         }
3814         bch2_trans_iter_exit(&trans, &iter);
3815 err:
3816         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3817                 goto retry;
3818
3819         bch2_trans_exit(&trans);
3820         if (ret)
3821                 return ret;
3822
3823         if (next_hole > isize)
3824                 next_hole = isize;
3825
3826         return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
3827 }
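/*
 * ->llseek(): SEEK_SET/CUR/END go through generic_file_llseek();
 * SEEK_DATA and SEEK_HOLE use the btree/pagecache walkers above. For
 * example, a sparse-aware copy can skip unallocated regions with
 *
 *	data = lseek(fd, pos, SEEK_DATA);
 *	hole = lseek(fd, data, SEEK_HOLE);
 *
 * without reading them.
 */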
3828
3829 loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
3830 {
3831         loff_t ret;
3832
3833         switch (whence) {
3834         case SEEK_SET:
3835         case SEEK_CUR:
3836         case SEEK_END:
3837                 ret = generic_file_llseek(file, offset, whence);
3838                 break;
3839         case SEEK_DATA:
3840                 ret = bch2_seek_data(file, offset);
3841                 break;
3842         case SEEK_HOLE:
3843                 ret = bch2_seek_hole(file, offset);
3844                 break;
3845         default:
3846                 ret = -EINVAL;
3847                 break;
3848         }
3849
3850         return bch2_err_class(ret);
3851 }
3852
3853 void bch2_fs_fsio_exit(struct bch_fs *c)
3854 {
3855         bioset_exit(&c->nocow_flush_bioset);
3856         bioset_exit(&c->dio_write_bioset);
3857         bioset_exit(&c->dio_read_bioset);
3858         bioset_exit(&c->writepage_bioset);
3859 }
3860
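/*
 * Preallocate the biosets used by buffered writeback, direct IO and
 * nocow flushes, so those paths can always make forward progress on bio
 * allocation.
 */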
3861 int bch2_fs_fsio_init(struct bch_fs *c)
3862 {
3863         int ret = 0;
3864
3865         pr_verbose_init(c->opts, "");
3866
3867         if (bioset_init(&c->writepage_bioset,
3868                         4, offsetof(struct bch_writepage_io, op.wbio.bio),
3869                         BIOSET_NEED_BVECS))
3870                 return -BCH_ERR_ENOMEM_writepage_bioset_init;
3871
3872         if (bioset_init(&c->dio_read_bioset,
3873                         4, offsetof(struct dio_read, rbio.bio),
3874                         BIOSET_NEED_BVECS))
3875                 return -BCH_ERR_ENOMEM_dio_read_bioset_init;
3876
3877         if (bioset_init(&c->dio_write_bioset,
3878                         4, offsetof(struct dio_write, op.wbio.bio),
3879                         BIOSET_NEED_BVECS))
3880                 return -BCH_ERR_ENOMEM_dio_write_bioset_init;
3881
3882         if (bioset_init(&c->nocow_flush_bioset,
3883                         1, offsetof(struct nocow_flush, bio), 0))
3884                 return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
3885
3886         pr_verbose_init(c->opts, "ret %i", ret);
3887         return ret;
3888 }
3889
3890 #endif /* NO_BCACHEFS_FS */