[bcachefs-tools-debian] / libbcachefs / fs-io.c
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "alloc_foreground.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "clock.h"
10 #include "error.h"
11 #include "extents.h"
12 #include "extent_update.h"
13 #include "fs.h"
14 #include "fs-io.h"
15 #include "fsck.h"
16 #include "inode.h"
17 #include "journal.h"
18 #include "io.h"
19 #include "keylist.h"
20 #include "quota.h"
21 #include "reflink.h"
22
23 #include <linux/aio.h>
24 #include <linux/backing-dev.h>
25 #include <linux/falloc.h>
26 #include <linux/migrate.h>
27 #include <linux/mmu_context.h>
28 #include <linux/pagevec.h>
29 #include <linux/rmap.h>
30 #include <linux/sched/signal.h>
31 #include <linux/task_io_accounting_ops.h>
32 #include <linux/uio.h>
33 #include <linux/writeback.h>
34
35 #include <trace/events/bcachefs.h>
36 #include <trace/events/writeback.h>
37
38 struct nocow_flush {
39         struct closure  *cl;
40         struct bch_dev  *ca;
41         struct bio      bio;
42 };
43
44 static void nocow_flush_endio(struct bio *_bio)
45 {
46
47         struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
48
49         closure_put(bio->cl);
50         percpu_ref_put(&bio->ca->io_ref);
51         bio_put(&bio->bio);
52 }
53
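/*
 * Nocow writes bypass the usual journal flush machinery, so callers that need
 * durability (e.g. fsync) issue an explicit cache flush to every device that
 * received nocow writes for this inode. ei_devs_need_flush is that set of
 * devices; it is consumed (and cleared) here, with one REQ_OP_FLUSH bio per
 * device hooked into the caller's closure.
 */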
54 static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
55                                                 struct bch_inode_info *inode,
56                                                 struct closure *cl)
57 {
58         struct nocow_flush *bio;
59         struct bch_dev *ca;
60         struct bch_devs_mask devs;
61         unsigned dev;
62
63         dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
64         if (dev == BCH_SB_MEMBERS_MAX)
65                 return;
66
67         devs = inode->ei_devs_need_flush;
68         memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
69
70         for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
71                 rcu_read_lock();
72                 ca = rcu_dereference(c->devs[dev]);
73                 if (ca && !percpu_ref_tryget(&ca->io_ref))
74                         ca = NULL;
75                 rcu_read_unlock();
76
77                 if (!ca)
78                         continue;
79
80                 bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
81                                                     REQ_OP_FLUSH,
82                                                     GFP_KERNEL,
83                                                     &c->nocow_flush_bioset),
84                                    struct nocow_flush, bio);
85                 bio->cl                 = cl;
86                 bio->ca                 = ca;
87                 bio->bio.bi_end_io      = nocow_flush_endio;
88                 closure_bio_submit(&bio->bio, cl);
89         }
90 }
91
92 static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
93                                          struct bch_inode_info *inode)
94 {
95         struct closure cl;
96
97         closure_init_stack(&cl);
98         bch2_inode_flush_nocow_writes_async(c, inode, &cl);
99         closure_sync(&cl);
100
101         return 0;
102 }
103
104 static inline bool bio_full(struct bio *bio, unsigned len)
105 {
106         if (bio->bi_vcnt >= bio->bi_max_vecs)
107                 return true;
108         if (bio->bi_iter.bi_size > UINT_MAX - len)
109                 return true;
110         return false;
111 }
112
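/*
 * current->faults_disabled_mapping points at a mapping the current task must
 * not take page faults on (it is set around direct IO writes); bit 0 of the
 * pointer is borrowed as a flag so the fault path can record, via
 * set_fdm_dropped_locks(), that it had to drop that write's pagecache_block
 * lock. See bch2_page_fault().
 */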
113 static inline struct address_space *faults_disabled_mapping(void)
114 {
115         return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
116 }
117
118 static inline void set_fdm_dropped_locks(void)
119 {
120         current->faults_disabled_mapping =
121                 (void *) (((unsigned long) current->faults_disabled_mapping)|1);
122 }
123
124 static inline bool fdm_dropped_locks(void)
125 {
126         return ((unsigned long) current->faults_disabled_mapping) & 1;
127 }
128
129 struct quota_res {
130         u64                             sectors;
131 };
132
133 struct bch_writepage_io {
134         struct bch_inode_info           *inode;
135
136         /* must be last: */
137         struct bch_write_op             op;
138 };
139
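/*
 * Per-request state for an asynchronous O_DIRECT write: the (possibly copied,
 * see free_iov) iov_iter, quota reservation, running count of bytes written,
 * and the write op itself. As above, op embeds the bio and must stay last so
 * the whole structure can be allocated along with its bio.
 */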
140 struct dio_write {
141         struct kiocb                    *req;
142         struct address_space            *mapping;
143         struct bch_inode_info           *inode;
144         struct mm_struct                *mm;
145         unsigned                        loop:1,
146                                         extending:1,
147                                         sync:1,
148                                         flush:1,
149                                         free_iov:1;
150         struct quota_res                quota_res;
151         u64                             written;
152
153         struct iov_iter                 iter;
154         struct iovec                    inline_vecs[2];
155
156         /* must be last: */
157         struct bch_write_op             op;
158 };
159
160 struct dio_read {
161         struct closure                  cl;
162         struct kiocb                    *req;
163         long                            ret;
164         bool                            should_dirty;
165         struct bch_read_bio             rbio;
166 };
167
168 /* pagecache_block must be held */
169 static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
170                                               loff_t start, loff_t end)
171 {
172         int ret;
173
174         /*
175          * XXX: the way this is currently implemented, we can spin if a process
176          * is continually redirtying a specific page
177          */
178         do {
179                 if (!mapping->nrpages)
180                         return 0;
181
182                 ret = filemap_write_and_wait_range(mapping, start, end);
183                 if (ret)
184                         break;
185
186                 if (!mapping->nrpages)
187                         return 0;
188
189                 ret = invalidate_inode_pages2_range(mapping,
190                                 start >> PAGE_SHIFT,
191                                 end >> PAGE_SHIFT);
192         } while (ret == -EBUSY);
193
194         return ret;
195 }
196
197 /* quotas */
198
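/*
 * A quota_res is an in-memory quota reservation: sectors pre-charged to the
 * inode's quota as KEY_TYPE_QUOTA_PREALLOC and tracked in ei_quota_reserved.
 * It is either consumed when dirty sectors are accounted for (i_sectors_acct)
 * or handed back via bch2_quota_reservation_put().
 */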
199 #ifdef CONFIG_BCACHEFS_QUOTA
200
201 static void __bch2_quota_reservation_put(struct bch_fs *c,
202                                          struct bch_inode_info *inode,
203                                          struct quota_res *res)
204 {
205         BUG_ON(res->sectors > inode->ei_quota_reserved);
206
207         bch2_quota_acct(c, inode->ei_qid, Q_SPC,
208                         -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
209         inode->ei_quota_reserved -= res->sectors;
210         res->sectors = 0;
211 }
212
213 static void bch2_quota_reservation_put(struct bch_fs *c,
214                                        struct bch_inode_info *inode,
215                                        struct quota_res *res)
216 {
217         if (res->sectors) {
218                 mutex_lock(&inode->ei_quota_lock);
219                 __bch2_quota_reservation_put(c, inode, res);
220                 mutex_unlock(&inode->ei_quota_lock);
221         }
222 }
223
224 static int bch2_quota_reservation_add(struct bch_fs *c,
225                                       struct bch_inode_info *inode,
226                                       struct quota_res *res,
227                                       u64 sectors,
228                                       bool check_enospc)
229 {
230         int ret;
231
232         mutex_lock(&inode->ei_quota_lock);
233         ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
234                               check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
235         if (likely(!ret)) {
236                 inode->ei_quota_reserved += sectors;
237                 res->sectors += sectors;
238         }
239         mutex_unlock(&inode->ei_quota_lock);
240
241         return ret;
242 }
243
244 #else
245
246 static void __bch2_quota_reservation_put(struct bch_fs *c,
247                                          struct bch_inode_info *inode,
248                                          struct quota_res *res) {}
249
250 static void bch2_quota_reservation_put(struct bch_fs *c,
251                                        struct bch_inode_info *inode,
252                                        struct quota_res *res) {}
253
254 static int bch2_quota_reservation_add(struct bch_fs *c,
255                                       struct bch_inode_info *inode,
256                                       struct quota_res *res,
257                                       unsigned sectors,
258                                       bool check_enospc)
259 {
260         return 0;
261 }
262
263 #endif
264
265 /* i_size updates: */
266
267 struct inode_new_size {
268         loff_t          new_size;
269         u64             now;
270         unsigned        fields;
271 };
272
273 static int inode_set_size(struct bch_inode_info *inode,
274                           struct bch_inode_unpacked *bi,
275                           void *p)
276 {
277         struct inode_new_size *s = p;
278
279         bi->bi_size = s->new_size;
280         if (s->fields & ATTR_ATIME)
281                 bi->bi_atime = s->now;
282         if (s->fields & ATTR_MTIME)
283                 bi->bi_mtime = s->now;
284         if (s->fields & ATTR_CTIME)
285                 bi->bi_ctime = s->now;
286
287         return 0;
288 }
289
290 int __must_check bch2_write_inode_size(struct bch_fs *c,
291                                        struct bch_inode_info *inode,
292                                        loff_t new_size, unsigned fields)
293 {
294         struct inode_new_size s = {
295                 .new_size       = new_size,
296                 .now            = bch2_current_time(c),
297                 .fields         = fields,
298         };
299
300         return bch2_write_inode(c, inode, inode_set_size, &s, fields);
301 }
302
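/*
 * i_sectors accounting: keep the VFS inode's i_blocks in sync as sectors are
 * dirtied, written back or dropped. If the caller passes a quota_res the
 * sectors come out of that reservation; otherwise they are charged (or
 * credited) directly against the inode's quota.
 */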
303 static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
304                            struct quota_res *quota_res, s64 sectors)
305 {
306         bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
307                                 "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
308                                 inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
309                                 inode->ei_inode.bi_sectors);
310         inode->v.i_blocks += sectors;
311
312 #ifdef CONFIG_BCACHEFS_QUOTA
313         if (quota_res && sectors > 0) {
314                 BUG_ON(sectors > quota_res->sectors);
315                 BUG_ON(sectors > inode->ei_quota_reserved);
316
317                 quota_res->sectors -= sectors;
318                 inode->ei_quota_reserved -= sectors;
319         } else {
320                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
321         }
322 #endif
323 }
324
325 static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
326                            struct quota_res *quota_res, s64 sectors)
327 {
328         if (sectors) {
329                 mutex_lock(&inode->ei_quota_lock);
330                 __i_sectors_acct(c, inode, quota_res, sectors);
331                 mutex_unlock(&inode->ei_quota_lock);
332         }
333 }
334
335 /* page state: */
336
337 /* stored in page->private: */
338
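/*
 * Per-sector writeback state, kept in a bch_page_state hung off
 * page->private. Roughly: a buffered write takes a sector from
 * SECTOR_UNALLOCATED to SECTOR_DIRTY (consuming disk and quota reservations),
 * writeback takes it to SECTOR_ALLOCATED; fallocate marks sectors
 * SECTOR_RESERVED, and dirty data on top of such a reservation is
 * SECTOR_DIRTY_RESERVED.
 */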
339 struct bch_page_sector {
340         /* Uncompressed, fully allocated replicas (or on disk reservation): */
341         unsigned                nr_replicas:4;
342
343         /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
344         unsigned                replicas_reserved:4;
345
346         /* i_sectors: */
347         enum {
348                 SECTOR_UNALLOCATED,
349                 SECTOR_RESERVED,
350                 SECTOR_DIRTY,
351                 SECTOR_DIRTY_RESERVED,
352                 SECTOR_ALLOCATED,
353         }                       state:8;
354 };
355
356 struct bch_page_state {
357         spinlock_t              lock;
358         atomic_t                write_count;
359         bool                    uptodate;
360         struct bch_page_sector  s[PAGE_SECTORS];
361 };
362
363 static inline struct bch_page_state *__bch2_page_state(struct page *page)
364 {
365         return page_has_private(page)
366                 ? (struct bch_page_state *) page_private(page)
367                 : NULL;
368 }
369
370 static inline struct bch_page_state *bch2_page_state(struct page *page)
371 {
372         EBUG_ON(!PageLocked(page));
373
374         return __bch2_page_state(page);
375 }
376
377 /* for newly allocated pages: */
378 static void __bch2_page_state_release(struct page *page)
379 {
380         kfree(detach_page_private(page));
381 }
382
383 static void bch2_page_state_release(struct page *page)
384 {
385         EBUG_ON(!PageLocked(page));
386         __bch2_page_state_release(page);
387 }
388
389 /* for newly allocated pages: */
390 static struct bch_page_state *__bch2_page_state_create(struct page *page,
391                                                        gfp_t gfp)
392 {
393         struct bch_page_state *s;
394
395         s = kzalloc(sizeof(*s), GFP_NOFS|gfp);
396         if (!s)
397                 return NULL;
398
399         spin_lock_init(&s->lock);
400         attach_page_private(page, s);
401         return s;
402 }
403
404 static struct bch_page_state *bch2_page_state_create(struct page *page,
405                                                      gfp_t gfp)
406 {
407         return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
408 }
409
410 static unsigned bkey_to_sector_state(struct bkey_s_c k)
411 {
412         if (bkey_extent_is_reservation(k))
413                 return SECTOR_RESERVED;
414         if (bkey_extent_is_allocation(k.k))
415                 return SECTOR_ALLOCATED;
416         return SECTOR_UNALLOCATED;
417 }
418
419 static void __bch2_page_state_set(struct page *page,
420                                   unsigned pg_offset, unsigned pg_len,
421                                   unsigned nr_ptrs, unsigned state)
422 {
423         struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL);
424         unsigned i;
425
426         BUG_ON(pg_offset >= PAGE_SECTORS);
427         BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
428
429         spin_lock(&s->lock);
430
431         for (i = pg_offset; i < pg_offset + pg_len; i++) {
432                 s->s[i].nr_replicas = nr_ptrs;
433                 s->s[i].state = state;
434         }
435
436         if (i == PAGE_SECTORS)
437                 s->uptodate = true;
438
439         spin_unlock(&s->lock);
440 }
441
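/*
 * Initialize page state for a run of pages by walking the extents that
 * overlap them. A worked example of the pg_offset/pg_len arithmetic below,
 * assuming 4K pages (PAGE_SECTORS == 8): an extent covering sectors [10, 21)
 * overlaps the page at index 1 (sectors [8, 16)) with pg_offset == 2 and
 * pg_len == 6.
 */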
442 static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum,
443                                struct page **pages, unsigned nr_pages)
444 {
445         struct btree_trans trans;
446         struct btree_iter iter;
447         struct bkey_s_c k;
448         u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT;
449         unsigned pg_idx = 0;
450         u32 snapshot;
451         int ret;
452
453         bch2_trans_init(&trans, c, 0, 0);
454 retry:
455         bch2_trans_begin(&trans);
456
457         ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
458         if (ret)
459                 goto err;
460
461         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
462                            SPOS(inum.inum, offset, snapshot),
463                            BTREE_ITER_SLOTS, k, ret) {
464                 unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
465                 unsigned state = bkey_to_sector_state(k);
466
467                 while (pg_idx < nr_pages) {
468                         struct page *page = pages[pg_idx];
469                         u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
470                         u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
471                         unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start;
472                         unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start;
473
474                         BUG_ON(k.k->p.offset < pg_start);
475                         BUG_ON(bkey_start_offset(k.k) > pg_end);
476
477                         if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate)
478                                 __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state);
479
480                         if (k.k->p.offset < pg_end)
481                                 break;
482                         pg_idx++;
483                 }
484
485                 if (pg_idx == nr_pages)
486                         break;
487         }
488
489         offset = iter.pos.offset;
490         bch2_trans_iter_exit(&trans, &iter);
491 err:
492         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
493                 goto retry;
494         bch2_trans_exit(&trans);
495
496         return ret;
497 }
498
499 static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
500 {
501         struct bvec_iter iter;
502         struct bio_vec bv;
503         unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
504                 ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
505         unsigned state = bkey_to_sector_state(k);
506
507         bio_for_each_segment(bv, bio, iter)
508                 __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9,
509                                       bv.bv_len >> 9, nr_ptrs, state);
510 }
511
512 static void mark_pagecache_unallocated(struct bch_inode_info *inode,
513                                        u64 start, u64 end)
514 {
515         pgoff_t index = start >> PAGE_SECTORS_SHIFT;
516         pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
517         struct folio_batch fbatch;
518         unsigned i, j;
519
520         if (end <= start)
521                 return;
522
523         folio_batch_init(&fbatch);
524
525         while (filemap_get_folios(inode->v.i_mapping,
526                                   &index, end_index, &fbatch)) {
527                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
528                         struct folio *folio = fbatch.folios[i];
529                         u64 pg_start = folio->index << PAGE_SECTORS_SHIFT;
530                         u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT;
531                         unsigned pg_offset = max(start, pg_start) - pg_start;
532                         unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
533                         struct bch_page_state *s;
534
535                         BUG_ON(end <= pg_start);
536                         BUG_ON(pg_offset >= PAGE_SECTORS);
537                         BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
538
539                         folio_lock(folio);
540                         s = bch2_page_state(&folio->page);
541
542                         if (s) {
543                                 spin_lock(&s->lock);
544                                 for (j = pg_offset; j < pg_offset + pg_len; j++)
545                                         s->s[j].nr_replicas = 0;
546                                 spin_unlock(&s->lock);
547                         }
548
549                         folio_unlock(folio);
550                 }
551                 folio_batch_release(&fbatch);
552                 cond_resched();
553         }
554 }
555
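/*
 * Called once an on-disk reservation covers this range (e.g. from the
 * fallocate path): unallocated sectors in the pagecache become
 * SECTOR_RESERVED and dirty sectors become SECTOR_DIRTY_RESERVED. The
 * i_sectors_delta-- for the latter appears to offset the sectors now being
 * counted via the reservation itself, so i_blocks is not bumped twice.
 */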
556 static void mark_pagecache_reserved(struct bch_inode_info *inode,
557                                     u64 start, u64 end)
558 {
559         struct bch_fs *c = inode->v.i_sb->s_fs_info;
560         pgoff_t index = start >> PAGE_SECTORS_SHIFT;
561         pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
562         struct folio_batch fbatch;
563         s64 i_sectors_delta = 0;
564         unsigned i, j;
565
566         if (end <= start)
567                 return;
568
569         folio_batch_init(&fbatch);
570
571         while (filemap_get_folios(inode->v.i_mapping,
572                                   &index, end_index, &fbatch)) {
573                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
574                         struct folio *folio = fbatch.folios[i];
575                         u64 pg_start = folio->index << PAGE_SECTORS_SHIFT;
576                         u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT;
577                         unsigned pg_offset = max(start, pg_start) - pg_start;
578                         unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
579                         struct bch_page_state *s;
580
581                         BUG_ON(end <= pg_start);
582                         BUG_ON(pg_offset >= PAGE_SECTORS);
583                         BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
584
585                         folio_lock(folio);
586                         s = bch2_page_state(&folio->page);
587
588                         if (s) {
589                                 spin_lock(&s->lock);
590                                 for (j = pg_offset; j < pg_offset + pg_len; j++)
591                                         switch (s->s[j].state) {
592                                         case SECTOR_UNALLOCATED:
593                                                 s->s[j].state = SECTOR_RESERVED;
594                                                 break;
595                                         case SECTOR_DIRTY:
596                                                 s->s[j].state = SECTOR_DIRTY_RESERVED;
597                                                 i_sectors_delta--;
598                                                 break;
599                                         default:
600                                                 break;
601                                         }
602                                 spin_unlock(&s->lock);
603                         }
604
605                         folio_unlock(folio);
606                 }
607                 folio_batch_release(&fbatch);
608                 cond_resched();
609         }
610
611         i_sectors_acct(c, inode, NULL, i_sectors_delta);
612 }
613
614 static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
615 {
616         /* XXX: this should not be open coded */
617         return inode->ei_inode.bi_data_replicas
618                 ? inode->ei_inode.bi_data_replicas - 1
619                 : c->opts.data_replicas;
620 }
621
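/*
 * How many more sector-replicas need reserving to reach nr_replicas, given
 * what the sector already has allocated plus what is already reserved: e.g.
 * for nr_replicas == 2, a sector with one fully allocated replica and no
 * reservation needs one more replica's worth of space reserved.
 */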
622 static inline unsigned sectors_to_reserve(struct bch_page_sector *s,
623                                                   unsigned nr_replicas)
624 {
625         return max(0, (int) nr_replicas -
626                    s->nr_replicas -
627                    s->replicas_reserved);
628 }
629
630 static int bch2_get_page_disk_reservation(struct bch_fs *c,
631                                 struct bch_inode_info *inode,
632                                 struct page *page, bool check_enospc)
633 {
634         struct bch_page_state *s = bch2_page_state_create(page, 0);
635         unsigned nr_replicas = inode_nr_replicas(c, inode);
636         struct disk_reservation disk_res = { 0 };
637         unsigned i, disk_res_sectors = 0;
638         int ret;
639
640         if (!s)
641                 return -ENOMEM;
642
643         for (i = 0; i < ARRAY_SIZE(s->s); i++)
644                 disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
645
646         if (!disk_res_sectors)
647                 return 0;
648
649         ret = bch2_disk_reservation_get(c, &disk_res,
650                                         disk_res_sectors, 1,
651                                         !check_enospc
652                                         ? BCH_DISK_RESERVATION_NOFAIL
653                                         : 0);
654         if (unlikely(ret))
655                 return ret;
656
657         for (i = 0; i < ARRAY_SIZE(s->s); i++)
658                 s->s[i].replicas_reserved +=
659                         sectors_to_reserve(&s->s[i], nr_replicas);
660
661         return 0;
662 }
663
664 struct bch2_page_reservation {
665         struct disk_reservation disk;
666         struct quota_res        quota;
667 };
668
669 static void bch2_page_reservation_init(struct bch_fs *c,
670                         struct bch_inode_info *inode,
671                         struct bch2_page_reservation *res)
672 {
673         memset(res, 0, sizeof(*res));
674
675         res->disk.nr_replicas = inode_nr_replicas(c, inode);
676 }
677
678 static void bch2_page_reservation_put(struct bch_fs *c,
679                         struct bch_inode_info *inode,
680                         struct bch2_page_reservation *res)
681 {
682         bch2_disk_reservation_put(c, &res->disk);
683         bch2_quota_reservation_put(c, inode, &res->quota);
684 }
685
686 static int bch2_page_reservation_get(struct bch_fs *c,
687                         struct bch_inode_info *inode, struct page *page,
688                         struct bch2_page_reservation *res,
689                         unsigned offset, unsigned len)
690 {
691         struct bch_page_state *s = bch2_page_state_create(page, 0);
692         unsigned i, disk_sectors = 0, quota_sectors = 0;
693         int ret;
694
695         if (!s)
696                 return -ENOMEM;
697
698         BUG_ON(!s->uptodate);
699
700         for (i = round_down(offset, block_bytes(c)) >> 9;
701              i < round_up(offset + len, block_bytes(c)) >> 9;
702              i++) {
703                 disk_sectors += sectors_to_reserve(&s->s[i],
704                                                 res->disk.nr_replicas);
705                 quota_sectors += s->s[i].state == SECTOR_UNALLOCATED;
706         }
707
708         if (disk_sectors) {
709                 ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
710                 if (unlikely(ret))
711                         return ret;
712         }
713
714         if (quota_sectors) {
715                 ret = bch2_quota_reservation_add(c, inode, &res->quota,
716                                                  quota_sectors, true);
717                 if (unlikely(ret)) {
718                         struct disk_reservation tmp = {
719                                 .sectors = disk_sectors
720                         };
721
722                         bch2_disk_reservation_put(c, &tmp);
723                         res->disk.sectors -= disk_sectors;
724                         return ret;
725                 }
726         }
727
728         return 0;
729 }
730
731 static void bch2_clear_page_bits(struct page *page)
732 {
733         struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
734         struct bch_fs *c = inode->v.i_sb->s_fs_info;
735         struct bch_page_state *s = bch2_page_state(page);
736         struct disk_reservation disk_res = { 0 };
737         int i, dirty_sectors = 0;
738
739         if (!s)
740                 return;
741
742         EBUG_ON(!PageLocked(page));
743         EBUG_ON(PageWriteback(page));
744
745         for (i = 0; i < ARRAY_SIZE(s->s); i++) {
746                 disk_res.sectors += s->s[i].replicas_reserved;
747                 s->s[i].replicas_reserved = 0;
748
749                 switch (s->s[i].state) {
750                 case SECTOR_DIRTY:
751                         s->s[i].state = SECTOR_UNALLOCATED;
752                         --dirty_sectors;
753                         break;
754                 case SECTOR_DIRTY_RESERVED:
755                         s->s[i].state = SECTOR_RESERVED;
756                         break;
757                 default:
758                         break;
759                 }
760         }
761
762         bch2_disk_reservation_put(c, &disk_res);
763
764         i_sectors_acct(c, inode, NULL, dirty_sectors);
765
766         bch2_page_state_release(page);
767 }
768
769 static void bch2_set_page_dirty(struct bch_fs *c,
770                         struct bch_inode_info *inode, struct page *page,
771                         struct bch2_page_reservation *res,
772                         unsigned offset, unsigned len)
773 {
774         struct bch_page_state *s = bch2_page_state(page);
775         unsigned i, dirty_sectors = 0;
776
777         WARN_ON((u64) page_offset(page) + offset + len >
778                 round_up((u64) i_size_read(&inode->v), block_bytes(c)));
779
780         spin_lock(&s->lock);
781
782         for (i = round_down(offset, block_bytes(c)) >> 9;
783              i < round_up(offset + len, block_bytes(c)) >> 9;
784              i++) {
785                 unsigned sectors = sectors_to_reserve(&s->s[i],
786                                                 res->disk.nr_replicas);
787
788                 /*
789                  * This can happen if we race with the error path in
790                  * bch2_writepage_io_done():
791                  */
792                 sectors = min_t(unsigned, sectors, res->disk.sectors);
793
794                 s->s[i].replicas_reserved += sectors;
795                 res->disk.sectors -= sectors;
796
797                 switch (s->s[i].state) {
798                 case SECTOR_UNALLOCATED:
799                         s->s[i].state = SECTOR_DIRTY;
800                         dirty_sectors++;
801                         break;
802                 case SECTOR_RESERVED:
803                         s->s[i].state = SECTOR_DIRTY_RESERVED;
804                         break;
805                 default:
806                         break;
807                 }
808         }
809
810         spin_unlock(&s->lock);
811
812         i_sectors_acct(c, inode, &res->quota, dirty_sectors);
813
814         if (!PageDirty(page))
815                 filemap_dirty_folio(inode->v.i_mapping, page_folio(page));
816 }
817
818 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
819 {
820         struct file *file = vmf->vma->vm_file;
821         struct address_space *mapping = file->f_mapping;
822         struct address_space *fdm = faults_disabled_mapping();
823         struct bch_inode_info *inode = file_bch_inode(file);
824         int ret;
825
826         if (fdm == mapping)
827                 return VM_FAULT_SIGBUS;
828
829         /* Lock ordering: */
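        /*
         * Pagecache locks are taken in mapping address order to avoid
         * deadlock: a DIO write to fdm already holds fdm's pagecache_block
         * lock, and since fdm sorts after this mapping we may only trylock
         * our add lock here. If that fails, drop the DIO's lock, wait for
         * (then release) our add lock, retake the DIO's lock, and set the
         * dropped-locks flag so the DIO path can tell this back-off apart
         * from a real fault failure.
         */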
830         if (fdm > mapping) {
831                 struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
832
833                 if (bch2_pagecache_add_tryget(inode))
834                         goto got_lock;
835
836                 bch2_pagecache_block_put(fdm_host);
837
838                 bch2_pagecache_add_get(inode);
839                 bch2_pagecache_add_put(inode);
840
841                 bch2_pagecache_block_get(fdm_host);
842
843                 /* Signal that lock has been dropped: */
844                 set_fdm_dropped_locks();
845                 return VM_FAULT_SIGBUS;
846         }
847
848         bch2_pagecache_add_get(inode);
849 got_lock:
850         ret = filemap_fault(vmf);
851         bch2_pagecache_add_put(inode);
852
853         return ret;
854 }
855
856 vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
857 {
858         struct page *page = vmf->page;
859         struct file *file = vmf->vma->vm_file;
860         struct bch_inode_info *inode = file_bch_inode(file);
861         struct address_space *mapping = file->f_mapping;
862         struct bch_fs *c = inode->v.i_sb->s_fs_info;
863         struct bch2_page_reservation res;
864         unsigned len;
865         loff_t isize;
866         int ret;
867
868         bch2_page_reservation_init(c, inode, &res);
869
870         sb_start_pagefault(inode->v.i_sb);
871         file_update_time(file);
872
873         /*
874          * Not strictly necessary, but helps avoid dio writes livelocking in
875          * write_invalidate_inode_pages_range() - can drop this if/when we get
876          * a write_invalidate_inode_pages_range() that works without dropping
877          * page lock before invalidating page
878          */
879         bch2_pagecache_add_get(inode);
880
881         lock_page(page);
882         isize = i_size_read(&inode->v);
883
884         if (page->mapping != mapping || page_offset(page) >= isize) {
885                 unlock_page(page);
886                 ret = VM_FAULT_NOPAGE;
887                 goto out;
888         }
889
890         len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
891
892         if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
893                 if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) {
894                         unlock_page(page);
895                         ret = VM_FAULT_SIGBUS;
896                         goto out;
897                 }
898         }
899
900         if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) {
901                 unlock_page(page);
902                 ret = VM_FAULT_SIGBUS;
903                 goto out;
904         }
905
906         bch2_set_page_dirty(c, inode, page, &res, 0, len);
907         bch2_page_reservation_put(c, inode, &res);
908
909         wait_for_stable_page(page);
910         ret = VM_FAULT_LOCKED;
911 out:
912         bch2_pagecache_add_put(inode);
913         sb_end_pagefault(inode->v.i_sb);
914
915         return ret;
916 }
917
918 void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
919 {
920         if (offset || length < folio_size(folio))
921                 return;
922
923         bch2_clear_page_bits(&folio->page);
924 }
925
926 bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
927 {
928         if (folio_test_dirty(folio) || folio_test_writeback(folio))
929                 return false;
930
931         bch2_clear_page_bits(&folio->page);
932         return true;
933 }
934
935 /* readpage(s): */
936
937 static void bch2_readpages_end_io(struct bio *bio)
938 {
939         struct bvec_iter_all iter;
940         struct bio_vec *bv;
941
942         bio_for_each_segment_all(bv, bio, iter) {
943                 struct page *page = bv->bv_page;
944
945                 if (!bio->bi_status) {
946                         SetPageUptodate(page);
947                 } else {
948                         ClearPageUptodate(page);
949                         SetPageError(page);
950                 }
951                 unlock_page(page);
952         }
953
954         bio_put(bio);
955 }
956
957 struct readpages_iter {
958         struct address_space    *mapping;
959         struct page             **pages;
960         unsigned                nr_pages;
961         unsigned                idx;
962         pgoff_t                 offset;
963 };
964
965 static int readpages_iter_init(struct readpages_iter *iter,
966                                struct readahead_control *ractl)
967 {
968         unsigned i, nr_pages = readahead_count(ractl);
969
970         memset(iter, 0, sizeof(*iter));
971
972         iter->mapping   = ractl->mapping;
973         iter->offset    = readahead_index(ractl);
974         iter->nr_pages  = nr_pages;
975
976         iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
977         if (!iter->pages)
978                 return -ENOMEM;
979
980         nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
981         for (i = 0; i < nr_pages; i++) {
982                 __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
983                 put_page(iter->pages[i]);
984         }
985
986         return 0;
987 }
988
989 static inline struct page *readpage_iter_next(struct readpages_iter *iter)
990 {
991         if (iter->idx >= iter->nr_pages)
992                 return NULL;
993
994         EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
995
996         return iter->pages[iter->idx];
997 }
998
999 static bool extent_partial_reads_expensive(struct bkey_s_c k)
1000 {
1001         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1002         struct bch_extent_crc_unpacked crc;
1003         const union bch_extent_entry *i;
1004
1005         bkey_for_each_crc(k.k, ptrs, crc, i)
1006                 if (crc.csum_type || crc.compression_type)
1007                         return true;
1008         return false;
1009 }
1010
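/*
 * Widen a read bio to cover more of the current extent: first with pages the
 * readahead request already handed us, then (if get_more is set because the
 * extent is checksummed or compressed, making partial reads expensive) with
 * extra pages allocated and inserted into the page cache here.
 */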
1011 static void readpage_bio_extend(struct readpages_iter *iter,
1012                                 struct bio *bio,
1013                                 unsigned sectors_this_extent,
1014                                 bool get_more)
1015 {
1016         while (bio_sectors(bio) < sectors_this_extent &&
1017                bio->bi_vcnt < bio->bi_max_vecs) {
1018                 pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
1019                 struct page *page = readpage_iter_next(iter);
1020                 int ret;
1021
1022                 if (page) {
1023                         if (iter->offset + iter->idx != page_offset)
1024                                 break;
1025
1026                         iter->idx++;
1027                 } else {
1028                         if (!get_more)
1029                                 break;
1030
1031                         page = xa_load(&iter->mapping->i_pages, page_offset);
1032                         if (page && !xa_is_value(page))
1033                                 break;
1034
1035                         page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
1036                         if (!page)
1037                                 break;
1038
1039                         if (!__bch2_page_state_create(page, 0)) {
1040                                 put_page(page);
1041                                 break;
1042                         }
1043
1044                         ret = add_to_page_cache_lru(page, iter->mapping,
1045                                                     page_offset, GFP_NOFS);
1046                         if (ret) {
1047                                 __bch2_page_state_release(page);
1048                                 put_page(page);
1049                                 break;
1050                         }
1051
1052                         put_page(page);
1053                 }
1054
1055                 BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
1056         }
1057 }
1058
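/*
 * The core buffered read loop: for each extent overlapping the bio's current
 * sector, resolve reflink indirection, record per-sector page state, and hand
 * that fragment to bch2_read_extent(); the last fragment completes the bio.
 * Transaction restarts jump back to retry with a fresh snapshot and iterator.
 */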
1059 static void bchfs_read(struct btree_trans *trans,
1060                        struct bch_read_bio *rbio,
1061                        subvol_inum inum,
1062                        struct readpages_iter *readpages_iter)
1063 {
1064         struct bch_fs *c = trans->c;
1065         struct btree_iter iter;
1066         struct bkey_buf sk;
1067         int flags = BCH_READ_RETRY_IF_STALE|
1068                 BCH_READ_MAY_PROMOTE;
1069         u32 snapshot;
1070         int ret = 0;
1071
1072         rbio->c = c;
1073         rbio->start_time = local_clock();
1074         rbio->subvol = inum.subvol;
1075
1076         bch2_bkey_buf_init(&sk);
1077 retry:
1078         bch2_trans_begin(trans);
1079         iter = (struct btree_iter) { NULL };
1080
1081         ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
1082         if (ret)
1083                 goto err;
1084
1085         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
1086                              SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
1087                              BTREE_ITER_SLOTS);
1088         while (1) {
1089                 struct bkey_s_c k;
1090                 unsigned bytes, sectors, offset_into_extent;
1091                 enum btree_id data_btree = BTREE_ID_extents;
1092
1093                 /*
1094                  * read_extent -> io_time_reset may cause a transaction restart
1095                  * without returning an error, we need to check for that here:
1096                  */
1097                 ret = bch2_trans_relock(trans);
1098                 if (ret)
1099                         break;
1100
1101                 bch2_btree_iter_set_pos(&iter,
1102                                 POS(inum.inum, rbio->bio.bi_iter.bi_sector));
1103
1104                 k = bch2_btree_iter_peek_slot(&iter);
1105                 ret = bkey_err(k);
1106                 if (ret)
1107                         break;
1108
1109                 offset_into_extent = iter.pos.offset -
1110                         bkey_start_offset(k.k);
1111                 sectors = k.k->size - offset_into_extent;
1112
1113                 bch2_bkey_buf_reassemble(&sk, c, k);
1114
1115                 ret = bch2_read_indirect_extent(trans, &data_btree,
1116                                         &offset_into_extent, &sk);
1117                 if (ret)
1118                         break;
1119
1120                 k = bkey_i_to_s_c(sk.k);
1121
1122                 sectors = min(sectors, k.k->size - offset_into_extent);
1123
1124                 if (readpages_iter)
1125                         readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
1126                                             extent_partial_reads_expensive(k));
1127
1128                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
1129                 swap(rbio->bio.bi_iter.bi_size, bytes);
1130
1131                 if (rbio->bio.bi_iter.bi_size == bytes)
1132                         flags |= BCH_READ_LAST_FRAGMENT;
1133
1134                 bch2_bio_page_state_set(&rbio->bio, k);
1135
1136                 bch2_read_extent(trans, rbio, iter.pos,
1137                                  data_btree, k, offset_into_extent, flags);
1138
1139                 if (flags & BCH_READ_LAST_FRAGMENT)
1140                         break;
1141
1142                 swap(rbio->bio.bi_iter.bi_size, bytes);
1143                 bio_advance(&rbio->bio, bytes);
1144
1145                 ret = btree_trans_too_many_iters(trans);
1146                 if (ret)
1147                         break;
1148         }
1149 err:
1150         bch2_trans_iter_exit(trans, &iter);
1151
1152         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1153                 goto retry;
1154
1155         if (ret) {
1156                 bch_err_inum_offset_ratelimited(c,
1157                                 iter.pos.inode,
1158                                 iter.pos.offset << 9,
1159                                 "read error %i from btree lookup", ret);
1160                 rbio->bio.bi_status = BLK_STS_IOERR;
1161                 bio_endio(&rbio->bio);
1162         }
1163
1164         bch2_bkey_buf_exit(&sk, c);
1165 }
1166
1167 void bch2_readahead(struct readahead_control *ractl)
1168 {
1169         struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
1170         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1171         struct bch_io_opts opts;
1172         struct btree_trans trans;
1173         struct page *page;
1174         struct readpages_iter readpages_iter;
1175         int ret;
1176
1177         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
1178
1179         ret = readpages_iter_init(&readpages_iter, ractl);
1180         BUG_ON(ret);
1181
1182         bch2_trans_init(&trans, c, 0, 0);
1183
1184         bch2_pagecache_add_get(inode);
1185
1186         while ((page = readpage_iter_next(&readpages_iter))) {
1187                 pgoff_t index = readpages_iter.offset + readpages_iter.idx;
1188                 unsigned n = min_t(unsigned,
1189                                    readpages_iter.nr_pages -
1190                                    readpages_iter.idx,
1191                                    BIO_MAX_VECS);
1192                 struct bch_read_bio *rbio =
1193                         rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
1194                                                    GFP_NOFS, &c->bio_read),
1195                                   opts);
1196
1197                 readpages_iter.idx++;
1198
1199                 rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT;
1200                 rbio->bio.bi_end_io = bch2_readpages_end_io;
1201                 BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
1202
1203                 bchfs_read(&trans, rbio, inode_inum(inode),
1204                            &readpages_iter);
1205         }
1206
1207         bch2_pagecache_add_put(inode);
1208
1209         bch2_trans_exit(&trans);
1210         kfree(readpages_iter.pages);
1211 }
1212
1213 static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
1214                              subvol_inum inum, struct page *page)
1215 {
1216         struct btree_trans trans;
1217
1218         bch2_page_state_create(page, __GFP_NOFAIL);
1219
1220         rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
1221         rbio->bio.bi_iter.bi_sector =
1222                 (sector_t) page->index << PAGE_SECTORS_SHIFT;
1223         BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
1224
1225         bch2_trans_init(&trans, c, 0, 0);
1226         bchfs_read(&trans, rbio, inum, NULL);
1227         bch2_trans_exit(&trans);
1228 }
1229
1230 static void bch2_read_single_page_end_io(struct bio *bio)
1231 {
1232         complete(bio->bi_private);
1233 }
1234
1235 static int bch2_read_single_page(struct page *page,
1236                                  struct address_space *mapping)
1237 {
1238         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1239         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1240         struct bch_read_bio *rbio;
1241         struct bch_io_opts opts;
1242         int ret;
1243         DECLARE_COMPLETION_ONSTACK(done);
1244
1245         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
1246
1247         rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read),
1248                          opts);
1249         rbio->bio.bi_private = &done;
1250         rbio->bio.bi_end_io = bch2_read_single_page_end_io;
1251
1252         __bchfs_readpage(c, rbio, inode_inum(inode), page);
1253         wait_for_completion(&done);
1254
1255         ret = blk_status_to_errno(rbio->bio.bi_status);
1256         bio_put(&rbio->bio);
1257
1258         if (ret < 0)
1259                 return ret;
1260
1261         SetPageUptodate(page);
1262         return 0;
1263 }
1264
1265 int bch2_read_folio(struct file *file, struct folio *folio)
1266 {
1267         struct page *page = &folio->page;
1268         int ret;
1269
1270         ret = bch2_read_single_page(page, page->mapping);
1271         folio_unlock(folio);
1272         return bch2_err_class(ret);
1273 }
1274
1275 /* writepages: */
1276
1277 struct bch_writepage_state {
1278         struct bch_writepage_io *io;
1279         struct bch_io_opts      opts;
1280 };
1281
1282 static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
1283                                                                   struct bch_inode_info *inode)
1284 {
1285         struct bch_writepage_state ret = { 0 };
1286
1287         bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
1288         return ret;
1289 }
1290
1291 static void bch2_writepage_io_done(struct bch_write_op *op)
1292 {
1293         struct bch_writepage_io *io =
1294                 container_of(op, struct bch_writepage_io, op);
1295         struct bch_fs *c = io->op.c;
1296         struct bio *bio = &io->op.wbio.bio;
1297         struct bvec_iter_all iter;
1298         struct bio_vec *bvec;
1299         unsigned i;
1300
1301         if (io->op.error) {
1302                 set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
1303
1304                 bio_for_each_segment_all(bvec, bio, iter) {
1305                         struct bch_page_state *s;
1306
1307                         SetPageError(bvec->bv_page);
1308                         mapping_set_error(bvec->bv_page->mapping, -EIO);
1309
1310                         s = __bch2_page_state(bvec->bv_page);
1311                         spin_lock(&s->lock);
1312                         for (i = 0; i < PAGE_SECTORS; i++)
1313                                 s->s[i].nr_replicas = 0;
1314                         spin_unlock(&s->lock);
1315                 }
1316         }
1317
1318         if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
1319                 bio_for_each_segment_all(bvec, bio, iter) {
1320                         struct bch_page_state *s;
1321
1322                         s = __bch2_page_state(bvec->bv_page);
1323                         spin_lock(&s->lock);
1324                         for (i = 0; i < PAGE_SECTORS; i++)
1325                                 s->s[i].nr_replicas = 0;
1326                         spin_unlock(&s->lock);
1327                 }
1328         }
1329
1330         /*
1331          * racing with fallocate can cause us to add fewer sectors than
1332          * expected - but we shouldn't add more sectors than expected:
1333          */
1334         WARN_ON_ONCE(io->op.i_sectors_delta > 0);
1335
1336         /*
1337          * (error (due to going RO) halfway through a page can screw that up
1338          * slightly)
1339          * XXX wtf?
1340            BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
1341          */
1342
1343         /*
1344          * PageWriteback is effectively our ref on the inode - fixup i_blocks
1345          * before calling end_page_writeback:
1346          */
1347         i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
1348
1349         bio_for_each_segment_all(bvec, bio, iter) {
1350                 struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
1351
1352                 if (atomic_dec_and_test(&s->write_count))
1353                         end_page_writeback(bvec->bv_page);
1354         }
1355
1356         bio_put(&io->op.wbio.bio);
1357 }
1358
1359 static void bch2_writepage_do_io(struct bch_writepage_state *w)
1360 {
1361         struct bch_writepage_io *io = w->io;
1362
1363         w->io = NULL;
1364         closure_call(&io->op.cl, bch2_write, NULL, NULL);
1365 }
1366
1367 /*
1368  * Get a bch_writepage_io and add @page to it - appending to an existing one if
1369  * possible, else allocating a new one:
1370  */
1371 static void bch2_writepage_io_alloc(struct bch_fs *c,
1372                                     struct writeback_control *wbc,
1373                                     struct bch_writepage_state *w,
1374                                     struct bch_inode_info *inode,
1375                                     u64 sector,
1376                                     unsigned nr_replicas)
1377 {
1378         struct bch_write_op *op;
1379
1380         w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
1381                                               REQ_OP_WRITE,
1382                                               GFP_NOFS,
1383                                               &c->writepage_bioset),
1384                              struct bch_writepage_io, op.wbio.bio);
1385
1386         w->io->inode            = inode;
1387         op                      = &w->io->op;
1388         bch2_write_op_init(op, c, w->opts);
1389         op->target              = w->opts.foreground_target;
1390         op->nr_replicas         = nr_replicas;
1391         op->res.nr_replicas     = nr_replicas;
1392         op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
1393         op->subvol              = inode->ei_subvol;
1394         op->pos                 = POS(inode->v.i_ino, sector);
1395         op->end_io              = bch2_writepage_io_done;
1396         op->devs_need_flush     = &inode->ei_devs_need_flush;
1397         op->wbio.bio.bi_iter.bi_sector = sector;
1398         op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
1399 }
1400
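/*
 * Write out one page: zero any tail that straddles i_size, snapshot the
 * per-sector reservations, mark dirty sectors allocated, then carve the page
 * into runs of contiguous dirty (>= SECTOR_DIRTY) sectors and append each run
 * to a bch_writepage_io, starting a new one whenever replica counts or disk
 * positions don't line up.
 */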
1401 static int __bch2_writepage(struct page *page,
1402                             struct writeback_control *wbc,
1403                             void *data)
1404 {
1405         struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
1406         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1407         struct bch_writepage_state *w = data;
1408         struct bch_page_state *s, orig;
1409         unsigned i, offset, nr_replicas_this_write = U32_MAX;
1410         loff_t i_size = i_size_read(&inode->v);
1411         pgoff_t end_index = i_size >> PAGE_SHIFT;
1412         int ret;
1413
1414         EBUG_ON(!PageUptodate(page));
1415
1416         /* Is the page fully inside i_size? */
1417         if (page->index < end_index)
1418                 goto do_io;
1419
1420         /* Is the page fully outside i_size? (truncate in progress) */
1421         offset = i_size & (PAGE_SIZE - 1);
1422         if (page->index > end_index || !offset) {
1423                 unlock_page(page);
1424                 return 0;
1425         }
1426
1427         /*
1428          * The page straddles i_size.  It must be zeroed out on each and every
1429          * writepage invocation because it may be mmapped.  "A file is mapped
1430          * in multiples of the page size.  For a file that is not a multiple of
1431          * the  page size, the remaining memory is zeroed when mapped, and
1432          * writes to that region are not written out to the file."
1433          */
1434         zero_user_segment(page, offset, PAGE_SIZE);
1435 do_io:
1436         s = bch2_page_state_create(page, __GFP_NOFAIL);
1437
1438         /*
1439          * Things get really hairy with errors during writeback:
1440          */
1441         ret = bch2_get_page_disk_reservation(c, inode, page, false);
1442         BUG_ON(ret);
1443
1444         /* Before unlocking the page, get copy of reservations: */
1445         spin_lock(&s->lock);
1446         orig = *s;
1447         spin_unlock(&s->lock);
1448
1449         for (i = 0; i < PAGE_SECTORS; i++) {
1450                 if (s->s[i].state < SECTOR_DIRTY)
1451                         continue;
1452
1453                 nr_replicas_this_write =
1454                         min_t(unsigned, nr_replicas_this_write,
1455                               s->s[i].nr_replicas +
1456                               s->s[i].replicas_reserved);
1457         }
1458
1459         for (i = 0; i < PAGE_SECTORS; i++) {
1460                 if (s->s[i].state < SECTOR_DIRTY)
1461                         continue;
1462
1463                 s->s[i].nr_replicas = w->opts.compression
1464                         ? 0 : nr_replicas_this_write;
1465
1466                 s->s[i].replicas_reserved = 0;
1467                 s->s[i].state = SECTOR_ALLOCATED;
1468         }
1469
1470         BUG_ON(atomic_read(&s->write_count));
1471         atomic_set(&s->write_count, 1);
1472
1473         BUG_ON(PageWriteback(page));
1474         set_page_writeback(page);
1475
1476         unlock_page(page);
1477
1478         offset = 0;
1479         while (1) {
1480                 unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
1481                 u64 sector;
1482
1483                 while (offset < PAGE_SECTORS &&
1484                        orig.s[offset].state < SECTOR_DIRTY)
1485                         offset++;
1486
1487                 if (offset == PAGE_SECTORS)
1488                         break;
1489
1490                 while (offset + sectors < PAGE_SECTORS &&
1491                        orig.s[offset + sectors].state >= SECTOR_DIRTY) {
1492                         reserved_sectors += orig.s[offset + sectors].replicas_reserved;
1493                         dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY;
1494                         sectors++;
1495                 }
1496                 BUG_ON(!sectors);
1497
1498                 sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset;
1499
1500                 if (w->io &&
1501                     (w->io->op.res.nr_replicas != nr_replicas_this_write ||
1502                      bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
1503                      w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
1504                      (BIO_MAX_VECS * PAGE_SIZE) ||
1505                      bio_end_sector(&w->io->op.wbio.bio) != sector))
1506                         bch2_writepage_do_io(w);
1507
1508                 if (!w->io)
1509                         bch2_writepage_io_alloc(c, wbc, w, inode, sector,
1510                                                 nr_replicas_this_write);
1511
1512                 atomic_inc(&s->write_count);
1513
1514                 BUG_ON(inode != w->io->inode);
1515                 BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page,
1516                                      sectors << 9, offset << 9));
1517
1518                 /* Check for writing past i_size: */
1519                 WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
1520                           round_up(i_size, block_bytes(c)) &&
1521                           !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
1522                           "writing past i_size: %llu > %llu (unrounded %llu)\n",
1523                           bio_end_sector(&w->io->op.wbio.bio) << 9,
1524                           round_up(i_size, block_bytes(c)),
1525                           i_size);
1526
1527                 w->io->op.res.sectors += reserved_sectors;
1528                 w->io->op.i_sectors_delta -= dirty_sectors;
1529                 w->io->op.new_i_size = i_size;
1530
1531                 offset += sectors;
1532         }
1533
1534         if (atomic_dec_and_test(&s->write_count))
1535                 end_page_writeback(page);
1536
1537         return 0;
1538 }
1539
1540 int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
1541 {
1542         struct bch_fs *c = mapping->host->i_sb->s_fs_info;
1543         struct bch_writepage_state w =
1544                 bch_writepage_state_init(c, to_bch_ei(mapping->host));
1545         struct blk_plug plug;
1546         int ret;
1547
1548         blk_start_plug(&plug);
1549         ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
1550         if (w.io)
1551                 bch2_writepage_do_io(&w);
1552         blk_finish_plug(&plug);
1553         return bch2_err_class(ret);
1554 }
1555
1556 /* buffered writes: */
1557
1558 int bch2_write_begin(struct file *file, struct address_space *mapping,
1559                      loff_t pos, unsigned len,
1560                      struct page **pagep, void **fsdata)
1561 {
1562         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1563         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1564         struct bch2_page_reservation *res;
1565         pgoff_t index = pos >> PAGE_SHIFT;
1566         unsigned offset = pos & (PAGE_SIZE - 1);
1567         struct page *page;
1568         int ret = -ENOMEM;
1569
1570         res = kmalloc(sizeof(*res), GFP_KERNEL);
1571         if (!res)
1572                 return -ENOMEM;
1573
1574         bch2_page_reservation_init(c, inode, res);
1575         *fsdata = res;
1576
1577         bch2_pagecache_add_get(inode);
1578
1579         page = grab_cache_page_write_begin(mapping, index);
1580         if (!page)
1581                 goto err_unlock;
1582
1583         if (PageUptodate(page))
1584                 goto out;
1585
1586         /* If we're writing the entire page, we don't need to read it in first: */
1587         if (len == PAGE_SIZE)
1588                 goto out;
1589
1590         if (!offset && pos + len >= inode->v.i_size) {
1591                 zero_user_segment(page, len, PAGE_SIZE);
1592                 flush_dcache_page(page);
1593                 goto out;
1594         }
1595
1596         if (index > inode->v.i_size >> PAGE_SHIFT) {
1597                 zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
1598                 flush_dcache_page(page);
1599                 goto out;
1600         }
1601 readpage:
1602         ret = bch2_read_single_page(page, mapping);
1603         if (ret)
1604                 goto err;
1605 out:
1606         if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
1607                 ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
1608                 if (ret)
1609                         goto err;
1610         }
1611
1612         ret = bch2_page_reservation_get(c, inode, page, res, offset, len);
1613         if (ret) {
1614                 if (!PageUptodate(page)) {
1615                         /*
1616                          * If the page hasn't been read in, we won't know if we
1617                          * actually need a reservation - strictly we only need to
1618                          * check whether the page is fully backed by uncompressed
1619                          * data, but for now we just read it in:
1620                          */
1621                         goto readpage;
1622                 }
1623
1624                 goto err;
1625         }
1626
1627         *pagep = page;
1628         return 0;
1629 err:
1630         unlock_page(page);
1631         put_page(page);
1632         *pagep = NULL;
1633 err_unlock:
1634         bch2_pagecache_add_put(inode);
1635         kfree(res);
1636         *fsdata = NULL;
1637         return bch2_err_class(ret);
1638 }
1639
1640 int bch2_write_end(struct file *file, struct address_space *mapping,
1641                    loff_t pos, unsigned len, unsigned copied,
1642                    struct page *page, void *fsdata)
1643 {
1644         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1645         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1646         struct bch2_page_reservation *res = fsdata;
1647         unsigned offset = pos & (PAGE_SIZE - 1);
1648
1649         lockdep_assert_held(&inode->v.i_rwsem);
1650
1651         if (unlikely(copied < len && !PageUptodate(page))) {
1652                 /*
1653                  * The page needs to be read in, but that would destroy
1654                  * our partial write - simplest thing is to just force
1655                  * userspace to redo the write:
1656                  */
1657                 zero_user(page, 0, PAGE_SIZE);
1658                 flush_dcache_page(page);
1659                 copied = 0;
1660         }
1661
1662         spin_lock(&inode->v.i_lock);
1663         if (pos + copied > inode->v.i_size)
1664                 i_size_write(&inode->v, pos + copied);
1665         spin_unlock(&inode->v.i_lock);
1666
1667         if (copied) {
1668                 if (!PageUptodate(page))
1669                         SetPageUptodate(page);
1670
1671                 bch2_set_page_dirty(c, inode, page, res, offset, copied);
1672
1673                 inode->ei_last_dirtied = (unsigned long) current;
1674         }
1675
1676         unlock_page(page);
1677         put_page(page);
1678         bch2_pagecache_add_put(inode);
1679
1680         bch2_page_reservation_put(c, inode, res);
1681         kfree(res);
1682
1683         return copied;
1684 }
1685
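     /* Maximum number of pages handled per __bch2_buffered_write() call: */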
1686 #define WRITE_BATCH_PAGES       32
1687
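     /*
      * Grab and prepare a batch of pages, get disk reservations for the range,
      * copy data in from the iter, then mark what was copied dirty: returns
      * the number of bytes copied, or an error if nothing was copied.
      */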
1688 static int __bch2_buffered_write(struct bch_inode_info *inode,
1689                                  struct address_space *mapping,
1690                                  struct iov_iter *iter,
1691                                  loff_t pos, unsigned len)
1692 {
1693         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1694         struct page *pages[WRITE_BATCH_PAGES];
1695         struct bch2_page_reservation res;
1696         unsigned long index = pos >> PAGE_SHIFT;
1697         unsigned offset = pos & (PAGE_SIZE - 1);
1698         unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
1699         unsigned i, reserved = 0, set_dirty = 0;
1700         unsigned copied = 0, nr_pages_copied = 0;
1701         int ret = 0;
1702
1703         BUG_ON(!len);
1704         BUG_ON(nr_pages > ARRAY_SIZE(pages));
1705
1706         bch2_page_reservation_init(c, inode, &res);
1707
1708         for (i = 0; i < nr_pages; i++) {
1709                 pages[i] = grab_cache_page_write_begin(mapping, index + i);
1710                 if (!pages[i]) {
1711                         nr_pages = i;
1712                         if (!i) {
1713                                 ret = -ENOMEM;
1714                                 goto out;
1715                         }
1716                         len = min_t(unsigned, len,
1717                                     nr_pages * PAGE_SIZE - offset);
1718                         break;
1719                 }
1720         }
1721
1722         if (offset && !PageUptodate(pages[0])) {
1723                 ret = bch2_read_single_page(pages[0], mapping);
1724                 if (ret)
1725                         goto out;
1726         }
1727
1728         if ((pos + len) & (PAGE_SIZE - 1) &&
1729             !PageUptodate(pages[nr_pages - 1])) {
1730                 if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
1731                         zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
1732                 } else {
1733                         ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
1734                         if (ret)
1735                                 goto out;
1736                 }
1737         }
1738
1739         while (reserved < len) {
1740                 unsigned i = (offset + reserved) >> PAGE_SHIFT;
1741                 struct page *page = pages[i];
1742                 unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
1743                 unsigned pg_len = min_t(unsigned, len - reserved,
1744                                         PAGE_SIZE - pg_offset);
1745
1746                 if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
1747                         ret = bch2_page_state_set(c, inode_inum(inode),
1748                                                   pages + i, nr_pages - i);
1749                         if (ret)
1750                                 goto out;
1751                 }
1752
1753                 /*
1754                  * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
1755                  * supposed to write as much as we have disk space for.
1756                  *
1757                  * On failure here we should still write out a partial page if
1758                  * we aren't completely out of disk space - we don't do that
1759                  * yet:
1760                  */
1761                 ret = bch2_page_reservation_get(c, inode, page, &res,
1762                                                 pg_offset, pg_len);
1763                 if (unlikely(ret)) {
1764                         if (!reserved)
1765                                 goto out;
1766                         break;
1767                 }
1768
1769                 reserved += pg_len;
1770         }
1771
1772         if (mapping_writably_mapped(mapping))
1773                 for (i = 0; i < nr_pages; i++)
1774                         flush_dcache_page(pages[i]);
1775
1776         while (copied < reserved) {
1777                 struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
1778                 unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
1779                 unsigned pg_len = min_t(unsigned, reserved - copied,
1780                                         PAGE_SIZE - pg_offset);
1781                 unsigned pg_copied = copy_page_from_iter_atomic(page,
1782                                                 pg_offset, pg_len, iter);
1783
1784                 if (!pg_copied)
1785                         break;
1786
1787                 if (!PageUptodate(page) &&
1788                     pg_copied != PAGE_SIZE &&
1789                     pos + copied + pg_copied < inode->v.i_size) {
1790                         zero_user(page, 0, PAGE_SIZE);
1791                         break;
1792                 }
1793
1794                 flush_dcache_page(page);
1795                 copied += pg_copied;
1796
1797                 if (pg_copied != pg_len)
1798                         break;
1799         }
1800
1801         if (!copied)
1802                 goto out;
1803
1804         spin_lock(&inode->v.i_lock);
1805         if (pos + copied > inode->v.i_size)
1806                 i_size_write(&inode->v, pos + copied);
1807         spin_unlock(&inode->v.i_lock);
1808
1809         while (set_dirty < copied) {
1810                 struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
1811                 unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
1812                 unsigned pg_len = min_t(unsigned, copied - set_dirty,
1813                                         PAGE_SIZE - pg_offset);
1814
1815                 if (!PageUptodate(page))
1816                         SetPageUptodate(page);
1817
1818                 bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len);
1819                 unlock_page(page);
1820                 put_page(page);
1821
1822                 set_dirty += pg_len;
1823         }
1824
1825         nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
1826         inode->ei_last_dirtied = (unsigned long) current;
1827 out:
1828         for (i = nr_pages_copied; i < nr_pages; i++) {
1829                 unlock_page(pages[i]);
1830                 put_page(pages[i]);
1831         }
1832
1833         bch2_page_reservation_put(c, inode, &res);
1834
1835         return copied ?: ret;
1836 }
1837
1838 static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1839 {
1840         struct file *file = iocb->ki_filp;
1841         struct address_space *mapping = file->f_mapping;
1842         struct bch_inode_info *inode = file_bch_inode(file);
1843         loff_t pos = iocb->ki_pos;
1844         ssize_t written = 0;
1845         int ret = 0;
1846
1847         bch2_pagecache_add_get(inode);
1848
1849         do {
1850                 unsigned offset = pos & (PAGE_SIZE - 1);
1851                 unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
1852                               PAGE_SIZE * WRITE_BATCH_PAGES - offset);
1853 again:
1854                 /*
1855                  * Bring in the user page that we will copy from _first_.
1856                  * Otherwise there's a nasty deadlock on copying from the
1857                  * same page as we're writing to, without it being marked
1858                  * up-to-date.
1859                  *
1860                  * Not only is this an optimisation, but it is also required
1861                  * to check that the address is actually valid, when atomic
1862                  * usercopies are used, below.
1863                  */
1864                 if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
1865                         bytes = min_t(unsigned long, iov_iter_count(iter),
1866                                       PAGE_SIZE - offset);
1867
1868                         if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
1869                                 ret = -EFAULT;
1870                                 break;
1871                         }
1872                 }
1873
1874                 if (unlikely(fatal_signal_pending(current))) {
1875                         ret = -EINTR;
1876                         break;
1877                 }
1878
1879                 ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
1880                 if (unlikely(ret < 0))
1881                         break;
1882
1883                 cond_resched();
1884
1885                 if (unlikely(ret == 0)) {
1886                         /*
1887                          * If we were unable to copy any data at all, we must
1888                          * fall back to a single segment length write.
1889                          *
1890                          * If we didn't fall back here, we could livelock
1891                          * because not all segments in the iov can be copied at
1892                          * once without a pagefault.
1893                          */
1894                         bytes = min_t(unsigned long, PAGE_SIZE - offset,
1895                                       iov_iter_single_seg_count(iter));
1896                         goto again;
1897                 }
1898                 pos += ret;
1899                 written += ret;
1900                 ret = 0;
1901
1902                 balance_dirty_pages_ratelimited(mapping);
1903         } while (iov_iter_count(iter));
1904
1905         bch2_pagecache_add_put(inode);
1906
1907         return written ? written : ret;
1908 }
1909
1910 /* O_DIRECT reads */
1911
1912 static void bio_check_or_release(struct bio *bio, bool check_dirty)
1913 {
1914         if (check_dirty) {
1915                 bio_check_pages_dirty(bio);
1916         } else {
1917                 bio_release_pages(bio, false);
1918                 bio_put(bio);
1919         }
1920 }
1921
1922 static void bch2_dio_read_complete(struct closure *cl)
1923 {
1924         struct dio_read *dio = container_of(cl, struct dio_read, cl);
1925
1926         dio->req->ki_complete(dio->req, dio->ret);
1927         bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
1928 }
1929
1930 static void bch2_direct_IO_read_endio(struct bio *bio)
1931 {
1932         struct dio_read *dio = bio->bi_private;
1933
1934         if (bio->bi_status)
1935                 dio->ret = blk_status_to_errno(bio->bi_status);
1936
1937         closure_put(&dio->cl);
1938 }
1939
1940 static void bch2_direct_IO_read_split_endio(struct bio *bio)
1941 {
1942         struct dio_read *dio = bio->bi_private;
1943         bool should_dirty = dio->should_dirty;
1944
1945         bch2_direct_IO_read_endio(bio);
1946         bio_check_or_release(bio, should_dirty);
1947 }
1948
1949 static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
1950 {
1951         struct file *file = req->ki_filp;
1952         struct bch_inode_info *inode = file_bch_inode(file);
1953         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1954         struct bch_io_opts opts;
1955         struct dio_read *dio;
1956         struct bio *bio;
1957         loff_t offset = req->ki_pos;
1958         bool sync = is_sync_kiocb(req);
1959         size_t shorten;
1960         ssize_t ret;
1961
1962         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
1963
1964         if ((offset|iter->count) & (block_bytes(c) - 1))
1965                 return -EINVAL;
1966
1967         ret = min_t(loff_t, iter->count,
1968                     max_t(loff_t, 0, i_size_read(&inode->v) - offset));
1969
1970         if (!ret)
1971                 return ret;
1972
1973         shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
1974         iter->count -= shorten;
1975
1976         bio = bio_alloc_bioset(NULL,
1977                                bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
1978                                REQ_OP_READ,
1979                                GFP_KERNEL,
1980                                &c->dio_read_bioset);
1981
1982         bio->bi_end_io = bch2_direct_IO_read_endio;
1983
1984         dio = container_of(bio, struct dio_read, rbio.bio);
1985         closure_init(&dio->cl, NULL);
1986
1987         /*
1988          * this is a _really_ horrible hack just to avoid an atomic sub at the
1989          * end:
1990          */
1991         if (!sync) {
1992                 set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
1993                 atomic_set(&dio->cl.remaining,
1994                            CLOSURE_REMAINING_INITIALIZER -
1995                            CLOSURE_RUNNING +
1996                            CLOSURE_DESTRUCTOR);
1997         } else {
1998                 atomic_set(&dio->cl.remaining,
1999                            CLOSURE_REMAINING_INITIALIZER + 1);
2000         }
2001
2002         dio->req        = req;
2003         dio->ret        = ret;
2004         /*
2005          * This is one of the sketchier things I've encountered: we have to skip
2006          * the dirtying of requests that are internal to the kernel (i.e. from
2007          * loopback), because we'll deadlock on page_lock.
2008          */
2009         dio->should_dirty = iter_is_iovec(iter);
2010
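             /*
              * The initial bio was allocated from dio_read_bioset above and
              * embeds the dio_read, so jump into the middle of the loop to
              * skip the per-split allocation on the first pass; any
              * additional bios needed to cover the iter come from c->bio_read
              * and use the split endio:
              */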
2011         goto start;
2012         while (iter->count) {
2013                 bio = bio_alloc_bioset(NULL,
2014                                        bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
2015                                        REQ_OP_READ,
2016                                        GFP_KERNEL,
2017                                        &c->bio_read);
2018                 bio->bi_end_io          = bch2_direct_IO_read_split_endio;
2019 start:
2020                 bio->bi_opf             = REQ_OP_READ|REQ_SYNC;
2021                 bio->bi_iter.bi_sector  = offset >> 9;
2022                 bio->bi_private         = dio;
2023
2024                 ret = bio_iov_iter_get_pages(bio, iter);
2025                 if (ret < 0) {
2026                         /* XXX: fault inject this path */
2027                         bio->bi_status = BLK_STS_RESOURCE;
2028                         bio_endio(bio);
2029                         break;
2030                 }
2031
2032                 offset += bio->bi_iter.bi_size;
2033
2034                 if (dio->should_dirty)
2035                         bio_set_pages_dirty(bio);
2036
2037                 if (iter->count)
2038                         closure_get(&dio->cl);
2039
2040                 bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
2041         }
2042
2043         iter->count += shorten;
2044
2045         if (sync) {
2046                 closure_sync(&dio->cl);
2047                 closure_debug_destroy(&dio->cl);
2048                 ret = dio->ret;
2049                 bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
2050                 return ret;
2051         } else {
2052                 return -EIOCBQUEUED;
2053         }
2054 }
2055
2056 ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
2057 {
2058         struct file *file = iocb->ki_filp;
2059         struct bch_inode_info *inode = file_bch_inode(file);
2060         struct address_space *mapping = file->f_mapping;
2061         size_t count = iov_iter_count(iter);
2062         ssize_t ret;
2063
2064         if (!count)
2065                 return 0; /* skip atime */
2066
2067         if (iocb->ki_flags & IOCB_DIRECT) {
2068                 struct blk_plug plug;
2069
2070                 if (unlikely(mapping->nrpages)) {
2071                         ret = filemap_write_and_wait_range(mapping,
2072                                                 iocb->ki_pos,
2073                                                 iocb->ki_pos + count - 1);
2074                         if (ret < 0)
2075                                 goto out;
2076                 }
2077
2078                 file_accessed(file);
2079
2080                 blk_start_plug(&plug);
2081                 ret = bch2_direct_IO_read(iocb, iter);
2082                 blk_finish_plug(&plug);
2083
2084                 if (ret >= 0)
2085                         iocb->ki_pos += ret;
2086         } else {
2087                 bch2_pagecache_add_get(inode);
2088                 ret = generic_file_read_iter(iocb, iter);
2089                 bch2_pagecache_add_put(inode);
2090         }
2091 out:
2092         return bch2_err_class(ret);
2093 }
2094
2095 /* O_DIRECT writes */
2096
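     /*
      * Returns true if the range is fully allocated with at least @nr_replicas
      * replicas and, unless @compressed, contains no compressed extents - used
      * to let a direct write proceed when a disk reservation couldn't be taken,
      * since overwriting such a range shouldn't consume additional space:
      */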
2097 static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
2098                                        u64 offset, u64 size,
2099                                        unsigned nr_replicas, bool compressed)
2100 {
2101         struct btree_trans trans;
2102         struct btree_iter iter;
2103         struct bkey_s_c k;
2104         u64 end = offset + size;
2105         u32 snapshot;
2106         bool ret = true;
2107         int err;
2108
2109         bch2_trans_init(&trans, c, 0, 0);
2110 retry:
2111         bch2_trans_begin(&trans);
2112
2113         err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
2114         if (err)
2115                 goto err;
2116
2117         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
2118                            SPOS(inum.inum, offset, snapshot),
2119                            BTREE_ITER_SLOTS, k, err) {
2120                 if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
2121                         break;
2122
2123                 if (k.k->p.snapshot != snapshot ||
2124                     nr_replicas > bch2_bkey_replicas(c, k) ||
2125                     (!compressed && bch2_bkey_sectors_compressed(k))) {
2126                         ret = false;
2127                         break;
2128                 }
2129         }
2130
2131         offset = iter.pos.offset;
2132         bch2_trans_iter_exit(&trans, &iter);
2133 err:
2134         if (bch2_err_matches(err, BCH_ERR_transaction_restart))
2135                 goto retry;
2136         bch2_trans_exit(&trans);
2137
2138         return err ? false : ret;
2139 }
2140
2141 static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
2142 {
2143         struct bch_fs *c = dio->op.c;
2144         struct bch_inode_info *inode = dio->inode;
2145         struct bio *bio = &dio->op.wbio.bio;
2146
2147         return bch2_check_range_allocated(c, inode_inum(inode),
2148                                 dio->op.pos.offset, bio_sectors(bio),
2149                                 dio->op.opts.data_replicas,
2150                                 dio->op.opts.compression != 0);
2151 }
2152
2153 static void bch2_dio_write_loop_async(struct bch_write_op *);
2154 static __always_inline long bch2_dio_write_done(struct dio_write *dio);
2155
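     /*
      * If the write is going to complete asynchronously with data still left
      * in the iter, copy the iovec array into the dio (or a heap allocation),
      * since the caller's iovec presumably can't be relied on after we return
      * -EIOCBQUEUED:
      */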
2156 static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
2157 {
2158         struct iovec *iov = dio->inline_vecs;
2159
2160         if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
2161                 iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
2162                                     GFP_KERNEL);
2163                 if (unlikely(!iov))
2164                         return -ENOMEM;
2165
2166                 dio->free_iov = true;
2167         }
2168
2169         memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
2170         dio->iter.iov = iov;
2171         return 0;
2172 }
2173
2174 static void bch2_dio_write_flush_done(struct closure *cl)
2175 {
2176         struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
2177         struct bch_fs *c = dio->op.c;
2178
2179         closure_debug_destroy(cl);
2180
2181         dio->op.error = bch2_journal_error(&c->journal);
2182
2183         bch2_dio_write_done(dio);
2184 }
2185
2186 static noinline void bch2_dio_write_flush(struct dio_write *dio)
2187 {
2188         struct bch_fs *c = dio->op.c;
2189         struct bch_inode_unpacked inode;
2190         int ret;
2191
2192         dio->flush = 0;
2193
2194         closure_init(&dio->op.cl, NULL);
2195
2196         if (!dio->op.error) {
2197                 ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
2198                 if (ret) {
2199                         dio->op.error = ret;
2200                 } else {
2201                         bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
2202                         bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
2203                 }
2204         }
2205
2206         if (dio->sync) {
2207                 closure_sync(&dio->op.cl);
2208                 closure_debug_destroy(&dio->op.cl);
2209         } else {
2210                 continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
2211         }
2212 }
2213
2214 static __always_inline long bch2_dio_write_done(struct dio_write *dio)
2215 {
2216         struct kiocb *req = dio->req;
2217         struct bch_inode_info *inode = dio->inode;
2218         bool sync = dio->sync;
2219         long ret;
2220
2221         if (unlikely(dio->flush)) {
2222                 bch2_dio_write_flush(dio);
2223                 if (!sync)
2224                         return -EIOCBQUEUED;
2225         }
2226
2227         bch2_pagecache_block_put(inode);
2228
2229         if (dio->free_iov)
2230                 kfree(dio->iter.iov);
2231
2232         ret = dio->op.error ?: ((long) dio->written << 9);
2233         bio_put(&dio->op.wbio.bio);
2234
2235         /* inode->i_dio_count is our ref on inode and thus bch_fs */
2236         inode_dio_end(&inode->v);
2237
2238         if (ret < 0)
2239                 ret = bch2_err_class(ret);
2240
2241         if (!sync) {
2242                 req->ki_complete(req, ret);
2243                 ret = -EIOCBQUEUED;
2244         }
2245         return ret;
2246 }
2247
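     /*
      * Called after each pass of the write loop completes: advance ki_pos,
      * update i_size if we're extending, account i_sectors/quota, and drop the
      * page refs taken by bio_iov_iter_get_pages():
      */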
2248 static __always_inline void bch2_dio_write_end(struct dio_write *dio)
2249 {
2250         struct bch_fs *c = dio->op.c;
2251         struct kiocb *req = dio->req;
2252         struct bch_inode_info *inode = dio->inode;
2253         struct bio *bio = &dio->op.wbio.bio;
2254         struct bvec_iter_all iter;
2255         struct bio_vec *bv;
2256
2257         req->ki_pos     += (u64) dio->op.written << 9;
2258         dio->written    += dio->op.written;
2259
2260         if (dio->extending) {
2261                 spin_lock(&inode->v.i_lock);
2262                 if (req->ki_pos > inode->v.i_size)
2263                         i_size_write(&inode->v, req->ki_pos);
2264                 spin_unlock(&inode->v.i_lock);
2265         }
2266
2267         if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
2268                 mutex_lock(&inode->ei_quota_lock);
2269                 __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
2270                 __bch2_quota_reservation_put(c, inode, &dio->quota_res);
2271                 mutex_unlock(&inode->ei_quota_lock);
2272         }
2273
2274         if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
2275                 bio_for_each_segment_all(bv, bio, iter)
2276                         put_page(bv->bv_page);
2277
2278         if (unlikely(dio->op.error))
2279                 set_bit(EI_INODE_ERROR, &inode->ei_flags);
2280 }
2281
2282 static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
2283 {
2284         struct bch_fs *c = dio->op.c;
2285         struct kiocb *req = dio->req;
2286         struct address_space *mapping = dio->mapping;
2287         struct bch_inode_info *inode = dio->inode;
2288         struct bch_io_opts opts;
2289         struct bio *bio = &dio->op.wbio.bio;
2290         unsigned unaligned, iter_count;
2291         bool sync = dio->sync, dropped_locks;
2292         long ret;
2293
2294         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
2295
2296         while (1) {
2297                 iter_count = dio->iter.count;
2298
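                     /*
                      * Disable page faults on our own mapping while pinning
                      * the user pages: if the buffer is mmapped from this
                      * same file, faulting it in here could presumably
                      * deadlock on the pagecache lock we hold, so the fault
                      * handler backs off and signals us via
                      * fdm_dropped_locks() instead.
                      */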
2299                 EBUG_ON(current->faults_disabled_mapping);
2300                 current->faults_disabled_mapping = mapping;
2301
2302                 ret = bio_iov_iter_get_pages(bio, &dio->iter);
2303
2304                 dropped_locks = fdm_dropped_locks();
2305
2306                 current->faults_disabled_mapping = NULL;
2307
2308                 /*
2309                  * If the fault handler returned an error but also signalled
2310                  * that it dropped & retook ei_pagecache_lock, we just need to
2311                  * re-shoot down the page cache and retry:
2312                  */
2313                 if (dropped_locks && ret)
2314                         ret = 0;
2315
2316                 if (unlikely(ret < 0))
2317                         goto err;
2318
2319                 if (unlikely(dropped_locks)) {
2320                         ret = write_invalidate_inode_pages_range(mapping,
2321                                         req->ki_pos,
2322                                         req->ki_pos + iter_count - 1);
2323                         if (unlikely(ret))
2324                                 goto err;
2325
2326                         if (!bio->bi_iter.bi_size)
2327                                 continue;
2328                 }
2329
2330                 unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
2331                 bio->bi_iter.bi_size -= unaligned;
2332                 iov_iter_revert(&dio->iter, unaligned);
2333
2334                 if (!bio->bi_iter.bi_size) {
2335                         /*
2336                          * bio_iov_iter_get_pages was only able to get <
2337                          * blocksize worth of pages:
2338                          */
2339                         ret = -EFAULT;
2340                         goto err;
2341                 }
2342
2343                 bch2_write_op_init(&dio->op, c, opts);
2344                 dio->op.end_io          = sync
2345                         ? NULL
2346                         : bch2_dio_write_loop_async;
2347                 dio->op.target          = dio->op.opts.foreground_target;
2348                 dio->op.write_point     = writepoint_hashed((unsigned long) current);
2349                 dio->op.nr_replicas     = dio->op.opts.data_replicas;
2350                 dio->op.subvol          = inode->ei_subvol;
2351                 dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
2352                 dio->op.devs_need_flush = &inode->ei_devs_need_flush;
2353
2354                 if (sync)
2355                         dio->op.flags |= BCH_WRITE_SYNC;
2356                 dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
2357
2358                 ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
2359                                                  bio_sectors(bio), true);
2360                 if (unlikely(ret))
2361                         goto err;
2362
2363                 ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
2364                                                 dio->op.opts.data_replicas, 0);
2365                 if (unlikely(ret) &&
2366                     !bch2_dio_write_check_allocated(dio))
2367                         goto err;
2368
2369                 task_io_account_write(bio->bi_iter.bi_size);
2370
2371                 if (unlikely(dio->iter.count) &&
2372                     !dio->sync &&
2373                     !dio->loop &&
2374                     bch2_dio_write_copy_iov(dio))
2375                         dio->sync = sync = true;
2376
2377                 dio->loop = true;
2378                 closure_call(&dio->op.cl, bch2_write, NULL, NULL);
2379
2380                 if (!sync)
2381                         return -EIOCBQUEUED;
2382
2383                 bch2_dio_write_end(dio);
2384
2385                 if (likely(!dio->iter.count) || dio->op.error)
2386                         break;
2387
2388                 bio_reset(bio, NULL, REQ_OP_WRITE);
2389         }
2390 out:
2391         return bch2_dio_write_done(dio);
2392 err:
2393         dio->op.error = ret;
2394
2395         if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
2396                 struct bvec_iter_all iter;
2397                 struct bio_vec *bv;
2398
2399                 bio_for_each_segment_all(bv, bio, iter)
2400                         put_page(bv->bv_page);
2401         }
2402
2403         bch2_quota_reservation_put(c, inode, &dio->quota_res);
2404         goto out;
2405 }
2406
2407 static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
2408 {
2409         struct mm_struct *mm = dio->mm;
2410
2411         bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
2412
2413         if (mm)
2414                 kthread_use_mm(mm);
2415         bch2_dio_write_loop(dio);
2416         if (mm)
2417                 kthread_unuse_mm(mm);
2418 }
2419
2420 static void bch2_dio_write_loop_async(struct bch_write_op *op)
2421 {
2422         struct dio_write *dio = container_of(op, struct dio_write, op);
2423
2424         bch2_dio_write_end(dio);
2425
2426         if (likely(!dio->iter.count) || dio->op.error)
2427                 bch2_dio_write_done(dio);
2428         else
2429                 bch2_dio_write_continue(dio);
2430 }
2431
2432 static noinline
2433 ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
2434 {
2435         struct file *file = req->ki_filp;
2436         struct address_space *mapping = file->f_mapping;
2437         struct bch_inode_info *inode = file_bch_inode(file);
2438         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2439         struct dio_write *dio;
2440         struct bio *bio;
2441         bool locked = true, extending;
2442         ssize_t ret;
2443
2444         prefetch(&c->opts);
2445         prefetch((void *) &c->opts + 64);
2446         prefetch(&inode->ei_inode);
2447         prefetch((void *) &inode->ei_inode + 64);
2448
2449         inode_lock(&inode->v);
2450
2451         ret = generic_write_checks(req, iter);
2452         if (unlikely(ret <= 0))
2453                 goto err;
2454
2455         ret = file_remove_privs(file);
2456         if (unlikely(ret))
2457                 goto err;
2458
2459         ret = file_update_time(file);
2460         if (unlikely(ret))
2461                 goto err;
2462
2463         if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
2464                 { ret = -EINVAL; goto err; }
2465
2466         inode_dio_begin(&inode->v);
2467         bch2_pagecache_block_get(inode);
2468
2469         extending = req->ki_pos + iter->count > inode->v.i_size;
2470         if (!extending) {
2471                 inode_unlock(&inode->v);
2472                 locked = false;
2473         }
2474
2475         bio = bio_alloc_bioset(NULL,
2476                                bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
2477                                REQ_OP_WRITE,
2478                                GFP_KERNEL,
2479                                &c->dio_write_bioset);
2480         dio = container_of(bio, struct dio_write, op.wbio.bio);
2481         dio->req                = req;
2482         dio->mapping            = mapping;
2483         dio->inode              = inode;
2484         dio->mm                 = current->mm;
2485         dio->loop               = false;
2486         dio->extending          = extending;
2487         dio->sync               = is_sync_kiocb(req) || extending;
2488         dio->flush              = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
2489         dio->free_iov           = false;
2490         dio->quota_res.sectors  = 0;
2491         dio->written            = 0;
2492         dio->iter               = *iter;
2493         dio->op.c               = c;
2494
2495         if (unlikely(mapping->nrpages)) {
2496                 ret = write_invalidate_inode_pages_range(mapping,
2497                                                 req->ki_pos,
2498                                                 req->ki_pos + iter->count - 1);
2499                 if (unlikely(ret))
2500                         goto err_put_bio;
2501         }
2502
2503         ret = bch2_dio_write_loop(dio);
2504 err:
2505         if (locked)
2506                 inode_unlock(&inode->v);
2507         return ret;
2508 err_put_bio:
2509         bch2_pagecache_block_put(inode);
2510         bio_put(bio);
2511         inode_dio_end(&inode->v);
2512         goto err;
2513 }
2514
2515 ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
2516 {
2517         struct file *file = iocb->ki_filp;
2518         struct bch_inode_info *inode = file_bch_inode(file);
2519         ssize_t ret;
2520
2521         if (iocb->ki_flags & IOCB_DIRECT) {
2522                 ret = bch2_direct_write(iocb, from);
2523                 goto out;
2524         }
2525
2526         /* We can write back this queue in page reclaim */
2527         current->backing_dev_info = inode_to_bdi(&inode->v);
2528         inode_lock(&inode->v);
2529
2530         ret = generic_write_checks(iocb, from);
2531         if (ret <= 0)
2532                 goto unlock;
2533
2534         ret = file_remove_privs(file);
2535         if (ret)
2536                 goto unlock;
2537
2538         ret = file_update_time(file);
2539         if (ret)
2540                 goto unlock;
2541
2542         ret = bch2_buffered_write(iocb, from);
2543         if (likely(ret > 0))
2544                 iocb->ki_pos += ret;
2545 unlock:
2546         inode_unlock(&inode->v);
2547         current->backing_dev_info = NULL;
2548
2549         if (ret > 0)
2550                 ret = generic_write_sync(iocb, ret);
2551 out:
2552         return bch2_err_class(ret);
2553 }
2554
2555 /* fsync: */
2556
2557 /*
2558  * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
2559  * insert trigger: look up the btree inode instead
2560  */
2561 static int bch2_flush_inode(struct bch_fs *c,
2562                             struct bch_inode_info *inode)
2563 {
2564         struct bch_inode_unpacked u;
2565         int ret;
2566
2567         if (c->opts.journal_flush_disabled)
2568                 return 0;
2569
2570         ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
2571         if (ret)
2572                 return ret;
2573
2574         return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
2575                 bch2_inode_flush_nocow_writes(c, inode);
2576 }
2577
2578 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2579 {
2580         struct bch_inode_info *inode = file_bch_inode(file);
2581         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2582         int ret, ret2, ret3;
2583
2584         ret = file_write_and_wait_range(file, start, end);
2585         ret2 = sync_inode_metadata(&inode->v, 1);
2586         ret3 = bch2_flush_inode(c, inode);
2587
2588         return bch2_err_class(ret ?: ret2 ?: ret3);
2589 }
2590
2591 /* truncate: */
2592
2593 static inline int range_has_data(struct bch_fs *c, u32 subvol,
2594                                  struct bpos start,
2595                                  struct bpos end)
2596 {
2597         struct btree_trans trans;
2598         struct btree_iter iter;
2599         struct bkey_s_c k;
2600         int ret = 0;
2601
2602         bch2_trans_init(&trans, c, 0, 0);
2603 retry:
2604         bch2_trans_begin(&trans);
2605
2606         ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
2607         if (ret)
2608                 goto err;
2609
2610         for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
2611                 if (bkey_extent_is_data(k.k)) {
2612                         ret = 1;
2613                         break;
2614                 }
2615         start = iter.pos;
2616         bch2_trans_iter_exit(&trans, &iter);
2617 err:
2618         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2619                 goto retry;
2620
2621         bch2_trans_exit(&trans);
2622         return ret;
2623 }
2624
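     /*
      * Zero the part of a page that falls within [start, end) when truncating
      * or punching a hole, reset those sectors to unallocated, and redirty the
      * page so writeback writes out the zeroes; returns > 0 if the page
      * straddling i_size will be written out by writeback (see the comment on
      * the return value below):
      */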
2625 static int __bch2_truncate_page(struct bch_inode_info *inode,
2626                                 pgoff_t index, loff_t start, loff_t end)
2627 {
2628         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2629         struct address_space *mapping = inode->v.i_mapping;
2630         struct bch_page_state *s;
2631         unsigned start_offset = start & (PAGE_SIZE - 1);
2632         unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
2633         unsigned i;
2634         struct page *page;
2635         s64 i_sectors_delta = 0;
2636         int ret = 0;
2637
2638         /* Page boundary? Nothing to do */
2639         if (!((index == start >> PAGE_SHIFT && start_offset) ||
2640               (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
2641                 return 0;
2642
2643         /* Above i_size? */
2644         if (index << PAGE_SHIFT >= inode->v.i_size)
2645                 return 0;
2646
2647         page = find_lock_page(mapping, index);
2648         if (!page) {
2649                 /*
2650                  * XXX: we're doing two index lookups when we end up reading the
2651                  * page
2652                  */
2653                 ret = range_has_data(c, inode->ei_subvol,
2654                                 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
2655                                 POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
2656                 if (ret <= 0)
2657                         return ret;
2658
2659                 page = find_or_create_page(mapping, index, GFP_KERNEL);
2660                 if (unlikely(!page)) {
2661                         ret = -ENOMEM;
2662                         goto out;
2663                 }
2664         }
2665
2666         s = bch2_page_state_create(page, 0);
2667         if (!s) {
2668                 ret = -ENOMEM;
2669                 goto unlock;
2670         }
2671
2672         if (!PageUptodate(page)) {
2673                 ret = bch2_read_single_page(page, mapping);
2674                 if (ret)
2675                         goto unlock;
2676         }
2677
2678         if (index != start >> PAGE_SHIFT)
2679                 start_offset = 0;
2680         if (index != end >> PAGE_SHIFT)
2681                 end_offset = PAGE_SIZE;
2682
2683         for (i = round_up(start_offset, block_bytes(c)) >> 9;
2684              i < round_down(end_offset, block_bytes(c)) >> 9;
2685              i++) {
2686                 s->s[i].nr_replicas     = 0;
2687                 if (s->s[i].state == SECTOR_DIRTY)
2688                         i_sectors_delta--;
2689                 s->s[i].state           = SECTOR_UNALLOCATED;
2690         }
2691
2692         i_sectors_acct(c, inode, NULL, i_sectors_delta);
2693
2694         /*
2695          * Caller needs to know whether this page will be written out by
2696          * writeback - doing an i_size update if necessary - or whether it will
2697          * be responsible for the i_size update:
2698          */
2699         ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT),
2700                           PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY;
2701
2702         zero_user_segment(page, start_offset, end_offset);
2703
2704         /*
2705          * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
2706          *
2707          * XXX: because we aren't currently tracking whether the page has actual
2708          * data in it (vs. just 0s, or only partially written) this is wrong. ick.
2709          */
2710         BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false));
2711
2712         /*
2713          * This removes any writeable userspace mappings; we need to force
2714          * .page_mkwrite to be called again before any mmapped writes, to
2715          * redirty the full page:
2716          */
2717         page_mkclean(page);
2718         filemap_dirty_folio(mapping, page_folio(page));
2719 unlock:
2720         unlock_page(page);
2721         put_page(page);
2722 out:
2723         return ret;
2724 }
2725
2726 static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
2727 {
2728         return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
2729                                     from, round_up(from, PAGE_SIZE));
2730 }
2731
2732 static int bch2_truncate_pages(struct bch_inode_info *inode,
2733                                loff_t start, loff_t end)
2734 {
2735         int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT,
2736                                        start, end);
2737
2738         if (ret >= 0 &&
2739             start >> PAGE_SHIFT != end >> PAGE_SHIFT)
2740                 ret = __bch2_truncate_page(inode,
2741                                            end >> PAGE_SHIFT,
2742                                            start, end);
2743         return ret;
2744 }
2745
2746 static int bch2_extend(struct user_namespace *mnt_userns,
2747                        struct bch_inode_info *inode,
2748                        struct bch_inode_unpacked *inode_u,
2749                        struct iattr *iattr)
2750 {
2751         struct address_space *mapping = inode->v.i_mapping;
2752         int ret;
2753
2754         /*
2755          * sync appends:
2756          *
2757          * this has to be done _before_ extending i_size:
2758          */
2759         ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
2760         if (ret)
2761                 return ret;
2762
2763         truncate_setsize(&inode->v, iattr->ia_size);
2764
2765         return bch2_setattr_nonsize(mnt_userns, inode, iattr);
2766 }
2767
2768 static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
2769                                    struct bch_inode_unpacked *bi,
2770                                    void *p)
2771 {
2772         bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
2773         return 0;
2774 }
2775
2776 static int bch2_truncate_start_fn(struct bch_inode_info *inode,
2777                                   struct bch_inode_unpacked *bi, void *p)
2778 {
2779         u64 *new_i_size = p;
2780
2781         bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
2782         bi->bi_size = *new_i_size;
2783         return 0;
2784 }
2785
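     /*
      * Truncate proper: if the size is changing, update cmtime; write the new
      * i_size to the btree inode with I_SIZE_DIRTY set before shrinking the
      * pagecache and punching extents, then clear I_SIZE_DIRTY once the fpunch
      * has completed - presumably so an interrupted truncate can be finished
      * on recovery:
      */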
2786 int bch2_truncate(struct user_namespace *mnt_userns,
2787                   struct bch_inode_info *inode, struct iattr *iattr)
2788 {
2789         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2790         struct address_space *mapping = inode->v.i_mapping;
2791         struct bch_inode_unpacked inode_u;
2792         u64 new_i_size = iattr->ia_size;
2793         s64 i_sectors_delta = 0;
2794         int ret = 0;
2795
2796         /*
2797          * If the truncate call will change the size of the file, the
2798          * cmtimes should be updated. If the size will not change, we
2799          * do not need to update the cmtimes.
2800          */
2801         if (iattr->ia_size != inode->v.i_size) {
2802                 if (!(iattr->ia_valid & ATTR_MTIME))
2803                         ktime_get_coarse_real_ts64(&iattr->ia_mtime);
2804                 if (!(iattr->ia_valid & ATTR_CTIME))
2805                         ktime_get_coarse_real_ts64(&iattr->ia_ctime);
2806                 iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
2807         }
2808
2809         inode_dio_wait(&inode->v);
2810         bch2_pagecache_block_get(inode);
2811
2812         ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
2813         if (ret)
2814                 goto err;
2815
2816         /*
2817          * check this before the next assertion; on filesystem error our normal
2818          * invariants are a bit broken (truncate has to truncate the page cache
2819          * before the inode).
2820          */
2821         ret = bch2_journal_error(&c->journal);
2822         if (ret)
2823                 goto err;
2824
2825         WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
2826                   inode->v.i_size < inode_u.bi_size,
2827                   "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
2828                   (u64) inode->v.i_size, inode_u.bi_size);
2829
2830         if (iattr->ia_size > inode->v.i_size) {
2831                 ret = bch2_extend(mnt_userns, inode, &inode_u, iattr);
2832                 goto err;
2833         }
2834
2835         iattr->ia_valid &= ~ATTR_SIZE;
2836
2837         ret = bch2_truncate_page(inode, iattr->ia_size);
2838         if (unlikely(ret < 0))
2839                 goto err;
2840
2841         /*
2842          * When extending, we're going to write the new i_size to disk
2843          * immediately so we need to flush anything above the current on disk
2844          * i_size first:
2845          *
2846          * Also, when extending we need to flush the page that i_size currently
2847          * straddles - if it's mapped to userspace, we need to ensure that
2848          * userspace has to redirty it and call .mkwrite -> set_page_dirty
2849          * again to allocate the part of the page that was extended.
2850          */
2851         if (iattr->ia_size > inode_u.bi_size)
2852                 ret = filemap_write_and_wait_range(mapping,
2853                                 inode_u.bi_size,
2854                                 iattr->ia_size - 1);
2855         else if (iattr->ia_size & (PAGE_SIZE - 1))
2856                 ret = filemap_write_and_wait_range(mapping,
2857                                 round_down(iattr->ia_size, PAGE_SIZE),
2858                                 iattr->ia_size - 1);
2859         if (ret)
2860                 goto err;
2861
2862         mutex_lock(&inode->ei_update_lock);
2863         ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
2864                                &new_i_size, 0);
2865         mutex_unlock(&inode->ei_update_lock);
2866
2867         if (unlikely(ret))
2868                 goto err;
2869
2870         truncate_setsize(&inode->v, iattr->ia_size);
2871
2872         ret = bch2_fpunch(c, inode_inum(inode),
2873                         round_up(iattr->ia_size, block_bytes(c)) >> 9,
2874                         U64_MAX, &i_sectors_delta);
2875         i_sectors_acct(c, inode, NULL, i_sectors_delta);
2876
2877         bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
2878                                 !bch2_journal_error(&c->journal), c,
2879                                 "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
2880                                 inode->v.i_ino, (u64) inode->v.i_blocks,
2881                                 inode->ei_inode.bi_sectors);
2882         if (unlikely(ret))
2883                 goto err;
2884
2885         mutex_lock(&inode->ei_update_lock);
2886         ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
2887         mutex_unlock(&inode->ei_update_lock);
2888
2889         ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
2890 err:
2891         bch2_pagecache_block_put(inode);
2892         return bch2_err_class(ret);
2893 }
2894
2895 /* fallocate: */
2896
2897 static int inode_update_times_fn(struct bch_inode_info *inode,
2898                                  struct bch_inode_unpacked *bi, void *p)
2899 {
2900         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2901
2902         bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
2903         return 0;
2904 }
2905
2906 static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
2907 {
2908         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2909         u64 end         = offset + len;
2910         u64 block_start = round_up(offset, block_bytes(c));
2911         u64 block_end   = round_down(end, block_bytes(c));
2912         bool truncated_last_page;
2913         int ret = 0;
2914
2915         ret = bch2_truncate_pages(inode, offset, end);
2916         if (unlikely(ret < 0))
2917                 goto err;
2918
2919         truncated_last_page = ret;
2920
2921         truncate_pagecache_range(&inode->v, offset, end - 1);
2922
2923         if (block_start < block_end) {
2924                 s64 i_sectors_delta = 0;
2925
2926                 ret = bch2_fpunch(c, inode_inum(inode),
2927                                   block_start >> 9, block_end >> 9,
2928                                   &i_sectors_delta);
2929                 i_sectors_acct(c, inode, NULL, i_sectors_delta);
2930         }
2931
2932         mutex_lock(&inode->ei_update_lock);
2933         if (end >= inode->v.i_size && !truncated_last_page) {
2934                 ret = bch2_write_inode_size(c, inode, inode->v.i_size,
2935                                             ATTR_MTIME|ATTR_CTIME);
2936         } else {
2937                 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
2938                                        ATTR_MTIME|ATTR_CTIME);
2939         }
2940         mutex_unlock(&inode->ei_update_lock);
2941 err:
2942         return ret;
2943 }
2944
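     /*
      * Collapse range (delete a range and shift everything after it down) or
      * insert range (shift everything from offset up to make room):
      * implemented by walking the extents btree and moving each extent to its
      * shifted position, one transaction at a time:
      */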
2945 static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
2946                                    loff_t offset, loff_t len,
2947                                    bool insert)
2948 {
2949         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2950         struct address_space *mapping = inode->v.i_mapping;
2951         struct bkey_buf copy;
2952         struct btree_trans trans;
2953         struct btree_iter src, dst, del;
2954         loff_t shift, new_size;
2955         u64 src_start;
2956         int ret = 0;
2957
2958         if ((offset | len) & (block_bytes(c) - 1))
2959                 return -EINVAL;
2960
2961         if (insert) {
2962                 if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
2963                         return -EFBIG;
2964
2965                 if (offset >= inode->v.i_size)
2966                         return -EINVAL;
2967
2968                 src_start       = U64_MAX;
2969                 shift           = len;
2970         } else {
2971                 if (offset + len >= inode->v.i_size)
2972                         return -EINVAL;
2973
2974                 src_start       = offset + len;
2975                 shift           = -len;
2976         }
2977
2978         new_size = inode->v.i_size + shift;
2979
2980         ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
2981         if (ret)
2982                 return ret;
2983
2984         if (insert) {
2985                 i_size_write(&inode->v, new_size);
2986                 mutex_lock(&inode->ei_update_lock);
2987                 ret = bch2_write_inode_size(c, inode, new_size,
2988                                             ATTR_MTIME|ATTR_CTIME);
2989                 mutex_unlock(&inode->ei_update_lock);
2990         } else {
2991                 s64 i_sectors_delta = 0;
2992
2993                 ret = bch2_fpunch(c, inode_inum(inode),
2994                                   offset >> 9, (offset + len) >> 9,
2995                                   &i_sectors_delta);
2996                 i_sectors_acct(c, inode, NULL, i_sectors_delta);
2997
2998                 if (ret)
2999                         return ret;
3000         }
3001
3002         bch2_bkey_buf_init(&copy);
3003         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
3004         bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
3005                         POS(inode->v.i_ino, src_start >> 9),
3006                         BTREE_ITER_INTENT);
3007         bch2_trans_copy_iter(&dst, &src);
3008         bch2_trans_copy_iter(&del, &src);
3009
3010         while (ret == 0 ||
3011                bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
3012                 struct disk_reservation disk_res =
3013                         bch2_disk_reservation_init(c, 0);
3014                 struct bkey_i delete;
3015                 struct bkey_s_c k;
3016                 struct bpos next_pos;
3017                 struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
3018                 struct bpos atomic_end;
3019                 unsigned trigger_flags = 0;
3020                 u32 snapshot;
3021
3022                 bch2_trans_begin(&trans);
3023
3024                 ret = bch2_subvolume_get_snapshot(&trans,
3025                                         inode->ei_subvol, &snapshot);
3026                 if (ret)
3027                         continue;
3028
3029                 bch2_btree_iter_set_snapshot(&src, snapshot);
3030                 bch2_btree_iter_set_snapshot(&dst, snapshot);
3031                 bch2_btree_iter_set_snapshot(&del, snapshot);
3032
3033                 bch2_trans_begin(&trans);
3034
3035                 k = insert
3036                         ? bch2_btree_iter_peek_prev(&src)
3037                         : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX));
3038                 if ((ret = bkey_err(k)))
3039                         continue;
3040
3041                 if (!k.k || k.k->p.inode != inode->v.i_ino)
3042                         break;
3043
3044                 if (insert &&
3045                     bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9)))
3046                         break;
3047 reassemble:
3048                 bch2_bkey_buf_reassemble(&copy, c, k);
3049
3050                 if (insert &&
3051                     bkey_lt(bkey_start_pos(k.k), move_pos))
3052                         bch2_cut_front(move_pos, copy.k);
3053
3054                 copy.k->k.p.offset += shift >> 9;
3055                 bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
3056
3057                 ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
3058                 if (ret)
3059                         continue;
3060
3061                 if (!bkey_eq(atomic_end, copy.k->k.p)) {
3062                         if (insert) {
3063                                 move_pos = atomic_end;
3064                                 move_pos.offset -= shift >> 9;
3065                                 goto reassemble;
3066                         } else {
3067                                 bch2_cut_back(atomic_end, copy.k);
3068                         }
3069                 }
3070
3071                 bkey_init(&delete.k);
3072                 delete.k.p = copy.k->k.p;
3073                 delete.k.size = copy.k->k.size;
3074                 delete.k.p.offset -= shift >> 9;
3075                 bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
3076
3077                 next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
3078
3079                 if (copy.k->k.size != k.k->size) {
3080                         /* We might end up splitting compressed extents: */
3081                         unsigned nr_ptrs =
3082                                 bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
3083
3084                         ret = bch2_disk_reservation_get(c, &disk_res,
3085                                         copy.k->k.size, nr_ptrs,
3086                                         BCH_DISK_RESERVATION_NOFAIL);
3087                         BUG_ON(ret);
3088                 }
3089
3090                 ret =   bch2_btree_iter_traverse(&del) ?:
3091                         bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
3092                         bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
3093                         bch2_trans_commit(&trans, &disk_res, NULL,
3094                                           BTREE_INSERT_NOFAIL);
3095                 bch2_disk_reservation_put(c, &disk_res);
3096
3097                 if (!ret)
3098                         bch2_btree_iter_set_pos(&src, next_pos);
3099         }
3100         bch2_trans_iter_exit(&trans, &del);
3101         bch2_trans_iter_exit(&trans, &dst);
3102         bch2_trans_iter_exit(&trans, &src);
3103         bch2_trans_exit(&trans);
3104         bch2_bkey_buf_exit(&copy, c);
3105
3106         if (ret)
3107                 return ret;
3108
3109         mutex_lock(&inode->ei_update_lock);
3110         if (!insert) {
3111                 i_size_write(&inode->v, new_size);
3112                 ret = bch2_write_inode_size(c, inode, new_size,
3113                                             ATTR_MTIME|ATTR_CTIME);
3114         } else {
3115                 /* We need an inode update to update bi_journal_seq for fsync: */
3116                 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
3117                                        ATTR_MTIME|ATTR_CTIME);
3118         }
3119         mutex_unlock(&inode->ei_update_lock);
3120         return ret;
3121 }
3122
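/*
 * Core of fallocate: walk the extents btree over [start_sector, end_sector)
 * and replace holes (and, in ZERO_RANGE mode, existing data) with
 * reservations, taking quota reservations for any newly allocated sectors.
 */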
3123 static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
3124                              u64 start_sector, u64 end_sector)
3125 {
3126         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3127         struct btree_trans trans;
3128         struct btree_iter iter;
3129         struct bpos end_pos = POS(inode->v.i_ino, end_sector);
3130         struct bch_io_opts opts;
3131         int ret = 0;
3132
3133         bch2_inode_opts_get(&opts, c, &inode->ei_inode);
3134         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
3135
3136         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
3137                         POS(inode->v.i_ino, start_sector),
3138                         BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
3139
3140         while (!ret && bkey_lt(iter.pos, end_pos)) {
3141                 s64 i_sectors_delta = 0;
3142                 struct quota_res quota_res = { 0 };
3143                 struct bkey_s_c k;
3144                 unsigned sectors;
3145                 u32 snapshot;
3146
3147                 bch2_trans_begin(&trans);
3148
3149                 ret = bch2_subvolume_get_snapshot(&trans,
3150                                         inode->ei_subvol, &snapshot);
3151                 if (ret)
3152                         goto bkey_err;
3153
3154                 bch2_btree_iter_set_snapshot(&iter, snapshot);
3155
3156                 k = bch2_btree_iter_peek_slot(&iter);
3157                 if ((ret = bkey_err(k)))
3158                         goto bkey_err;
3159
3160                 /* already reserved */
3161                 if (bkey_extent_is_reservation(k) &&
3162                     bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
3163                         bch2_btree_iter_advance(&iter);
3164                         continue;
3165                 }
3166
3167                 if (bkey_extent_is_data(k.k) &&
3168                     !(mode & FALLOC_FL_ZERO_RANGE)) {
3169                         bch2_btree_iter_advance(&iter);
3170                         continue;
3171                 }
3172
3173                 /*
3174                  * XXX: for nocow mode, we should promote shared extents to
3175                  * unshared here
3176                  */
3177
3178                 sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset;
3179
3180                 if (!bkey_extent_is_allocation(k.k)) {
3181                         ret = bch2_quota_reservation_add(c, inode,
3182                                         &quota_res,
3183                                         sectors, true);
3184                         if (unlikely(ret))
3185                                 goto bkey_err;
3186                 }
3187
3188                 ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter,
3189                                             sectors, opts, &i_sectors_delta,
3190                                             writepoint_hashed((unsigned long) current));
3191                 if (ret)
3192                         goto bkey_err;
3193
3194                 i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
3195 bkey_err:
3196                 bch2_quota_reservation_put(c, inode, &quota_res);
3197                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3198                         ret = 0;
3199         }
3200
3201         bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
3202         mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
3203
3204         if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
3205                 struct quota_res quota_res = { 0 };
3206                 s64 i_sectors_delta = 0;
3207
3208                 bch2_fpunch_at(&trans, &iter, inode_inum(inode),
3209                                end_sector, &i_sectors_delta);
3210                 i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
3211                 bch2_quota_reservation_put(c, inode, &quota_res);
3212         }
3213
3214         bch2_trans_iter_exit(&trans, &iter);
3215         bch2_trans_exit(&trans);
3216         return ret;
3217 }
3218
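/*
 * Plain fallocate and FALLOC_FL_ZERO_RANGE: for ZERO_RANGE, zero out partial
 * pages first; then reserve/allocate the block-aligned range via
 * __bchfs_fallocate(), and extend i_size when the range reaches past EOF and
 * KEEP_SIZE wasn't requested.
 */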
3219 static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
3220                             loff_t offset, loff_t len)
3221 {
3222         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3223         u64 end         = offset + len;
3224         u64 block_start = round_down(offset,    block_bytes(c));
3225         u64 block_end   = round_up(end,         block_bytes(c));
3226         bool truncated_last_page = false;
3227         int ret, ret2 = 0;
3228
3229         if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
3230                 ret = inode_newsize_ok(&inode->v, end);
3231                 if (ret)
3232                         return ret;
3233         }
3234
3235         if (mode & FALLOC_FL_ZERO_RANGE) {
3236                 ret = bch2_truncate_pages(inode, offset, end);
3237                 if (unlikely(ret < 0))
3238                         return ret;
3239
3240                 truncated_last_page = ret;
3241
3242                 truncate_pagecache_range(&inode->v, offset, end - 1);
3243
3244                 block_start     = round_up(offset,      block_bytes(c));
3245                 block_end       = round_down(end,       block_bytes(c));
3246         }
3247
3248         ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
3249
3250         /*
3251          * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
3252          * so that the VFS cache i_size is consistent with the btree i_size:
3253          */
3254         if (ret &&
3255             !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
3256                 return ret;
3257
3258         if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
3259                 end = inode->v.i_size;
3260
3261         if (end >= inode->v.i_size &&
3262             (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
3263              !(mode & FALLOC_FL_KEEP_SIZE))) {
3264                 spin_lock(&inode->v.i_lock);
3265                 i_size_write(&inode->v, end);
3266                 spin_unlock(&inode->v.i_lock);
3267
3268                 mutex_lock(&inode->ei_update_lock);
3269                 ret2 = bch2_write_inode_size(c, inode, end, 0);
3270                 mutex_unlock(&inode->ei_update_lock);
3271         }
3272
3273         return ret ?: ret2;
3274 }
3275
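/*
 * fallocate() entry point: takes the fallocate write ref, blocks DIO and new
 * pagecache adds, then dispatches to the handler for the requested mode.
 */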
3276 long bch2_fallocate_dispatch(struct file *file, int mode,
3277                              loff_t offset, loff_t len)
3278 {
3279         struct bch_inode_info *inode = file_bch_inode(file);
3280         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3281         long ret;
3282
3283         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
3284                 return -EROFS;
3285
3286         inode_lock(&inode->v);
3287         inode_dio_wait(&inode->v);
3288         bch2_pagecache_block_get(inode);
3289
3290         ret = file_modified(file);
3291         if (ret)
3292                 goto err;
3293
3294         if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
3295                 ret = bchfs_fallocate(inode, mode, offset, len);
3296         else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
3297                 ret = bchfs_fpunch(inode, offset, len);
3298         else if (mode == FALLOC_FL_INSERT_RANGE)
3299                 ret = bchfs_fcollapse_finsert(inode, offset, len, true);
3300         else if (mode == FALLOC_FL_COLLAPSE_RANGE)
3301                 ret = bchfs_fcollapse_finsert(inode, offset, len, false);
3302         else
3303                 ret = -EOPNOTSUPP;
3304 err:
3305         bch2_pagecache_block_put(inode);
3306         inode_unlock(&inode->v);
3307         bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
3308
3309         return bch2_err_class(ret);
3310 }
3311
3312 /*
3313  * Take a quota reservation for unallocated blocks in a given file range
3314  * Does not check pagecache
3315  */
3316 static int quota_reserve_range(struct bch_inode_info *inode,
3317                                struct quota_res *res,
3318                                u64 start, u64 end)
3319 {
3320         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3321         struct btree_trans trans;
3322         struct btree_iter iter;
3323         struct bkey_s_c k;
3324         u32 snapshot;
3325         u64 sectors = end - start;
3326         u64 pos = start;
3327         int ret;
3328
3329         bch2_trans_init(&trans, c, 0, 0);
3330 retry:
3331         bch2_trans_begin(&trans);
3332
3333         ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
3334         if (ret)
3335                 goto err;
3336
3337         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
3338                              SPOS(inode->v.i_ino, pos, snapshot), 0);
3339
3340         while (!(ret = btree_trans_too_many_iters(&trans)) &&
3341                (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
3342                !(ret = bkey_err(k))) {
3343                 if (bkey_extent_is_allocation(k.k)) {
3344                         u64 s = min(end, k.k->p.offset) -
3345                                 max(start, bkey_start_offset(k.k));
3346                         BUG_ON(s > sectors);
3347                         sectors -= s;
3348                 }
3349                 bch2_btree_iter_advance(&iter);
3350         }
3351         pos = iter.pos.offset;
3352         bch2_trans_iter_exit(&trans, &iter);
3353 err:
3354         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3355                 goto retry;
3356
3357         bch2_trans_exit(&trans);
3358
3359         if (ret)
3360                 return ret;
3361
3362         return bch2_quota_reservation_add(c, inode, res, sectors, true);
3363 }
3364
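/*
 * reflink: entry point for FICLONE/FICLONERANGE and copy_file_range().
 * After the usual remap checks, the destination pagecache is invalidated,
 * a quota reservation is taken for the destination range, and the source
 * extents are remapped into the destination with bch2_remap_range().
 */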
3365 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
3366                              struct file *file_dst, loff_t pos_dst,
3367                              loff_t len, unsigned remap_flags)
3368 {
3369         struct bch_inode_info *src = file_bch_inode(file_src);
3370         struct bch_inode_info *dst = file_bch_inode(file_dst);
3371         struct bch_fs *c = src->v.i_sb->s_fs_info;
3372         struct quota_res quota_res = { 0 };
3373         s64 i_sectors_delta = 0;
3374         u64 aligned_len;
3375         loff_t ret = 0;
3376
3377         if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
3378                 return -EINVAL;
3379
3380         if (remap_flags & REMAP_FILE_DEDUP)
3381                 return -EOPNOTSUPP;
3382
3383         if ((pos_src & (block_bytes(c) - 1)) ||
3384             (pos_dst & (block_bytes(c) - 1)))
3385                 return -EINVAL;
3386
3387         if (src == dst &&
3388             abs(pos_src - pos_dst) < len)
3389                 return -EINVAL;
3390
3391         bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
3392
3393         inode_dio_wait(&src->v);
3394         inode_dio_wait(&dst->v);
3395
3396         ret = generic_remap_file_range_prep(file_src, pos_src,
3397                                             file_dst, pos_dst,
3398                                             &len, remap_flags);
3399         if (ret < 0 || len == 0)
3400                 goto err;
3401
3402         aligned_len = round_up((u64) len, block_bytes(c));
3403
3404         ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
3405                                 pos_dst, pos_dst + len - 1);
3406         if (ret)
3407                 goto err;
3408
3409         ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
3410                                   (pos_dst + aligned_len) >> 9);
3411         if (ret)
3412                 goto err;
3413
3414         file_update_time(file_dst);
3415
3416         mark_pagecache_unallocated(src, pos_src >> 9,
3417                                    (pos_src + aligned_len) >> 9);
3418
3419         ret = bch2_remap_range(c,
3420                                inode_inum(dst), pos_dst >> 9,
3421                                inode_inum(src), pos_src >> 9,
3422                                aligned_len >> 9,
3423                                pos_dst + len, &i_sectors_delta);
3424         if (ret < 0)
3425                 goto err;
3426
3427         /*
3428          * due to alignment, we might have remapped slightly more than requested
3429          */
3430         ret = min((u64) ret << 9, (u64) len);
3431
3432         i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
3433
3434         spin_lock(&dst->v.i_lock);
3435         if (pos_dst + ret > dst->v.i_size)
3436                 i_size_write(&dst->v, pos_dst + ret);
3437         spin_unlock(&dst->v.i_lock);
3438
3439         if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
3440             IS_SYNC(file_inode(file_dst)))
3441                 ret = bch2_flush_inode(c, dst);
3442 err:
3443         bch2_quota_reservation_put(c, dst, &quota_res);
3444         bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
3445
3446         return bch2_err_class(ret);
3447 }
3448
3449 /* fseek: */
3450
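/*
 * Returns the byte offset within the folio of the first sector at or after
 * @offset that is dirty or better, or -1 if there is none.
 */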
3451 static int folio_data_offset(struct folio *folio, unsigned offset)
3452 {
3453         struct bch_page_state *s = bch2_page_state(&folio->page);
3454         unsigned i;
3455
3456         if (s)
3457                 for (i = offset >> 9; i < PAGE_SECTORS; i++)
3458                         if (s->s[i].state >= SECTOR_DIRTY)
3459                                 return i << 9;
3460
3461         return -1;
3462 }
3463
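/*
 * Scan the pagecache between @start_offset and @end_offset for the first
 * dirty data not yet reflected in the extents btree; returns @end_offset if
 * none is found.
 */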
3464 static loff_t bch2_seek_pagecache_data(struct inode *vinode,
3465                                        loff_t start_offset,
3466                                        loff_t end_offset)
3467 {
3468         struct folio_batch fbatch;
3469         pgoff_t start_index     = start_offset >> PAGE_SHIFT;
3470         pgoff_t end_index       = end_offset >> PAGE_SHIFT;
3471         pgoff_t index           = start_index;
3472         unsigned i;
3473         loff_t ret;
3474         int offset;
3475
3476         folio_batch_init(&fbatch);
3477
3478         while (filemap_get_folios(vinode->i_mapping,
3479                                   &index, end_index, &fbatch)) {
3480                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
3481                         struct folio *folio = fbatch.folios[i];
3482
3483                         folio_lock(folio);
3484                         offset = folio_data_offset(folio,
3485                                         folio->index == start_index
3486                                         ? start_offset & (PAGE_SIZE - 1)
3487                                         : 0);
3488                         if (offset >= 0) {
3489                                 ret = clamp(((loff_t) folio->index << PAGE_SHIFT) +
3490                                             offset,
3491                                             start_offset, end_offset);
3492                                 folio_unlock(folio);
3493                                 folio_batch_release(&fbatch);
3494                                 return ret;
3495                         }
3496                         folio_unlock(folio);
3497                 }
3498                 folio_batch_release(&fbatch);
3499                 cond_resched();
3500         }
3501
3502         return end_offset;
3503 }
3504
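/*
 * SEEK_DATA: find the first extent at or after @offset in the extents btree,
 * then check the pagecache for dirty data the btree doesn't know about yet.
 */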
3505 static loff_t bch2_seek_data(struct file *file, u64 offset)
3506 {
3507         struct bch_inode_info *inode = file_bch_inode(file);
3508         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3509         struct btree_trans trans;
3510         struct btree_iter iter;
3511         struct bkey_s_c k;
3512         subvol_inum inum = inode_inum(inode);
3513         u64 isize, next_data = MAX_LFS_FILESIZE;
3514         u32 snapshot;
3515         int ret;
3516
3517         isize = i_size_read(&inode->v);
3518         if (offset >= isize)
3519                 return -ENXIO;
3520
3521         bch2_trans_init(&trans, c, 0, 0);
3522 retry:
3523         bch2_trans_begin(&trans);
3524
3525         ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
3526         if (ret)
3527                 goto err;
3528
3529         for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents,
3530                            SPOS(inode->v.i_ino, offset >> 9, snapshot),
3531                            POS(inode->v.i_ino, U64_MAX),
3532                            0, k, ret) {
3533                 if (bkey_extent_is_data(k.k)) {
3534                         next_data = max(offset, bkey_start_offset(k.k) << 9);
3535                         break;
3536                 } else if (k.k->p.offset >> 9 > isize)
3537                         break;
3538         }
3539         bch2_trans_iter_exit(&trans, &iter);
3540 err:
3541         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3542                 goto retry;
3543
3544         bch2_trans_exit(&trans);
3545         if (ret)
3546                 return ret;
3547
3548         if (next_data > offset)
3549                 next_data = bch2_seek_pagecache_data(&inode->v,
3550                                                      offset, next_data);
3551
3552         if (next_data >= isize)
3553                 return -ENXIO;
3554
3555         return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
3556 }
3557
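/*
 * Returns the byte offset within the page of the first sector at or after
 * @offset that isn't dirty, or -1 if none; a page with no bch_page_state is
 * treated as a hole.
 */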
3558 static int __page_hole_offset(struct page *page, unsigned offset)
3559 {
3560         struct bch_page_state *s = bch2_page_state(page);
3561         unsigned i;
3562
3563         if (!s)
3564                 return 0;
3565
3566         for (i = offset >> 9; i < PAGE_SECTORS; i++)
3567                 if (s->s[i].state < SECTOR_DIRTY)
3568                         return i << 9;
3569
3570         return -1;
3571 }
3572
3573 static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
3574 {
3575         pgoff_t index = offset >> PAGE_SHIFT;
3576         struct page *page;
3577         int pg_offset;
3578         loff_t ret = -1;
3579
3580         page = find_lock_page(mapping, index);
3581         if (!page)
3582                 return offset;
3583
3584         pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
3585         if (pg_offset >= 0)
3586                 ret = ((loff_t) index << PAGE_SHIFT) + pg_offset;
3587
3588         unlock_page(page);
3589
3590         return ret;
3591 }
3592
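/*
 * Scan the pagecache between @start_offset and @end_offset for the first
 * offset that isn't dirty; unlike the data scan, a missing page counts as a
 * hole.
 */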
3593 static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
3594                                        loff_t start_offset,
3595                                        loff_t end_offset)
3596 {
3597         struct address_space *mapping = vinode->i_mapping;
3598         loff_t offset = start_offset, hole;
3599
3600         while (offset < end_offset) {
3601                 hole = page_hole_offset(mapping, offset);
3602                 if (hole >= 0 && hole <= end_offset)
3603                         return max(start_offset, hole);
3604
3605                 offset += PAGE_SIZE;
3606                 offset &= PAGE_MASK;
3607         }
3608
3609         return end_offset;
3610 }
3611
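/*
 * SEEK_HOLE: walk extents from @offset; for each gap or non-data extent,
 * check the pagecache in case dirty pages have filled the hole, and return
 * the first offset that is a hole in both the btree and the pagecache.
 */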
3612 static loff_t bch2_seek_hole(struct file *file, u64 offset)
3613 {
3614         struct bch_inode_info *inode = file_bch_inode(file);
3615         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3616         struct btree_trans trans;
3617         struct btree_iter iter;
3618         struct bkey_s_c k;
3619         subvol_inum inum = inode_inum(inode);
3620         u64 isize, next_hole = MAX_LFS_FILESIZE;
3621         u32 snapshot;
3622         int ret;
3623
3624         isize = i_size_read(&inode->v);
3625         if (offset >= isize)
3626                 return -ENXIO;
3627
3628         bch2_trans_init(&trans, c, 0, 0);
3629 retry:
3630         bch2_trans_begin(&trans);
3631
3632         ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
3633         if (ret)
3634                 goto err;
3635
3636         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
3637                            SPOS(inode->v.i_ino, offset >> 9, snapshot),
3638                            BTREE_ITER_SLOTS, k, ret) {
3639                 if (k.k->p.inode != inode->v.i_ino) {
3640                         next_hole = bch2_seek_pagecache_hole(&inode->v,
3641                                         offset, MAX_LFS_FILESIZE);
3642                         break;
3643                 } else if (!bkey_extent_is_data(k.k)) {
3644                         next_hole = bch2_seek_pagecache_hole(&inode->v,
3645                                         max(offset, bkey_start_offset(k.k) << 9),
3646                                         k.k->p.offset << 9);
3647
3648                         if (next_hole < k.k->p.offset << 9)
3649                                 break;
3650                 } else {
3651                         offset = max(offset, bkey_start_offset(k.k) << 9);
3652                 }
3653         }
3654         bch2_trans_iter_exit(&trans, &iter);
3655 err:
3656         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3657                 goto retry;
3658
3659         bch2_trans_exit(&trans);
3660         if (ret)
3661                 return ret;
3662
3663         if (next_hole > isize)
3664                 next_hole = isize;
3665
3666         return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
3667 }
3668
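/*
 * llseek: SEEK_SET/SEEK_CUR/SEEK_END go through generic_file_llseek();
 * SEEK_DATA and SEEK_HOLE need the btree + pagecache walks above.
 */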
3669 loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
3670 {
3671         loff_t ret;
3672
3673         switch (whence) {
3674         case SEEK_SET:
3675         case SEEK_CUR:
3676         case SEEK_END:
3677                 ret = generic_file_llseek(file, offset, whence);
3678                 break;
3679         case SEEK_DATA:
3680                 ret = bch2_seek_data(file, offset);
3681                 break;
3682         case SEEK_HOLE:
3683                 ret = bch2_seek_hole(file, offset);
3684                 break;
3685         default:
3686                 ret = -EINVAL;
3687                 break;
3688         }
3689
3690         return bch2_err_class(ret);
3691 }
3692
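/* Setup/teardown of the biosets used by the writepage, DIO and nocow flush paths: */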
3693 void bch2_fs_fsio_exit(struct bch_fs *c)
3694 {
3695         bioset_exit(&c->nocow_flush_bioset);
3696         bioset_exit(&c->dio_write_bioset);
3697         bioset_exit(&c->dio_read_bioset);
3698         bioset_exit(&c->writepage_bioset);
3699 }
3700
3701 int bch2_fs_fsio_init(struct bch_fs *c)
3702 {
3703         int ret = 0;
3704
3705         pr_verbose_init(c->opts, "");
3706
3707         if (bioset_init(&c->writepage_bioset,
3708                         4, offsetof(struct bch_writepage_io, op.wbio.bio),
3709                         BIOSET_NEED_BVECS))
3710                 return -BCH_ERR_ENOMEM_writepage_bioset_init;
3711
3712         if (bioset_init(&c->dio_read_bioset,
3713                         4, offsetof(struct dio_read, rbio.bio),
3714                         BIOSET_NEED_BVECS))
3715                 return -BCH_ERR_ENOMEM_dio_read_bioset_init;
3716
3717         if (bioset_init(&c->dio_write_bioset,
3718                         4, offsetof(struct dio_write, op.wbio.bio),
3719                         BIOSET_NEED_BVECS))
3720                 return -BCH_ERR_ENOMEM_dio_write_bioset_init;
3721
3722         if (bioset_init(&c->nocow_flush_bioset,
3723                         1, offsetof(struct nocow_flush, bio), 0))
3724                 return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
3725
3726         pr_verbose_init(c->opts, "ret %i", ret);
3727         return ret;
3728 }
3729
3730 #endif /* NO_BCACHEFS_FS */