1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "alloc_foreground.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "clock.h"
10 #include "error.h"
11 #include "extents.h"
12 #include "extent_update.h"
13 #include "fs.h"
14 #include "fs-io.h"
15 #include "fsck.h"
16 #include "inode.h"
17 #include "journal.h"
18 #include "io.h"
19 #include "keylist.h"
20 #include "quota.h"
21 #include "reflink.h"
22
23 #include <linux/aio.h>
24 #include <linux/backing-dev.h>
25 #include <linux/falloc.h>
26 #include <linux/migrate.h>
27 #include <linux/mmu_context.h>
28 #include <linux/pagevec.h>
29 #include <linux/rmap.h>
30 #include <linux/sched/signal.h>
31 #include <linux/task_io_accounting_ops.h>
32 #include <linux/uio.h>
33 #include <linux/writeback.h>
34
35 #include <trace/events/bcachefs.h>
36 #include <trace/events/writeback.h>
37
38 static inline bool bio_full(struct bio *bio, unsigned len)
39 {
40         if (bio->bi_vcnt >= bio->bi_max_vecs)
41                 return true;
42         if (bio->bi_iter.bi_size > UINT_MAX - len)
43                 return true;
44         return false;
45 }
46
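/*
 * faults_disabled_mapping points at an address_space the current task must
 * not fault on (set by the dio write path while it holds pagecache locks on
 * that mapping); the low bit of the pointer is used as a flag recording that
 * bch2_page_fault had to drop locks:
 */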
47 static inline struct address_space *faults_disabled_mapping(void)
48 {
49         return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
50 }
51
52 static inline void set_fdm_dropped_locks(void)
53 {
54         current->faults_disabled_mapping =
55                 (void *) (((unsigned long) current->faults_disabled_mapping)|1);
56 }
57
58 static inline bool fdm_dropped_locks(void)
59 {
60         return ((unsigned long) current->faults_disabled_mapping) & 1;
61 }
62
63 struct quota_res {
64         u64                             sectors;
65 };
66
67 struct bch_writepage_io {
68         struct closure                  cl;
69         struct bch_inode_info           *inode;
70
71         /* must be last: */
72         struct bch_write_op             op;
73 };
74
75 struct dio_write {
76         struct completion               done;
77         struct kiocb                    *req;
78         struct mm_struct                *mm;
79         unsigned                        loop:1,
80                                         sync:1,
81                                         free_iov:1;
82         struct quota_res                quota_res;
83         u64                             written;
84
85         struct iov_iter                 iter;
86         struct iovec                    inline_vecs[2];
87
88         /* must be last: */
89         struct bch_write_op             op;
90 };
91
92 struct dio_read {
93         struct closure                  cl;
94         struct kiocb                    *req;
95         long                            ret;
96         bool                            should_dirty;
97         struct bch_read_bio             rbio;
98 };
99
100 /* pagecache_block must be held */
101 static int write_invalidate_inode_pages_range(struct address_space *mapping,
102                                               loff_t start, loff_t end)
103 {
104         int ret;
105
106         /*
107          * XXX: the way this is currently implemented, we can spin if a process
108          * is continually redirtying a specific page
109          */
110         do {
111                 if (!mapping->nrpages)
112                         return 0;
113
114                 ret = filemap_write_and_wait_range(mapping, start, end);
115                 if (ret)
116                         break;
117
118                 if (!mapping->nrpages)
119                         return 0;
120
121                 ret = invalidate_inode_pages2_range(mapping,
122                                 start >> PAGE_SHIFT,
123                                 end >> PAGE_SHIFT);
124         } while (ret == -EBUSY);
125
126         return ret;
127 }
128
129 /* quotas */
130
131 #ifdef CONFIG_BCACHEFS_QUOTA
132
133 static void bch2_quota_reservation_put(struct bch_fs *c,
134                                        struct bch_inode_info *inode,
135                                        struct quota_res *res)
136 {
137         if (!res->sectors)
138                 return;
139
140         mutex_lock(&inode->ei_quota_lock);
141         BUG_ON(res->sectors > inode->ei_quota_reserved);
142
143         bch2_quota_acct(c, inode->ei_qid, Q_SPC,
144                         -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
145         inode->ei_quota_reserved -= res->sectors;
146         mutex_unlock(&inode->ei_quota_lock);
147
148         res->sectors = 0;
149 }
150
151 static int bch2_quota_reservation_add(struct bch_fs *c,
152                                       struct bch_inode_info *inode,
153                                       struct quota_res *res,
154                                       unsigned sectors,
155                                       bool check_enospc)
156 {
157         int ret;
158
159         mutex_lock(&inode->ei_quota_lock);
160         ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
161                               check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
162         if (likely(!ret)) {
163                 inode->ei_quota_reserved += sectors;
164                 res->sectors += sectors;
165         }
166         mutex_unlock(&inode->ei_quota_lock);
167
168         return ret;
169 }
170
171 #else
172
173 static void bch2_quota_reservation_put(struct bch_fs *c,
174                                        struct bch_inode_info *inode,
175                                        struct quota_res *res)
176 {
177 }
178
179 static int bch2_quota_reservation_add(struct bch_fs *c,
180                                       struct bch_inode_info *inode,
181                                       struct quota_res *res,
182                                       unsigned sectors,
183                                       bool check_enospc)
184 {
185         return 0;
186 }
187
188 #endif
189
190 /* i_size updates: */
191
192 struct inode_new_size {
193         loff_t          new_size;
194         u64             now;
195         unsigned        fields;
196 };
197
198 static int inode_set_size(struct bch_inode_info *inode,
199                           struct bch_inode_unpacked *bi,
200                           void *p)
201 {
202         struct inode_new_size *s = p;
203
204         bi->bi_size = s->new_size;
205         if (s->fields & ATTR_ATIME)
206                 bi->bi_atime = s->now;
207         if (s->fields & ATTR_MTIME)
208                 bi->bi_mtime = s->now;
209         if (s->fields & ATTR_CTIME)
210                 bi->bi_ctime = s->now;
211
212         return 0;
213 }
214
215 int __must_check bch2_write_inode_size(struct bch_fs *c,
216                                        struct bch_inode_info *inode,
217                                        loff_t new_size, unsigned fields)
218 {
219         struct inode_new_size s = {
220                 .new_size       = new_size,
221                 .now            = bch2_current_time(c),
222                 .fields         = fields,
223         };
224
225         return bch2_write_inode(c, inode, inode_set_size, &s, fields);
226 }
227
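/*
 * Adjust the vfs inode's i_blocks and, under ei_quota_lock, either consume
 * previously reserved quota (when quota_res is passed in) or account the
 * change directly against the inode's quota:
 */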
228 static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
229                            struct quota_res *quota_res, s64 sectors)
230 {
231         if (!sectors)
232                 return;
233
234         mutex_lock(&inode->ei_quota_lock);
235         bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
236                                 "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
237                                 inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
238                                 inode->ei_inode.bi_sectors);
239         inode->v.i_blocks += sectors;
240
241 #ifdef CONFIG_BCACHEFS_QUOTA
242         if (quota_res && sectors > 0) {
243                 BUG_ON(sectors > quota_res->sectors);
244                 BUG_ON(sectors > inode->ei_quota_reserved);
245
246                 quota_res->sectors -= sectors;
247                 inode->ei_quota_reserved -= sectors;
248         } else {
249                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
250         }
251 #endif
252         mutex_unlock(&inode->ei_quota_lock);
253 }
254
255 /* page state: */
256
257 /* stored in page->private: */
258
259 struct bch_page_sector {
260         /* Uncompressed, fully allocated replicas (or on disk reservation): */
261         unsigned                nr_replicas:4;
262
263         /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
264         unsigned                replicas_reserved:4;
265
266         /* i_sectors: */
267         enum {
268                 SECTOR_UNALLOCATED,
269                 SECTOR_RESERVED,
270                 SECTOR_DIRTY,
271                 SECTOR_DIRTY_RESERVED,
272                 SECTOR_ALLOCATED,
273         }                       state:8;
274 };
275
276 struct bch_page_state {
277         spinlock_t              lock;
278         atomic_t                write_count;
279         bool                    uptodate;
280         struct bch_page_sector  s[PAGE_SECTORS];
281 };
282
283 static inline struct bch_page_state *__bch2_page_state(struct page *page)
284 {
285         return page_has_private(page)
286                 ? (struct bch_page_state *) page_private(page)
287                 : NULL;
288 }
289
290 static inline struct bch_page_state *bch2_page_state(struct page *page)
291 {
292         EBUG_ON(!PageLocked(page));
293
294         return __bch2_page_state(page);
295 }
296
297 /* for newly allocated pages: */
298 static void __bch2_page_state_release(struct page *page)
299 {
300         kfree(detach_page_private(page));
301 }
302
303 static void bch2_page_state_release(struct page *page)
304 {
305         EBUG_ON(!PageLocked(page));
306         __bch2_page_state_release(page);
307 }
308
309 /* for newly allocated pages: */
310 static struct bch_page_state *__bch2_page_state_create(struct page *page,
311                                                        gfp_t gfp)
312 {
313         struct bch_page_state *s;
314
315         s = kzalloc(sizeof(*s), GFP_NOFS|gfp);
316         if (!s)
317                 return NULL;
318
319         spin_lock_init(&s->lock);
320         attach_page_private(page, s);
321         return s;
322 }
323
324 static struct bch_page_state *bch2_page_state_create(struct page *page,
325                                                      gfp_t gfp)
326 {
327         return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
328 }
329
330 static unsigned bkey_to_sector_state(const struct bkey *k)
331 {
332         if (k->type == KEY_TYPE_reservation)
333                 return SECTOR_RESERVED;
334         if (bkey_extent_is_allocation(k))
335                 return SECTOR_ALLOCATED;
336         return SECTOR_UNALLOCATED;
337 }
338
339 static void __bch2_page_state_set(struct page *page,
340                                   unsigned pg_offset, unsigned pg_len,
341                                   unsigned nr_ptrs, unsigned state)
342 {
343         struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL);
344         unsigned i;
345
346         BUG_ON(pg_offset >= PAGE_SECTORS);
347         BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
348
349         spin_lock(&s->lock);
350
351         for (i = pg_offset; i < pg_offset + pg_len; i++) {
352                 s->s[i].nr_replicas = nr_ptrs;
353                 s->s[i].state = state;
354         }
355
356         if (i == PAGE_SECTORS)
357                 s->uptodate = true;
358
359         spin_unlock(&s->lock);
360 }
361
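/*
 * Initialize bch_page_state for @pages by walking the extents btree:
 * record, for each sector, how many fully allocated replicas back it and
 * whether it is unallocated, reserved or allocated on disk:
 */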
362 static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum,
363                                struct page **pages, unsigned nr_pages)
364 {
365         struct btree_trans trans;
366         struct btree_iter iter;
367         struct bkey_s_c k;
368         u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT;
369         unsigned pg_idx = 0;
370         u32 snapshot;
371         int ret;
372
373         bch2_trans_init(&trans, c, 0, 0);
374 retry:
375         bch2_trans_begin(&trans);
376
377         ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
378         if (ret)
379                 goto err;
380
381         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
382                            SPOS(inum.inum, offset, snapshot),
383                            BTREE_ITER_SLOTS, k, ret) {
384                 unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
385                 unsigned state = bkey_to_sector_state(k.k);
386
387                 while (pg_idx < nr_pages) {
388                         struct page *page = pages[pg_idx];
389                         u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
390                         u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
391                         unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start;
392                         unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start;
393
394                         BUG_ON(k.k->p.offset < pg_start);
395                         BUG_ON(bkey_start_offset(k.k) > pg_end);
396
397                         if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate)
398                                 __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state);
399
400                         if (k.k->p.offset < pg_end)
401                                 break;
402                         pg_idx++;
403                 }
404
405                 if (pg_idx == nr_pages)
406                         break;
407         }
408
409         offset = iter.pos.offset;
410         bch2_trans_iter_exit(&trans, &iter);
411 err:
412         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
413                 goto retry;
414         bch2_trans_exit(&trans);
415
416         return ret;
417 }
418
419 static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
420 {
421         struct bvec_iter iter;
422         struct bio_vec bv;
423         unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
424                 ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
425         unsigned state = bkey_to_sector_state(k.k);
426
427         bio_for_each_segment(bv, bio, iter)
428                 __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9,
429                                       bv.bv_len >> 9, nr_ptrs, state);
430 }
431
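/*
 * Clear the cached nr_replicas counts for sectors in [start, end) on any
 * pages still in the pagecache, so that future writes to those sectors take
 * fresh disk reservations:
 */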
432 static void mark_pagecache_unallocated(struct bch_inode_info *inode,
433                                        u64 start, u64 end)
434 {
435         pgoff_t index = start >> PAGE_SECTORS_SHIFT;
436         pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
437         struct pagevec pvec;
438
439         if (end <= start)
440                 return;
441
442         pagevec_init(&pvec);
443
444         do {
445                 unsigned nr_pages, i, j;
446
447                 nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
448                                                 &index, end_index);
449                 for (i = 0; i < nr_pages; i++) {
450                         struct page *page = pvec.pages[i];
451                         u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
452                         u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
453                         unsigned pg_offset = max(start, pg_start) - pg_start;
454                         unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
455                         struct bch_page_state *s;
456
457                         BUG_ON(end <= pg_start);
458                         BUG_ON(pg_offset >= PAGE_SECTORS);
459                         BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
460
461                         lock_page(page);
462                         s = bch2_page_state(page);
463
464                         if (s) {
465                                 spin_lock(&s->lock);
466                                 for (j = pg_offset; j < pg_offset + pg_len; j++)
467                                         s->s[j].nr_replicas = 0;
468                                 spin_unlock(&s->lock);
469                         }
470
471                         unlock_page(page);
472                 }
473                 pagevec_release(&pvec);
474         } while (index <= end_index);
475 }
476
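/*
 * After taking an on-disk reservation for [start, end) (e.g. from fallocate),
 * update cached page state: unallocated sectors become reserved, dirty
 * sectors become dirty-and-reserved, and i_blocks is adjusted to avoid
 * double counting sectors that are both dirty and reserved:
 */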
477 static void mark_pagecache_reserved(struct bch_inode_info *inode,
478                                     u64 start, u64 end)
479 {
480         struct bch_fs *c = inode->v.i_sb->s_fs_info;
481         pgoff_t index = start >> PAGE_SECTORS_SHIFT;
482         pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
483         struct pagevec pvec;
484         s64 i_sectors_delta = 0;
485
486         if (end <= start)
487                 return;
488
489         pagevec_init(&pvec);
490
491         do {
492                 unsigned nr_pages, i, j;
493
494                 nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
495                                                 &index, end_index);
496                 for (i = 0; i < nr_pages; i++) {
497                         struct page *page = pvec.pages[i];
498                         u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
499                         u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
500                         unsigned pg_offset = max(start, pg_start) - pg_start;
501                         unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
502                         struct bch_page_state *s;
503
504                         BUG_ON(end <= pg_start);
505                         BUG_ON(pg_offset >= PAGE_SECTORS);
506                         BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
507
508                         lock_page(page);
509                         s = bch2_page_state(page);
510
511                         if (s) {
512                                 spin_lock(&s->lock);
513                                 for (j = pg_offset; j < pg_offset + pg_len; j++)
514                                         switch (s->s[j].state) {
515                                         case SECTOR_UNALLOCATED:
516                                                 s->s[j].state = SECTOR_RESERVED;
517                                                 break;
518                                         case SECTOR_DIRTY:
519                                                 s->s[j].state = SECTOR_DIRTY_RESERVED;
520                                                 i_sectors_delta--;
521                                                 break;
522                                         default:
523                                                 break;
524                                         }
525                                 spin_unlock(&s->lock);
526                         }
527
528                         unlock_page(page);
529                 }
530                 pagevec_release(&pvec);
531         } while (index <= end_index);
532
533         i_sectors_acct(c, inode, NULL, i_sectors_delta);
534 }
535
536 static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
537 {
538         /* XXX: this should not be open coded */
539         return inode->ei_inode.bi_data_replicas
540                 ? inode->ei_inode.bi_data_replicas - 1
541                 : c->opts.data_replicas;
542 }
543
544 static inline unsigned sectors_to_reserve(struct bch_page_sector *s,
545                                                   unsigned nr_replicas)
546 {
547         return max(0, (int) nr_replicas -
548                    s->nr_replicas -
549                    s->replicas_reserved);
550 }
551
552 static int bch2_get_page_disk_reservation(struct bch_fs *c,
553                                 struct bch_inode_info *inode,
554                                 struct page *page, bool check_enospc)
555 {
556         struct bch_page_state *s = bch2_page_state_create(page, 0);
557         unsigned nr_replicas = inode_nr_replicas(c, inode);
558         struct disk_reservation disk_res = { 0 };
559         unsigned i, disk_res_sectors = 0;
560         int ret;
561
562         if (!s)
563                 return -ENOMEM;
564
565         for (i = 0; i < ARRAY_SIZE(s->s); i++)
566                 disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
567
568         if (!disk_res_sectors)
569                 return 0;
570
571         ret = bch2_disk_reservation_get(c, &disk_res,
572                                         disk_res_sectors, 1,
573                                         !check_enospc
574                                         ? BCH_DISK_RESERVATION_NOFAIL
575                                         : 0);
576         if (unlikely(ret))
577                 return ret;
578
579         for (i = 0; i < ARRAY_SIZE(s->s); i++)
580                 s->s[i].replicas_reserved +=
581                         sectors_to_reserve(&s->s[i], nr_replicas);
582
583         return 0;
584 }
585
586 struct bch2_page_reservation {
587         struct disk_reservation disk;
588         struct quota_res        quota;
589 };
590
591 static void bch2_page_reservation_init(struct bch_fs *c,
592                         struct bch_inode_info *inode,
593                         struct bch2_page_reservation *res)
594 {
595         memset(res, 0, sizeof(*res));
596
597         res->disk.nr_replicas = inode_nr_replicas(c, inode);
598 }
599
600 static void bch2_page_reservation_put(struct bch_fs *c,
601                         struct bch_inode_info *inode,
602                         struct bch2_page_reservation *res)
603 {
604         bch2_disk_reservation_put(c, &res->disk);
605         bch2_quota_reservation_put(c, inode, &res->quota);
606 }
607
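/*
 * Reserve space for a write to [offset, offset + len) within @page: take a
 * disk reservation for sectors not already allocated or reserved, and a
 * quota reservation for sectors that are currently unallocated:
 */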
608 static int bch2_page_reservation_get(struct bch_fs *c,
609                         struct bch_inode_info *inode, struct page *page,
610                         struct bch2_page_reservation *res,
611                         unsigned offset, unsigned len, bool check_enospc)
612 {
613         struct bch_page_state *s = bch2_page_state_create(page, 0);
614         unsigned i, disk_sectors = 0, quota_sectors = 0;
615         int ret;
616
617         if (!s)
618                 return -ENOMEM;
619
620         BUG_ON(!s->uptodate);
621
622         for (i = round_down(offset, block_bytes(c)) >> 9;
623              i < round_up(offset + len, block_bytes(c)) >> 9;
624              i++) {
625                 disk_sectors += sectors_to_reserve(&s->s[i],
626                                                 res->disk.nr_replicas);
627                 quota_sectors += s->s[i].state == SECTOR_UNALLOCATED;
628         }
629
630         if (disk_sectors) {
631                 ret = bch2_disk_reservation_add(c, &res->disk,
632                                                 disk_sectors,
633                                                 !check_enospc
634                                                 ? BCH_DISK_RESERVATION_NOFAIL
635                                                 : 0);
636                 if (unlikely(ret))
637                         return ret;
638         }
639
640         if (quota_sectors) {
641                 ret = bch2_quota_reservation_add(c, inode, &res->quota,
642                                                  quota_sectors,
643                                                  check_enospc);
644                 if (unlikely(ret)) {
645                         struct disk_reservation tmp = {
646                                 .sectors = disk_sectors
647                         };
648
649                         bch2_disk_reservation_put(c, &tmp);
650                         res->disk.sectors -= disk_sectors;
651                         return ret;
652                 }
653         }
654
655         return 0;
656 }
657
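/*
 * Called when a page is being removed from the pagecache (invalidate or
 * release): give back any disk reservation it held and undo the i_blocks
 * accounting for sectors that were dirty but never written:
 */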
658 static void bch2_clear_page_bits(struct page *page)
659 {
660         struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
661         struct bch_fs *c = inode->v.i_sb->s_fs_info;
662         struct bch_page_state *s = bch2_page_state(page);
663         struct disk_reservation disk_res = { 0 };
664         int i, dirty_sectors = 0;
665
666         if (!s)
667                 return;
668
669         EBUG_ON(!PageLocked(page));
670         EBUG_ON(PageWriteback(page));
671
672         for (i = 0; i < ARRAY_SIZE(s->s); i++) {
673                 disk_res.sectors += s->s[i].replicas_reserved;
674                 s->s[i].replicas_reserved = 0;
675
676                 switch (s->s[i].state) {
677                 case SECTOR_DIRTY:
678                         s->s[i].state = SECTOR_UNALLOCATED;
679                         --dirty_sectors;
680                         break;
681                 case SECTOR_DIRTY_RESERVED:
682                         s->s[i].state = SECTOR_RESERVED;
683                         break;
684                 default:
685                         break;
686                 }
687         }
688
689         bch2_disk_reservation_put(c, &disk_res);
690
691         i_sectors_acct(c, inode, NULL, dirty_sectors);
692
693         bch2_page_state_release(page);
694 }
695
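/*
 * Move reservation from @res to the page's sectors, mark the written sectors
 * dirty, update i_blocks for newly dirtied sectors, and finally mark the
 * page itself dirty:
 */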
696 static void bch2_set_page_dirty(struct bch_fs *c,
697                         struct bch_inode_info *inode, struct page *page,
698                         struct bch2_page_reservation *res,
699                         unsigned offset, unsigned len)
700 {
701         struct bch_page_state *s = bch2_page_state(page);
702         unsigned i, dirty_sectors = 0;
703
704         WARN_ON((u64) page_offset(page) + offset + len >
705                 round_up((u64) i_size_read(&inode->v), block_bytes(c)));
706
707         spin_lock(&s->lock);
708
709         for (i = round_down(offset, block_bytes(c)) >> 9;
710              i < round_up(offset + len, block_bytes(c)) >> 9;
711              i++) {
712                 unsigned sectors = sectors_to_reserve(&s->s[i],
713                                                 res->disk.nr_replicas);
714
715                 /*
716                  * This can happen if we race with the error path in
717                  * bch2_writepage_io_done():
718                  */
719                 sectors = min_t(unsigned, sectors, res->disk.sectors);
720
721                 s->s[i].replicas_reserved += sectors;
722                 res->disk.sectors -= sectors;
723
724                 switch (s->s[i].state) {
725                 case SECTOR_UNALLOCATED:
726                         s->s[i].state = SECTOR_DIRTY;
727                         dirty_sectors++;
728                         break;
729                 case SECTOR_RESERVED:
730                         s->s[i].state = SECTOR_DIRTY_RESERVED;
731                         break;
732                 default:
733                         break;
734                 }
735         }
736
737         spin_unlock(&s->lock);
738
739         i_sectors_acct(c, inode, &res->quota, dirty_sectors);
740
741         if (!PageDirty(page))
742                 __set_page_dirty_nobuffers(page);
743 }
744
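/*
 * .fault handler: returns SIGBUS instead of deadlocking when the fault is on
 * the mapping the current task has locked for dio; otherwise takes the
 * pagecache add lock. When lock ordering between the two mappings would be
 * violated, it cycles the other mapping's locks, records that they were
 * dropped, and returns SIGBUS:
 */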
745 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
746 {
747         struct file *file = vmf->vma->vm_file;
748         struct address_space *mapping = file->f_mapping;
749         struct address_space *fdm = faults_disabled_mapping();
750         struct bch_inode_info *inode = file_bch_inode(file);
751         int ret;
752
753         if (fdm == mapping)
754                 return VM_FAULT_SIGBUS;
755
756         /* Lock ordering: */
757         if (fdm > mapping) {
758                 struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
759
760                 if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
761                         goto got_lock;
762
763                 bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
764
765                 bch2_pagecache_add_get(&inode->ei_pagecache_lock);
766                 bch2_pagecache_add_put(&inode->ei_pagecache_lock);
767
768                 bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
769
770                 /* Signal that lock has been dropped: */
771                 set_fdm_dropped_locks();
772                 return VM_FAULT_SIGBUS;
773         }
774
775         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
776 got_lock:
777         ret = filemap_fault(vmf);
778         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
779
780         return ret;
781 }
782
783 vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
784 {
785         struct page *page = vmf->page;
786         struct file *file = vmf->vma->vm_file;
787         struct bch_inode_info *inode = file_bch_inode(file);
788         struct address_space *mapping = file->f_mapping;
789         struct bch_fs *c = inode->v.i_sb->s_fs_info;
790         struct bch2_page_reservation res;
791         unsigned len;
792         loff_t isize;
793         int ret;
794
795         bch2_page_reservation_init(c, inode, &res);
796
797         sb_start_pagefault(inode->v.i_sb);
798         file_update_time(file);
799
800         /*
801          * Not strictly necessary, but helps avoid dio writes livelocking in
802          * write_invalidate_inode_pages_range() - can drop this if/when we get
803          * a write_invalidate_inode_pages_range() that works without dropping
804          * page lock before invalidating page
805          */
806         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
807
808         lock_page(page);
809         isize = i_size_read(&inode->v);
810
811         if (page->mapping != mapping || page_offset(page) >= isize) {
812                 unlock_page(page);
813                 ret = VM_FAULT_NOPAGE;
814                 goto out;
815         }
816
817         len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
818
819         if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
820                 if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) {
821                         unlock_page(page);
822                         ret = VM_FAULT_SIGBUS;
823                         goto out;
824                 }
825         }
826
827         if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
828                 unlock_page(page);
829                 ret = VM_FAULT_SIGBUS;
830                 goto out;
831         }
832
833         bch2_set_page_dirty(c, inode, page, &res, 0, len);
834         bch2_page_reservation_put(c, inode, &res);
835
836         wait_for_stable_page(page);
837         ret = VM_FAULT_LOCKED;
838 out:
839         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
840         sb_end_pagefault(inode->v.i_sb);
841
842         return ret;
843 }
844
845 void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
846 {
847         if (offset || length < folio_size(folio))
848                 return;
849
850         bch2_clear_page_bits(&folio->page);
851 }
852
853 bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
854 {
855         if (folio_test_dirty(folio) || folio_test_writeback(folio))
856                 return false;
857
858         bch2_clear_page_bits(&folio->page);
859         return true;
860 }
861
862 #ifdef CONFIG_MIGRATION
863 int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
864                       struct page *page, enum migrate_mode mode)
865 {
866         int ret;
867
868         EBUG_ON(!PageLocked(page));
869         EBUG_ON(!PageLocked(newpage));
870
871         ret = migrate_page_move_mapping(mapping, newpage, page, 0);
872         if (ret != MIGRATEPAGE_SUCCESS)
873                 return ret;
874
875         if (PagePrivate(page))
876                 attach_page_private(newpage, detach_page_private(page));
877
878         if (mode != MIGRATE_SYNC_NO_COPY)
879                 migrate_page_copy(newpage, page);
880         else
881                 migrate_page_states(newpage, page);
882         return MIGRATEPAGE_SUCCESS;
883 }
884 #endif
885
886 /* readpage(s): */
887
888 static void bch2_readpages_end_io(struct bio *bio)
889 {
890         struct bvec_iter_all iter;
891         struct bio_vec *bv;
892
893         bio_for_each_segment_all(bv, bio, iter) {
894                 struct page *page = bv->bv_page;
895
896                 if (!bio->bi_status) {
897                         SetPageUptodate(page);
898                 } else {
899                         ClearPageUptodate(page);
900                         SetPageError(page);
901                 }
902                 unlock_page(page);
903         }
904
905         bio_put(bio);
906 }
907
908 struct readpages_iter {
909         struct address_space    *mapping;
910         struct page             **pages;
911         unsigned                nr_pages;
912         unsigned                idx;
913         pgoff_t                 offset;
914 };
915
916 static int readpages_iter_init(struct readpages_iter *iter,
917                                struct readahead_control *ractl)
918 {
919         unsigned i, nr_pages = readahead_count(ractl);
920
921         memset(iter, 0, sizeof(*iter));
922
923         iter->mapping   = ractl->mapping;
924         iter->offset    = readahead_index(ractl);
925         iter->nr_pages  = nr_pages;
926
927         iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
928         if (!iter->pages)
929                 return -ENOMEM;
930
931         nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
932         for (i = 0; i < nr_pages; i++) {
933                 __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
934                 put_page(iter->pages[i]);
935         }
936
937         return 0;
938 }
939
940 static inline struct page *readpage_iter_next(struct readpages_iter *iter)
941 {
942         if (iter->idx >= iter->nr_pages)
943                 return NULL;
944
945         EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
946
947         return iter->pages[iter->idx];
948 }
949
950 static bool extent_partial_reads_expensive(struct bkey_s_c k)
951 {
952         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
953         struct bch_extent_crc_unpacked crc;
954         const union bch_extent_entry *i;
955
956         bkey_for_each_crc(k.k, ptrs, crc, i)
957                 if (crc.csum_type || crc.compression_type)
958                         return true;
959         return false;
960 }
961
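/*
 * Widen a read bio to cover more of the current extent, first with pages
 * from the readahead batch and then - if @get_more, i.e. partial reads of
 * this extent would be expensive - with pages newly added to the pagecache:
 */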
962 static void readpage_bio_extend(struct readpages_iter *iter,
963                                 struct bio *bio,
964                                 unsigned sectors_this_extent,
965                                 bool get_more)
966 {
967         while (bio_sectors(bio) < sectors_this_extent &&
968                bio->bi_vcnt < bio->bi_max_vecs) {
969                 pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
970                 struct page *page = readpage_iter_next(iter);
971                 int ret;
972
973                 if (page) {
974                         if (iter->offset + iter->idx != page_offset)
975                                 break;
976
977                         iter->idx++;
978                 } else {
979                         if (!get_more)
980                                 break;
981
982                         page = xa_load(&iter->mapping->i_pages, page_offset);
983                         if (page && !xa_is_value(page))
984                                 break;
985
986                         page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
987                         if (!page)
988                                 break;
989
990                         if (!__bch2_page_state_create(page, 0)) {
991                                 put_page(page);
992                                 break;
993                         }
994
995                         ret = add_to_page_cache_lru(page, iter->mapping,
996                                                     page_offset, GFP_NOFS);
997                         if (ret) {
998                                 __bch2_page_state_release(page);
999                                 put_page(page);
1000                                 break;
1001                         }
1002
1003                         put_page(page);
1004                 }
1005
1006                 BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
1007         }
1008 }
1009
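/*
 * Main buffered read loop: walk the extents btree starting at the bio's
 * sector, resolve reflink indirection, record page state for the cached
 * pages, and issue a read for each extent fragment until the whole bio has
 * been submitted:
 */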
1010 static void bchfs_read(struct btree_trans *trans,
1011                        struct bch_read_bio *rbio,
1012                        subvol_inum inum,
1013                        struct readpages_iter *readpages_iter)
1014 {
1015         struct bch_fs *c = trans->c;
1016         struct btree_iter iter;
1017         struct bkey_buf sk;
1018         int flags = BCH_READ_RETRY_IF_STALE|
1019                 BCH_READ_MAY_PROMOTE;
1020         u32 snapshot;
1021         int ret = 0;
1022
1023         rbio->c = c;
1024         rbio->start_time = local_clock();
1025         rbio->subvol = inum.subvol;
1026
1027         bch2_bkey_buf_init(&sk);
1028 retry:
1029         bch2_trans_begin(trans);
1030         iter = (struct btree_iter) { NULL };
1031
1032         ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
1033         if (ret)
1034                 goto err;
1035
1036         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
1037                              SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
1038                              BTREE_ITER_SLOTS);
1039         while (1) {
1040                 struct bkey_s_c k;
1041                 unsigned bytes, sectors, offset_into_extent;
1042                 enum btree_id data_btree = BTREE_ID_extents;
1043
1044                 /*
1045                  * read_extent -> io_time_reset may cause a transaction restart
1046                  * without returning an error, we need to check for that here:
1047                  */
1048                 ret = bch2_trans_relock(trans);
1049                 if (ret)
1050                         break;
1051
1052                 bch2_btree_iter_set_pos(&iter,
1053                                 POS(inum.inum, rbio->bio.bi_iter.bi_sector));
1054
1055                 k = bch2_btree_iter_peek_slot(&iter);
1056                 ret = bkey_err(k);
1057                 if (ret)
1058                         break;
1059
1060                 offset_into_extent = iter.pos.offset -
1061                         bkey_start_offset(k.k);
1062                 sectors = k.k->size - offset_into_extent;
1063
1064                 bch2_bkey_buf_reassemble(&sk, c, k);
1065
1066                 ret = bch2_read_indirect_extent(trans, &data_btree,
1067                                         &offset_into_extent, &sk);
1068                 if (ret)
1069                         break;
1070
1071                 k = bkey_i_to_s_c(sk.k);
1072
1073                 sectors = min(sectors, k.k->size - offset_into_extent);
1074
1075                 if (readpages_iter)
1076                         readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
1077                                             extent_partial_reads_expensive(k));
1078
1079                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
1080                 swap(rbio->bio.bi_iter.bi_size, bytes);
1081
1082                 if (rbio->bio.bi_iter.bi_size == bytes)
1083                         flags |= BCH_READ_LAST_FRAGMENT;
1084
1085                 bch2_bio_page_state_set(&rbio->bio, k);
1086
1087                 bch2_read_extent(trans, rbio, iter.pos,
1088                                  data_btree, k, offset_into_extent, flags);
1089
1090                 if (flags & BCH_READ_LAST_FRAGMENT)
1091                         break;
1092
1093                 swap(rbio->bio.bi_iter.bi_size, bytes);
1094                 bio_advance(&rbio->bio, bytes);
1095
1096                 ret = btree_trans_too_many_iters(trans);
1097                 if (ret)
1098                         break;
1099         }
1100 err:
1101         bch2_trans_iter_exit(trans, &iter);
1102
1103         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1104                 goto retry;
1105
1106         if (ret) {
1107                 bch_err_inum_ratelimited(c, inum.inum,
1108                                 "read error %i from btree lookup", ret);
1109                 rbio->bio.bi_status = BLK_STS_IOERR;
1110                 bio_endio(&rbio->bio);
1111         }
1112
1113         bch2_bkey_buf_exit(&sk, c);
1114 }
1115
1116 void bch2_readahead(struct readahead_control *ractl)
1117 {
1118         struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
1119         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1120         struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
1121         struct btree_trans trans;
1122         struct page *page;
1123         struct readpages_iter readpages_iter;
1124         int ret;
1125
1126         ret = readpages_iter_init(&readpages_iter, ractl);
1127         BUG_ON(ret);
1128
1129         bch2_trans_init(&trans, c, 0, 0);
1130
1131         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
1132
1133         while ((page = readpage_iter_next(&readpages_iter))) {
1134                 pgoff_t index = readpages_iter.offset + readpages_iter.idx;
1135                 unsigned n = min_t(unsigned,
1136                                    readpages_iter.nr_pages -
1137                                    readpages_iter.idx,
1138                                    BIO_MAX_VECS);
1139                 struct bch_read_bio *rbio =
1140                         rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
1141                                                    GFP_NOFS, &c->bio_read),
1142                                   opts);
1143
1144                 readpages_iter.idx++;
1145
1146                 rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT;
1147                 rbio->bio.bi_end_io = bch2_readpages_end_io;
1148                 BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
1149
1150                 bchfs_read(&trans, rbio, inode_inum(inode),
1151                            &readpages_iter);
1152         }
1153
1154         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
1155
1156         bch2_trans_exit(&trans);
1157         kfree(readpages_iter.pages);
1158 }
1159
1160 static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
1161                              subvol_inum inum, struct page *page)
1162 {
1163         struct btree_trans trans;
1164
1165         bch2_page_state_create(page, __GFP_NOFAIL);
1166
1167         bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
1168         rbio->bio.bi_iter.bi_sector =
1169                 (sector_t) page->index << PAGE_SECTORS_SHIFT;
1170         BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
1171
1172         bch2_trans_init(&trans, c, 0, 0);
1173         bchfs_read(&trans, rbio, inum, NULL);
1174         bch2_trans_exit(&trans);
1175 }
1176
1177 static void bch2_read_single_page_end_io(struct bio *bio)
1178 {
1179         complete(bio->bi_private);
1180 }
1181
1182 static int bch2_read_single_page(struct page *page,
1183                                  struct address_space *mapping)
1184 {
1185         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1186         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1187         struct bch_read_bio *rbio;
1188         int ret;
1189         DECLARE_COMPLETION_ONSTACK(done);
1190
1191         rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read),
1192                          io_opts(c, &inode->ei_inode));
1193         rbio->bio.bi_private = &done;
1194         rbio->bio.bi_end_io = bch2_read_single_page_end_io;
1195
1196         __bchfs_readpage(c, rbio, inode_inum(inode), page);
1197         wait_for_completion(&done);
1198
1199         ret = blk_status_to_errno(rbio->bio.bi_status);
1200         bio_put(&rbio->bio);
1201
1202         if (ret < 0)
1203                 return ret;
1204
1205         SetPageUptodate(page);
1206         return 0;
1207 }
1208
1209 int bch2_read_folio(struct file *file, struct folio *folio)
1210 {
1211         struct page *page = &folio->page;
1212         int ret;
1213
1214         ret = bch2_read_single_page(page, page->mapping);
1215         folio_unlock(folio);
1216         return bch2_err_class(ret);
1217 }
1218
1219 /* writepages: */
1220
1221 struct bch_writepage_state {
1222         struct bch_writepage_io *io;
1223         struct bch_io_opts      opts;
1224 };
1225
1226 static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
1227                                                                   struct bch_inode_info *inode)
1228 {
1229         return (struct bch_writepage_state) {
1230                 .opts = io_opts(c, &inode->ei_inode)
1231         };
1232 }
1233
1234 static void bch2_writepage_io_free(struct closure *cl)
1235 {
1236         struct bch_writepage_io *io = container_of(cl,
1237                                         struct bch_writepage_io, cl);
1238
1239         bio_put(&io->op.wbio.bio);
1240 }
1241
1242 static void bch2_writepage_io_done(struct closure *cl)
1243 {
1244         struct bch_writepage_io *io = container_of(cl,
1245                                         struct bch_writepage_io, cl);
1246         struct bch_fs *c = io->op.c;
1247         struct bio *bio = &io->op.wbio.bio;
1248         struct bvec_iter_all iter;
1249         struct bio_vec *bvec;
1250         unsigned i;
1251
1252         if (io->op.error) {
1253                 set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
1254
1255                 bio_for_each_segment_all(bvec, bio, iter) {
1256                         struct bch_page_state *s;
1257
1258                         SetPageError(bvec->bv_page);
1259                         mapping_set_error(bvec->bv_page->mapping, -EIO);
1260
1261                         s = __bch2_page_state(bvec->bv_page);
1262                         spin_lock(&s->lock);
1263                         for (i = 0; i < PAGE_SECTORS; i++)
1264                                 s->s[i].nr_replicas = 0;
1265                         spin_unlock(&s->lock);
1266                 }
1267         }
1268
1269         if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
1270                 bio_for_each_segment_all(bvec, bio, iter) {
1271                         struct bch_page_state *s;
1272
1273                         s = __bch2_page_state(bvec->bv_page);
1274                         spin_lock(&s->lock);
1275                         for (i = 0; i < PAGE_SECTORS; i++)
1276                                 s->s[i].nr_replicas = 0;
1277                         spin_unlock(&s->lock);
1278                 }
1279         }
1280
1281         /*
1282          * racing with fallocate can cause us to add fewer sectors than
1283          * expected - but we shouldn't add more sectors than expected:
1284          */
1285         WARN_ON_ONCE(io->op.i_sectors_delta > 0);
1286
1287         /*
1288          * (error (due to going RO) halfway through a page can screw that up
1289          * slightly)
1290          * XXX wtf?
1291            BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
1292          */
1293
1294         /*
1295          * PageWriteback is effectively our ref on the inode - fixup i_blocks
1296          * before calling end_page_writeback:
1297          */
1298         i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
1299
1300         bio_for_each_segment_all(bvec, bio, iter) {
1301                 struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
1302
1303                 if (atomic_dec_and_test(&s->write_count))
1304                         end_page_writeback(bvec->bv_page);
1305         }
1306
1307         closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
1308 }
1309
1310 static void bch2_writepage_do_io(struct bch_writepage_state *w)
1311 {
1312         struct bch_writepage_io *io = w->io;
1313
1314         w->io = NULL;
1315         closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
1316         continue_at(&io->cl, bch2_writepage_io_done, NULL);
1317 }
1318
1319 /*
1320  * Get a bch_writepage_io and add @page to it - appending to an existing one if
1321  * possible, else allocating a new one:
1322  */
1323 static void bch2_writepage_io_alloc(struct bch_fs *c,
1324                                     struct writeback_control *wbc,
1325                                     struct bch_writepage_state *w,
1326                                     struct bch_inode_info *inode,
1327                                     u64 sector,
1328                                     unsigned nr_replicas)
1329 {
1330         struct bch_write_op *op;
1331
1332         w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
1333                                               REQ_OP_WRITE,
1334                                               GFP_NOFS,
1335                                               &c->writepage_bioset),
1336                              struct bch_writepage_io, op.wbio.bio);
1337
1338         closure_init(&w->io->cl, NULL);
1339         w->io->inode            = inode;
1340
1341         op                      = &w->io->op;
1342         bch2_write_op_init(op, c, w->opts);
1343         op->target              = w->opts.foreground_target;
1344         op->nr_replicas         = nr_replicas;
1345         op->res.nr_replicas     = nr_replicas;
1346         op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
1347         op->subvol              = inode->ei_subvol;
1348         op->pos                 = POS(inode->v.i_ino, sector);
1349         op->wbio.bio.bi_iter.bi_sector = sector;
1350         op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
1351 }
1352
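/*
 * Write out one page: zero any portion past i_size, make sure every dirty
 * sector has a disk reservation, snapshot the per-sector state, and add each
 * contiguous run of dirty sectors to a bch_writepage_io, starting a new one
 * when the replica count changes or the I/O is full or not contiguous:
 */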
1353 static int __bch2_writepage(struct page *page,
1354                             struct writeback_control *wbc,
1355                             void *data)
1356 {
1357         struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
1358         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1359         struct bch_writepage_state *w = data;
1360         struct bch_page_state *s, orig;
1361         unsigned i, offset, nr_replicas_this_write = U32_MAX;
1362         loff_t i_size = i_size_read(&inode->v);
1363         pgoff_t end_index = i_size >> PAGE_SHIFT;
1364         int ret;
1365
1366         EBUG_ON(!PageUptodate(page));
1367
1368         /* Is the page fully inside i_size? */
1369         if (page->index < end_index)
1370                 goto do_io;
1371
1372         /* Is the page fully outside i_size? (truncate in progress) */
1373         offset = i_size & (PAGE_SIZE - 1);
1374         if (page->index > end_index || !offset) {
1375                 unlock_page(page);
1376                 return 0;
1377         }
1378
1379         /*
1380          * The page straddles i_size.  It must be zeroed out on each and every
1381          * writepage invocation because it may be mmapped.  "A file is mapped
1382          * in multiples of the page size.  For a file that is not a multiple of
1383          * the  page size, the remaining memory is zeroed when mapped, and
1384          * writes to that region are not written out to the file."
1385          */
1386         zero_user_segment(page, offset, PAGE_SIZE);
1387 do_io:
1388         s = bch2_page_state_create(page, __GFP_NOFAIL);
1389
1390         /*
1391          * Things get really hairy with errors during writeback:
1392          */
1393         ret = bch2_get_page_disk_reservation(c, inode, page, false);
1394         BUG_ON(ret);
1395
1396         /* Before unlocking the page, get copy of reservations: */
1397         spin_lock(&s->lock);
1398         orig = *s;
1399         spin_unlock(&s->lock);
1400
1401         for (i = 0; i < PAGE_SECTORS; i++) {
1402                 if (s->s[i].state < SECTOR_DIRTY)
1403                         continue;
1404
1405                 nr_replicas_this_write =
1406                         min_t(unsigned, nr_replicas_this_write,
1407                               s->s[i].nr_replicas +
1408                               s->s[i].replicas_reserved);
1409         }
1410
1411         for (i = 0; i < PAGE_SECTORS; i++) {
1412                 if (s->s[i].state < SECTOR_DIRTY)
1413                         continue;
1414
1415                 s->s[i].nr_replicas = w->opts.compression
1416                         ? 0 : nr_replicas_this_write;
1417
1418                 s->s[i].replicas_reserved = 0;
1419                 s->s[i].state = SECTOR_ALLOCATED;
1420         }
1421
1422         BUG_ON(atomic_read(&s->write_count));
1423         atomic_set(&s->write_count, 1);
1424
1425         BUG_ON(PageWriteback(page));
1426         set_page_writeback(page);
1427
1428         unlock_page(page);
1429
1430         offset = 0;
1431         while (1) {
1432                 unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
1433                 u64 sector;
1434
1435                 while (offset < PAGE_SECTORS &&
1436                        orig.s[offset].state < SECTOR_DIRTY)
1437                         offset++;
1438
1439                 if (offset == PAGE_SECTORS)
1440                         break;
1441
1442                 while (offset + sectors < PAGE_SECTORS &&
1443                        orig.s[offset + sectors].state >= SECTOR_DIRTY) {
1444                         reserved_sectors += orig.s[offset + sectors].replicas_reserved;
1445                         dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY;
1446                         sectors++;
1447                 }
1448                 BUG_ON(!sectors);
1449
1450                 sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset;
1451
1452                 if (w->io &&
1453                     (w->io->op.res.nr_replicas != nr_replicas_this_write ||
1454                      bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
1455                      w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
1456                      (BIO_MAX_VECS * PAGE_SIZE) ||
1457                      bio_end_sector(&w->io->op.wbio.bio) != sector))
1458                         bch2_writepage_do_io(w);
1459
1460                 if (!w->io)
1461                         bch2_writepage_io_alloc(c, wbc, w, inode, sector,
1462                                                 nr_replicas_this_write);
1463
1464                 atomic_inc(&s->write_count);
1465
1466                 BUG_ON(inode != w->io->inode);
1467                 BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page,
1468                                      sectors << 9, offset << 9));
1469
1470                 /* Check for writing past i_size: */
1471                 WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
1472                              round_up(i_size, block_bytes(c)));
1473
1474                 w->io->op.res.sectors += reserved_sectors;
1475                 w->io->op.i_sectors_delta -= dirty_sectors;
1476                 w->io->op.new_i_size = i_size;
1477
1478                 offset += sectors;
1479         }
1480
1481         if (atomic_dec_and_test(&s->write_count))
1482                 end_page_writeback(page);
1483
1484         return 0;
1485 }
1486
1487 int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
1488 {
1489         struct bch_fs *c = mapping->host->i_sb->s_fs_info;
1490         struct bch_writepage_state w =
1491                 bch_writepage_state_init(c, to_bch_ei(mapping->host));
1492         struct blk_plug plug;
1493         int ret;
1494
1495         blk_start_plug(&plug);
1496         ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
1497         if (w.io)
1498                 bch2_writepage_do_io(&w);
1499         blk_finish_plug(&plug);
1500         return bch2_err_class(ret);
1501 }
1502
1503 /* buffered writes: */
1504
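/*
 * .write_begin: allocate a bch2_page_reservation (returned via fsdata for
 * bch2_write_end), lock and if necessary read in the page, and get disk and
 * quota reservations for the range about to be written:
 */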
1505 int bch2_write_begin(struct file *file, struct address_space *mapping,
1506                      loff_t pos, unsigned len,
1507                      struct page **pagep, void **fsdata)
1508 {
1509         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1510         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1511         struct bch2_page_reservation *res;
1512         pgoff_t index = pos >> PAGE_SHIFT;
1513         unsigned offset = pos & (PAGE_SIZE - 1);
1514         struct page *page;
1515         int ret = -ENOMEM;
1516
1517         res = kmalloc(sizeof(*res), GFP_KERNEL);
1518         if (!res)
1519                 return -ENOMEM;
1520
1521         bch2_page_reservation_init(c, inode, res);
1522         *fsdata = res;
1523
1524         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
1525
1526         page = grab_cache_page_write_begin(mapping, index);
1527         if (!page)
1528                 goto err_unlock;
1529
1530         if (PageUptodate(page))
1531                 goto out;
1532
1533         /* If we're writing the entire page, we don't need to read it in first: */
1534         if (len == PAGE_SIZE)
1535                 goto out;
1536
1537         if (!offset && pos + len >= inode->v.i_size) {
1538                 zero_user_segment(page, len, PAGE_SIZE);
1539                 flush_dcache_page(page);
1540                 goto out;
1541         }
1542
1543         if (index > inode->v.i_size >> PAGE_SHIFT) {
1544                 zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
1545                 flush_dcache_page(page);
1546                 goto out;
1547         }
1548 readpage:
1549         ret = bch2_read_single_page(page, mapping);
1550         if (ret)
1551                 goto err;
1552 out:
1553         if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
1554                 ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
1555                 if (ret)
1556                         goto out;
1557         }
1558
1559         ret = bch2_page_reservation_get(c, inode, page, res,
1560                                         offset, len, true);
1561         if (ret) {
1562                 if (!PageUptodate(page)) {
1563                         /*
1564                          * If the page hasn't been read in, we won't know if we
1565                          * actually need a reservation - we don't actually need
1566                          * to read here, we just need to check if the page is
1567                          * fully backed by uncompressed data:
1568                          */
1569                         goto readpage;
1570                 }
1571
1572                 goto err;
1573         }
1574
1575         *pagep = page;
1576         return 0;
1577 err:
1578         unlock_page(page);
1579         put_page(page);
1580         *pagep = NULL;
1581 err_unlock:
1582         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
1583         kfree(res);
1584         *fsdata = NULL;
1585         return bch2_err_class(ret);
1586 }
1587
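     /*
      * ->write_end: if the copy came up short on a page we never read in,
      * throw the partial write away and make userspace retry; otherwise
      * update i_size, mark the page uptodate and dirty, then release the
      * page, the pagecache lock and the reservation from bch2_write_begin().
      */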
1588 int bch2_write_end(struct file *file, struct address_space *mapping,
1589                    loff_t pos, unsigned len, unsigned copied,
1590                    struct page *page, void *fsdata)
1591 {
1592         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1593         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1594         struct bch2_page_reservation *res = fsdata;
1595         unsigned offset = pos & (PAGE_SIZE - 1);
1596
1597         lockdep_assert_held(&inode->v.i_rwsem);
1598
1599         if (unlikely(copied < len && !PageUptodate(page))) {
1600                 /*
1601                  * The page needs to be read in, but that would destroy
1602                  * our partial write - simplest thing is to just force
1603                  * userspace to redo the write:
1604                  */
1605                 zero_user(page, 0, PAGE_SIZE);
1606                 flush_dcache_page(page);
1607                 copied = 0;
1608         }
1609
1610         spin_lock(&inode->v.i_lock);
1611         if (pos + copied > inode->v.i_size)
1612                 i_size_write(&inode->v, pos + copied);
1613         spin_unlock(&inode->v.i_lock);
1614
1615         if (copied) {
1616                 if (!PageUptodate(page))
1617                         SetPageUptodate(page);
1618
1619                 bch2_set_page_dirty(c, inode, page, res, offset, copied);
1620
1621                 inode->ei_last_dirtied = (unsigned long) current;
1622         }
1623
1624         unlock_page(page);
1625         put_page(page);
1626         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
1627
1628         bch2_page_reservation_put(c, inode, res);
1629         kfree(res);
1630
1631         return copied;
1632 }
1633
1634 #define WRITE_BATCH_PAGES       32
1635
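     /*
      * Core of the buffered write path: pin up to WRITE_BATCH_PAGES pages,
      * read in the first/last pages if they're only partially overwritten,
      * reserve space, copy from the iov_iter, then mark the copied range
      * dirty. Returns the number of bytes copied, or an error if nothing
      * was copied.
      */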
1636 static int __bch2_buffered_write(struct bch_inode_info *inode,
1637                                  struct address_space *mapping,
1638                                  struct iov_iter *iter,
1639                                  loff_t pos, unsigned len)
1640 {
1641         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1642         struct page *pages[WRITE_BATCH_PAGES];
1643         struct bch2_page_reservation res;
1644         unsigned long index = pos >> PAGE_SHIFT;
1645         unsigned offset = pos & (PAGE_SIZE - 1);
1646         unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
1647         unsigned i, reserved = 0, set_dirty = 0;
1648         unsigned copied = 0, nr_pages_copied = 0;
1649         int ret = 0;
1650
1651         BUG_ON(!len);
1652         BUG_ON(nr_pages > ARRAY_SIZE(pages));
1653
1654         bch2_page_reservation_init(c, inode, &res);
1655
1656         for (i = 0; i < nr_pages; i++) {
1657                 pages[i] = grab_cache_page_write_begin(mapping, index + i);
1658                 if (!pages[i]) {
1659                         nr_pages = i;
1660                         if (!i) {
1661                                 ret = -ENOMEM;
1662                                 goto out;
1663                         }
1664                         len = min_t(unsigned, len,
1665                                     nr_pages * PAGE_SIZE - offset);
1666                         break;
1667                 }
1668         }
1669
1670         if (offset && !PageUptodate(pages[0])) {
1671                 ret = bch2_read_single_page(pages[0], mapping);
1672                 if (ret)
1673                         goto out;
1674         }
1675
1676         if ((pos + len) & (PAGE_SIZE - 1) &&
1677             !PageUptodate(pages[nr_pages - 1])) {
1678                 if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
1679                         zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
1680                 } else {
1681                         ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
1682                         if (ret)
1683                                 goto out;
1684                 }
1685         }
1686
1687         while (reserved < len) {
1688                 unsigned i = (offset + reserved) >> PAGE_SHIFT;
1689                 struct page *page = pages[i];
1690                 unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
1691                 unsigned pg_len = min_t(unsigned, len - reserved,
1692                                         PAGE_SIZE - pg_offset);
1693
1694                 if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
1695                         ret = bch2_page_state_set(c, inode_inum(inode),
1696                                                   pages + i, nr_pages - i);
1697                         if (ret)
1698                                 goto out;
1699                 }
1700
1701                 ret = bch2_page_reservation_get(c, inode, page, &res,
1702                                                 pg_offset, pg_len, true);
1703                 if (ret)
1704                         goto out;
1705
1706                 reserved += pg_len;
1707         }
1708
1709         if (mapping_writably_mapped(mapping))
1710                 for (i = 0; i < nr_pages; i++)
1711                         flush_dcache_page(pages[i]);
1712
1713         while (copied < len) {
1714                 struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
1715                 unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
1716                 unsigned pg_len = min_t(unsigned, len - copied,
1717                                         PAGE_SIZE - pg_offset);
1718                 unsigned pg_copied = copy_page_from_iter_atomic(page,
1719                                                 pg_offset, pg_len, iter);
1720
1721                 if (!pg_copied)
1722                         break;
1723
1724                 if (!PageUptodate(page) &&
1725                     pg_copied != PAGE_SIZE &&
1726                     pos + copied + pg_copied < inode->v.i_size) {
1727                         zero_user(page, 0, PAGE_SIZE);
1728                         break;
1729                 }
1730
1731                 flush_dcache_page(page);
1732                 copied += pg_copied;
1733
1734                 if (pg_copied != pg_len)
1735                         break;
1736         }
1737
1738         if (!copied)
1739                 goto out;
1740
1741         spin_lock(&inode->v.i_lock);
1742         if (pos + copied > inode->v.i_size)
1743                 i_size_write(&inode->v, pos + copied);
1744         spin_unlock(&inode->v.i_lock);
1745
1746         while (set_dirty < copied) {
1747                 struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
1748                 unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
1749                 unsigned pg_len = min_t(unsigned, copied - set_dirty,
1750                                         PAGE_SIZE - pg_offset);
1751
1752                 if (!PageUptodate(page))
1753                         SetPageUptodate(page);
1754
1755                 bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len);
1756                 unlock_page(page);
1757                 put_page(page);
1758
1759                 set_dirty += pg_len;
1760         }
1761
1762         nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
1763         inode->ei_last_dirtied = (unsigned long) current;
1764 out:
1765         for (i = nr_pages_copied; i < nr_pages; i++) {
1766                 unlock_page(pages[i]);
1767                 put_page(pages[i]);
1768         }
1769
1770         bch2_page_reservation_put(c, inode, &res);
1771
1772         return copied ?: ret;
1773 }
1774
1775 static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1776 {
1777         struct file *file = iocb->ki_filp;
1778         struct address_space *mapping = file->f_mapping;
1779         struct bch_inode_info *inode = file_bch_inode(file);
1780         loff_t pos = iocb->ki_pos;
1781         ssize_t written = 0;
1782         int ret = 0;
1783
1784         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
1785
1786         do {
1787                 unsigned offset = pos & (PAGE_SIZE - 1);
1788                 unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
1789                               PAGE_SIZE * WRITE_BATCH_PAGES - offset);
1790 again:
1791                 /*
1792                  * Bring in the user page that we will copy from _first_.
1793                  * Otherwise there's a nasty deadlock on copying from the
1794                  * same page as we're writing to, without it being marked
1795                  * up-to-date.
1796                  *
1797                  * Not only is this an optimisation, but it is also required
1798                  * to check that the address is actually valid, when atomic
1799                  * usercopies are used, below.
1800                  */
1801                 if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
1802                         bytes = min_t(unsigned long, iov_iter_count(iter),
1803                                       PAGE_SIZE - offset);
1804
1805                         if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
1806                                 ret = -EFAULT;
1807                                 break;
1808                         }
1809                 }
1810
1811                 if (unlikely(fatal_signal_pending(current))) {
1812                         ret = -EINTR;
1813                         break;
1814                 }
1815
1816                 ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
1817                 if (unlikely(ret < 0))
1818                         break;
1819
1820                 cond_resched();
1821
1822                 if (unlikely(ret == 0)) {
1823                         /*
1824                          * If we were unable to copy any data at all, we must
1825                          * fall back to a single segment length write.
1826                          *
1827                          * If we didn't fall back here, we could livelock
1828                          * because not all segments in the iov can be copied at
1829                          * once without a pagefault.
1830                          */
1831                         bytes = min_t(unsigned long, PAGE_SIZE - offset,
1832                                       iov_iter_single_seg_count(iter));
1833                         goto again;
1834                 }
1835                 pos += ret;
1836                 written += ret;
1837                 ret = 0;
1838
1839                 balance_dirty_pages_ratelimited(mapping);
1840         } while (iov_iter_count(iter));
1841
1842         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
1843
1844         return written ? written : ret;
1845 }
1846
1847 /* O_DIRECT reads */
1848
1849 static void bio_check_or_release(struct bio *bio, bool check_dirty)
1850 {
1851         if (check_dirty) {
1852                 bio_check_pages_dirty(bio);
1853         } else {
1854                 bio_release_pages(bio, false);
1855                 bio_put(bio);
1856         }
1857 }
1858
1859 static void bch2_dio_read_complete(struct closure *cl)
1860 {
1861         struct dio_read *dio = container_of(cl, struct dio_read, cl);
1862
1863         dio->req->ki_complete(dio->req, dio->ret);
1864         bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
1865 }
1866
1867 static void bch2_direct_IO_read_endio(struct bio *bio)
1868 {
1869         struct dio_read *dio = bio->bi_private;
1870
1871         if (bio->bi_status)
1872                 dio->ret = blk_status_to_errno(bio->bi_status);
1873
1874         closure_put(&dio->cl);
1875 }
1876
1877 static void bch2_direct_IO_read_split_endio(struct bio *bio)
1878 {
1879         struct dio_read *dio = bio->bi_private;
1880         bool should_dirty = dio->should_dirty;
1881
1882         bch2_direct_IO_read_endio(bio);
1883         bio_check_or_release(bio, should_dirty);
1884 }
1885
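     /*
      * O_DIRECT read: reads must be block aligned, and reads past EOF are
      * shortened. The request is split into as many bios as needed, all
      * completing against a single closure - synchronous requests wait on
      * it here, asynchronous ones complete via bch2_dio_read_complete().
      */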
1886 static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
1887 {
1888         struct file *file = req->ki_filp;
1889         struct bch_inode_info *inode = file_bch_inode(file);
1890         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1891         struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
1892         struct dio_read *dio;
1893         struct bio *bio;
1894         loff_t offset = req->ki_pos;
1895         bool sync = is_sync_kiocb(req);
1896         size_t shorten;
1897         ssize_t ret;
1898
1899         if ((offset|iter->count) & (block_bytes(c) - 1))
1900                 return -EINVAL;
1901
1902         ret = min_t(loff_t, iter->count,
1903                     max_t(loff_t, 0, i_size_read(&inode->v) - offset));
1904
1905         if (!ret)
1906                 return ret;
1907
1908         shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
1909         iter->count -= shorten;
1910
1911         bio = bio_alloc_bioset(NULL,
1912                                bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
1913                                REQ_OP_READ,
1914                                GFP_KERNEL,
1915                                &c->dio_read_bioset);
1916
1917         bio->bi_end_io = bch2_direct_IO_read_endio;
1918
1919         dio = container_of(bio, struct dio_read, rbio.bio);
1920         closure_init(&dio->cl, NULL);
1921
1922         /*
1923          * this is a _really_ horrible hack just to avoid an atomic sub at the
1924          * end:
1925          */
1926         if (!sync) {
1927                 set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
1928                 atomic_set(&dio->cl.remaining,
1929                            CLOSURE_REMAINING_INITIALIZER -
1930                            CLOSURE_RUNNING +
1931                            CLOSURE_DESTRUCTOR);
1932         } else {
1933                 atomic_set(&dio->cl.remaining,
1934                            CLOSURE_REMAINING_INITIALIZER + 1);
1935         }
1936
1937         dio->req        = req;
1938         dio->ret        = ret;
1939         /*
1940          * This is one of the sketchier things I've encountered: we have to skip
1941          * the dirtying of requests that are internal to the kernel (i.e. from
1942          * loopback), because we'll deadlock on page_lock.
1943          */
1944         dio->should_dirty = iter_is_iovec(iter);
1945
1946         goto start;
1947         while (iter->count) {
1948                 bio = bio_alloc_bioset(NULL,
1949                                        bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
1950                                        REQ_OP_READ,
1951                                        GFP_KERNEL,
1952                                        &c->bio_read);
1953                 bio->bi_end_io          = bch2_direct_IO_read_split_endio;
1954 start:
1955                 bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
1956                 bio->bi_iter.bi_sector  = offset >> 9;
1957                 bio->bi_private         = dio;
1958
1959                 ret = bio_iov_iter_get_pages(bio, iter);
1960                 if (ret < 0) {
1961                         /* XXX: fault inject this path */
1962                         bio->bi_status = BLK_STS_RESOURCE;
1963                         bio_endio(bio);
1964                         break;
1965                 }
1966
1967                 offset += bio->bi_iter.bi_size;
1968
1969                 if (dio->should_dirty)
1970                         bio_set_pages_dirty(bio);
1971
1972                 if (iter->count)
1973                         closure_get(&dio->cl);
1974
1975                 bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
1976         }
1977
1978         iter->count += shorten;
1979
1980         if (sync) {
1981                 closure_sync(&dio->cl);
1982                 closure_debug_destroy(&dio->cl);
1983                 ret = dio->ret;
1984                 bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
1985                 return ret;
1986         } else {
1987                 return -EIOCBQUEUED;
1988         }
1989 }
1990
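     /*
      * ->read_iter: O_DIRECT reads first flush and wait on dirty pagecache
      * over the range, then go through bch2_direct_IO_read(); buffered reads
      * use generic_file_read_iter() under the pagecache add lock.
      */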
1991 ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1992 {
1993         struct file *file = iocb->ki_filp;
1994         struct bch_inode_info *inode = file_bch_inode(file);
1995         struct address_space *mapping = file->f_mapping;
1996         size_t count = iov_iter_count(iter);
1997         ssize_t ret;
1998
1999         if (!count)
2000                 return 0; /* skip atime */
2001
2002         if (iocb->ki_flags & IOCB_DIRECT) {
2003                 struct blk_plug plug;
2004
2005                 ret = filemap_write_and_wait_range(mapping,
2006                                         iocb->ki_pos,
2007                                         iocb->ki_pos + count - 1);
2008                 if (ret < 0)
2009                         goto out;
2010
2011                 file_accessed(file);
2012
2013                 blk_start_plug(&plug);
2014                 ret = bch2_direct_IO_read(iocb, iter);
2015                 blk_finish_plug(&plug);
2016
2017                 if (ret >= 0)
2018                         iocb->ki_pos += ret;
2019         } else {
2020                 bch2_pagecache_add_get(&inode->ei_pagecache_lock);
2021                 ret = generic_file_read_iter(iocb, iter);
2022                 bch2_pagecache_add_put(&inode->ei_pagecache_lock);
2023         }
2024 out:
2025         return bch2_err_class(ret);
2026 }
2027
2028 /* O_DIRECT writes */
2029
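     /*
      * Used when a dio write couldn't get a disk reservation: returns true
      * if every existing extent in the range already has at least
      * nr_replicas replicas (and no compressed sectors, unless the write
      * itself is compressed), i.e. overwriting in place shouldn't need any
      * new space.
      */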
2030 static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
2031                                        u64 offset, u64 size,
2032                                        unsigned nr_replicas, bool compressed)
2033 {
2034         struct btree_trans trans;
2035         struct btree_iter iter;
2036         struct bkey_s_c k;
2037         u64 end = offset + size;
2038         u32 snapshot;
2039         bool ret = true;
2040         int err;
2041
2042         bch2_trans_init(&trans, c, 0, 0);
2043 retry:
2044         bch2_trans_begin(&trans);
2045
2046         err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
2047         if (err)
2048                 goto err;
2049
2050         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
2051                            SPOS(inum.inum, offset, snapshot),
2052                            BTREE_ITER_SLOTS, k, err) {
2053                 if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0)
2054                         break;
2055
2056                 if (k.k->p.snapshot != snapshot ||
2057                     nr_replicas > bch2_bkey_replicas(c, k) ||
2058                     (!compressed && bch2_bkey_sectors_compressed(k))) {
2059                         ret = false;
2060                         break;
2061                 }
2062         }
2063
2064         offset = iter.pos.offset;
2065         bch2_trans_iter_exit(&trans, &iter);
2066 err:
2067         if (bch2_err_matches(err, BCH_ERR_transaction_restart))
2068                 goto retry;
2069         bch2_trans_exit(&trans);
2070
2071         return err ? false : ret;
2072 }
2073
2074 static void bch2_dio_write_loop_async(struct bch_write_op *);
2075
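     /*
      * Main O_DIRECT write loop: pin the user pages (with faults against
      * this file's own mapping specially flagged, so a recursive fault can't
      * deadlock on ei_pagecache_lock), set up and submit a bch2_write op,
      * then account the sectors written and update i_size. Synchronous
      * writes wait on dio->done and loop here; asynchronous writes return
      * -EIOCBQUEUED and continue from bch2_dio_write_loop_async().
      */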
2076 static long bch2_dio_write_loop(struct dio_write *dio)
2077 {
2078         bool kthread = (current->flags & PF_KTHREAD) != 0;
2079         struct kiocb *req = dio->req;
2080         struct address_space *mapping = req->ki_filp->f_mapping;
2081         struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
2082         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2083         struct bio *bio = &dio->op.wbio.bio;
2084         struct bvec_iter_all iter;
2085         struct bio_vec *bv;
2086         unsigned unaligned, iter_count;
2087         bool sync = dio->sync, dropped_locks;
2088         long ret;
2089
2090         if (dio->loop)
2091                 goto loop;
2092
2093         while (1) {
2094                 iter_count = dio->iter.count;
2095
2096                 if (kthread && dio->mm)
2097                         kthread_use_mm(dio->mm);
2098                 BUG_ON(current->faults_disabled_mapping);
2099                 current->faults_disabled_mapping = mapping;
2100
2101                 ret = bio_iov_iter_get_pages(bio, &dio->iter);
2102
2103                 dropped_locks = fdm_dropped_locks();
2104
2105                 current->faults_disabled_mapping = NULL;
2106                 if (kthread && dio->mm)
2107                         kthread_unuse_mm(dio->mm);
2108
2109                 /*
2110                  * If the fault handler returned an error but also signalled
2111                  * that it dropped & retook ei_pagecache_lock, we just need to
2112                  * re-shoot down the page cache and retry:
2113                  */
2114                 if (dropped_locks && ret)
2115                         ret = 0;
2116
2117                 if (unlikely(ret < 0))
2118                         goto err;
2119
2120                 if (unlikely(dropped_locks)) {
2121                         ret = write_invalidate_inode_pages_range(mapping,
2122                                         req->ki_pos,
2123                                         req->ki_pos + iter_count - 1);
2124                         if (unlikely(ret))
2125                                 goto err;
2126
2127                         if (!bio->bi_iter.bi_size)
2128                                 continue;
2129                 }
2130
2131                 unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
2132                 bio->bi_iter.bi_size -= unaligned;
2133                 iov_iter_revert(&dio->iter, unaligned);
2134
2135                 if (!bio->bi_iter.bi_size) {
2136                         /*
2137                          * bio_iov_iter_get_pages was only able to get <
2138                          * blocksize worth of pages:
2139                          */
2140                         ret = -EFAULT;
2141                         goto err;
2142                 }
2143
2144                 bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode));
2145                 dio->op.end_io          = bch2_dio_write_loop_async;
2146                 dio->op.target          = dio->op.opts.foreground_target;
2147                 dio->op.write_point     = writepoint_hashed((unsigned long) current);
2148                 dio->op.nr_replicas     = dio->op.opts.data_replicas;
2149                 dio->op.subvol          = inode->ei_subvol;
2150                 dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
2151
2152                 if ((req->ki_flags & IOCB_DSYNC) &&
2153                     !c->opts.journal_flush_disabled)
2154                         dio->op.flags |= BCH_WRITE_FLUSH;
2155                 dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
2156
2157                 ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
2158                                                 dio->op.opts.data_replicas, 0);
2159                 if (unlikely(ret) &&
2160                     !bch2_check_range_allocated(c, inode_inum(inode),
2161                                 dio->op.pos.offset, bio_sectors(bio),
2162                                 dio->op.opts.data_replicas,
2163                                 dio->op.opts.compression != 0))
2164                         goto err;
2165
2166                 task_io_account_write(bio->bi_iter.bi_size);
2167
2168                 if (!dio->sync && !dio->loop && dio->iter.count) {
2169                         struct iovec *iov = dio->inline_vecs;
2170
2171                         if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
2172                                 iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
2173                                               GFP_KERNEL);
2174                                 if (unlikely(!iov)) {
2175                                         dio->sync = sync = true;
2176                                         goto do_io;
2177                                 }
2178
2179                                 dio->free_iov = true;
2180                         }
2181
2182                         memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
2183                         dio->iter.iov = iov;
2184                 }
2185 do_io:
2186                 dio->loop = true;
2187                 closure_call(&dio->op.cl, bch2_write, NULL, NULL);
2188
2189                 if (sync)
2190                         wait_for_completion(&dio->done);
2191                 else
2192                         return -EIOCBQUEUED;
2193 loop:
2194                 i_sectors_acct(c, inode, &dio->quota_res,
2195                                dio->op.i_sectors_delta);
2196                 req->ki_pos += (u64) dio->op.written << 9;
2197                 dio->written += dio->op.written;
2198
2199                 spin_lock(&inode->v.i_lock);
2200                 if (req->ki_pos > inode->v.i_size)
2201                         i_size_write(&inode->v, req->ki_pos);
2202                 spin_unlock(&inode->v.i_lock);
2203
2204                 if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
2205                         bio_for_each_segment_all(bv, bio, iter)
2206                                 put_page(bv->bv_page);
2207                 bio->bi_vcnt = 0;
2208
2209                 if (dio->op.error) {
2210                         set_bit(EI_INODE_ERROR, &inode->ei_flags);
2211                         break;
2212                 }
2213
2214                 if (!dio->iter.count)
2215                         break;
2216
2217                 bio_reset(bio, NULL, REQ_OP_WRITE);
2218                 reinit_completion(&dio->done);
2219         }
2220
2221         ret = dio->op.error ?: ((long) dio->written << 9);
2222 err:
2223         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
2224         bch2_quota_reservation_put(c, inode, &dio->quota_res);
2225
2226         if (dio->free_iov)
2227                 kfree(dio->iter.iov);
2228
2229         if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
2230                 bio_for_each_segment_all(bv, bio, iter)
2231                         put_page(bv->bv_page);
2232         bio_put(bio);
2233
2234         /* inode->i_dio_count is our ref on inode and thus bch_fs */
2235         inode_dio_end(&inode->v);
2236
2237         if (!sync) {
2238                 req->ki_complete(req, ret);
2239                 ret = -EIOCBQUEUED;
2240         }
2241         return ret;
2242 }
2243
2244 static void bch2_dio_write_loop_async(struct bch_write_op *op)
2245 {
2246         struct dio_write *dio = container_of(op, struct dio_write, op);
2247
2248         if (dio->sync)
2249                 complete(&dio->done);
2250         else
2251                 bch2_dio_write_loop(dio);
2252 }
2253
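     /*
      * O_DIRECT write entry point: do the usual write checks, require block
      * aligned pos/len, take the pagecache block lock and shoot down the
      * pagecache over the range, then hand off to bch2_dio_write_loop().
      * i_rwsem is dropped before the IO unless the write extends i_size.
      */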
2254 static noinline
2255 ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
2256 {
2257         struct file *file = req->ki_filp;
2258         struct address_space *mapping = file->f_mapping;
2259         struct bch_inode_info *inode = file_bch_inode(file);
2260         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2261         struct dio_write *dio;
2262         struct bio *bio;
2263         bool locked = true, extending;
2264         ssize_t ret;
2265
2266         prefetch(&c->opts);
2267         prefetch((void *) &c->opts + 64);
2268         prefetch(&inode->ei_inode);
2269         prefetch((void *) &inode->ei_inode + 64);
2270
2271         inode_lock(&inode->v);
2272
2273         ret = generic_write_checks(req, iter);
2274         if (unlikely(ret <= 0))
2275                 goto err;
2276
2277         ret = file_remove_privs(file);
2278         if (unlikely(ret))
2279                 goto err;
2280
2281         ret = file_update_time(file);
2282         if (unlikely(ret))
2283                 goto err;
2284
2285         if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
2286                 goto err;
2287
2288         inode_dio_begin(&inode->v);
2289         bch2_pagecache_block_get(&inode->ei_pagecache_lock);
2290
2291         extending = req->ki_pos + iter->count > inode->v.i_size;
2292         if (!extending) {
2293                 inode_unlock(&inode->v);
2294                 locked = false;
2295         }
2296
2297         bio = bio_alloc_bioset(NULL,
2298                                bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
2299                                REQ_OP_WRITE,
2300                                GFP_KERNEL,
2301                                &c->dio_write_bioset);
2302         dio = container_of(bio, struct dio_write, op.wbio.bio);
2303         init_completion(&dio->done);
2304         dio->req                = req;
2305         dio->mm                 = current->mm;
2306         dio->loop               = false;
2307         dio->sync               = is_sync_kiocb(req) || extending;
2308         dio->free_iov           = false;
2309         dio->quota_res.sectors  = 0;
2310         dio->written            = 0;
2311         dio->iter               = *iter;
2312
2313         ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
2314                                          iter->count >> 9, true);
2315         if (unlikely(ret))
2316                 goto err_put_bio;
2317
2318         ret = write_invalidate_inode_pages_range(mapping,
2319                                         req->ki_pos,
2320                                         req->ki_pos + iter->count - 1);
2321         if (unlikely(ret))
2322                 goto err_put_bio;
2323
2324         ret = bch2_dio_write_loop(dio);
2325 err:
2326         if (locked)
2327                 inode_unlock(&inode->v);
2328         return ret;
2329 err_put_bio:
2330         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
2331         bch2_quota_reservation_put(c, inode, &dio->quota_res);
2332         bio_put(bio);
2333         inode_dio_end(&inode->v);
2334         goto err;
2335 }
2336
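     /*
      * ->write_iter: O_DIRECT writes go through bch2_direct_write();
      * buffered writes take i_rwsem, go through bch2_buffered_write(), and
      * finish with generic_write_sync() for O_SYNC/O_DSYNC.
      */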
2337 ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
2338 {
2339         struct file *file = iocb->ki_filp;
2340         struct bch_inode_info *inode = file_bch_inode(file);
2341         ssize_t ret;
2342
2343         if (iocb->ki_flags & IOCB_DIRECT) {
2344                 ret = bch2_direct_write(iocb, from);
2345                 goto out;
2346         }
2347
2348         /* We can write back this queue in page reclaim */
2349         current->backing_dev_info = inode_to_bdi(&inode->v);
2350         inode_lock(&inode->v);
2351
2352         ret = generic_write_checks(iocb, from);
2353         if (ret <= 0)
2354                 goto unlock;
2355
2356         ret = file_remove_privs(file);
2357         if (ret)
2358                 goto unlock;
2359
2360         ret = file_update_time(file);
2361         if (ret)
2362                 goto unlock;
2363
2364         ret = bch2_buffered_write(iocb, from);
2365         if (likely(ret > 0))
2366                 iocb->ki_pos += ret;
2367 unlock:
2368         inode_unlock(&inode->v);
2369         current->backing_dev_info = NULL;
2370
2371         if (ret > 0)
2372                 ret = generic_write_sync(iocb, ret);
2373 out:
2374         return bch2_err_class(ret);
2375 }
2376
2377 /* fsync: */
2378
2379 /*
2380  * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
2381  * insert trigger: look up the btree inode instead
2382  */
2383 static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum)
2384 {
2385         struct bch_inode_unpacked inode;
2386         int ret;
2387
2388         if (c->opts.journal_flush_disabled)
2389                 return 0;
2390
2391         ret = bch2_inode_find_by_inum(c, inum, &inode);
2392         if (ret)
2393                 return ret;
2394
2395         return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq);
2396 }
2397
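     /*
      * fsync: flush dirty pages over the range, write out the VFS inode's
      * metadata, then flush the journal up to the inode's last journal
      * sequence number so the update is durable.
      */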
2398 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2399 {
2400         struct bch_inode_info *inode = file_bch_inode(file);
2401         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2402         int ret, ret2, ret3;
2403
2404         ret = file_write_and_wait_range(file, start, end);
2405         ret2 = sync_inode_metadata(&inode->v, 1);
2406         ret3 = bch2_flush_inode(c, inode_inum(inode));
2407
2408         return bch2_err_class(ret ?: ret2 ?: ret3);
2409 }
2410
2411 /* truncate: */
2412
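     /*
      * Returns a positive value if any extent in [start, end) carries data,
      * 0 if not, or a negative error - used by __bch2_truncate_page() to
      * decide whether a page that isn't in the page cache needs to be
      * created and zeroed.
      */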
2413 static inline int range_has_data(struct bch_fs *c, u32 subvol,
2414                                  struct bpos start,
2415                                  struct bpos end)
2416 {
2417         struct btree_trans trans;
2418         struct btree_iter iter;
2419         struct bkey_s_c k;
2420         int ret = 0;
2421
2422         bch2_trans_init(&trans, c, 0, 0);
2423 retry:
2424         bch2_trans_begin(&trans);
2425
2426         ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
2427         if (ret)
2428                 goto err;
2429
2430         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
2431                 if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
2432                         break;
2433
2434                 if (bkey_extent_is_data(k.k)) {
2435                         ret = 1;
2436                         break;
2437                 }
2438         }
2439         start = iter.pos;
2440         bch2_trans_iter_exit(&trans, &iter);
2441 err:
2442         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2443                 goto retry;
2444
2445         bch2_trans_exit(&trans);
2446         return ret;
2447 }
2448
2449 static int __bch2_truncate_page(struct bch_inode_info *inode,
2450                                 pgoff_t index, loff_t start, loff_t end)
2451 {
2452         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2453         struct address_space *mapping = inode->v.i_mapping;
2454         struct bch_page_state *s;
2455         unsigned start_offset = start & (PAGE_SIZE - 1);
2456         unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
2457         unsigned i;
2458         struct page *page;
2459         s64 i_sectors_delta = 0;
2460         int ret = 0;
2461
2462         /* Page boundary? Nothing to do */
2463         if (!((index == start >> PAGE_SHIFT && start_offset) ||
2464               (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
2465                 return 0;
2466
2467         /* Above i_size? */
2468         if (index << PAGE_SHIFT >= inode->v.i_size)
2469                 return 0;
2470
2471         page = find_lock_page(mapping, index);
2472         if (!page) {
2473                 /*
2474                  * XXX: we're doing two index lookups when we end up reading the
2475                  * page
2476                  */
2477                 ret = range_has_data(c, inode->ei_subvol,
2478                                 POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT),
2479                                 POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT));
2480                 if (ret <= 0)
2481                         return ret;
2482
2483                 page = find_or_create_page(mapping, index, GFP_KERNEL);
2484                 if (unlikely(!page)) {
2485                         ret = -ENOMEM;
2486                         goto out;
2487                 }
2488         }
2489
2490         s = bch2_page_state_create(page, 0);
2491         if (!s) {
2492                 ret = -ENOMEM;
2493                 goto unlock;
2494         }
2495
2496         if (!PageUptodate(page)) {
2497                 ret = bch2_read_single_page(page, mapping);
2498                 if (ret)
2499                         goto unlock;
2500         }
2501
2502         if (index != start >> PAGE_SHIFT)
2503                 start_offset = 0;
2504         if (index != end >> PAGE_SHIFT)
2505                 end_offset = PAGE_SIZE;
2506
2507         for (i = round_up(start_offset, block_bytes(c)) >> 9;
2508              i < round_down(end_offset, block_bytes(c)) >> 9;
2509              i++) {
2510                 s->s[i].nr_replicas     = 0;
2511                 if (s->s[i].state == SECTOR_DIRTY)
2512                         i_sectors_delta--;
2513                 s->s[i].state           = SECTOR_UNALLOCATED;
2514         }
2515
2516         i_sectors_acct(c, inode, NULL, i_sectors_delta);
2517
2518         /*
2519          * Caller needs to know whether this page will be written out by
2520          * writeback - doing an i_size update if necessary - or whether it will
2521          * be responsible for the i_size update:
2522          */
2523         ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT),
2524                           PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY;
2525
2526         zero_user_segment(page, start_offset, end_offset);
2527
2528         /*
2529          * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
2530          *
2531          * XXX: because we aren't currently tracking whether the page has actual
2532          * data in it (vs. just 0s, or only partially written) this is wrong. ick.
2533          */
2534         BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false));
2535
2536         /*
2537          * This removes any writeable userspace mappings; we need to force
2538          * .page_mkwrite to be called again before any mmapped writes, to
2539          * redirty the full page:
2540          */
2541         page_mkclean(page);
2542         __set_page_dirty_nobuffers(page);
2543 unlock:
2544         unlock_page(page);
2545         put_page(page);
2546 out:
2547         return ret;
2548 }
2549
2550 static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
2551 {
2552         return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
2553                                     from, round_up(from, PAGE_SIZE));
2554 }
2555
2556 static int bch2_truncate_pages(struct bch_inode_info *inode,
2557                                loff_t start, loff_t end)
2558 {
2559         int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT,
2560                                        start, end);
2561
2562         if (ret >= 0 &&
2563             start >> PAGE_SHIFT != end >> PAGE_SHIFT)
2564                 ret = __bch2_truncate_page(inode,
2565                                            end >> PAGE_SHIFT,
2566                                            start, end);
2567         return ret;
2568 }
2569
2570 static int bch2_extend(struct user_namespace *mnt_userns,
2571                        struct bch_inode_info *inode,
2572                        struct bch_inode_unpacked *inode_u,
2573                        struct iattr *iattr)
2574 {
2575         struct address_space *mapping = inode->v.i_mapping;
2576         int ret;
2577
2578         /*
2579          * sync appends:
2580          *
2581          * this has to be done _before_ extending i_size:
2582          */
2583         ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
2584         if (ret)
2585                 return ret;
2586
2587         truncate_setsize(&inode->v, iattr->ia_size);
2588
2589         return bch2_setattr_nonsize(mnt_userns, inode, iattr);
2590 }
2591
2592 static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
2593                                    struct bch_inode_unpacked *bi,
2594                                    void *p)
2595 {
2596         bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
2597         return 0;
2598 }
2599
2600 static int bch2_truncate_start_fn(struct bch_inode_info *inode,
2601                                   struct bch_inode_unpacked *bi, void *p)
2602 {
2603         u64 *new_i_size = p;
2604
2605         bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
2606         bi->bi_size = *new_i_size;
2607         return 0;
2608 }
2609
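     /*
      * Truncate: extending truncates just flush outstanding appends and bump
      * i_size via bch2_extend(). When shrinking, we zero the page straddling
      * the new EOF, write the new size to the btree inode with
      * BCH_INODE_I_SIZE_DIRTY set, shrink the pagecache, delete extents past
      * the new size with bch2_fpunch(), and finally clear I_SIZE_DIRTY.
      */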
2610 int bch2_truncate(struct user_namespace *mnt_userns,
2611                   struct bch_inode_info *inode, struct iattr *iattr)
2612 {
2613         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2614         struct address_space *mapping = inode->v.i_mapping;
2615         struct bch_inode_unpacked inode_u;
2616         u64 new_i_size = iattr->ia_size;
2617         s64 i_sectors_delta = 0;
2618         int ret = 0;
2619
2620         /*
2621          * If the truncate call will change the size of the file, the
2622          * cmtimes should be updated. If the size will not change, we
2623          * do not need to update the cmtimes.
2624          */
2625         if (iattr->ia_size != inode->v.i_size) {
2626                 if (!(iattr->ia_valid & ATTR_MTIME))
2627                         ktime_get_coarse_real_ts64(&iattr->ia_mtime);
2628                 if (!(iattr->ia_valid & ATTR_CTIME))
2629                         ktime_get_coarse_real_ts64(&iattr->ia_ctime);
2630                 iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
2631         }
2632
2633         inode_dio_wait(&inode->v);
2634         bch2_pagecache_block_get(&inode->ei_pagecache_lock);
2635
2636         ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
2637         if (ret)
2638                 goto err;
2639
2640         /*
2641          * check this before next assertion; on filesystem error our normal
2642          * invariants are a bit broken (truncate has to truncate the page cache
2643          * before the inode).
2644          */
2645         ret = bch2_journal_error(&c->journal);
2646         if (ret)
2647                 goto err;
2648
2649         WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
2650                 inode->v.i_size < inode_u.bi_size);
2651
2652         if (iattr->ia_size > inode->v.i_size) {
2653                 ret = bch2_extend(mnt_userns, inode, &inode_u, iattr);
2654                 goto err;
2655         }
2656
2657         iattr->ia_valid &= ~ATTR_SIZE;
2658
2659         ret = bch2_truncate_page(inode, iattr->ia_size);
2660         if (unlikely(ret < 0))
2661                 goto err;
2662
2663         /*
2664          * When extending, we're going to write the new i_size to disk
2665          * immediately so we need to flush anything above the current on disk
2666          * i_size first:
2667          *
2668          * Also, when extending we need to flush the page that i_size currently
2669          * straddles - if it's mapped to userspace, we need to ensure that
2670          * userspace has to redirty it and call .mkwrite -> set_page_dirty
2671          * again to allocate the part of the page that was extended.
2672          */
2673         if (iattr->ia_size > inode_u.bi_size)
2674                 ret = filemap_write_and_wait_range(mapping,
2675                                 inode_u.bi_size,
2676                                 iattr->ia_size - 1);
2677         else if (iattr->ia_size & (PAGE_SIZE - 1))
2678                 ret = filemap_write_and_wait_range(mapping,
2679                                 round_down(iattr->ia_size, PAGE_SIZE),
2680                                 iattr->ia_size - 1);
2681         if (ret)
2682                 goto err;
2683
2684         mutex_lock(&inode->ei_update_lock);
2685         ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
2686                                &new_i_size, 0);
2687         mutex_unlock(&inode->ei_update_lock);
2688
2689         if (unlikely(ret))
2690                 goto err;
2691
2692         truncate_setsize(&inode->v, iattr->ia_size);
2693
2694         ret = bch2_fpunch(c, inode_inum(inode),
2695                         round_up(iattr->ia_size, block_bytes(c)) >> 9,
2696                         U64_MAX, &i_sectors_delta);
2697         i_sectors_acct(c, inode, NULL, i_sectors_delta);
2698
2699         bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
2700                                 !bch2_journal_error(&c->journal), c,
2701                                 "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
2702                                 inode->v.i_ino, (u64) inode->v.i_blocks,
2703                                 inode->ei_inode.bi_sectors);
2704         if (unlikely(ret))
2705                 goto err;
2706
2707         mutex_lock(&inode->ei_update_lock);
2708         ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
2709         mutex_unlock(&inode->ei_update_lock);
2710
2711         ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
2712 err:
2713         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
2714         return bch2_err_class(ret);
2715 }
2716
2717 /* fallocate: */
2718
2719 static int inode_update_times_fn(struct bch_inode_info *inode,
2720                                  struct bch_inode_unpacked *bi, void *p)
2721 {
2722         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2723
2724         bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
2725         return 0;
2726 }
2727
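     /*
      * Punch a hole: zero out the partial pages at either end of the range,
      * drop the page cache over it, delete any whole blocks in between with
      * bch2_fpunch(), then update the inode's times (and i_size when needed).
      */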
2728 static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
2729 {
2730         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2731         u64 end         = offset + len;
2732         u64 block_start = round_up(offset, block_bytes(c));
2733         u64 block_end   = round_down(end, block_bytes(c));
2734         bool truncated_last_page;
2735         int ret = 0;
2736
2737         ret = bch2_truncate_pages(inode, offset, end);
2738         if (unlikely(ret < 0))
2739                 goto err;
2740
2741         truncated_last_page = ret;
2742
2743         truncate_pagecache_range(&inode->v, offset, end - 1);
2744
2745         if (block_start < block_end) {
2746                 s64 i_sectors_delta = 0;
2747
2748                 ret = bch2_fpunch(c, inode_inum(inode),
2749                                   block_start >> 9, block_end >> 9,
2750                                   &i_sectors_delta);
2751                 i_sectors_acct(c, inode, NULL, i_sectors_delta);
2752         }
2753
2754         mutex_lock(&inode->ei_update_lock);
2755         if (end >= inode->v.i_size && !truncated_last_page) {
2756                 ret = bch2_write_inode_size(c, inode, inode->v.i_size,
2757                                             ATTR_MTIME|ATTR_CTIME);
2758         } else {
2759                 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
2760                                        ATTR_MTIME|ATTR_CTIME);
2761         }
2762         mutex_unlock(&inode->ei_update_lock);
2763 err:
2764         return ret;
2765 }
2766
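     /*
      * Collapse/insert range: every affected extent is copied to its new
      * (shifted) position and the original deleted, one btree transaction at
      * a time. Collapsing punches the range first and walks forward;
      * inserting grows i_size first and walks extents backwards from EOF so
      * nothing is overwritten. i_size and the on-disk inode are updated when
      * done.
      */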
2767 static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
2768                                    loff_t offset, loff_t len,
2769                                    bool insert)
2770 {
2771         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2772         struct address_space *mapping = inode->v.i_mapping;
2773         struct bkey_buf copy;
2774         struct btree_trans trans;
2775         struct btree_iter src, dst, del;
2776         loff_t shift, new_size;
2777         u64 src_start;
2778         int ret = 0;
2779
2780         if ((offset | len) & (block_bytes(c) - 1))
2781                 return -EINVAL;
2782
2783         if (insert) {
2784                 if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
2785                         return -EFBIG;
2786
2787                 if (offset >= inode->v.i_size)
2788                         return -EINVAL;
2789
2790                 src_start       = U64_MAX;
2791                 shift           = len;
2792         } else {
2793                 if (offset + len >= inode->v.i_size)
2794                         return -EINVAL;
2795
2796                 src_start       = offset + len;
2797                 shift           = -len;
2798         }
2799
2800         new_size = inode->v.i_size + shift;
2801
2802         ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
2803         if (ret)
2804                 return ret;
2805
2806         if (insert) {
2807                 i_size_write(&inode->v, new_size);
2808                 mutex_lock(&inode->ei_update_lock);
2809                 ret = bch2_write_inode_size(c, inode, new_size,
2810                                             ATTR_MTIME|ATTR_CTIME);
2811                 mutex_unlock(&inode->ei_update_lock);
2812         } else {
2813                 s64 i_sectors_delta = 0;
2814
2815                 ret = bch2_fpunch(c, inode_inum(inode),
2816                                   offset >> 9, (offset + len) >> 9,
2817                                   &i_sectors_delta);
2818                 i_sectors_acct(c, inode, NULL, i_sectors_delta);
2819
2820                 if (ret)
2821                         return ret;
2822         }
2823
2824         bch2_bkey_buf_init(&copy);
2825         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
2826         bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
2827                         POS(inode->v.i_ino, src_start >> 9),
2828                         BTREE_ITER_INTENT);
2829         bch2_trans_copy_iter(&dst, &src);
2830         bch2_trans_copy_iter(&del, &src);
2831
2832         while (ret == 0 ||
2833                bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
2834                 struct disk_reservation disk_res =
2835                         bch2_disk_reservation_init(c, 0);
2836                 struct bkey_i delete;
2837                 struct bkey_s_c k;
2838                 struct bpos next_pos;
2839                 struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
2840                 struct bpos atomic_end;
2841                 unsigned trigger_flags = 0;
2842                 u32 snapshot;
2843
2844                 bch2_trans_begin(&trans);
2845
2846                 ret = bch2_subvolume_get_snapshot(&trans,
2847                                         inode->ei_subvol, &snapshot);
2848                 if (ret)
2849                         continue;
2850
2851                 bch2_btree_iter_set_snapshot(&src, snapshot);
2852                 bch2_btree_iter_set_snapshot(&dst, snapshot);
2853                 bch2_btree_iter_set_snapshot(&del, snapshot);
2854
2855                 bch2_trans_begin(&trans);
2856
2857                 k = insert
2858                         ? bch2_btree_iter_peek_prev(&src)
2859                         : bch2_btree_iter_peek(&src);
2860                 if ((ret = bkey_err(k)))
2861                         continue;
2862
2863                 if (!k.k || k.k->p.inode != inode->v.i_ino)
2864                         break;
2865
2866                 if (insert &&
2867                     bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
2868                         break;
2869 reassemble:
2870                 bch2_bkey_buf_reassemble(&copy, c, k);
2871
2872                 if (insert &&
2873                     bkey_cmp(bkey_start_pos(k.k), move_pos) < 0)
2874                         bch2_cut_front(move_pos, copy.k);
2875
2876                 copy.k->k.p.offset += shift >> 9;
2877                 bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
2878
2879                 ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
2880                 if (ret)
2881                         continue;
2882
2883                 if (bkey_cmp(atomic_end, copy.k->k.p)) {
2884                         if (insert) {
2885                                 move_pos = atomic_end;
2886                                 move_pos.offset -= shift >> 9;
2887                                 goto reassemble;
2888                         } else {
2889                                 bch2_cut_back(atomic_end, copy.k);
2890                         }
2891                 }
2892
2893                 bkey_init(&delete.k);
2894                 delete.k.p = copy.k->k.p;
2895                 delete.k.size = copy.k->k.size;
2896                 delete.k.p.offset -= shift >> 9;
2897                 bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
2898
2899                 next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
2900
2901                 if (copy.k->k.size != k.k->size) {
2902                         /* We might end up splitting compressed extents: */
2903                         unsigned nr_ptrs =
2904                                 bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
2905
2906                         ret = bch2_disk_reservation_get(c, &disk_res,
2907                                         copy.k->k.size, nr_ptrs,
2908                                         BCH_DISK_RESERVATION_NOFAIL);
2909                         BUG_ON(ret);
2910                 }
2911
2912                 ret =   bch2_btree_iter_traverse(&del) ?:
2913                         bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
2914                         bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
2915                         bch2_trans_commit(&trans, &disk_res, NULL,
2916                                           BTREE_INSERT_NOFAIL);
2917                 bch2_disk_reservation_put(c, &disk_res);
2918
2919                 if (!ret)
2920                         bch2_btree_iter_set_pos(&src, next_pos);
2921         }
2922         bch2_trans_iter_exit(&trans, &del);
2923         bch2_trans_iter_exit(&trans, &dst);
2924         bch2_trans_iter_exit(&trans, &src);
2925         bch2_trans_exit(&trans);
2926         bch2_bkey_buf_exit(&copy, c);
2927
2928         if (ret)
2929                 return ret;
2930
2931         mutex_lock(&inode->ei_update_lock);
2932         if (!insert) {
2933                 i_size_write(&inode->v, new_size);
2934                 ret = bch2_write_inode_size(c, inode, new_size,
2935                                             ATTR_MTIME|ATTR_CTIME);
2936         } else {
2937                 /* We need an inode update to update bi_journal_seq for fsync: */
2938                 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
2939                                        ATTR_MTIME|ATTR_CTIME);
2940         }
2941         mutex_unlock(&inode->ei_update_lock);
2942         return ret;
2943 }
2944
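     /*
      * fallocate/zero range: walk the extent slots over the range and
      * replace holes (and, for FALLOC_FL_ZERO_RANGE, existing data) with
      * reservation keys, taking quota and disk reservations as needed so
      * the space is accounted up front.
      */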
2945 static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
2946                              u64 start_sector, u64 end_sector)
2947 {
2948         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2949         struct btree_trans trans;
2950         struct btree_iter iter;
2951         struct bpos end_pos = POS(inode->v.i_ino, end_sector);
2952         unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
2953         int ret = 0;
2954
2955         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
2956
2957         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
2958                         POS(inode->v.i_ino, start_sector),
2959                         BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
2960
2961         while (!ret && bkey_cmp(iter.pos, end_pos) < 0) {
2962                 s64 i_sectors_delta = 0;
2963                 struct disk_reservation disk_res = { 0 };
2964                 struct quota_res quota_res = { 0 };
2965                 struct bkey_i_reservation reservation;
2966                 struct bkey_s_c k;
2967                 unsigned sectors;
2968                 u32 snapshot;
2969
2970                 bch2_trans_begin(&trans);
2971
2972                 ret = bch2_subvolume_get_snapshot(&trans,
2973                                         inode->ei_subvol, &snapshot);
2974                 if (ret)
2975                         goto bkey_err;
2976
2977                 bch2_btree_iter_set_snapshot(&iter, snapshot);
2978
2979                 k = bch2_btree_iter_peek_slot(&iter);
2980                 if ((ret = bkey_err(k)))
2981                         goto bkey_err;
2982
2983                 /* already reserved */
2984                 if (k.k->type == KEY_TYPE_reservation &&
2985                     bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
2986                         bch2_btree_iter_advance(&iter);
2987                         continue;
2988                 }
2989
2990                 if (bkey_extent_is_data(k.k) &&
2991                     !(mode & FALLOC_FL_ZERO_RANGE)) {
2992                         bch2_btree_iter_advance(&iter);
2993                         continue;
2994                 }
2995
2996                 bkey_reservation_init(&reservation.k_i);
2997                 reservation.k.type      = KEY_TYPE_reservation;
2998                 reservation.k.p         = k.k->p;
2999                 reservation.k.size      = k.k->size;
3000
3001                 bch2_cut_front(iter.pos,        &reservation.k_i);
3002                 bch2_cut_back(end_pos,          &reservation.k_i);
3003
3004                 sectors = reservation.k.size;
3005                 reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k);
3006
3007                 if (!bkey_extent_is_allocation(k.k)) {
3008                         ret = bch2_quota_reservation_add(c, inode,
3009                                         &quota_res,
3010                                         sectors, true);
3011                         if (unlikely(ret))
3012                                 goto bkey_err;
3013                 }
3014
3015                 if (reservation.v.nr_replicas < replicas ||
3016                     bch2_bkey_sectors_compressed(k)) {
3017                         ret = bch2_disk_reservation_get(c, &disk_res, sectors,
3018                                                         replicas, 0);
3019                         if (unlikely(ret))
3020                                 goto bkey_err;
3021
3022                         reservation.v.nr_replicas = disk_res.nr_replicas;
3023                 }
3024
3025                 ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
3026                                          &reservation.k_i,
3027                                 &disk_res, NULL,
3028                                 0, &i_sectors_delta, true);
3029                 if (ret)
3030                         goto bkey_err;
3031                 i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
3032 bkey_err:
3033                 bch2_quota_reservation_put(c, inode, &quota_res);
3034                 bch2_disk_reservation_put(c, &disk_res);
3035                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3036                         ret = 0;
3037         }
3038
3039         bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
3040         mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
3041
3042         if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
3043                 struct quota_res quota_res = { 0 };
3044                 s64 i_sectors_delta = 0;
3045
3046                 bch2_fpunch_at(&trans, &iter, inode_inum(inode),
3047                                end_sector, &i_sectors_delta);
3048                 i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
3049                 bch2_quota_reservation_put(c, inode, &quota_res);
3050         }
3051
3052         bch2_trans_iter_exit(&trans, &iter);
3053         bch2_trans_exit(&trans);
3054         return ret;
3055 }
3056
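/*
 * fallocate() proper, for the plain and FALLOC_FL_ZERO_RANGE cases: zeroing
 * truncates the affected pagecache first, then both cases reserve space via
 * __bchfs_fallocate() and update i_size if the file grew.
 */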
3057 static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
3058                             loff_t offset, loff_t len)
3059 {
3060         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3061         u64 end         = offset + len;
3062         u64 block_start = round_down(offset,    block_bytes(c));
3063         u64 block_end   = round_up(end,         block_bytes(c));
3064         bool truncated_last_page = false;
3065         int ret, ret2 = 0;
3066
3067         if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
3068                 ret = inode_newsize_ok(&inode->v, end);
3069                 if (ret)
3070                         return ret;
3071         }
3072
3073         if (mode & FALLOC_FL_ZERO_RANGE) {
3074                 ret = bch2_truncate_pages(inode, offset, end);
3075                 if (unlikely(ret < 0))
3076                         return ret;
3077
3078                 truncated_last_page = ret;
3079
3080                 truncate_pagecache_range(&inode->v, offset, end - 1);
3081
3082                 block_start     = round_up(offset,      block_bytes(c));
3083                 block_end       = round_down(end,       block_bytes(c));
3084         }
3085
3086         ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
3087
3088         /*
3089          * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
3090          * so that the VFS cache i_size is consistent with the btree i_size:
3091          */
3092         if (ret &&
3093             !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
3094                 return ret;
3095
3096         if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
3097                 end = inode->v.i_size;
3098
3099         if (end >= inode->v.i_size &&
3100             (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
3101              !(mode & FALLOC_FL_KEEP_SIZE))) {
3102                 spin_lock(&inode->v.i_lock);
3103                 i_size_write(&inode->v, end);
3104                 spin_unlock(&inode->v.i_lock);
3105
3106                 mutex_lock(&inode->ei_update_lock);
3107                 ret2 = bch2_write_inode_size(c, inode, end, 0);
3108                 mutex_unlock(&inode->ei_update_lock);
3109         }
3110
3111         return ret ?: ret2;
3112 }
3113
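/*
 * Entry point for the fallocate() file operation: takes the inode and
 * pagecache block locks, then dispatches on mode to fallocate, hole punch,
 * insert range or collapse range.
 */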
3114 long bch2_fallocate_dispatch(struct file *file, int mode,
3115                              loff_t offset, loff_t len)
3116 {
3117         struct bch_inode_info *inode = file_bch_inode(file);
3118         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3119         long ret;
3120
3121         if (!percpu_ref_tryget_live(&c->writes))
3122                 return -EROFS;
3123
3124         inode_lock(&inode->v);
3125         inode_dio_wait(&inode->v);
3126         bch2_pagecache_block_get(&inode->ei_pagecache_lock);
3127
3128         if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
3129                 ret = bchfs_fallocate(inode, mode, offset, len);
3130         else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
3131                 ret = bchfs_fpunch(inode, offset, len);
3132         else if (mode == FALLOC_FL_INSERT_RANGE)
3133                 ret = bchfs_fcollapse_finsert(inode, offset, len, true);
3134         else if (mode == FALLOC_FL_COLLAPSE_RANGE)
3135                 ret = bchfs_fcollapse_finsert(inode, offset, len, false);
3136         else
3137                 ret = -EOPNOTSUPP;
3138
3139
3140         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
3141         inode_unlock(&inode->v);
3142         percpu_ref_put(&c->writes);
3143
3144         return bch2_err_class(ret);
3145 }
3146
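/*
 * remap_file_range (reflink): copy extents from the source file to the
 * destination via bch2_remap_range(). Dedup is not (yet) supported, both
 * offsets must be block aligned, and overlapping ranges within the same file
 * are rejected.
 */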
3147 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
3148                              struct file *file_dst, loff_t pos_dst,
3149                              loff_t len, unsigned remap_flags)
3150 {
3151         struct bch_inode_info *src = file_bch_inode(file_src);
3152         struct bch_inode_info *dst = file_bch_inode(file_dst);
3153         struct bch_fs *c = src->v.i_sb->s_fs_info;
3154         s64 i_sectors_delta = 0;
3155         u64 aligned_len;
3156         loff_t ret = 0;
3157
3158         if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
3159                 return -EINVAL;
3160
3161         if (remap_flags & REMAP_FILE_DEDUP)
3162                 return -EOPNOTSUPP;
3163
3164         if ((pos_src & (block_bytes(c) - 1)) ||
3165             (pos_dst & (block_bytes(c) - 1)))
3166                 return -EINVAL;
3167
3168         if (src == dst &&
3169             abs(pos_src - pos_dst) < len)
3170                 return -EINVAL;
3171
3172         bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
3173
3174         file_update_time(file_dst);
3175
3176         inode_dio_wait(&src->v);
3177         inode_dio_wait(&dst->v);
3178
3179         ret = generic_remap_file_range_prep(file_src, pos_src,
3180                                             file_dst, pos_dst,
3181                                             &len, remap_flags);
3182         if (ret < 0 || len == 0)
3183                 goto err;
3184
3185         aligned_len = round_up((u64) len, block_bytes(c));
3186
3187         ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
3188                                 pos_dst, pos_dst + len - 1);
3189         if (ret)
3190                 goto err;
3191
3192         mark_pagecache_unallocated(src, pos_src >> 9,
3193                                    (pos_src + aligned_len) >> 9);
3194
3195         ret = bch2_remap_range(c,
3196                                inode_inum(dst), pos_dst >> 9,
3197                                inode_inum(src), pos_src >> 9,
3198                                aligned_len >> 9,
3199                                pos_dst + len, &i_sectors_delta);
3200         if (ret < 0)
3201                 goto err;
3202
3203         /*
3204                  * due to alignment, we might have remapped slightly more than requested
3205          */
3206         ret = min((u64) ret << 9, (u64) len);
3207
3208         /* XXX get a quota reservation */
3209         i_sectors_acct(c, dst, NULL, i_sectors_delta);
3210
3211         spin_lock(&dst->v.i_lock);
3212         if (pos_dst + ret > dst->v.i_size)
3213                 i_size_write(&dst->v, pos_dst + ret);
3214         spin_unlock(&dst->v.i_lock);
3215
3216         if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
3217             IS_SYNC(file_inode(file_dst)))
3218                 ret = bch2_flush_inode(c, inode_inum(dst));
3219 err:
3220         bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
3221
3222         return bch2_err_class(ret);
3223 }
3224
3225 /* fseek: */
3226
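/*
 * Return the byte offset within @page of the first sector at or after @offset
 * whose page state indicates data (>= SECTOR_DIRTY), or -1 if there is none.
 */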
3227 static int page_data_offset(struct page *page, unsigned offset)
3228 {
3229         struct bch_page_state *s = bch2_page_state(page);
3230         unsigned i;
3231
3232         if (s)
3233                 for (i = offset >> 9; i < PAGE_SECTORS; i++)
3234                         if (s->s[i].state >= SECTOR_DIRTY)
3235                                 return i << 9;
3236
3237         return -1;
3238 }
3239
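/*
 * Search the pagecache between start_offset and end_offset for data (e.g.
 * dirty pages not yet reflected in the extents btree); return its offset, or
 * end_offset if none is found.
 */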
3240 static loff_t bch2_seek_pagecache_data(struct inode *vinode,
3241                                        loff_t start_offset,
3242                                        loff_t end_offset)
3243 {
3244         struct address_space *mapping = vinode->i_mapping;
3245         struct page *page;
3246         pgoff_t start_index     = start_offset >> PAGE_SHIFT;
3247         pgoff_t end_index       = end_offset >> PAGE_SHIFT;
3248         pgoff_t index           = start_index;
3249         loff_t ret;
3250         int offset;
3251
3252         while (index <= end_index) {
3253                 if (find_get_pages_range(mapping, &index, end_index, 1, &page)) {
3254                         lock_page(page);
3255
3256                         offset = page_data_offset(page,
3257                                         page->index == start_index
3258                                         ? start_offset & (PAGE_SIZE - 1)
3259                                         : 0);
3260                         if (offset >= 0) {
3261                                 ret = clamp(((loff_t) page->index << PAGE_SHIFT) +
3262                                             offset,
3263                                             start_offset, end_offset);
3264                                 unlock_page(page);
3265                                 put_page(page);
3266                                 return ret;
3267                         }
3268
3269                         unlock_page(page);
3270                         put_page(page);
3271                 } else {
3272                         break;
3273                 }
3274         }
3275
3276         return end_offset;
3277 }
3278
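/*
 * Implements SEEK_DATA: find the first extent at or after @offset that
 * contains data, then check the pagecache for dirty data that might precede
 * it; returns -ENXIO when there is no data before EOF.
 */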
3279 static loff_t bch2_seek_data(struct file *file, u64 offset)
3280 {
3281         struct bch_inode_info *inode = file_bch_inode(file);
3282         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3283         struct btree_trans trans;
3284         struct btree_iter iter;
3285         struct bkey_s_c k;
3286         subvol_inum inum = inode_inum(inode);
3287         u64 isize, next_data = MAX_LFS_FILESIZE;
3288         u32 snapshot;
3289         int ret;
3290
3291         isize = i_size_read(&inode->v);
3292         if (offset >= isize)
3293                 return -ENXIO;
3294
3295         bch2_trans_init(&trans, c, 0, 0);
3296 retry:
3297         bch2_trans_begin(&trans);
3298
3299         ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
3300         if (ret)
3301                 goto err;
3302
3303         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
3304                            SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) {
3305                 if (k.k->p.inode != inode->v.i_ino) {
3306                         break;
3307                 } else if (bkey_extent_is_data(k.k)) {
3308                         next_data = max(offset, bkey_start_offset(k.k) << 9);
3309                         break;
3310                 } else if (k.k->p.offset >> 9 > isize)
3311                         break;
3312         }
3313         bch2_trans_iter_exit(&trans, &iter);
3314 err:
3315         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3316                 goto retry;
3317
3318         bch2_trans_exit(&trans);
3319         if (ret)
3320                 return ret;
3321
3322         if (next_data > offset)
3323                 next_data = bch2_seek_pagecache_data(&inode->v,
3324                                                      offset, next_data);
3325
3326         if (next_data >= isize)
3327                 return -ENXIO;
3328
3329         return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
3330 }
3331
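/*
 * Return the byte offset within @page of the first sector at or after @offset
 * that does not hold data (state < SECTOR_DIRTY); 0 if the page has no
 * bch_page_state, -1 if every remaining sector holds data.
 */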
3332 static int __page_hole_offset(struct page *page, unsigned offset)
3333 {
3334         struct bch_page_state *s = bch2_page_state(page);
3335         unsigned i;
3336
3337         if (!s)
3338                 return 0;
3339
3340         for (i = offset >> 9; i < PAGE_SECTORS; i++)
3341                 if (s->s[i].state < SECTOR_DIRTY)
3342                         return i << 9;
3343
3344         return -1;
3345 }
3346
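/*
 * Return the file offset of the first hole within the page containing
 * @offset, or -1 if the rest of the page is data. A missing page counts as a
 * hole, in which case @offset itself is returned.
 */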
3347 static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
3348 {
3349         pgoff_t index = offset >> PAGE_SHIFT;
3350         struct page *page;
3351         int pg_offset;
3352         loff_t ret = -1;
3353
3354         page = find_lock_page(mapping, index);
3355         if (!page)
3356                 return offset;
3357
3358         pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
3359         if (pg_offset >= 0)
3360                 ret = ((loff_t) index << PAGE_SHIFT) + pg_offset;
3361
3362         unlock_page(page);
3363
3364         return ret;
3365 }
3366
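/*
 * Scan the pagecache from start_offset towards end_offset for a hole (an
 * absent page, or sectors without data); return its offset, clamped to
 * start_offset, or end_offset if everything is data.
 */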
3367 static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
3368                                        loff_t start_offset,
3369                                        loff_t end_offset)
3370 {
3371         struct address_space *mapping = vinode->i_mapping;
3372         loff_t offset = start_offset, hole;
3373
3374         while (offset < end_offset) {
3375                 hole = page_hole_offset(mapping, offset);
3376                 if (hole >= 0 && hole <= end_offset)
3377                         return max(start_offset, hole);
3378
3379                 offset += PAGE_SIZE;
3380                 offset &= PAGE_MASK;
3381         }
3382
3383         return end_offset;
3384 }
3385
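/*
 * Implements SEEK_HOLE: walk the extents btree from @offset; for each gap or
 * non-data extent, check that the pagecache doesn't have dirty data covering
 * it before reporting it as a hole. Returns -ENXIO if @offset is past EOF.
 */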
3386 static loff_t bch2_seek_hole(struct file *file, u64 offset)
3387 {
3388         struct bch_inode_info *inode = file_bch_inode(file);
3389         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3390         struct btree_trans trans;
3391         struct btree_iter iter;
3392         struct bkey_s_c k;
3393         subvol_inum inum = inode_inum(inode);
3394         u64 isize, next_hole = MAX_LFS_FILESIZE;
3395         u32 snapshot;
3396         int ret;
3397
3398         isize = i_size_read(&inode->v);
3399         if (offset >= isize)
3400                 return -ENXIO;
3401
3402         bch2_trans_init(&trans, c, 0, 0);
3403 retry:
3404         bch2_trans_begin(&trans);
3405
3406         ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
3407         if (ret)
3408                 goto err;
3409
3410         for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
3411                            SPOS(inode->v.i_ino, offset >> 9, snapshot),
3412                            BTREE_ITER_SLOTS, k, ret) {
3413                 if (k.k->p.inode != inode->v.i_ino) {
3414                         next_hole = bch2_seek_pagecache_hole(&inode->v,
3415                                         offset, MAX_LFS_FILESIZE);
3416                         break;
3417                 } else if (!bkey_extent_is_data(k.k)) {
3418                         next_hole = bch2_seek_pagecache_hole(&inode->v,
3419                                         max(offset, bkey_start_offset(k.k) << 9),
3420                                         k.k->p.offset << 9);
3421
3422                         if (next_hole < k.k->p.offset << 9)
3423                                 break;
3424                 } else {
3425                         offset = max(offset, bkey_start_offset(k.k) << 9);
3426                 }
3427         }
3428         bch2_trans_iter_exit(&trans, &iter);
3429 err:
3430         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
3431                 goto retry;
3432
3433         bch2_trans_exit(&trans);
3434         if (ret)
3435                 return ret;
3436
3437         if (next_hole > isize)
3438                 next_hole = isize;
3439
3440         return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
3441 }
3442
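/*
 * llseek: SEEK_SET/SEEK_CUR/SEEK_END are handled by generic_file_llseek();
 * SEEK_DATA and SEEK_HOLE consult both the extents btree and the pagecache.
 */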
3443 loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
3444 {
3445         loff_t ret;
3446
3447         switch (whence) {
3448         case SEEK_SET:
3449         case SEEK_CUR:
3450         case SEEK_END:
3451                 ret = generic_file_llseek(file, offset, whence);
3452                 break;
3453         case SEEK_DATA:
3454                 ret = bch2_seek_data(file, offset);
3455                 break;
3456         case SEEK_HOLE:
3457                 ret = bch2_seek_hole(file, offset);
3458                 break;
3459         default:
3460                 ret = -EINVAL;
3461                 break;
3462         }
3463
3464         return bch2_err_class(ret);
3465 }
3466
3467 void bch2_fs_fsio_exit(struct bch_fs *c)
3468 {
3469         bioset_exit(&c->dio_write_bioset);
3470         bioset_exit(&c->dio_read_bioset);
3471         bioset_exit(&c->writepage_bioset);
3472 }
3473
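/*
 * Allocate the biosets used for buffered writeback and O_DIRECT reads and
 * writes; bch2_fs_fsio_exit() frees them.
 */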
3474 int bch2_fs_fsio_init(struct bch_fs *c)
3475 {
3476         int ret = 0;
3477
3478         pr_verbose_init(c->opts, "");
3479
3480         if (bioset_init(&c->writepage_bioset,
3481                         4, offsetof(struct bch_writepage_io, op.wbio.bio),
3482                         BIOSET_NEED_BVECS) ||
3483             bioset_init(&c->dio_read_bioset,
3484                         4, offsetof(struct dio_read, rbio.bio),
3485                         BIOSET_NEED_BVECS) ||
3486             bioset_init(&c->dio_write_bioset,
3487                         4, offsetof(struct dio_write, op.wbio.bio),
3488                         BIOSET_NEED_BVECS))
3489                 ret = -ENOMEM;
3490
3491         pr_verbose_init(c->opts, "ret %i", ret);
3492         return ret;
3493 }
3494
3495 #endif /* NO_BCACHEFS_FS */