bcachefs-tools-debian: libbcachefs/fs-io.c
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "alloc_foreground.h"
6 #include "bkey_on_stack.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "clock.h"
10 #include "error.h"
11 #include "extents.h"
12 #include "extent_update.h"
13 #include "fs.h"
14 #include "fs-io.h"
15 #include "fsck.h"
16 #include "inode.h"
17 #include "journal.h"
18 #include "io.h"
19 #include "keylist.h"
20 #include "quota.h"
21 #include "reflink.h"
22
23 #include <linux/aio.h>
24 #include <linux/backing-dev.h>
25 #include <linux/falloc.h>
26 #include <linux/migrate.h>
27 #include <linux/mmu_context.h>
28 #include <linux/pagevec.h>
29 #include <linux/sched/signal.h>
30 #include <linux/task_io_accounting_ops.h>
31 #include <linux/uio.h>
32 #include <linux/writeback.h>
33
34 #include <trace/events/bcachefs.h>
35 #include <trace/events/writeback.h>
36
37 struct quota_res {
38         u64                             sectors;
39 };
40
41 struct bch_writepage_io {
42         struct closure                  cl;
43         struct bch_inode_info           *inode;
44
45         /* must be last: */
46         struct bch_write_op             op;
47 };
48
49 struct dio_write {
50         struct completion               done;
51         struct kiocb                    *req;
52         struct mm_struct                *mm;
53         unsigned                        loop:1,
54                                         sync:1,
55                                         free_iov:1;
56         struct quota_res                quota_res;
57
58         struct iov_iter                 iter;
59         struct iovec                    inline_vecs[2];
60
61         /* must be last: */
62         struct bch_write_op             op;
63 };
64
65 struct dio_read {
66         struct closure                  cl;
67         struct kiocb                    *req;
68         long                            ret;
69         struct bch_read_bio             rbio;
70 };
71
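/*
 * Write back and invalidate the page cache over [start, end]; retried
 * while invalidate_inode_pages2_range() returns -EBUSY, e.g. because a
 * page was redirtied under us.
 */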
72 /* pagecache_block must be held */
73 static int write_invalidate_inode_pages_range(struct address_space *mapping,
74                                               loff_t start, loff_t end)
75 {
76         int ret;
77
78         /*
79          * XXX: the way this is currently implemented, we can spin if a process
80          * is continually redirtying a specific page
81          */
82         do {
83                 if (!mapping->nrpages &&
84                     !mapping->nrexceptional)
85                         return 0;
86
87                 ret = filemap_write_and_wait_range(mapping, start, end);
88                 if (ret)
89                         break;
90
91                 if (!mapping->nrpages)
92                         return 0;
93
94                 ret = invalidate_inode_pages2_range(mapping,
95                                 start >> PAGE_SHIFT,
96                                 end >> PAGE_SHIFT);
97         } while (ret == -EBUSY);
98
99         return ret;
100 }
101
102 /* quotas */
103
104 #ifdef CONFIG_BCACHEFS_QUOTA
105
106 static void bch2_quota_reservation_put(struct bch_fs *c,
107                                        struct bch_inode_info *inode,
108                                        struct quota_res *res)
109 {
110         if (!res->sectors)
111                 return;
112
113         mutex_lock(&inode->ei_quota_lock);
114         BUG_ON(res->sectors > inode->ei_quota_reserved);
115
116         bch2_quota_acct(c, inode->ei_qid, Q_SPC,
117                         -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
118         inode->ei_quota_reserved -= res->sectors;
119         mutex_unlock(&inode->ei_quota_lock);
120
121         res->sectors = 0;
122 }
123
124 static int bch2_quota_reservation_add(struct bch_fs *c,
125                                       struct bch_inode_info *inode,
126                                       struct quota_res *res,
127                                       unsigned sectors,
128                                       bool check_enospc)
129 {
130         int ret;
131
132         mutex_lock(&inode->ei_quota_lock);
133         ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
134                               check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
135         if (likely(!ret)) {
136                 inode->ei_quota_reserved += sectors;
137                 res->sectors += sectors;
138         }
139         mutex_unlock(&inode->ei_quota_lock);
140
141         return ret;
142 }
143
144 #else
145
146 static void bch2_quota_reservation_put(struct bch_fs *c,
147                                        struct bch_inode_info *inode,
148                                        struct quota_res *res)
149 {
150 }
151
152 static int bch2_quota_reservation_add(struct bch_fs *c,
153                                       struct bch_inode_info *inode,
154                                       struct quota_res *res,
155                                       unsigned sectors,
156                                       bool check_enospc)
157 {
158         return 0;
159 }
160
161 #endif
162
163 /* i_size updates: */
164
165 struct inode_new_size {
166         loff_t          new_size;
167         u64             now;
168         unsigned        fields;
169 };
170
171 static int inode_set_size(struct bch_inode_info *inode,
172                           struct bch_inode_unpacked *bi,
173                           void *p)
174 {
175         struct inode_new_size *s = p;
176
177         bi->bi_size = s->new_size;
178         if (s->fields & ATTR_ATIME)
179                 bi->bi_atime = s->now;
180         if (s->fields & ATTR_MTIME)
181                 bi->bi_mtime = s->now;
182         if (s->fields & ATTR_CTIME)
183                 bi->bi_ctime = s->now;
184
185         return 0;
186 }
187
188 int __must_check bch2_write_inode_size(struct bch_fs *c,
189                                        struct bch_inode_info *inode,
190                                        loff_t new_size, unsigned fields)
191 {
192         struct inode_new_size s = {
193                 .new_size       = new_size,
194                 .now            = bch2_current_time(c),
195                 .fields         = fields,
196         };
197
198         return bch2_write_inode(c, inode, inode_set_size, &s, fields);
199 }
200
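/*
 * Adjust the inode's in-memory block count by @sectors.  When a quota
 * reservation is supplied and we're adding sectors, consume that
 * reservation; otherwise account the change against the inode's quota
 * directly (warning on overrun).
 */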
201 static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
202                            struct quota_res *quota_res, s64 sectors)
203 {
204         if (!sectors)
205                 return;
206
207         mutex_lock(&inode->ei_quota_lock);
208 #ifdef CONFIG_BCACHEFS_QUOTA
209         if (quota_res && sectors > 0) {
210                 BUG_ON(sectors > quota_res->sectors);
211                 BUG_ON(sectors > inode->ei_quota_reserved);
212
213                 quota_res->sectors -= sectors;
214                 inode->ei_quota_reserved -= sectors;
215         } else {
216                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
217         }
218 #endif
219         inode->v.i_blocks += sectors;
220         mutex_unlock(&inode->ei_quota_lock);
221 }
222
223 /* page state: */
224
225 /* stored in page->private: */
226
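/*
 * Per-sector page state: how many uncompressed replicas are already
 * allocated on disk, how many sectors' worth of reservation this page
 * owns, and whether the sector is unallocated, backed by an on-disk
 * reservation, dirty in the page cache, or fully allocated.
 */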
227 struct bch_page_sector {
228         /* Uncompressed, fully allocated replicas: */
229         unsigned                nr_replicas:3;
230
231         /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
232         unsigned                replicas_reserved:3;
233
234         /* i_sectors: */
235         enum {
236                 SECTOR_UNALLOCATED,
237                 SECTOR_RESERVED,
238                 SECTOR_DIRTY,
239                 SECTOR_ALLOCATED,
240         }                       state:2;
241 };
242
243 struct bch_page_state {
244         spinlock_t              lock;
245         atomic_t                write_count;
246         struct bch_page_sector  s[PAGE_SECTORS];
247 };
248
249 static inline struct bch_page_state *__bch2_page_state(struct page *page)
250 {
251         return page_has_private(page)
252                 ? (struct bch_page_state *) page_private(page)
253                 : NULL;
254 }
255
256 static inline struct bch_page_state *bch2_page_state(struct page *page)
257 {
258         EBUG_ON(!PageLocked(page));
259
260         return __bch2_page_state(page);
261 }
262
263 /* for newly allocated pages: */
264 static void __bch2_page_state_release(struct page *page)
265 {
266         struct bch_page_state *s = __bch2_page_state(page);
267
268         if (!s)
269                 return;
270
271         ClearPagePrivate(page);
272         set_page_private(page, 0);
273         put_page(page);
274         kfree(s);
275 }
276
277 static void bch2_page_state_release(struct page *page)
278 {
279         struct bch_page_state *s = bch2_page_state(page);
280
281         if (!s)
282                 return;
283
284         ClearPagePrivate(page);
285         set_page_private(page, 0);
286         put_page(page);
287         kfree(s);
288 }
289
290 /* for newly allocated pages: */
291 static struct bch_page_state *__bch2_page_state_create(struct page *page,
292                                                        gfp_t gfp)
293 {
294         struct bch_page_state *s;
295
296         s = kzalloc(sizeof(*s), GFP_NOFS|gfp);
297         if (!s)
298                 return NULL;
299
300         spin_lock_init(&s->lock);
301         /*
302          * migrate_page_move_mapping() assumes that pages with private data
303          * have their count elevated by 1.
304          */
305         get_page(page);
306         set_page_private(page, (unsigned long) s);
307         SetPagePrivate(page);
308         return s;
309 }
310
311 static struct bch_page_state *bch2_page_state_create(struct page *page,
312                                                      gfp_t gfp)
313 {
314         return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
315 }
316
317 static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
318 {
319         /* XXX: this should not be open coded */
320         return inode->ei_inode.bi_data_replicas
321                 ? inode->ei_inode.bi_data_replicas - 1
322                 : c->opts.data_replicas;
323 }
324
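/*
 * How many more replica-sectors must be reserved for this sector to end
 * up with @nr_replicas copies.  For example, with nr_replicas = 2, a
 * sector that already has one fully allocated replica and no existing
 * reservation needs one more; a sector already at two needs none.
 */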
325 static inline unsigned sectors_to_reserve(struct bch_page_sector *s,
326                                                   unsigned nr_replicas)
327 {
328         return max(0, (int) nr_replicas -
329                    s->nr_replicas -
330                    s->replicas_reserved);
331 }
332
333 static int bch2_get_page_disk_reservation(struct bch_fs *c,
334                                 struct bch_inode_info *inode,
335                                 struct page *page, bool check_enospc)
336 {
337         struct bch_page_state *s = bch2_page_state_create(page, 0);
338         unsigned nr_replicas = inode_nr_replicas(c, inode);
339         struct disk_reservation disk_res = { 0 };
340         unsigned i, disk_res_sectors = 0;
341         int ret;
342
343         if (!s)
344                 return -ENOMEM;
345
346         for (i = 0; i < ARRAY_SIZE(s->s); i++)
347                 disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
348
349         if (!disk_res_sectors)
350                 return 0;
351
352         ret = bch2_disk_reservation_get(c, &disk_res,
353                                         disk_res_sectors, 1,
354                                         !check_enospc
355                                         ? BCH_DISK_RESERVATION_NOFAIL
356                                         : 0);
357         if (unlikely(ret))
358                 return ret;
359
360         for (i = 0; i < ARRAY_SIZE(s->s); i++)
361                 s->s[i].replicas_reserved +=
362                         sectors_to_reserve(&s->s[i], nr_replicas);
363
364         return 0;
365 }
366
367 struct bch2_page_reservation {
368         struct disk_reservation disk;
369         struct quota_res        quota;
370 };
371
372 static void bch2_page_reservation_init(struct bch_fs *c,
373                         struct bch_inode_info *inode,
374                         struct bch2_page_reservation *res)
375 {
376         memset(res, 0, sizeof(*res));
377
378         res->disk.nr_replicas = inode_nr_replicas(c, inode);
379 }
380
381 static void bch2_page_reservation_put(struct bch_fs *c,
382                         struct bch_inode_info *inode,
383                         struct bch2_page_reservation *res)
384 {
385         bch2_disk_reservation_put(c, &res->disk);
386         bch2_quota_reservation_put(c, inode, &res->quota);
387 }
388
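/*
 * Take disk and quota reservations for the blocks covered by
 * [offset, offset + len) within @page: disk sectors for replicas not
 * yet allocated or reserved, quota sectors for currently unallocated
 * sectors.  If the quota reservation fails, the disk reservation taken
 * here is released before returning the error.
 */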
389 static int bch2_page_reservation_get(struct bch_fs *c,
390                         struct bch_inode_info *inode, struct page *page,
391                         struct bch2_page_reservation *res,
392                         unsigned offset, unsigned len, bool check_enospc)
393 {
394         struct bch_page_state *s = bch2_page_state_create(page, 0);
395         unsigned i, disk_sectors = 0, quota_sectors = 0;
396         int ret;
397
398         if (!s)
399                 return -ENOMEM;
400
401         for (i = round_down(offset, block_bytes(c)) >> 9;
402              i < round_up(offset + len, block_bytes(c)) >> 9;
403              i++) {
404                 disk_sectors += sectors_to_reserve(&s->s[i],
405                                                 res->disk.nr_replicas);
406                 quota_sectors += s->s[i].state == SECTOR_UNALLOCATED;
407         }
408
409         if (disk_sectors) {
410                 ret = bch2_disk_reservation_add(c, &res->disk,
411                                                 disk_sectors,
412                                                 !check_enospc
413                                                 ? BCH_DISK_RESERVATION_NOFAIL
414                                                 : 0);
415                 if (unlikely(ret))
416                         return ret;
417         }
418
419         if (quota_sectors) {
420                 ret = bch2_quota_reservation_add(c, inode, &res->quota,
421                                                  quota_sectors,
422                                                  check_enospc);
423                 if (unlikely(ret)) {
424                         struct disk_reservation tmp = {
425                                 .sectors = disk_sectors
426                         };
427
428                         bch2_disk_reservation_put(c, &tmp);
429                         res->disk.sectors -= disk_sectors;
430                         return ret;
431                 }
432         }
433
434         return 0;
435 }
436
437 static void bch2_clear_page_bits(struct page *page)
438 {
439         struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
440         struct bch_fs *c = inode->v.i_sb->s_fs_info;
441         struct bch_page_state *s = bch2_page_state(page);
442         struct disk_reservation disk_res = { 0 };
443         int i, dirty_sectors = 0;
444
445         if (!s)
446                 return;
447
448         EBUG_ON(!PageLocked(page));
449         EBUG_ON(PageWriteback(page));
450
451         for (i = 0; i < ARRAY_SIZE(s->s); i++) {
452                 disk_res.sectors += s->s[i].replicas_reserved;
453                 s->s[i].replicas_reserved = 0;
454
455                 if (s->s[i].state == SECTOR_DIRTY) {
456                         dirty_sectors++;
457                         s->s[i].state = SECTOR_UNALLOCATED;
458                 }
459         }
460
461         bch2_disk_reservation_put(c, &disk_res);
462
463         if (dirty_sectors)
464                 i_sectors_acct(c, inode, NULL, -dirty_sectors);
465
466         bch2_page_state_release(page);
467 }
468
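/*
 * Mark [offset, offset + len) within @page dirty: move reservation from
 * @res into the per-sector state, bump i_blocks/quota for sectors going
 * from unallocated to dirty, then dirty the page itself.
 */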
469 static void bch2_set_page_dirty(struct bch_fs *c,
470                         struct bch_inode_info *inode, struct page *page,
471                         struct bch2_page_reservation *res,
472                         unsigned offset, unsigned len)
473 {
474         struct bch_page_state *s = bch2_page_state(page);
475         unsigned i, dirty_sectors = 0;
476
477         WARN_ON((u64) page_offset(page) + offset + len >
478                 round_up((u64) i_size_read(&inode->v), block_bytes(c)));
479
480         spin_lock(&s->lock);
481
482         for (i = round_down(offset, block_bytes(c)) >> 9;
483              i < round_up(offset + len, block_bytes(c)) >> 9;
484              i++) {
485                 unsigned sectors = sectors_to_reserve(&s->s[i],
486                                                 res->disk.nr_replicas);
487
488                 /*
489                  * This can happen if we race with the error path in
490                  * bch2_writepage_io_done():
491                  */
492                 sectors = min_t(unsigned, sectors, res->disk.sectors);
493
494                 s->s[i].replicas_reserved += sectors;
495                 res->disk.sectors -= sectors;
496
497                 if (s->s[i].state == SECTOR_UNALLOCATED)
498                         dirty_sectors++;
499
500                 s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
501         }
502
503         spin_unlock(&s->lock);
504
505         if (dirty_sectors)
506                 i_sectors_acct(c, inode, &res->quota, dirty_sectors);
507
508         if (!PageDirty(page))
509                 __set_page_dirty_nobuffers(page);
510 }
511
512 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
513 {
514         struct file *file = vmf->vma->vm_file;
515         struct bch_inode_info *inode = file_bch_inode(file);
516         int ret;
517
518         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
519         ret = filemap_fault(vmf);
520         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
521
522         return ret;
523 }
524
525 vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
526 {
527         struct page *page = vmf->page;
528         struct file *file = vmf->vma->vm_file;
529         struct bch_inode_info *inode = file_bch_inode(file);
530         struct address_space *mapping = file->f_mapping;
531         struct bch_fs *c = inode->v.i_sb->s_fs_info;
532         struct bch2_page_reservation res;
533         unsigned len;
534         loff_t isize;
535         int ret = VM_FAULT_LOCKED;
536
537         bch2_page_reservation_init(c, inode, &res);
538
539         sb_start_pagefault(inode->v.i_sb);
540         file_update_time(file);
541
542         /*
543          * Not strictly necessary, but helps avoid dio writes livelocking in
544          * write_invalidate_inode_pages_range() - can drop this if/when we get
545          * a write_invalidate_inode_pages_range() that works without dropping
546          * page lock before invalidating page
547          */
548         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
549
550         lock_page(page);
551         isize = i_size_read(&inode->v);
552
553         if (page->mapping != mapping || page_offset(page) >= isize) {
554                 unlock_page(page);
555                 ret = VM_FAULT_NOPAGE;
556                 goto out;
557         }
558
559         len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
560
561         if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
562                 unlock_page(page);
563                 ret = VM_FAULT_SIGBUS;
564                 goto out;
565         }
566
567         bch2_set_page_dirty(c, inode, page, &res, 0, len);
568         bch2_page_reservation_put(c, inode, &res);
569
570         wait_for_stable_page(page);
571 out:
572         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
573         sb_end_pagefault(inode->v.i_sb);
574
575         return ret;
576 }
577
578 void bch2_invalidatepage(struct page *page, unsigned int offset,
579                          unsigned int length)
580 {
581         if (offset || length < PAGE_SIZE)
582                 return;
583
584         bch2_clear_page_bits(page);
585 }
586
587 int bch2_releasepage(struct page *page, gfp_t gfp_mask)
588 {
589         if (PageDirty(page))
590                 return 0;
591
592         bch2_clear_page_bits(page);
593         return 1;
594 }
595
596 #ifdef CONFIG_MIGRATION
597 int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
598                       struct page *page, enum migrate_mode mode)
599 {
600         int ret;
601
602         EBUG_ON(!PageLocked(page));
603         EBUG_ON(!PageLocked(newpage));
604
605         ret = migrate_page_move_mapping(mapping, newpage, page, 0);
606         if (ret != MIGRATEPAGE_SUCCESS)
607                 return ret;
608
609         if (PagePrivate(page)) {
610                 ClearPagePrivate(page);
611                 get_page(newpage);
612                 set_page_private(newpage, page_private(page));
613                 set_page_private(page, 0);
614                 put_page(page);
615                 SetPagePrivate(newpage);
616         }
617
618         if (mode != MIGRATE_SYNC_NO_COPY)
619                 migrate_page_copy(newpage, page);
620         else
621                 migrate_page_states(newpage, page);
622         return MIGRATEPAGE_SUCCESS;
623 }
624 #endif
625
626 /* readpage(s): */
627
628 static void bch2_readpages_end_io(struct bio *bio)
629 {
630         struct bvec_iter_all iter;
631         struct bio_vec *bv;
632
633         bio_for_each_segment_all(bv, bio, iter) {
634                 struct page *page = bv->bv_page;
635
636                 if (!bio->bi_status) {
637                         SetPageUptodate(page);
638                 } else {
639                         ClearPageUptodate(page);
640                         SetPageError(page);
641                 }
642                 unlock_page(page);
643         }
644
645         bio_put(bio);
646 }
647
648 static inline void page_state_init_for_read(struct page *page)
649 {
650         SetPagePrivate(page);
651         page->private = 0;
652 }
653
654 struct readpages_iter {
655         struct address_space    *mapping;
656         struct page             **pages;
657         unsigned                nr_pages;
658         unsigned                nr_added;
659         unsigned                idx;
660         pgoff_t                 offset;
661 };
662
663 static int readpages_iter_init(struct readpages_iter *iter,
664                                struct address_space *mapping,
665                                struct list_head *pages, unsigned nr_pages)
666 {
667         memset(iter, 0, sizeof(*iter));
668
669         iter->mapping   = mapping;
670         iter->offset    = list_last_entry(pages, struct page, lru)->index;
671
672         iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
673         if (!iter->pages)
674                 return -ENOMEM;
675
676         while (!list_empty(pages)) {
677                 struct page *page = list_last_entry(pages, struct page, lru);
678
679                 __bch2_page_state_create(page, __GFP_NOFAIL);
680
681                 iter->pages[iter->nr_pages++] = page;
682                 list_del(&page->lru);
683         }
684
685         return 0;
686 }
687
688 static inline struct page *readpage_iter_next(struct readpages_iter *iter)
689 {
690         struct page *page;
691         unsigned i;
692         int ret;
693
694         BUG_ON(iter->idx > iter->nr_added);
695         BUG_ON(iter->nr_added > iter->nr_pages);
696
697         if (iter->idx < iter->nr_added)
698                 goto out;
699
700         while (1) {
701                 if (iter->idx == iter->nr_pages)
702                         return NULL;
703
704                 ret = add_to_page_cache_lru_vec(iter->mapping,
705                                 iter->pages     + iter->nr_added,
706                                 iter->nr_pages  - iter->nr_added,
707                                 iter->offset    + iter->nr_added,
708                                 GFP_NOFS);
709                 if (ret > 0)
710                         break;
711
712                 page = iter->pages[iter->nr_added];
713                 iter->idx++;
714                 iter->nr_added++;
715
716                 __bch2_page_state_release(page);
717                 put_page(page);
718         }
719
720         iter->nr_added += ret;
721
722         for (i = iter->idx; i < iter->nr_added; i++)
723                 put_page(iter->pages[i]);
724 out:
725         EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
726
727         return iter->pages[iter->idx];
728 }
729
730 static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
731 {
732         struct bvec_iter iter;
733         struct bio_vec bv;
734         unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
735                 ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
736         unsigned state = k.k->type == KEY_TYPE_reservation
737                 ? SECTOR_RESERVED
738                 : SECTOR_ALLOCATED;
739
740         bio_for_each_segment(bv, bio, iter) {
741                 struct bch_page_state *s = bch2_page_state(bv.bv_page);
742                 unsigned i;
743
744                 for (i = bv.bv_offset >> 9;
745                      i < (bv.bv_offset + bv.bv_len) >> 9;
746                      i++) {
747                         s->s[i].nr_replicas = nr_ptrs;
748                         s->s[i].state = state;
749                 }
750         }
751 }
752
753 static bool extent_partial_reads_expensive(struct bkey_s_c k)
754 {
755         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
756         struct bch_extent_crc_unpacked crc;
757         const union bch_extent_entry *i;
758
759         bkey_for_each_crc(k.k, ptrs, crc, i)
760                 if (crc.csum_type || crc.compression_type)
761                         return true;
762         return false;
763 }
764
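/*
 * Opportunistically add more pages to a read bio, up to the end of the
 * current extent: first from the readpages batch, then (if @get_more,
 * i.e. partial reads of this extent would be expensive) by allocating
 * readahead pages and inserting them into the page cache.
 */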
765 static void readpage_bio_extend(struct readpages_iter *iter,
766                                 struct bio *bio,
767                                 unsigned sectors_this_extent,
768                                 bool get_more)
769 {
770         while (bio_sectors(bio) < sectors_this_extent &&
771                bio->bi_vcnt < bio->bi_max_vecs) {
772                 pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
773                 struct page *page = readpage_iter_next(iter);
774                 int ret;
775
776                 if (page) {
777                         if (iter->offset + iter->idx != page_offset)
778                                 break;
779
780                         iter->idx++;
781                 } else {
782                         if (!get_more)
783                                 break;
784
785                         page = xa_load(&iter->mapping->i_pages, page_offset);
786                         if (page && !xa_is_value(page))
787                                 break;
788
789                         page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
790                         if (!page)
791                                 break;
792
793                         if (!__bch2_page_state_create(page, 0)) {
794                                 put_page(page);
795                                 break;
796                         }
797
798                         ret = add_to_page_cache_lru(page, iter->mapping,
799                                                     page_offset, GFP_NOFS);
800                         if (ret) {
801                                 __bch2_page_state_release(page);
802                                 put_page(page);
803                                 break;
804                         }
805
806                         put_page(page);
807                 }
808
809                 BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
810         }
811 }
812
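/*
 * Core buffered read loop: walk extents from the current bio position,
 * resolve reflink indirection, optionally extend the bio for readahead,
 * record per-sector allocation state for the pages, and hand each
 * fragment to bch2_read_extent(), retrying the whole loop on -EINTR.
 */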
813 static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
814                        struct bch_read_bio *rbio, u64 inum,
815                        struct readpages_iter *readpages_iter)
816 {
817         struct bch_fs *c = trans->c;
818         struct bkey_on_stack sk;
819         int flags = BCH_READ_RETRY_IF_STALE|
820                 BCH_READ_MAY_PROMOTE;
821         int ret = 0;
822
823         rbio->c = c;
824         rbio->start_time = local_clock();
825
826         bkey_on_stack_init(&sk);
827 retry:
828         while (1) {
829                 struct bkey_s_c k;
830                 unsigned bytes, sectors, offset_into_extent;
831
832                 bch2_btree_iter_set_pos(iter,
833                                 POS(inum, rbio->bio.bi_iter.bi_sector));
834
835                 k = bch2_btree_iter_peek_slot(iter);
836                 ret = bkey_err(k);
837                 if (ret)
838                         break;
839
840                 bkey_on_stack_reassemble(&sk, c, k);
841                 k = bkey_i_to_s_c(sk.k);
842
843                 offset_into_extent = iter->pos.offset -
844                         bkey_start_offset(k.k);
845                 sectors = k.k->size - offset_into_extent;
846
847                 ret = bch2_read_indirect_extent(trans,
848                                         &offset_into_extent, sk.k);
849                 if (ret)
850                         break;
851
852                 sectors = min(sectors, k.k->size - offset_into_extent);
853
854                 bch2_trans_unlock(trans);
855
856                 if (readpages_iter)
857                         readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
858                                             extent_partial_reads_expensive(k));
859
860                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
861                 swap(rbio->bio.bi_iter.bi_size, bytes);
862
863                 if (rbio->bio.bi_iter.bi_size == bytes)
864                         flags |= BCH_READ_LAST_FRAGMENT;
865
866                 if (bkey_extent_is_allocation(k.k))
867                         bch2_add_page_sectors(&rbio->bio, k);
868
869                 bch2_read_extent(c, rbio, k, offset_into_extent, flags);
870
871                 if (flags & BCH_READ_LAST_FRAGMENT)
872                         break;
873
874                 swap(rbio->bio.bi_iter.bi_size, bytes);
875                 bio_advance(&rbio->bio, bytes);
876         }
877
878         if (ret == -EINTR)
879                 goto retry;
880
881         if (ret) {
882                 bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
883                 bio_endio(&rbio->bio);
884         }
885
886         bkey_on_stack_exit(&sk, c);
887 }
888
889 int bch2_readpages(struct file *file, struct address_space *mapping,
890                    struct list_head *pages, unsigned nr_pages)
891 {
892         struct bch_inode_info *inode = to_bch_ei(mapping->host);
893         struct bch_fs *c = inode->v.i_sb->s_fs_info;
894         struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
895         struct btree_trans trans;
896         struct btree_iter *iter;
897         struct page *page;
898         struct readpages_iter readpages_iter;
899         int ret;
900
901         ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
902         BUG_ON(ret);
903
904         bch2_trans_init(&trans, c, 0, 0);
905
906         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
907                                    BTREE_ITER_SLOTS);
908
909         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
910
911         while ((page = readpage_iter_next(&readpages_iter))) {
912                 pgoff_t index = readpages_iter.offset + readpages_iter.idx;
913                 unsigned n = min_t(unsigned,
914                                    readpages_iter.nr_pages -
915                                    readpages_iter.idx,
916                                    BIO_MAX_PAGES);
917                 struct bch_read_bio *rbio =
918                         rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
919                                   opts);
920
921                 readpages_iter.idx++;
922
923                 bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
924                 rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
925                 rbio->bio.bi_end_io = bch2_readpages_end_io;
926                 BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
927
928                 bchfs_read(&trans, iter, rbio, inode->v.i_ino,
929                            &readpages_iter);
930         }
931
932         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
933
934         bch2_trans_exit(&trans);
935         kfree(readpages_iter.pages);
936
937         return 0;
938 }
939
940 static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
941                              u64 inum, struct page *page)
942 {
943         struct btree_trans trans;
944         struct btree_iter *iter;
945
946         bch2_page_state_create(page, __GFP_NOFAIL);
947
948         bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
949         rbio->bio.bi_iter.bi_sector =
950                 (sector_t) page->index << PAGE_SECTOR_SHIFT;
951         BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
952
953         bch2_trans_init(&trans, c, 0, 0);
954         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
955                                    BTREE_ITER_SLOTS);
956
957         bchfs_read(&trans, iter, rbio, inum, NULL);
958
959         bch2_trans_exit(&trans);
960 }
961
962 int bch2_readpage(struct file *file, struct page *page)
963 {
964         struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
965         struct bch_fs *c = inode->v.i_sb->s_fs_info;
966         struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
967         struct bch_read_bio *rbio;
968
969         rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
970         rbio->bio.bi_end_io = bch2_readpages_end_io;
971
972         __bchfs_readpage(c, rbio, inode->v.i_ino, page);
973         return 0;
974 }
975
976 static void bch2_read_single_page_end_io(struct bio *bio)
977 {
978         complete(bio->bi_private);
979 }
980
981 static int bch2_read_single_page(struct page *page,
982                                  struct address_space *mapping)
983 {
984         struct bch_inode_info *inode = to_bch_ei(mapping->host);
985         struct bch_fs *c = inode->v.i_sb->s_fs_info;
986         struct bch_read_bio *rbio;
987         int ret;
988         DECLARE_COMPLETION_ONSTACK(done);
989
990         rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read),
991                          io_opts(c, &inode->ei_inode));
992         rbio->bio.bi_private = &done;
993         rbio->bio.bi_end_io = bch2_read_single_page_end_io;
994
995         __bchfs_readpage(c, rbio, inode->v.i_ino, page);
996         wait_for_completion(&done);
997
998         ret = blk_status_to_errno(rbio->bio.bi_status);
999         bio_put(&rbio->bio);
1000
1001         if (ret < 0)
1002                 return ret;
1003
1004         SetPageUptodate(page);
1005         return 0;
1006 }
1007
1008 /* writepages: */
1009
1010 struct bch_writepage_state {
1011         struct bch_writepage_io *io;
1012         struct bch_io_opts      opts;
1013 };
1014
1015 static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
1016                                                                   struct bch_inode_info *inode)
1017 {
1018         return (struct bch_writepage_state) {
1019                 .opts = io_opts(c, &inode->ei_inode)
1020         };
1021 }
1022
1023 static void bch2_writepage_io_free(struct closure *cl)
1024 {
1025         struct bch_writepage_io *io = container_of(cl,
1026                                         struct bch_writepage_io, cl);
1027
1028         bio_put(&io->op.wbio.bio);
1029 }
1030
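/*
 * Writeback completion: on error, mark the pages errored and clear their
 * per-sector replica counts; likewise clear them when the write was
 * inlined into the btree.  Then fix up i_blocks by the i_sectors delta
 * and end writeback on each page once its write_count drops to zero.
 */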
1031 static void bch2_writepage_io_done(struct closure *cl)
1032 {
1033         struct bch_writepage_io *io = container_of(cl,
1034                                         struct bch_writepage_io, cl);
1035         struct bch_fs *c = io->op.c;
1036         struct bio *bio = &io->op.wbio.bio;
1037         struct bvec_iter_all iter;
1038         struct bio_vec *bvec;
1039         unsigned i;
1040
1041         if (io->op.error) {
1042                 bio_for_each_segment_all(bvec, bio, iter) {
1043                         struct bch_page_state *s;
1044
1045                         SetPageError(bvec->bv_page);
1046                         mapping_set_error(bvec->bv_page->mapping, -EIO);
1047
1048                         s = __bch2_page_state(bvec->bv_page);
1049                         spin_lock(&s->lock);
1050                         for (i = 0; i < PAGE_SECTORS; i++)
1051                                 s->s[i].nr_replicas = 0;
1052                         spin_unlock(&s->lock);
1053                 }
1054         }
1055
1056         if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
1057                 bio_for_each_segment_all(bvec, bio, iter) {
1058                         struct bch_page_state *s;
1059
1060                         s = __bch2_page_state(bvec->bv_page);
1061                         spin_lock(&s->lock);
1062                         for (i = 0; i < PAGE_SECTORS; i++)
1063                                 s->s[i].nr_replicas = 0;
1064                         spin_unlock(&s->lock);
1065                 }
1066         }
1067
1068         /*
1069          * racing with fallocate can cause us to add fewer sectors than
1070          * expected - but we shouldn't add more sectors than expected:
1071          */
1072         BUG_ON(io->op.i_sectors_delta > 0);
1073
1074         /*
1075          * (error (due to going RO) halfway through a page can screw that up
1076          * slightly)
1077          * XXX wtf?
1078            BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
1079          */
1080
1081         /*
1082          * PageWriteback is effectively our ref on the inode - fixup i_blocks
1083          * before calling end_page_writeback:
1084          */
1085         i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
1086
1087         bio_for_each_segment_all(bvec, bio, iter) {
1088                 struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
1089
1090                 if (atomic_dec_and_test(&s->write_count))
1091                         end_page_writeback(bvec->bv_page);
1092         }
1093
1094         closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
1095 }
1096
1097 static void bch2_writepage_do_io(struct bch_writepage_state *w)
1098 {
1099         struct bch_writepage_io *io = w->io;
1100
1101         w->io = NULL;
1102         closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
1103         continue_at(&io->cl, bch2_writepage_io_done, NULL);
1104 }
1105
1106 /*
1107  * Get a bch_writepage_io and add @page to it - appending to an existing one if
1108  * possible, else allocating a new one:
1109  */
1110 static void bch2_writepage_io_alloc(struct bch_fs *c,
1111                                     struct writeback_control *wbc,
1112                                     struct bch_writepage_state *w,
1113                                     struct bch_inode_info *inode,
1114                                     u64 sector,
1115                                     unsigned nr_replicas)
1116 {
1117         struct bch_write_op *op;
1118
1119         w->io = container_of(bio_alloc_bioset(GFP_NOFS,
1120                                               BIO_MAX_PAGES,
1121                                               &c->writepage_bioset),
1122                              struct bch_writepage_io, op.wbio.bio);
1123
1124         closure_init(&w->io->cl, NULL);
1125         w->io->inode            = inode;
1126
1127         op                      = &w->io->op;
1128         bch2_write_op_init(op, c, w->opts);
1129         op->target              = w->opts.foreground_target;
1130         op_journal_seq_set(op, &inode->ei_journal_seq);
1131         op->nr_replicas         = nr_replicas;
1132         op->res.nr_replicas     = nr_replicas;
1133         op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
1134         op->pos                 = POS(inode->v.i_ino, sector);
1135         op->wbio.bio.bi_iter.bi_sector = sector;
1136         op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
1137 }
1138
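/*
 * Write out a single page: zero any part straddling i_size, take a disk
 * reservation, compute the replica count for this write as the minimum
 * across the page's dirty sectors, then emit each contiguous dirty
 * range into a bch_writepage_io, starting new IOs as needed.
 */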
1139 static int __bch2_writepage(struct page *page,
1140                             struct writeback_control *wbc,
1141                             void *data)
1142 {
1143         struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
1144         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1145         struct bch_writepage_state *w = data;
1146         struct bch_page_state *s, orig;
1147         unsigned i, offset, nr_replicas_this_write = U32_MAX;
1148         loff_t i_size = i_size_read(&inode->v);
1149         pgoff_t end_index = i_size >> PAGE_SHIFT;
1150         int ret;
1151
1152         EBUG_ON(!PageUptodate(page));
1153
1154         /* Is the page fully inside i_size? */
1155         if (page->index < end_index)
1156                 goto do_io;
1157
1158         /* Is the page fully outside i_size? (truncate in progress) */
1159         offset = i_size & (PAGE_SIZE - 1);
1160         if (page->index > end_index || !offset) {
1161                 unlock_page(page);
1162                 return 0;
1163         }
1164
1165         /*
1166          * The page straddles i_size.  It must be zeroed out on each and every
1167          * writepage invocation because it may be mmapped.  "A file is mapped
1168          * in multiples of the page size.  For a file that is not a multiple of
1169          * the  page size, the remaining memory is zeroed when mapped, and
1170          * writes to that region are not written out to the file."
1171          */
1172         zero_user_segment(page, offset, PAGE_SIZE);
1173 do_io:
1174         s = bch2_page_state_create(page, __GFP_NOFAIL);
1175
1176         ret = bch2_get_page_disk_reservation(c, inode, page, true);
1177         if (ret) {
1178                 SetPageError(page);
1179                 mapping_set_error(page->mapping, ret);
1180                 unlock_page(page);
1181                 return 0;
1182         }
1183
1184         /* Before unlocking the page, get copy of reservations: */
1185         orig = *s;
1186
1187         for (i = 0; i < PAGE_SECTORS; i++) {
1188                 if (s->s[i].state < SECTOR_DIRTY)
1189                         continue;
1190
1191                 nr_replicas_this_write =
1192                         min_t(unsigned, nr_replicas_this_write,
1193                               s->s[i].nr_replicas +
1194                               s->s[i].replicas_reserved);
1195         }
1196
1197         for (i = 0; i < PAGE_SECTORS; i++) {
1198                 if (s->s[i].state < SECTOR_DIRTY)
1199                         continue;
1200
1201                 s->s[i].nr_replicas = w->opts.compression
1202                         ? 0 : nr_replicas_this_write;
1203
1204                 s->s[i].replicas_reserved = 0;
1205                 s->s[i].state = SECTOR_ALLOCATED;
1206         }
1207
1208         BUG_ON(atomic_read(&s->write_count));
1209         atomic_set(&s->write_count, 1);
1210
1211         BUG_ON(PageWriteback(page));
1212         set_page_writeback(page);
1213
1214         unlock_page(page);
1215
1216         offset = 0;
1217         while (1) {
1218                 unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
1219                 u64 sector;
1220
1221                 while (offset < PAGE_SECTORS &&
1222                        orig.s[offset].state < SECTOR_DIRTY)
1223                         offset++;
1224
1225                 if (offset == PAGE_SECTORS)
1226                         break;
1227
1228                 sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
1229
1230                 while (offset + sectors < PAGE_SECTORS &&
1231                        orig.s[offset + sectors].state >= SECTOR_DIRTY)
1232                         sectors++;
1233
1234                 for (i = offset; i < offset + sectors; i++) {
1235                         reserved_sectors += orig.s[i].replicas_reserved;
1236                         dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
1237                 }
1238
1239                 if (w->io &&
1240                     (w->io->op.res.nr_replicas != nr_replicas_this_write ||
1241                      bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
1242                      w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) ||
1243                      bio_end_sector(&w->io->op.wbio.bio) != sector))
1244                         bch2_writepage_do_io(w);
1245
1246                 if (!w->io)
1247                         bch2_writepage_io_alloc(c, wbc, w, inode, sector,
1248                                                 nr_replicas_this_write);
1249
1250                 atomic_inc(&s->write_count);
1251
1252                 BUG_ON(inode != w->io->inode);
1253                 BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page,
1254                                      sectors << 9, offset << 9));
1255
1256                 /* Check for writing past i_size: */
1257                 WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) >
1258                         round_up(i_size, block_bytes(c)));
1259
1260                 w->io->op.res.sectors += reserved_sectors;
1261                 w->io->op.i_sectors_delta -= dirty_sectors;
1262                 w->io->op.new_i_size = i_size;
1263
1264                 offset += sectors;
1265         }
1266
1267         if (atomic_dec_and_test(&s->write_count))
1268                 end_page_writeback(page);
1269
1270         return 0;
1271 }
1272
1273 int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
1274 {
1275         struct bch_fs *c = mapping->host->i_sb->s_fs_info;
1276         struct bch_writepage_state w =
1277                 bch_writepage_state_init(c, to_bch_ei(mapping->host));
1278         struct blk_plug plug;
1279         int ret;
1280
1281         blk_start_plug(&plug);
1282         ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
1283         if (w.io)
1284                 bch2_writepage_do_io(&w);
1285         blk_finish_plug(&plug);
1286         return ret;
1287 }
1288
1289 int bch2_writepage(struct page *page, struct writeback_control *wbc)
1290 {
1291         struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
1292         struct bch_writepage_state w =
1293                 bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
1294         int ret;
1295
1296         ret = __bch2_writepage(page, wbc, &w);
1297         if (w.io)
1298                 bch2_writepage_do_io(&w);
1299
1300         return ret;
1301 }
1302
1303 /* buffered writes: */
1304
1305 int bch2_write_begin(struct file *file, struct address_space *mapping,
1306                      loff_t pos, unsigned len, unsigned flags,
1307                      struct page **pagep, void **fsdata)
1308 {
1309         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1310         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1311         struct bch2_page_reservation *res;
1312         pgoff_t index = pos >> PAGE_SHIFT;
1313         unsigned offset = pos & (PAGE_SIZE - 1);
1314         struct page *page;
1315         int ret = -ENOMEM;
1316
1317         res = kmalloc(sizeof(*res), GFP_KERNEL);
1318         if (!res)
1319                 return -ENOMEM;
1320
1321         bch2_page_reservation_init(c, inode, res);
1322         *fsdata = res;
1323
1324         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
1325
1326         page = grab_cache_page_write_begin(mapping, index, flags);
1327         if (!page)
1328                 goto err_unlock;
1329
1330         if (PageUptodate(page))
1331                 goto out;
1332
1333         /* If we're writing entire page, don't need to read it in first: */
1334         if (len == PAGE_SIZE)
1335                 goto out;
1336
1337         if (!offset && pos + len >= inode->v.i_size) {
1338                 zero_user_segment(page, len, PAGE_SIZE);
1339                 flush_dcache_page(page);
1340                 goto out;
1341         }
1342
1343         if (index > inode->v.i_size >> PAGE_SHIFT) {
1344                 zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
1345                 flush_dcache_page(page);
1346                 goto out;
1347         }
1348 readpage:
1349         ret = bch2_read_single_page(page, mapping);
1350         if (ret)
1351                 goto err;
1352 out:
1353         ret = bch2_page_reservation_get(c, inode, page, res,
1354                                         offset, len, true);
1355         if (ret) {
1356                 if (!PageUptodate(page)) {
1357                         /*
1358                          * If the page hasn't been read in, we won't know if we
1359                          * actually need a reservation - we don't actually need
1360                          * to read here, we just need to check if the page is
1361                          * fully backed by uncompressed data:
1362                          */
1363                         goto readpage;
1364                 }
1365
1366                 goto err;
1367         }
1368
1369         *pagep = page;
1370         return 0;
1371 err:
1372         unlock_page(page);
1373         put_page(page);
1374         *pagep = NULL;
1375 err_unlock:
1376         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
1377         kfree(res);
1378         *fsdata = NULL;
1379         return ret;
1380 }
1381
1382 int bch2_write_end(struct file *file, struct address_space *mapping,
1383                    loff_t pos, unsigned len, unsigned copied,
1384                    struct page *page, void *fsdata)
1385 {
1386         struct bch_inode_info *inode = to_bch_ei(mapping->host);
1387         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1388         struct bch2_page_reservation *res = fsdata;
1389         unsigned offset = pos & (PAGE_SIZE - 1);
1390
1391         lockdep_assert_held(&inode->v.i_rwsem);
1392
1393         if (unlikely(copied < len && !PageUptodate(page))) {
1394                 /*
1395                  * The page needs to be read in, but that would destroy
1396                  * our partial write - simplest thing is to just force
1397                  * userspace to redo the write:
1398                  */
1399                 zero_user(page, 0, PAGE_SIZE);
1400                 flush_dcache_page(page);
1401                 copied = 0;
1402         }
1403
1404         spin_lock(&inode->v.i_lock);
1405         if (pos + copied > inode->v.i_size)
1406                 i_size_write(&inode->v, pos + copied);
1407         spin_unlock(&inode->v.i_lock);
1408
1409         if (copied) {
1410                 if (!PageUptodate(page))
1411                         SetPageUptodate(page);
1412
1413                 bch2_set_page_dirty(c, inode, page, res, offset, copied);
1414
1415                 inode->ei_last_dirtied = (unsigned long) current;
1416         }
1417
1418         unlock_page(page);
1419         put_page(page);
1420         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
1421
1422         bch2_page_reservation_put(c, inode, res);
1423         kfree(res);
1424
1425         return copied;
1426 }
1427
1428 #define WRITE_BATCH_PAGES       32
1429
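/*
 * One batch of a buffered write: grab up to WRITE_BATCH_PAGES pages,
 * read in first/last pages that are only partially overwritten, reserve
 * disk and quota space, copy from the iov, then update i_size and dirty
 * the pages.  Returns bytes copied, or an error if nothing was copied.
 */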
1430 static int __bch2_buffered_write(struct bch_inode_info *inode,
1431                                  struct address_space *mapping,
1432                                  struct iov_iter *iter,
1433                                  loff_t pos, unsigned len)
1434 {
1435         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1436         struct page *pages[WRITE_BATCH_PAGES];
1437         struct bch2_page_reservation res;
1438         unsigned long index = pos >> PAGE_SHIFT;
1439         unsigned offset = pos & (PAGE_SIZE - 1);
1440         unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
1441         unsigned i, reserved = 0, set_dirty = 0;
1442         unsigned copied = 0, nr_pages_copied = 0;
1443         int ret = 0;
1444
1445         BUG_ON(!len);
1446         BUG_ON(nr_pages > ARRAY_SIZE(pages));
1447
1448         bch2_page_reservation_init(c, inode, &res);
1449
1450         for (i = 0; i < nr_pages; i++) {
1451                 pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
1452                 if (!pages[i]) {
1453                         nr_pages = i;
1454                         if (!i) {
1455                                 ret = -ENOMEM;
1456                                 goto out;
1457                         }
1458                         len = min_t(unsigned, len,
1459                                     nr_pages * PAGE_SIZE - offset);
1460                         break;
1461                 }
1462         }
1463
1464         if (offset && !PageUptodate(pages[0])) {
1465                 ret = bch2_read_single_page(pages[0], mapping);
1466                 if (ret)
1467                         goto out;
1468         }
1469
1470         if ((pos + len) & (PAGE_SIZE - 1) &&
1471             !PageUptodate(pages[nr_pages - 1])) {
1472                 if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
1473                         zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
1474                 } else {
1475                         ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
1476                         if (ret)
1477                                 goto out;
1478                 }
1479         }
1480
1481         while (reserved < len) {
1482                 struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
1483                 unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
1484                 unsigned pg_len = min_t(unsigned, len - reserved,
1485                                         PAGE_SIZE - pg_offset);
1486 retry_reservation:
1487                 ret = bch2_page_reservation_get(c, inode, page, &res,
1488                                                 pg_offset, pg_len, true);
1489
1490                 if (ret && !PageUptodate(page)) {
1491                         ret = bch2_read_single_page(page, mapping);
1492                         if (!ret)
1493                                 goto retry_reservation;
1494                 }
1495
1496                 if (ret)
1497                         goto out;
1498
1499                 reserved += pg_len;
1500         }
1501
1502         if (mapping_writably_mapped(mapping))
1503                 for (i = 0; i < nr_pages; i++)
1504                         flush_dcache_page(pages[i]);
1505
1506         while (copied < len) {
1507                 struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
1508                 unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
1509                 unsigned pg_len = min_t(unsigned, len - copied,
1510                                         PAGE_SIZE - pg_offset);
1511                 unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
1512                                                 iter, pg_offset, pg_len);
1513
1514                 if (!pg_copied)
1515                         break;
1516
1517                 flush_dcache_page(page);
1518                 iov_iter_advance(iter, pg_copied);
1519                 copied += pg_copied;
1520         }
1521
1522         if (!copied)
1523                 goto out;
1524
1525         if (copied < len &&
1526             ((offset + copied) & (PAGE_SIZE - 1))) {
1527                 struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
1528
1529                 if (!PageUptodate(page)) {
1530                         zero_user(page, 0, PAGE_SIZE);
1531                         copied -= (offset + copied) & (PAGE_SIZE - 1);
1532                 }
1533         }
1534
1535         spin_lock(&inode->v.i_lock);
1536         if (pos + copied > inode->v.i_size)
1537                 i_size_write(&inode->v, pos + copied);
1538         spin_unlock(&inode->v.i_lock);
1539
1540         while (set_dirty < copied) {
1541                 struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
1542                 unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
1543                 unsigned pg_len = min_t(unsigned, copied - set_dirty,
1544                                         PAGE_SIZE - pg_offset);
1545
1546                 if (!PageUptodate(page))
1547                         SetPageUptodate(page);
1548
1549                 bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len);
1550                 unlock_page(page);
1551                 put_page(page);
1552
1553                 set_dirty += pg_len;
1554         }
1555
1556         nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
1557         inode->ei_last_dirtied = (unsigned long) current;
1558 out:
1559         for (i = nr_pages_copied; i < nr_pages; i++) {
1560                 unlock_page(pages[i]);
1561                 put_page(pages[i]);
1562         }
1563
1564         bch2_page_reservation_put(c, inode, &res);
1565
1566         return copied ?: ret;
1567 }
1568
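     /*
      * Buffered write path: copy from the iov_iter into the page cache in
      * batches of up to WRITE_BATCH_PAGES pages via __bch2_buffered_write();
      * user pages are faulted in before each batch, and the batch shrinks to
      * a single page/segment when the atomic usercopy can't make progress,
      * so the loop always terminates.
      */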
1569 static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1570 {
1571         struct file *file = iocb->ki_filp;
1572         struct address_space *mapping = file->f_mapping;
1573         struct bch_inode_info *inode = file_bch_inode(file);
1574         loff_t pos = iocb->ki_pos;
1575         ssize_t written = 0;
1576         int ret = 0;
1577
1578         bch2_pagecache_add_get(&inode->ei_pagecache_lock);
1579
1580         do {
1581                 unsigned offset = pos & (PAGE_SIZE - 1);
1582                 unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
1583                               PAGE_SIZE * WRITE_BATCH_PAGES - offset);
1584 again:
1585                 /*
1586                  * Bring in the user page that we will copy from _first_.
1587                  * Otherwise there's a nasty deadlock on copying from the
1588                  * same page as we're writing to, without it being marked
1589                  * up-to-date.
1590                  *
1591                  * Not only is this an optimisation, but it is also required
1592                  * to check that the address is actually valid, when atomic
1593                  * usercopies are used, below.
1594                  */
1595                 if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
1596                         bytes = min_t(unsigned long, iov_iter_count(iter),
1597                                       PAGE_SIZE - offset);
1598
1599                         if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
1600                                 ret = -EFAULT;
1601                                 break;
1602                         }
1603                 }
1604
1605                 if (unlikely(fatal_signal_pending(current))) {
1606                         ret = -EINTR;
1607                         break;
1608                 }
1609
1610                 ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
1611                 if (unlikely(ret < 0))
1612                         break;
1613
1614                 cond_resched();
1615
1616                 if (unlikely(ret == 0)) {
1617                         /*
1618                          * If we were unable to copy any data at all, we must
1619                          * fall back to a single segment length write.
1620                          *
1621                          * If we didn't fallback here, we could livelock
1622                          * because not all segments in the iov can be copied at
1623                          * once without a pagefault.
1624                          */
1625                         bytes = min_t(unsigned long, PAGE_SIZE - offset,
1626                                       iov_iter_single_seg_count(iter));
1627                         goto again;
1628                 }
1629                 pos += ret;
1630                 written += ret;
1631
1632                 balance_dirty_pages_ratelimited(mapping);
1633         } while (iov_iter_count(iter));
1634
1635         bch2_pagecache_add_put(&inode->ei_pagecache_lock);
1636
1637         return written ? written : ret;
1638 }
1639
1640 /* O_DIRECT reads */
1641
1642 static void bch2_dio_read_complete(struct closure *cl)
1643 {
1644         struct dio_read *dio = container_of(cl, struct dio_read, cl);
1645
1646         dio->req->ki_complete(dio->req, dio->ret, 0);
1647         bio_check_pages_dirty(&dio->rbio.bio);  /* transfers ownership */
1648 }
1649
1650 static void bch2_direct_IO_read_endio(struct bio *bio)
1651 {
1652         struct dio_read *dio = bio->bi_private;
1653
1654         if (bio->bi_status)
1655                 dio->ret = blk_status_to_errno(bio->bi_status);
1656
1657         closure_put(&dio->cl);
1658 }
1659
1660 static void bch2_direct_IO_read_split_endio(struct bio *bio)
1661 {
1662         bch2_direct_IO_read_endio(bio);
1663         bio_check_pages_dirty(bio);     /* transfers ownership */
1664 }
1665
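     /*
      * O_DIRECT reads are issued as one or more bios that pin the user pages
      * via bio_iov_iter_get_pages(). All bios share the single closure
      * embedded in struct dio_read: for async requests the closure's
      * destructor completes the kiocb, for sync requests we just wait on it.
      */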
1666 static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
1667 {
1668         struct file *file = req->ki_filp;
1669         struct bch_inode_info *inode = file_bch_inode(file);
1670         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1671         struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
1672         struct dio_read *dio;
1673         struct bio *bio;
1674         loff_t offset = req->ki_pos;
1675         bool sync = is_sync_kiocb(req);
1676         size_t shorten;
1677         ssize_t ret;
1678
1679         if ((offset|iter->count) & (block_bytes(c) - 1))
1680                 return -EINVAL;
1681
1682         ret = min_t(loff_t, iter->count,
1683                     max_t(loff_t, 0, i_size_read(&inode->v) - offset));
1684
1685         if (!ret)
1686                 return ret;
1687
1688         shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
1689         iter->count -= shorten;
1690
1691         bio = bio_alloc_bioset(GFP_KERNEL,
1692                                iov_iter_npages(iter, BIO_MAX_PAGES),
1693                                &c->dio_read_bioset);
1694
1695         bio->bi_end_io = bch2_direct_IO_read_endio;
1696
1697         dio = container_of(bio, struct dio_read, rbio.bio);
1698         closure_init(&dio->cl, NULL);
1699
1700         /*
1701          * this is a _really_ horrible hack just to avoid an atomic sub at the
1702          * end:
1703          */
1704         if (!sync) {
1705                 set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
1706                 atomic_set(&dio->cl.remaining,
1707                            CLOSURE_REMAINING_INITIALIZER -
1708                            CLOSURE_RUNNING +
1709                            CLOSURE_DESTRUCTOR);
1710         } else {
1711                 atomic_set(&dio->cl.remaining,
1712                            CLOSURE_REMAINING_INITIALIZER + 1);
1713         }
1714
1715         dio->req        = req;
1716         dio->ret        = ret;
1717
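             /*
              * The first bio embeds struct dio_read and was allocated from
              * dio_read_bioset; jump into the middle of the loop so that any
              * further bios needed to cover the rest of the iter come from
              * the plain read bioset with the split endio instead:
              */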
1718         goto start;
1719         while (iter->count) {
1720                 bio = bio_alloc_bioset(GFP_KERNEL,
1721                                        iov_iter_npages(iter, BIO_MAX_PAGES),
1722                                        &c->bio_read);
1723                 bio->bi_end_io          = bch2_direct_IO_read_split_endio;
1724 start:
1725                 bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
1726                 bio->bi_iter.bi_sector  = offset >> 9;
1727                 bio->bi_private         = dio;
1728
1729                 ret = bio_iov_iter_get_pages(bio, iter);
1730                 if (ret < 0) {
1731                         /* XXX: fault inject this path */
1732                         bio->bi_status = BLK_STS_RESOURCE;
1733                         bio_endio(bio);
1734                         break;
1735                 }
1736
1737                 offset += bio->bi_iter.bi_size;
1738                 bio_set_pages_dirty(bio);
1739
1740                 if (iter->count)
1741                         closure_get(&dio->cl);
1742
1743                 bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
1744         }
1745
1746         iter->count += shorten;
1747
1748         if (sync) {
1749                 closure_sync(&dio->cl);
1750                 closure_debug_destroy(&dio->cl);
1751                 ret = dio->ret;
1752                 bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
1753                 return ret;
1754         } else {
1755                 return -EIOCBQUEUED;
1756         }
1757 }
1758
1759 ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1760 {
1761         struct file *file = iocb->ki_filp;
1762         struct bch_inode_info *inode = file_bch_inode(file);
1763         struct address_space *mapping = file->f_mapping;
1764         size_t count = iov_iter_count(iter);
1765         ssize_t ret;
1766
1767         if (!count)
1768                 return 0; /* skip atime */
1769
1770         if (iocb->ki_flags & IOCB_DIRECT) {
1771                 struct blk_plug plug;
1772
1773                 ret = filemap_write_and_wait_range(mapping,
1774                                         iocb->ki_pos,
1775                                         iocb->ki_pos + count - 1);
1776                 if (ret < 0)
1777                         return ret;
1778
1779                 file_accessed(file);
1780
1781                 blk_start_plug(&plug);
1782                 ret = bch2_direct_IO_read(iocb, iter);
1783                 blk_finish_plug(&plug);
1784
1785                 if (ret >= 0)
1786                         iocb->ki_pos += ret;
1787         } else {
1788                 bch2_pagecache_add_get(&inode->ei_pagecache_lock);
1789                 ret = generic_file_read_iter(iocb, iter);
1790                 bch2_pagecache_add_put(&inode->ei_pagecache_lock);
1791         }
1792
1793         return ret;
1794 }
1795
1796 /* O_DIRECT writes */
1797
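     /*
      * The dio write loop is both the submission and the completion path:
      * for async requests bch2_dio_write_loop_async() re-enters it (with
      * dio->loop set) from the write op's end_io hook, while sync requests
      * wait on dio->done and continue inline. Each pass pins the next chunk
      * of user pages, submits a bch2_write op, then on completion accounts
      * i_sectors and updates i_size before starting the next chunk.
      */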
1798 static long bch2_dio_write_loop(struct dio_write *dio)
1799 {
1800         bool kthread = (current->flags & PF_KTHREAD) != 0;
1801         struct bch_fs *c = dio->op.c;
1802         struct kiocb *req = dio->req;
1803         struct address_space *mapping = req->ki_filp->f_mapping;
1804         struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
1805         struct bio *bio = &dio->op.wbio.bio;
1806         struct bvec_iter_all iter;
1807         struct bio_vec *bv;
1808         unsigned unaligned;
1809         u64 new_i_size;
1810         bool sync;
1811         long ret;
1812
1813         if (dio->loop)
1814                 goto loop;
1815
1816         while (1) {
1817                 if (kthread)
1818                         use_mm(dio->mm);
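                     /*
                      * Note the mapping we're pinning pages from in the task
                      * struct so the page fault path can avoid a pagecache
                      * lock deadlock when the write's source buffer is
                      * mmapped from the same file; cleared again once
                      * bio_iov_iter_get_pages() is done:
                      */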
1819                 BUG_ON(current->faults_disabled_mapping);
1820                 current->faults_disabled_mapping = mapping;
1821
1822                 ret = bio_iov_iter_get_pages(bio, &dio->iter);
1823
1824                 current->faults_disabled_mapping = NULL;
1825                 if (kthread)
1826                         unuse_mm(dio->mm);
1827
1828                 if (unlikely(ret < 0))
1829                         goto err;
1830
1831                 unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
1832                 bio->bi_iter.bi_size -= unaligned;
1833                 iov_iter_revert(&dio->iter, unaligned);
1834
1835                 if (!bio->bi_iter.bi_size) {
1836                         /*
1837                          * bio_iov_iter_get_pages was only able to get <
1838                          * blocksize worth of pages:
1839                          */
1840                         bio_for_each_segment_all(bv, bio, iter)
1841                                 put_page(bv->bv_page);
1842                         ret = -EFAULT;
1843                         goto err;
1844                 }
1845
1846                 dio->op.pos = POS(inode->v.i_ino,
1847                                   (req->ki_pos >> 9) + dio->op.written);
1848
1849                 task_io_account_write(bio->bi_iter.bi_size);
1850
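                     /*
                      * For async writes we return to the caller before the
                      * write completes, so the caller's iovec may disappear
                      * out from under us: if there's more of the iter left
                      * to submit, take our own copy of the iovec array
                      * (using the inline vecs when it's small enough):
                      */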
1851                 if (!dio->sync && !dio->loop && dio->iter.count) {
1852                         struct iovec *iov = dio->inline_vecs;
1853
1854                         if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
1855                                 iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
1856                                               GFP_KERNEL);
1857                                 if (unlikely(!iov)) {
1858                                         dio->sync = true;
1859                                         goto do_io;
1860                                 }
1861
1862                                 dio->free_iov = true;
1863                         }
1864
1865                         memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
1866                         dio->iter.iov = iov;
1867                 }
1868 do_io:
1869                 dio->loop = true;
1870                 closure_call(&dio->op.cl, bch2_write, NULL, NULL);
1871
1872                 if (dio->sync)
1873                         wait_for_completion(&dio->done);
1874                 else
1875                         return -EIOCBQUEUED;
1876 loop:
1877                 i_sectors_acct(c, inode, &dio->quota_res,
1878                                dio->op.i_sectors_delta);
1879                 dio->op.i_sectors_delta = 0;
1880
1881                 new_i_size = req->ki_pos + ((u64) dio->op.written << 9);
1882
1883                 spin_lock(&inode->v.i_lock);
1884                 if (new_i_size > inode->v.i_size)
1885                         i_size_write(&inode->v, new_i_size);
1886                 spin_unlock(&inode->v.i_lock);
1887
1888                 bio_for_each_segment_all(bv, bio, iter)
1889                         put_page(bv->bv_page);
1890                 if (!dio->iter.count || dio->op.error)
1891                         break;
1892
1893                 bio_reset(bio);
1894                 reinit_completion(&dio->done);
1895         }
1896
1897         ret = dio->op.error ?: ((long) dio->op.written << 9);
1898 err:
1899         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
1900         bch2_disk_reservation_put(c, &dio->op.res);
1901         bch2_quota_reservation_put(c, inode, &dio->quota_res);
1902
1903         if (dio->free_iov)
1904                 kfree(dio->iter.iov);
1905
1906         sync = dio->sync;
1907         bio_put(bio);
1908
1909         /* inode->i_dio_count is our ref on inode and thus bch_fs */
1910         inode_dio_end(&inode->v);
1911
1912         if (!sync) {
1913                 req->ki_complete(req, ret, 0);
1914                 ret = -EIOCBQUEUED;
1915         }
1916         return ret;
1917 }
1918
1919 static void bch2_dio_write_loop_async(struct bch_write_op *op)
1920 {
1921         struct dio_write *dio = container_of(op, struct dio_write, op);
1922
1923         if (dio->sync)
1924                 complete(&dio->done);
1925         else
1926                 bch2_dio_write_loop(dio);
1927 }
1928
1929 static noinline
1930 ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
1931 {
1932         struct file *file = req->ki_filp;
1933         struct address_space *mapping = file->f_mapping;
1934         struct bch_inode_info *inode = file_bch_inode(file);
1935         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1936         struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
1937         struct dio_write *dio;
1938         struct bio *bio;
1939         bool locked = true, extending;
1940         ssize_t ret;
1941
1942         prefetch(&c->opts);
1943         prefetch((void *) &c->opts + 64);
1944         prefetch(&inode->ei_inode);
1945         prefetch((void *) &inode->ei_inode + 64);
1946
1947         inode_lock(&inode->v);
1948
1949         ret = generic_write_checks(req, iter);
1950         if (unlikely(ret <= 0))
1951                 goto err;
1952
1953         ret = file_remove_privs(file);
1954         if (unlikely(ret))
1955                 goto err;
1956
1957         ret = file_update_time(file);
1958         if (unlikely(ret))
1959                 goto err;
1960
             ret = -EINVAL;
1961         if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
1962                 goto err;
1963
1964         inode_dio_begin(&inode->v);
1965         bch2_pagecache_block_get(&inode->ei_pagecache_lock);
1966
1967         extending = req->ki_pos + iter->count > inode->v.i_size;
1968         if (!extending) {
1969                 inode_unlock(&inode->v);
1970                 locked = false;
1971         }
1972
1973         bio = bio_alloc_bioset(GFP_KERNEL,
1974                                iov_iter_npages(iter, BIO_MAX_PAGES),
1975                                &c->dio_write_bioset);
1976         dio = container_of(bio, struct dio_write, op.wbio.bio);
1977         init_completion(&dio->done);
1978         dio->req                = req;
1979         dio->mm                 = current->mm;
1980         dio->loop               = false;
1981         dio->sync               = is_sync_kiocb(req) || extending;
1982         dio->free_iov           = false;
1983         dio->quota_res.sectors  = 0;
1984         dio->iter               = *iter;
1985
1986         bch2_write_op_init(&dio->op, c, opts);
1987         dio->op.end_io          = bch2_dio_write_loop_async;
1988         dio->op.target          = opts.foreground_target;
1989         op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
1990         dio->op.write_point     = writepoint_hashed((unsigned long) current);
1991         dio->op.flags |= BCH_WRITE_NOPUT_RESERVATION;
1992
1993         if ((req->ki_flags & IOCB_DSYNC) &&
1994             !c->opts.journal_flush_disabled)
1995                 dio->op.flags |= BCH_WRITE_FLUSH;
1996
1997         ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
1998                                          iter->count >> 9, true);
1999         if (unlikely(ret))
2000                 goto err_put_bio;
2001
2002         dio->op.nr_replicas     = dio->op.opts.data_replicas;
2003
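             /*
              * If we couldn't get a full disk reservation, the write can
              * still go ahead if the entire range is already allocated on
              * disk - we're purely overwriting, so disk usage won't increase
              * and the up front reservation isn't needed:
              */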
2004         ret = bch2_disk_reservation_get(c, &dio->op.res, iter->count >> 9,
2005                                         dio->op.opts.data_replicas, 0);
2006         if (unlikely(ret) &&
2007             !bch2_check_range_allocated(c, POS(inode->v.i_ino,
2008                                                req->ki_pos >> 9),
2009                                         iter->count >> 9,
2010                                         dio->op.opts.data_replicas))
2011                 goto err_put_bio;
2012
2013         ret = write_invalidate_inode_pages_range(mapping,
2014                                         req->ki_pos,
2015                                         req->ki_pos + iter->count - 1);
2016         if (unlikely(ret))
2017                 goto err_put_bio;
2018
2019         ret = bch2_dio_write_loop(dio);
2020 err:
2021         if (locked)
2022                 inode_unlock(&inode->v);
2023         if (ret > 0)
2024                 req->ki_pos += ret;
2025         return ret;
2026 err_put_bio:
2027         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
2028         bch2_disk_reservation_put(c, &dio->op.res);
2029         bch2_quota_reservation_put(c, inode, &dio->quota_res);
2030         bio_put(bio);
2031         inode_dio_end(&inode->v);
2032         goto err;
2033 }
2034
2035 ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
2036 {
2037         struct file *file = iocb->ki_filp;
2038         struct bch_inode_info *inode = file_bch_inode(file);
2039         ssize_t ret;
2040
2041         if (iocb->ki_flags & IOCB_DIRECT)
2042                 return bch2_direct_write(iocb, from);
2043
2044         /* We can write back this queue in page reclaim */
2045         current->backing_dev_info = inode_to_bdi(&inode->v);
2046         inode_lock(&inode->v);
2047
2048         ret = generic_write_checks(iocb, from);
2049         if (ret <= 0)
2050                 goto unlock;
2051
2052         ret = file_remove_privs(file);
2053         if (ret)
2054                 goto unlock;
2055
2056         ret = file_update_time(file);
2057         if (ret)
2058                 goto unlock;
2059
2060         ret = bch2_buffered_write(iocb, from);
2061         if (likely(ret > 0))
2062                 iocb->ki_pos += ret;
2063 unlock:
2064         inode_unlock(&inode->v);
2065         current->backing_dev_info = NULL;
2066
2067         if (ret > 0)
2068                 ret = generic_write_sync(iocb, ret);
2069
2070         return ret;
2071 }
2072
2073 /* fsync: */
2074
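     /*
      * fsync is three steps: flush dirty pages, write out any dirty inode
      * metadata, then flush the journal up to the last sequence number this
      * inode was dirtied in (ei_journal_seq) so the corresponding btree
      * updates are persisted.
      */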
2075 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2076 {
2077         struct bch_inode_info *inode = file_bch_inode(file);
2078         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2079         int ret, ret2;
2080
2081         ret = file_write_and_wait_range(file, start, end);
2082         if (ret)
2083                 return ret;
2084
2085         if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
2086                 goto out;
2087
2088         ret = sync_inode_metadata(&inode->v, 1);
2089         if (ret)
2090                 return ret;
2091 out:
2092         if (!c->opts.journal_flush_disabled)
2093                 ret = bch2_journal_flush_seq(&c->journal,
2094                                              inode->ei_journal_seq);
2095         ret2 = file_check_and_advance_wb_err(file);
2096
2097         return ret ?: ret2;
2098 }
2099
2100 /* truncate: */
2101
2102 static inline int range_has_data(struct bch_fs *c,
2103                                   struct bpos start,
2104                                   struct bpos end)
2105 {
2106         struct btree_trans trans;
2107         struct btree_iter *iter;
2108         struct bkey_s_c k;
2109         int ret = 0;
2110
2111         bch2_trans_init(&trans, c, 0, 0);
2112
2113         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) {
2114                 if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
2115                         break;
2116
2117                 if (bkey_extent_is_data(k.k)) {
2118                         ret = 1;
2119                         break;
2120                 }
2121         }
2122
2123         return bch2_trans_exit(&trans) ?: ret;
2124 }
2125
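     /*
      * Zero out the portion of a page left behind by a truncate or hole
      * punch: partial blocks at the boundary are read in (if there's data on
      * disk for them), zeroed in the page cache and redirtied so the zeroes
      * get written out; whole blocks inside the range are just marked
      * unallocated in the page state.
      */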
2126 static int __bch2_truncate_page(struct bch_inode_info *inode,
2127                                 pgoff_t index, loff_t start, loff_t end)
2128 {
2129         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2130         struct address_space *mapping = inode->v.i_mapping;
2131         struct bch_page_state *s;
2132         unsigned start_offset = start & (PAGE_SIZE - 1);
2133         unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
2134         unsigned i;
2135         struct page *page;
2136         int ret = 0;
2137
2138         /* Page boundary? Nothing to do */
2139         if (!((index == start >> PAGE_SHIFT && start_offset) ||
2140               (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
2141                 return 0;
2142
2143         /* Above i_size? */
2144         if (index << PAGE_SHIFT >= inode->v.i_size)
2145                 return 0;
2146
2147         page = find_lock_page(mapping, index);
2148         if (!page) {
2149                 /*
2150                  * XXX: we're doing two index lookups when we end up reading the
2151                  * page
2152                  */
2153                 ret = range_has_data(c,
2154                                 POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
2155                                 POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
2156                 if (ret <= 0)
2157                         return ret;
2158
2159                 page = find_or_create_page(mapping, index, GFP_KERNEL);
2160                 if (unlikely(!page)) {
2161                         ret = -ENOMEM;
2162                         goto out;
2163                 }
2164         }
2165
2166         s = bch2_page_state_create(page, 0);
2167         if (!s) {
2168                 ret = -ENOMEM;
2169                 goto unlock;
2170         }
2171
2172         if (!PageUptodate(page)) {
2173                 ret = bch2_read_single_page(page, mapping);
2174                 if (ret)
2175                         goto unlock;
2176         }
2177
2178         if (index != start >> PAGE_SHIFT)
2179                 start_offset = 0;
2180         if (index != end >> PAGE_SHIFT)
2181                 end_offset = PAGE_SIZE;
2182
2183         for (i = round_up(start_offset, block_bytes(c)) >> 9;
2184              i < round_down(end_offset, block_bytes(c)) >> 9;
2185              i++) {
2186                 s->s[i].nr_replicas     = 0;
2187                 s->s[i].state           = SECTOR_UNALLOCATED;
2188         }
2189
2190         zero_user_segment(page, start_offset, end_offset);
2191
2192         /*
2193          * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
2194          *
2195          * XXX: because we aren't currently tracking whether the page has actual
2196          * data in it (vs. just 0s, or only partially written) this is wrong. ick.
2197          */
2198         ret = bch2_get_page_disk_reservation(c, inode, page, false);
2199         BUG_ON(ret);
2200
2201         __set_page_dirty_nobuffers(page);
2202 unlock:
2203         unlock_page(page);
2204         put_page(page);
2205 out:
2206         return ret;
2207 }
2208
2209 static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
2210 {
2211         return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
2212                                     from, round_up(from, PAGE_SIZE));
2213 }
2214
2215 static int bch2_extend(struct bch_inode_info *inode,
2216                        struct bch_inode_unpacked *inode_u,
2217                        struct iattr *iattr)
2218 {
2219         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2220         struct address_space *mapping = inode->v.i_mapping;
2221         int ret;
2222
2223         /*
2224          * sync appends:
2225          *
2226          * this has to be done _before_ extending i_size:
2227          */
2228         ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
2229         if (ret)
2230                 return ret;
2231
2232         truncate_setsize(&inode->v, iattr->ia_size);
2233         setattr_copy(&inode->v, iattr);
2234
2235         mutex_lock(&inode->ei_update_lock);
2236         ret = bch2_write_inode_size(c, inode, inode->v.i_size,
2237                                     ATTR_MTIME|ATTR_CTIME);
2238         mutex_unlock(&inode->ei_update_lock);
2239
2240         return ret;
2241 }
2242
2243 static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
2244                                    struct bch_inode_unpacked *bi,
2245                                    void *p)
2246 {
2247         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2248
2249         bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
2250         bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
2251         return 0;
2252 }
2253
2254 static int bch2_truncate_start_fn(struct bch_inode_info *inode,
2255                                   struct bch_inode_unpacked *bi, void *p)
2256 {
2257         u64 *new_i_size = p;
2258
2259         bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
2260         bi->bi_size = *new_i_size;
2261         return 0;
2262 }
2263
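     /*
      * Truncate is done in two phases on disk: first BCH_INODE_I_SIZE_DIRTY
      * is set and the new bi_size written (so an interrupted truncate can be
      * finished by fsck/recovery), then the extents past the new size are
      * deleted with bch2_fpunch() and the dirty flag is cleared again in
      * bch2_truncate_finish_fn().
      */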
2264 int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
2265 {
2266         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2267         struct address_space *mapping = inode->v.i_mapping;
2268         struct bch_inode_unpacked inode_u;
2269         struct btree_trans trans;
2270         struct btree_iter *iter;
2271         u64 new_i_size = iattr->ia_size;
2272         s64 i_sectors_delta = 0;
2273         int ret = 0;
2274
2275         inode_dio_wait(&inode->v);
2276         bch2_pagecache_block_get(&inode->ei_pagecache_lock);
2277
2278         /*
2279          * fetch current on disk i_size: inode is locked, i_size can only
2280          * increase underneath us:
2281          */
2282         bch2_trans_init(&trans, c, 0, 0);
2283         iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0);
2284         ret = PTR_ERR_OR_ZERO(iter);
2285         bch2_trans_exit(&trans);
2286
2287         if (ret)
2288                 goto err;
2289
2290         BUG_ON(inode->v.i_size < inode_u.bi_size);
2291
2292         if (iattr->ia_size > inode->v.i_size) {
2293                 ret = bch2_extend(inode, &inode_u, iattr);
2294                 goto err;
2295         }
2296
2297         ret = bch2_truncate_page(inode, iattr->ia_size);
2298         if (unlikely(ret))
2299                 goto err;
2300
2301         /*
2302          * When extending, we're going to write the new i_size to disk
2303          * immediately so we need to flush anything above the current on disk
2304          * i_size first:
2305          *
2306          * Also, when extending we need to flush the page that i_size currently
2307          * straddles - if it's mapped to userspace, we need to ensure that
2308          * userspace has to redirty it and call .mkwrite -> set_page_dirty
2309          * again to allocate the part of the page that was extended.
2310          */
2311         if (iattr->ia_size > inode_u.bi_size)
2312                 ret = filemap_write_and_wait_range(mapping,
2313                                 inode_u.bi_size,
2314                                 iattr->ia_size - 1);
2315         else if (iattr->ia_size & (PAGE_SIZE - 1))
2316                 ret = filemap_write_and_wait_range(mapping,
2317                                 round_down(iattr->ia_size, PAGE_SIZE),
2318                                 iattr->ia_size - 1);
2319         if (ret)
2320                 goto err;
2321
2322         mutex_lock(&inode->ei_update_lock);
2323         ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
2324                                &new_i_size, 0);
2325         mutex_unlock(&inode->ei_update_lock);
2326
2327         if (unlikely(ret))
2328                 goto err;
2329
2330         truncate_setsize(&inode->v, iattr->ia_size);
2331
2332         ret = bch2_fpunch(c, inode->v.i_ino,
2333                         round_up(iattr->ia_size, block_bytes(c)) >> 9,
2334                         U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
2335         i_sectors_acct(c, inode, NULL, i_sectors_delta);
2336
2337         if (unlikely(ret))
2338                 goto err;
2339
2340         setattr_copy(&inode->v, iattr);
2341
2342         mutex_lock(&inode->ei_update_lock);
2343         ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL,
2344                                ATTR_MTIME|ATTR_CTIME);
2345         mutex_unlock(&inode->ei_update_lock);
2346 err:
2347         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
2348         return ret;
2349 }
2350
2351 /* fallocate: */
2352
2353 static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
2354 {
2355         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2356         u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
2357         u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
2358         int ret = 0;
2359
2360         inode_lock(&inode->v);
2361         inode_dio_wait(&inode->v);
2362         bch2_pagecache_block_get(&inode->ei_pagecache_lock);
2363
2364         ret = __bch2_truncate_page(inode,
2365                                    offset >> PAGE_SHIFT,
2366                                    offset, offset + len);
2367         if (unlikely(ret))
2368                 goto err;
2369
2370         if (offset >> PAGE_SHIFT !=
2371             (offset + len) >> PAGE_SHIFT) {
2372                 ret = __bch2_truncate_page(inode,
2373                                            (offset + len) >> PAGE_SHIFT,
2374                                            offset, offset + len);
2375                 if (unlikely(ret))
2376                         goto err;
2377         }
2378
2379         truncate_pagecache_range(&inode->v, offset, offset + len - 1);
2380
2381         if (discard_start < discard_end) {
2382                 s64 i_sectors_delta = 0;
2383
2384                 ret = bch2_fpunch(c, inode->v.i_ino,
2385                                   discard_start, discard_end,
2386                                   &inode->ei_journal_seq,
2387                                   &i_sectors_delta);
2388                 i_sectors_acct(c, inode, NULL, i_sectors_delta);
2389         }
2390 err:
2391         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
2392         inode_unlock(&inode->v);
2393
2394         return ret;
2395 }
2396
2397 static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
2398                                    loff_t offset, loff_t len,
2399                                    bool insert)
2400 {
2401         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2402         struct address_space *mapping = inode->v.i_mapping;
2403         struct bkey_on_stack copy;
2404         struct btree_trans trans;
2405         struct btree_iter *src, *dst, *del = NULL;
2406         loff_t shift, new_size;
2407         u64 src_start;
2408         int ret;
2409
2410         if ((offset | len) & (block_bytes(c) - 1))
2411                 return -EINVAL;
2412
2413         bkey_on_stack_init(&copy);
2414         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
2415
2416         /*
2417          * We need i_mutex to keep the page cache consistent with the extents
2418          * btree, and the btree consistent with i_size - we don't need outside
2419          * locking for the extents btree itself, because we're using linked
2420          * iterators
2421          */
2422         inode_lock(&inode->v);
2423         inode_dio_wait(&inode->v);
2424         bch2_pagecache_block_get(&inode->ei_pagecache_lock);
2425
2426         if (insert) {
2427                 ret = -EFBIG;
2428                 if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
2429                         goto err;
2430
2431                 ret = -EINVAL;
2432                 if (offset >= inode->v.i_size)
2433                         goto err;
2434
2435                 src_start       = U64_MAX;
2436                 shift           = len;
2437         } else {
2438                 ret = -EINVAL;
2439                 if (offset + len >= inode->v.i_size)
2440                         goto err;
2441
2442                 src_start       = offset + len;
2443                 shift           = -len;
2444         }
2445
2446         new_size = inode->v.i_size + shift;
2447
2448         ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
2449         if (ret)
2450                 goto err;
2451
2452         if (insert) {
2453                 i_size_write(&inode->v, new_size);
2454                 mutex_lock(&inode->ei_update_lock);
2455                 ret = bch2_write_inode_size(c, inode, new_size,
2456                                             ATTR_MTIME|ATTR_CTIME);
2457                 mutex_unlock(&inode->ei_update_lock);
2458         } else {
2459                 s64 i_sectors_delta = 0;
2460
2461                 ret = bch2_fpunch(c, inode->v.i_ino,
2462                                   offset >> 9, (offset + len) >> 9,
2463                                   &inode->ei_journal_seq,
2464                                   &i_sectors_delta);
2465                 i_sectors_acct(c, inode, NULL, i_sectors_delta);
2466
2467                 if (ret)
2468                         goto err;
2469         }
2470
2471         src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
2472                         POS(inode->v.i_ino, src_start >> 9),
2473                         BTREE_ITER_INTENT);
2474         BUG_ON(IS_ERR_OR_NULL(src));
2475
2476         dst = bch2_trans_copy_iter(&trans, src);
2477         BUG_ON(IS_ERR_OR_NULL(dst));
2478
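             /*
              * Walk the extents being shifted - backwards for insert range,
              * forwards for collapse range, so we never overwrite an extent
              * we haven't moved yet - and for each one commit a copy at the
              * shifted position plus a delete at the old position in a
              * single transaction:
              */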
2479         while (1) {
2480                 struct disk_reservation disk_res =
2481                         bch2_disk_reservation_init(c, 0);
2482                 struct bkey_i delete;
2483                 struct bkey_s_c k;
2484                 struct bpos next_pos;
2485                 struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
2486                 struct bpos atomic_end;
2487                 unsigned commit_flags = BTREE_INSERT_NOFAIL|
2488                         BTREE_INSERT_ATOMIC|
2489                         BTREE_INSERT_USE_RESERVE;
2490
2491                 k = insert
2492                         ? bch2_btree_iter_peek_prev(src)
2493                         : bch2_btree_iter_peek(src);
2494                 if ((ret = bkey_err(k)))
2495                         goto bkey_err;
2496
2497                 if (!k.k || k.k->p.inode != inode->v.i_ino)
2498                         break;
2499
2500                 BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k)));
2501
2502                 if (insert &&
2503                     bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
2504                         break;
2505 reassemble:
2506                 bkey_on_stack_reassemble(&copy, c, k);
2507
2508                 if (insert &&
2509                     bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) {
2510                         bch2_cut_front(move_pos, copy.k);
2511                         bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k->k));
2512                 }
2513
2514                 copy.k->k.p.offset += shift >> 9;
2515                 bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k));
2516
2517                 ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
2518                 if (ret)
2519                         goto bkey_err;
2520
2521                 if (bkey_cmp(atomic_end, copy.k->k.p)) {
2522                         if (insert) {
2523                                 move_pos = atomic_end;
2524                                 move_pos.offset -= shift >> 9;
2525                                 goto reassemble;
2526                         } else {
2527                                 bch2_cut_back(atomic_end, copy.k);
2528                         }
2529                 }
2530
2531                 bkey_init(&delete.k);
2532                 delete.k.p = src->pos;
2533                 bch2_key_resize(&delete.k, copy.k->k.size);
2534
2535                 next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
2536
2537                 /*
2538                  * If the new and old keys overlap (because we're moving an
2539                  * extent that's bigger than the amount we're collapsing by),
2540                  * we need to trim the delete key here so they don't overlap
2541                  * because overlaps on insertions aren't handled before
2542                  * triggers are run, so the overwrite will get double counted
2543                  * by the triggers machinery:
2544                  */
2545                 if (insert &&
2546                     bkey_cmp(bkey_start_pos(&copy.k->k), delete.k.p) < 0) {
2547                         bch2_cut_back(bkey_start_pos(&copy.k->k), &delete);
2548                 } else if (!insert &&
2549                            bkey_cmp(copy.k->k.p,
2550                                     bkey_start_pos(&delete.k)) > 0) {
2551                         bch2_cut_front(copy.k->k.p, &delete);
2552
2553                         del = bch2_trans_copy_iter(&trans, src);
2554                         BUG_ON(IS_ERR_OR_NULL(del));
2555
2556                         bch2_btree_iter_set_pos(del,
2557                                 bkey_start_pos(&delete.k));
2558                 }
2559
2560                 bch2_trans_update(&trans, dst, copy.k);
2561                 bch2_trans_update(&trans, del ?: src, &delete);
2562
2563                 if (copy.k->k.size == k.k->size) {
2564                         /*
2565                          * If we're moving the entire extent, we can skip
2566                          * running triggers:
2567                          */
2568                         commit_flags |= BTREE_INSERT_NOMARK;
2569                 } else {
2570                         /* We might end up splitting compressed extents: */
2571                         unsigned nr_ptrs =
2572                                 bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
2573
2574                         ret = bch2_disk_reservation_get(c, &disk_res,
2575                                         copy.k->k.size, nr_ptrs,
2576                                         BCH_DISK_RESERVATION_NOFAIL);
2577                         BUG_ON(ret);
2578                 }
2579
2580                 ret = bch2_trans_commit(&trans, &disk_res,
2581                                         &inode->ei_journal_seq,
2582                                         commit_flags);
2583                 bch2_disk_reservation_put(c, &disk_res);
2584 bkey_err:
2585                 if (del)
2586                         bch2_trans_iter_put(&trans, del);
2587                 del = NULL;
2588
2589                 if (!ret)
2590                         bch2_btree_iter_set_pos(src, next_pos);
2591
2592                 if (ret == -EINTR)
2593                         ret = 0;
2594                 if (ret)
2595                         goto err;
2596
2597                 bch2_trans_cond_resched(&trans);
2598         }
2599         bch2_trans_unlock(&trans);
2600
2601         if (!insert) {
2602                 i_size_write(&inode->v, new_size);
2603                 mutex_lock(&inode->ei_update_lock);
2604                 ret = bch2_write_inode_size(c, inode, new_size,
2605                                             ATTR_MTIME|ATTR_CTIME);
2606                 mutex_unlock(&inode->ei_update_lock);
2607         }
2608 err:
2609         bch2_trans_exit(&trans);
2610         bkey_on_stack_exit(&copy, c);
2611         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
2612         inode_unlock(&inode->v);
2613         return ret;
2614 }
2615
2616 static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
2617                             loff_t offset, loff_t len)
2618 {
2619         struct address_space *mapping = inode->v.i_mapping;
2620         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2621         struct btree_trans trans;
2622         struct btree_iter *iter;
2623         struct bpos end_pos;
2624         loff_t end              = offset + len;
2625         loff_t block_start      = round_down(offset,    block_bytes(c));
2626         loff_t block_end        = round_up(end,         block_bytes(c));
2627         unsigned sectors;
2628         unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
2629         int ret;
2630
2631         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2632
2633         inode_lock(&inode->v);
2634         inode_dio_wait(&inode->v);
2635         bch2_pagecache_block_get(&inode->ei_pagecache_lock);
2636
2637         if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
2638                 ret = inode_newsize_ok(&inode->v, end);
2639                 if (ret)
2640                         goto err;
2641         }
2642
2643         if (mode & FALLOC_FL_ZERO_RANGE) {
2644                 ret = __bch2_truncate_page(inode,
2645                                            offset >> PAGE_SHIFT,
2646                                            offset, end);
2647
2648                 if (!ret &&
2649                     offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
2650                         ret = __bch2_truncate_page(inode,
2651                                                    end >> PAGE_SHIFT,
2652                                                    offset, end);
2653
2654                 if (unlikely(ret))
2655                         goto err;
2656
2657                 truncate_pagecache_range(&inode->v, offset, end - 1);
2658         }
2659
2660         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
2661                         POS(inode->v.i_ino, block_start >> 9),
2662                         BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
2663         end_pos = POS(inode->v.i_ino, block_end >> 9);
2664
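             /*
              * Walk each slot in the range: slots that already hold data (or
              * a sufficient reservation) are skipped unless we're zeroing;
              * everything else is replaced with a reservation key backed by
              * a disk reservation, so future writes here can't fail with
              * -ENOSPC:
              */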
2665         while (bkey_cmp(iter->pos, end_pos) < 0) {
2666                 s64 i_sectors_delta = 0;
2667                 struct disk_reservation disk_res = { 0 };
2668                 struct quota_res quota_res = { 0 };
2669                 struct bkey_i_reservation reservation;
2670                 struct bkey_s_c k;
2671
2672                 k = bch2_btree_iter_peek_slot(iter);
2673                 if ((ret = bkey_err(k)))
2674                         goto bkey_err;
2675
2676                 /* already reserved */
2677                 if (k.k->type == KEY_TYPE_reservation &&
2678                     bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
2679                         bch2_btree_iter_next_slot(iter);
2680                         continue;
2681                 }
2682
2683                 if (bkey_extent_is_data(k.k) &&
2684                     !(mode & FALLOC_FL_ZERO_RANGE)) {
2685                         bch2_btree_iter_next_slot(iter);
2686                         continue;
2687                 }
2688
2689                 bkey_reservation_init(&reservation.k_i);
2690                 reservation.k.type      = KEY_TYPE_reservation;
2691                 reservation.k.p         = k.k->p;
2692                 reservation.k.size      = k.k->size;
2693
2694                 bch2_cut_front(iter->pos,       &reservation.k_i);
2695                 bch2_cut_back(end_pos,          &reservation.k_i);
2696
2697                 sectors = reservation.k.size;
2698                 reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k);
2699
2700                 if (!bkey_extent_is_allocation(k.k)) {
2701                         ret = bch2_quota_reservation_add(c, inode,
2702                                         &quota_res,
2703                                         sectors, true);
2704                         if (unlikely(ret))
2705                                 goto bkey_err;
2706                 }
2707
2708                 if (reservation.v.nr_replicas < replicas ||
2709                     bch2_bkey_sectors_compressed(k)) {
2710                         ret = bch2_disk_reservation_get(c, &disk_res, sectors,
2711                                                         replicas, 0);
2712                         if (unlikely(ret))
2713                                 goto bkey_err;
2714
2715                         reservation.v.nr_replicas = disk_res.nr_replicas;
2716                 }
2717
2718                 bch2_trans_begin_updates(&trans);
2719
2720                 ret = bch2_extent_update(&trans, iter, &reservation.k_i,
2721                                 &disk_res, &inode->ei_journal_seq,
2722                                 0, &i_sectors_delta);
2723                 i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
2724 bkey_err:
2725                 bch2_quota_reservation_put(c, inode, &quota_res);
2726                 bch2_disk_reservation_put(c, &disk_res);
2727                 if (ret == -EINTR)
2728                         ret = 0;
2729                 if (ret)
2730                         goto err;
2731         }
2732
2733         /*
2734          * Do we need to extend the file?
2735          *
2736          * If we zeroed up to the end of the file, we dropped whatever writes
2737          * would have written out the current i_size, so we have to extend
2738          * manually even if FALLOC_FL_KEEP_SIZE was set:
2739          */
2740         if (end >= inode->v.i_size &&
2741             (!(mode & FALLOC_FL_KEEP_SIZE) ||
2742              (mode & FALLOC_FL_ZERO_RANGE))) {
2743                 struct btree_iter *inode_iter;
2744                 struct bch_inode_unpacked inode_u;
2745
2746                 do {
2747                         bch2_trans_begin(&trans);
2748                         inode_iter = bch2_inode_peek(&trans, &inode_u,
2749                                                      inode->v.i_ino, 0);
2750                         ret = PTR_ERR_OR_ZERO(inode_iter);
2751                 } while (ret == -EINTR);
2752
2753                 bch2_trans_unlock(&trans);
2754
2755                 if (ret)
2756                         goto err;
2757
2758                 /*
2759                  * Sync existing appends before extending i_size,
2760                  * as in bch2_extend():
2761                  */
2762                 ret = filemap_write_and_wait_range(mapping,
2763                                         inode_u.bi_size, S64_MAX);
2764                 if (ret)
2765                         goto err;
2766
2767                 if (mode & FALLOC_FL_KEEP_SIZE)
2768                         end = inode->v.i_size;
2769                 else
2770                         i_size_write(&inode->v, end);
2771
2772                 mutex_lock(&inode->ei_update_lock);
2773                 ret = bch2_write_inode_size(c, inode, end, 0);
2774                 mutex_unlock(&inode->ei_update_lock);
2775         }
2776 err:
2777         bch2_trans_exit(&trans);
2778         bch2_pagecache_block_put(&inode->ei_pagecache_lock);
2779         inode_unlock(&inode->v);
2780         return ret;
2781 }
2782
2783 long bch2_fallocate_dispatch(struct file *file, int mode,
2784                              loff_t offset, loff_t len)
2785 {
2786         struct bch_inode_info *inode = file_bch_inode(file);
2787         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2788         long ret;
2789
2790         if (!percpu_ref_tryget(&c->writes))
2791                 return -EROFS;
2792
2793         if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
2794                 ret = bchfs_fallocate(inode, mode, offset, len);
2795         else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
2796                 ret = bchfs_fpunch(inode, offset, len);
2797         else if (mode == FALLOC_FL_INSERT_RANGE)
2798                 ret = bchfs_fcollapse_finsert(inode, offset, len, true);
2799         else if (mode == FALLOC_FL_COLLAPSE_RANGE)
2800                 ret = bchfs_fcollapse_finsert(inode, offset, len, false);
2801         else
2802                 ret = -EOPNOTSUPP;
2803
2804         percpu_ref_put(&c->writes);
2805
2806         return ret;
2807 }
2808
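     /*
      * Used by the reflink path: after a range has been reflinked its extents
      * are shared, so writes to those pages will need new space - clear the
      * cached per-sector nr_replicas counts so the next write takes a fresh
      * disk reservation instead of assuming space is already allocated.
      */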
2809 static void mark_range_unallocated(struct bch_inode_info *inode,
2810                                    loff_t start, loff_t end)
2811 {
2812         pgoff_t index = start >> PAGE_SHIFT;
2813         pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
2814         struct pagevec pvec;
2815
2816         pagevec_init(&pvec);
2817
2818         do {
2819                 unsigned nr_pages, i, j;
2820
2821                 nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
2822                                                 &index, end_index);
2823                 if (nr_pages == 0)
2824                         break;
2825
2826                 for (i = 0; i < nr_pages; i++) {
2827                         struct page *page = pvec.pages[i];
2828                         struct bch_page_state *s;
2829
2830                         lock_page(page);
2831                         s = bch2_page_state(page);
2832
2833                         if (s) {
2834                                 spin_lock(&s->lock);
2835                                 for (j = 0; j < PAGE_SECTORS; j++)
2836                                         s->s[j].nr_replicas = 0;
2837                                 spin_unlock(&s->lock);
2838                         }
2839
2840                         unlock_page(page);
2841                 }
2842                 pagevec_release(&pvec);
2843         } while (index <= end_index);
2844 }
2845
2846 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
2847                              struct file *file_dst, loff_t pos_dst,
2848                              loff_t len, unsigned remap_flags)
2849 {
2850         struct bch_inode_info *src = file_bch_inode(file_src);
2851         struct bch_inode_info *dst = file_bch_inode(file_dst);
2852         struct bch_fs *c = src->v.i_sb->s_fs_info;
2853         s64 i_sectors_delta = 0;
2854         u64 aligned_len;
2855         loff_t ret = 0;
2856
2857         if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
2858                 return -EINVAL;
2859
2860         if (remap_flags & REMAP_FILE_DEDUP)
2861                 return -EOPNOTSUPP;
2862
2863         if ((pos_src & (block_bytes(c) - 1)) ||
2864             (pos_dst & (block_bytes(c) - 1)))
2865                 return -EINVAL;
2866
2867         if (src == dst &&
2868             abs(pos_src - pos_dst) < len)
2869                 return -EINVAL;
2870
2871         bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
2872
2873         file_update_time(file_dst);
2874
2875         inode_dio_wait(&src->v);
2876         inode_dio_wait(&dst->v);
2877
2878         ret = generic_remap_file_range_prep(file_src, pos_src,
2879                                             file_dst, pos_dst,
2880                                             &len, remap_flags);
2881         if (ret < 0 || len == 0)
2882                 goto err;
2883
2884         aligned_len = round_up((u64) len, block_bytes(c));
2885
2886         ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
2887                                 pos_dst, pos_dst + len - 1);
2888         if (ret)
2889                 goto err;
2890
2891         mark_range_unallocated(src, pos_src, pos_src + aligned_len);
2892
2893         ret = bch2_remap_range(c,
2894                                POS(dst->v.i_ino, pos_dst >> 9),
2895                                POS(src->v.i_ino, pos_src >> 9),
2896                                aligned_len >> 9,
2897                                &dst->ei_journal_seq,
2898                                pos_dst + len, &i_sectors_delta);
2899         if (ret < 0)
2900                 goto err;
2901
2902         /*
2903          * due to alignment, we might have remapped slightly more than requested
2904          */
2905         ret = min((u64) ret << 9, (u64) len);
2906
2907         /* XXX get a quota reservation */
2908         i_sectors_acct(c, dst, NULL, i_sectors_delta);
2909
2910         spin_lock(&dst->v.i_lock);
2911         if (pos_dst + ret > dst->v.i_size)
2912                 i_size_write(&dst->v, pos_dst + ret);
2913         spin_unlock(&dst->v.i_lock);
2914 err:
2915         bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
2916
2917         return ret;
2918 }
2919
2920 /* fseek: */
2921
2922 static int page_data_offset(struct page *page, unsigned offset)
2923 {
2924         struct bch_page_state *s = bch2_page_state(page);
2925         unsigned i;
2926
2927         if (s)
2928                 for (i = offset >> 9; i < PAGE_SECTORS; i++)
2929                         if (s->s[i].state >= SECTOR_DIRTY)
2930                                 return i << 9;
2931
2932         return -1;
2933 }
2934
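     /*
      * SEEK_DATA also has to see data that only exists as dirty pages in the
      * page cache and hasn't hit the extents btree yet: after consulting the
      * btree, bch2_seek_data() scans the page cache between the requested
      * offset and the first on-disk extent for dirty sectors.
      */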
2935 static loff_t bch2_seek_pagecache_data(struct inode *vinode,
2936                                        loff_t start_offset,
2937                                        loff_t end_offset)
2938 {
2939         struct address_space *mapping = vinode->i_mapping;
2940         struct page *page;
2941         pgoff_t start_index     = start_offset >> PAGE_SHIFT;
2942         pgoff_t end_index       = end_offset >> PAGE_SHIFT;
2943         pgoff_t index           = start_index;
2944         loff_t ret;
2945         int offset;
2946
2947         while (index <= end_index) {
2948                 if (find_get_pages_range(mapping, &index, end_index, 1, &page)) {
2949                         lock_page(page);
2950
2951                         offset = page_data_offset(page,
2952                                         page->index == start_index
2953                                         ? start_offset & (PAGE_SIZE - 1)
2954                                         : 0);
2955                         if (offset >= 0) {
2956                                 ret = clamp(((loff_t) page->index << PAGE_SHIFT) +
2957                                             offset,
2958                                             start_offset, end_offset);
2959                                 unlock_page(page);
2960                                 put_page(page);
2961                                 return ret;
2962                         }
2963
2964                         unlock_page(page);
2965                         put_page(page);
2966                 } else {
2967                         break;
2968                 }
2969         }
2970
2971         return end_offset;
2972 }
2973
2974 static loff_t bch2_seek_data(struct file *file, u64 offset)
2975 {
2976         struct bch_inode_info *inode = file_bch_inode(file);
2977         struct bch_fs *c = inode->v.i_sb->s_fs_info;
2978         struct btree_trans trans;
2979         struct btree_iter *iter;
2980         struct bkey_s_c k;
2981         u64 isize, next_data = MAX_LFS_FILESIZE;
2982         int ret;
2983
2984         isize = i_size_read(&inode->v);
2985         if (offset >= isize)
2986                 return -ENXIO;
2987
2988         bch2_trans_init(&trans, c, 0, 0);
2989
2990         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
2991                            POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
2992                 if (k.k->p.inode != inode->v.i_ino) {
2993                         break;
2994                 } else if (bkey_extent_is_data(k.k)) {
2995                         next_data = max(offset, bkey_start_offset(k.k) << 9);
2996                         break;
2997                 } else if (k.k->p.offset << 9 > isize)
2998                         break;
2999         }
3000
3001         ret = bch2_trans_exit(&trans) ?: ret;
3002         if (ret)
3003                 return ret;
3004
3005         if (next_data > offset)
3006                 next_data = bch2_seek_pagecache_data(&inode->v,
3007                                                      offset, next_data);
3008
3009         if (next_data >= isize)
3010                 return -ENXIO;
3011
3012         return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
3013 }
3014
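/*
 * Return the byte offset within @page of the first sector at or after @offset
 * whose state is below SECTOR_DIRTY, -1 if every remaining sector holds data,
 * or 0 if the page has no bch_page_state (the whole page counts as a hole).
 */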
3015 static int __page_hole_offset(struct page *page, unsigned offset)
3016 {
3017         struct bch_page_state *s = bch2_page_state(page);
3018         unsigned i;
3019
3020         if (!s)
3021                 return 0;
3022
3023         for (i = offset >> 9; i < PAGE_SECTORS; i++)
3024                 if (s->s[i].state < SECTOR_DIRTY)
3025                         return i << 9;
3026
3027         return -1;
3028 }
3029
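/*
 * Return the file offset of the first hole within the page containing
 * @offset, @offset itself if the pagecache has no page (or only a shadow
 * entry) at that index, or -1 if the page is fully populated with data.
 */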
3030 static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
3031 {
3032         pgoff_t index = offset >> PAGE_SHIFT;
3033         struct page *page;
3034         int pg_offset;
3035         loff_t ret = -1;
3036
3037         page = find_lock_entry(mapping, index);
3038         if (!page || xa_is_value(page))
3039                 return offset;
3040
3041         pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
3042         if (pg_offset >= 0)
3043                 ret = ((loff_t) index << PAGE_SHIFT) + pg_offset;
3044
3045         unlock_page(page);
             put_page(page);        /* drop the ref taken by find_lock_entry() */
3046
3047         return ret;
3048 }
3049
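/*
 * Walk the pagecache a page at a time between start_offset and end_offset,
 * returning the first offset page_hole_offset() reports as a hole, or
 * end_offset if every page in the range is populated.
 */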
3050 static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
3051                                        loff_t start_offset,
3052                                        loff_t end_offset)
3053 {
3054         struct address_space *mapping = vinode->i_mapping;
3055         loff_t offset = start_offset, hole;
3056
3057         while (offset < end_offset) {
3058                 hole = page_hole_offset(mapping, offset);
3059                 if (hole >= 0 && hole <= end_offset)
3060                         return max(start_offset, hole);
3061
3062                 offset += PAGE_SIZE;
3063                 offset &= PAGE_MASK;
3064         }
3065
3066         return end_offset;
3067 }
3068
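/*
 * SEEK_HOLE: iterate extents with BTREE_ITER_SLOTS so holes show up as keys,
 * and check each candidate hole against the pagecache in case dirty pages
 * have filled it; the result is clamped to i_size. Returns -ENXIO if @offset
 * is at or past i_size.
 */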
3069 static loff_t bch2_seek_hole(struct file *file, u64 offset)
3070 {
3071         struct bch_inode_info *inode = file_bch_inode(file);
3072         struct bch_fs *c = inode->v.i_sb->s_fs_info;
3073         struct btree_trans trans;
3074         struct btree_iter *iter;
3075         struct bkey_s_c k;
3076         u64 isize, next_hole = MAX_LFS_FILESIZE;
3077         int ret;
3078
3079         isize = i_size_read(&inode->v);
3080         if (offset >= isize)
3081                 return -ENXIO;
3082
3083         bch2_trans_init(&trans, c, 0, 0);
3084
3085         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
3086                            POS(inode->v.i_ino, offset >> 9),
3087                            BTREE_ITER_SLOTS, k, ret) {
3088                 if (k.k->p.inode != inode->v.i_ino) {
3089                         next_hole = bch2_seek_pagecache_hole(&inode->v,
3090                                         offset, MAX_LFS_FILESIZE);
3091                         break;
3092                 } else if (!bkey_extent_is_data(k.k)) {
3093                         next_hole = bch2_seek_pagecache_hole(&inode->v,
3094                                         max(offset, bkey_start_offset(k.k) << 9),
3095                                         k.k->p.offset << 9);
3096
3097                         if (next_hole < k.k->p.offset << 9)
3098                                 break;
3099                 } else {
3100                         offset = max(offset, bkey_start_offset(k.k) << 9);
3101                 }
3102         }
3103
3104         ret = bch2_trans_exit(&trans) ?: ret;
3105         if (ret)
3106                 return ret;
3107
3108         if (next_hole > isize)
3109                 next_hole = isize;
3110
3111         return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
3112 }
3113
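/*
 * llseek: SEEK_SET/SEEK_CUR/SEEK_END are handled generically; SEEK_DATA and
 * SEEK_HOLE consult both the extents btree and the pagecache.
 */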
3114 loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
3115 {
3116         switch (whence) {
3117         case SEEK_SET:
3118         case SEEK_CUR:
3119         case SEEK_END:
3120                 return generic_file_llseek(file, offset, whence);
3121         case SEEK_DATA:
3122                 return bch2_seek_data(file, offset);
3123         case SEEK_HOLE:
3124                 return bch2_seek_hole(file, offset);
3125         }
3126
3127         return -EINVAL;
3128 }
3129
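/* tear down the biosets allocated in bch2_fs_fsio_init() */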
3130 void bch2_fs_fsio_exit(struct bch_fs *c)
3131 {
3132         bioset_exit(&c->dio_write_bioset);
3133         bioset_exit(&c->dio_read_bioset);
3134         bioset_exit(&c->writepage_bioset);
3135 }
3136
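/*
 * Allocate the biosets used for buffered writeback and for O_DIRECT reads and
 * writes; returns -ENOMEM if any of them fails to initialize.
 */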
3137 int bch2_fs_fsio_init(struct bch_fs *c)
3138 {
3139         int ret = 0;
3140
3141         pr_verbose_init(c->opts, "");
3142
3143         if (bioset_init(&c->writepage_bioset,
3144                         4, offsetof(struct bch_writepage_io, op.wbio.bio),
3145                         BIOSET_NEED_BVECS) ||
3146             bioset_init(&c->dio_read_bioset,
3147                         4, offsetof(struct dio_read, rbio.bio),
3148                         BIOSET_NEED_BVECS) ||
3149             bioset_init(&c->dio_write_bioset,
3150                         4, offsetof(struct dio_write, op.wbio.bio),
3151                         BIOSET_NEED_BVECS))
3152                 ret = -ENOMEM;
3153
3154         pr_verbose_init(c->opts, "ret %i", ret);
3155         return ret;
3156 }
3157
3158 #endif /* NO_BCACHEFS_FS */