libbcachefs/fs.c (bcachefs-tools-debian)
Update bcachefs sources to b1899a0bd9 bcachefs: Move bch2_evict_subvolume_inodes...
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "extents.h"
12 #include "fs.h"
13 #include "fs-common.h"
14 #include "fs-io.h"
15 #include "fs-ioctl.h"
16 #include "fsck.h"
17 #include "inode.h"
18 #include "io.h"
19 #include "journal.h"
20 #include "keylist.h"
21 #include "quota.h"
22 #include "super.h"
23 #include "xattr.h"
24
25 #include <linux/aio.h>
26 #include <linux/backing-dev.h>
27 #include <linux/exportfs.h>
28 #include <linux/fiemap.h>
29 #include <linux/module.h>
30 #include <linux/pagemap.h>
31 #include <linux/posix_acl.h>
32 #include <linux/random.h>
33 #include <linux/statfs.h>
34 #include <linux/string.h>
35 #include <linux/xattr.h>
36
37 static struct kmem_cache *bch2_inode_cache;
38
39 static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum,
40                                 struct bch_inode_info *,
41                                 struct bch_inode_unpacked *);
42
43 static void journal_seq_copy(struct bch_fs *c,
44                              struct bch_inode_info *dst,
45                              u64 journal_seq)
46 {
47         /*
48          * atomic64_cmpxchg has a fallback for archs that don't support it,
49          * cmpxchg does not:
50          */
51         atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
52         u64 old, v = READ_ONCE(dst->ei_journal_seq);
53
54         do {
55                 old = v;
56
57                 if (old >= journal_seq)
58                         break;
59         } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
60
61         bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
62 }
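journal_seq_copy() is a lock-free "advance to the maximum" update: dst->ei_journal_seq only ever moves forward, and the cmpxchg loop retries when another thread raced in and advanced it first. A minimal user-space sketch of the same pattern using C11 atomics (names here are illustrative, not part of this file):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* advance *seq to new_seq, never backwards */
static void seq_advance_to(_Atomic uint64_t *seq, uint64_t new_seq)
{
	uint64_t old = atomic_load(seq);

	do {
		if (old >= new_seq)
			return;	/* someone already advanced at or past new_seq */
	} while (!atomic_compare_exchange_weak(seq, &old, new_seq));
}

int main(void)
{
	_Atomic uint64_t seq = 10;

	seq_advance_to(&seq, 8);	/* no-op: would move backwards */
	seq_advance_to(&seq, 42);	/* advances */
	printf("%llu\n", (unsigned long long) atomic_load(&seq));	/* prints 42 */
	return 0;
}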
63
64 static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
65 {
66         BUG_ON(atomic_long_read(&lock->v) == 0);
67
68         if (atomic_long_sub_return_release(i, &lock->v) == 0)
69                 wake_up_all(&lock->wait);
70 }
71
72 static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
73 {
74         long v = atomic_long_read(&lock->v), old;
75
76         do {
77                 old = v;
78
79                 if (i > 0 ? v < 0 : v > 0)
80                         return false;
81         } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
82                                         old, old + i)) != old);
83         return true;
84 }
85
86 static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
87 {
88         wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
89 }
90
91 void bch2_pagecache_add_put(struct pagecache_lock *lock)
92 {
93         __pagecache_lock_put(lock, 1);
94 }
95
96 bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
97 {
98         return __pagecache_lock_tryget(lock, 1);
99 }
100
101 void bch2_pagecache_add_get(struct pagecache_lock *lock)
102 {
103         __pagecache_lock_get(lock, 1);
104 }
105
106 void bch2_pagecache_block_put(struct pagecache_lock *lock)
107 {
108         __pagecache_lock_put(lock, -1);
109 }
110
111 void bch2_pagecache_block_get(struct pagecache_lock *lock)
112 {
113         __pagecache_lock_get(lock, -1);
114 }
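The pagecache_lock helpers above implement a two-mode shared lock over a single counter: "add" holders acquire it with +1 and "block" holders with -1, so either side can be held by many threads at once, but the two sides exclude each other (a tryget fails whenever the counter's sign disagrees with the requested direction). A rough user-space model of the sign-based exclusion, assuming C11 atomics rather than the kernel's atomic_long API (illustrative only):

#include <stdatomic.h>
#include <stdbool.h>

/* v > 0: held by the "add" side; v < 0: held by the "block" side; v == 0: free */
static bool two_mode_trylock(_Atomic long *v, long i)
{
	long old = atomic_load(v);

	do {
		/* taking +1 fails while blockers hold it (v < 0), and vice versa */
		if (i > 0 ? old < 0 : old > 0)
			return false;
	} while (!atomic_compare_exchange_weak(v, &old, old + i));

	return true;
}

static void two_mode_unlock(_Atomic long *v, long i)
{
	atomic_fetch_sub(v, i);	/* the real code also wakes sleeping waiters here */
}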
115
116 void bch2_inode_update_after_write(struct bch_fs *c,
117                                    struct bch_inode_info *inode,
118                                    struct bch_inode_unpacked *bi,
119                                    unsigned fields)
120 {
121         set_nlink(&inode->v, bch2_inode_nlink_get(bi));
122         i_uid_write(&inode->v, bi->bi_uid);
123         i_gid_write(&inode->v, bi->bi_gid);
124         inode->v.i_mode = bi->bi_mode;
125
126         if (fields & ATTR_ATIME)
127                 inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
128         if (fields & ATTR_MTIME)
129                 inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
130         if (fields & ATTR_CTIME)
131                 inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
132
133         inode->ei_inode         = *bi;
134
135         bch2_inode_flags_to_vfs(inode);
136 }
137
138 int __must_check bch2_write_inode(struct bch_fs *c,
139                                   struct bch_inode_info *inode,
140                                   inode_set_fn set,
141                                   void *p, unsigned fields)
142 {
143         struct btree_trans trans;
144         struct btree_iter iter = { NULL };
145         struct bch_inode_unpacked inode_u;
146         int ret;
147
148         bch2_trans_init(&trans, c, 0, 512);
149 retry:
150         bch2_trans_begin(&trans);
151
152         ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
153                                 BTREE_ITER_INTENT) ?:
154                 (set ? set(inode, &inode_u, p) : 0) ?:
155                 bch2_inode_write(&trans, &iter, &inode_u) ?:
156                 bch2_trans_commit(&trans, NULL,
157                                   &inode->ei_journal_seq,
158                                   BTREE_INSERT_NOFAIL);
159
160         /*
161          * the btree node lock protects inode->ei_inode, not ei_update_lock;
162          * this is important for inode updates via bchfs_write_index_update
163          */
164         if (!ret)
165                 bch2_inode_update_after_write(c, inode, &inode_u, fields);
166
167         bch2_trans_iter_exit(&trans, &iter);
168
169         if (ret == -EINTR)
170                 goto retry;
171
172         bch2_trans_exit(&trans);
173         return ret < 0 ? ret : 0;
174 }
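bch2_write_inode() is the generic read-modify-write helper for an inode: it peeks the on-disk inode inside a btree transaction, lets the optional set callback edit the unpacked copy, writes it back, and restarts from retry: whenever the commit returns -EINTR (a transaction restart). A hedged caller-side sketch modelled on inode_update_times_fn() further down in this file; the callback below (bump_generation_fn, a made-up name) just bumps bi_generation and is purely illustrative:

/* illustrative set callback: runs inside the btree transaction */
static int bump_generation_fn(struct bch_inode_info *inode,
			      struct bch_inode_unpacked *bi,
			      void *p)
{
	bi->bi_generation++;
	return 0;
}

/*
 * usage sketch:
 *	mutex_lock(&inode->ei_update_lock);
 *	ret = bch2_write_inode(c, inode, bump_generation_fn, NULL, 0);
 *	mutex_unlock(&inode->ei_update_lock);
 */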
175
176 int bch2_fs_quota_transfer(struct bch_fs *c,
177                            struct bch_inode_info *inode,
178                            struct bch_qid new_qid,
179                            unsigned qtypes,
180                            enum quota_acct_mode mode)
181 {
182         unsigned i;
183         int ret;
184
185         qtypes &= enabled_qtypes(c);
186
187         for (i = 0; i < QTYP_NR; i++)
188                 if (new_qid.q[i] == inode->ei_qid.q[i])
189                         qtypes &= ~(1U << i);
190
191         if (!qtypes)
192                 return 0;
193
194         mutex_lock(&inode->ei_quota_lock);
195
196         ret = bch2_quota_transfer(c, qtypes, new_qid,
197                                   inode->ei_qid,
198                                   inode->v.i_blocks +
199                                   inode->ei_quota_reserved,
200                                   mode);
201         if (!ret)
202                 for (i = 0; i < QTYP_NR; i++)
203                         if (qtypes & (1 << i))
204                                 inode->ei_qid.q[i] = new_qid.q[i];
205
206         mutex_unlock(&inode->ei_quota_lock);
207
208         return ret;
209 }
210
211 static int bch2_iget5_test(struct inode *vinode, void *p)
212 {
213         struct bch_inode_info *inode = to_bch_ei(vinode);
214         subvol_inum *inum = p;
215
216         return inode->ei_subvol == inum->subvol &&
217                 inode->ei_inode.bi_inum == inum->inum;
218 }
219
220 static int bch2_iget5_set(struct inode *vinode, void *p)
221 {
222         struct bch_inode_info *inode = to_bch_ei(vinode);
223         subvol_inum *inum = p;
224
225         inode->v.i_ino          = inum->inum;
226         inode->ei_subvol        = inum->subvol;
227         inode->ei_inode.bi_inum = inum->inum;
228         return 0;
229 }
230
231 static unsigned bch2_inode_hash(subvol_inum inum)
232 {
233         return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
234 }
235
236 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
237 {
238         struct bch_inode_unpacked inode_u;
239         struct bch_inode_info *inode;
240         int ret;
241
242         inode = to_bch_ei(iget5_locked(c->vfs_sb,
243                                        bch2_inode_hash(inum),
244                                        bch2_iget5_test,
245                                        bch2_iget5_set,
246                                        &inum));
247         if (unlikely(!inode))
248                 return ERR_PTR(-ENOMEM);
249         if (!(inode->v.i_state & I_NEW))
250                 return &inode->v;
251
252         ret = bch2_inode_find_by_inum(c, inum, &inode_u);
253         if (ret) {
254                 iget_failed(&inode->v);
255                 return ERR_PTR(ret);
256         }
257
258         bch2_vfs_inode_init(c, inum, inode, &inode_u);
259
260         inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum);
261
262         unlock_new_inode(&inode->v);
263
264         return &inode->v;
265 }
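The lookup key here is the (subvolume, inode number) pair, which is why bch2_iget5_test() compares both fields: the same inode number can exist in several subvolumes/snapshots, so inum alone would collide in the VFS inode cache. Callers get back either a cached inode or a freshly initialized one; a minimal caller sketch (cf. bch2_lookup() below):

struct inode *vinode = bch2_vfs_inode_get(c, inum);

if (IS_ERR(vinode))
	return ERR_CAST(vinode);	/* e.g. -ENOMEM or a lookup error */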
266
267 struct bch_inode_info *
268 __bch2_create(struct user_namespace *mnt_userns,
269               struct bch_inode_info *dir, struct dentry *dentry,
270               umode_t mode, dev_t rdev, subvol_inum snapshot_src,
271               unsigned flags)
272 {
273         struct bch_fs *c = dir->v.i_sb->s_fs_info;
274         struct btree_trans trans;
275         struct bch_inode_unpacked dir_u;
276         struct bch_inode_info *inode, *old;
277         struct bch_inode_unpacked inode_u;
278         struct posix_acl *default_acl = NULL, *acl = NULL;
279         subvol_inum inum;
280         u64 journal_seq = 0;
281         int ret;
282
283         /*
284          * preallocate acls + vfs inode before btree transaction, so that
285          * nothing can fail after the transaction succeeds:
286          */
287 #ifdef CONFIG_BCACHEFS_POSIX_ACL
288         ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
289         if (ret)
290                 return ERR_PTR(ret);
291 #endif
292         inode = to_bch_ei(new_inode(c->vfs_sb));
293         if (unlikely(!inode)) {
294                 inode = ERR_PTR(-ENOMEM);
295                 goto err;
296         }
297
298         bch2_inode_init_early(c, &inode_u);
299
300         if (!(flags & BCH_CREATE_TMPFILE))
301                 mutex_lock(&dir->ei_update_lock);
302
303         bch2_trans_init(&trans, c, 8,
304                         2048 + (!(flags & BCH_CREATE_TMPFILE)
305                                 ? dentry->d_name.len : 0));
306 retry:
307         bch2_trans_begin(&trans);
308
309         ret   = bch2_create_trans(&trans,
310                                   inode_inum(dir), &dir_u, &inode_u,
311                                   !(flags & BCH_CREATE_TMPFILE)
312                                   ? &dentry->d_name : NULL,
313                                   from_kuid(mnt_userns, current_fsuid()),
314                                   from_kgid(mnt_userns, current_fsgid()),
315                                   mode, rdev,
316                                   default_acl, acl, snapshot_src, flags) ?:
317                 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
318                                 KEY_TYPE_QUOTA_PREALLOC);
319         if (unlikely(ret))
320                 goto err_before_quota;
321
322         ret   = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
323         if (unlikely(ret)) {
324                 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
325                                 KEY_TYPE_QUOTA_WARN);
326 err_before_quota:
327                 if (ret == -EINTR)
328                         goto retry;
329                 goto err_trans;
330         }
331
332         if (!(flags & BCH_CREATE_TMPFILE)) {
333                 bch2_inode_update_after_write(c, dir, &dir_u,
334                                               ATTR_MTIME|ATTR_CTIME);
335                 journal_seq_copy(c, dir, journal_seq);
336                 mutex_unlock(&dir->ei_update_lock);
337         }
338
339         inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
340         inum.inum = inode_u.bi_inum;
341
342         bch2_vfs_inode_init(c, inum, inode, &inode_u);
343         journal_seq_copy(c, inode, journal_seq);
344
345         set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
346         set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
347
348         /*
349          * we must insert the new inode into the inode cache before calling
350          * bch2_trans_exit() and dropping locks, else we could race with another
351          * thread pulling the inode in and modifying it:
352          */
353
354         inode->v.i_state |= I_CREATING;
355
356         old = to_bch_ei(inode_insert5(&inode->v,
357                                       bch2_inode_hash(inum),
358                                       bch2_iget5_test,
359                                       bch2_iget5_set,
360                                       &inum));
361         BUG_ON(!old);
362
363         if (unlikely(old != inode)) {
364                 /*
365                  * We raced, another process pulled the new inode into cache
366                  * before us:
367                  */
368                 journal_seq_copy(c, old, journal_seq);
369                 make_bad_inode(&inode->v);
370                 iput(&inode->v);
371
372                 inode = old;
373         } else {
374                 /*
375                  * we really don't want insert_inode_locked2() to be setting
376                  * I_NEW...
377                  */
378                 unlock_new_inode(&inode->v);
379         }
380
381         bch2_trans_exit(&trans);
382 err:
383         posix_acl_release(default_acl);
384         posix_acl_release(acl);
385         return inode;
386 err_trans:
387         if (!(flags & BCH_CREATE_TMPFILE))
388                 mutex_unlock(&dir->ei_update_lock);
389
390         bch2_trans_exit(&trans);
391         make_bad_inode(&inode->v);
392         iput(&inode->v);
393         inode = ERR_PTR(ret);
394         goto err;
395 }
396
397 /* methods */
398
399 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
400                                   unsigned int flags)
401 {
402         struct bch_fs *c = vdir->i_sb->s_fs_info;
403         struct bch_inode_info *dir = to_bch_ei(vdir);
404         struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
405         struct inode *vinode = NULL;
406         subvol_inum inum = { .subvol = 1 };
407         int ret;
408
409         ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
410                                  &dentry->d_name, &inum);
411
412         if (!ret)
413                 vinode = bch2_vfs_inode_get(c, inum);
414
415         return d_splice_alias(vinode, dentry);
416 }
417
418 static int bch2_mknod(struct user_namespace *mnt_userns,
419                       struct inode *vdir, struct dentry *dentry,
420                       umode_t mode, dev_t rdev)
421 {
422         struct bch_inode_info *inode =
423                 __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
424                               (subvol_inum) { 0 }, 0);
425
426         if (IS_ERR(inode))
427                 return PTR_ERR(inode);
428
429         d_instantiate(dentry, &inode->v);
430         return 0;
431 }
432
433 static int bch2_create(struct user_namespace *mnt_userns,
434                        struct inode *vdir, struct dentry *dentry,
435                        umode_t mode, bool excl)
436 {
437         return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0);
438 }
439
440 static int __bch2_link(struct bch_fs *c,
441                        struct bch_inode_info *inode,
442                        struct bch_inode_info *dir,
443                        struct dentry *dentry)
444 {
445         struct btree_trans trans;
446         struct bch_inode_unpacked dir_u, inode_u;
447         int ret;
448
449         mutex_lock(&inode->ei_update_lock);
450         bch2_trans_init(&trans, c, 4, 1024);
451
452         ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0,
453                         bch2_link_trans(&trans,
454                                         inode_inum(dir),   &dir_u,
455                                         inode_inum(inode), &inode_u,
456                                         &dentry->d_name));
457
458         if (likely(!ret)) {
459                 BUG_ON(inode_u.bi_inum != inode->v.i_ino);
460
461                 journal_seq_copy(c, inode, dir->ei_journal_seq);
462                 bch2_inode_update_after_write(c, dir, &dir_u,
463                                               ATTR_MTIME|ATTR_CTIME);
464                 bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
465         }
466
467         bch2_trans_exit(&trans);
468         mutex_unlock(&inode->ei_update_lock);
469         return ret;
470 }
471
472 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
473                      struct dentry *dentry)
474 {
475         struct bch_fs *c = vdir->i_sb->s_fs_info;
476         struct bch_inode_info *dir = to_bch_ei(vdir);
477         struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
478         int ret;
479
480         lockdep_assert_held(&inode->v.i_rwsem);
481
482         ret = __bch2_link(c, inode, dir, dentry);
483         if (unlikely(ret))
484                 return ret;
485
486         ihold(&inode->v);
487         d_instantiate(dentry, &inode->v);
488         return 0;
489 }
490
491 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
492                   bool deleting_snapshot)
493 {
494         struct bch_fs *c = vdir->i_sb->s_fs_info;
495         struct bch_inode_info *dir = to_bch_ei(vdir);
496         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
497         struct bch_inode_unpacked dir_u, inode_u;
498         struct btree_trans trans;
499         int ret;
500
501         bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
502         bch2_trans_init(&trans, c, 4, 1024);
503
504         ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
505                               BTREE_INSERT_NOFAIL,
506                         bch2_unlink_trans(&trans,
507                                           inode_inum(dir), &dir_u,
508                                           &inode_u, &dentry->d_name,
509                                           deleting_snapshot));
510
511         if (likely(!ret)) {
512                 BUG_ON(inode_u.bi_inum != inode->v.i_ino);
513
514                 journal_seq_copy(c, inode, dir->ei_journal_seq);
515                 bch2_inode_update_after_write(c, dir, &dir_u,
516                                               ATTR_MTIME|ATTR_CTIME);
517                 bch2_inode_update_after_write(c, inode, &inode_u,
518                                               ATTR_MTIME);
519         }
520
521         bch2_trans_exit(&trans);
522         bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
523
524         return ret;
525 }
526
527 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
528 {
529         return __bch2_unlink(vdir, dentry, false);
530 }
531
532 static int bch2_symlink(struct user_namespace *mnt_userns,
533                         struct inode *vdir, struct dentry *dentry,
534                         const char *symname)
535 {
536         struct bch_fs *c = vdir->i_sb->s_fs_info;
537         struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
538         int ret;
539
540         inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
541                               (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
542         if (unlikely(IS_ERR(inode)))
543                 return PTR_ERR(inode);
544
545         inode_lock(&inode->v);
546         ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
547         inode_unlock(&inode->v);
548
549         if (unlikely(ret))
550                 goto err;
551
552         ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
553         if (unlikely(ret))
554                 goto err;
555
556         journal_seq_copy(c, dir, inode->ei_journal_seq);
557
558         ret = __bch2_link(c, inode, dir, dentry);
559         if (unlikely(ret))
560                 goto err;
561
562         d_instantiate(dentry, &inode->v);
563         return 0;
564 err:
565         iput(&inode->v);
566         return ret;
567 }
568
569 static int bch2_mkdir(struct user_namespace *mnt_userns,
570                       struct inode *vdir, struct dentry *dentry, umode_t mode)
571 {
572         return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0);
573 }
574
575 static int bch2_rename2(struct user_namespace *mnt_userns,
576                         struct inode *src_vdir, struct dentry *src_dentry,
577                         struct inode *dst_vdir, struct dentry *dst_dentry,
578                         unsigned flags)
579 {
580         struct bch_fs *c = src_vdir->i_sb->s_fs_info;
581         struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
582         struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
583         struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
584         struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
585         struct bch_inode_unpacked dst_dir_u, src_dir_u;
586         struct bch_inode_unpacked src_inode_u, dst_inode_u;
587         struct btree_trans trans;
588         enum bch_rename_mode mode = flags & RENAME_EXCHANGE
589                 ? BCH_RENAME_EXCHANGE
590                 : dst_dentry->d_inode
591                 ? BCH_RENAME_OVERWRITE : BCH_RENAME;
592         u64 journal_seq = 0;
593         int ret;
594
595         if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
596                 return -EINVAL;
597
598         if (mode == BCH_RENAME_OVERWRITE) {
599                 ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
600                                                    0, LLONG_MAX);
601                 if (ret)
602                         return ret;
603         }
604
605         bch2_trans_init(&trans, c, 8, 2048);
606
607         bch2_lock_inodes(INODE_UPDATE_LOCK,
608                          src_dir,
609                          dst_dir,
610                          src_inode,
611                          dst_inode);
612
613         if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
614                 ret = bch2_fs_quota_transfer(c, src_inode,
615                                              dst_dir->ei_qid,
616                                              1 << QTYP_PRJ,
617                                              KEY_TYPE_QUOTA_PREALLOC);
618                 if (ret)
619                         goto err;
620         }
621
622         if (mode == BCH_RENAME_EXCHANGE &&
623             inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
624                 ret = bch2_fs_quota_transfer(c, dst_inode,
625                                              src_dir->ei_qid,
626                                              1 << QTYP_PRJ,
627                                              KEY_TYPE_QUOTA_PREALLOC);
628                 if (ret)
629                         goto err;
630         }
631
632         ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0,
633                         bch2_rename_trans(&trans,
634                                           inode_inum(src_dir), &src_dir_u,
635                                           inode_inum(dst_dir), &dst_dir_u,
636                                           &src_inode_u,
637                                           &dst_inode_u,
638                                           &src_dentry->d_name,
639                                           &dst_dentry->d_name,
640                                           mode));
641         if (unlikely(ret))
642                 goto err;
643
644         BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
645         BUG_ON(dst_inode &&
646                dst_inode->v.i_ino != dst_inode_u.bi_inum);
647
648         bch2_inode_update_after_write(c, src_dir, &src_dir_u,
649                                       ATTR_MTIME|ATTR_CTIME);
650         journal_seq_copy(c, src_dir, journal_seq);
651
652         if (src_dir != dst_dir) {
653                 bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
654                                               ATTR_MTIME|ATTR_CTIME);
655                 journal_seq_copy(c, dst_dir, journal_seq);
656         }
657
658         bch2_inode_update_after_write(c, src_inode, &src_inode_u,
659                                       ATTR_CTIME);
660         journal_seq_copy(c, src_inode, journal_seq);
661
662         if (dst_inode) {
663                 bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
664                                               ATTR_CTIME);
665                 journal_seq_copy(c, dst_inode, journal_seq);
666         }
667 err:
668         bch2_trans_exit(&trans);
669
670         bch2_fs_quota_transfer(c, src_inode,
671                                bch_qid(&src_inode->ei_inode),
672                                1 << QTYP_PRJ,
673                                KEY_TYPE_QUOTA_NOCHECK);
674         if (dst_inode)
675                 bch2_fs_quota_transfer(c, dst_inode,
676                                        bch_qid(&dst_inode->ei_inode),
677                                        1 << QTYP_PRJ,
678                                        KEY_TYPE_QUOTA_NOCHECK);
679
680         bch2_unlock_inodes(INODE_UPDATE_LOCK,
681                            src_dir,
682                            dst_dir,
683                            src_inode,
684                            dst_inode);
685
686         return ret;
687 }
688
689 static void bch2_setattr_copy(struct user_namespace *mnt_userns,
690                               struct bch_inode_info *inode,
691                               struct bch_inode_unpacked *bi,
692                               struct iattr *attr)
693 {
694         struct bch_fs *c = inode->v.i_sb->s_fs_info;
695         unsigned int ia_valid = attr->ia_valid;
696
697         if (ia_valid & ATTR_UID)
698                 bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid);
699         if (ia_valid & ATTR_GID)
700                 bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid);
701
702         if (ia_valid & ATTR_SIZE)
703                 bi->bi_size = attr->ia_size;
704
705         if (ia_valid & ATTR_ATIME)
706                 bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
707         if (ia_valid & ATTR_MTIME)
708                 bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
709         if (ia_valid & ATTR_CTIME)
710                 bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
711
712         if (ia_valid & ATTR_MODE) {
713                 umode_t mode = attr->ia_mode;
714                 kgid_t gid = ia_valid & ATTR_GID
715                         ? attr->ia_gid
716                         : inode->v.i_gid;
717
718                 if (!in_group_p(gid) &&
719                     !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID))
720                         mode &= ~S_ISGID;
721                 bi->bi_mode = mode;
722         }
723 }
724
725 int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
726                          struct bch_inode_info *inode,
727                          struct iattr *attr)
728 {
729         struct bch_fs *c = inode->v.i_sb->s_fs_info;
730         struct bch_qid qid;
731         struct btree_trans trans;
732         struct btree_iter inode_iter = { NULL };
733         struct bch_inode_unpacked inode_u;
734         struct posix_acl *acl = NULL;
735         int ret;
736
737         mutex_lock(&inode->ei_update_lock);
738
739         qid = inode->ei_qid;
740
741         if (attr->ia_valid & ATTR_UID)
742                 qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);
743
744         if (attr->ia_valid & ATTR_GID)
745                 qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);
746
747         ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
748                                      KEY_TYPE_QUOTA_PREALLOC);
749         if (ret)
750                 goto err;
751
752         bch2_trans_init(&trans, c, 0, 0);
753 retry:
754         bch2_trans_begin(&trans);
755         kfree(acl);
756         acl = NULL;
757
758         ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
759                               BTREE_ITER_INTENT);
760         if (ret)
761                 goto btree_err;
762
763         bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);
764
765         if (attr->ia_valid & ATTR_MODE) {
766                 ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
767                                      inode_u.bi_mode, &acl);
768                 if (ret)
769                         goto btree_err;
770         }
771
772         ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
773                 bch2_trans_commit(&trans, NULL,
774                                   &inode->ei_journal_seq,
775                                   BTREE_INSERT_NOFAIL);
776 btree_err:
777         bch2_trans_iter_exit(&trans, &inode_iter);
778
779         if (ret == -EINTR)
780                 goto retry;
781         if (unlikely(ret))
782                 goto err_trans;
783
784         bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);
785
786         if (acl)
787                 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
788 err_trans:
789         bch2_trans_exit(&trans);
790 err:
791         mutex_unlock(&inode->ei_update_lock);
792
793         return ret;
794 }
795
796 static int bch2_getattr(struct user_namespace *mnt_userns,
797                         const struct path *path, struct kstat *stat,
798                         u32 request_mask, unsigned query_flags)
799 {
800         struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
801         struct bch_fs *c = inode->v.i_sb->s_fs_info;
802
803         stat->dev       = inode->v.i_sb->s_dev;
804         stat->ino       = inode->v.i_ino;
805         stat->mode      = inode->v.i_mode;
806         stat->nlink     = inode->v.i_nlink;
807         stat->uid       = inode->v.i_uid;
808         stat->gid       = inode->v.i_gid;
809         stat->rdev      = inode->v.i_rdev;
810         stat->size      = i_size_read(&inode->v);
811         stat->atime     = inode->v.i_atime;
812         stat->mtime     = inode->v.i_mtime;
813         stat->ctime     = inode->v.i_ctime;
814         stat->blksize   = block_bytes(c);
815         stat->blocks    = inode->v.i_blocks;
816
817         if (request_mask & STATX_BTIME) {
818                 stat->result_mask |= STATX_BTIME;
819                 stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
820         }
821
822         if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
823                 stat->attributes |= STATX_ATTR_IMMUTABLE;
824         stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;
825
826         if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
827                 stat->attributes |= STATX_ATTR_APPEND;
828         stat->attributes_mask    |= STATX_ATTR_APPEND;
829
830         if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
831                 stat->attributes |= STATX_ATTR_NODUMP;
832         stat->attributes_mask    |= STATX_ATTR_NODUMP;
833
834         return 0;
835 }
836
837 static int bch2_setattr(struct user_namespace *mnt_userns,
838                         struct dentry *dentry, struct iattr *iattr)
839 {
840         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
841         int ret;
842
843         lockdep_assert_held(&inode->v.i_rwsem);
844
845         ret = setattr_prepare(mnt_userns, dentry, iattr);
846         if (ret)
847                 return ret;
848
849         return iattr->ia_valid & ATTR_SIZE
850                 ? bch2_truncate(mnt_userns, inode, iattr)
851                 : bch2_setattr_nonsize(mnt_userns, inode, iattr);
852 }
853
854 static int bch2_tmpfile(struct user_namespace *mnt_userns,
855                         struct inode *vdir, struct dentry *dentry, umode_t mode)
856 {
857         struct bch_inode_info *inode =
858                 __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
859                               (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
860
861         if (IS_ERR(inode))
862                 return PTR_ERR(inode);
863
864         d_mark_tmpfile(dentry, &inode->v);
865         d_instantiate(dentry, &inode->v);
866         return 0;
867 }
868
869 static int bch2_fill_extent(struct bch_fs *c,
870                             struct fiemap_extent_info *info,
871                             struct bkey_s_c k, unsigned flags)
872 {
873         if (bkey_extent_is_direct_data(k.k)) {
874                 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
875                 const union bch_extent_entry *entry;
876                 struct extent_ptr_decoded p;
877                 int ret;
878
879                 if (k.k->type == KEY_TYPE_reflink_v)
880                         flags |= FIEMAP_EXTENT_SHARED;
881
882                 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
883                         int flags2 = 0;
884                         u64 offset = p.ptr.offset;
885
886                         if (p.crc.compression_type)
887                                 flags2 |= FIEMAP_EXTENT_ENCODED;
888                         else
889                                 offset += p.crc.offset;
890
891                         if ((offset & (c->opts.block_size - 1)) ||
892                             (k.k->size & (c->opts.block_size - 1)))
893                                 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
894
895                         ret = fiemap_fill_next_extent(info,
896                                                 bkey_start_offset(k.k) << 9,
897                                                 offset << 9,
898                                                 k.k->size << 9, flags|flags2);
899                         if (ret)
900                                 return ret;
901                 }
902
903                 return 0;
904         } else if (bkey_extent_is_inline_data(k.k)) {
905                 return fiemap_fill_next_extent(info,
906                                                bkey_start_offset(k.k) << 9,
907                                                0, k.k->size << 9,
908                                                flags|
909                                                FIEMAP_EXTENT_DATA_INLINE);
910         } else if (k.k->type == KEY_TYPE_reservation) {
911                 return fiemap_fill_next_extent(info,
912                                                bkey_start_offset(k.k) << 9,
913                                                0, k.k->size << 9,
914                                                flags|
915                                                FIEMAP_EXTENT_DELALLOC|
916                                                FIEMAP_EXTENT_UNWRITTEN);
917         } else {
918                 BUG();
919         }
920 }
921
922 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
923                        u64 start, u64 len)
924 {
925         struct bch_fs *c = vinode->i_sb->s_fs_info;
926         struct bch_inode_info *ei = to_bch_ei(vinode);
927         struct btree_trans trans;
928         struct btree_iter iter;
929         struct bkey_s_c k;
930         struct bkey_buf cur, prev;
931         struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
932         unsigned offset_into_extent, sectors;
933         bool have_extent = false;
934         u32 snapshot;
935         int ret = 0;
936
937         ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
938         if (ret)
939                 return ret;
940
941         if (start + len < start)
942                 return -EINVAL;
943
944         start >>= 9;
945
946         bch2_bkey_buf_init(&cur);
947         bch2_bkey_buf_init(&prev);
948         bch2_trans_init(&trans, c, 0, 0);
949 retry:
950         bch2_trans_begin(&trans);
951
952         ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
953         if (ret)
954                 goto err;
955
956         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
957                              SPOS(ei->v.i_ino, start, snapshot), 0);
958
959         while ((k = bch2_btree_iter_peek(&iter)).k &&
960                !(ret = bkey_err(k)) &&
961                bkey_cmp(iter.pos, end) < 0) {
962                 enum btree_id data_btree = BTREE_ID_extents;
963
964                 if (!bkey_extent_is_data(k.k) &&
965                     k.k->type != KEY_TYPE_reservation) {
966                         bch2_btree_iter_advance(&iter);
967                         continue;
968                 }
969
970                 offset_into_extent      = iter.pos.offset -
971                         bkey_start_offset(k.k);
972                 sectors                 = k.k->size - offset_into_extent;
973
974                 bch2_bkey_buf_reassemble(&cur, c, k);
975
976                 ret = bch2_read_indirect_extent(&trans, &data_btree,
977                                         &offset_into_extent, &cur);
978                 if (ret)
979                         break;
980
981                 k = bkey_i_to_s_c(cur.k);
982                 bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
983
984                 sectors = min(sectors, k.k->size - offset_into_extent);
985
986                 bch2_cut_front(POS(k.k->p.inode,
987                                    bkey_start_offset(k.k) +
988                                    offset_into_extent),
989                                cur.k);
990                 bch2_key_resize(&cur.k->k, sectors);
991                 cur.k->k.p = iter.pos;
992                 cur.k->k.p.offset += cur.k->k.size;
993
994                 if (have_extent) {
995                         ret = bch2_fill_extent(c, info,
996                                         bkey_i_to_s_c(prev.k), 0);
997                         if (ret)
998                                 break;
999                 }
1000
1001                 bkey_copy(prev.k, cur.k);
1002                 have_extent = true;
1003
1004                 bch2_btree_iter_set_pos(&iter,
1005                         POS(iter.pos.inode, iter.pos.offset + sectors));
1006         }
1007         start = iter.pos.offset;
1008         bch2_trans_iter_exit(&trans, &iter);
1009 err:
1010         if (ret == -EINTR)
1011                 goto retry;
1012
1013         if (!ret && have_extent)
1014                 ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
1015                                        FIEMAP_EXTENT_LAST);
1016
1017         bch2_trans_exit(&trans);
1018         bch2_bkey_buf_exit(&cur, c);
1019         bch2_bkey_buf_exit(&prev, c);
1020         return ret < 0 ? ret : 0;
1021 }
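Both bch2_fill_extent() and bch2_fiemap() work in 512-byte sectors internally, hence the << 9 shifts when filling in byte-granularity FIEMAP extents. The result can be observed from user space with the standard FS_IOC_FIEMAP ioctl; a small stand-alone example (generic Linux API, not bcachefs-specific):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned i;
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
		perror("open");
		return 1;
	}

	/* room for up to 32 extents; real tools loop until FIEMAP_EXTENT_LAST */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;

	fm->fm_start		= 0;
	fm->fm_length		= FIEMAP_MAX_OFFSET;
	fm->fm_flags		= FIEMAP_FLAG_SYNC;
	fm->fm_extent_count	= 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}

	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu len %llu flags 0x%x\n",
		       (unsigned long long) fm->fm_extents[i].fe_logical,
		       (unsigned long long) fm->fm_extents[i].fe_physical,
		       (unsigned long long) fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);

	free(fm);
	close(fd);
	return 0;
}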
1022
1023 static const struct vm_operations_struct bch_vm_ops = {
1024         .fault          = bch2_page_fault,
1025         .map_pages      = filemap_map_pages,
1026         .page_mkwrite   = bch2_page_mkwrite,
1027 };
1028
1029 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1030 {
1031         file_accessed(file);
1032
1033         vma->vm_ops = &bch_vm_ops;
1034         return 0;
1035 }
1036
1037 /* Directories: */
1038
1039 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
1040 {
1041         return generic_file_llseek_size(file, offset, whence,
1042                                         S64_MAX, S64_MAX);
1043 }
1044
1045 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1046 {
1047         struct bch_inode_info *inode = file_bch_inode(file);
1048         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1049
1050         if (!dir_emit_dots(file, ctx))
1051                 return 0;
1052
1053         return bch2_readdir(c, inode_inum(inode), ctx);
1054 }
1055
1056 static const struct file_operations bch_file_operations = {
1057         .llseek         = bch2_llseek,
1058         .read_iter      = bch2_read_iter,
1059         .write_iter     = bch2_write_iter,
1060         .mmap           = bch2_mmap,
1061         .open           = generic_file_open,
1062         .fsync          = bch2_fsync,
1063         .splice_read    = generic_file_splice_read,
1064         .splice_write   = iter_file_splice_write,
1065         .fallocate      = bch2_fallocate_dispatch,
1066         .unlocked_ioctl = bch2_fs_file_ioctl,
1067 #ifdef CONFIG_COMPAT
1068         .compat_ioctl   = bch2_compat_fs_ioctl,
1069 #endif
1070         .remap_file_range = bch2_remap_file_range,
1071 };
1072
1073 static const struct inode_operations bch_file_inode_operations = {
1074         .getattr        = bch2_getattr,
1075         .setattr        = bch2_setattr,
1076         .fiemap         = bch2_fiemap,
1077         .listxattr      = bch2_xattr_list,
1078 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1079         .get_acl        = bch2_get_acl,
1080         .set_acl        = bch2_set_acl,
1081 #endif
1082 };
1083
1084 static const struct inode_operations bch_dir_inode_operations = {
1085         .lookup         = bch2_lookup,
1086         .create         = bch2_create,
1087         .link           = bch2_link,
1088         .unlink         = bch2_unlink,
1089         .symlink        = bch2_symlink,
1090         .mkdir          = bch2_mkdir,
1091         .rmdir          = bch2_unlink,
1092         .mknod          = bch2_mknod,
1093         .rename         = bch2_rename2,
1094         .getattr        = bch2_getattr,
1095         .setattr        = bch2_setattr,
1096         .tmpfile        = bch2_tmpfile,
1097         .listxattr      = bch2_xattr_list,
1098 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1099         .get_acl        = bch2_get_acl,
1100         .set_acl        = bch2_set_acl,
1101 #endif
1102 };
1103
1104 static const struct file_operations bch_dir_file_operations = {
1105         .llseek         = bch2_dir_llseek,
1106         .read           = generic_read_dir,
1107         .iterate_shared = bch2_vfs_readdir,
1108         .fsync          = bch2_fsync,
1109         .unlocked_ioctl = bch2_fs_file_ioctl,
1110 #ifdef CONFIG_COMPAT
1111         .compat_ioctl   = bch2_compat_fs_ioctl,
1112 #endif
1113 };
1114
1115 static const struct inode_operations bch_symlink_inode_operations = {
1116         .get_link       = page_get_link,
1117         .getattr        = bch2_getattr,
1118         .setattr        = bch2_setattr,
1119         .listxattr      = bch2_xattr_list,
1120 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1121         .get_acl        = bch2_get_acl,
1122         .set_acl        = bch2_set_acl,
1123 #endif
1124 };
1125
1126 static const struct inode_operations bch_special_inode_operations = {
1127         .getattr        = bch2_getattr,
1128         .setattr        = bch2_setattr,
1129         .listxattr      = bch2_xattr_list,
1130 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1131         .get_acl        = bch2_get_acl,
1132         .set_acl        = bch2_set_acl,
1133 #endif
1134 };
1135
1136 static const struct address_space_operations bch_address_space_operations = {
1137         .writepage      = bch2_writepage,
1138         .readpage       = bch2_readpage,
1139         .writepages     = bch2_writepages,
1140         .readahead      = bch2_readahead,
1141         .set_page_dirty = __set_page_dirty_nobuffers,
1142         .write_begin    = bch2_write_begin,
1143         .write_end      = bch2_write_end,
1144         .invalidatepage = bch2_invalidatepage,
1145         .releasepage    = bch2_releasepage,
1146         .direct_IO      = noop_direct_IO,
1147 #ifdef CONFIG_MIGRATION
1148         .migratepage    = bch2_migrate_page,
1149 #endif
1150         .error_remove_page = generic_error_remove_page,
1151 };
1152
1153 #if 0
1154 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1155                 u64 ino, u32 generation)
1156 {
1157         struct bch_fs *c = sb->s_fs_info;
1158         struct inode *vinode;
1159
1160         if (ino < BCACHEFS_ROOT_INO)
1161                 return ERR_PTR(-ESTALE);
1162
1163         vinode = bch2_vfs_inode_get(c, ino);
1164         if (IS_ERR(vinode))
1165                 return ERR_CAST(vinode);
1166         if (generation && vinode->i_generation != generation) {
1167                 /* we didn't find the right inode.. */
1168                 iput(vinode);
1169                 return ERR_PTR(-ESTALE);
1170         }
1171         return vinode;
1172 }
1173
1174 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
1175                 int fh_len, int fh_type)
1176 {
1177         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1178                                     bch2_nfs_get_inode);
1179 }
1180
1181 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
1182                 int fh_len, int fh_type)
1183 {
1184         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1185                                     bch2_nfs_get_inode);
1186 }
1187 #endif
1188
1189 static const struct export_operations bch_export_ops = {
1190         //.fh_to_dentry = bch2_fh_to_dentry,
1191         //.fh_to_parent = bch2_fh_to_parent,
1192         //.get_parent   = bch2_get_parent,
1193 };
1194
1195 static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum,
1196                                 struct bch_inode_info *inode,
1197                                 struct bch_inode_unpacked *bi)
1198 {
1199         bch2_inode_update_after_write(c, inode, bi, ~0);
1200
1201         inode->v.i_blocks       = bi->bi_sectors;
1202         inode->v.i_ino          = bi->bi_inum;
1203         inode->v.i_rdev         = bi->bi_dev;
1204         inode->v.i_generation   = bi->bi_generation;
1205         inode->v.i_size         = bi->bi_size;
1206
1207         inode->ei_flags         = 0;
1208         inode->ei_journal_seq   = 0;
1209         inode->ei_quota_reserved = 0;
1210         inode->ei_qid           = bch_qid(bi);
1211         inode->ei_subvol        = inum.subvol;
1212
1213         inode->v.i_mapping->a_ops = &bch_address_space_operations;
1214
1215         switch (inode->v.i_mode & S_IFMT) {
1216         case S_IFREG:
1217                 inode->v.i_op   = &bch_file_inode_operations;
1218                 inode->v.i_fop  = &bch_file_operations;
1219                 break;
1220         case S_IFDIR:
1221                 inode->v.i_op   = &bch_dir_inode_operations;
1222                 inode->v.i_fop  = &bch_dir_file_operations;
1223                 break;
1224         case S_IFLNK:
1225                 inode_nohighmem(&inode->v);
1226                 inode->v.i_op   = &bch_symlink_inode_operations;
1227                 break;
1228         default:
1229                 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1230                 inode->v.i_op   = &bch_special_inode_operations;
1231                 break;
1232         }
1233 }
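The switch at the end dispatches on the file type bits of i_mode (mode & S_IFMT) to pick the per-type inode and file operations; this is the same classification user space does with the same macros. A tiny stand-alone illustration:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	mode_t modes[] = { S_IFREG | 0644, S_IFDIR | 0755, S_IFLNK | 0777, S_IFCHR | 0600 };
	unsigned i;

	for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++)
		switch (modes[i] & S_IFMT) {
		case S_IFREG: puts("regular file"); break;
		case S_IFDIR: puts("directory");    break;
		case S_IFLNK: puts("symlink");      break;
		default:      puts("special");      break;
		}

	return 0;
}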
1234
1235 static struct inode *bch2_alloc_inode(struct super_block *sb)
1236 {
1237         struct bch_inode_info *inode;
1238
1239         inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1240         if (!inode)
1241                 return NULL;
1242
1243         inode_init_once(&inode->v);
1244         mutex_init(&inode->ei_update_lock);
1245         pagecache_lock_init(&inode->ei_pagecache_lock);
1246         mutex_init(&inode->ei_quota_lock);
1247         inode->ei_journal_seq = 0;
1248
1249         return &inode->v;
1250 }
1251
1252 static void bch2_i_callback(struct rcu_head *head)
1253 {
1254         struct inode *vinode = container_of(head, struct inode, i_rcu);
1255         struct bch_inode_info *inode = to_bch_ei(vinode);
1256
1257         kmem_cache_free(bch2_inode_cache, inode);
1258 }
1259
1260 static void bch2_destroy_inode(struct inode *vinode)
1261 {
1262         call_rcu(&vinode->i_rcu, bch2_i_callback);
1263 }
1264
1265 static int inode_update_times_fn(struct bch_inode_info *inode,
1266                                  struct bch_inode_unpacked *bi,
1267                                  void *p)
1268 {
1269         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1270
1271         bi->bi_atime    = timespec_to_bch2_time(c, inode->v.i_atime);
1272         bi->bi_mtime    = timespec_to_bch2_time(c, inode->v.i_mtime);
1273         bi->bi_ctime    = timespec_to_bch2_time(c, inode->v.i_ctime);
1274
1275         return 0;
1276 }
1277
1278 static int bch2_vfs_write_inode(struct inode *vinode,
1279                                 struct writeback_control *wbc)
1280 {
1281         struct bch_fs *c = vinode->i_sb->s_fs_info;
1282         struct bch_inode_info *inode = to_bch_ei(vinode);
1283         int ret;
1284
1285         mutex_lock(&inode->ei_update_lock);
1286         ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1287                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1288         mutex_unlock(&inode->ei_update_lock);
1289
1290         return ret;
1291 }
1292
1293 static void bch2_evict_inode(struct inode *vinode)
1294 {
1295         struct bch_fs *c = vinode->i_sb->s_fs_info;
1296         struct bch_inode_info *inode = to_bch_ei(vinode);
1297
1298         truncate_inode_pages_final(&inode->v.i_data);
1299
1300         clear_inode(&inode->v);
1301
1302         BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1303
1304         if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1305                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1306                                 KEY_TYPE_QUOTA_WARN);
1307                 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1308                                 KEY_TYPE_QUOTA_WARN);
1309                 bch2_inode_rm(c, inode_inum(inode), true);
1310         }
1311 }
1312
1313 void bch2_evict_subvolume_inodes(struct bch_fs *c,
1314                                  struct snapshot_id_list *s)
1315 {
1316         struct super_block *sb = c->vfs_sb;
1317         struct inode *inode;
1318
1319         spin_lock(&sb->s_inode_list_lock);
1320         list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1321                 if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
1322                     (inode->i_state & I_FREEING))
1323                         continue;
1324
1325                 d_mark_dontcache(inode);
1326                 d_prune_aliases(inode);
1327         }
1328         spin_unlock(&sb->s_inode_list_lock);
1329 again:
1330         cond_resched();
1331         spin_lock(&sb->s_inode_list_lock);
1332         list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1333                 if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
1334                     (inode->i_state & I_FREEING))
1335                         continue;
1336
1337                 if (!(inode->i_state & I_DONTCACHE)) {
1338                         d_mark_dontcache(inode);
1339                         d_prune_aliases(inode);
1340                 }
1341
1342                 spin_lock(&inode->i_lock);
1343                 if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
1344                     !(inode->i_state & I_FREEING)) {
1345                         wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
1346                         DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1347                         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
1348                         spin_unlock(&inode->i_lock);
1349                         spin_unlock(&sb->s_inode_list_lock);
1350                         schedule();
1351                         finish_wait(wq, &wait.wq_entry);
1352                         goto again;
1353                 }
1354
1355                 spin_unlock(&inode->i_lock);
1356         }
1357         spin_unlock(&sb->s_inode_list_lock);
1358 }
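bch2_evict_subvolume_inodes() makes two passes over sb->s_inodes: the first marks every inode belonging to the dying subvolumes I_DONTCACHE and prunes its dentries so it is dropped as soon as the last reference goes away; the second, looping at again:, waits out any that are still live. The wait uses the __I_NEW bit waitqueue because the VFS wakes that waitqueue when it finishes evicting an inode, so the loop sleeps until the racing eviction completes and then rescans the list from the top.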
1359
1360 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1361 {
1362         struct super_block *sb = dentry->d_sb;
1363         struct bch_fs *c = sb->s_fs_info;
1364         struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1365         unsigned shift = sb->s_blocksize_bits - 9;
1366         /*
1367          * this assumes inodes take up 64 bytes, which is a decent average
1368          * number:
1369          */
1370         u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1371         u64 fsid;
1372
1373         buf->f_type     = BCACHEFS_STATFS_MAGIC;
1374         buf->f_bsize    = sb->s_blocksize;
1375         buf->f_blocks   = usage.capacity >> shift;
1376         buf->f_bfree    = usage.free >> shift;
1377         buf->f_bavail   = avail_factor(usage.free) >> shift;
1378
1379         buf->f_files    = usage.nr_inodes + avail_inodes;
1380         buf->f_ffree    = avail_inodes;
1381
1382         fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
1383                le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
1384         buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1385         buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1386         buf->f_namelen  = BCH_NAME_MAX;
1387
1388         return 0;
1389 }
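Two details in bch2_statfs() are worth spelling out: free space comes back from bch2_fs_usage_read_short() in 512-byte sectors, so the << 3 in avail_inodes is the comment's 64-bytes-per-inode estimate (512 / 64 = 8 inodes per sector); and f_fsid is the 128-bit filesystem UUID folded to 64 bits by XORing its two little-endian halves, then split into two 32-bit words. A stand-alone sketch of the fsid folding (made-up UUID bytes, and it assumes a little-endian host so plain loads match le64_to_cpup()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 16-byte filesystem UUID (example value only) */
	uint8_t uuid[16] = {
		0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
		0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
	};
	uint64_t lo, hi, fsid;

	memcpy(&lo, uuid, sizeof(lo));		/* first 8 bytes */
	memcpy(&hi, uuid + 8, sizeof(hi));	/* last 8 bytes */
	fsid = lo ^ hi;

	printf("f_fsid = { 0x%08x, 0x%08x }\n",
	       (unsigned) (fsid & 0xffffffffUL),
	       (unsigned) (fsid >> 32));
	return 0;
}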
1390
1391 static int bch2_sync_fs(struct super_block *sb, int wait)
1392 {
1393         struct bch_fs *c = sb->s_fs_info;
1394
1395         if (c->opts.journal_flush_disabled)
1396                 return 0;
1397
1398         if (!wait) {
1399                 bch2_journal_flush_async(&c->journal, NULL);
1400                 return 0;
1401         }
1402
1403         return bch2_journal_flush(&c->journal);
1404 }
1405
1406 static struct bch_fs *bch2_path_to_fs(const char *path)
1407 {
1408         struct bch_fs *c;
1409         dev_t dev;
1410         int ret;
1411
1412         ret = lookup_bdev(path, &dev);
1413         if (ret)
1414                 return ERR_PTR(ret);
1415
1416         c = bch2_dev_to_fs(dev);
1417         if (c)
1418                 closure_put(&c->cl);
1419         return c ?: ERR_PTR(-ENOENT);
1420 }
1421
1422 static char **split_devs(const char *_dev_name, unsigned *nr)
1423 {
1424         char *dev_name = NULL, **devs = NULL, *s;
1425         size_t i, nr_devs = 0;
1426
1427         dev_name = kstrdup(_dev_name, GFP_KERNEL);
1428         if (!dev_name)
1429                 return NULL;
1430
1431         for (s = dev_name; s; s = strchr(s + 1, ':'))
1432                 nr_devs++;
1433
1434         devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
1435         if (!devs) {
1436                 kfree(dev_name);
1437                 return NULL;
1438         }
1439
1440         for (i = 0, s = dev_name;
1441              s;
1442              (s = strchr(s, ':')) && (*s++ = '\0'))
1443                 devs[i++] = s;
1444
1445         *nr = nr_devs;
1446         return devs;
1447 }
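Multi-device filesystems are mounted with a single device string of the form "/dev/sda1:/dev/sdb1:...", and split_devs() cuts it into a NULL-terminated array by overwriting each ':' with a NUL in the duplicated string. A user-space sketch of the same splitting logic (standard C library only, illustrative names):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char **split_colon_list(const char *list, unsigned *nr)
{
	char *copy = strdup(list);
	char **out, *s;
	unsigned n = 0, i = 0;

	if (!copy)
		return NULL;

	for (s = copy; s; s = strchr(s + 1, ':'))
		n++;				/* one entry per ':'-separated field */

	out = calloc(n + 1, sizeof(*out));	/* NULL-terminated, like split_devs() */
	if (!out) {
		free(copy);
		return NULL;
	}

	for (s = copy; s; (s = strchr(s, ':')) && (*s++ = '\0'))
		out[i++] = s;

	*nr = n;
	return out;
}

int main(void)
{
	unsigned nr, i;
	char **devs = split_colon_list("/dev/sda1:/dev/sdb1:/dev/sdc1", &nr);

	for (i = 0; devs && i < nr; i++)
		printf("dev[%u] = %s\n", i, devs[i]);

	if (devs)
		free(devs[0]);	/* frees the duplicated string that holds all fields */
	free(devs);
	return 0;
}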
1448
1449 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1450 {
1451         struct bch_fs *c = sb->s_fs_info;
1452         struct bch_opts opts = bch2_opts_empty();
1453         int ret;
1454
1455         opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1456
1457         ret = bch2_parse_mount_opts(c, &opts, data);
1458         if (ret)
1459                 return ret;
1460
1461         if (opts.read_only != c->opts.read_only) {
1462                 down_write(&c->state_lock);
1463
1464                 if (opts.read_only) {
1465                         bch2_fs_read_only(c);
1466
1467                         sb->s_flags |= SB_RDONLY;
1468                 } else {
1469                         ret = bch2_fs_read_write(c);
1470                         if (ret) {
1471                                 bch_err(c, "error going rw: %i", ret);
1472                                 up_write(&c->state_lock);
1473                                 return -EINVAL;
1474                         }
1475
1476                         sb->s_flags &= ~SB_RDONLY;
1477                 }
1478
1479                 c->opts.read_only = opts.read_only;
1480
1481                 up_write(&c->state_lock);
1482         }
1483
1484         if (opts.errors >= 0)
1485                 c->opts.errors = opts.errors;
1486
1487         return ret;
1488 }
1489
1490 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
1491 {
1492         struct bch_fs *c = root->d_sb->s_fs_info;
1493         struct bch_dev *ca;
1494         unsigned i;
1495         bool first = true;
1496
1497         for_each_online_member(ca, c, i) {
1498                 if (!first)
1499                         seq_putc(seq, ':');
1500                 first = false;
1501                 seq_puts(seq, "/dev/");
1502                 seq_puts(seq, ca->name);
1503         }
1504
1505         return 0;
1506 }
1507
1508 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1509 {
1510         struct bch_fs *c = root->d_sb->s_fs_info;
1511         enum bch_opt_id i;
1512         char buf[512];
1513
1514         for (i = 0; i < bch2_opts_nr; i++) {
1515                 const struct bch_option *opt = &bch2_opt_table[i];
1516                 u64 v = bch2_opt_get_by_id(&c->opts, i);
1517
1518                 if (!(opt->mode & OPT_MOUNT))
1519                         continue;
1520
1521                 if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1522                         continue;
1523
1524                 bch2_opt_to_text(&PBUF(buf), c, opt, v,
1525                                  OPT_SHOW_MOUNT_STYLE);
1526                 seq_putc(seq, ',');
1527                 seq_puts(seq, buf);
1528         }
1529
1530         return 0;
1531 }
1532
1533 static void bch2_put_super(struct super_block *sb)
1534 {
1535         struct bch_fs *c = sb->s_fs_info;
1536
1537         __bch2_fs_stop(c);
1538 }
1539
1540 static const struct super_operations bch_super_operations = {
1541         .alloc_inode    = bch2_alloc_inode,
1542         .destroy_inode  = bch2_destroy_inode,
1543         .write_inode    = bch2_vfs_write_inode,
1544         .evict_inode    = bch2_evict_inode,
1545         .sync_fs        = bch2_sync_fs,
1546         .statfs         = bch2_statfs,
1547         .show_devname   = bch2_show_devname,
1548         .show_options   = bch2_show_options,
1549         .remount_fs     = bch2_remount,
1550         .put_super      = bch2_put_super,
1551 #if 0
1552         .freeze_fs      = bch2_freeze,
1553         .unfreeze_fs    = bch2_unfreeze,
1554 #endif
1555 };
1556
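/*
 * sget() callbacks: bch2_test_super() matches an existing superblock against
 * the bch_fs pointers our devices resolved to, bch2_set_super() attaches a new
 * superblock to a bch_fs, and bch2_noset_super() is passed when we only want
 * to find an existing superblock, never create one.
 */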
1557 static int bch2_set_super(struct super_block *s, void *data)
1558 {
1559         s->s_fs_info = data;
1560         return 0;
1561 }
1562
1563 static int bch2_noset_super(struct super_block *s, void *data)
1564 {
1565         return -EBUSY;
1566 }
1567
1568 static int bch2_test_super(struct super_block *s, void *data)
1569 {
1570         struct bch_fs *c = s->s_fs_info;
1571         struct bch_fs **devs = data;
1572         unsigned i;
1573
1574         if (!c)
1575                 return false;
1576
1577         for (i = 0; devs[i]; i++)
1578                 if (c != devs[i])
1579                         return false;
1580         return true;
1581 }
1582
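/*
 * Mount path: parse options, resolve the device list, reuse an existing
 * superblock if these devices are already mounted, otherwise open the
 * filesystem and set up a new superblock.
 */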
1583 static struct dentry *bch2_mount(struct file_system_type *fs_type,
1584                                  int flags, const char *dev_name, void *data)
1585 {
1586         struct bch_fs *c;
1587         struct bch_dev *ca;
1588         struct super_block *sb;
1589         struct inode *vinode;
1590         struct bch_opts opts = bch2_opts_empty();
1591         char **devs;
1592         struct bch_fs **devs_to_fs = NULL;
1593         unsigned i, nr_devs;
1594         int ret;
1595
1596         opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
1597
1598         ret = bch2_parse_mount_opts(NULL, &opts, data);
1599         if (ret)
1600                 return ERR_PTR(ret);
1601
1602         if (!dev_name || strlen(dev_name) == 0)
1603                 return ERR_PTR(-EINVAL);
1604
1605         devs = split_devs(dev_name, &nr_devs);
1606         if (!devs)
1607                 return ERR_PTR(-ENOMEM);
1608
1609         devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
1610         if (!devs_to_fs) {
1611                 sb = ERR_PTR(-ENOMEM);
1612                 goto got_sb;
1613         }
1614
1615         for (i = 0; i < nr_devs; i++)
1616                 devs_to_fs[i] = bch2_path_to_fs(devs[i]);
1617
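        /*
         * Look for an already mounted superblock whose bch_fs matches every
         * device we were given; bch2_noset_super() keeps sget() from creating
         * a new one at this point:
         */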
1618         sb = sget(fs_type, bch2_test_super, bch2_noset_super,
1619                   flags|SB_NOSEC, devs_to_fs);
1620         if (!IS_ERR(sb))
1621                 goto got_sb;
1622
1623         c = bch2_fs_open(devs, nr_devs, opts);
1624         if (IS_ERR(c)) {
1625                 sb = ERR_CAST(c);
1626                 goto got_sb;
1627         }
1628
1629         /* Some options can't be parsed until after the fs is started: */
1630         ret = bch2_parse_mount_opts(c, &opts, data);
1631         if (ret) {
1632                 bch2_fs_stop(c);
1633                 sb = ERR_PTR(ret);
1634                 goto got_sb;
1635         }
1636
1637         bch2_opts_apply(&c->opts, opts);
1638
1639         sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
1640         if (IS_ERR(sb))
1641                 bch2_fs_stop(c);
1642 got_sb:
1643         kfree(devs_to_fs);
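        /* devs[] entries all point into devs[0]'s allocation - see split_devs() */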
1644         kfree(devs[0]);
1645         kfree(devs);
1646
1647         if (IS_ERR(sb))
1648                 return ERR_CAST(sb);
1649
1650         c = sb->s_fs_info;
1651
1652         if (sb->s_root) {
1653                 if ((flags ^ sb->s_flags) & SB_RDONLY) {
1654                         ret = -EBUSY;
1655                         goto err_put_super;
1656                 }
1657                 goto out;
1658         }
1659
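        /* New superblock: fill in the VFS fields from the bch_fs: */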
1660         sb->s_blocksize         = block_bytes(c);
1661         sb->s_blocksize_bits    = ilog2(block_bytes(c));
1662         sb->s_maxbytes          = MAX_LFS_FILESIZE;
1663         sb->s_op                = &bch_super_operations;
1664         sb->s_export_op         = &bch_export_ops;
1665 #ifdef CONFIG_BCACHEFS_QUOTA
1666         sb->s_qcop              = &bch2_quotactl_operations;
1667         sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
1668 #endif
1669         sb->s_xattr             = bch2_xattr_handlers;
1670         sb->s_magic             = BCACHEFS_STATFS_MAGIC;
1671         sb->s_time_gran         = c->sb.nsec_per_time_unit;
1672         sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
1673         sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
1674         c->vfs_sb               = sb;
1675         strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
1676
1677         ret = super_setup_bdi(sb);
1678         if (ret)
1679                 goto err_put_super;
1680
1681         sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
1682
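        /*
         * Use the first online member device for sb->s_bdev and sb->s_dev
         * (saved as c->dev below). The explicit percpu_ref_put() is needed
         * because we break out of for_each_online_member() early.
         */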
1683         for_each_online_member(ca, c, i) {
1684                 struct block_device *bdev = ca->disk_sb.bdev;
1685
1686                 /* XXX: create an anonymous device for multi device filesystems */
1687                 sb->s_bdev      = bdev;
1688                 sb->s_dev       = bdev->bd_dev;
1689                 percpu_ref_put(&ca->io_ref);
1690                 break;
1691         }
1692
1693         c->dev = sb->s_dev;
1694
1695 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1696         if (c->opts.acl)
1697                 sb->s_flags     |= SB_POSIXACL;
1698 #endif
1699
1700         vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
1701         if (IS_ERR(vinode)) {
1702                 bch_err(c, "error mounting: error getting root inode %i",
1703                         (int) PTR_ERR(vinode));
1704                 ret = PTR_ERR(vinode);
1705                 goto err_put_super;
1706         }
1707
1708         sb->s_root = d_make_root(vinode);
1709         if (!sb->s_root) {
1710                 bch_err(c, "error mounting: error allocating root dentry");
1711                 ret = -ENOMEM;
1712                 goto err_put_super;
1713         }
1714
1715         sb->s_flags |= SB_ACTIVE;
1716 out:
1717         return dget(sb->s_root);
1718
1719 err_put_super:
1720         deactivate_locked_super(sb);
1721         return ERR_PTR(ret);
1722 }
1723
1724 static void bch2_kill_sb(struct super_block *sb)
1725 {
1726         struct bch_fs *c = sb->s_fs_info;
1727
1728         generic_shutdown_super(sb);
1729         bch2_fs_free(c);
1730 }
1731
1732 static struct file_system_type bcache_fs_type = {
1733         .owner          = THIS_MODULE,
1734         .name           = "bcachefs",
1735         .mount          = bch2_mount,
1736         .kill_sb        = bch2_kill_sb,
1737         .fs_flags       = FS_REQUIRES_DEV,
1738 };
1739
1740 MODULE_ALIAS_FS("bcachefs");
1741
1742 void bch2_vfs_exit(void)
1743 {
1744         unregister_filesystem(&bcache_fs_type);
1745         if (bch2_inode_cache)
1746                 kmem_cache_destroy(bch2_inode_cache);
1747 }
1748
1749 int __init bch2_vfs_init(void)
1750 {
1751         int ret = -ENOMEM;
1752
1753         bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
1754         if (!bch2_inode_cache)
1755                 goto err;
1756
1757         ret = register_filesystem(&bcache_fs_type);
1758         if (ret)
1759                 goto err;
1760
1761         return 0;
1762 err:
1763         bch2_vfs_exit();
1764         return ret;
1765 }
1766
1767 #endif /* NO_BCACHEFS_FS */