]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fs.c
Update bcachefs sources to 6d9ff21de7 bcachefs: Kill journal buf bloom filter
[bcachefs-tools-debian] / libbcachefs / fs.c
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "extents.h"
12 #include "fs.h"
13 #include "fs-common.h"
14 #include "fs-io.h"
15 #include "fs-ioctl.h"
16 #include "fsck.h"
17 #include "inode.h"
18 #include "io.h"
19 #include "journal.h"
20 #include "keylist.h"
21 #include "quota.h"
22 #include "super.h"
23 #include "xattr.h"
24
25 #include <linux/aio.h>
26 #include <linux/backing-dev.h>
27 #include <linux/exportfs.h>
28 #include <linux/fiemap.h>
29 #include <linux/module.h>
30 #include <linux/pagemap.h>
31 #include <linux/posix_acl.h>
32 #include <linux/random.h>
33 #include <linux/statfs.h>
34 #include <linux/string.h>
35 #include <linux/xattr.h>
36
37 static struct kmem_cache *bch2_inode_cache;
38
39 static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum,
40                                 struct bch_inode_info *,
41                                 struct bch_inode_unpacked *);
42
/*
 * Advance dst->ei_journal_seq to @journal_seq, never moving it backwards.
 *
 * Lock-free: loops on cmpxchg until either our update lands or another
 * thread has already advanced the sequence past @journal_seq.
 */
static void journal_seq_copy(struct bch_fs *c,
			     struct bch_inode_info *dst,
			     u64 journal_seq)
{
	/*
	 * atomic64_cmpxchg has a fallback for archs that don't support it,
	 * cmpxchg does not:
	 */
	atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
	u64 old, v = READ_ONCE(dst->ei_journal_seq);

	do {
		old = v;

		/* already at or past journal_seq: nothing to do */
		if (old >= journal_seq)
			break;
	} while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
}
61
/*
 * Drop a pagecache lock reference: @i is +1 for the "add" side, -1 for
 * the "block" side. Release ordering pairs with the acquire in
 * __pagecache_lock_tryget(); waiters are woken when the count reaches 0.
 */
static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
{
	BUG_ON(atomic_long_read(&lock->v) == 0);

	if (atomic_long_sub_return_release(i, &lock->v) == 0)
		wake_up_all(&lock->wait);
}
69
/*
 * Try to take a pagecache lock reference. The counter is positive while
 * "add" holders are active and negative while "block" holders are active;
 * the attempt fails if the opposite side currently holds the lock.
 */
static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
{
	long v = atomic_long_read(&lock->v), old;

	do {
		old = v;

		/* opposite-signed holders present: can't take it now */
		if (i > 0 ? v < 0 : v > 0)
			return false;
	} while ((v = atomic_long_cmpxchg_acquire(&lock->v,
					old, old + i)) != old);
	return true;
}
83
/* Block until a pagecache lock reference of the given sign is acquired. */
static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
{
	wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
}
88
/* Release an "add" (shared, page-adding) hold on the pagecache lock. */
void bch2_pagecache_add_put(struct pagecache_lock *lock)
{
	__pagecache_lock_put(lock, 1);
}
93
/* Non-blocking attempt to take an "add" hold on the pagecache lock. */
bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
{
	return __pagecache_lock_tryget(lock, 1);
}
98
/* Take an "add" hold on the pagecache lock, blocking while blocked. */
void bch2_pagecache_add_get(struct pagecache_lock *lock)
{
	__pagecache_lock_get(lock, 1);
}
103
/* Release a "block" (exclusive, page-add-blocking) hold on the lock. */
void bch2_pagecache_block_put(struct pagecache_lock *lock)
{
	__pagecache_lock_put(lock, -1);
}
108
/* Take a "block" hold on the pagecache lock, waiting out "add" holders. */
void bch2_pagecache_block_get(struct pagecache_lock *lock)
{
	__pagecache_lock_get(lock, -1);
}
113
/*
 * Sync the VFS inode with a freshly written btree inode: copy nlink,
 * uid/gid, mode, the timestamps selected by @fields (ATTR_* mask), and
 * cache the unpacked inode in ei_inode.
 */
void bch2_inode_update_after_write(struct bch_fs *c,
				   struct bch_inode_info *inode,
				   struct bch_inode_unpacked *bi,
				   unsigned fields)
{
	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
	i_uid_write(&inode->v, bi->bi_uid);
	i_gid_write(&inode->v, bi->bi_gid);
	inode->v.i_mode	= bi->bi_mode;

	if (fields & ATTR_ATIME)
		inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
	if (fields & ATTR_MTIME)
		inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
	if (fields & ATTR_CTIME)
		inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);

	inode->ei_inode		= *bi;

	bch2_inode_flags_to_vfs(inode);
}
135
/*
 * Read-modify-write the on-disk inode inside a btree transaction.
 *
 * @set is an optional callback applied to the unpacked inode before it is
 * written back; @fields is the ATTR_* mask forwarded to
 * bch2_inode_update_after_write() on success. Retries the whole
 * transaction on -EINTR (lock restart).
 */
int __must_check bch2_write_inode(struct bch_fs *c,
				  struct bch_inode_info *inode,
				  inode_set_fn set,
				  void *p, unsigned fields)
{
	struct btree_trans trans;
	struct btree_iter iter = { NULL };
	struct bch_inode_unpacked inode_u;
	int ret;

	bch2_trans_init(&trans, c, 0, 512);
retry:
	bch2_trans_begin(&trans);

	ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
				BTREE_ITER_INTENT) ?:
		(set ? set(inode, &inode_u, p) : 0) ?:
		bch2_inode_write(&trans, &iter, &inode_u) ?:
		bch2_trans_commit(&trans, NULL,
				  &inode->ei_journal_seq,
				  BTREE_INSERT_NOFAIL);

	/*
	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
	 * this is important for inode updates via bchfs_write_index_update
	 */
	if (!ret)
		bch2_inode_update_after_write(c, inode, &inode_u, fields);

	bch2_trans_iter_exit(&trans, &iter);

	if (ret == -EINTR)
		goto retry;

	bch2_trans_exit(&trans);
	return ret < 0 ? ret : 0;
}
173
174 int bch2_fs_quota_transfer(struct bch_fs *c,
175                            struct bch_inode_info *inode,
176                            struct bch_qid new_qid,
177                            unsigned qtypes,
178                            enum quota_acct_mode mode)
179 {
180         unsigned i;
181         int ret;
182
183         qtypes &= enabled_qtypes(c);
184
185         for (i = 0; i < QTYP_NR; i++)
186                 if (new_qid.q[i] == inode->ei_qid.q[i])
187                         qtypes &= ~(1U << i);
188
189         if (!qtypes)
190                 return 0;
191
192         mutex_lock(&inode->ei_quota_lock);
193
194         ret = bch2_quota_transfer(c, qtypes, new_qid,
195                                   inode->ei_qid,
196                                   inode->v.i_blocks +
197                                   inode->ei_quota_reserved,
198                                   mode);
199         if (!ret)
200                 for (i = 0; i < QTYP_NR; i++)
201                         if (qtypes & (1 << i))
202                                 inode->ei_qid.q[i] = new_qid.q[i];
203
204         mutex_unlock(&inode->ei_quota_lock);
205
206         return ret;
207 }
208
209 static int bch2_iget5_test(struct inode *vinode, void *p)
210 {
211         struct bch_inode_info *inode = to_bch_ei(vinode);
212         subvol_inum *inum = p;
213
214         return inode->ei_subvol == inum->subvol &&
215                 inode->ei_inode.bi_inum == inum->inum;
216 }
217
218 static int bch2_iget5_set(struct inode *vinode, void *p)
219 {
220         struct bch_inode_info *inode = to_bch_ei(vinode);
221         subvol_inum *inum = p;
222
223         inode->v.i_ino          = inum->inum;
224         inode->ei_subvol        = inum->subvol;
225         inode->ei_inode.bi_inum = inum->inum;
226         return 0;
227 }
228
/* Hash a (subvol, inum) pair for the VFS inode hash table. */
static unsigned bch2_inode_hash(subvol_inum inum)
{
	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
}
233
/*
 * Look up (or create and initialize) the VFS inode for @inum.
 *
 * Returns the inode with an elevated refcount, or an ERR_PTR on
 * allocation failure or if the inode can't be found in the btree.
 */
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
	struct bch_inode_unpacked inode_u;
	struct bch_inode_info *inode;
	int ret;

	inode = to_bch_ei(iget5_locked(c->vfs_sb,
				       bch2_inode_hash(inum),
				       bch2_iget5_test,
				       bch2_iget5_set,
				       &inum));
	if (unlikely(!inode))
		return ERR_PTR(-ENOMEM);
	/* cache hit: already initialized */
	if (!(inode->v.i_state & I_NEW))
		return &inode->v;

	ret = bch2_inode_find_by_inum(c, inum, &inode_u);
	if (ret) {
		iget_failed(&inode->v);
		return ERR_PTR(ret);
	}

	bch2_vfs_inode_init(c, inum, inode, &inode_u);

	unlock_new_inode(&inode->v);

	return &inode->v;
}
262
/*
 * Create a new inode: preallocate ACLs and the VFS inode, run the btree
 * create transaction with quota accounting, then insert the result into
 * the VFS inode cache.
 *
 * With BCH_CREATE_TMPFILE no dirent is created and the parent directory
 * is neither locked nor updated. Returns the new hashed, unlocked inode,
 * or an ERR_PTR.
 */
struct bch_inode_info *
__bch2_create(struct user_namespace *mnt_userns,
	      struct bch_inode_info *dir, struct dentry *dentry,
	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
	      unsigned flags)
{
	struct bch_fs *c = dir->v.i_sb->s_fs_info;
	struct btree_trans trans;
	struct bch_inode_unpacked dir_u;
	struct bch_inode_info *inode, *old;
	struct bch_inode_unpacked inode_u;
	struct posix_acl *default_acl = NULL, *acl = NULL;
	subvol_inum inum;
	u64 journal_seq = 0;
	int ret;

	/*
	 * preallocate acls + vfs inode before btree transaction, so that
	 * nothing can fail after the transaction succeeds:
	 */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
	if (ret)
		return ERR_PTR(ret);
#endif
	inode = to_bch_ei(new_inode(c->vfs_sb));
	if (unlikely(!inode)) {
		inode = ERR_PTR(-ENOMEM);
		goto err;
	}

	bch2_inode_init_early(c, &inode_u);

	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_lock(&dir->ei_update_lock);

	/* reserve extra transaction mem for the dirent name, if any */
	bch2_trans_init(&trans, c, 8,
			2048 + (!(flags & BCH_CREATE_TMPFILE)
				? dentry->d_name.len : 0));
retry:
	bch2_trans_begin(&trans);

	ret   = bch2_create_trans(&trans,
				  inode_inum(dir), &dir_u, &inode_u,
				  !(flags & BCH_CREATE_TMPFILE)
				  ? &dentry->d_name : NULL,
				  from_kuid(mnt_userns, current_fsuid()),
				  from_kgid(mnt_userns, current_fsgid()),
				  mode, rdev,
				  default_acl, acl, snapshot_src, flags) ?:
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
				KEY_TYPE_QUOTA_PREALLOC);
	if (unlikely(ret))
		goto err_before_quota;

	ret   = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
	if (unlikely(ret)) {
		/* commit failed: undo the preallocated quota charge */
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
err_before_quota:
		if (ret == -EINTR)
			goto retry;
		goto err_trans;
	}

	if (!(flags & BCH_CREATE_TMPFILE)) {
		bch2_inode_update_after_write(c, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		journal_seq_copy(c, dir, journal_seq);
		mutex_unlock(&dir->ei_update_lock);
	}

	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
	inum.inum = inode_u.bi_inum;

	bch2_vfs_inode_init(c, inum, inode, &inode_u);
	journal_seq_copy(c, inode, journal_seq);

	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

	/*
	 * we must insert the new inode into the inode cache before calling
	 * bch2_trans_exit() and dropping locks, else we could race with another
	 * thread pulling the inode in and modifying it:
	 */

	inode->v.i_state |= I_CREATING;

	old = to_bch_ei(inode_insert5(&inode->v,
				      bch2_inode_hash(inum),
				      bch2_iget5_test,
				      bch2_iget5_set,
				      &inum));
	BUG_ON(!old);

	if (unlikely(old != inode)) {
		/*
		 * We raced, another process pulled the new inode into cache
		 * before us:
		 */
		journal_seq_copy(c, old, journal_seq);
		make_bad_inode(&inode->v);
		iput(&inode->v);

		inode = old;
	} else {
		/*
		 * we really don't want insert_inode_locked2() to be setting
		 * I_NEW...
		 */
		unlock_new_inode(&inode->v);
	}

	bch2_trans_exit(&trans);
err:
	posix_acl_release(default_acl);
	posix_acl_release(acl);
	return inode;
err_trans:
	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_unlock(&dir->ei_update_lock);

	bch2_trans_exit(&trans);
	make_bad_inode(&inode->v);
	iput(&inode->v);
	inode = ERR_PTR(ret);
	goto err;
}
392
393 /* methods */
394
/*
 * ->lookup(): resolve @dentry in @vdir via a dirent lookup, then fetch
 * the target inode. A failed lookup leaves vinode NULL, so
 * d_splice_alias() installs a negative dentry.
 */
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
				  unsigned int flags)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
	struct inode *vinode = NULL;
	subvol_inum inum = { .subvol = 1 };
	int ret;

	ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
				 &dentry->d_name, &inum);

	if (!ret)
		vinode = bch2_vfs_inode_get(c, inum);

	return d_splice_alias(vinode, dentry);
}
413
414 static int bch2_mknod(struct user_namespace *mnt_userns,
415                       struct inode *vdir, struct dentry *dentry,
416                       umode_t mode, dev_t rdev)
417 {
418         struct bch_inode_info *inode =
419                 __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
420                               (subvol_inum) { 0 }, 0);
421
422         if (IS_ERR(inode))
423                 return PTR_ERR(inode);
424
425         d_instantiate(dentry, &inode->v);
426         return 0;
427 }
428
/* ->create(): regular-file creation, implemented via bch2_mknod(). */
static int bch2_create(struct user_namespace *mnt_userns,
		       struct inode *vdir, struct dentry *dentry,
		       umode_t mode, bool excl)
{
	return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0);
}
435
/*
 * Core hard-link helper: run the link transaction and, on success,
 * propagate the updated dir/inode state to the VFS inodes. Caller must
 * hold inode->v.i_rwsem; ei_update_lock is taken here.
 */
static int __bch2_link(struct bch_fs *c,
		       struct bch_inode_info *inode,
		       struct bch_inode_info *dir,
		       struct dentry *dentry)
{
	struct btree_trans trans;
	struct bch_inode_unpacked dir_u, inode_u;
	int ret;

	mutex_lock(&inode->ei_update_lock);
	bch2_trans_init(&trans, c, 4, 1024);

	ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0,
			bch2_link_trans(&trans,
					inode_inum(dir),   &dir_u,
					inode_inum(inode), &inode_u,
					&dentry->d_name));

	if (likely(!ret)) {
		BUG_ON(inode_u.bi_inum != inode->v.i_ino);

		journal_seq_copy(c, inode, dir->ei_journal_seq);
		bch2_inode_update_after_write(c, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
	}

	bch2_trans_exit(&trans);
	mutex_unlock(&inode->ei_update_lock);
	return ret;
}
467
/*
 * ->link(): hard-link @old_dentry's inode into @vdir as @dentry,
 * taking a new inode reference for the new dentry on success.
 */
static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
		     struct dentry *dentry)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
	int ret;

	lockdep_assert_held(&inode->v.i_rwsem);

	ret = __bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		return ret;

	ihold(&inode->v);
	d_instantiate(dentry, &inode->v);
	return 0;
}
486
/*
 * Core unlink helper: remove @dentry from @vdir in one transaction and
 * propagate the updated dir/inode state to the VFS inodes.
 * @deleting_snapshot is forwarded to the transaction for subvolume
 * deletion handling.
 */
int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
		  bool deleting_snapshot)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_inode_unpacked dir_u, inode_u;
	struct btree_trans trans;
	int ret;

	/* lock both inodes in a deadlock-safe order */
	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
	bch2_trans_init(&trans, c, 4, 1024);

	ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
			      BTREE_INSERT_NOFAIL,
			bch2_unlink_trans(&trans,
					  inode_inum(dir), &dir_u,
					  &inode_u, &dentry->d_name,
					  deleting_snapshot));

	if (likely(!ret)) {
		BUG_ON(inode_u.bi_inum != inode->v.i_ino);

		journal_seq_copy(c, inode, dir->ei_journal_seq);
		bch2_inode_update_after_write(c, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		bch2_inode_update_after_write(c, inode, &inode_u,
					      ATTR_MTIME);
	}

	bch2_trans_exit(&trans);
	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);

	return ret;
}
522
/* ->unlink(): plain unlink, not a snapshot deletion. */
static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
{
	return __bch2_unlink(vdir, dentry, false);
}
527
/*
 * ->symlink(): create the symlink as a tmpfile, write the target path
 * into its pagecache and flush it, then link it into @vdir. The tmpfile
 * is dropped via iput() on any failure.
 */
static int bch2_symlink(struct user_namespace *mnt_userns,
			struct inode *vdir, struct dentry *dentry,
			const char *symname)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
	int ret;

	inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
	if (unlikely(IS_ERR(inode)))
		return PTR_ERR(inode);

	inode_lock(&inode->v);
	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
	inode_unlock(&inode->v);

	if (unlikely(ret))
		goto err;

	/* make sure the symlink data is on disk before it becomes visible */
	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
	if (unlikely(ret))
		goto err;

	journal_seq_copy(c, dir, inode->ei_journal_seq);

	ret = __bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		goto err;

	d_instantiate(dentry, &inode->v);
	return 0;
err:
	iput(&inode->v);
	return ret;
}
564
/* ->mkdir(): directory creation, implemented via bch2_mknod(). */
static int bch2_mkdir(struct user_namespace *mnt_userns,
		      struct inode *vdir, struct dentry *dentry, umode_t mode)
{
	return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0);
}
570
/*
 * ->rename(): handles plain rename, RENAME_OVERWRITE and
 * RENAME_EXCHANGE. Project-quota usage is transferred before the rename
 * transaction (PREALLOC, may fail) and re-synced afterwards (NOCHECK,
 * best-effort) so quotas follow the inode across directories.
 */
static int bch2_rename2(struct user_namespace *mnt_userns,
			struct inode *src_vdir, struct dentry *src_dentry,
			struct inode *dst_vdir, struct dentry *dst_dentry,
			unsigned flags)
{
	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
	struct bch_inode_unpacked dst_dir_u, src_dir_u;
	struct bch_inode_unpacked src_inode_u, dst_inode_u;
	struct btree_trans trans;
	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
		? BCH_RENAME_EXCHANGE
		: dst_dentry->d_inode
		? BCH_RENAME_OVERWRITE : BCH_RENAME;
	u64 journal_seq = 0;
	int ret;

	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
		return -EINVAL;

	if (mode == BCH_RENAME_OVERWRITE) {
		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
						   0, LLONG_MAX);
		if (ret)
			return ret;
	}

	bch2_trans_init(&trans, c, 8, 2048);

	/* lock all involved inodes in a deadlock-safe order */
	bch2_lock_inodes(INODE_UPDATE_LOCK,
			 src_dir,
			 dst_dir,
			 src_inode,
			 dst_inode);

	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, src_inode,
					     dst_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	if (mode == BCH_RENAME_EXCHANGE &&
	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, dst_inode,
					     src_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0,
			bch2_rename_trans(&trans,
					  inode_inum(src_dir), &src_dir_u,
					  inode_inum(dst_dir), &dst_dir_u,
					  &src_inode_u,
					  &dst_inode_u,
					  &src_dentry->d_name,
					  &dst_dentry->d_name,
					  mode));
	if (unlikely(ret))
		goto err;

	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
	BUG_ON(dst_inode &&
	       dst_inode->v.i_ino != dst_inode_u.bi_inum);

	bch2_inode_update_after_write(c, src_dir, &src_dir_u,
				      ATTR_MTIME|ATTR_CTIME);
	journal_seq_copy(c, src_dir, journal_seq);

	if (src_dir != dst_dir) {
		bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		journal_seq_copy(c, dst_dir, journal_seq);
	}

	bch2_inode_update_after_write(c, src_inode, &src_inode_u,
				      ATTR_CTIME);
	journal_seq_copy(c, src_inode, journal_seq);

	if (dst_inode) {
		bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
					      ATTR_CTIME);
		journal_seq_copy(c, dst_inode, journal_seq);
	}
err:
	bch2_trans_exit(&trans);

	/* re-sync quotas with final inode state (also undoes on failure) */
	bch2_fs_quota_transfer(c, src_inode,
			       bch_qid(&src_inode->ei_inode),
			       1 << QTYP_PRJ,
			       KEY_TYPE_QUOTA_NOCHECK);
	if (dst_inode)
		bch2_fs_quota_transfer(c, dst_inode,
				       bch_qid(&dst_inode->ei_inode),
				       1 << QTYP_PRJ,
				       KEY_TYPE_QUOTA_NOCHECK);

	bch2_unlock_inodes(INODE_UPDATE_LOCK,
			   src_dir,
			   dst_dir,
			   src_inode,
			   dst_inode);

	return ret;
}
684
/*
 * Copy the attributes selected by attr->ia_valid into the unpacked
 * inode. Mirrors setattr_copy(), including clearing SGID on chmod when
 * the caller isn't in the file's group and lacks CAP_FSETID.
 */
static void bch2_setattr_copy(struct user_namespace *mnt_userns,
			      struct bch_inode_info *inode,
			      struct bch_inode_unpacked *bi,
			      struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	unsigned int ia_valid = attr->ia_valid;

	if (ia_valid & ATTR_UID)
		bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid);
	if (ia_valid & ATTR_GID)
		bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid);

	if (ia_valid & ATTR_SIZE)
		bi->bi_size = attr->ia_size;

	if (ia_valid & ATTR_ATIME)
		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
	if (ia_valid & ATTR_MTIME)
		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
	if (ia_valid & ATTR_CTIME)
		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);

	if (ia_valid & ATTR_MODE) {
		umode_t mode = attr->ia_mode;
		/* use the new gid if the same setattr is also changing it */
		kgid_t gid = ia_valid & ATTR_GID
			? attr->ia_gid
			: inode->v.i_gid;

		if (!in_group_p(gid) &&
		    !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID))
			mode &= ~S_ISGID;
		bi->bi_mode = mode;
	}
}
720
/*
 * Apply all non-ATTR_SIZE attribute changes: transfer quotas for
 * uid/gid changes, then update the inode (and, on chmod, its access
 * ACL) in a retried btree transaction.
 */
int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
			 struct bch_inode_info *inode,
			 struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_qid qid;
	struct btree_trans trans;
	struct btree_iter inode_iter = { NULL };
	struct bch_inode_unpacked inode_u;
	struct posix_acl *acl = NULL;
	int ret;

	mutex_lock(&inode->ei_update_lock);

	qid = inode->ei_qid;

	if (attr->ia_valid & ATTR_UID)
		qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);

	if (attr->ia_valid & ATTR_GID)
		qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);

	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
				     KEY_TYPE_QUOTA_PREALLOC);
	if (ret)
		goto err;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);
	/* drop any acl from a previous (restarted) iteration */
	kfree(acl);
	acl = NULL;

	ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
			      BTREE_ITER_INTENT);
	if (ret)
		goto btree_err;

	bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);

	if (attr->ia_valid & ATTR_MODE) {
		ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
				     inode_u.bi_mode, &acl);
		if (ret)
			goto btree_err;
	}

	ret =	bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
		bch2_trans_commit(&trans, NULL,
				  &inode->ei_journal_seq,
				  BTREE_INSERT_NOFAIL);
btree_err:
	bch2_trans_iter_exit(&trans, &inode_iter);

	if (ret == -EINTR)
		goto retry;
	if (unlikely(ret))
		goto err_trans;

	bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);

	if (acl)
		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
	bch2_trans_exit(&trans);
err:
	mutex_unlock(&inode->ei_update_lock);

	return ret;
}
791
/*
 * ->getattr(): fill @stat from the cached VFS inode, plus birth time
 * (STATX_BTIME) and the immutable/append/nodump STATX attribute flags
 * derived from the bcachefs inode flags.
 */
static int bch2_getattr(struct user_namespace *mnt_userns,
			const struct path *path, struct kstat *stat,
			u32 request_mask, unsigned query_flags)
{
	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	stat->dev	= inode->v.i_sb->s_dev;
	stat->ino	= inode->v.i_ino;
	stat->mode	= inode->v.i_mode;
	stat->nlink	= inode->v.i_nlink;
	stat->uid	= inode->v.i_uid;
	stat->gid	= inode->v.i_gid;
	stat->rdev	= inode->v.i_rdev;
	stat->size	= i_size_read(&inode->v);
	stat->atime	= inode->v.i_atime;
	stat->mtime	= inode->v.i_mtime;
	stat->ctime	= inode->v.i_ctime;
	stat->blksize	= block_bytes(c);
	stat->blocks	= inode->v.i_blocks;

	if (request_mask & STATX_BTIME) {
		stat->result_mask |= STATX_BTIME;
		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
	}

	if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;

	if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
		stat->attributes |= STATX_ATTR_APPEND;
	stat->attributes_mask	 |= STATX_ATTR_APPEND;

	if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
		stat->attributes |= STATX_ATTR_NODUMP;
	stat->attributes_mask	 |= STATX_ATTR_NODUMP;

	return 0;
}
832
833 static int bch2_setattr(struct user_namespace *mnt_userns,
834                         struct dentry *dentry, struct iattr *iattr)
835 {
836         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
837         int ret;
838
839         lockdep_assert_held(&inode->v.i_rwsem);
840
841         ret = setattr_prepare(mnt_userns, dentry, iattr);
842         if (ret)
843                 return ret;
844
845         return iattr->ia_valid & ATTR_SIZE
846                 ? bch2_truncate(mnt_userns, inode, iattr)
847                 : bch2_setattr_nonsize(mnt_userns, inode, iattr);
848 }
849
/*
 * ->tmpfile(): create an unlinked inode (no dirent) and attach it to
 * @dentry, marked as a tmpfile.
 */
static int bch2_tmpfile(struct user_namespace *mnt_userns,
			struct inode *vdir, struct dentry *dentry, umode_t mode)
{
	struct bch_inode_info *inode =
		__bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);

	if (IS_ERR(inode))
		return PTR_ERR(inode);

	d_mark_tmpfile(dentry, &inode->v);
	d_instantiate(dentry, &inode->v);
	return 0;
}
864
/*
 * Report one bcachefs extent key to the fiemap machinery.
 *
 * Direct-data extents may have multiple pointers (replicas); one fiemap
 * extent is emitted per pointer.  Inline-data and reservation keys each
 * map to a single fiemap extent with the appropriate flags.  Any other
 * key type reaching here is a caller bug (BUG()).
 *
 * Returns 0 on success, or the nonzero value from
 * fiemap_fill_next_extent() (1 = userspace buffer full, <0 = error).
 */
static int bch2_fill_extent(struct bch_fs *c,
			    struct fiemap_extent_info *info,
			    struct bkey_s_c k, unsigned flags)
{
	if (bkey_extent_is_direct_data(k.k)) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		int ret;

		/* reflink'd data is by definition shared */
		if (k.k->type == KEY_TYPE_reflink_v)
			flags |= FIEMAP_EXTENT_SHARED;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			int flags2 = 0;
			u64 offset = p.ptr.offset;

			if (p.crc.compression_type)
				flags2 |= FIEMAP_EXTENT_ENCODED;
			else
				/*
				 * Uncompressed: skip past the checksummed
				 * region's prefix to the logical start
				 */
				offset += p.crc.offset;

			/* block_size is in 512-byte sectors here */
			if ((offset & (c->opts.block_size - 1)) ||
			    (k.k->size & (c->opts.block_size - 1)))
				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;

			/* fiemap wants bytes; key offsets/sizes are sectors */
			ret = fiemap_fill_next_extent(info,
						bkey_start_offset(k.k) << 9,
						offset << 9,
						k.k->size << 9, flags|flags2);
			if (ret)
				return ret;
		}

		return 0;
	} else if (bkey_extent_is_inline_data(k.k)) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DATA_INLINE);
	} else if (k.k->type == KEY_TYPE_reservation) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DELALLOC|
					       FIEMAP_EXTENT_UNWRITTEN);
	} else {
		BUG();
	}
}
917
/*
 * ->fiemap: walk the extents btree for [start, start + len) and report
 * each data extent to userspace.
 *
 * Extents are reported one behind the iterator position (via @prev) so
 * that the final extent can be flagged FIEMAP_EXTENT_LAST.  Reflink
 * pointers are resolved through bch2_read_indirect_extent(), and each
 * key is trimmed to the portion overlapping the iterator position.
 *
 * The whole walk restarts from `retry` on -EINTR (transaction restart).
 */
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
	unsigned offset_into_extent, sectors;
	bool have_extent = false;
	u32 snapshot;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	/* reject byte ranges that wrap around */
	if (start + len < start)
		return -EINVAL;

	/* from here on, offsets are in 512-byte sectors */
	start >>= 9;

	bch2_bkey_buf_init(&cur);
	bch2_bkey_buf_init(&prev);
	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	/* snapshot may change across transaction restarts; re-look it up */
	ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
			     SPOS(ei->v.i_ino, start, snapshot), 0);

	while ((k = bch2_btree_iter_peek(&iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_cmp(iter.pos, end) < 0) {
		enum btree_id data_btree = BTREE_ID_extents;

		/* skip keys with no data (holes, etc.) */
		if (!bkey_extent_is_data(k.k) &&
		    k.k->type != KEY_TYPE_reservation) {
			bch2_btree_iter_advance(&iter);
			continue;
		}

		offset_into_extent	= iter.pos.offset -
			bkey_start_offset(k.k);
		sectors			= k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&cur, c, k);

		/* resolve reflink pointers to the real extent */
		ret = bch2_read_indirect_extent(&trans, &data_btree,
					&offset_into_extent, &cur);
		if (ret)
			break;

		k = bkey_i_to_s_c(cur.k);
		/* make sure prev can hold a copy of this key later */
		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

		sectors = min(sectors, k.k->size - offset_into_extent);

		/* trim the key to just the part at the iterator position */
		bch2_cut_front(POS(k.k->p.inode,
				   bkey_start_offset(k.k) +
				   offset_into_extent),
			       cur.k);
		bch2_key_resize(&cur.k->k, sectors);
		cur.k->k.p = iter.pos;
		cur.k->k.p.offset += cur.k->k.size;

		/* emit the previous extent now that we know it isn't last */
		if (have_extent) {
			ret = bch2_fill_extent(c, info,
					bkey_i_to_s_c(prev.k), 0);
			if (ret)
				break;
		}

		bkey_copy(prev.k, cur.k);
		have_extent = true;

		bch2_btree_iter_set_pos(&iter,
			POS(iter.pos.inode, iter.pos.offset + sectors));
	}
	/* remember where we got to, in case of restart */
	start = iter.pos.offset;
	bch2_trans_iter_exit(&trans, &iter);
err:
	if (ret == -EINTR)
		goto retry;

	/* flush the held-back extent, flagged as last */
	if (!ret && have_extent)
		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
				       FIEMAP_EXTENT_LAST);

	bch2_trans_exit(&trans);
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	/* a positive return from fiemap_fill_next_extent means "buffer full" */
	return ret < 0 ? ret : 0;
}
1018
/* mmap'd region callbacks: bcachefs fault paths plus generic map_pages */
static const struct vm_operations_struct bch_vm_ops = {
	.fault		= bch2_page_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= bch2_page_mkwrite,
};
1024
1025 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1026 {
1027         file_accessed(file);
1028
1029         vma->vm_ops = &bch_vm_ops;
1030         return 0;
1031 }
1032
1033 /* Directories: */
1034
/*
 * ->llseek for directories: directory offsets are dirent hash values,
 * not byte positions, so allow the full s64 range.
 */
static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	return generic_file_llseek_size(file, offset, whence,
					S64_MAX, S64_MAX);
}
1040
1041 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1042 {
1043         struct bch_inode_info *inode = file_bch_inode(file);
1044         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1045
1046         if (!dir_emit_dots(file, ctx))
1047                 return 0;
1048
1049         return bch2_readdir(c, inode_inum(inode), ctx);
1050 }
1051
/* file_operations for regular files */
static const struct file_operations bch_file_operations = {
	.llseek		= bch2_llseek,
	.read_iter	= bch2_read_iter,
	.write_iter	= bch2_write_iter,
	.mmap		= bch2_mmap,
	.open		= generic_file_open,
	.fsync		= bch2_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= bch2_fallocate_dispatch,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
	.remap_file_range = bch2_remap_file_range,
};
1068
/* inode_operations for regular files */
static const struct inode_operations bch_file_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.fiemap		= bch2_fiemap,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1079
/* inode_operations for directories */
static const struct inode_operations bch_dir_inode_operations = {
	.lookup		= bch2_lookup,
	.create		= bch2_create,
	.link		= bch2_link,
	.unlink		= bch2_unlink,
	.symlink	= bch2_symlink,
	.mkdir		= bch2_mkdir,
	/* rmdir shares the unlink implementation */
	.rmdir		= bch2_unlink,
	.mknod		= bch2_mknod,
	.rename		= bch2_rename2,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.tmpfile	= bch2_tmpfile,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1099
/* file_operations for directories */
static const struct file_operations bch_dir_file_operations = {
	.llseek		= bch2_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= bch2_vfs_readdir,
	.fsync		= bch2_fsync,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
};
1110
/* inode_operations for symlinks (target read via the page cache) */
static const struct inode_operations bch_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1121
/* inode_operations for device nodes, fifos and sockets */
static const struct inode_operations bch_special_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1131
/*
 * Page cache callbacks.  direct_IO is noop_direct_IO because bcachefs
 * handles O_DIRECT itself in its read_iter/write_iter paths.
 */
static const struct address_space_operations bch_address_space_operations = {
	.writepage	= bch2_writepage,
	.readpage	= bch2_readpage,
	.writepages	= bch2_writepages,
	.readahead	= bch2_readahead,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.write_begin	= bch2_write_begin,
	.write_end	= bch2_write_end,
	.invalidatepage	= bch2_invalidatepage,
	.releasepage	= bch2_releasepage,
	.direct_IO	= noop_direct_IO,
#ifdef CONFIG_MIGRATION
	.migratepage	= bch2_migrate_page,
#endif
	.error_remove_page = generic_error_remove_page,
};
1148
/*
 * NFS export support: compiled out (see bch_export_ops below).
 * NOTE(review): these predate subvolumes - bch2_vfs_inode_get() now
 * takes a subvol_inum, so this code would need updating before being
 * re-enabled.
 */
#if 0
static struct inode *bch2_nfs_get_inode(struct super_block *sb,
		u64 ino, u32 generation)
{
	struct bch_fs *c = sb->s_fs_info;
	struct inode *vinode;

	if (ino < BCACHEFS_ROOT_INO)
		return ERR_PTR(-ESTALE);

	vinode = bch2_vfs_inode_get(c, ino);
	if (IS_ERR(vinode))
		return ERR_CAST(vinode);
	if (generation && vinode->i_generation != generation) {
		/* we didn't find the right inode.. */
		iput(vinode);
		return ERR_PTR(-ESTALE);
	}
	return vinode;
}

static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
		int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    bch2_nfs_get_inode);
}

static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
		int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
				    bch2_nfs_get_inode);
}
#endif
1184
/* Export (NFS) operations: currently all disabled */
static const struct export_operations bch_export_ops = {
	//.fh_to_dentry	= bch2_fh_to_dentry,
	//.fh_to_parent	= bch2_fh_to_parent,
	//.get_parent	= bch2_get_parent,
};
1190
/*
 * Initialize a freshly allocated VFS inode from an unpacked bcachefs
 * inode: copy fields, then wire up the ops tables by file type.
 */
static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi)
{
	/* ~0 = update all field groups */
	bch2_inode_update_after_write(c, inode, bi, ~0);

	inode->v.i_blocks	= bi->bi_sectors;
	inode->v.i_ino		= bi->bi_inum;
	inode->v.i_rdev		= bi->bi_dev;
	inode->v.i_generation	= bi->bi_generation;
	inode->v.i_size		= bi->bi_size;

	inode->ei_flags		= 0;
	inode->ei_journal_seq	= bi->bi_journal_seq;
	inode->ei_quota_reserved = 0;
	inode->ei_qid		= bch_qid(bi);
	inode->ei_subvol	= inum.subvol;

	inode->v.i_mapping->a_ops = &bch_address_space_operations;

	switch (inode->v.i_mode & S_IFMT) {
	case S_IFREG:
		inode->v.i_op	= &bch_file_inode_operations;
		inode->v.i_fop	= &bch_file_operations;
		break;
	case S_IFDIR:
		inode->v.i_op	= &bch_dir_inode_operations;
		inode->v.i_fop	= &bch_dir_file_operations;
		break;
	case S_IFLNK:
		/* symlink targets are read through the page cache */
		inode_nohighmem(&inode->v);
		inode->v.i_op	= &bch_symlink_inode_operations;
		break;
	default:
		/* device nodes, fifos, sockets */
		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
		inode->v.i_op	= &bch_special_inode_operations;
		break;
	}
}
1230
1231 static struct inode *bch2_alloc_inode(struct super_block *sb)
1232 {
1233         struct bch_inode_info *inode;
1234
1235         inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1236         if (!inode)
1237                 return NULL;
1238
1239         inode_init_once(&inode->v);
1240         mutex_init(&inode->ei_update_lock);
1241         pagecache_lock_init(&inode->ei_pagecache_lock);
1242         mutex_init(&inode->ei_quota_lock);
1243         inode->ei_journal_seq = 0;
1244
1245         return &inode->v;
1246 }
1247
1248 static void bch2_i_callback(struct rcu_head *head)
1249 {
1250         struct inode *vinode = container_of(head, struct inode, i_rcu);
1251         struct bch_inode_info *inode = to_bch_ei(vinode);
1252
1253         kmem_cache_free(bch2_inode_cache, inode);
1254 }
1255
/* ->destroy_inode: defer freeing until after an RCU grace period */
static void bch2_destroy_inode(struct inode *vinode)
{
	call_rcu(&vinode->i_rcu, bch2_i_callback);
}
1260
/*
 * bch2_write_inode() callback: copy the VFS inode's timestamps into the
 * on-disk (unpacked) inode.  @p is unused.  Always returns 0.
 */
static int inode_update_times_fn(struct bch_inode_info *inode,
				 struct bch_inode_unpacked *bi,
				 void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_atime	= timespec_to_bch2_time(c, inode->v.i_atime);
	bi->bi_mtime	= timespec_to_bch2_time(c, inode->v.i_mtime);
	bi->bi_ctime	= timespec_to_bch2_time(c, inode->v.i_ctime);

	return 0;
}
1273
1274 static int bch2_vfs_write_inode(struct inode *vinode,
1275                                 struct writeback_control *wbc)
1276 {
1277         struct bch_fs *c = vinode->i_sb->s_fs_info;
1278         struct bch_inode_info *inode = to_bch_ei(vinode);
1279         int ret;
1280
1281         mutex_lock(&inode->ei_update_lock);
1282         ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1283                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1284         mutex_unlock(&inode->ei_update_lock);
1285
1286         return ret;
1287 }
1288
/*
 * ->evict_inode: tear down the page cache, and if this was the last
 * link, release quota and delete the on-disk inode.
 */
static void bch2_evict_inode(struct inode *vinode)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);

	truncate_inode_pages_final(&inode->v.i_data);

	clear_inode(&inode->v);

	/* all quota reservations must have been released by now */
	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
		/* unaccount the space and the inode itself, then delete */
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
				KEY_TYPE_QUOTA_WARN);
		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
		bch2_inode_rm(c, inode_inum(inode), true);
	}
}
1308
/*
 * Evict all cached inodes belonging to the subvolumes in @s (used when
 * deleting subvolumes).
 *
 * First pass: mark matching inodes dontcache and prune their dentries
 * so they get dropped as soon as their refcounts allow.  Second pass:
 * for any stragglers, wait on the inode's I_NEW bit waitqueue and
 * restart the scan (`again`) until no matching inodes remain.
 */
void bch2_evict_subvolume_inodes(struct bch_fs *c,
				 struct snapshot_id_list *s)
{
	struct super_block *sb = c->vfs_sb;
	struct inode *inode;

	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
		    (inode->i_state & I_FREEING))
			continue;

		d_mark_dontcache(inode);
		d_prune_aliases(inode);
	}
	spin_unlock(&sb->s_inode_list_lock);
again:
	cond_resched();
	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
		    (inode->i_state & I_FREEING))
			continue;

		/* may have been cleared since the first pass */
		if (!(inode->i_state & I_DONTCACHE)) {
			d_mark_dontcache(inode);
			d_prune_aliases(inode);
		}

		/* re-check under i_lock before deciding to wait */
		spin_lock(&inode->i_lock);
		if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
		    !(inode->i_state & I_FREEING)) {
			wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
			DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
			spin_unlock(&inode->i_lock);
			spin_unlock(&sb->s_inode_list_lock);
			/* woken when the inode is freed; rescan from scratch */
			schedule();
			finish_wait(wq, &wait.wq_entry);
			goto again;
		}

		spin_unlock(&inode->i_lock);
	}
	spin_unlock(&sb->s_inode_list_lock);
}
1355
/* ->statfs: fill in filesystem-wide usage statistics */
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct bch_fs *c = sb->s_fs_info;
	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
	/* usage figures are in 512-byte sectors; convert to fs blocks */
	unsigned shift = sb->s_blocksize_bits - 9;
	/*
	 * this assumes inodes take up 64 bytes, which is a decent average
	 * number:
	 */
	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
	u64 fsid;

	buf->f_type	= BCACHEFS_STATFS_MAGIC;
	buf->f_bsize	= sb->s_blocksize;
	buf->f_blocks	= usage.capacity >> shift;
	buf->f_bfree	= usage.free >> shift;
	/* leave headroom: don't report all free space as available */
	buf->f_bavail	= avail_factor(usage.free) >> shift;

	buf->f_files	= usage.nr_inodes + avail_inodes;
	buf->f_ffree	= avail_inodes;

	/* fold the 128-bit user UUID down to a 64-bit fsid */
	fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
	       le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
	buf->f_namelen	= BCH_NAME_MAX;

	return 0;
}
1386
1387 static int bch2_sync_fs(struct super_block *sb, int wait)
1388 {
1389         struct bch_fs *c = sb->s_fs_info;
1390
1391         if (c->opts.journal_flush_disabled)
1392                 return 0;
1393
1394         if (!wait) {
1395                 bch2_journal_flush_async(&c->journal, NULL);
1396                 return 0;
1397         }
1398
1399         return bch2_journal_flush(&c->journal);
1400 }
1401
1402 static struct bch_fs *bch2_path_to_fs(const char *path)
1403 {
1404         struct bch_fs *c;
1405         dev_t dev;
1406         int ret;
1407
1408         ret = lookup_bdev(path, &dev);
1409         if (ret)
1410                 return ERR_PTR(ret);
1411
1412         c = bch2_dev_to_fs(dev);
1413         if (c)
1414                 closure_put(&c->cl);
1415         return c ?: ERR_PTR(-ENOENT);
1416 }
1417
/*
 * Split a colon-separated device list ("/dev/a:/dev/b") into a
 * NULL-terminated array of strings; *nr is set to the device count.
 *
 * Ownership: devs[0] points into the kstrdup'd copy of @_dev_name, so
 * the caller must kfree(devs[0]) and kfree(devs) - see bch2_mount().
 * Returns NULL on allocation failure.
 */
static char **split_devs(const char *_dev_name, unsigned *nr)
{
	char *dev_name = NULL, **devs = NULL, *s;
	size_t i, nr_devs = 0;

	dev_name = kstrdup(_dev_name, GFP_KERNEL);
	if (!dev_name)
		return NULL;

	/* one device per colon-separated segment */
	for (s = dev_name; s; s = strchr(s + 1, ':'))
		nr_devs++;

	/* +1 for the NULL terminator */
	devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
	if (!devs) {
		kfree(dev_name);
		return NULL;
	}

	/*
	 * Record each segment start, then replace the following ':' with
	 * a NUL and advance past it - the assignment in the loop update
	 * both terminates the previous segment and steps s forward.
	 */
	for (i = 0, s = dev_name;
	     s;
	     (s = strchr(s, ':')) && (*s++ = '\0'))
		devs[i++] = s;

	*nr = nr_devs;
	return devs;
}
1444
/*
 * ->remount_fs: re-parse mount options and apply a read-only <->
 * read-write transition under the fs state_lock.
 */
static int bch2_remount(struct super_block *sb, int *flags, char *data)
{
	struct bch_fs *c = sb->s_fs_info;
	struct bch_opts opts = bch2_opts_empty();
	int ret;

	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);

	ret = bch2_parse_mount_opts(c, &opts, data);
	if (ret)
		return ret;

	if (opts.read_only != c->opts.read_only) {
		down_write(&c->state_lock);

		if (opts.read_only) {
			bch2_fs_read_only(c);

			sb->s_flags |= SB_RDONLY;
		} else {
			ret = bch2_fs_read_write(c);
			if (ret) {
				bch_err(c, "error going rw: %i", ret);
				up_write(&c->state_lock);
				/*
				 * NOTE(review): the real error from
				 * bch2_fs_read_write() is discarded here;
				 * consider returning ret instead of -EINVAL
				 */
				return -EINVAL;
			}

			sb->s_flags &= ~SB_RDONLY;
		}

		c->opts.read_only = opts.read_only;

		up_write(&c->state_lock);
	}

	/* opts.errors < 0 means "not specified on this remount" */
	if (opts.errors >= 0)
		c->opts.errors = opts.errors;

	return ret;
}
1485
1486 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
1487 {
1488         struct bch_fs *c = root->d_sb->s_fs_info;
1489         struct bch_dev *ca;
1490         unsigned i;
1491         bool first = true;
1492
1493         for_each_online_member(ca, c, i) {
1494                 if (!first)
1495                         seq_putc(seq, ':');
1496                 first = false;
1497                 seq_puts(seq, "/dev/");
1498                 seq_puts(seq, ca->name);
1499         }
1500
1501         return 0;
1502 }
1503
/*
 * ->show_options: emit mount options that differ from their defaults,
 * each prefixed with ','.
 */
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	enum bch_opt_id i;
	/* scratch buffer for one rendered option */
	char buf[512];

	for (i = 0; i < bch2_opts_nr; i++) {
		const struct bch_option *opt = &bch2_opt_table[i];
		u64 v = bch2_opt_get_by_id(&c->opts, i);

		/* only options settable at mount time */
		if (!(opt->mode & OPT_MOUNT))
			continue;

		/* skip options still at their default value */
		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
			continue;

		bch2_opt_to_text(&PBUF(buf), c, opt, v,
				 OPT_SHOW_MOUNT_STYLE);
		seq_putc(seq, ',');
		seq_puts(seq, buf);
	}

	return 0;
}
1528
/* ->put_super: stop the filesystem (final free happens in ->kill_sb) */
static void bch2_put_super(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	__bch2_fs_stop(c);
}
1535
/* super_operations: freeze/unfreeze not yet wired up */
static const struct super_operations bch_super_operations = {
	.alloc_inode	= bch2_alloc_inode,
	.destroy_inode	= bch2_destroy_inode,
	.write_inode	= bch2_vfs_write_inode,
	.evict_inode	= bch2_evict_inode,
	.sync_fs	= bch2_sync_fs,
	.statfs		= bch2_statfs,
	.show_devname	= bch2_show_devname,
	.show_options	= bch2_show_options,
	.remount_fs	= bch2_remount,
	.put_super	= bch2_put_super,
#if 0
	.freeze_fs	= bch2_freeze,
	.unfreeze_fs	= bch2_unfreeze,
#endif
};
1552
/* sget() set callback: attach the bch_fs to a new superblock */
static int bch2_set_super(struct super_block *s, void *data)
{
	s->s_fs_info = data;
	return 0;
}
1558
/* sget() set callback that refuses to create a new superblock */
static int bch2_noset_super(struct super_block *s, void *data)
{
	return -EBUSY;
}
1563
/*
 * sget() test callback: @data is a NULL-terminated array of bch_fs
 * pointers (one per device path, possibly ERR_PTRs for unknown paths).
 * Match only if this superblock's bch_fs equals every entry - an
 * ERR_PTR entry can never match, so any unresolved path rejects the sb.
 */
static int bch2_test_super(struct super_block *s, void *data)
{
	struct bch_fs *c = s->s_fs_info;
	struct bch_fs **devs = data;
	unsigned i;

	if (!c)
		return false;

	for (i = 0; devs[i]; i++)
		if (c != devs[i])
			return false;
	return true;
}
1578
/*
 * Mount entry point.
 *
 * Phases:
 *  1. Parse options and split the colon-separated device list.
 *  2. Look for an existing superblock whose bch_fs matches every
 *     device path (sget with bch2_noset_super never creates one).
 *  3. Otherwise open the filesystem, re-parse options that need a
 *     running fs, and create a fresh superblock.
 *  4. If the superblock is new (!sb->s_root), finish VFS setup and
 *     load the root inode.
 *
 * All paths converge at got_sb, which frees the device-list scratch
 * allocations before inspecting sb.
 */
static struct dentry *bch2_mount(struct file_system_type *fs_type,
				 int flags, const char *dev_name, void *data)
{
	struct bch_fs *c;
	struct bch_dev *ca;
	struct super_block *sb;
	struct inode *vinode;
	struct bch_opts opts = bch2_opts_empty();
	char **devs;
	struct bch_fs **devs_to_fs = NULL;
	unsigned i, nr_devs;
	int ret;

	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);

	ret = bch2_parse_mount_opts(NULL, &opts, data);
	if (ret)
		return ERR_PTR(ret);

	if (!dev_name || strlen(dev_name) == 0)
		return ERR_PTR(-EINVAL);

	devs = split_devs(dev_name, &nr_devs);
	if (!devs)
		return ERR_PTR(-ENOMEM);

	devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
	if (!devs_to_fs) {
		sb = ERR_PTR(-ENOMEM);
		goto got_sb;
	}

	/* map each device path to its (possibly open) bch_fs */
	for (i = 0; i < nr_devs; i++)
		devs_to_fs[i] = bch2_path_to_fs(devs[i]);

	/* first try: find an existing superblock, never create one */
	sb = sget(fs_type, bch2_test_super, bch2_noset_super,
		  flags|SB_NOSEC, devs_to_fs);
	if (!IS_ERR(sb))
		goto got_sb;

	c = bch2_fs_open(devs, nr_devs, opts);
	if (IS_ERR(c)) {
		sb = ERR_CAST(c);
		goto got_sb;
	}

	/* Some options can't be parsed until after the fs is started: */
	ret = bch2_parse_mount_opts(c, &opts, data);
	if (ret) {
		bch2_fs_stop(c);
		sb = ERR_PTR(ret);
		goto got_sb;
	}

	bch2_opts_apply(&c->opts, opts);

	/* second sget: unconditionally create a superblock for c */
	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
	if (IS_ERR(sb))
		bch2_fs_stop(c);
got_sb:
	kfree(devs_to_fs);
	/* devs[0] owns the kstrdup'd string backing every entry */
	kfree(devs[0]);
	kfree(devs);

	if (IS_ERR(sb))
		return ERR_CAST(sb);

	c = sb->s_fs_info;

	if (sb->s_root) {
		/* existing mount: rw/ro mismatch can't be reconciled here */
		if ((flags ^ sb->s_flags) & SB_RDONLY) {
			ret = -EBUSY;
			goto err_put_super;
		}
		goto out;
	}

	sb->s_blocksize		= block_bytes(c);
	sb->s_blocksize_bits	= ilog2(block_bytes(c));
	sb->s_maxbytes		= MAX_LFS_FILESIZE;
	sb->s_op		= &bch_super_operations;
	sb->s_export_op		= &bch_export_ops;
#ifdef CONFIG_BCACHEFS_QUOTA
	sb->s_qcop		= &bch2_quotactl_operations;
	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
#endif
	sb->s_xattr		= bch2_xattr_handlers;
	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
	sb->s_time_gran		= c->sb.nsec_per_time_unit;
	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
	c->vfs_sb		= sb;
	strlcpy(sb->s_id, c->name, sizeof(sb->s_id));

	ret = super_setup_bdi(sb);
	if (ret)
		goto err_put_super;

	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;

	for_each_online_member(ca, c, i) {
		struct block_device *bdev = ca->disk_sb.bdev;

		/* XXX: create an anonymous device for multi device filesystems */
		sb->s_bdev	= bdev;
		sb->s_dev	= bdev->bd_dev;
		percpu_ref_put(&ca->io_ref);
		break;
	}

	c->dev = sb->s_dev;

#ifdef CONFIG_BCACHEFS_POSIX_ACL
	if (c->opts.acl)
		sb->s_flags	|= SB_POSIXACL;
#endif

	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
	if (IS_ERR(vinode)) {
		bch_err(c, "error mounting: error getting root inode %i",
			(int) PTR_ERR(vinode));
		ret = PTR_ERR(vinode);
		goto err_put_super;
	}

	sb->s_root = d_make_root(vinode);
	if (!sb->s_root) {
		bch_err(c, "error mounting: error allocating root dentry");
		ret = -ENOMEM;
		goto err_put_super;
	}

	sb->s_flags |= SB_ACTIVE;
out:
	return dget(sb->s_root);

err_put_super:
	deactivate_locked_super(sb);
	return ERR_PTR(ret);
}
1719
/* ->kill_sb: shut down the VFS superblock, then free the bch_fs */
static void bch2_kill_sb(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	generic_shutdown_super(sb);
	bch2_fs_free(c);
}
1727
/* Filesystem type registration for mount -t bcachefs */
static struct file_system_type bcache_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "bcachefs",
	.mount		= bch2_mount,
	.kill_sb	= bch2_kill_sb,
	.fs_flags	= FS_REQUIRES_DEV,
};
1735
1736 MODULE_ALIAS_FS("bcachefs");
1737
1738 void bch2_vfs_exit(void)
1739 {
1740         unregister_filesystem(&bcache_fs_type);
1741         if (bch2_inode_cache)
1742                 kmem_cache_destroy(bch2_inode_cache);
1743 }
1744
/*
 * Module/VFS init: create the inode slab cache and register the
 * filesystem type.  On failure, bch2_vfs_exit() undoes whatever
 * succeeded (it tolerates partially-completed init).
 */
int __init bch2_vfs_init(void)
{
	int ret = -ENOMEM;

	bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
	if (!bch2_inode_cache)
		goto err;

	ret = register_filesystem(&bcache_fs_type);
	if (ret)
		goto err;

	return 0;
err:
	bch2_vfs_exit();
	return ret;
}
1762
1763 #endif /* NO_BCACHEFS_FS */