// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "acl.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "chardev.h"
#include "dirent.h"
#include "extents.h"
#include "fs.h"
#include "fs-common.h"
#include "fs-io.h"
#include "fs-ioctl.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
#include "journal.h"
#include "keylist.h"
#include "quota.h"
#include "super.h"
#include "xattr.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>

static struct kmem_cache *bch2_inode_cache;

static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum,
                                struct bch_inode_info *,
                                struct bch_inode_unpacked *);

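/*
 * pagecache_lock: a two-mode shared lock on the page cache. The counter in
 * @lock->v is positive while "add" holders are active and negative while
 * "block" holders are active; the two modes exclude each other, but any
 * number of holders of the same mode may nest. Contended acquirers sleep on
 * @lock->wait until the opposing side drains back to zero.
 */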
static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
{
        BUG_ON(atomic_long_read(&lock->v) == 0);

        if (atomic_long_sub_return_release(i, &lock->v) == 0)
                wake_up_all(&lock->wait);
}

static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
{
        long v = atomic_long_read(&lock->v), old;

        do {
                old = v;

                if (i > 0 ? v < 0 : v > 0)
                        return false;
        } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
                                        old, old + i)) != old);
        return true;
}

static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
{
        wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
}

void bch2_pagecache_add_put(struct pagecache_lock *lock)
{
        __pagecache_lock_put(lock, 1);
}

bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
{
        return __pagecache_lock_tryget(lock, 1);
}

void bch2_pagecache_add_get(struct pagecache_lock *lock)
{
        __pagecache_lock_get(lock, 1);
}

void bch2_pagecache_block_put(struct pagecache_lock *lock)
{
        __pagecache_lock_put(lock, -1);
}

void bch2_pagecache_block_get(struct pagecache_lock *lock)
{
        __pagecache_lock_get(lock, -1);
}

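/*
 * Copy fields from an unpacked btree inode into the VFS inode; @fields
 * selects which timestamps (ATTR_ATIME/ATTR_MTIME/ATTR_CTIME) to propagate.
 */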
void bch2_inode_update_after_write(struct bch_fs *c,
                                   struct bch_inode_info *inode,
                                   struct bch_inode_unpacked *bi,
                                   unsigned fields)
{
        set_nlink(&inode->v, bch2_inode_nlink_get(bi));
        i_uid_write(&inode->v, bi->bi_uid);
        i_gid_write(&inode->v, bi->bi_gid);
        inode->v.i_mode = bi->bi_mode;

        if (fields & ATTR_ATIME)
                inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
        if (fields & ATTR_MTIME)
                inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
        if (fields & ATTR_CTIME)
                inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);

        inode->ei_inode         = *bi;

        bch2_inode_flags_to_vfs(inode);
}

int __must_check bch2_write_inode(struct bch_fs *c,
                                  struct bch_inode_info *inode,
                                  inode_set_fn set,
                                  void *p, unsigned fields)
{
        struct btree_trans trans;
        struct btree_iter iter = { NULL };
        struct bch_inode_unpacked inode_u;
        int ret;

        bch2_trans_init(&trans, c, 0, 512);
retry:
        bch2_trans_begin(&trans);

        ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
                                BTREE_ITER_INTENT) ?:
                (set ? set(inode, &inode_u, p) : 0) ?:
                bch2_inode_write(&trans, &iter, &inode_u) ?:
                bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);

        /*
         * the btree node lock protects inode->ei_inode, not ei_update_lock;
         * this is important for inode updates via bchfs_write_index_update
         */
        if (!ret)
                bch2_inode_update_after_write(c, inode, &inode_u, fields);

        bch2_trans_iter_exit(&trans, &iter);

        if (ret == -EINTR)
                goto retry;

        bch2_trans_exit(&trans);
        return ret < 0 ? ret : 0;
}

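/*
 * Transfer this inode's quota usage (i_blocks plus reserved space) to
 * @new_qid, for the quota types in @qtypes that are enabled and actually
 * changing; ei_qid is updated to match on success.
 */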
int bch2_fs_quota_transfer(struct bch_fs *c,
                           struct bch_inode_info *inode,
                           struct bch_qid new_qid,
                           unsigned qtypes,
                           enum quota_acct_mode mode)
{
        unsigned i;
        int ret;

        qtypes &= enabled_qtypes(c);

        for (i = 0; i < QTYP_NR; i++)
                if (new_qid.q[i] == inode->ei_qid.q[i])
                        qtypes &= ~(1U << i);

        if (!qtypes)
                return 0;

        mutex_lock(&inode->ei_quota_lock);

        ret = bch2_quota_transfer(c, qtypes, new_qid,
                                  inode->ei_qid,
                                  inode->v.i_blocks +
                                  inode->ei_quota_reserved,
                                  mode);
        if (!ret)
                for (i = 0; i < QTYP_NR; i++)
                        if (qtypes & (1 << i))
                                inode->ei_qid.q[i] = new_qid.q[i];

        mutex_unlock(&inode->ei_quota_lock);

        return ret;
}

static int bch2_iget5_test(struct inode *vinode, void *p)
{
        struct bch_inode_info *inode = to_bch_ei(vinode);
        subvol_inum *inum = p;

        return inode->ei_subvol == inum->subvol &&
                inode->ei_inode.bi_inum == inum->inum;
}

static int bch2_iget5_set(struct inode *vinode, void *p)
{
        struct bch_inode_info *inode = to_bch_ei(vinode);
        subvol_inum *inum = p;

        inode->v.i_ino          = inum->inum;
        inode->ei_subvol        = inum->subvol;
        inode->ei_inode.bi_inum = inum->inum;
        return 0;
}

static unsigned bch2_inode_hash(subvol_inum inum)
{
        return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
}

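/*
 * Look up a VFS inode by (subvolume, inode number), reading it from the
 * inodes btree and initializing it if it wasn't already cached.
 */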
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
        struct bch_inode_unpacked inode_u;
        struct bch_inode_info *inode;
        int ret;

        inode = to_bch_ei(iget5_locked(c->vfs_sb,
                                       bch2_inode_hash(inum),
                                       bch2_iget5_test,
                                       bch2_iget5_set,
                                       &inum));
        if (unlikely(!inode))
                return ERR_PTR(-ENOMEM);
        if (!(inode->v.i_state & I_NEW))
                return &inode->v;

        ret = bch2_inode_find_by_inum(c, inum, &inode_u);
        if (ret) {
                iget_failed(&inode->v);
                return ERR_PTR(ret);
        }

        bch2_vfs_inode_init(c, inum, inode, &inode_u);

        unlock_new_inode(&inode->v);

        return &inode->v;
}

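/*
 * Common create path for mknod/create/mkdir/symlink/tmpfile: preallocates
 * the VFS inode and ACLs, runs the create transaction plus quota accounting,
 * then inserts the new inode into the inode cache before dropping btree
 * locks.
 */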
struct bch_inode_info *
__bch2_create(struct user_namespace *mnt_userns,
              struct bch_inode_info *dir, struct dentry *dentry,
              umode_t mode, dev_t rdev, subvol_inum snapshot_src,
              unsigned flags)
{
        struct bch_fs *c = dir->v.i_sb->s_fs_info;
        struct btree_trans trans;
        struct bch_inode_unpacked dir_u;
        struct bch_inode_info *inode, *old;
        struct bch_inode_unpacked inode_u;
        struct posix_acl *default_acl = NULL, *acl = NULL;
        subvol_inum inum;
        u64 journal_seq = 0;
        int ret;

        /*
         * preallocate acls + vfs inode before btree transaction, so that
         * nothing can fail after the transaction succeeds:
         */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
        if (ret)
                return ERR_PTR(ret);
#endif
        inode = to_bch_ei(new_inode(c->vfs_sb));
        if (unlikely(!inode)) {
                inode = ERR_PTR(-ENOMEM);
                goto err;
        }

        bch2_inode_init_early(c, &inode_u);

        if (!(flags & BCH_CREATE_TMPFILE))
                mutex_lock(&dir->ei_update_lock);

        bch2_trans_init(&trans, c, 8,
                        2048 + (!(flags & BCH_CREATE_TMPFILE)
                                ? dentry->d_name.len : 0));
retry:
        bch2_trans_begin(&trans);

        ret   = bch2_create_trans(&trans,
                                  inode_inum(dir), &dir_u, &inode_u,
                                  !(flags & BCH_CREATE_TMPFILE)
                                  ? &dentry->d_name : NULL,
                                  from_kuid(mnt_userns, current_fsuid()),
                                  from_kgid(mnt_userns, current_fsgid()),
                                  mode, rdev,
                                  default_acl, acl, snapshot_src, flags) ?:
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
                                KEY_TYPE_QUOTA_PREALLOC);
        if (unlikely(ret))
                goto err_before_quota;

        ret   = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
        if (unlikely(ret)) {
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
err_before_quota:
                if (ret == -EINTR)
                        goto retry;
                goto err_trans;
        }

        if (!(flags & BCH_CREATE_TMPFILE)) {
                bch2_inode_update_after_write(c, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                mutex_unlock(&dir->ei_update_lock);
        }

        inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
        inum.inum = inode_u.bi_inum;

        bch2_vfs_inode_init(c, inum, inode, &inode_u);

        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
        set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

        /*
         * we must insert the new inode into the inode cache before calling
         * bch2_trans_exit() and dropping locks, else we could race with another
         * thread pulling the inode in and modifying it:
         */

        inode->v.i_state |= I_CREATING;

        old = to_bch_ei(inode_insert5(&inode->v,
                                      bch2_inode_hash(inum),
                                      bch2_iget5_test,
                                      bch2_iget5_set,
                                      &inum));
        BUG_ON(!old);

        if (unlikely(old != inode)) {
                /*
                 * We raced, another process pulled the new inode into cache
                 * before us:
                 */
                make_bad_inode(&inode->v);
                iput(&inode->v);

                inode = old;
        } else {
                /*
                 * we really don't want insert_inode_locked2() to be setting
                 * I_NEW...
                 */
                unlock_new_inode(&inode->v);
        }

        bch2_trans_exit(&trans);
err:
        posix_acl_release(default_acl);
        posix_acl_release(acl);
        return inode;
err_trans:
        if (!(flags & BCH_CREATE_TMPFILE))
                mutex_unlock(&dir->ei_update_lock);

        bch2_trans_exit(&trans);
        make_bad_inode(&inode->v);
        iput(&inode->v);
        inode = ERR_PTR(ret);
        goto err;
}

/* methods */

static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
                                  unsigned int flags)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
        struct inode *vinode = NULL;
        subvol_inum inum = { .subvol = 1 };
        int ret;

        ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
                                 &dentry->d_name, &inum);

        if (!ret)
                vinode = bch2_vfs_inode_get(c, inum);

        return d_splice_alias(vinode, dentry);
}

static int bch2_mknod(struct user_namespace *mnt_userns,
                      struct inode *vdir, struct dentry *dentry,
                      umode_t mode, dev_t rdev)
{
        struct bch_inode_info *inode =
                __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
                              (subvol_inum) { 0 }, 0);

        if (IS_ERR(inode))
                return PTR_ERR(inode);

        d_instantiate(dentry, &inode->v);
        return 0;
}

static int bch2_create(struct user_namespace *mnt_userns,
                       struct inode *vdir, struct dentry *dentry,
                       umode_t mode, bool excl)
{
        return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0);
}

static int __bch2_link(struct bch_fs *c,
                       struct bch_inode_info *inode,
                       struct bch_inode_info *dir,
                       struct dentry *dentry)
{
        struct btree_trans trans;
        struct bch_inode_unpacked dir_u, inode_u;
        int ret;

        mutex_lock(&inode->ei_update_lock);
        bch2_trans_init(&trans, c, 4, 1024);

        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_link_trans(&trans,
                                        inode_inum(dir),   &dir_u,
                                        inode_inum(inode), &inode_u,
                                        &dentry->d_name));

        if (likely(!ret)) {
                BUG_ON(inode_u.bi_inum != inode->v.i_ino);

                bch2_inode_update_after_write(c, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
        }

        bch2_trans_exit(&trans);
        mutex_unlock(&inode->ei_update_lock);
        return ret;
}

static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
                     struct dentry *dentry)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
        int ret;

        lockdep_assert_held(&inode->v.i_rwsem);

        ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                return ret;

        ihold(&inode->v);
        d_instantiate(dentry, &inode->v);
        return 0;
}

int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
                  bool deleting_snapshot)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct bch_inode_unpacked dir_u, inode_u;
        struct btree_trans trans;
        int ret;

        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
        bch2_trans_init(&trans, c, 4, 1024);

        ret = __bch2_trans_do(&trans, NULL, NULL,
                              BTREE_INSERT_NOFAIL,
                        bch2_unlink_trans(&trans,
                                          inode_inum(dir), &dir_u,
                                          &inode_u, &dentry->d_name,
                                          deleting_snapshot));

        if (likely(!ret)) {
                BUG_ON(inode_u.bi_inum != inode->v.i_ino);

                bch2_inode_update_after_write(c, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                bch2_inode_update_after_write(c, inode, &inode_u,
                                              ATTR_MTIME);
        }

        bch2_trans_exit(&trans);
        bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);

        return ret;
}

static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
{
        return __bch2_unlink(vdir, dentry, false);
}

static int bch2_symlink(struct user_namespace *mnt_userns,
                        struct inode *vdir, struct dentry *dentry,
                        const char *symname)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
        int ret;

        inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
        if (unlikely(IS_ERR(inode)))
                return PTR_ERR(inode);

        inode_lock(&inode->v);
        ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
        inode_unlock(&inode->v);

        if (unlikely(ret))
                goto err;

        ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
        if (unlikely(ret))
                goto err;

        ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                goto err;

        d_instantiate(dentry, &inode->v);
        return 0;
err:
        iput(&inode->v);
        return ret;
}

static int bch2_mkdir(struct user_namespace *mnt_userns,
                      struct inode *vdir, struct dentry *dentry, umode_t mode)
{
        return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0);
}

static int bch2_rename2(struct user_namespace *mnt_userns,
                        struct inode *src_vdir, struct dentry *src_dentry,
                        struct inode *dst_vdir, struct dentry *dst_dentry,
                        unsigned flags)
{
        struct bch_fs *c = src_vdir->i_sb->s_fs_info;
        struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
        struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
        struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
        struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
        struct bch_inode_unpacked dst_dir_u, src_dir_u;
        struct bch_inode_unpacked src_inode_u, dst_inode_u;
        struct btree_trans trans;
        enum bch_rename_mode mode = flags & RENAME_EXCHANGE
                ? BCH_RENAME_EXCHANGE
                : dst_dentry->d_inode
                ? BCH_RENAME_OVERWRITE : BCH_RENAME;
        int ret;

        if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
                return -EINVAL;

        if (mode == BCH_RENAME_OVERWRITE) {
                ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
                                                   0, LLONG_MAX);
                if (ret)
                        return ret;
        }

        bch2_trans_init(&trans, c, 8, 2048);

        bch2_lock_inodes(INODE_UPDATE_LOCK,
                         src_dir,
                         dst_dir,
                         src_inode,
                         dst_inode);

        if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
                ret = bch2_fs_quota_transfer(c, src_inode,
                                             dst_dir->ei_qid,
                                             1 << QTYP_PRJ,
                                             KEY_TYPE_QUOTA_PREALLOC);
                if (ret)
                        goto err;
        }

        if (mode == BCH_RENAME_EXCHANGE &&
            inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
                ret = bch2_fs_quota_transfer(c, dst_inode,
                                             src_dir->ei_qid,
                                             1 << QTYP_PRJ,
                                             KEY_TYPE_QUOTA_PREALLOC);
                if (ret)
                        goto err;
        }

        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_rename_trans(&trans,
                                          inode_inum(src_dir), &src_dir_u,
                                          inode_inum(dst_dir), &dst_dir_u,
                                          &src_inode_u,
                                          &dst_inode_u,
                                          &src_dentry->d_name,
                                          &dst_dentry->d_name,
                                          mode));
        if (unlikely(ret))
                goto err;

        BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
        BUG_ON(dst_inode &&
               dst_inode->v.i_ino != dst_inode_u.bi_inum);

        bch2_inode_update_after_write(c, src_dir, &src_dir_u,
                                      ATTR_MTIME|ATTR_CTIME);

        if (src_dir != dst_dir)
                bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
                                              ATTR_MTIME|ATTR_CTIME);

        bch2_inode_update_after_write(c, src_inode, &src_inode_u,
                                      ATTR_CTIME);

        if (dst_inode)
                bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
                                              ATTR_CTIME);
err:
        bch2_trans_exit(&trans);

        bch2_fs_quota_transfer(c, src_inode,
                               bch_qid(&src_inode->ei_inode),
                               1 << QTYP_PRJ,
                               KEY_TYPE_QUOTA_NOCHECK);
        if (dst_inode)
                bch2_fs_quota_transfer(c, dst_inode,
                                       bch_qid(&dst_inode->ei_inode),
                                       1 << QTYP_PRJ,
                                       KEY_TYPE_QUOTA_NOCHECK);

        bch2_unlock_inodes(INODE_UPDATE_LOCK,
                           src_dir,
                           dst_dir,
                           src_inode,
                           dst_inode);

        return ret;
}

static void bch2_setattr_copy(struct user_namespace *mnt_userns,
                              struct bch_inode_info *inode,
                              struct bch_inode_unpacked *bi,
                              struct iattr *attr)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        unsigned int ia_valid = attr->ia_valid;

        if (ia_valid & ATTR_UID)
                bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid);
        if (ia_valid & ATTR_GID)
                bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid);

        if (ia_valid & ATTR_SIZE)
                bi->bi_size = attr->ia_size;

        if (ia_valid & ATTR_ATIME)
                bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
        if (ia_valid & ATTR_MTIME)
                bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
        if (ia_valid & ATTR_CTIME)
                bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);

        if (ia_valid & ATTR_MODE) {
                umode_t mode = attr->ia_mode;
                kgid_t gid = ia_valid & ATTR_GID
                        ? attr->ia_gid
                        : inode->v.i_gid;

                if (!in_group_p(gid) &&
                    !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID))
                        mode &= ~S_ISGID;
                bi->bi_mode = mode;
        }
}

int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
                         struct bch_inode_info *inode,
                         struct iattr *attr)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_qid qid;
        struct btree_trans trans;
        struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
        struct posix_acl *acl = NULL;
        int ret;

        mutex_lock(&inode->ei_update_lock);

        qid = inode->ei_qid;

        if (attr->ia_valid & ATTR_UID)
                qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);

        if (attr->ia_valid & ATTR_GID)
                qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);

        ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
                                     KEY_TYPE_QUOTA_PREALLOC);
        if (ret)
                goto err;

        bch2_trans_init(&trans, c, 0, 0);
retry:
        bch2_trans_begin(&trans);
        kfree(acl);
        acl = NULL;

        ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
                              BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;

        bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);

        if (attr->ia_valid & ATTR_MODE) {
                ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
                                     inode_u.bi_mode, &acl);
                if (ret)
                        goto btree_err;
        }

        ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
                bch2_trans_commit(&trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL);
btree_err:
        bch2_trans_iter_exit(&trans, &inode_iter);

        if (ret == -EINTR)
                goto retry;
        if (unlikely(ret))
                goto err_trans;

        bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);

        if (acl)
                set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
        bch2_trans_exit(&trans);
err:
        mutex_unlock(&inode->ei_update_lock);

        return ret;
}

static int bch2_getattr(struct user_namespace *mnt_userns,
                        const struct path *path, struct kstat *stat,
                        u32 request_mask, unsigned query_flags)
{
        struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
        struct bch_fs *c = inode->v.i_sb->s_fs_info;

        stat->dev       = inode->v.i_sb->s_dev;
        stat->ino       = inode->v.i_ino;
        stat->mode      = inode->v.i_mode;
        stat->nlink     = inode->v.i_nlink;
        stat->uid       = inode->v.i_uid;
        stat->gid       = inode->v.i_gid;
        stat->rdev      = inode->v.i_rdev;
        stat->size      = i_size_read(&inode->v);
        stat->atime     = inode->v.i_atime;
        stat->mtime     = inode->v.i_mtime;
        stat->ctime     = inode->v.i_ctime;
        stat->blksize   = block_bytes(c);
        stat->blocks    = inode->v.i_blocks;

        if (request_mask & STATX_BTIME) {
                stat->result_mask |= STATX_BTIME;
                stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
        }

        if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;

        if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
                stat->attributes |= STATX_ATTR_APPEND;
        stat->attributes_mask    |= STATX_ATTR_APPEND;

        if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
                stat->attributes |= STATX_ATTR_NODUMP;
        stat->attributes_mask    |= STATX_ATTR_NODUMP;

        return 0;
}

static int bch2_setattr(struct user_namespace *mnt_userns,
                        struct dentry *dentry, struct iattr *iattr)
{
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        int ret;

        lockdep_assert_held(&inode->v.i_rwsem);

        ret = setattr_prepare(mnt_userns, dentry, iattr);
        if (ret)
                return ret;

        return iattr->ia_valid & ATTR_SIZE
                ? bch2_truncate(mnt_userns, inode, iattr)
                : bch2_setattr_nonsize(mnt_userns, inode, iattr);
}

static int bch2_tmpfile(struct user_namespace *mnt_userns,
                        struct inode *vdir, struct dentry *dentry, umode_t mode)
{
        struct bch_inode_info *inode =
                __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);

        if (IS_ERR(inode))
                return PTR_ERR(inode);

        d_mark_tmpfile(dentry, &inode->v);
        d_instantiate(dentry, &inode->v);
        return 0;
}

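/*
 * Translate a single extent bkey into fiemap extents: one entry per pointer
 * for direct data, or a single inline-data/delalloc entry otherwise.
 */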
static int bch2_fill_extent(struct bch_fs *c,
                            struct fiemap_extent_info *info,
                            struct bkey_s_c k, unsigned flags)
{
        if (bkey_extent_is_direct_data(k.k)) {
                struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
                int ret;

                if (k.k->type == KEY_TYPE_reflink_v)
                        flags |= FIEMAP_EXTENT_SHARED;

                bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                        int flags2 = 0;
                        u64 offset = p.ptr.offset;

                        if (p.crc.compression_type)
                                flags2 |= FIEMAP_EXTENT_ENCODED;
                        else
                                offset += p.crc.offset;

                        if ((offset & (c->opts.block_size - 1)) ||
                            (k.k->size & (c->opts.block_size - 1)))
                                flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;

                        ret = fiemap_fill_next_extent(info,
                                                bkey_start_offset(k.k) << 9,
                                                offset << 9,
                                                k.k->size << 9, flags|flags2);
                        if (ret)
                                return ret;
                }

                return 0;
        } else if (bkey_extent_is_inline_data(k.k)) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
                                               0, k.k->size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DATA_INLINE);
        } else if (k.k->type == KEY_TYPE_reservation) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
                                               0, k.k->size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DELALLOC|
                                               FIEMAP_EXTENT_UNWRITTEN);
        } else {
                BUG();
        }
}

static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
                       u64 start, u64 len)
{
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *ei = to_bch_ei(vinode);
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
        struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
        unsigned offset_into_extent, sectors;
        bool have_extent = false;
        u32 snapshot;
        int ret = 0;

        ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
        if (ret)
                return ret;

        if (start + len < start)
                return -EINVAL;

        start >>= 9;

        bch2_bkey_buf_init(&cur);
        bch2_bkey_buf_init(&prev);
        bch2_trans_init(&trans, c, 0, 0);
retry:
        bch2_trans_begin(&trans);

        ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
        if (ret)
                goto err;

        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
                             SPOS(ei->v.i_ino, start, snapshot), 0);

        while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k)) &&
               bkey_cmp(iter.pos, end) < 0) {
                enum btree_id data_btree = BTREE_ID_extents;

                if (!bkey_extent_is_data(k.k) &&
                    k.k->type != KEY_TYPE_reservation) {
                        bch2_btree_iter_advance(&iter);
                        continue;
                }

                offset_into_extent      = iter.pos.offset -
                        bkey_start_offset(k.k);
                sectors                 = k.k->size - offset_into_extent;

                bch2_bkey_buf_reassemble(&cur, c, k);

                ret = bch2_read_indirect_extent(&trans, &data_btree,
                                        &offset_into_extent, &cur);
                if (ret)
                        break;

                k = bkey_i_to_s_c(cur.k);
                bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

                sectors = min(sectors, k.k->size - offset_into_extent);

                bch2_cut_front(POS(k.k->p.inode,
                                   bkey_start_offset(k.k) +
                                   offset_into_extent),
                               cur.k);
                bch2_key_resize(&cur.k->k, sectors);
                cur.k->k.p = iter.pos;
                cur.k->k.p.offset += cur.k->k.size;

                if (have_extent) {
                        ret = bch2_fill_extent(c, info,
                                        bkey_i_to_s_c(prev.k), 0);
                        if (ret)
                                break;
                }

                bkey_copy(prev.k, cur.k);
                have_extent = true;

                bch2_btree_iter_set_pos(&iter,
                        POS(iter.pos.inode, iter.pos.offset + sectors));
        }
        start = iter.pos.offset;
        bch2_trans_iter_exit(&trans, &iter);
err:
        if (ret == -EINTR)
                goto retry;

        if (!ret && have_extent)
                ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
                                       FIEMAP_EXTENT_LAST);

        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&cur, c);
        bch2_bkey_buf_exit(&prev, c);
        return ret < 0 ? ret : 0;
}

static const struct vm_operations_struct bch_vm_ops = {
        .fault          = bch2_page_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = bch2_page_mkwrite,
};

static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
{
        file_accessed(file);

        vma->vm_ops = &bch_vm_ops;
        return 0;
}

/* Directories: */

static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
        return generic_file_llseek_size(file, offset, whence,
                                        S64_MAX, S64_MAX);
}

static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;

        if (!dir_emit_dots(file, ctx))
                return 0;

        return bch2_readdir(c, inode_inum(inode), ctx);
}

static const struct file_operations bch_file_operations = {
        .llseek         = bch2_llseek,
        .read_iter      = bch2_read_iter,
        .write_iter     = bch2_write_iter,
        .mmap           = bch2_mmap,
        .open           = generic_file_open,
        .fsync          = bch2_fsync,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = bch2_fallocate_dispatch,
        .unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = bch2_compat_fs_ioctl,
#endif
        .remap_file_range = bch2_remap_file_range,
};

static const struct inode_operations bch_file_inode_operations = {
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .fiemap         = bch2_fiemap,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_acl        = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct inode_operations bch_dir_inode_operations = {
        .lookup         = bch2_lookup,
        .create         = bch2_create,
        .link           = bch2_link,
        .unlink         = bch2_unlink,
        .symlink        = bch2_symlink,
        .mkdir          = bch2_mkdir,
        .rmdir          = bch2_unlink,
        .mknod          = bch2_mknod,
        .rename         = bch2_rename2,
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .tmpfile        = bch2_tmpfile,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_acl        = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct file_operations bch_dir_file_operations = {
        .llseek         = bch2_dir_llseek,
        .read           = generic_read_dir,
        .iterate_shared = bch2_vfs_readdir,
        .fsync          = bch2_fsync,
        .unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = bch2_compat_fs_ioctl,
#endif
};

static const struct inode_operations bch_symlink_inode_operations = {
        .get_link       = page_get_link,
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_acl        = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct inode_operations bch_special_inode_operations = {
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_acl        = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct address_space_operations bch_address_space_operations = {
        .writepage      = bch2_writepage,
        .readpage       = bch2_readpage,
        .writepages     = bch2_writepages,
        .readahead      = bch2_readahead,
        .set_page_dirty = __set_page_dirty_nobuffers,
        .write_begin    = bch2_write_begin,
        .write_end      = bch2_write_end,
        .invalidatepage = bch2_invalidatepage,
        .releasepage    = bch2_releasepage,
        .direct_IO      = noop_direct_IO,
#ifdef CONFIG_MIGRATION
        .migratepage    = bch2_migrate_page,
#endif
        .error_remove_page = generic_error_remove_page,
};

#if 0
static struct inode *bch2_nfs_get_inode(struct super_block *sb,
                u64 ino, u32 generation)
{
        struct bch_fs *c = sb->s_fs_info;
        struct inode *vinode;

        if (ino < BCACHEFS_ROOT_INO)
                return ERR_PTR(-ESTALE);

        vinode = bch2_vfs_inode_get(c, ino);
        if (IS_ERR(vinode))
                return ERR_CAST(vinode);
        if (generation && vinode->i_generation != generation) {
                /* we didn't find the right inode.. */
                iput(vinode);
                return ERR_PTR(-ESTALE);
        }
        return vinode;
}

static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type)
{
        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
                                    bch2_nfs_get_inode);
}

static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type)
{
        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                    bch2_nfs_get_inode);
}
#endif

static const struct export_operations bch_export_ops = {
        //.fh_to_dentry = bch2_fh_to_dentry,
        //.fh_to_parent = bch2_fh_to_parent,
        //.get_parent   = bch2_get_parent,
};

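/*
 * Initialize a freshly allocated VFS inode from its unpacked btree inode,
 * wiring up the inode/file/address_space operations for its file type.
 */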
static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum,
                                struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi)
{
        bch2_inode_update_after_write(c, inode, bi, ~0);

        inode->v.i_blocks       = bi->bi_sectors;
        inode->v.i_ino          = bi->bi_inum;
        inode->v.i_rdev         = bi->bi_dev;
        inode->v.i_generation   = bi->bi_generation;
        inode->v.i_size         = bi->bi_size;

        inode->ei_flags         = 0;
        inode->ei_quota_reserved = 0;
        inode->ei_qid           = bch_qid(bi);
        inode->ei_subvol        = inum.subvol;

        inode->v.i_mapping->a_ops = &bch_address_space_operations;

        switch (inode->v.i_mode & S_IFMT) {
        case S_IFREG:
                inode->v.i_op   = &bch_file_inode_operations;
                inode->v.i_fop  = &bch_file_operations;
                break;
        case S_IFDIR:
                inode->v.i_op   = &bch_dir_inode_operations;
                inode->v.i_fop  = &bch_dir_file_operations;
                break;
        case S_IFLNK:
                inode_nohighmem(&inode->v);
                inode->v.i_op   = &bch_symlink_inode_operations;
                break;
        default:
                init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
                inode->v.i_op   = &bch_special_inode_operations;
                break;
        }
}

static struct inode *bch2_alloc_inode(struct super_block *sb)
{
        struct bch_inode_info *inode;

        inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
        if (!inode)
                return NULL;

        inode_init_once(&inode->v);
        mutex_init(&inode->ei_update_lock);
        pagecache_lock_init(&inode->ei_pagecache_lock);
        mutex_init(&inode->ei_quota_lock);

        return &inode->v;
}

static void bch2_i_callback(struct rcu_head *head)
{
        struct inode *vinode = container_of(head, struct inode, i_rcu);
        struct bch_inode_info *inode = to_bch_ei(vinode);

        kmem_cache_free(bch2_inode_cache, inode);
}

static void bch2_destroy_inode(struct inode *vinode)
{
        call_rcu(&vinode->i_rcu, bch2_i_callback);
}

static int inode_update_times_fn(struct bch_inode_info *inode,
                                 struct bch_inode_unpacked *bi,
                                 void *p)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;

        bi->bi_atime    = timespec_to_bch2_time(c, inode->v.i_atime);
        bi->bi_mtime    = timespec_to_bch2_time(c, inode->v.i_mtime);
        bi->bi_ctime    = timespec_to_bch2_time(c, inode->v.i_ctime);

        return 0;
}

static int bch2_vfs_write_inode(struct inode *vinode,
                                struct writeback_control *wbc)
{
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *inode = to_bch_ei(vinode);
        int ret;

        mutex_lock(&inode->ei_update_lock);
        ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
                               ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
        mutex_unlock(&inode->ei_update_lock);

        return ret;
}

static void bch2_evict_inode(struct inode *vinode)
{
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *inode = to_bch_ei(vinode);

        truncate_inode_pages_final(&inode->v.i_data);

        clear_inode(&inode->v);

        BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

        if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
                bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
                                KEY_TYPE_QUOTA_WARN);
                bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
                bch2_inode_rm(c, inode_inum(inode), true);
        }
}

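/*
 * Evict every cached inode belonging to one of the given snapshot IDs: mark
 * them I_DONTCACHE and prune dentry aliases, then loop and sleep until the
 * remaining ones have been torn down before returning.
 */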
void bch2_evict_subvolume_inodes(struct bch_fs *c,
                                 struct snapshot_id_list *s)
{
        struct super_block *sb = c->vfs_sb;
        struct inode *inode;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
                    (inode->i_state & I_FREEING))
                        continue;

                d_mark_dontcache(inode);
                d_prune_aliases(inode);
        }
        spin_unlock(&sb->s_inode_list_lock);
again:
        cond_resched();
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
                    (inode->i_state & I_FREEING))
                        continue;

                if (!(inode->i_state & I_DONTCACHE)) {
                        d_mark_dontcache(inode);
                        d_prune_aliases(inode);
                }

                spin_lock(&inode->i_lock);
                if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
                    !(inode->i_state & I_FREEING)) {
                        wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
                        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
                        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
                        spin_unlock(&inode->i_lock);
                        spin_unlock(&sb->s_inode_list_lock);
                        schedule();
                        finish_wait(wq, &wait.wq_entry);
                        goto again;
                }

                spin_unlock(&inode->i_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);
}

static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        struct bch_fs *c = sb->s_fs_info;
        struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
        unsigned shift = sb->s_blocksize_bits - 9;
        /*
         * this assumes inodes take up 64 bytes, which is a decent average
         * number:
         */
        u64 avail_inodes = ((usage.capacity - usage.used) << 3);
        u64 fsid;

        buf->f_type     = BCACHEFS_STATFS_MAGIC;
        buf->f_bsize    = sb->s_blocksize;
        buf->f_blocks   = usage.capacity >> shift;
        buf->f_bfree    = usage.free >> shift;
        buf->f_bavail   = avail_factor(usage.free) >> shift;

        buf->f_files    = usage.nr_inodes + avail_inodes;
        buf->f_ffree    = avail_inodes;

        fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
               le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
        buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
        buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
        buf->f_namelen  = BCH_NAME_MAX;

        return 0;
}

static int bch2_sync_fs(struct super_block *sb, int wait)
{
        struct bch_fs *c = sb->s_fs_info;

        if (c->opts.journal_flush_disabled)
                return 0;

        if (!wait) {
                bch2_journal_flush_async(&c->journal, NULL);
                return 0;
        }

        return bch2_journal_flush(&c->journal);
}

static struct bch_fs *bch2_path_to_fs(const char *path)
{
        struct bch_fs *c;
        dev_t dev;
        int ret;

        ret = lookup_bdev(path, &dev);
        if (ret)
                return ERR_PTR(ret);

        c = bch2_dev_to_fs(dev);
        if (c)
                closure_put(&c->cl);
        return c ?: ERR_PTR(-ENOENT);
}

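/*
 * Split a colon-separated list of device paths ("/dev/sda:/dev/sdb") into a
 * NULL-terminated array of strings; the caller frees devs[0] and devs.
 */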
static char **split_devs(const char *_dev_name, unsigned *nr)
{
        char *dev_name = NULL, **devs = NULL, *s;
        size_t i, nr_devs = 0;

        dev_name = kstrdup(_dev_name, GFP_KERNEL);
        if (!dev_name)
                return NULL;

        for (s = dev_name; s; s = strchr(s + 1, ':'))
                nr_devs++;

        devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
        if (!devs) {
                kfree(dev_name);
                return NULL;
        }

        for (i = 0, s = dev_name;
             s;
             (s = strchr(s, ':')) && (*s++ = '\0'))
                devs[i++] = s;

        *nr = nr_devs;
        return devs;
}

static int bch2_remount(struct super_block *sb, int *flags, char *data)
{
        struct bch_fs *c = sb->s_fs_info;
        struct bch_opts opts = bch2_opts_empty();
        int ret;

        opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);

        ret = bch2_parse_mount_opts(c, &opts, data);
        if (ret)
                return ret;

        if (opts.read_only != c->opts.read_only) {
                down_write(&c->state_lock);

                if (opts.read_only) {
                        bch2_fs_read_only(c);

                        sb->s_flags |= SB_RDONLY;
                } else {
                        ret = bch2_fs_read_write(c);
                        if (ret) {
                                bch_err(c, "error going rw: %i", ret);
                                up_write(&c->state_lock);
                                return -EINVAL;
                        }

                        sb->s_flags &= ~SB_RDONLY;
                }

                c->opts.read_only = opts.read_only;

                up_write(&c->state_lock);
        }

        if (opts.errors >= 0)
                c->opts.errors = opts.errors;

        return ret;
}

static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
        struct bch_fs *c = root->d_sb->s_fs_info;
        struct bch_dev *ca;
        unsigned i;
        bool first = true;

        for_each_online_member(ca, c, i) {
                if (!first)
                        seq_putc(seq, ':');
                first = false;
                seq_puts(seq, "/dev/");
                seq_puts(seq, ca->name);
        }

        return 0;
}

static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
        struct bch_fs *c = root->d_sb->s_fs_info;
        enum bch_opt_id i;
        char buf[512];

        for (i = 0; i < bch2_opts_nr; i++) {
                const struct bch_option *opt = &bch2_opt_table[i];
                u64 v = bch2_opt_get_by_id(&c->opts, i);

                if (!(opt->mode & OPT_MOUNT))
                        continue;

                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
                        continue;

                bch2_opt_to_text(&PBUF(buf), c, opt, v,
                                 OPT_SHOW_MOUNT_STYLE);
                seq_putc(seq, ',');
                seq_puts(seq, buf);
        }

        return 0;
}

static void bch2_put_super(struct super_block *sb)
{
        struct bch_fs *c = sb->s_fs_info;

        __bch2_fs_stop(c);
}

static const struct super_operations bch_super_operations = {
        .alloc_inode    = bch2_alloc_inode,
        .destroy_inode  = bch2_destroy_inode,
        .write_inode    = bch2_vfs_write_inode,
        .evict_inode    = bch2_evict_inode,
        .sync_fs        = bch2_sync_fs,
        .statfs         = bch2_statfs,
        .show_devname   = bch2_show_devname,
        .show_options   = bch2_show_options,
        .remount_fs     = bch2_remount,
        .put_super      = bch2_put_super,
#if 0
        .freeze_fs      = bch2_freeze,
        .unfreeze_fs    = bch2_unfreeze,
#endif
};

static int bch2_set_super(struct super_block *s, void *data)
{
        s->s_fs_info = data;
        return 0;
}

static int bch2_noset_super(struct super_block *s, void *data)
{
        return -EBUSY;
}

static int bch2_test_super(struct super_block *s, void *data)
{
        struct bch_fs *c = s->s_fs_info;
        struct bch_fs **devs = data;
        unsigned i;

        if (!c)
                return false;

        for (i = 0; devs[i]; i++)
                if (c != devs[i])
                        return false;
        return true;
}

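/*
 * Mount entry point: parse options, split the device list, and look for an
 * already-mounted filesystem via sget(); otherwise open the filesystem,
 * allocate a new superblock, and set up the root inode.
 */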
static struct dentry *bch2_mount(struct file_system_type *fs_type,
                                 int flags, const char *dev_name, void *data)
{
        struct bch_fs *c;
        struct bch_dev *ca;
        struct super_block *sb;
        struct inode *vinode;
        struct bch_opts opts = bch2_opts_empty();
        char **devs;
        struct bch_fs **devs_to_fs = NULL;
        unsigned i, nr_devs;
        int ret;

        opt_set(opts, read_only, (flags & SB_RDONLY) != 0);

        ret = bch2_parse_mount_opts(NULL, &opts, data);
        if (ret)
                return ERR_PTR(ret);

        if (!dev_name || strlen(dev_name) == 0)
                return ERR_PTR(-EINVAL);

        devs = split_devs(dev_name, &nr_devs);
        if (!devs)
                return ERR_PTR(-ENOMEM);

        devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
        if (!devs_to_fs) {
                sb = ERR_PTR(-ENOMEM);
                goto got_sb;
        }

        for (i = 0; i < nr_devs; i++)
                devs_to_fs[i] = bch2_path_to_fs(devs[i]);

        sb = sget(fs_type, bch2_test_super, bch2_noset_super,
                  flags|SB_NOSEC, devs_to_fs);
        if (!IS_ERR(sb))
                goto got_sb;

        c = bch2_fs_open(devs, nr_devs, opts);
        if (IS_ERR(c)) {
                sb = ERR_CAST(c);
                goto got_sb;
        }

        /* Some options can't be parsed until after the fs is started: */
        ret = bch2_parse_mount_opts(c, &opts, data);
        if (ret) {
                bch2_fs_stop(c);
                sb = ERR_PTR(ret);
                goto got_sb;
        }

        bch2_opts_apply(&c->opts, opts);

        sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
        if (IS_ERR(sb))
                bch2_fs_stop(c);
got_sb:
        kfree(devs_to_fs);
        kfree(devs[0]);
        kfree(devs);

        if (IS_ERR(sb))
                return ERR_CAST(sb);

        c = sb->s_fs_info;

        if (sb->s_root) {
                if ((flags ^ sb->s_flags) & SB_RDONLY) {
                        ret = -EBUSY;
                        goto err_put_super;
                }
                goto out;
        }

        sb->s_blocksize         = block_bytes(c);
        sb->s_blocksize_bits    = ilog2(block_bytes(c));
        sb->s_maxbytes          = MAX_LFS_FILESIZE;
        sb->s_op                = &bch_super_operations;
        sb->s_export_op         = &bch_export_ops;
#ifdef CONFIG_BCACHEFS_QUOTA
        sb->s_qcop              = &bch2_quotactl_operations;
        sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
#endif
        sb->s_xattr             = bch2_xattr_handlers;
        sb->s_magic             = BCACHEFS_STATFS_MAGIC;
        sb->s_time_gran         = c->sb.nsec_per_time_unit;
        sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
        sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
        c->vfs_sb               = sb;
        strlcpy(sb->s_id, c->name, sizeof(sb->s_id));

        ret = super_setup_bdi(sb);
        if (ret)
                goto err_put_super;

        sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;

        for_each_online_member(ca, c, i) {
                struct block_device *bdev = ca->disk_sb.bdev;

                /* XXX: create an anonymous device for multi device filesystems */
                sb->s_bdev      = bdev;
                sb->s_dev       = bdev->bd_dev;
                percpu_ref_put(&ca->io_ref);
                break;
        }

        c->dev = sb->s_dev;

#ifdef CONFIG_BCACHEFS_POSIX_ACL
        if (c->opts.acl)
                sb->s_flags     |= SB_POSIXACL;
#endif

        vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
        if (IS_ERR(vinode)) {
                bch_err(c, "error mounting: error getting root inode %i",
                        (int) PTR_ERR(vinode));
                ret = PTR_ERR(vinode);
                goto err_put_super;
        }

        sb->s_root = d_make_root(vinode);
        if (!sb->s_root) {
                bch_err(c, "error mounting: error allocating root dentry");
                ret = -ENOMEM;
                goto err_put_super;
        }

        sb->s_flags |= SB_ACTIVE;
out:
        return dget(sb->s_root);

err_put_super:
        deactivate_locked_super(sb);
        return ERR_PTR(ret);
}

static void bch2_kill_sb(struct super_block *sb)
{
        struct bch_fs *c = sb->s_fs_info;

        generic_shutdown_super(sb);
        bch2_fs_free(c);
}

static struct file_system_type bcache_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "bcachefs",
        .mount          = bch2_mount,
        .kill_sb        = bch2_kill_sb,
        .fs_flags       = FS_REQUIRES_DEV,
};

MODULE_ALIAS_FS("bcachefs");

void bch2_vfs_exit(void)
{
        unregister_filesystem(&bcache_fs_type);
        if (bch2_inode_cache)
                kmem_cache_destroy(bch2_inode_cache);
}

int __init bch2_vfs_init(void)
{
        int ret = -ENOMEM;

        bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
        if (!bch2_inode_cache)
                goto err;

        ret = register_filesystem(&bcache_fs_type);
        if (ret)
                goto err;

        return 0;
err:
        bch2_vfs_exit();
        return ret;
}

#endif /* NO_BCACHEFS_FS */