]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fs.c
Merge https://github.com/YellowOnion/bcachefs-tools
[bcachefs-tools-debian] / libbcachefs / fs.c
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "extents.h"
12 #include "fs.h"
13 #include "fs-common.h"
14 #include "fs-io.h"
15 #include "fs-ioctl.h"
16 #include "fsck.h"
17 #include "inode.h"
18 #include "io.h"
19 #include "journal.h"
20 #include "keylist.h"
21 #include "quota.h"
22 #include "super.h"
23 #include "xattr.h"
24
25 #include <linux/aio.h>
26 #include <linux/backing-dev.h>
27 #include <linux/exportfs.h>
28 #include <linux/fiemap.h>
29 #include <linux/module.h>
30 #include <linux/pagemap.h>
31 #include <linux/posix_acl.h>
32 #include <linux/random.h>
33 #include <linux/statfs.h>
34 #include <linux/string.h>
35 #include <linux/xattr.h>
36
37 static struct kmem_cache *bch2_inode_cache;
38
39 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
40                                 struct bch_inode_info *,
41                                 struct bch_inode_unpacked *);
42
43 static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
44 {
45         BUG_ON(atomic_long_read(&lock->v) == 0);
46
47         if (atomic_long_sub_return_release(i, &lock->v) == 0)
48                 wake_up_all(&lock->wait);
49 }
50
51 static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
52 {
53         long v = atomic_long_read(&lock->v), old;
54
55         do {
56                 old = v;
57
58                 if (i > 0 ? v < 0 : v > 0)
59                         return false;
60         } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
61                                         old, old + i)) != old);
62         return true;
63 }
64
65 static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
66 {
67         wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
68 }
69
/* Release one "add" hold (counter contribution of +1). */
void bch2_pagecache_add_put(struct pagecache_lock *lock)
{
	const long add_side = 1;

	__pagecache_lock_put(lock, add_side);
}
74
75 bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
76 {
77         return __pagecache_lock_tryget(lock, 1);
78 }
79
/* Take an "add" hold (+1), sleeping until no "block" hold is present. */
void bch2_pagecache_add_get(struct pagecache_lock *lock)
{
	const long add_side = 1;

	__pagecache_lock_get(lock, add_side);
}
84
/* Release one "block" hold (counter contribution of -1). */
void bch2_pagecache_block_put(struct pagecache_lock *lock)
{
	const long block_side = -1;

	__pagecache_lock_put(lock, block_side);
}
89
/* Take a "block" hold (-1), sleeping until no "add" hold is present. */
void bch2_pagecache_block_get(struct pagecache_lock *lock)
{
	const long block_side = -1;

	__pagecache_lock_get(lock, block_side);
}
94
95 void bch2_inode_update_after_write(struct btree_trans *trans,
96                                    struct bch_inode_info *inode,
97                                    struct bch_inode_unpacked *bi,
98                                    unsigned fields)
99 {
100         struct bch_fs *c = trans->c;
101
102         BUG_ON(bi->bi_inum != inode->v.i_ino);
103
104         bch2_assert_pos_locked(trans, BTREE_ID_inodes,
105                                POS(0, bi->bi_inum),
106                                0 && c->opts.inodes_use_key_cache);
107
108         set_nlink(&inode->v, bch2_inode_nlink_get(bi));
109         i_uid_write(&inode->v, bi->bi_uid);
110         i_gid_write(&inode->v, bi->bi_gid);
111         inode->v.i_mode = bi->bi_mode;
112
113         if (fields & ATTR_ATIME)
114                 inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
115         if (fields & ATTR_MTIME)
116                 inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
117         if (fields & ATTR_CTIME)
118                 inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
119
120         inode->ei_inode         = *bi;
121
122         bch2_inode_flags_to_vfs(inode);
123 }
124
125 int __must_check bch2_write_inode(struct bch_fs *c,
126                                   struct bch_inode_info *inode,
127                                   inode_set_fn set,
128                                   void *p, unsigned fields)
129 {
130         struct btree_trans trans;
131         struct btree_iter iter = { NULL };
132         struct bch_inode_unpacked inode_u;
133         int ret;
134
135         bch2_trans_init(&trans, c, 0, 512);
136         trans.ip = _RET_IP_;
137 retry:
138         bch2_trans_begin(&trans);
139
140         ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
141                                 BTREE_ITER_INTENT) ?:
142                 (set ? set(inode, &inode_u, p) : 0) ?:
143                 bch2_inode_write(&trans, &iter, &inode_u) ?:
144                 bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
145
146         /*
147          * the btree node lock protects inode->ei_inode, not ei_update_lock;
148          * this is important for inode updates via bchfs_write_index_update
149          */
150         if (!ret)
151                 bch2_inode_update_after_write(&trans, inode, &inode_u, fields);
152
153         bch2_trans_iter_exit(&trans, &iter);
154
155         if (ret == -EINTR)
156                 goto retry;
157
158         bch2_trans_exit(&trans);
159         return ret < 0 ? ret : 0;
160 }
161
162 int bch2_fs_quota_transfer(struct bch_fs *c,
163                            struct bch_inode_info *inode,
164                            struct bch_qid new_qid,
165                            unsigned qtypes,
166                            enum quota_acct_mode mode)
167 {
168         unsigned i;
169         int ret;
170
171         qtypes &= enabled_qtypes(c);
172
173         for (i = 0; i < QTYP_NR; i++)
174                 if (new_qid.q[i] == inode->ei_qid.q[i])
175                         qtypes &= ~(1U << i);
176
177         if (!qtypes)
178                 return 0;
179
180         mutex_lock(&inode->ei_quota_lock);
181
182         ret = bch2_quota_transfer(c, qtypes, new_qid,
183                                   inode->ei_qid,
184                                   inode->v.i_blocks +
185                                   inode->ei_quota_reserved,
186                                   mode);
187         if (!ret)
188                 for (i = 0; i < QTYP_NR; i++)
189                         if (qtypes & (1 << i))
190                                 inode->ei_qid.q[i] = new_qid.q[i];
191
192         mutex_unlock(&inode->ei_quota_lock);
193
194         return ret;
195 }
196
197 static int bch2_iget5_test(struct inode *vinode, void *p)
198 {
199         struct bch_inode_info *inode = to_bch_ei(vinode);
200         subvol_inum *inum = p;
201
202         return inode->ei_subvol == inum->subvol &&
203                 inode->ei_inode.bi_inum == inum->inum;
204 }
205
206 static int bch2_iget5_set(struct inode *vinode, void *p)
207 {
208         struct bch_inode_info *inode = to_bch_ei(vinode);
209         subvol_inum *inum = p;
210
211         inode->v.i_ino          = inum->inum;
212         inode->ei_subvol        = inum->subvol;
213         inode->ei_inode.bi_inum = inum->inum;
214         return 0;
215 }
216
217 static unsigned bch2_inode_hash(subvol_inum inum)
218 {
219         return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
220 }
221
222 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
223 {
224         struct bch_inode_unpacked inode_u;
225         struct bch_inode_info *inode;
226         struct btree_trans trans;
227         int ret;
228
229         inode = to_bch_ei(iget5_locked(c->vfs_sb,
230                                        bch2_inode_hash(inum),
231                                        bch2_iget5_test,
232                                        bch2_iget5_set,
233                                        &inum));
234         if (unlikely(!inode))
235                 return ERR_PTR(-ENOMEM);
236         if (!(inode->v.i_state & I_NEW))
237                 return &inode->v;
238
239         bch2_trans_init(&trans, c, 8, 0);
240         ret = lockrestart_do(&trans,
241                 bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
242
243         if (!ret)
244                 bch2_vfs_inode_init(&trans, inum, inode, &inode_u);
245         bch2_trans_exit(&trans);
246
247         if (ret) {
248                 iget_failed(&inode->v);
249                 return ERR_PTR(ret);
250         }
251
252         unlock_new_inode(&inode->v);
253
254         return &inode->v;
255 }
256
257 struct bch_inode_info *
258 __bch2_create(struct user_namespace *mnt_userns,
259               struct bch_inode_info *dir, struct dentry *dentry,
260               umode_t mode, dev_t rdev, subvol_inum snapshot_src,
261               unsigned flags)
262 {
263         struct bch_fs *c = dir->v.i_sb->s_fs_info;
264         struct btree_trans trans;
265         struct bch_inode_unpacked dir_u;
266         struct bch_inode_info *inode, *old;
267         struct bch_inode_unpacked inode_u;
268         struct posix_acl *default_acl = NULL, *acl = NULL;
269         subvol_inum inum;
270         u64 journal_seq = 0;
271         int ret;
272
273         /*
274          * preallocate acls + vfs inode before btree transaction, so that
275          * nothing can fail after the transaction succeeds:
276          */
277 #ifdef CONFIG_BCACHEFS_POSIX_ACL
278         ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
279         if (ret)
280                 return ERR_PTR(ret);
281 #endif
282         inode = to_bch_ei(new_inode(c->vfs_sb));
283         if (unlikely(!inode)) {
284                 inode = ERR_PTR(-ENOMEM);
285                 goto err;
286         }
287
288         bch2_inode_init_early(c, &inode_u);
289
290         if (!(flags & BCH_CREATE_TMPFILE))
291                 mutex_lock(&dir->ei_update_lock);
292
293         bch2_trans_init(&trans, c, 8,
294                         2048 + (!(flags & BCH_CREATE_TMPFILE)
295                                 ? dentry->d_name.len : 0));
296 retry:
297         bch2_trans_begin(&trans);
298
299         ret   = bch2_create_trans(&trans,
300                                   inode_inum(dir), &dir_u, &inode_u,
301                                   !(flags & BCH_CREATE_TMPFILE)
302                                   ? &dentry->d_name : NULL,
303                                   from_kuid(mnt_userns, current_fsuid()),
304                                   from_kgid(mnt_userns, current_fsgid()),
305                                   mode, rdev,
306                                   default_acl, acl, snapshot_src, flags) ?:
307                 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
308                                 KEY_TYPE_QUOTA_PREALLOC);
309         if (unlikely(ret))
310                 goto err_before_quota;
311
312         ret   = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
313         if (unlikely(ret)) {
314                 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
315                                 KEY_TYPE_QUOTA_WARN);
316 err_before_quota:
317                 if (ret == -EINTR)
318                         goto retry;
319                 goto err_trans;
320         }
321
322         if (!(flags & BCH_CREATE_TMPFILE)) {
323                 bch2_inode_update_after_write(&trans, dir, &dir_u,
324                                               ATTR_MTIME|ATTR_CTIME);
325                 mutex_unlock(&dir->ei_update_lock);
326         }
327
328         inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
329         inum.inum = inode_u.bi_inum;
330
331         bch2_iget5_set(&inode->v, &inum);
332         bch2_vfs_inode_init(&trans, inum, inode, &inode_u);
333
334         set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
335         set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
336
337         /*
338          * we must insert the new inode into the inode cache before calling
339          * bch2_trans_exit() and dropping locks, else we could race with another
340          * thread pulling the inode in and modifying it:
341          */
342
343         inode->v.i_state |= I_CREATING;
344
345         old = to_bch_ei(inode_insert5(&inode->v,
346                                       bch2_inode_hash(inum),
347                                       bch2_iget5_test,
348                                       bch2_iget5_set,
349                                       &inum));
350         BUG_ON(!old);
351
352         if (unlikely(old != inode)) {
353                 /*
354                  * We raced, another process pulled the new inode into cache
355                  * before us:
356                  */
357                 make_bad_inode(&inode->v);
358                 iput(&inode->v);
359
360                 inode = old;
361         } else {
362                 /*
363                  * we really don't want insert_inode_locked2() to be setting
364                  * I_NEW...
365                  */
366                 unlock_new_inode(&inode->v);
367         }
368
369         bch2_trans_exit(&trans);
370 err:
371         posix_acl_release(default_acl);
372         posix_acl_release(acl);
373         return inode;
374 err_trans:
375         if (!(flags & BCH_CREATE_TMPFILE))
376                 mutex_unlock(&dir->ei_update_lock);
377
378         bch2_trans_exit(&trans);
379         make_bad_inode(&inode->v);
380         iput(&inode->v);
381         inode = ERR_PTR(ret);
382         goto err;
383 }
384
385 /* methods */
386
387 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
388                                   unsigned int flags)
389 {
390         struct bch_fs *c = vdir->i_sb->s_fs_info;
391         struct bch_inode_info *dir = to_bch_ei(vdir);
392         struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
393         struct inode *vinode = NULL;
394         subvol_inum inum = { .subvol = 1 };
395         int ret;
396
397         ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
398                                  &dentry->d_name, &inum);
399
400         if (!ret)
401                 vinode = bch2_vfs_inode_get(c, inum);
402
403         return d_splice_alias(vinode, dentry);
404 }
405
406 static int bch2_mknod(struct user_namespace *mnt_userns,
407                       struct inode *vdir, struct dentry *dentry,
408                       umode_t mode, dev_t rdev)
409 {
410         struct bch_inode_info *inode =
411                 __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
412                               (subvol_inum) { 0 }, 0);
413
414         if (IS_ERR(inode))
415                 return PTR_ERR(inode);
416
417         d_instantiate(dentry, &inode->v);
418         return 0;
419 }
420
421 static int bch2_create(struct user_namespace *mnt_userns,
422                        struct inode *vdir, struct dentry *dentry,
423                        umode_t mode, bool excl)
424 {
425         return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0);
426 }
427
428 static int __bch2_link(struct bch_fs *c,
429                        struct bch_inode_info *inode,
430                        struct bch_inode_info *dir,
431                        struct dentry *dentry)
432 {
433         struct btree_trans trans;
434         struct bch_inode_unpacked dir_u, inode_u;
435         int ret;
436
437         mutex_lock(&inode->ei_update_lock);
438         bch2_trans_init(&trans, c, 4, 1024);
439
440         ret = __bch2_trans_do(&trans, NULL, NULL, 0,
441                         bch2_link_trans(&trans,
442                                         inode_inum(dir),   &dir_u,
443                                         inode_inum(inode), &inode_u,
444                                         &dentry->d_name));
445
446         if (likely(!ret)) {
447                 bch2_inode_update_after_write(&trans, dir, &dir_u,
448                                               ATTR_MTIME|ATTR_CTIME);
449                 bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
450         }
451
452         bch2_trans_exit(&trans);
453         mutex_unlock(&inode->ei_update_lock);
454         return ret;
455 }
456
457 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
458                      struct dentry *dentry)
459 {
460         struct bch_fs *c = vdir->i_sb->s_fs_info;
461         struct bch_inode_info *dir = to_bch_ei(vdir);
462         struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
463         int ret;
464
465         lockdep_assert_held(&inode->v.i_rwsem);
466
467         ret = __bch2_link(c, inode, dir, dentry);
468         if (unlikely(ret))
469                 return ret;
470
471         ihold(&inode->v);
472         d_instantiate(dentry, &inode->v);
473         return 0;
474 }
475
476 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
477                   bool deleting_snapshot)
478 {
479         struct bch_fs *c = vdir->i_sb->s_fs_info;
480         struct bch_inode_info *dir = to_bch_ei(vdir);
481         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
482         struct bch_inode_unpacked dir_u, inode_u;
483         struct btree_trans trans;
484         int ret;
485
486         bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
487         bch2_trans_init(&trans, c, 4, 1024);
488
489         ret = __bch2_trans_do(&trans, NULL, NULL,
490                               BTREE_INSERT_NOFAIL,
491                         bch2_unlink_trans(&trans,
492                                           inode_inum(dir), &dir_u,
493                                           &inode_u, &dentry->d_name,
494                                           deleting_snapshot));
495
496         if (likely(!ret)) {
497                 bch2_inode_update_after_write(&trans, dir, &dir_u,
498                                               ATTR_MTIME|ATTR_CTIME);
499                 bch2_inode_update_after_write(&trans, inode, &inode_u,
500                                               ATTR_MTIME);
501         }
502
503         bch2_trans_exit(&trans);
504         bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
505
506         return ret;
507 }
508
509 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
510 {
511         return __bch2_unlink(vdir, dentry, false);
512 }
513
514 static int bch2_symlink(struct user_namespace *mnt_userns,
515                         struct inode *vdir, struct dentry *dentry,
516                         const char *symname)
517 {
518         struct bch_fs *c = vdir->i_sb->s_fs_info;
519         struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
520         int ret;
521
522         inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
523                               (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
524         if (unlikely(IS_ERR(inode)))
525                 return PTR_ERR(inode);
526
527         inode_lock(&inode->v);
528         ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
529         inode_unlock(&inode->v);
530
531         if (unlikely(ret))
532                 goto err;
533
534         ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
535         if (unlikely(ret))
536                 goto err;
537
538         ret = __bch2_link(c, inode, dir, dentry);
539         if (unlikely(ret))
540                 goto err;
541
542         d_instantiate(dentry, &inode->v);
543         return 0;
544 err:
545         iput(&inode->v);
546         return ret;
547 }
548
549 static int bch2_mkdir(struct user_namespace *mnt_userns,
550                       struct inode *vdir, struct dentry *dentry, umode_t mode)
551 {
552         return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0);
553 }
554
555 static int bch2_rename2(struct user_namespace *mnt_userns,
556                         struct inode *src_vdir, struct dentry *src_dentry,
557                         struct inode *dst_vdir, struct dentry *dst_dentry,
558                         unsigned flags)
559 {
560         struct bch_fs *c = src_vdir->i_sb->s_fs_info;
561         struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
562         struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
563         struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
564         struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
565         struct bch_inode_unpacked dst_dir_u, src_dir_u;
566         struct bch_inode_unpacked src_inode_u, dst_inode_u;
567         struct btree_trans trans;
568         enum bch_rename_mode mode = flags & RENAME_EXCHANGE
569                 ? BCH_RENAME_EXCHANGE
570                 : dst_dentry->d_inode
571                 ? BCH_RENAME_OVERWRITE : BCH_RENAME;
572         int ret;
573
574         if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
575                 return -EINVAL;
576
577         if (mode == BCH_RENAME_OVERWRITE) {
578                 ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
579                                                    0, LLONG_MAX);
580                 if (ret)
581                         return ret;
582         }
583
584         bch2_trans_init(&trans, c, 8, 2048);
585
586         bch2_lock_inodes(INODE_UPDATE_LOCK,
587                          src_dir,
588                          dst_dir,
589                          src_inode,
590                          dst_inode);
591
592         if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
593                 ret = bch2_fs_quota_transfer(c, src_inode,
594                                              dst_dir->ei_qid,
595                                              1 << QTYP_PRJ,
596                                              KEY_TYPE_QUOTA_PREALLOC);
597                 if (ret)
598                         goto err;
599         }
600
601         if (mode == BCH_RENAME_EXCHANGE &&
602             inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
603                 ret = bch2_fs_quota_transfer(c, dst_inode,
604                                              src_dir->ei_qid,
605                                              1 << QTYP_PRJ,
606                                              KEY_TYPE_QUOTA_PREALLOC);
607                 if (ret)
608                         goto err;
609         }
610
611         ret = __bch2_trans_do(&trans, NULL, NULL, 0,
612                         bch2_rename_trans(&trans,
613                                           inode_inum(src_dir), &src_dir_u,
614                                           inode_inum(dst_dir), &dst_dir_u,
615                                           &src_inode_u,
616                                           &dst_inode_u,
617                                           &src_dentry->d_name,
618                                           &dst_dentry->d_name,
619                                           mode));
620         if (unlikely(ret))
621                 goto err;
622
623         BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
624         BUG_ON(dst_inode &&
625                dst_inode->v.i_ino != dst_inode_u.bi_inum);
626
627         bch2_inode_update_after_write(&trans, src_dir, &src_dir_u,
628                                       ATTR_MTIME|ATTR_CTIME);
629
630         if (src_dir != dst_dir)
631                 bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u,
632                                               ATTR_MTIME|ATTR_CTIME);
633
634         bch2_inode_update_after_write(&trans, src_inode, &src_inode_u,
635                                       ATTR_CTIME);
636
637         if (dst_inode)
638                 bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u,
639                                               ATTR_CTIME);
640 err:
641         bch2_trans_exit(&trans);
642
643         bch2_fs_quota_transfer(c, src_inode,
644                                bch_qid(&src_inode->ei_inode),
645                                1 << QTYP_PRJ,
646                                KEY_TYPE_QUOTA_NOCHECK);
647         if (dst_inode)
648                 bch2_fs_quota_transfer(c, dst_inode,
649                                        bch_qid(&dst_inode->ei_inode),
650                                        1 << QTYP_PRJ,
651                                        KEY_TYPE_QUOTA_NOCHECK);
652
653         bch2_unlock_inodes(INODE_UPDATE_LOCK,
654                            src_dir,
655                            dst_dir,
656                            src_inode,
657                            dst_inode);
658
659         return ret;
660 }
661
662 static void bch2_setattr_copy(struct user_namespace *mnt_userns,
663                               struct bch_inode_info *inode,
664                               struct bch_inode_unpacked *bi,
665                               struct iattr *attr)
666 {
667         struct bch_fs *c = inode->v.i_sb->s_fs_info;
668         unsigned int ia_valid = attr->ia_valid;
669
670         if (ia_valid & ATTR_UID)
671                 bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid);
672         if (ia_valid & ATTR_GID)
673                 bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid);
674
675         if (ia_valid & ATTR_SIZE)
676                 bi->bi_size = attr->ia_size;
677
678         if (ia_valid & ATTR_ATIME)
679                 bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
680         if (ia_valid & ATTR_MTIME)
681                 bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
682         if (ia_valid & ATTR_CTIME)
683                 bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
684
685         if (ia_valid & ATTR_MODE) {
686                 umode_t mode = attr->ia_mode;
687                 kgid_t gid = ia_valid & ATTR_GID
688                         ? attr->ia_gid
689                         : inode->v.i_gid;
690
691                 if (!in_group_p(gid) &&
692                     !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID))
693                         mode &= ~S_ISGID;
694                 bi->bi_mode = mode;
695         }
696 }
697
698 int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
699                          struct bch_inode_info *inode,
700                          struct iattr *attr)
701 {
702         struct bch_fs *c = inode->v.i_sb->s_fs_info;
703         struct bch_qid qid;
704         struct btree_trans trans;
705         struct btree_iter inode_iter = { NULL };
706         struct bch_inode_unpacked inode_u;
707         struct posix_acl *acl = NULL;
708         int ret;
709
710         mutex_lock(&inode->ei_update_lock);
711
712         qid = inode->ei_qid;
713
714         if (attr->ia_valid & ATTR_UID)
715                 qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);
716
717         if (attr->ia_valid & ATTR_GID)
718                 qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);
719
720         ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
721                                      KEY_TYPE_QUOTA_PREALLOC);
722         if (ret)
723                 goto err;
724
725         bch2_trans_init(&trans, c, 0, 0);
726 retry:
727         bch2_trans_begin(&trans);
728         kfree(acl);
729         acl = NULL;
730
731         ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
732                               BTREE_ITER_INTENT);
733         if (ret)
734                 goto btree_err;
735
736         bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);
737
738         if (attr->ia_valid & ATTR_MODE) {
739                 ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
740                                      inode_u.bi_mode, &acl);
741                 if (ret)
742                         goto btree_err;
743         }
744
745         ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
746                 bch2_trans_commit(&trans, NULL, NULL,
747                                   BTREE_INSERT_NOFAIL);
748 btree_err:
749         bch2_trans_iter_exit(&trans, &inode_iter);
750
751         if (ret == -EINTR)
752                 goto retry;
753         if (unlikely(ret))
754                 goto err_trans;
755
756         bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid);
757
758         if (acl)
759                 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
760 err_trans:
761         bch2_trans_exit(&trans);
762 err:
763         mutex_unlock(&inode->ei_update_lock);
764
765         return ret;
766 }
767
768 static int bch2_getattr(struct user_namespace *mnt_userns,
769                         const struct path *path, struct kstat *stat,
770                         u32 request_mask, unsigned query_flags)
771 {
772         struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
773         struct bch_fs *c = inode->v.i_sb->s_fs_info;
774
775         stat->dev       = inode->v.i_sb->s_dev;
776         stat->ino       = inode->v.i_ino;
777         stat->mode      = inode->v.i_mode;
778         stat->nlink     = inode->v.i_nlink;
779         stat->uid       = inode->v.i_uid;
780         stat->gid       = inode->v.i_gid;
781         stat->rdev      = inode->v.i_rdev;
782         stat->size      = i_size_read(&inode->v);
783         stat->atime     = inode->v.i_atime;
784         stat->mtime     = inode->v.i_mtime;
785         stat->ctime     = inode->v.i_ctime;
786         stat->blksize   = block_bytes(c);
787         stat->blocks    = inode->v.i_blocks;
788
789         if (request_mask & STATX_BTIME) {
790                 stat->result_mask |= STATX_BTIME;
791                 stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
792         }
793
794         if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
795                 stat->attributes |= STATX_ATTR_IMMUTABLE;
796         stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;
797
798         if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
799                 stat->attributes |= STATX_ATTR_APPEND;
800         stat->attributes_mask    |= STATX_ATTR_APPEND;
801
802         if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
803                 stat->attributes |= STATX_ATTR_NODUMP;
804         stat->attributes_mask    |= STATX_ATTR_NODUMP;
805
806         return 0;
807 }
808
809 static int bch2_setattr(struct user_namespace *mnt_userns,
810                         struct dentry *dentry, struct iattr *iattr)
811 {
812         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
813         int ret;
814
815         lockdep_assert_held(&inode->v.i_rwsem);
816
817         ret = setattr_prepare(mnt_userns, dentry, iattr);
818         if (ret)
819                 return ret;
820
821         return iattr->ia_valid & ATTR_SIZE
822                 ? bch2_truncate(mnt_userns, inode, iattr)
823                 : bch2_setattr_nonsize(mnt_userns, inode, iattr);
824 }
825
826 static int bch2_tmpfile(struct user_namespace *mnt_userns,
827                         struct inode *vdir, struct dentry *dentry, umode_t mode)
828 {
829         struct bch_inode_info *inode =
830                 __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
831                               (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
832
833         if (IS_ERR(inode))
834                 return PTR_ERR(inode);
835
836         d_mark_tmpfile(dentry, &inode->v);
837         d_instantiate(dentry, &inode->v);
838         return 0;
839 }
840
841 static int bch2_fill_extent(struct bch_fs *c,
842                             struct fiemap_extent_info *info,
843                             struct bkey_s_c k, unsigned flags)
844 {
845         if (bkey_extent_is_direct_data(k.k)) {
846                 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
847                 const union bch_extent_entry *entry;
848                 struct extent_ptr_decoded p;
849                 int ret;
850
851                 if (k.k->type == KEY_TYPE_reflink_v)
852                         flags |= FIEMAP_EXTENT_SHARED;
853
854                 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
855                         int flags2 = 0;
856                         u64 offset = p.ptr.offset;
857
858                         if (p.crc.compression_type)
859                                 flags2 |= FIEMAP_EXTENT_ENCODED;
860                         else
861                                 offset += p.crc.offset;
862
863                         if ((offset & (c->opts.block_size - 1)) ||
864                             (k.k->size & (c->opts.block_size - 1)))
865                                 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
866
867                         ret = fiemap_fill_next_extent(info,
868                                                 bkey_start_offset(k.k) << 9,
869                                                 offset << 9,
870                                                 k.k->size << 9, flags|flags2);
871                         if (ret)
872                                 return ret;
873                 }
874
875                 return 0;
876         } else if (bkey_extent_is_inline_data(k.k)) {
877                 return fiemap_fill_next_extent(info,
878                                                bkey_start_offset(k.k) << 9,
879                                                0, k.k->size << 9,
880                                                flags|
881                                                FIEMAP_EXTENT_DATA_INLINE);
882         } else if (k.k->type == KEY_TYPE_reservation) {
883                 return fiemap_fill_next_extent(info,
884                                                bkey_start_offset(k.k) << 9,
885                                                0, k.k->size << 9,
886                                                flags|
887                                                FIEMAP_EXTENT_DELALLOC|
888                                                FIEMAP_EXTENT_UNWRITTEN);
889         } else {
890                 BUG();
891         }
892 }
893
894 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
895                        u64 start, u64 len)
896 {
897         struct bch_fs *c = vinode->i_sb->s_fs_info;
898         struct bch_inode_info *ei = to_bch_ei(vinode);
899         struct btree_trans trans;
900         struct btree_iter iter;
901         struct bkey_s_c k;
902         struct bkey_buf cur, prev;
903         struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
904         unsigned offset_into_extent, sectors;
905         bool have_extent = false;
906         u32 snapshot;
907         int ret = 0;
908
909         ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
910         if (ret)
911                 return ret;
912
913         if (start + len < start)
914                 return -EINVAL;
915
916         start >>= 9;
917
918         bch2_bkey_buf_init(&cur);
919         bch2_bkey_buf_init(&prev);
920         bch2_trans_init(&trans, c, 0, 0);
921 retry:
922         bch2_trans_begin(&trans);
923
924         ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
925         if (ret)
926                 goto err;
927
928         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
929                              SPOS(ei->v.i_ino, start, snapshot), 0);
930
931         while ((k = bch2_btree_iter_peek(&iter)).k &&
932                !(ret = bkey_err(k)) &&
933                bkey_cmp(iter.pos, end) < 0) {
934                 enum btree_id data_btree = BTREE_ID_extents;
935
936                 if (!bkey_extent_is_data(k.k) &&
937                     k.k->type != KEY_TYPE_reservation) {
938                         bch2_btree_iter_advance(&iter);
939                         continue;
940                 }
941
942                 offset_into_extent      = iter.pos.offset -
943                         bkey_start_offset(k.k);
944                 sectors                 = k.k->size - offset_into_extent;
945
946                 bch2_bkey_buf_reassemble(&cur, c, k);
947
948                 ret = bch2_read_indirect_extent(&trans, &data_btree,
949                                         &offset_into_extent, &cur);
950                 if (ret)
951                         break;
952
953                 k = bkey_i_to_s_c(cur.k);
954                 bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
955
956                 sectors = min(sectors, k.k->size - offset_into_extent);
957
958                 bch2_cut_front(POS(k.k->p.inode,
959                                    bkey_start_offset(k.k) +
960                                    offset_into_extent),
961                                cur.k);
962                 bch2_key_resize(&cur.k->k, sectors);
963                 cur.k->k.p = iter.pos;
964                 cur.k->k.p.offset += cur.k->k.size;
965
966                 if (have_extent) {
967                         ret = bch2_fill_extent(c, info,
968                                         bkey_i_to_s_c(prev.k), 0);
969                         if (ret)
970                                 break;
971                 }
972
973                 bkey_copy(prev.k, cur.k);
974                 have_extent = true;
975
976                 bch2_btree_iter_set_pos(&iter,
977                         POS(iter.pos.inode, iter.pos.offset + sectors));
978         }
979         start = iter.pos.offset;
980         bch2_trans_iter_exit(&trans, &iter);
981 err:
982         if (ret == -EINTR)
983                 goto retry;
984
985         if (!ret && have_extent)
986                 ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
987                                        FIEMAP_EXTENT_LAST);
988
989         bch2_trans_exit(&trans);
990         bch2_bkey_buf_exit(&cur, c);
991         bch2_bkey_buf_exit(&prev, c);
992         return ret < 0 ? ret : 0;
993 }
994
995 static const struct vm_operations_struct bch_vm_ops = {
996         .fault          = bch2_page_fault,
997         .map_pages      = filemap_map_pages,
998         .page_mkwrite   = bch2_page_mkwrite,
999 };
1000
1001 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1002 {
1003         file_accessed(file);
1004
1005         vma->vm_ops = &bch_vm_ops;
1006         return 0;
1007 }
1008
1009 /* Directories: */
1010
1011 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
1012 {
1013         return generic_file_llseek_size(file, offset, whence,
1014                                         S64_MAX, S64_MAX);
1015 }
1016
1017 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1018 {
1019         struct bch_inode_info *inode = file_bch_inode(file);
1020         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1021
1022         if (!dir_emit_dots(file, ctx))
1023                 return 0;
1024
1025         return bch2_readdir(c, inode_inum(inode), ctx);
1026 }
1027
1028 static const struct file_operations bch_file_operations = {
1029         .llseek         = bch2_llseek,
1030         .read_iter      = bch2_read_iter,
1031         .write_iter     = bch2_write_iter,
1032         .mmap           = bch2_mmap,
1033         .open           = generic_file_open,
1034         .fsync          = bch2_fsync,
1035         .splice_read    = generic_file_splice_read,
1036         .splice_write   = iter_file_splice_write,
1037         .fallocate      = bch2_fallocate_dispatch,
1038         .unlocked_ioctl = bch2_fs_file_ioctl,
1039 #ifdef CONFIG_COMPAT
1040         .compat_ioctl   = bch2_compat_fs_ioctl,
1041 #endif
1042         .remap_file_range = bch2_remap_file_range,
1043 };
1044
1045 static const struct inode_operations bch_file_inode_operations = {
1046         .getattr        = bch2_getattr,
1047         .setattr        = bch2_setattr,
1048         .fiemap         = bch2_fiemap,
1049         .listxattr      = bch2_xattr_list,
1050 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1051         .get_acl        = bch2_get_acl,
1052         .set_acl        = bch2_set_acl,
1053 #endif
1054 };
1055
1056 static const struct inode_operations bch_dir_inode_operations = {
1057         .lookup         = bch2_lookup,
1058         .create         = bch2_create,
1059         .link           = bch2_link,
1060         .unlink         = bch2_unlink,
1061         .symlink        = bch2_symlink,
1062         .mkdir          = bch2_mkdir,
1063         .rmdir          = bch2_unlink,
1064         .mknod          = bch2_mknod,
1065         .rename         = bch2_rename2,
1066         .getattr        = bch2_getattr,
1067         .setattr        = bch2_setattr,
1068         .tmpfile        = bch2_tmpfile,
1069         .listxattr      = bch2_xattr_list,
1070 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1071         .get_acl        = bch2_get_acl,
1072         .set_acl        = bch2_set_acl,
1073 #endif
1074 };
1075
1076 static const struct file_operations bch_dir_file_operations = {
1077         .llseek         = bch2_dir_llseek,
1078         .read           = generic_read_dir,
1079         .iterate_shared = bch2_vfs_readdir,
1080         .fsync          = bch2_fsync,
1081         .unlocked_ioctl = bch2_fs_file_ioctl,
1082 #ifdef CONFIG_COMPAT
1083         .compat_ioctl   = bch2_compat_fs_ioctl,
1084 #endif
1085 };
1086
1087 static const struct inode_operations bch_symlink_inode_operations = {
1088         .get_link       = page_get_link,
1089         .getattr        = bch2_getattr,
1090         .setattr        = bch2_setattr,
1091         .listxattr      = bch2_xattr_list,
1092 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1093         .get_acl        = bch2_get_acl,
1094         .set_acl        = bch2_set_acl,
1095 #endif
1096 };
1097
1098 static const struct inode_operations bch_special_inode_operations = {
1099         .getattr        = bch2_getattr,
1100         .setattr        = bch2_setattr,
1101         .listxattr      = bch2_xattr_list,
1102 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1103         .get_acl        = bch2_get_acl,
1104         .set_acl        = bch2_set_acl,
1105 #endif
1106 };
1107
1108 static const struct address_space_operations bch_address_space_operations = {
1109         .writepage      = bch2_writepage,
1110         .readpage       = bch2_readpage,
1111         .writepages     = bch2_writepages,
1112         .readahead      = bch2_readahead,
1113         .set_page_dirty = __set_page_dirty_nobuffers,
1114         .write_begin    = bch2_write_begin,
1115         .write_end      = bch2_write_end,
1116         .invalidatepage = bch2_invalidatepage,
1117         .releasepage    = bch2_releasepage,
1118         .direct_IO      = noop_direct_IO,
1119 #ifdef CONFIG_MIGRATION
1120         .migratepage    = bch2_migrate_page,
1121 #endif
1122         .error_remove_page = generic_error_remove_page,
1123 };
1124
1125 struct bcachefs_fid {
1126         u64             inum;
1127         u32             subvol;
1128         u32             gen;
1129 } __packed;
1130
1131 struct bcachefs_fid_with_parent {
1132         struct bcachefs_fid     fid;
1133         struct bcachefs_fid     dir;
1134 } __packed;
1135
1136 static int bcachefs_fid_valid(int fh_len, int fh_type)
1137 {
1138         switch (fh_type) {
1139         case FILEID_BCACHEFS_WITHOUT_PARENT:
1140                 return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1141         case FILEID_BCACHEFS_WITH_PARENT:
1142                 return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1143         default:
1144                 return false;
1145         }
1146 }
1147
1148 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1149 {
1150         return (struct bcachefs_fid) {
1151                 .inum   = inode->ei_inode.bi_inum,
1152                 .subvol = inode->ei_subvol,
1153                 .gen    = inode->ei_inode.bi_generation,
1154         };
1155 }
1156
1157 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1158                           struct inode *vdir)
1159 {
1160         struct bch_inode_info *inode    = to_bch_ei(vinode);
1161         struct bch_inode_info *dir      = to_bch_ei(vdir);
1162
1163         if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32))
1164                 return FILEID_INVALID;
1165
1166         if (!S_ISDIR(inode->v.i_mode) && dir) {
1167                 struct bcachefs_fid_with_parent *fid = (void *) fh;
1168
1169                 fid->fid = bch2_inode_to_fid(inode);
1170                 fid->dir = bch2_inode_to_fid(dir);
1171
1172                 *len = sizeof(*fid) / sizeof(u32);
1173                 return FILEID_BCACHEFS_WITH_PARENT;
1174         } else {
1175                 struct bcachefs_fid *fid = (void *) fh;
1176
1177                 *fid = bch2_inode_to_fid(inode);
1178
1179                 *len = sizeof(*fid) / sizeof(u32);
1180                 return FILEID_BCACHEFS_WITHOUT_PARENT;
1181         }
1182 }
1183
1184 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1185                                         struct bcachefs_fid fid)
1186 {
1187         struct bch_fs *c = sb->s_fs_info;
1188         struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1189                                     .subvol = fid.subvol,
1190                                     .inum = fid.inum,
1191         });
1192         if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1193                 iput(vinode);
1194                 vinode = ERR_PTR(-ESTALE);
1195         }
1196         return vinode;
1197 }
1198
1199 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1200                 int fh_len, int fh_type)
1201 {
1202         struct bcachefs_fid *fid = (void *) _fid;
1203
1204         if (!bcachefs_fid_valid(fh_len, fh_type))
1205                 return NULL;
1206
1207         return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1208 }
1209
1210 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1211                 int fh_len, int fh_type)
1212 {
1213         struct bcachefs_fid_with_parent *fid = (void *) _fid;
1214
1215         if (!bcachefs_fid_valid(fh_len, fh_type) ||
1216             fh_type != FILEID_BCACHEFS_WITH_PARENT)
1217                 return NULL;
1218
1219         return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1220 }
1221
1222 static struct dentry *bch2_get_parent(struct dentry *child)
1223 {
1224         struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1225         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1226         subvol_inum parent_inum = {
1227                 .subvol = inode->ei_inode.bi_parent_subvol ?:
1228                         inode->ei_subvol,
1229                 .inum = inode->ei_inode.bi_dir,
1230         };
1231
1232         if (!parent_inum.inum)
1233                 return NULL;
1234
1235         return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1236 }
1237
1238 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
1239 {
1240         struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
1241         struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
1242         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1243         struct btree_trans trans;
1244         struct btree_iter iter1;
1245         struct btree_iter iter2;
1246         struct bkey_s_c k;
1247         struct bkey_s_c_dirent d;
1248         struct bch_inode_unpacked inode_u;
1249         subvol_inum target;
1250         u32 snapshot;
1251         unsigned name_len;
1252         int ret;
1253
1254         if (!S_ISDIR(dir->v.i_mode))
1255                 return -EINVAL;
1256
1257         bch2_trans_init(&trans, c, 0, 0);
1258
1259         bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
1260                              POS(dir->ei_inode.bi_inum, 0), 0);
1261         bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
1262                              POS(dir->ei_inode.bi_inum, 0), 0);
1263 retry:
1264         bch2_trans_begin(&trans);
1265
1266         ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
1267         if (ret)
1268                 goto err;
1269
1270         bch2_btree_iter_set_snapshot(&iter1, snapshot);
1271         bch2_btree_iter_set_snapshot(&iter2, snapshot);
1272
1273         ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
1274         if (ret)
1275                 goto err;
1276
1277         if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
1278                 bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
1279
1280                 k = bch2_btree_iter_peek_slot(&iter1);
1281                 ret = bkey_err(k);
1282                 if (ret)
1283                         goto err;
1284
1285                 if (k.k->type != KEY_TYPE_dirent) {
1286                         ret = -ENOENT;
1287                         goto err;
1288                 }
1289
1290                 d = bkey_s_c_to_dirent(k);
1291                 ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
1292                 if (ret > 0)
1293                         ret = -ENOENT;
1294                 if (ret)
1295                         goto err;
1296
1297                 if (target.subvol       == inode->ei_subvol &&
1298                     target.inum         == inode->ei_inode.bi_inum)
1299                         goto found;
1300         } else {
1301                 /*
1302                  * File with multiple hardlinks and our backref is to the wrong
1303                  * directory - linear search:
1304                  */
1305                 for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
1306                         if (k.k->p.inode > dir->ei_inode.bi_inum)
1307                                 break;
1308
1309                         if (k.k->type != KEY_TYPE_dirent)
1310                                 continue;
1311
1312                         d = bkey_s_c_to_dirent(k);
1313                         ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
1314                         if (ret < 0)
1315                                 break;
1316                         if (ret)
1317                                 continue;
1318
1319                         if (target.subvol       == inode->ei_subvol &&
1320                             target.inum         == inode->ei_inode.bi_inum)
1321                                 goto found;
1322                 }
1323         }
1324
1325         ret = -ENOENT;
1326         goto err;
1327 found:
1328         name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX);
1329
1330         memcpy(name, d.v->d_name, name_len);
1331         name[name_len] = '\0';
1332 err:
1333         if (ret == -EINTR)
1334                 goto retry;
1335
1336         bch2_trans_iter_exit(&trans, &iter1);
1337         bch2_trans_iter_exit(&trans, &iter2);
1338         bch2_trans_exit(&trans);
1339
1340         return ret;
1341 }
1342
1343 static const struct export_operations bch_export_ops = {
1344         .encode_fh      = bch2_encode_fh,
1345         .fh_to_dentry   = bch2_fh_to_dentry,
1346         .fh_to_parent   = bch2_fh_to_parent,
1347         .get_parent     = bch2_get_parent,
1348         .get_name       = bch2_get_name,
1349 };
1350
1351 static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
1352                                 struct bch_inode_info *inode,
1353                                 struct bch_inode_unpacked *bi)
1354 {
1355         bch2_inode_update_after_write(trans, inode, bi, ~0);
1356
1357         inode->v.i_blocks       = bi->bi_sectors;
1358         inode->v.i_ino          = bi->bi_inum;
1359         inode->v.i_rdev         = bi->bi_dev;
1360         inode->v.i_generation   = bi->bi_generation;
1361         inode->v.i_size         = bi->bi_size;
1362
1363         inode->ei_flags         = 0;
1364         inode->ei_quota_reserved = 0;
1365         inode->ei_qid           = bch_qid(bi);
1366         inode->ei_subvol        = inum.subvol;
1367
1368         inode->v.i_mapping->a_ops = &bch_address_space_operations;
1369
1370         switch (inode->v.i_mode & S_IFMT) {
1371         case S_IFREG:
1372                 inode->v.i_op   = &bch_file_inode_operations;
1373                 inode->v.i_fop  = &bch_file_operations;
1374                 break;
1375         case S_IFDIR:
1376                 inode->v.i_op   = &bch_dir_inode_operations;
1377                 inode->v.i_fop  = &bch_dir_file_operations;
1378                 break;
1379         case S_IFLNK:
1380                 inode_nohighmem(&inode->v);
1381                 inode->v.i_op   = &bch_symlink_inode_operations;
1382                 break;
1383         default:
1384                 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1385                 inode->v.i_op   = &bch_special_inode_operations;
1386                 break;
1387         }
1388 }
1389
1390 static struct inode *bch2_alloc_inode(struct super_block *sb)
1391 {
1392         struct bch_inode_info *inode;
1393
1394         inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1395         if (!inode)
1396                 return NULL;
1397
1398         inode_init_once(&inode->v);
1399         mutex_init(&inode->ei_update_lock);
1400         pagecache_lock_init(&inode->ei_pagecache_lock);
1401         mutex_init(&inode->ei_quota_lock);
1402
1403         return &inode->v;
1404 }
1405
1406 static void bch2_i_callback(struct rcu_head *head)
1407 {
1408         struct inode *vinode = container_of(head, struct inode, i_rcu);
1409         struct bch_inode_info *inode = to_bch_ei(vinode);
1410
1411         kmem_cache_free(bch2_inode_cache, inode);
1412 }
1413
1414 static void bch2_destroy_inode(struct inode *vinode)
1415 {
1416         call_rcu(&vinode->i_rcu, bch2_i_callback);
1417 }
1418
1419 static int inode_update_times_fn(struct bch_inode_info *inode,
1420                                  struct bch_inode_unpacked *bi,
1421                                  void *p)
1422 {
1423         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1424
1425         bi->bi_atime    = timespec_to_bch2_time(c, inode->v.i_atime);
1426         bi->bi_mtime    = timespec_to_bch2_time(c, inode->v.i_mtime);
1427         bi->bi_ctime    = timespec_to_bch2_time(c, inode->v.i_ctime);
1428
1429         return 0;
1430 }
1431
1432 static int bch2_vfs_write_inode(struct inode *vinode,
1433                                 struct writeback_control *wbc)
1434 {
1435         struct bch_fs *c = vinode->i_sb->s_fs_info;
1436         struct bch_inode_info *inode = to_bch_ei(vinode);
1437         int ret;
1438
1439         mutex_lock(&inode->ei_update_lock);
1440         ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1441                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1442         mutex_unlock(&inode->ei_update_lock);
1443
1444         return ret;
1445 }
1446
1447 static void bch2_evict_inode(struct inode *vinode)
1448 {
1449         struct bch_fs *c = vinode->i_sb->s_fs_info;
1450         struct bch_inode_info *inode = to_bch_ei(vinode);
1451
1452         truncate_inode_pages_final(&inode->v.i_data);
1453
1454         clear_inode(&inode->v);
1455
1456         BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1457
1458         if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1459                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1460                                 KEY_TYPE_QUOTA_WARN);
1461                 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1462                                 KEY_TYPE_QUOTA_WARN);
1463                 bch2_inode_rm(c, inode_inum(inode), true);
1464         }
1465 }
1466
1467 void bch2_evict_subvolume_inodes(struct bch_fs *c,
1468                                  struct snapshot_id_list *s)
1469 {
1470         struct super_block *sb = c->vfs_sb;
1471         struct inode *inode;
1472
1473         spin_lock(&sb->s_inode_list_lock);
1474         list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1475                 if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
1476                     (inode->i_state & I_FREEING))
1477                         continue;
1478
1479                 d_mark_dontcache(inode);
1480                 d_prune_aliases(inode);
1481         }
1482         spin_unlock(&sb->s_inode_list_lock);
1483 again:
1484         cond_resched();
1485         spin_lock(&sb->s_inode_list_lock);
1486         list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1487                 if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
1488                     (inode->i_state & I_FREEING))
1489                         continue;
1490
1491                 if (!(inode->i_state & I_DONTCACHE)) {
1492                         d_mark_dontcache(inode);
1493                         d_prune_aliases(inode);
1494                 }
1495
1496                 spin_lock(&inode->i_lock);
1497                 if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
1498                     !(inode->i_state & I_FREEING)) {
1499                         wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
1500                         DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1501                         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
1502                         spin_unlock(&inode->i_lock);
1503                         spin_unlock(&sb->s_inode_list_lock);
1504                         schedule();
1505                         finish_wait(wq, &wait.wq_entry);
1506                         goto again;
1507                 }
1508
1509                 spin_unlock(&inode->i_lock);
1510         }
1511         spin_unlock(&sb->s_inode_list_lock);
1512 }
1513
1514 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1515 {
1516         struct super_block *sb = dentry->d_sb;
1517         struct bch_fs *c = sb->s_fs_info;
1518         struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1519         unsigned shift = sb->s_blocksize_bits - 9;
1520         /*
1521          * this assumes inodes take up 64 bytes, which is a decent average
1522          * number:
1523          */
1524         u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1525         u64 fsid;
1526
1527         buf->f_type     = BCACHEFS_STATFS_MAGIC;
1528         buf->f_bsize    = sb->s_blocksize;
1529         buf->f_blocks   = usage.capacity >> shift;
1530         buf->f_bfree    = usage.free >> shift;
1531         buf->f_bavail   = avail_factor(usage.free) >> shift;
1532
1533         buf->f_files    = usage.nr_inodes + avail_inodes;
1534         buf->f_ffree    = avail_inodes;
1535
1536         fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
1537                le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
1538         buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1539         buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1540         buf->f_namelen  = BCH_NAME_MAX;
1541
1542         return 0;
1543 }
1544
1545 static int bch2_sync_fs(struct super_block *sb, int wait)
1546 {
1547         struct bch_fs *c = sb->s_fs_info;
1548
1549         if (c->opts.journal_flush_disabled)
1550                 return 0;
1551
1552         if (!wait) {
1553                 bch2_journal_flush_async(&c->journal, NULL);
1554                 return 0;
1555         }
1556
1557         return bch2_journal_flush(&c->journal);
1558 }
1559
1560 static struct bch_fs *bch2_path_to_fs(const char *path)
1561 {
1562         struct bch_fs *c;
1563         dev_t dev;
1564         int ret;
1565
1566         ret = lookup_bdev(path, &dev);
1567         if (ret)
1568                 return ERR_PTR(ret);
1569
1570         c = bch2_dev_to_fs(dev);
1571         if (c)
1572                 closure_put(&c->cl);
1573         return c ?: ERR_PTR(-ENOENT);
1574 }
1575
1576 static char **split_devs(const char *_dev_name, unsigned *nr)
1577 {
1578         char *dev_name = NULL, **devs = NULL, *s;
1579         size_t i, nr_devs = 0;
1580
1581         dev_name = kstrdup(_dev_name, GFP_KERNEL);
1582         if (!dev_name)
1583                 return NULL;
1584
1585         for (s = dev_name; s; s = strchr(s + 1, ':'))
1586                 nr_devs++;
1587
1588         devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
1589         if (!devs) {
1590                 kfree(dev_name);
1591                 return NULL;
1592         }
1593
1594         for (i = 0, s = dev_name;
1595              s;
1596              (s = strchr(s, ':')) && (*s++ = '\0'))
1597                 devs[i++] = s;
1598
1599         *nr = nr_devs;
1600         return devs;
1601 }
1602
1603 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1604 {
1605         struct bch_fs *c = sb->s_fs_info;
1606         struct bch_opts opts = bch2_opts_empty();
1607         int ret;
1608
1609         opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1610
1611         ret = bch2_parse_mount_opts(c, &opts, data);
1612         if (ret)
1613                 return ret;
1614
1615         if (opts.read_only != c->opts.read_only) {
1616                 down_write(&c->state_lock);
1617
1618                 if (opts.read_only) {
1619                         bch2_fs_read_only(c);
1620
1621                         sb->s_flags |= SB_RDONLY;
1622                 } else {
1623                         ret = bch2_fs_read_write(c);
1624                         if (ret) {
1625                                 bch_err(c, "error going rw: %i", ret);
1626                                 up_write(&c->state_lock);
1627                                 return -EINVAL;
1628                         }
1629
1630                         sb->s_flags &= ~SB_RDONLY;
1631                 }
1632
1633                 c->opts.read_only = opts.read_only;
1634
1635                 up_write(&c->state_lock);
1636         }
1637
1638         if (opts.errors >= 0)
1639                 c->opts.errors = opts.errors;
1640
1641         return ret;
1642 }
1643
1644 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
1645 {
1646         struct bch_fs *c = root->d_sb->s_fs_info;
1647         struct bch_dev *ca;
1648         unsigned i;
1649         bool first = true;
1650
1651         for_each_online_member(ca, c, i) {
1652                 if (!first)
1653                         seq_putc(seq, ':');
1654                 first = false;
1655                 seq_puts(seq, "/dev/");
1656                 seq_puts(seq, ca->name);
1657         }
1658
1659         return 0;
1660 }
1661
1662 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1663 {
1664         struct bch_fs *c = root->d_sb->s_fs_info;
1665         enum bch_opt_id i;
1666         char buf[512];
1667
1668         for (i = 0; i < bch2_opts_nr; i++) {
1669                 const struct bch_option *opt = &bch2_opt_table[i];
1670                 u64 v = bch2_opt_get_by_id(&c->opts, i);
1671
1672                 if (!(opt->mode & OPT_MOUNT))
1673                         continue;
1674
1675                 if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1676                         continue;
1677
1678                 bch2_opt_to_text(&PBUF(buf), c, opt, v,
1679                                  OPT_SHOW_MOUNT_STYLE);
1680                 seq_putc(seq, ',');
1681                 seq_puts(seq, buf);
1682         }
1683
1684         return 0;
1685 }
1686
1687 static void bch2_put_super(struct super_block *sb)
1688 {
1689         struct bch_fs *c = sb->s_fs_info;
1690
1691         __bch2_fs_stop(c);
1692 }
1693
1694 static const struct super_operations bch_super_operations = {
1695         .alloc_inode    = bch2_alloc_inode,
1696         .destroy_inode  = bch2_destroy_inode,
1697         .write_inode    = bch2_vfs_write_inode,
1698         .evict_inode    = bch2_evict_inode,
1699         .sync_fs        = bch2_sync_fs,
1700         .statfs         = bch2_statfs,
1701         .show_devname   = bch2_show_devname,
1702         .show_options   = bch2_show_options,
1703         .remount_fs     = bch2_remount,
1704         .put_super      = bch2_put_super,
1705 #if 0
1706         .freeze_fs      = bch2_freeze,
1707         .unfreeze_fs    = bch2_unfreeze,
1708 #endif
1709 };
1710
1711 static int bch2_set_super(struct super_block *s, void *data)
1712 {
1713         s->s_fs_info = data;
1714         return 0;
1715 }
1716
1717 static int bch2_noset_super(struct super_block *s, void *data)
1718 {
1719         return -EBUSY;
1720 }
1721
1722 static int bch2_test_super(struct super_block *s, void *data)
1723 {
1724         struct bch_fs *c = s->s_fs_info;
1725         struct bch_fs **devs = data;
1726         unsigned i;
1727
1728         if (!c)
1729                 return false;
1730
1731         for (i = 0; devs[i]; i++)
1732                 if (c != devs[i])
1733                         return false;
1734         return true;
1735 }
1736
1737 static struct dentry *bch2_mount(struct file_system_type *fs_type,
1738                                  int flags, const char *dev_name, void *data)
1739 {
1740         struct bch_fs *c;
1741         struct bch_dev *ca;
1742         struct super_block *sb;
1743         struct inode *vinode;
1744         struct bch_opts opts = bch2_opts_empty();
1745         char **devs;
1746         struct bch_fs **devs_to_fs = NULL;
1747         unsigned i, nr_devs;
1748         int ret;
1749
1750         opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
1751
1752         ret = bch2_parse_mount_opts(NULL, &opts, data);
1753         if (ret)
1754                 return ERR_PTR(ret);
1755
1756         if (!dev_name || strlen(dev_name) == 0)
1757                 return ERR_PTR(-EINVAL);
1758
1759         devs = split_devs(dev_name, &nr_devs);
1760         if (!devs)
1761                 return ERR_PTR(-ENOMEM);
1762
1763         devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
1764         if (!devs_to_fs) {
1765                 sb = ERR_PTR(-ENOMEM);
1766                 goto got_sb;
1767         }
1768
1769         for (i = 0; i < nr_devs; i++)
1770                 devs_to_fs[i] = bch2_path_to_fs(devs[i]);
1771
1772         sb = sget(fs_type, bch2_test_super, bch2_noset_super,
1773                   flags|SB_NOSEC, devs_to_fs);
1774         if (!IS_ERR(sb))
1775                 goto got_sb;
1776
1777         c = bch2_fs_open(devs, nr_devs, opts);
1778         if (IS_ERR(c)) {
1779                 sb = ERR_CAST(c);
1780                 goto got_sb;
1781         }
1782
1783         /* Some options can't be parsed until after the fs is started: */
1784         ret = bch2_parse_mount_opts(c, &opts, data);
1785         if (ret) {
1786                 bch2_fs_stop(c);
1787                 sb = ERR_PTR(ret);
1788                 goto got_sb;
1789         }
1790
1791         bch2_opts_apply(&c->opts, opts);
1792
1793         sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
1794         if (IS_ERR(sb))
1795                 bch2_fs_stop(c);
1796 got_sb:
1797         kfree(devs_to_fs);
1798         kfree(devs[0]);
1799         kfree(devs);
1800
1801         if (IS_ERR(sb))
1802                 return ERR_CAST(sb);
1803
1804         c = sb->s_fs_info;
1805
1806         if (sb->s_root) {
1807                 if ((flags ^ sb->s_flags) & SB_RDONLY) {
1808                         ret = -EBUSY;
1809                         goto err_put_super;
1810                 }
1811                 goto out;
1812         }
1813
1814         sb->s_blocksize         = block_bytes(c);
1815         sb->s_blocksize_bits    = ilog2(block_bytes(c));
1816         sb->s_maxbytes          = MAX_LFS_FILESIZE;
1817         sb->s_op                = &bch_super_operations;
1818         sb->s_export_op         = &bch_export_ops;
1819 #ifdef CONFIG_BCACHEFS_QUOTA
1820         sb->s_qcop              = &bch2_quotactl_operations;
1821         sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
1822 #endif
1823         sb->s_xattr             = bch2_xattr_handlers;
1824         sb->s_magic             = BCACHEFS_STATFS_MAGIC;
1825         sb->s_time_gran         = c->sb.nsec_per_time_unit;
1826         sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
1827         sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
1828         c->vfs_sb               = sb;
1829         strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
1830
1831         ret = super_setup_bdi(sb);
1832         if (ret)
1833                 goto err_put_super;
1834
1835         sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
1836
1837         for_each_online_member(ca, c, i) {
1838                 struct block_device *bdev = ca->disk_sb.bdev;
1839
1840                 /* XXX: create an anonymous device for multi device filesystems */
1841                 sb->s_bdev      = bdev;
1842                 sb->s_dev       = bdev->bd_dev;
1843                 percpu_ref_put(&ca->io_ref);
1844                 break;
1845         }
1846
1847         c->dev = sb->s_dev;
1848
1849 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1850         if (c->opts.acl)
1851                 sb->s_flags     |= SB_POSIXACL;
1852 #endif
1853
1854         sb->s_shrink.seeks = 0;
1855
1856         vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
1857         if (IS_ERR(vinode)) {
1858                 bch_err(c, "error mounting: error getting root inode %i",
1859                         (int) PTR_ERR(vinode));
1860                 ret = PTR_ERR(vinode);
1861                 goto err_put_super;
1862         }
1863
1864         sb->s_root = d_make_root(vinode);
1865         if (!sb->s_root) {
1866                 bch_err(c, "error mounting: error allocating root dentry");
1867                 ret = -ENOMEM;
1868                 goto err_put_super;
1869         }
1870
1871         sb->s_flags |= SB_ACTIVE;
1872 out:
1873         return dget(sb->s_root);
1874
1875 err_put_super:
1876         deactivate_locked_super(sb);
1877         return ERR_PTR(ret);
1878 }
1879
1880 static void bch2_kill_sb(struct super_block *sb)
1881 {
1882         struct bch_fs *c = sb->s_fs_info;
1883
1884         generic_shutdown_super(sb);
1885         bch2_fs_free(c);
1886 }
1887
1888 static struct file_system_type bcache_fs_type = {
1889         .owner          = THIS_MODULE,
1890         .name           = "bcachefs",
1891         .mount          = bch2_mount,
1892         .kill_sb        = bch2_kill_sb,
1893         .fs_flags       = FS_REQUIRES_DEV,
1894 };
1895
1896 MODULE_ALIAS_FS("bcachefs");
1897
1898 void bch2_vfs_exit(void)
1899 {
1900         unregister_filesystem(&bcache_fs_type);
1901         if (bch2_inode_cache)
1902                 kmem_cache_destroy(bch2_inode_cache);
1903 }
1904
1905 int __init bch2_vfs_init(void)
1906 {
1907         int ret = -ENOMEM;
1908
1909         bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
1910         if (!bch2_inode_cache)
1911                 goto err;
1912
1913         ret = register_filesystem(&bcache_fs_type);
1914         if (ret)
1915                 goto err;
1916
1917         return 0;
1918 err:
1919         bch2_vfs_exit();
1920         return ret;
1921 }
1922
1923 #endif /* NO_BCACHEFS_FS */