]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fs.c
Update bcachefs sources to 380885b0b8 bcachefs: Fix counting iterators for reflink...
[bcachefs-tools-debian] / libbcachefs / fs.c
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "chardev.h"
9 #include "dirent.h"
10 #include "extents.h"
11 #include "fs.h"
12 #include "fs-io.h"
13 #include "fs-ioctl.h"
14 #include "fsck.h"
15 #include "inode.h"
16 #include "io.h"
17 #include "journal.h"
18 #include "keylist.h"
19 #include "quota.h"
20 #include "super.h"
21 #include "xattr.h"
22
23 #include <linux/aio.h>
24 #include <linux/backing-dev.h>
25 #include <linux/exportfs.h>
26 #include <linux/module.h>
27 #include <linux/posix_acl.h>
28 #include <linux/random.h>
29 #include <linux/statfs.h>
30 #include <linux/xattr.h>
31
/* kmem cache handle private to this file. */
static struct kmem_cache *bch2_inode_cache;

/*
 * Forward declaration: bch2_vfs_inode_get() and __bch2_create() below call
 * this before its definition appears.
 */
static void bch2_vfs_inode_init(struct bch_fs *c,
                                struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi);
37
38 static void journal_seq_copy(struct bch_inode_info *dst,
39                              u64 journal_seq)
40 {
41         u64 old, v = READ_ONCE(dst->ei_journal_seq);
42
43         do {
44                 old = v;
45
46                 if (old >= journal_seq)
47                         break;
48         } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
49 }
50
51 /*
52  * I_SIZE_DIRTY requires special handling:
53  *
54  * To the recovery code, the flag means that there is stale data past i_size
55  * that needs to be deleted; it's used for implementing atomic appends and
56  * truncates.
57  *
58  * On append, we set I_SIZE_DIRTY before doing the write, then after the write
59  * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
60  * that exposes the data we just wrote.
61  *
62  * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
63  * i_size to the new smaller size, then we delete the data that we just made
64  * invisible, and then we clear I_SIZE_DIRTY.
65  *
66  * Because there can be multiple appends in flight at a time, we need a refcount
67  * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
68  * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
69  *
70  * Because write_inode() can be called at any time, i_size_dirty_count means
71  * something different to the runtime code - it means to write_inode() "don't
72  * update i_size yet".
73  *
74  * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
75  * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
76  * be set explicitly.
77  */
78
79 void bch2_inode_update_after_write(struct bch_fs *c,
80                                    struct bch_inode_info *inode,
81                                    struct bch_inode_unpacked *bi,
82                                    unsigned fields)
83 {
84         set_nlink(&inode->v, bch2_inode_nlink_get(bi));
85         i_uid_write(&inode->v, bi->bi_uid);
86         i_gid_write(&inode->v, bi->bi_gid);
87         inode->v.i_mode = bi->bi_mode;
88
89         if (fields & ATTR_ATIME)
90                 inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
91         if (fields & ATTR_MTIME)
92                 inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
93         if (fields & ATTR_CTIME)
94                 inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
95
96         inode->ei_inode         = *bi;
97
98         bch2_inode_flags_to_vfs(inode);
99 }
100
101 int __must_check bch2_write_inode_trans(struct btree_trans *trans,
102                                 struct bch_inode_info *inode,
103                                 struct bch_inode_unpacked *inode_u,
104                                 inode_set_fn set,
105                                 void *p)
106 {
107         struct btree_iter *iter = NULL;
108         struct bkey_inode_buf *inode_p;
109         int ret;
110
111         lockdep_assert_held(&inode->ei_update_lock);
112
113         iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
114                                    POS(inode->v.i_ino, 0),
115                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
116         if (IS_ERR(iter))
117                 return PTR_ERR(iter);
118
119         /* The btree node lock is our lock on the inode: */
120         ret = bch2_btree_iter_traverse(iter);
121         if (ret)
122                 return ret;
123
124         *inode_u = inode->ei_inode;
125
126         if (set) {
127                 ret = set(inode, inode_u, p);
128                 if (ret)
129                         return ret;
130         }
131
132         inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
133         if (IS_ERR(inode_p))
134                 return PTR_ERR(inode_p);
135
136         bch2_inode_pack(inode_p, inode_u);
137         bch2_trans_update(trans, iter, &inode_p->inode.k_i);
138
139         return 0;
140 }
141
142 int __must_check bch2_write_inode(struct bch_fs *c,
143                                   struct bch_inode_info *inode,
144                                   inode_set_fn set,
145                                   void *p, unsigned fields)
146 {
147         struct btree_trans trans;
148         struct bch_inode_unpacked inode_u;
149         int ret;
150
151         bch2_trans_init(&trans, c, 0, 0);
152 retry:
153         bch2_trans_begin(&trans);
154
155         ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?:
156                 bch2_trans_commit(&trans, NULL,
157                                   &inode->ei_journal_seq,
158                                   BTREE_INSERT_ATOMIC|
159                                   BTREE_INSERT_NOUNLOCK|
160                                   BTREE_INSERT_NOFAIL);
161         if (ret == -EINTR)
162                 goto retry;
163
164         /*
165          * the btree node lock protects inode->ei_inode, not ei_update_lock;
166          * this is important for inode updates via bchfs_write_index_update
167          */
168         if (!ret)
169                 bch2_inode_update_after_write(c, inode, &inode_u, fields);
170
171         bch2_trans_exit(&trans);
172         return ret < 0 ? ret : 0;
173 }
174
175 int bch2_fs_quota_transfer(struct bch_fs *c,
176                            struct bch_inode_info *inode,
177                            struct bch_qid new_qid,
178                            unsigned qtypes,
179                            enum quota_acct_mode mode)
180 {
181         unsigned i;
182         int ret;
183
184         qtypes &= enabled_qtypes(c);
185
186         for (i = 0; i < QTYP_NR; i++)
187                 if (new_qid.q[i] == inode->ei_qid.q[i])
188                         qtypes &= ~(1U << i);
189
190         if (!qtypes)
191                 return 0;
192
193         mutex_lock(&inode->ei_quota_lock);
194
195         ret = bch2_quota_transfer(c, qtypes, new_qid,
196                                   inode->ei_qid,
197                                   inode->v.i_blocks +
198                                   inode->ei_quota_reserved,
199                                   mode);
200         if (!ret)
201                 for (i = 0; i < QTYP_NR; i++)
202                         if (qtypes & (1 << i))
203                                 inode->ei_qid.q[i] = new_qid.q[i];
204
205         mutex_unlock(&inode->ei_quota_lock);
206
207         return ret;
208 }
209
210 int bch2_reinherit_attrs_fn(struct bch_inode_info *inode,
211                             struct bch_inode_unpacked *bi,
212                             void *p)
213 {
214         struct bch_inode_info *dir = p;
215         u64 src, dst;
216         unsigned id;
217         int ret = 1;
218
219         for (id = 0; id < Inode_opt_nr; id++) {
220                 if (bi->bi_fields_set & (1 << id))
221                         continue;
222
223                 src = bch2_inode_opt_get(&dir->ei_inode, id);
224                 dst = bch2_inode_opt_get(bi, id);
225
226                 if (src == dst)
227                         continue;
228
229                 bch2_inode_opt_set(bi, id, src);
230                 ret = 0;
231         }
232
233         return ret;
234 }
235
236 struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
237 {
238         struct bch_inode_unpacked inode_u;
239         struct bch_inode_info *inode;
240         int ret;
241
242         inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
243         if (unlikely(!inode))
244                 return ERR_PTR(-ENOMEM);
245         if (!(inode->v.i_state & I_NEW))
246                 return &inode->v;
247
248         ret = bch2_inode_find_by_inum(c, inum, &inode_u);
249         if (ret) {
250                 iget_failed(&inode->v);
251                 return ERR_PTR(ret);
252         }
253
254         bch2_vfs_inode_init(c, inode, &inode_u);
255
256         inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
257
258         unlock_new_inode(&inode->v);
259
260         return &inode->v;
261 }
262
263 static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u,
264                                   const struct inode *dir, umode_t mode)
265 {
266         kuid_t uid = current_fsuid();
267         kgid_t gid;
268
269         if (dir && dir->i_mode & S_ISGID) {
270                 gid = dir->i_gid;
271                 if (S_ISDIR(mode))
272                         mode |= S_ISGID;
273         } else
274                 gid = current_fsgid();
275
276         inode_u->bi_uid         = from_kuid(dir->i_sb->s_user_ns, uid);
277         inode_u->bi_gid         = from_kgid(dir->i_sb->s_user_ns, gid);
278         inode_u->bi_mode        = mode;
279 }
280
281 static int inode_update_for_create_fn(struct bch_inode_info *inode,
282                                       struct bch_inode_unpacked *bi,
283                                       void *p)
284 {
285         struct bch_fs *c = inode->v.i_sb->s_fs_info;
286         struct bch_inode_unpacked *new_inode = p;
287
288         bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
289
290         if (S_ISDIR(new_inode->bi_mode))
291                 bi->bi_nlink++;
292
293         return 0;
294 }
295
296 static struct bch_inode_info *
297 __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
298               umode_t mode, dev_t rdev, bool tmpfile)
299 {
300         struct bch_fs *c = dir->v.i_sb->s_fs_info;
301         struct btree_trans trans;
302         struct bch_inode_unpacked dir_u;
303         struct bch_inode_info *inode, *old;
304         struct bch_inode_unpacked inode_u;
305         struct bch_hash_info hash_info;
306         struct posix_acl *default_acl = NULL, *acl = NULL;
307         u64 journal_seq = 0;
308         int ret;
309
310         bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode);
311         bch2_inode_init_owner(&inode_u, &dir->v, mode);
312
313         hash_info = bch2_hash_info_init(c, &inode_u);
314
315         if (tmpfile)
316                 inode_u.bi_flags |= BCH_INODE_UNLINKED;
317
318         ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
319                               KEY_TYPE_QUOTA_PREALLOC);
320         if (ret)
321                 return ERR_PTR(ret);
322
323 #ifdef CONFIG_BCACHEFS_POSIX_ACL
324         ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl);
325         if (ret)
326                 goto err;
327 #endif
328
329         /*
330          * preallocate vfs inode before btree transaction, so that nothing can
331          * fail after the transaction succeeds:
332          */
333         inode = to_bch_ei(new_inode(c->vfs_sb));
334         if (unlikely(!inode)) {
335                 ret = -ENOMEM;
336                 goto err;
337         }
338
339         if (!tmpfile)
340                 mutex_lock(&dir->ei_update_lock);
341
342         bch2_trans_init(&trans, c, 8, 1024);
343 retry:
344         bch2_trans_begin(&trans);
345
346         ret   = __bch2_inode_create(&trans, &inode_u,
347                                     BLOCKDEV_INODE_MAX, 0,
348                                     &c->unused_inode_hint) ?:
349                 (default_acl
350                  ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
351                                       default_acl, ACL_TYPE_DEFAULT)
352                  : 0) ?:
353                 (acl
354                  ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
355                                       acl, ACL_TYPE_ACCESS)
356                  : 0) ?:
357                 (!tmpfile
358                  ? __bch2_dirent_create(&trans, dir->v.i_ino,
359                                         &dir->ei_str_hash,
360                                         mode_to_type(mode),
361                                         &dentry->d_name,
362                                         inode_u.bi_inum,
363                                         BCH_HASH_SET_MUST_CREATE)
364                 : 0) ?:
365                 (!tmpfile
366                  ? bch2_write_inode_trans(&trans, dir, &dir_u,
367                                           inode_update_for_create_fn,
368                                           &inode_u)
369                  : 0) ?:
370                 bch2_trans_commit(&trans, NULL,
371                                   &journal_seq,
372                                   BTREE_INSERT_ATOMIC|
373                                   BTREE_INSERT_NOUNLOCK);
374         if (ret == -EINTR)
375                 goto retry;
376         if (unlikely(ret))
377                 goto err_trans;
378
379         if (!tmpfile) {
380                 bch2_inode_update_after_write(c, dir, &dir_u,
381                                               ATTR_MTIME|ATTR_CTIME);
382                 journal_seq_copy(dir, journal_seq);
383                 mutex_unlock(&dir->ei_update_lock);
384         }
385
386         bch2_vfs_inode_init(c, inode, &inode_u);
387         journal_seq_copy(inode, journal_seq);
388
389         set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
390         set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
391
392         /*
393          * we must insert the new inode into the inode cache before calling
394          * bch2_trans_exit() and dropping locks, else we could race with another
395          * thread pulling the inode in and modifying it:
396          */
397
398         old = to_bch_ei(insert_inode_locked2(&inode->v));
399         if (unlikely(old)) {
400                 /*
401                  * We raced, another process pulled the new inode into cache
402                  * before us:
403                  */
404                 old->ei_journal_seq = inode->ei_journal_seq;
405                 make_bad_inode(&inode->v);
406                 iput(&inode->v);
407
408                 inode = old;
409         } else {
410                 /*
411                  * we really don't want insert_inode_locked2() to be setting
412                  * I_NEW...
413                  */
414                 unlock_new_inode(&inode->v);
415         }
416
417         bch2_trans_exit(&trans);
418 out:
419         posix_acl_release(default_acl);
420         posix_acl_release(acl);
421         return inode;
422 err_trans:
423         if (!tmpfile)
424                 mutex_unlock(&dir->ei_update_lock);
425
426         bch2_trans_exit(&trans);
427         make_bad_inode(&inode->v);
428         iput(&inode->v);
429 err:
430         bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN);
431         inode = ERR_PTR(ret);
432         goto out;
433 }
434
435 /* methods */
436
437 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
438                                   unsigned int flags)
439 {
440         struct bch_fs *c = vdir->i_sb->s_fs_info;
441         struct bch_inode_info *dir = to_bch_ei(vdir);
442         struct inode *vinode = NULL;
443         u64 inum;
444
445         inum = bch2_dirent_lookup(c, dir->v.i_ino,
446                                   &dir->ei_str_hash,
447                                   &dentry->d_name);
448
449         if (inum)
450                 vinode = bch2_vfs_inode_get(c, inum);
451
452         return d_splice_alias(vinode, dentry);
453 }
454
455 static int bch2_create(struct inode *vdir, struct dentry *dentry,
456                        umode_t mode, bool excl)
457 {
458         struct bch_inode_info *inode =
459                 __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false);
460
461         if (IS_ERR(inode))
462                 return PTR_ERR(inode);
463
464         d_instantiate(dentry, &inode->v);
465         return 0;
466 }
467
468 static int inode_update_for_link_fn(struct bch_inode_info *inode,
469                                     struct bch_inode_unpacked *bi,
470                                     void *p)
471 {
472         struct bch_fs *c = inode->v.i_sb->s_fs_info;
473
474         bi->bi_ctime = bch2_current_time(c);
475         bch2_inode_nlink_inc(bi);
476         return 0;
477 }
478
479 static int __bch2_link(struct bch_fs *c,
480                        struct bch_inode_info *inode,
481                        struct bch_inode_info *dir,
482                        struct dentry *dentry)
483 {
484         struct btree_trans trans;
485         struct bch_inode_unpacked inode_u;
486         int ret;
487
488         mutex_lock(&inode->ei_update_lock);
489         bch2_trans_init(&trans, c, 4, 1024);
490 retry:
491         bch2_trans_begin(&trans);
492
493         ret   = __bch2_dirent_create(&trans, dir->v.i_ino,
494                                      &dir->ei_str_hash,
495                                      mode_to_type(inode->v.i_mode),
496                                      &dentry->d_name,
497                                      inode->v.i_ino,
498                                      BCH_HASH_SET_MUST_CREATE) ?:
499                 bch2_write_inode_trans(&trans, inode, &inode_u,
500                                        inode_update_for_link_fn,
501                                        NULL) ?:
502                 bch2_trans_commit(&trans, NULL,
503                                   &inode->ei_journal_seq,
504                                   BTREE_INSERT_ATOMIC|
505                                   BTREE_INSERT_NOUNLOCK);
506
507         if (ret == -EINTR)
508                 goto retry;
509
510         if (likely(!ret))
511                 bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
512
513         bch2_trans_exit(&trans);
514         mutex_unlock(&inode->ei_update_lock);
515         return ret;
516 }
517
518 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
519                      struct dentry *dentry)
520 {
521         struct bch_fs *c = vdir->i_sb->s_fs_info;
522         struct bch_inode_info *dir = to_bch_ei(vdir);
523         struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
524         int ret;
525
526         lockdep_assert_held(&inode->v.i_rwsem);
527
528         ret = __bch2_link(c, inode, dir, dentry);
529         if (unlikely(ret))
530                 return ret;
531
532         ihold(&inode->v);
533         d_instantiate(dentry, &inode->v);
534         return 0;
535 }
536
537 static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode,
538                                           struct bch_inode_unpacked *bi,
539                                           void *p)
540 {
541         struct bch_fs *c = inode->v.i_sb->s_fs_info;
542         struct bch_inode_info *unlink_inode = p;
543
544         bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
545
546         bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode);
547
548         return 0;
549 }
550
551 static int inode_update_for_unlink_fn(struct bch_inode_info *inode,
552                                       struct bch_inode_unpacked *bi,
553                                       void *p)
554 {
555         struct bch_fs *c = inode->v.i_sb->s_fs_info;
556
557         bi->bi_ctime = bch2_current_time(c);
558         bch2_inode_nlink_dec(bi);
559         return 0;
560 }
561
562 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
563 {
564         struct bch_fs *c = vdir->i_sb->s_fs_info;
565         struct bch_inode_info *dir = to_bch_ei(vdir);
566         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
567         struct bch_inode_unpacked dir_u, inode_u;
568         struct btree_trans trans;
569         int ret;
570
571         bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
572         bch2_trans_init(&trans, c, 4, 1024);
573 retry:
574         bch2_trans_begin(&trans);
575
576         ret   = __bch2_dirent_delete(&trans, dir->v.i_ino,
577                                      &dir->ei_str_hash,
578                                      &dentry->d_name) ?:
579                 bch2_write_inode_trans(&trans, dir, &dir_u,
580                                        inode_update_dir_for_unlink_fn,
581                                        inode) ?:
582                 bch2_write_inode_trans(&trans, inode, &inode_u,
583                                        inode_update_for_unlink_fn,
584                                        NULL) ?:
585                 bch2_trans_commit(&trans, NULL,
586                                   &dir->ei_journal_seq,
587                                   BTREE_INSERT_ATOMIC|
588                                   BTREE_INSERT_NOUNLOCK|
589                                   BTREE_INSERT_NOFAIL);
590         if (ret == -EINTR)
591                 goto retry;
592         if (ret)
593                 goto err;
594
595         if (dir->ei_journal_seq > inode->ei_journal_seq)
596                 inode->ei_journal_seq = dir->ei_journal_seq;
597
598         bch2_inode_update_after_write(c, dir, &dir_u,
599                                       ATTR_MTIME|ATTR_CTIME);
600         bch2_inode_update_after_write(c, inode, &inode_u,
601                                       ATTR_MTIME);
602 err:
603         bch2_trans_exit(&trans);
604         bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
605
606         return ret;
607 }
608
609 static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
610                         const char *symname)
611 {
612         struct bch_fs *c = vdir->i_sb->s_fs_info;
613         struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
614         int ret;
615
616         inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
617         if (unlikely(IS_ERR(inode)))
618                 return PTR_ERR(inode);
619
620         inode_lock(&inode->v);
621         ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
622         inode_unlock(&inode->v);
623
624         if (unlikely(ret))
625                 goto err;
626
627         ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
628         if (unlikely(ret))
629                 goto err;
630
631         journal_seq_copy(dir, inode->ei_journal_seq);
632
633         ret = __bch2_link(c, inode, dir, dentry);
634         if (unlikely(ret))
635                 goto err;
636
637         d_instantiate(dentry, &inode->v);
638         return 0;
639 err:
640         iput(&inode->v);
641         return ret;
642 }
643
644 static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
645 {
646         struct bch_inode_info *inode =
647                 __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false);
648
649         if (IS_ERR(inode))
650                 return PTR_ERR(inode);
651
652         d_instantiate(dentry, &inode->v);
653         return 0;
654 }
655
656 static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
657 {
658         struct bch_fs *c = vdir->i_sb->s_fs_info;
659
660         if (bch2_empty_dir(c, dentry->d_inode->i_ino))
661                 return -ENOTEMPTY;
662
663         return bch2_unlink(vdir, dentry);
664 }
665
666 static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
667                       umode_t mode, dev_t rdev)
668 {
669         struct bch_inode_info *inode =
670                 __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false);
671
672         if (IS_ERR(inode))
673                 return PTR_ERR(inode);
674
675         d_instantiate(dentry, &inode->v);
676         return 0;
677 }
678
679 struct rename_info {
680         u64                     now;
681         struct bch_inode_info   *src_dir;
682         struct bch_inode_info   *dst_dir;
683         struct bch_inode_info   *src_inode;
684         struct bch_inode_info   *dst_inode;
685         enum bch_rename_mode    mode;
686 };
687
688 static int inode_update_for_rename_fn(struct bch_inode_info *inode,
689                                       struct bch_inode_unpacked *bi,
690                                       void *p)
691 {
692         struct rename_info *info = p;
693         int ret;
694
695         if (inode == info->src_dir) {
696                 bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode);
697                 bi->bi_nlink += info->dst_inode &&
698                         S_ISDIR(info->dst_inode->v.i_mode) &&
699                         info->mode == BCH_RENAME_EXCHANGE;
700         }
701
702         if (inode == info->dst_dir) {
703                 bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode);
704                 bi->bi_nlink -= info->dst_inode &&
705                         S_ISDIR(info->dst_inode->v.i_mode);
706         }
707
708         if (inode == info->src_inode) {
709                 ret = bch2_reinherit_attrs_fn(inode, bi, info->dst_dir);
710
711                 BUG_ON(!ret && S_ISDIR(info->src_inode->v.i_mode));
712         }
713
714         if (inode == info->dst_inode &&
715             info->mode == BCH_RENAME_EXCHANGE) {
716                 ret = bch2_reinherit_attrs_fn(inode, bi, info->src_dir);
717
718                 BUG_ON(!ret && S_ISDIR(info->dst_inode->v.i_mode));
719         }
720
721         if (inode == info->dst_inode &&
722             info->mode == BCH_RENAME_OVERWRITE) {
723                 BUG_ON(bi->bi_nlink &&
724                        S_ISDIR(info->dst_inode->v.i_mode));
725
726                 bch2_inode_nlink_dec(bi);
727         }
728
729         if (inode == info->src_dir ||
730             inode == info->dst_dir)
731                 bi->bi_mtime = info->now;
732         bi->bi_ctime = info->now;
733
734         return 0;
735 }
736
737 static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
738                         struct inode *dst_vdir, struct dentry *dst_dentry,
739                         unsigned flags)
740 {
741         struct bch_fs *c = src_vdir->i_sb->s_fs_info;
742         struct rename_info i = {
743                 .src_dir        = to_bch_ei(src_vdir),
744                 .dst_dir        = to_bch_ei(dst_vdir),
745                 .src_inode      = to_bch_ei(src_dentry->d_inode),
746                 .dst_inode      = to_bch_ei(dst_dentry->d_inode),
747                 .mode           = flags & RENAME_EXCHANGE
748                                 ? BCH_RENAME_EXCHANGE
749                         : dst_dentry->d_inode
750                                 ? BCH_RENAME_OVERWRITE : BCH_RENAME,
751         };
752         struct btree_trans trans;
753         struct bch_inode_unpacked dst_dir_u, src_dir_u;
754         struct bch_inode_unpacked src_inode_u, dst_inode_u;
755         u64 journal_seq = 0;
756         int ret;
757
758         if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
759                 return -EINVAL;
760
761         if (i.mode == BCH_RENAME_OVERWRITE) {
762                 if (S_ISDIR(i.src_inode->v.i_mode) !=
763                     S_ISDIR(i.dst_inode->v.i_mode))
764                         return -ENOTDIR;
765
766                 if (S_ISDIR(i.src_inode->v.i_mode) &&
767                     bch2_empty_dir(c, i.dst_inode->v.i_ino))
768                         return -ENOTEMPTY;
769
770                 ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping,
771                                                    0, LLONG_MAX);
772                 if (ret)
773                         return ret;
774         }
775
776         bch2_trans_init(&trans, c, 8, 2048);
777
778         bch2_lock_inodes(INODE_UPDATE_LOCK,
779                          i.src_dir,
780                          i.dst_dir,
781                          i.src_inode,
782                          i.dst_inode);
783
784         if (S_ISDIR(i.src_inode->v.i_mode) &&
785             inode_attrs_changing(i.dst_dir, i.src_inode)) {
786                 ret = -EXDEV;
787                 goto err;
788         }
789
790         if (i.mode == BCH_RENAME_EXCHANGE &&
791             S_ISDIR(i.dst_inode->v.i_mode) &&
792             inode_attrs_changing(i.src_dir, i.dst_inode)) {
793                 ret = -EXDEV;
794                 goto err;
795         }
796
797         if (inode_attr_changing(i.dst_dir, i.src_inode, Inode_opt_project)) {
798                 ret = bch2_fs_quota_transfer(c, i.src_inode,
799                                              i.dst_dir->ei_qid,
800                                              1 << QTYP_PRJ,
801                                              KEY_TYPE_QUOTA_PREALLOC);
802                 if (ret)
803                         goto err;
804         }
805
806         if (i.mode == BCH_RENAME_EXCHANGE &&
807             inode_attr_changing(i.src_dir, i.dst_inode, Inode_opt_project)) {
808                 ret = bch2_fs_quota_transfer(c, i.dst_inode,
809                                              i.src_dir->ei_qid,
810                                              1 << QTYP_PRJ,
811                                              KEY_TYPE_QUOTA_PREALLOC);
812                 if (ret)
813                         goto err;
814         }
815
816 retry:
817         bch2_trans_begin(&trans);
818         i.now = bch2_current_time(c);
819
820         ret   = bch2_dirent_rename(&trans,
821                                    i.src_dir, &src_dentry->d_name,
822                                    i.dst_dir, &dst_dentry->d_name,
823                                    i.mode) ?:
824                 bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u,
825                                        inode_update_for_rename_fn, &i) ?:
826                 (i.src_dir != i.dst_dir
827                  ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u,
828                                        inode_update_for_rename_fn, &i)
829                  : 0 ) ?:
830                 bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u,
831                                        inode_update_for_rename_fn, &i) ?:
832                 (i.dst_inode
833                  ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u,
834                                        inode_update_for_rename_fn, &i)
835                  : 0 ) ?:
836                 bch2_trans_commit(&trans, NULL,
837                                   &journal_seq,
838                                   BTREE_INSERT_ATOMIC|
839                                   BTREE_INSERT_NOUNLOCK);
840         if (ret == -EINTR)
841                 goto retry;
842         if (unlikely(ret))
843                 goto err;
844
845         bch2_inode_update_after_write(c, i.src_dir, &src_dir_u,
846                                       ATTR_MTIME|ATTR_CTIME);
847         journal_seq_copy(i.src_dir, journal_seq);
848
849         if (i.src_dir != i.dst_dir) {
850                 bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u,
851                                               ATTR_MTIME|ATTR_CTIME);
852                 journal_seq_copy(i.dst_dir, journal_seq);
853         }
854
855         journal_seq_copy(i.src_inode, journal_seq);
856         if (i.dst_inode)
857                 journal_seq_copy(i.dst_inode, journal_seq);
858
859         bch2_inode_update_after_write(c, i.src_inode, &src_inode_u,
860                                       ATTR_CTIME);
861         if (i.dst_inode)
862                 bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u,
863                                               ATTR_CTIME);
864 err:
865         bch2_trans_exit(&trans);
866
867         bch2_fs_quota_transfer(c, i.src_inode,
868                                bch_qid(&i.src_inode->ei_inode),
869                                1 << QTYP_PRJ,
870                                KEY_TYPE_QUOTA_NOCHECK);
871         if (i.dst_inode)
872                 bch2_fs_quota_transfer(c, i.dst_inode,
873                                        bch_qid(&i.dst_inode->ei_inode),
874                                        1 << QTYP_PRJ,
875                                        KEY_TYPE_QUOTA_NOCHECK);
876
877         bch2_unlock_inodes(INODE_UPDATE_LOCK,
878                            i.src_dir,
879                            i.dst_dir,
880                            i.src_inode,
881                            i.dst_inode);
882
883         return ret;
884 }
885
886 static int inode_update_for_setattr_fn(struct bch_inode_info *inode,
887                                        struct bch_inode_unpacked *bi,
888                                        void *p)
889 {
890         struct bch_fs *c = inode->v.i_sb->s_fs_info;
891         struct iattr *attr = p;
892         unsigned int ia_valid = attr->ia_valid;
893
894         if (ia_valid & ATTR_UID)
895                 bi->bi_uid = from_kuid(inode->v.i_sb->s_user_ns, attr->ia_uid);
896         if (ia_valid & ATTR_GID)
897                 bi->bi_gid = from_kgid(inode->v.i_sb->s_user_ns, attr->ia_gid);
898
899         if (ia_valid & ATTR_ATIME)
900                 bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
901         if (ia_valid & ATTR_MTIME)
902                 bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
903         if (ia_valid & ATTR_CTIME)
904                 bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
905
906         if (ia_valid & ATTR_MODE) {
907                 umode_t mode = attr->ia_mode;
908                 kgid_t gid = ia_valid & ATTR_GID
909                         ? attr->ia_gid
910                         : inode->v.i_gid;
911
912                 if (!in_group_p(gid) &&
913                     !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID))
914                         mode &= ~S_ISGID;
915                 bi->bi_mode = mode;
916         }
917
918         return 0;
919 }
920
921 static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
922 {
923         struct bch_fs *c = inode->v.i_sb->s_fs_info;
924         struct bch_qid qid;
925         struct btree_trans trans;
926         struct bch_inode_unpacked inode_u;
927         struct posix_acl *acl = NULL;
928         int ret;
929
930         mutex_lock(&inode->ei_update_lock);
931
932         qid = inode->ei_qid;
933
934         if (iattr->ia_valid & ATTR_UID)
935                 qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid);
936
937         if (iattr->ia_valid & ATTR_GID)
938                 qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid);
939
940         ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
941                                      KEY_TYPE_QUOTA_PREALLOC);
942         if (ret)
943                 goto err;
944
945         bch2_trans_init(&trans, c, 0, 0);
946 retry:
947         bch2_trans_begin(&trans);
948         kfree(acl);
949         acl = NULL;
950
951         ret = bch2_write_inode_trans(&trans, inode, &inode_u,
952                                 inode_update_for_setattr_fn, iattr) ?:
953                 (iattr->ia_valid & ATTR_MODE
954                  ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl)
955                  : 0) ?:
956                 bch2_trans_commit(&trans, NULL,
957                                   &inode->ei_journal_seq,
958                                   BTREE_INSERT_ATOMIC|
959                                   BTREE_INSERT_NOUNLOCK|
960                                   BTREE_INSERT_NOFAIL);
961         if (ret == -EINTR)
962                 goto retry;
963         if (unlikely(ret))
964                 goto err_trans;
965
966         bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid);
967
968         if (acl)
969                 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
970 err_trans:
971         bch2_trans_exit(&trans);
972 err:
973         mutex_unlock(&inode->ei_update_lock);
974
975         return ret;
976 }
977
978 static int bch2_getattr(const struct path *path, struct kstat *stat,
979                         u32 request_mask, unsigned query_flags)
980 {
981         struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
982         struct bch_fs *c = inode->v.i_sb->s_fs_info;
983
984         stat->dev       = inode->v.i_sb->s_dev;
985         stat->ino       = inode->v.i_ino;
986         stat->mode      = inode->v.i_mode;
987         stat->nlink     = inode->v.i_nlink;
988         stat->uid       = inode->v.i_uid;
989         stat->gid       = inode->v.i_gid;
990         stat->rdev      = inode->v.i_rdev;
991         stat->size      = i_size_read(&inode->v);
992         stat->atime     = inode->v.i_atime;
993         stat->mtime     = inode->v.i_mtime;
994         stat->ctime     = inode->v.i_ctime;
995         stat->blksize   = block_bytes(c);
996         stat->blocks    = inode->v.i_blocks;
997
998         if (request_mask & STATX_BTIME) {
999                 stat->result_mask |= STATX_BTIME;
1000                 stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
1001         }
1002
1003         if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
1004                 stat->attributes |= STATX_ATTR_IMMUTABLE;
1005         if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
1006                 stat->attributes |= STATX_ATTR_APPEND;
1007         if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
1008                 stat->attributes |= STATX_ATTR_NODUMP;
1009
1010         return 0;
1011 }
1012
1013 static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
1014 {
1015         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
1016         int ret;
1017
1018         lockdep_assert_held(&inode->v.i_rwsem);
1019
1020         ret = setattr_prepare(dentry, iattr);
1021         if (ret)
1022                 return ret;
1023
1024         return iattr->ia_valid & ATTR_SIZE
1025                 ? bch2_truncate(inode, iattr)
1026                 : bch2_setattr_nonsize(inode, iattr);
1027 }
1028
1029 static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
1030 {
1031         struct bch_inode_info *inode =
1032                 __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true);
1033
1034         if (IS_ERR(inode))
1035                 return PTR_ERR(inode);
1036
1037         d_mark_tmpfile(dentry, &inode->v);
1038         d_instantiate(dentry, &inode->v);
1039         return 0;
1040 }
1041
1042 static int bch2_fill_extent(struct bch_fs *c,
1043                             struct fiemap_extent_info *info,
1044                             struct bkey_s_c k, unsigned flags)
1045 {
1046         if (bkey_extent_is_data(k.k)) {
1047                 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1048                 const union bch_extent_entry *entry;
1049                 struct extent_ptr_decoded p;
1050                 int ret;
1051
1052                 if (k.k->type == KEY_TYPE_reflink_v)
1053                         flags |= FIEMAP_EXTENT_SHARED;
1054
1055                 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
1056                         int flags2 = 0;
1057                         u64 offset = p.ptr.offset;
1058
1059                         if (p.crc.compression_type)
1060                                 flags2 |= FIEMAP_EXTENT_ENCODED;
1061                         else
1062                                 offset += p.crc.offset;
1063
1064                         if ((offset & (c->opts.block_size - 1)) ||
1065                             (k.k->size & (c->opts.block_size - 1)))
1066                                 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
1067
1068                         ret = fiemap_fill_next_extent(info,
1069                                                 bkey_start_offset(k.k) << 9,
1070                                                 offset << 9,
1071                                                 k.k->size << 9, flags|flags2);
1072                         if (ret)
1073                                 return ret;
1074                 }
1075
1076                 return 0;
1077         } else if (k.k->type == KEY_TYPE_reservation) {
1078                 return fiemap_fill_next_extent(info,
1079                                                bkey_start_offset(k.k) << 9,
1080                                                0, k.k->size << 9,
1081                                                flags|
1082                                                FIEMAP_EXTENT_DELALLOC|
1083                                                FIEMAP_EXTENT_UNWRITTEN);
1084         } else {
1085                 BUG();
1086         }
1087 }
1088
1089 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
1090                        u64 start, u64 len)
1091 {
1092         struct bch_fs *c = vinode->i_sb->s_fs_info;
1093         struct bch_inode_info *ei = to_bch_ei(vinode);
1094         struct btree_trans trans;
1095         struct btree_iter *iter;
1096         struct bkey_s_c k;
1097         BKEY_PADDED(k) cur, prev;
1098         struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
1099         unsigned offset_into_extent, sectors;
1100         bool have_extent = false;
1101         int ret = 0;
1102
1103         if (start + len < start)
1104                 return -EINVAL;
1105
1106         bch2_trans_init(&trans, c, 0, 0);
1107
1108         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1109                                    POS(ei->v.i_ino, start >> 9), 0);
1110 retry:
1111         while ((k = bch2_btree_iter_peek(iter)).k &&
1112                !(ret = bkey_err(k)) &&
1113                bkey_cmp(iter->pos, end) < 0) {
1114                 if (!bkey_extent_is_data(k.k) &&
1115                     k.k->type != KEY_TYPE_reservation) {
1116                         bch2_btree_iter_next(iter);
1117                         continue;
1118                 }
1119
1120                 bkey_reassemble(&cur.k, k);
1121                 k = bkey_i_to_s_c(&cur.k);
1122
1123                 offset_into_extent      = iter->pos.offset -
1124                         bkey_start_offset(k.k);
1125                 sectors                 = k.k->size - offset_into_extent;
1126
1127                 ret = bch2_read_indirect_extent(&trans,
1128                                         &offset_into_extent, &cur.k);
1129                 if (ret)
1130                         break;
1131
1132                 sectors = min(sectors, k.k->size - offset_into_extent);
1133
1134                 if (offset_into_extent)
1135                         bch2_cut_front(POS(k.k->p.inode,
1136                                            bkey_start_offset(k.k) +
1137                                            offset_into_extent),
1138                                        &cur.k);
1139                 bch2_key_resize(&cur.k.k, sectors);
1140                 cur.k.k.p = iter->pos;
1141                 cur.k.k.p.offset += cur.k.k.size;
1142
1143                 if (have_extent) {
1144                         ret = bch2_fill_extent(c, info,
1145                                         bkey_i_to_s_c(&prev.k), 0);
1146                         if (ret)
1147                                 break;
1148                 }
1149
1150                 bkey_copy(&prev.k, &cur.k);
1151                 have_extent = true;
1152
1153                 if (k.k->type == KEY_TYPE_reflink_v)
1154                         bch2_btree_iter_set_pos(iter, k.k->p);
1155                 else
1156                         bch2_btree_iter_next(iter);
1157         }
1158
1159         if (ret == -EINTR)
1160                 goto retry;
1161
1162         if (!ret && have_extent)
1163                 ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k),
1164                                        FIEMAP_EXTENT_LAST);
1165
1166         ret = bch2_trans_exit(&trans) ?: ret;
1167         return ret < 0 ? ret : 0;
1168 }
1169
1170 static const struct vm_operations_struct bch_vm_ops = {
1171         .fault          = filemap_fault,
1172         .map_pages      = filemap_map_pages,
1173         .page_mkwrite   = bch2_page_mkwrite,
1174 };
1175
1176 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1177 {
1178         file_accessed(file);
1179
1180         vma->vm_ops = &bch_vm_ops;
1181         return 0;
1182 }
1183
1184 /* Directories: */
1185
1186 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
1187 {
1188         return generic_file_llseek_size(file, offset, whence,
1189                                         S64_MAX, S64_MAX);
1190 }
1191
1192 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1193 {
1194         struct bch_fs *c = file_inode(file)->i_sb->s_fs_info;
1195
1196         return bch2_readdir(c, file, ctx);
1197 }
1198
1199 static const struct file_operations bch_file_operations = {
1200         .llseek         = bch2_llseek,
1201         .read_iter      = generic_file_read_iter,
1202         .write_iter     = bch2_write_iter,
1203         .mmap           = bch2_mmap,
1204         .open           = generic_file_open,
1205         .fsync          = bch2_fsync,
1206         .splice_read    = generic_file_splice_read,
1207         .splice_write   = iter_file_splice_write,
1208         .fallocate      = bch2_fallocate_dispatch,
1209         .unlocked_ioctl = bch2_fs_file_ioctl,
1210 #ifdef CONFIG_COMPAT
1211         .compat_ioctl   = bch2_compat_fs_ioctl,
1212 #endif
1213         .remap_file_range = bch2_remap_file_range,
1214 };
1215
1216 static const struct inode_operations bch_file_inode_operations = {
1217         .getattr        = bch2_getattr,
1218         .setattr        = bch2_setattr,
1219         .fiemap         = bch2_fiemap,
1220         .listxattr      = bch2_xattr_list,
1221 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1222         .get_acl        = bch2_get_acl,
1223         .set_acl        = bch2_set_acl,
1224 #endif
1225 };
1226
1227 static const struct inode_operations bch_dir_inode_operations = {
1228         .lookup         = bch2_lookup,
1229         .create         = bch2_create,
1230         .link           = bch2_link,
1231         .unlink         = bch2_unlink,
1232         .symlink        = bch2_symlink,
1233         .mkdir          = bch2_mkdir,
1234         .rmdir          = bch2_rmdir,
1235         .mknod          = bch2_mknod,
1236         .rename         = bch2_rename2,
1237         .getattr        = bch2_getattr,
1238         .setattr        = bch2_setattr,
1239         .tmpfile        = bch2_tmpfile,
1240         .listxattr      = bch2_xattr_list,
1241 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1242         .get_acl        = bch2_get_acl,
1243         .set_acl        = bch2_set_acl,
1244 #endif
1245 };
1246
1247 static const struct file_operations bch_dir_file_operations = {
1248         .llseek         = bch2_dir_llseek,
1249         .read           = generic_read_dir,
1250         .iterate        = bch2_vfs_readdir,
1251         .fsync          = bch2_fsync,
1252         .unlocked_ioctl = bch2_fs_file_ioctl,
1253 #ifdef CONFIG_COMPAT
1254         .compat_ioctl   = bch2_compat_fs_ioctl,
1255 #endif
1256 };
1257
1258 static const struct inode_operations bch_symlink_inode_operations = {
1259         .get_link       = page_get_link,
1260         .getattr        = bch2_getattr,
1261         .setattr        = bch2_setattr,
1262         .listxattr      = bch2_xattr_list,
1263 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1264         .get_acl        = bch2_get_acl,
1265         .set_acl        = bch2_set_acl,
1266 #endif
1267 };
1268
1269 static const struct inode_operations bch_special_inode_operations = {
1270         .getattr        = bch2_getattr,
1271         .setattr        = bch2_setattr,
1272         .listxattr      = bch2_xattr_list,
1273 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1274         .get_acl        = bch2_get_acl,
1275         .set_acl        = bch2_set_acl,
1276 #endif
1277 };
1278
1279 static const struct address_space_operations bch_address_space_operations = {
1280         .writepage      = bch2_writepage,
1281         .readpage       = bch2_readpage,
1282         .writepages     = bch2_writepages,
1283         .readpages      = bch2_readpages,
1284         .set_page_dirty = __set_page_dirty_nobuffers,
1285         .write_begin    = bch2_write_begin,
1286         .write_end      = bch2_write_end,
1287         .invalidatepage = bch2_invalidatepage,
1288         .releasepage    = bch2_releasepage,
1289         .direct_IO      = bch2_direct_IO,
1290 #ifdef CONFIG_MIGRATION
1291         .migratepage    = bch2_migrate_page,
1292 #endif
1293         .error_remove_page = generic_error_remove_page,
1294 };
1295
1296 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1297                 u64 ino, u32 generation)
1298 {
1299         struct bch_fs *c = sb->s_fs_info;
1300         struct inode *vinode;
1301
1302         if (ino < BCACHEFS_ROOT_INO)
1303                 return ERR_PTR(-ESTALE);
1304
1305         vinode = bch2_vfs_inode_get(c, ino);
1306         if (IS_ERR(vinode))
1307                 return ERR_CAST(vinode);
1308         if (generation && vinode->i_generation != generation) {
1309                 /* we didn't find the right inode.. */
1310                 iput(vinode);
1311                 return ERR_PTR(-ESTALE);
1312         }
1313         return vinode;
1314 }
1315
1316 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
1317                 int fh_len, int fh_type)
1318 {
1319         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1320                                     bch2_nfs_get_inode);
1321 }
1322
1323 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
1324                 int fh_len, int fh_type)
1325 {
1326         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1327                                     bch2_nfs_get_inode);
1328 }
1329
1330 static const struct export_operations bch_export_ops = {
1331         .fh_to_dentry   = bch2_fh_to_dentry,
1332         .fh_to_parent   = bch2_fh_to_parent,
1333         //.get_parent   = bch2_get_parent,
1334 };
1335
1336 static void bch2_vfs_inode_init(struct bch_fs *c,
1337                                 struct bch_inode_info *inode,
1338                                 struct bch_inode_unpacked *bi)
1339 {
1340         bch2_inode_update_after_write(c, inode, bi, ~0);
1341
1342         inode->v.i_blocks       = bi->bi_sectors;
1343         inode->v.i_ino          = bi->bi_inum;
1344         inode->v.i_rdev         = bi->bi_dev;
1345         inode->v.i_generation   = bi->bi_generation;
1346         inode->v.i_size         = bi->bi_size;
1347
1348         inode->ei_journal_seq   = 0;
1349         inode->ei_quota_reserved = 0;
1350         inode->ei_str_hash      = bch2_hash_info_init(c, bi);
1351         inode->ei_qid           = bch_qid(bi);
1352
1353         inode->v.i_mapping->a_ops = &bch_address_space_operations;
1354
1355         switch (inode->v.i_mode & S_IFMT) {
1356         case S_IFREG:
1357                 inode->v.i_op   = &bch_file_inode_operations;
1358                 inode->v.i_fop  = &bch_file_operations;
1359                 break;
1360         case S_IFDIR:
1361                 inode->v.i_op   = &bch_dir_inode_operations;
1362                 inode->v.i_fop  = &bch_dir_file_operations;
1363                 break;
1364         case S_IFLNK:
1365                 inode_nohighmem(&inode->v);
1366                 inode->v.i_op   = &bch_symlink_inode_operations;
1367                 break;
1368         default:
1369                 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1370                 inode->v.i_op   = &bch_special_inode_operations;
1371                 break;
1372         }
1373 }
1374
1375 static struct inode *bch2_alloc_inode(struct super_block *sb)
1376 {
1377         struct bch_inode_info *inode;
1378
1379         inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1380         if (!inode)
1381                 return NULL;
1382
1383         inode_init_once(&inode->v);
1384         mutex_init(&inode->ei_update_lock);
1385         mutex_init(&inode->ei_quota_lock);
1386         inode->ei_journal_seq = 0;
1387
1388         return &inode->v;
1389 }
1390
1391 static void bch2_i_callback(struct rcu_head *head)
1392 {
1393         struct inode *vinode = container_of(head, struct inode, i_rcu);
1394         struct bch_inode_info *inode = to_bch_ei(vinode);
1395
1396         kmem_cache_free(bch2_inode_cache, inode);
1397 }
1398
1399 static void bch2_destroy_inode(struct inode *vinode)
1400 {
1401         call_rcu(&vinode->i_rcu, bch2_i_callback);
1402 }
1403
1404 static int inode_update_times_fn(struct bch_inode_info *inode,
1405                                  struct bch_inode_unpacked *bi,
1406                                  void *p)
1407 {
1408         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1409
1410         bi->bi_atime    = timespec_to_bch2_time(c, inode->v.i_atime);
1411         bi->bi_mtime    = timespec_to_bch2_time(c, inode->v.i_mtime);
1412         bi->bi_ctime    = timespec_to_bch2_time(c, inode->v.i_ctime);
1413
1414         return 0;
1415 }
1416
1417 static int bch2_vfs_write_inode(struct inode *vinode,
1418                                 struct writeback_control *wbc)
1419 {
1420         struct bch_fs *c = vinode->i_sb->s_fs_info;
1421         struct bch_inode_info *inode = to_bch_ei(vinode);
1422         int ret;
1423
1424         mutex_lock(&inode->ei_update_lock);
1425         ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1426                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1427         mutex_unlock(&inode->ei_update_lock);
1428
1429         return ret;
1430 }
1431
1432 static void bch2_evict_inode(struct inode *vinode)
1433 {
1434         struct bch_fs *c = vinode->i_sb->s_fs_info;
1435         struct bch_inode_info *inode = to_bch_ei(vinode);
1436
1437         truncate_inode_pages_final(&inode->v.i_data);
1438
1439         clear_inode(&inode->v);
1440
1441         BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1442
1443         if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1444                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1445                                 KEY_TYPE_QUOTA_WARN);
1446                 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1447                                 KEY_TYPE_QUOTA_WARN);
1448                 bch2_inode_rm(c, inode->v.i_ino);
1449         }
1450 }
1451
1452 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1453 {
1454         struct super_block *sb = dentry->d_sb;
1455         struct bch_fs *c = sb->s_fs_info;
1456         struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1457         unsigned shift = sb->s_blocksize_bits - 9;
1458         u64 fsid;
1459
1460         buf->f_type     = BCACHEFS_STATFS_MAGIC;
1461         buf->f_bsize    = sb->s_blocksize;
1462         buf->f_blocks   = usage.capacity >> shift;
1463         buf->f_bfree    = (usage.capacity - usage.used) >> shift;
1464         buf->f_bavail   = buf->f_bfree;
1465         buf->f_files    = usage.nr_inodes;
1466         buf->f_ffree    = U64_MAX;
1467
1468         fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
1469                le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
1470         buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1471         buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1472         buf->f_namelen  = BCH_NAME_MAX;
1473
1474         return 0;
1475 }
1476
1477 static int bch2_sync_fs(struct super_block *sb, int wait)
1478 {
1479         struct bch_fs *c = sb->s_fs_info;
1480
1481         if (c->opts.journal_flush_disabled)
1482                 return 0;
1483
1484         if (!wait) {
1485                 bch2_journal_flush_async(&c->journal, NULL);
1486                 return 0;
1487         }
1488
1489         return bch2_journal_flush(&c->journal);
1490 }
1491
1492 static struct bch_fs *bch2_path_to_fs(const char *dev)
1493 {
1494         struct bch_fs *c;
1495         struct block_device *bdev = lookup_bdev(dev);
1496
1497         if (IS_ERR(bdev))
1498                 return ERR_CAST(bdev);
1499
1500         c = bch2_bdev_to_fs(bdev);
1501         bdput(bdev);
1502         return c ?: ERR_PTR(-ENOENT);
1503 }
1504
1505 static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
1506                                                unsigned nr_devs, struct bch_opts opts)
1507 {
1508         struct bch_fs *c, *c1, *c2;
1509         size_t i;
1510
1511         if (!nr_devs)
1512                 return ERR_PTR(-EINVAL);
1513
1514         c = bch2_fs_open(devs, nr_devs, opts);
1515
1516         if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
1517                 /*
1518                  * Already open?
1519                  * Look up each block device, make sure they all belong to a
1520                  * filesystem and they all belong to the _same_ filesystem
1521                  */
1522
1523                 c1 = bch2_path_to_fs(devs[0]);
1524                 if (IS_ERR(c1))
1525                         return c;
1526
1527                 for (i = 1; i < nr_devs; i++) {
1528                         c2 = bch2_path_to_fs(devs[i]);
1529                         if (!IS_ERR(c2))
1530                                 closure_put(&c2->cl);
1531
1532                         if (c1 != c2) {
1533                                 closure_put(&c1->cl);
1534                                 return c;
1535                         }
1536                 }
1537
1538                 c = c1;
1539         }
1540
1541         if (IS_ERR(c))
1542                 return c;
1543
1544         mutex_lock(&c->state_lock);
1545
1546         if (!test_bit(BCH_FS_STARTED, &c->flags)) {
1547                 mutex_unlock(&c->state_lock);
1548                 closure_put(&c->cl);
1549                 pr_err("err mounting %s: incomplete filesystem", dev_name);
1550                 return ERR_PTR(-EINVAL);
1551         }
1552
1553         mutex_unlock(&c->state_lock);
1554
1555         set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
1556         return c;
1557 }
1558
1559 static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
1560                                              struct bch_opts opts)
1561 {
1562         char *dev_name = NULL, **devs = NULL, *s;
1563         struct bch_fs *c = ERR_PTR(-ENOMEM);
1564         size_t i, nr_devs = 0;
1565
1566         dev_name = kstrdup(_dev_name, GFP_KERNEL);
1567         if (!dev_name)
1568                 goto err;
1569
1570         for (s = dev_name; s; s = strchr(s + 1, ':'))
1571                 nr_devs++;
1572
1573         devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
1574         if (!devs)
1575                 goto err;
1576
1577         for (i = 0, s = dev_name;
1578              s;
1579              (s = strchr(s, ':')) && (*s++ = '\0'))
1580                 devs[i++] = s;
1581
1582         c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
1583 err:
1584         kfree(devs);
1585         kfree(dev_name);
1586         return c;
1587 }
1588
1589 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1590 {
1591         struct bch_fs *c = sb->s_fs_info;
1592         struct bch_opts opts = bch2_opts_empty();
1593         int ret;
1594
1595         opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1596
1597         ret = bch2_parse_mount_opts(&opts, data);
1598         if (ret)
1599                 return ret;
1600
1601         if (opts.read_only != c->opts.read_only) {
1602                 mutex_lock(&c->state_lock);
1603
1604                 if (opts.read_only) {
1605                         bch2_fs_read_only(c);
1606
1607                         sb->s_flags |= SB_RDONLY;
1608                 } else {
1609                         ret = bch2_fs_read_write(c);
1610                         if (ret) {
1611                                 bch_err(c, "error going rw: %i", ret);
1612                                 mutex_unlock(&c->state_lock);
1613                                 return -EINVAL;
1614                         }
1615
1616                         sb->s_flags &= ~SB_RDONLY;
1617                 }
1618
1619                 c->opts.read_only = opts.read_only;
1620
1621                 mutex_unlock(&c->state_lock);
1622         }
1623
1624         if (opts.errors >= 0)
1625                 c->opts.errors = opts.errors;
1626
1627         return ret;
1628 }
1629
1630 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1631 {
1632         struct bch_fs *c = root->d_sb->s_fs_info;
1633         enum bch_opt_id i;
1634         char buf[512];
1635
1636         for (i = 0; i < bch2_opts_nr; i++) {
1637                 const struct bch_option *opt = &bch2_opt_table[i];
1638                 u64 v = bch2_opt_get_by_id(&c->opts, i);
1639
1640                 if (!(opt->mode & OPT_MOUNT))
1641                         continue;
1642
1643                 if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1644                         continue;
1645
1646                 bch2_opt_to_text(&PBUF(buf), c, opt, v,
1647                                  OPT_SHOW_MOUNT_STYLE);
1648                 seq_putc(seq, ',');
1649                 seq_puts(seq, buf);
1650         }
1651
1652         return 0;
1653
1654 }
1655
1656 static const struct super_operations bch_super_operations = {
1657         .alloc_inode    = bch2_alloc_inode,
1658         .destroy_inode  = bch2_destroy_inode,
1659         .write_inode    = bch2_vfs_write_inode,
1660         .evict_inode    = bch2_evict_inode,
1661         .sync_fs        = bch2_sync_fs,
1662         .statfs         = bch2_statfs,
1663         .show_options   = bch2_show_options,
1664         .remount_fs     = bch2_remount,
1665 #if 0
1666         .put_super      = bch2_put_super,
1667         .freeze_fs      = bch2_freeze,
1668         .unfreeze_fs    = bch2_unfreeze,
1669 #endif
1670 };
1671
1672 static int bch2_test_super(struct super_block *s, void *data)
1673 {
1674         return s->s_fs_info == data;
1675 }
1676
1677 static int bch2_set_super(struct super_block *s, void *data)
1678 {
1679         s->s_fs_info = data;
1680         return 0;
1681 }
1682
1683 static struct dentry *bch2_mount(struct file_system_type *fs_type,
1684                                  int flags, const char *dev_name, void *data)
1685 {
1686         struct bch_fs *c;
1687         struct bch_dev *ca;
1688         struct super_block *sb;
1689         struct inode *vinode;
1690         struct bch_opts opts = bch2_opts_empty();
1691         unsigned i;
1692         int ret;
1693
1694         opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
1695
1696         ret = bch2_parse_mount_opts(&opts, data);
1697         if (ret)
1698                 return ERR_PTR(ret);
1699
1700         c = bch2_open_as_blockdevs(dev_name, opts);
1701         if (IS_ERR(c))
1702                 return ERR_CAST(c);
1703
1704         sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c);
1705         if (IS_ERR(sb)) {
1706                 closure_put(&c->cl);
1707                 return ERR_CAST(sb);
1708         }
1709
1710         BUG_ON(sb->s_fs_info != c);
1711
1712         if (sb->s_root) {
1713                 closure_put(&c->cl);
1714
1715                 if ((flags ^ sb->s_flags) & SB_RDONLY) {
1716                         ret = -EBUSY;
1717                         goto err_put_super;
1718                 }
1719                 goto out;
1720         }
1721
1722         sb->s_blocksize         = block_bytes(c);
1723         sb->s_blocksize_bits    = ilog2(block_bytes(c));
1724         sb->s_maxbytes          = MAX_LFS_FILESIZE;
1725         sb->s_op                = &bch_super_operations;
1726         sb->s_export_op         = &bch_export_ops;
1727 #ifdef CONFIG_BCACHEFS_QUOTA
1728         sb->s_qcop              = &bch2_quotactl_operations;
1729         sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
1730 #endif
1731         sb->s_xattr             = bch2_xattr_handlers;
1732         sb->s_magic             = BCACHEFS_STATFS_MAGIC;
1733         sb->s_time_gran         = c->sb.time_precision;
1734         c->vfs_sb               = sb;
1735         strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
1736
1737         ret = super_setup_bdi(sb);
1738         if (ret)
1739                 goto err_put_super;
1740
1741         sb->s_bdi->congested_fn         = bch2_congested;
1742         sb->s_bdi->congested_data       = c;
1743         sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
1744
1745         for_each_online_member(ca, c, i) {
1746                 struct block_device *bdev = ca->disk_sb.bdev;
1747
1748                 /* XXX: create an anonymous device for multi device filesystems */
1749                 sb->s_bdev      = bdev;
1750                 sb->s_dev       = bdev->bd_dev;
1751                 percpu_ref_put(&ca->io_ref);
1752                 break;
1753         }
1754
1755 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1756         if (c->opts.acl)
1757                 sb->s_flags     |= SB_POSIXACL;
1758 #endif
1759
1760         vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
1761         if (IS_ERR(vinode)) {
1762                 bch_err(c, "error mounting: error getting root inode %i",
1763                         (int) PTR_ERR(vinode));
1764                 ret = PTR_ERR(vinode);
1765                 goto err_put_super;
1766         }
1767
1768         sb->s_root = d_make_root(vinode);
1769         if (!sb->s_root) {
1770                 bch_err(c, "error mounting: error allocating root dentry");
1771                 ret = -ENOMEM;
1772                 goto err_put_super;
1773         }
1774
1775         sb->s_flags |= SB_ACTIVE;
1776 out:
1777         return dget(sb->s_root);
1778
1779 err_put_super:
1780         deactivate_locked_super(sb);
1781         return ERR_PTR(ret);
1782 }
1783
1784 static void bch2_kill_sb(struct super_block *sb)
1785 {
1786         struct bch_fs *c = sb->s_fs_info;
1787
1788         generic_shutdown_super(sb);
1789
1790         if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
1791                 bch2_fs_stop(c);
1792         else
1793                 closure_put(&c->cl);
1794 }
1795
1796 static struct file_system_type bcache_fs_type = {
1797         .owner          = THIS_MODULE,
1798         .name           = "bcachefs",
1799         .mount          = bch2_mount,
1800         .kill_sb        = bch2_kill_sb,
1801         .fs_flags       = FS_REQUIRES_DEV,
1802 };
1803
1804 MODULE_ALIAS_FS("bcachefs");
1805
1806 void bch2_vfs_exit(void)
1807 {
1808         unregister_filesystem(&bcache_fs_type);
1809         if (bch2_inode_cache)
1810                 kmem_cache_destroy(bch2_inode_cache);
1811 }
1812
1813 int __init bch2_vfs_init(void)
1814 {
1815         int ret = -ENOMEM;
1816
1817         bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
1818         if (!bch2_inode_cache)
1819                 goto err;
1820
1821         ret = register_filesystem(&bcache_fs_type);
1822         if (ret)
1823                 goto err;
1824
1825         return 0;
1826 err:
1827         bch2_vfs_exit();
1828         return ret;
1829 }
1830
1831 #endif /* NO_BCACHEFS_FS */