]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fs.c
Update bcachefs sources to e99d29e402 bcachefs: zstd support, compression refactoring
[bcachefs-tools-debian] / libbcachefs / fs.c
1 #ifndef NO_BCACHEFS_FS
2
3 #include "bcachefs.h"
4 #include "acl.h"
5 #include "btree_update.h"
6 #include "buckets.h"
7 #include "chardev.h"
8 #include "dirent.h"
9 #include "extents.h"
10 #include "fs.h"
11 #include "fs-io.h"
12 #include "fs-ioctl.h"
13 #include "fsck.h"
14 #include "inode.h"
15 #include "io.h"
16 #include "journal.h"
17 #include "keylist.h"
18 #include "quota.h"
19 #include "super.h"
20 #include "xattr.h"
21
22 #include <linux/aio.h>
23 #include <linux/backing-dev.h>
24 #include <linux/exportfs.h>
25 #include <linux/module.h>
26 #include <linux/posix_acl.h>
27 #include <linux/random.h>
28 #include <linux/statfs.h>
29 #include <linux/xattr.h>
30
/*
 * Slab cache handle for this file's inode objects.
 * NOTE(review): presumably backs struct bch_inode_info allocations; the
 * users are not visible in this part of the file - confirm.
 */
static struct kmem_cache *bch2_inode_cache;

/*
 * Forward declaration: the helper is called by the inode get/create paths
 * below before its definition appears.
 */
static void bch2_vfs_inode_init(struct bch_fs *c,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi);
36
37 /*
38  * I_SIZE_DIRTY requires special handling:
39  *
40  * To the recovery code, the flag means that there is stale data past i_size
41  * that needs to be deleted; it's used for implementing atomic appends and
42  * truncates.
43  *
44  * On append, we set I_SIZE_DIRTY before doing the write, then after the write
45  * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
46  * that exposes the data we just wrote.
47  *
48  * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
49  * i_size to the new smaller size, then we delete the data that we just made
50  * invisible, and then we clear I_SIZE_DIRTY.
51  *
52  * Because there can be multiple appends in flight at a time, we need a refcount
53  * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
54  * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
55  *
56  * Because write_inode() can be called at any time, i_size_dirty_count means
57  * something different to the runtime code - it means to write_inode() "don't
58  * update i_size yet".
59  *
60  * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
61  * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
62  * be set explicitly.
63  */
64
65 int __must_check __bch2_write_inode(struct bch_fs *c,
66                                     struct bch_inode_info *inode,
67                                     inode_set_fn set,
68                                     void *p)
69 {
70         struct btree_iter iter;
71         struct bch_inode_unpacked inode_u;
72         struct bkey_inode_buf inode_p;
73         u64 inum = inode->v.i_ino;
74         unsigned i_nlink = READ_ONCE(inode->v.i_nlink);
75         int ret;
76
77         /*
78          * We can't write an inode with i_nlink == 0 because it's stored biased;
79          * however, we don't need to because if i_nlink is 0 the inode is
80          * getting deleted when it's evicted.
81          */
82         if (!i_nlink)
83                 return 0;
84
85         lockdep_assert_held(&inode->ei_update_lock);
86
87         bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0),
88                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
89
90         do {
91                 struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
92
93                 if ((ret = btree_iter_err(k)))
94                         goto out;
95
96                 if (WARN_ONCE(k.k->type != BCH_INODE_FS,
97                               "inode %llu not found when updating", inum)) {
98                         bch2_btree_iter_unlock(&iter);
99                         return -ENOENT;
100                 }
101
102                 ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
103                 if (WARN_ONCE(ret,
104                               "error %i unpacking inode %llu", ret, inum)) {
105                         ret = -ENOENT;
106                         break;
107                 }
108
109                 if (set) {
110                         ret = set(inode, &inode_u, p);
111                         if (ret)
112                                 goto out;
113                 }
114
115                 BUG_ON(i_nlink < nlink_bias(inode->v.i_mode));
116
117                 inode_u.bi_mode = inode->v.i_mode;
118                 inode_u.bi_uid  = i_uid_read(&inode->v);
119                 inode_u.bi_gid  = i_gid_read(&inode->v);
120                 inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ];
121                 inode_u.bi_nlink= i_nlink - nlink_bias(inode->v.i_mode);
122                 inode_u.bi_dev  = inode->v.i_rdev;
123                 inode_u.bi_atime= timespec_to_bch2_time(c, inode->v.i_atime);
124                 inode_u.bi_mtime= timespec_to_bch2_time(c, inode->v.i_mtime);
125                 inode_u.bi_ctime= timespec_to_bch2_time(c, inode->v.i_ctime);
126
127                 bch2_inode_pack(&inode_p, &inode_u);
128
129                 ret = bch2_btree_insert_at(c, NULL, NULL,
130                                 &inode->ei_journal_seq,
131                                 BTREE_INSERT_ATOMIC|
132                                 BTREE_INSERT_NOFAIL,
133                                 BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
134         } while (ret == -EINTR);
135
136         if (!ret) {
137                 inode->ei_inode = inode_u;
138                 inode->ei_qid   = bch_qid(&inode_u);
139         }
140 out:
141         bch2_btree_iter_unlock(&iter);
142
143         return ret < 0 ? ret : 0;
144 }
145
146 int __must_check bch2_write_inode(struct bch_fs *c,
147                                   struct bch_inode_info *inode)
148 {
149         return __bch2_write_inode(c, inode, NULL, NULL);
150 }
151
152 static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
153 {
154         int ret;
155
156         mutex_lock(&inode->ei_update_lock);
157         inc_nlink(&inode->v);
158         ret = bch2_write_inode(c, inode);
159         mutex_unlock(&inode->ei_update_lock);
160
161         return ret;
162 }
163
164 static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
165 {
166         int ret = 0;
167
168         mutex_lock(&inode->ei_update_lock);
169         drop_nlink(&inode->v);
170         ret = bch2_write_inode(c, inode);
171         mutex_unlock(&inode->ei_update_lock);
172
173         return ret;
174 }
175
176 static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
177 {
178         struct bch_inode_unpacked inode_u;
179         struct bch_inode_info *inode;
180         int ret;
181
182         inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
183         if (unlikely(!inode))
184                 return ERR_PTR(-ENOMEM);
185         if (!(inode->v.i_state & I_NEW))
186                 return &inode->v;
187
188         ret = bch2_inode_find_by_inum(c, inum, &inode_u);
189         if (ret) {
190                 iget_failed(&inode->v);
191                 return ERR_PTR(ret);
192         }
193
194         bch2_vfs_inode_init(c, inode, &inode_u);
195
196         inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
197
198         unlock_new_inode(&inode->v);
199
200         return &inode->v;
201 }
202
203 static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
204                                                     struct bch_inode_info *dir,
205                                                     umode_t mode, dev_t rdev)
206 {
207         struct posix_acl *default_acl = NULL, *acl = NULL;
208         struct bch_inode_info *inode;
209         struct bch_inode_unpacked inode_u;
210         int ret;
211
212         inode = to_bch_ei(new_inode(c->vfs_sb));
213         if (unlikely(!inode))
214                 return ERR_PTR(-ENOMEM);
215
216         inode_init_owner(&inode->v, &dir->v, mode);
217
218 #ifdef CONFIG_BCACHEFS_POSIX_ACL
219         ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl);
220         if (ret)
221                 goto err_make_bad;
222 #endif
223
224         bch2_inode_init(c, &inode_u,
225                         i_uid_read(&inode->v),
226                         i_gid_read(&inode->v),
227                         inode->v.i_mode, rdev,
228                         &dir->ei_inode);
229
230         inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ];
231
232         ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
233         if (ret)
234                 goto err_make_bad;
235
236         ret = bch2_inode_create(c, &inode_u,
237                                 BLOCKDEV_INODE_MAX, 0,
238                                 &c->unused_inode_hint);
239         if (unlikely(ret))
240                 goto err_acct_quota;
241
242         bch2_vfs_inode_init(c, inode, &inode_u);
243         atomic_long_inc(&c->nr_inodes);
244
245         if (default_acl) {
246                 ret = bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
247                 if (unlikely(ret))
248                         goto err;
249         }
250
251         if (acl) {
252                 ret = bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
253                 if (unlikely(ret))
254                         goto err;
255         }
256
257         insert_inode_hash(&inode->v);
258 out:
259         posix_acl_release(default_acl);
260         posix_acl_release(acl);
261         return inode;
262 err_acct_quota:
263         bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
264 err_make_bad:
265         /*
266          * indicate to bch_evict_inode that the inode was never actually
267          * created:
268          */
269         make_bad_inode(&inode->v);
270 err:
271         clear_nlink(&inode->v);
272         iput(&inode->v);
273         inode = ERR_PTR(ret);
274         goto out;
275 }
276
277 static int bch2_vfs_dirent_create(struct bch_fs *c,
278                                   struct bch_inode_info *dir,
279                                   u8 type, const struct qstr *name,
280                                   u64 dst)
281 {
282         int ret;
283
284         ret = bch2_dirent_create(c, dir->v.i_ino, &dir->ei_str_hash,
285                                 type, name, dst,
286                                 &dir->ei_journal_seq,
287                                 BCH_HASH_SET_MUST_CREATE);
288         if (unlikely(ret))
289                 return ret;
290
291         dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
292         mark_inode_dirty_sync(&dir->v);
293         return 0;
294 }
295
296 static int __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
297                          umode_t mode, dev_t rdev)
298 {
299         struct bch_fs *c = dir->v.i_sb->s_fs_info;
300         struct bch_inode_info *inode;
301         int ret;
302
303         inode = bch2_vfs_inode_create(c, dir, mode, rdev);
304         if (unlikely(IS_ERR(inode)))
305                 return PTR_ERR(inode);
306
307         ret = bch2_vfs_dirent_create(c, dir, mode_to_type(mode),
308                                      &dentry->d_name, inode->v.i_ino);
309         if (unlikely(ret)) {
310                 clear_nlink(&inode->v);
311                 iput(&inode->v);
312                 return ret;
313         }
314
315         if (dir->ei_journal_seq > inode->ei_journal_seq)
316                 inode->ei_journal_seq = dir->ei_journal_seq;
317
318         d_instantiate(dentry, &inode->v);
319         return 0;
320 }
321
322 /* methods */
323
324 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
325                                   unsigned int flags)
326 {
327         struct bch_fs *c = vdir->i_sb->s_fs_info;
328         struct bch_inode_info *dir = to_bch_ei(vdir);
329         struct inode *vinode = NULL;
330         u64 inum;
331
332         inum = bch2_dirent_lookup(c, dir->v.i_ino,
333                                   &dir->ei_str_hash,
334                                   &dentry->d_name);
335
336         if (inum)
337                 vinode = bch2_vfs_inode_get(c, inum);
338
339         return d_splice_alias(vinode, dentry);
340 }
341
342 static int bch2_create(struct inode *vdir, struct dentry *dentry,
343                        umode_t mode, bool excl)
344 {
345         return __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0);
346 }
347
348 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
349                      struct dentry *dentry)
350 {
351         struct bch_fs *c = vdir->i_sb->s_fs_info;
352         struct bch_inode_info *dir = to_bch_ei(vdir);
353         struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
354         int ret;
355
356         lockdep_assert_held(&inode->v.i_rwsem);
357
358         inode->v.i_ctime = current_time(&dir->v);
359
360         ret = bch2_inc_nlink(c, inode);
361         if (ret)
362                 return ret;
363
364         ihold(&inode->v);
365
366         ret = bch2_vfs_dirent_create(c, dir, mode_to_type(inode->v.i_mode),
367                                      &dentry->d_name, inode->v.i_ino);
368         if (unlikely(ret)) {
369                 bch2_dec_nlink(c, inode);
370                 iput(&inode->v);
371                 return ret;
372         }
373
374         d_instantiate(dentry, &inode->v);
375         return 0;
376 }
377
378 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
379 {
380         struct bch_fs *c = vdir->i_sb->s_fs_info;
381         struct bch_inode_info *dir = to_bch_ei(vdir);
382         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
383         int ret;
384
385         lockdep_assert_held(&inode->v.i_rwsem);
386
387         ret = bch2_dirent_delete(c, dir->v.i_ino, &dir->ei_str_hash,
388                                  &dentry->d_name, &dir->ei_journal_seq);
389         if (ret)
390                 return ret;
391
392         if (dir->ei_journal_seq > inode->ei_journal_seq)
393                 inode->ei_journal_seq = dir->ei_journal_seq;
394
395         inode->v.i_ctime = dir->v.i_ctime;
396
397         if (S_ISDIR(inode->v.i_mode)) {
398                 bch2_dec_nlink(c, dir);
399                 drop_nlink(&inode->v);
400         }
401
402         bch2_dec_nlink(c, inode);
403
404         return 0;
405 }
406
407 static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
408                         const char *symname)
409 {
410         struct bch_fs *c = vdir->i_sb->s_fs_info;
411         struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
412         int ret;
413
414         inode = bch2_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
415         if (unlikely(IS_ERR(inode)))
416                 return PTR_ERR(inode);
417
418         inode_lock(&inode->v);
419         ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
420         inode_unlock(&inode->v);
421
422         if (unlikely(ret))
423                 goto err;
424
425         ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
426         if (unlikely(ret))
427                 goto err;
428
429         /* XXX: racy */
430         if (dir->ei_journal_seq < inode->ei_journal_seq)
431                 dir->ei_journal_seq = inode->ei_journal_seq;
432
433         ret = bch2_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name,
434                                      inode->v.i_ino);
435         if (unlikely(ret))
436                 goto err;
437
438         d_instantiate(dentry, &inode->v);
439         return 0;
440 err:
441         clear_nlink(&inode->v);
442         iput(&inode->v);
443         return ret;
444 }
445
446 static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
447 {
448         struct bch_fs *c = vdir->i_sb->s_fs_info;
449         struct bch_inode_info *dir = to_bch_ei(vdir);
450         int ret;
451
452         lockdep_assert_held(&dir->v.i_rwsem);
453
454         ret = __bch2_create(dir, dentry, mode|S_IFDIR, 0);
455         if (unlikely(ret))
456                 return ret;
457
458         bch2_inc_nlink(c, dir);
459
460         return 0;
461 }
462
463 static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
464 {
465         struct bch_fs *c = vdir->i_sb->s_fs_info;
466
467         if (bch2_empty_dir(c, dentry->d_inode->i_ino))
468                 return -ENOTEMPTY;
469
470         return bch2_unlink(vdir, dentry);
471 }
472
473 static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
474                       umode_t mode, dev_t rdev)
475 {
476         return __bch2_create(to_bch_ei(vdir), dentry, mode, rdev);
477 }
478
479 static int bch2_rename(struct bch_fs *c,
480                        struct bch_inode_info *old_dir,
481                        struct dentry *old_dentry,
482                        struct bch_inode_info *new_dir,
483                        struct dentry *new_dentry)
484 {
485         struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
486         struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
487         struct timespec now = current_time(&old_dir->v);
488         int ret;
489
490         lockdep_assert_held(&old_dir->v.i_rwsem);
491         lockdep_assert_held(&new_dir->v.i_rwsem);
492
493         if (new_inode)
494                 filemap_write_and_wait_range(old_inode->v.i_mapping,
495                                              0, LLONG_MAX);
496
497         if (new_inode && S_ISDIR(old_inode->v.i_mode)) {
498                 lockdep_assert_held(&new_inode->v.i_rwsem);
499
500                 if (!S_ISDIR(new_inode->v.i_mode))
501                         return -ENOTDIR;
502
503                 if (bch2_empty_dir(c, new_inode->v.i_ino))
504                         return -ENOTEMPTY;
505
506                 ret = bch2_dirent_rename(c,
507                                 old_dir, &old_dentry->d_name,
508                                 new_dir, &new_dentry->d_name,
509                                 &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
510                 if (unlikely(ret))
511                         return ret;
512
513                 clear_nlink(&new_inode->v);
514                 bch2_dec_nlink(c, old_dir);
515         } else if (new_inode) {
516                 lockdep_assert_held(&new_inode->v.i_rwsem);
517
518                 ret = bch2_dirent_rename(c,
519                                 old_dir, &old_dentry->d_name,
520                                 new_dir, &new_dentry->d_name,
521                                 &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
522                 if (unlikely(ret))
523                         return ret;
524
525                 new_inode->v.i_ctime = now;
526                 bch2_dec_nlink(c, new_inode);
527         } else if (S_ISDIR(old_inode->v.i_mode)) {
528                 ret = bch2_dirent_rename(c,
529                                 old_dir, &old_dentry->d_name,
530                                 new_dir, &new_dentry->d_name,
531                                 &old_inode->ei_journal_seq, BCH_RENAME);
532                 if (unlikely(ret))
533                         return ret;
534
535                 bch2_inc_nlink(c, new_dir);
536                 bch2_dec_nlink(c, old_dir);
537         } else {
538                 ret = bch2_dirent_rename(c,
539                                 old_dir, &old_dentry->d_name,
540                                 new_dir, &new_dentry->d_name,
541                                 &old_inode->ei_journal_seq, BCH_RENAME);
542                 if (unlikely(ret))
543                         return ret;
544         }
545
546         old_dir->v.i_ctime = old_dir->v.i_mtime = now;
547         new_dir->v.i_ctime = new_dir->v.i_mtime = now;
548         mark_inode_dirty_sync(&old_dir->v);
549         mark_inode_dirty_sync(&new_dir->v);
550
551         old_inode->v.i_ctime = now;
552         mark_inode_dirty_sync(&old_inode->v);
553
554         return 0;
555 }
556
557 static int bch2_rename_exchange(struct bch_fs *c,
558                                 struct bch_inode_info *old_dir,
559                                 struct dentry *old_dentry,
560                                 struct bch_inode_info *new_dir,
561                                 struct dentry *new_dentry)
562 {
563         struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
564         struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
565         struct timespec now = current_time(&old_dir->v);
566         int ret;
567
568         ret = bch2_dirent_rename(c,
569                                  old_dir, &old_dentry->d_name,
570                                  new_dir, &new_dentry->d_name,
571                                  &old_inode->ei_journal_seq, BCH_RENAME_EXCHANGE);
572         if (unlikely(ret))
573                 return ret;
574
575         if (S_ISDIR(old_inode->v.i_mode) !=
576             S_ISDIR(new_inode->v.i_mode)) {
577                 if (S_ISDIR(old_inode->v.i_mode)) {
578                         bch2_inc_nlink(c, new_dir);
579                         bch2_dec_nlink(c, old_dir);
580                 } else {
581                         bch2_dec_nlink(c, new_dir);
582                         bch2_inc_nlink(c, old_dir);
583                 }
584         }
585
586         old_dir->v.i_ctime = old_dir->v.i_mtime = now;
587         new_dir->v.i_ctime = new_dir->v.i_mtime = now;
588         mark_inode_dirty_sync(&old_dir->v);
589         mark_inode_dirty_sync(&new_dir->v);
590
591         old_inode->v.i_ctime = now;
592         new_inode->v.i_ctime = now;
593         mark_inode_dirty_sync(&old_inode->v);
594         mark_inode_dirty_sync(&new_inode->v);
595
596         return 0;
597 }
598
599 static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry,
600                         struct inode *new_vdir, struct dentry *new_dentry,
601                         unsigned flags)
602 {
603         struct bch_fs *c = old_vdir->i_sb->s_fs_info;
604         struct bch_inode_info *old_dir = to_bch_ei(old_vdir);
605         struct bch_inode_info *new_dir = to_bch_ei(new_vdir);
606
607         if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
608                 return -EINVAL;
609
610         if (flags & RENAME_EXCHANGE)
611                 return bch2_rename_exchange(c, old_dir, old_dentry,
612                                             new_dir, new_dentry);
613
614         return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry);
615 }
616
617 static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
618 {
619         struct bch_fs *c = inode->v.i_sb->s_fs_info;
620         struct bch_qid qid = inode->ei_qid;
621         unsigned qtypes = 0;
622         int ret;
623
624         mutex_lock(&inode->ei_update_lock);
625
626         if (c->opts.usrquota &&
627             (iattr->ia_valid & ATTR_UID) &&
628             !uid_eq(iattr->ia_uid, inode->v.i_uid)) {
629                 qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid),
630                 qtypes |= 1 << QTYP_USR;
631         }
632
633         if (c->opts.grpquota &&
634             (iattr->ia_valid & ATTR_GID) &&
635             !gid_eq(iattr->ia_gid, inode->v.i_gid)) {
636                 qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid);
637                 qtypes |= 1 << QTYP_GRP;
638         }
639
640         if (qtypes) {
641                 ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid,
642                                           inode->v.i_blocks +
643                                           inode->ei_quota_reserved);
644                 if (ret)
645                         goto out_unlock;
646         }
647
648         setattr_copy(&inode->v, iattr);
649
650         ret = bch2_write_inode(c, inode);
651 out_unlock:
652         mutex_unlock(&inode->ei_update_lock);
653
654         if (!ret &&
655             iattr->ia_valid & ATTR_MODE)
656                 ret = posix_acl_chmod(&inode->v, inode->v.i_mode);
657
658         return ret;
659 }
660
661 static int bch2_getattr(const struct path *path, struct kstat *stat,
662                         u32 request_mask, unsigned query_flags)
663 {
664         struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
665         struct bch_fs *c = inode->v.i_sb->s_fs_info;
666
667         stat->dev       = inode->v.i_sb->s_dev;
668         stat->ino       = inode->v.i_ino;
669         stat->mode      = inode->v.i_mode;
670         stat->nlink     = inode->v.i_nlink;
671         stat->uid       = inode->v.i_uid;
672         stat->gid       = inode->v.i_gid;
673         stat->rdev      = inode->v.i_rdev;
674         stat->size      = i_size_read(&inode->v);
675         stat->atime     = inode->v.i_atime;
676         stat->mtime     = inode->v.i_mtime;
677         stat->ctime     = inode->v.i_ctime;
678         stat->blksize   = block_bytes(c);
679         stat->blocks    = inode->v.i_blocks;
680
681         if (request_mask & STATX_BTIME) {
682                 stat->result_mask |= STATX_BTIME;
683                 stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
684         }
685
686         if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
687                 stat->attributes |= STATX_ATTR_IMMUTABLE;
688         if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
689                 stat->attributes |= STATX_ATTR_APPEND;
690         if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
691                 stat->attributes |= STATX_ATTR_NODUMP;
692
693         return 0;
694 }
695
696 static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
697 {
698         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
699         int ret;
700
701         lockdep_assert_held(&inode->v.i_rwsem);
702
703         ret = setattr_prepare(dentry, iattr);
704         if (ret)
705                 return ret;
706
707         return iattr->ia_valid & ATTR_SIZE
708                 ? bch2_truncate(inode, iattr)
709                 : bch2_setattr_nonsize(inode, iattr);
710 }
711
712 static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
713 {
714         struct bch_fs *c = vdir->i_sb->s_fs_info;
715         struct bch_inode_info *dir = to_bch_ei(vdir);
716         struct bch_inode_info *inode;
717
718         /* XXX: i_nlink should be 0? */
719         inode = bch2_vfs_inode_create(c, dir, mode, 0);
720         if (unlikely(IS_ERR(inode)))
721                 return PTR_ERR(inode);
722
723         d_tmpfile(dentry, &inode->v);
724         return 0;
725 }
726
727 static int bch2_fill_extent(struct fiemap_extent_info *info,
728                             const struct bkey_i *k, unsigned flags)
729 {
730         if (bkey_extent_is_data(&k->k)) {
731                 struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
732                 const struct bch_extent_ptr *ptr;
733                 struct bch_extent_crc_unpacked crc;
734                 int ret;
735
736                 extent_for_each_ptr_crc(e, ptr, crc) {
737                         int flags2 = 0;
738                         u64 offset = ptr->offset;
739
740                         if (crc.compression_type)
741                                 flags2 |= FIEMAP_EXTENT_ENCODED;
742                         else
743                                 offset += crc.offset;
744
745                         if ((offset & (PAGE_SECTORS - 1)) ||
746                             (e.k->size & (PAGE_SECTORS - 1)))
747                                 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
748
749                         ret = fiemap_fill_next_extent(info,
750                                                       bkey_start_offset(e.k) << 9,
751                                                       offset << 9,
752                                                       e.k->size << 9, flags|flags2);
753                         if (ret)
754                                 return ret;
755                 }
756
757                 return 0;
758         } else if (k->k.type == BCH_RESERVATION) {
759                 return fiemap_fill_next_extent(info,
760                                                bkey_start_offset(&k->k) << 9,
761                                                0, k->k.size << 9,
762                                                flags|
763                                                FIEMAP_EXTENT_DELALLOC|
764                                                FIEMAP_EXTENT_UNWRITTEN);
765         } else {
766                 BUG();
767         }
768 }
769
770 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
771                        u64 start, u64 len)
772 {
773         struct bch_fs *c = vinode->i_sb->s_fs_info;
774         struct bch_inode_info *ei = to_bch_ei(vinode);
775         struct btree_iter iter;
776         struct bkey_s_c k;
777         BKEY_PADDED(k) tmp;
778         bool have_extent = false;
779         int ret = 0;
780
781         if (start + len < start)
782                 return -EINVAL;
783
784         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
785                            POS(ei->v.i_ino, start >> 9), 0, k)
786                 if (bkey_extent_is_data(k.k) ||
787                     k.k->type == BCH_RESERVATION) {
788                         if (bkey_cmp(bkey_start_pos(k.k),
789                                      POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
790                                 break;
791
792                         if (have_extent) {
793                                 ret = bch2_fill_extent(info, &tmp.k, 0);
794                                 if (ret)
795                                         goto out;
796                         }
797
798                         bkey_reassemble(&tmp.k, k);
799                         have_extent = true;
800                 }
801
802         if (have_extent)
803                 ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
804 out:
805         bch2_btree_iter_unlock(&iter);
806         return ret < 0 ? ret : 0;
807 }
808
809 static const struct vm_operations_struct bch_vm_ops = {
810         .fault          = filemap_fault,
811         .map_pages      = filemap_map_pages,
812         .page_mkwrite   = bch2_page_mkwrite,
813 };
814
815 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
816 {
817         file_accessed(file);
818
819         vma->vm_ops = &bch_vm_ops;
820         return 0;
821 }
822
823 /* Directories: */
824
825 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
826 {
827         return generic_file_llseek_size(file, offset, whence,
828                                         S64_MAX, S64_MAX);
829 }
830
831 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
832 {
833         struct bch_fs *c = file_inode(file)->i_sb->s_fs_info;
834
835         return bch2_readdir(c, file, ctx);
836 }
837
838 static const struct file_operations bch_file_operations = {
839         .llseek         = bch2_llseek,
840         .read_iter      = generic_file_read_iter,
841         .write_iter     = bch2_write_iter,
842         .mmap           = bch2_mmap,
843         .open           = generic_file_open,
844         .fsync          = bch2_fsync,
845         .splice_read    = generic_file_splice_read,
846         .splice_write   = iter_file_splice_write,
847         .fallocate      = bch2_fallocate_dispatch,
848         .unlocked_ioctl = bch2_fs_file_ioctl,
849 #ifdef CONFIG_COMPAT
850         .compat_ioctl   = bch2_compat_fs_ioctl,
851 #endif
852 };
853
854 static const struct inode_operations bch_file_inode_operations = {
855         .getattr        = bch2_getattr,
856         .setattr        = bch2_setattr,
857         .fiemap         = bch2_fiemap,
858         .listxattr      = bch2_xattr_list,
859 #ifdef CONFIG_BCACHEFS_POSIX_ACL
860         .get_acl        = bch2_get_acl,
861         .set_acl        = bch2_set_acl,
862 #endif
863 };
864
865 static const struct inode_operations bch_dir_inode_operations = {
866         .lookup         = bch2_lookup,
867         .create         = bch2_create,
868         .link           = bch2_link,
869         .unlink         = bch2_unlink,
870         .symlink        = bch2_symlink,
871         .mkdir          = bch2_mkdir,
872         .rmdir          = bch2_rmdir,
873         .mknod          = bch2_mknod,
874         .rename         = bch2_rename2,
875         .getattr        = bch2_getattr,
876         .setattr        = bch2_setattr,
877         .tmpfile        = bch2_tmpfile,
878         .listxattr      = bch2_xattr_list,
879 #ifdef CONFIG_BCACHEFS_POSIX_ACL
880         .get_acl        = bch2_get_acl,
881         .set_acl        = bch2_set_acl,
882 #endif
883 };
884
885 static const struct file_operations bch_dir_file_operations = {
886         .llseek         = bch2_dir_llseek,
887         .read           = generic_read_dir,
888         .iterate        = bch2_vfs_readdir,
889         .fsync          = bch2_fsync,
890         .unlocked_ioctl = bch2_fs_file_ioctl,
891 #ifdef CONFIG_COMPAT
892         .compat_ioctl   = bch2_compat_fs_ioctl,
893 #endif
894 };
895
896 static const struct inode_operations bch_symlink_inode_operations = {
897         .get_link       = page_get_link,
898         .getattr        = bch2_getattr,
899         .setattr        = bch2_setattr,
900         .listxattr      = bch2_xattr_list,
901 #ifdef CONFIG_BCACHEFS_POSIX_ACL
902         .get_acl        = bch2_get_acl,
903         .set_acl        = bch2_set_acl,
904 #endif
905 };
906
907 static const struct inode_operations bch_special_inode_operations = {
908         .getattr        = bch2_getattr,
909         .setattr        = bch2_setattr,
910         .listxattr      = bch2_xattr_list,
911 #ifdef CONFIG_BCACHEFS_POSIX_ACL
912         .get_acl        = bch2_get_acl,
913         .set_acl        = bch2_set_acl,
914 #endif
915 };
916
917 static const struct address_space_operations bch_address_space_operations = {
918         .writepage      = bch2_writepage,
919         .readpage       = bch2_readpage,
920         .writepages     = bch2_writepages,
921         .readpages      = bch2_readpages,
922         .set_page_dirty = bch2_set_page_dirty,
923         .write_begin    = bch2_write_begin,
924         .write_end      = bch2_write_end,
925         .invalidatepage = bch2_invalidatepage,
926         .releasepage    = bch2_releasepage,
927         .direct_IO      = bch2_direct_IO,
928 #ifdef CONFIG_MIGRATION
929         .migratepage    = bch2_migrate_page,
930 #endif
931         .error_remove_page = generic_error_remove_page,
932 };
933
934 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
935                 u64 ino, u32 generation)
936 {
937         struct bch_fs *c = sb->s_fs_info;
938         struct inode *vinode;
939
940         if (ino < BCACHEFS_ROOT_INO)
941                 return ERR_PTR(-ESTALE);
942
943         vinode = bch2_vfs_inode_get(c, ino);
944         if (IS_ERR(vinode))
945                 return ERR_CAST(vinode);
946         if (generation && vinode->i_generation != generation) {
947                 /* we didn't find the right inode.. */
948                 iput(vinode);
949                 return ERR_PTR(-ESTALE);
950         }
951         return vinode;
952 }
953
954 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
955                 int fh_len, int fh_type)
956 {
957         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
958                                     bch2_nfs_get_inode);
959 }
960
961 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
962                 int fh_len, int fh_type)
963 {
964         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
965                                     bch2_nfs_get_inode);
966 }
967
968 static const struct export_operations bch_export_ops = {
969         .fh_to_dentry   = bch2_fh_to_dentry,
970         .fh_to_parent   = bch2_fh_to_parent,
971         //.get_parent   = bch2_get_parent,
972 };
973
974 static void bch2_vfs_inode_init(struct bch_fs *c,
975                                 struct bch_inode_info *inode,
976                                 struct bch_inode_unpacked *bi)
977 {
978         inode->v.i_mode         = bi->bi_mode;
979         i_uid_write(&inode->v, bi->bi_uid);
980         i_gid_write(&inode->v, bi->bi_gid);
981         inode->v.i_blocks       = bi->bi_sectors;
982         inode->v.i_ino          = bi->bi_inum;
983         set_nlink(&inode->v, bi->bi_nlink + nlink_bias(inode->v.i_mode));
984         inode->v.i_rdev         = bi->bi_dev;
985         inode->v.i_generation   = bi->bi_generation;
986         inode->v.i_size         = bi->bi_size;
987         inode->v.i_atime        = bch2_time_to_timespec(c, bi->bi_atime);
988         inode->v.i_mtime        = bch2_time_to_timespec(c, bi->bi_mtime);
989         inode->v.i_ctime        = bch2_time_to_timespec(c, bi->bi_ctime);
990
991         inode->ei_journal_seq   = 0;
992         inode->ei_quota_reserved = 0;
993         inode->ei_qid           = bch_qid(bi);
994         inode->ei_str_hash      = bch2_hash_info_init(c, bi);
995         inode->ei_inode         = *bi;
996
997         bch2_inode_flags_to_vfs(inode);
998
999         inode->v.i_mapping->a_ops = &bch_address_space_operations;
1000
1001         switch (inode->v.i_mode & S_IFMT) {
1002         case S_IFREG:
1003                 inode->v.i_op   = &bch_file_inode_operations;
1004                 inode->v.i_fop  = &bch_file_operations;
1005                 break;
1006         case S_IFDIR:
1007                 inode->v.i_op   = &bch_dir_inode_operations;
1008                 inode->v.i_fop  = &bch_dir_file_operations;
1009                 break;
1010         case S_IFLNK:
1011                 inode_nohighmem(&inode->v);
1012                 inode->v.i_op   = &bch_symlink_inode_operations;
1013                 break;
1014         default:
1015                 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1016                 inode->v.i_op   = &bch_special_inode_operations;
1017                 break;
1018         }
1019 }
1020
1021 static struct inode *bch2_alloc_inode(struct super_block *sb)
1022 {
1023         struct bch_inode_info *inode;
1024
1025         inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1026         if (!inode)
1027                 return NULL;
1028
1029         inode_init_once(&inode->v);
1030         mutex_init(&inode->ei_update_lock);
1031         inode->ei_journal_seq = 0;
1032
1033         return &inode->v;
1034 }
1035
1036 static void bch2_i_callback(struct rcu_head *head)
1037 {
1038         struct inode *vinode = container_of(head, struct inode, i_rcu);
1039         struct bch_inode_info *inode = to_bch_ei(vinode);
1040
1041         kmem_cache_free(bch2_inode_cache, inode);
1042 }
1043
1044 static void bch2_destroy_inode(struct inode *vinode)
1045 {
1046         call_rcu(&vinode->i_rcu, bch2_i_callback);
1047 }
1048
1049 static int bch2_vfs_write_inode(struct inode *vinode,
1050                                 struct writeback_control *wbc)
1051 {
1052         struct bch_fs *c = vinode->i_sb->s_fs_info;
1053         struct bch_inode_info *inode = to_bch_ei(vinode);
1054         int ret;
1055
1056         mutex_lock(&inode->ei_update_lock);
1057         ret = bch2_write_inode(c, inode);
1058         mutex_unlock(&inode->ei_update_lock);
1059
1060         if (c->opts.journal_flush_disabled)
1061                 return ret;
1062
1063         if (!ret && wbc->sync_mode == WB_SYNC_ALL)
1064                 ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
1065
1066         return ret;
1067 }
1068
1069 static void bch2_evict_inode(struct inode *vinode)
1070 {
1071         struct bch_fs *c = vinode->i_sb->s_fs_info;
1072         struct bch_inode_info *inode = to_bch_ei(vinode);
1073
1074         truncate_inode_pages_final(&inode->v.i_data);
1075
1076         clear_inode(&inode->v);
1077
1078         BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1079
1080         if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1081                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1082                                 BCH_QUOTA_WARN);
1083                 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1084                                 BCH_QUOTA_WARN);
1085                 bch2_inode_rm(c, inode->v.i_ino);
1086                 atomic_long_dec(&c->nr_inodes);
1087         }
1088 }
1089
1090 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1091 {
1092         struct super_block *sb = dentry->d_sb;
1093         struct bch_fs *c = sb->s_fs_info;
1094         u64 fsid;
1095
1096         buf->f_type     = BCACHEFS_STATFS_MAGIC;
1097         buf->f_bsize    = sb->s_blocksize;
1098         buf->f_blocks   = c->capacity >> PAGE_SECTOR_SHIFT;
1099         buf->f_bfree    = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >>
1100                            PAGE_SECTOR_SHIFT;
1101         buf->f_bavail   = buf->f_bfree;
1102         buf->f_files    = atomic_long_read(&c->nr_inodes);
1103         buf->f_ffree    = U64_MAX;
1104
1105         fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
1106                le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
1107         buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1108         buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1109         buf->f_namelen  = NAME_MAX;
1110
1111         return 0;
1112 }
1113
1114 static int bch2_sync_fs(struct super_block *sb, int wait)
1115 {
1116         struct bch_fs *c = sb->s_fs_info;
1117
1118         if (!wait) {
1119                 bch2_journal_flush_async(&c->journal, NULL);
1120                 return 0;
1121         }
1122
1123         return bch2_journal_flush(&c->journal);
1124 }
1125
1126 static struct bch_fs *bch2_path_to_fs(const char *dev)
1127 {
1128         struct bch_fs *c;
1129         struct block_device *bdev = lookup_bdev(dev);
1130
1131         if (IS_ERR(bdev))
1132                 return ERR_CAST(bdev);
1133
1134         c = bch2_bdev_to_fs(bdev);
1135         bdput(bdev);
1136         return c ?: ERR_PTR(-ENOENT);
1137 }
1138
1139 static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
1140                                                unsigned nr_devs, struct bch_opts opts)
1141 {
1142         struct bch_fs *c, *c1, *c2;
1143         size_t i;
1144
1145         if (!nr_devs)
1146                 return ERR_PTR(-EINVAL);
1147
1148         c = bch2_fs_open(devs, nr_devs, opts);
1149
1150         if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
1151                 /*
1152                  * Already open?
1153                  * Look up each block device, make sure they all belong to a
1154                  * filesystem and they all belong to the _same_ filesystem
1155                  */
1156
1157                 c1 = bch2_path_to_fs(devs[0]);
1158                 if (!c1)
1159                         return c;
1160
1161                 for (i = 1; i < nr_devs; i++) {
1162                         c2 = bch2_path_to_fs(devs[i]);
1163                         if (!IS_ERR(c2))
1164                                 closure_put(&c2->cl);
1165
1166                         if (c1 != c2) {
1167                                 closure_put(&c1->cl);
1168                                 return c;
1169                         }
1170                 }
1171
1172                 c = c1;
1173         }
1174
1175         if (IS_ERR(c))
1176                 return c;
1177
1178         mutex_lock(&c->state_lock);
1179
1180         if (!bch2_fs_running(c)) {
1181                 mutex_unlock(&c->state_lock);
1182                 closure_put(&c->cl);
1183                 pr_err("err mounting %s: incomplete filesystem", dev_name);
1184                 return ERR_PTR(-EINVAL);
1185         }
1186
1187         mutex_unlock(&c->state_lock);
1188
1189         set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
1190         return c;
1191 }
1192
1193 static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
1194                                              struct bch_opts opts)
1195 {
1196         char *dev_name = NULL, **devs = NULL, *s;
1197         struct bch_fs *c = ERR_PTR(-ENOMEM);
1198         size_t i, nr_devs = 0;
1199
1200         dev_name = kstrdup(_dev_name, GFP_KERNEL);
1201         if (!dev_name)
1202                 goto err;
1203
1204         for (s = dev_name; s; s = strchr(s + 1, ':'))
1205                 nr_devs++;
1206
1207         devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
1208         if (!devs)
1209                 goto err;
1210
1211         for (i = 0, s = dev_name;
1212              s;
1213              (s = strchr(s, ':')) && (*s++ = '\0'))
1214                 devs[i++] = s;
1215
1216         c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
1217 err:
1218         kfree(devs);
1219         kfree(dev_name);
1220         return c;
1221 }
1222
1223 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1224 {
1225         struct bch_fs *c = sb->s_fs_info;
1226         struct bch_opts opts = bch2_opts_empty();
1227         int ret;
1228
1229         opt_set(opts, read_only, (*flags & MS_RDONLY) != 0);
1230
1231         ret = bch2_parse_mount_opts(&opts, data);
1232         if (ret)
1233                 return ret;
1234
1235         if (opts.read_only != c->opts.read_only) {
1236                 const char *err = NULL;
1237
1238                 mutex_lock(&c->state_lock);
1239
1240                 if (opts.read_only) {
1241                         bch2_fs_read_only(c);
1242
1243                         sb->s_flags |= MS_RDONLY;
1244                 } else {
1245                         err = bch2_fs_read_write(c);
1246                         if (err) {
1247                                 bch_err(c, "error going rw: %s", err);
1248                                 return -EINVAL;
1249                         }
1250
1251                         sb->s_flags &= ~MS_RDONLY;
1252                 }
1253
1254                 c->opts.read_only = opts.read_only;
1255
1256                 mutex_unlock(&c->state_lock);
1257         }
1258
1259         if (opts.errors >= 0)
1260                 c->opts.errors = opts.errors;
1261
1262         return ret;
1263 }
1264
1265 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1266 {
1267         struct bch_fs *c = root->d_sb->s_fs_info;
1268         enum bch_opt_id i;
1269
1270         for (i = 0; i < bch2_opts_nr; i++) {
1271                 const struct bch_option *opt = &bch2_opt_table[i];
1272                 u64 v = bch2_opt_get_by_id(&c->opts, i);
1273
1274                 if (opt->mode < OPT_MOUNT)
1275                         continue;
1276
1277                 if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1278                         continue;
1279
1280                 switch (opt->type) {
1281                 case BCH_OPT_BOOL:
1282                         seq_printf(seq, ",%s%s", v ? "" : "no", opt->attr.name);
1283                         break;
1284                 case BCH_OPT_UINT:
1285                         seq_printf(seq, ",%s=%llu", opt->attr.name, v);
1286                         break;
1287                 case BCH_OPT_STR:
1288                         seq_printf(seq, ",%s=%s", opt->attr.name, opt->choices[v]);
1289                         break;
1290                 }
1291         }
1292
1293         return 0;
1294
1295 }
1296
1297 static const struct super_operations bch_super_operations = {
1298         .alloc_inode    = bch2_alloc_inode,
1299         .destroy_inode  = bch2_destroy_inode,
1300         .write_inode    = bch2_vfs_write_inode,
1301         .evict_inode    = bch2_evict_inode,
1302         .sync_fs        = bch2_sync_fs,
1303         .statfs         = bch2_statfs,
1304         .show_options   = bch2_show_options,
1305         .remount_fs     = bch2_remount,
1306 #if 0
1307         .put_super      = bch2_put_super,
1308         .freeze_fs      = bch2_freeze,
1309         .unfreeze_fs    = bch2_unfreeze,
1310 #endif
1311 };
1312
1313 static int bch2_test_super(struct super_block *s, void *data)
1314 {
1315         return s->s_fs_info == data;
1316 }
1317
1318 static int bch2_set_super(struct super_block *s, void *data)
1319 {
1320         s->s_fs_info = data;
1321         return 0;
1322 }
1323
1324 static struct dentry *bch2_mount(struct file_system_type *fs_type,
1325                                  int flags, const char *dev_name, void *data)
1326 {
1327         struct bch_fs *c;
1328         struct bch_dev *ca;
1329         struct super_block *sb;
1330         struct inode *vinode;
1331         struct bch_opts opts = bch2_opts_empty();
1332         unsigned i;
1333         int ret;
1334
1335         opt_set(opts, read_only, (flags & MS_RDONLY) != 0);
1336
1337         ret = bch2_parse_mount_opts(&opts, data);
1338         if (ret)
1339                 return ERR_PTR(ret);
1340
1341         c = bch2_open_as_blockdevs(dev_name, opts);
1342         if (IS_ERR(c))
1343                 return ERR_CAST(c);
1344
1345         sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c);
1346         if (IS_ERR(sb)) {
1347                 closure_put(&c->cl);
1348                 return ERR_CAST(sb);
1349         }
1350
1351         BUG_ON(sb->s_fs_info != c);
1352
1353         if (sb->s_root) {
1354                 closure_put(&c->cl);
1355
1356                 if ((flags ^ sb->s_flags) & MS_RDONLY) {
1357                         ret = -EBUSY;
1358                         goto err_put_super;
1359                 }
1360                 goto out;
1361         }
1362
1363         /* XXX: blocksize */
1364         sb->s_blocksize         = PAGE_SIZE;
1365         sb->s_blocksize_bits    = PAGE_SHIFT;
1366         sb->s_maxbytes          = MAX_LFS_FILESIZE;
1367         sb->s_op                = &bch_super_operations;
1368         sb->s_export_op         = &bch_export_ops;
1369 #ifdef CONFIG_BCACHEFS_QUOTA
1370         sb->s_qcop              = &bch2_quotactl_operations;
1371         sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
1372 #endif
1373         sb->s_xattr             = bch2_xattr_handlers;
1374         sb->s_magic             = BCACHEFS_STATFS_MAGIC;
1375         sb->s_time_gran         = c->sb.time_precision;
1376         c->vfs_sb               = sb;
1377         strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
1378
1379         ret = super_setup_bdi(sb);
1380         if (ret)
1381                 goto err_put_super;
1382
1383         sb->s_bdi->congested_fn         = bch2_congested;
1384         sb->s_bdi->congested_data       = c;
1385         sb->s_bdi->ra_pages             = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
1386
1387         for_each_online_member(ca, c, i) {
1388                 struct block_device *bdev = ca->disk_sb.bdev;
1389
1390                 /* XXX: create an anonymous device for multi device filesystems */
1391                 sb->s_bdev      = bdev;
1392                 sb->s_dev       = bdev->bd_dev;
1393                 percpu_ref_put(&ca->io_ref);
1394                 break;
1395         }
1396
1397 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1398         if (c->opts.acl)
1399                 sb->s_flags     |= MS_POSIXACL;
1400 #endif
1401
1402         vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
1403         if (IS_ERR(vinode)) {
1404                 ret = PTR_ERR(vinode);
1405                 goto err_put_super;
1406         }
1407
1408         sb->s_root = d_make_root(vinode);
1409         if (!sb->s_root) {
1410                 ret = -ENOMEM;
1411                 goto err_put_super;
1412         }
1413
1414         sb->s_flags |= MS_ACTIVE;
1415 out:
1416         return dget(sb->s_root);
1417
1418 err_put_super:
1419         deactivate_locked_super(sb);
1420         return ERR_PTR(ret);
1421 }
1422
1423 static void bch2_kill_sb(struct super_block *sb)
1424 {
1425         struct bch_fs *c = sb->s_fs_info;
1426
1427         generic_shutdown_super(sb);
1428
1429         if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
1430                 bch2_fs_stop(c);
1431         else
1432                 closure_put(&c->cl);
1433 }
1434
1435 static struct file_system_type bcache_fs_type = {
1436         .owner          = THIS_MODULE,
1437         .name           = "bcachefs",
1438         .mount          = bch2_mount,
1439         .kill_sb        = bch2_kill_sb,
1440         .fs_flags       = FS_REQUIRES_DEV,
1441 };
1442
1443 MODULE_ALIAS_FS("bcachefs");
1444
1445 void bch2_vfs_exit(void)
1446 {
1447         unregister_filesystem(&bcache_fs_type);
1448         if (bch2_inode_cache)
1449                 kmem_cache_destroy(bch2_inode_cache);
1450 }
1451
1452 int __init bch2_vfs_init(void)
1453 {
1454         int ret = -ENOMEM;
1455
1456         bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
1457         if (!bch2_inode_cache)
1458                 goto err;
1459
1460         ret = register_filesystem(&bcache_fs_type);
1461         if (ret)
1462                 goto err;
1463
1464         return 0;
1465 err:
1466         bch2_vfs_exit();
1467         return ret;
1468 }
1469
1470 #endif /* NO_BCACHEFS_FS */