]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fs.c
Update bcachefs sources to 0e765bc37c bcachefs: foreground merging of interior btree...
[bcachefs-tools-debian] / libbcachefs / fs.c
1 #ifndef NO_BCACHEFS_FS
2
3 #include "bcachefs.h"
4 #include "acl.h"
5 #include "btree_update.h"
6 #include "buckets.h"
7 #include "chardev.h"
8 #include "dirent.h"
9 #include "extents.h"
10 #include "fs.h"
11 #include "fs-io.h"
12 #include "fs-ioctl.h"
13 #include "fsck.h"
14 #include "inode.h"
15 #include "io.h"
16 #include "journal.h"
17 #include "keylist.h"
18 #include "quota.h"
19 #include "super.h"
20 #include "xattr.h"
21
22 #include <linux/aio.h>
23 #include <linux/backing-dev.h>
24 #include <linux/exportfs.h>
25 #include <linux/module.h>
26 #include <linux/posix_acl.h>
27 #include <linux/random.h>
28 #include <linux/statfs.h>
29 #include <linux/xattr.h>
30
/* Slab cache handle for this filesystem's in-memory inodes. */
static struct kmem_cache *bch2_inode_cache;

/* Forward declaration: used by the inode get/create paths below. */
static void bch2_vfs_inode_init(struct bch_fs *c,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi);
36
37 /*
38  * I_SIZE_DIRTY requires special handling:
39  *
40  * To the recovery code, the flag means that there is stale data past i_size
41  * that needs to be deleted; it's used for implementing atomic appends and
42  * truncates.
43  *
44  * On append, we set I_SIZE_DIRTY before doing the write, then after the write
45  * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
46  * that exposes the data we just wrote.
47  *
48  * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
49  * i_size to the new smaller size, then we delete the data that we just made
50  * invisible, and then we clear I_SIZE_DIRTY.
51  *
52  * Because there can be multiple appends in flight at a time, we need a refcount
53  * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
54  * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
55  *
56  * Because write_inode() can be called at any time, i_size_dirty_count means
57  * something different to the runtime code - it means to write_inode() "don't
58  * update i_size yet".
59  *
60  * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
61  * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
62  * be set explicitly.
63  */
64
65 int __must_check __bch2_write_inode(struct bch_fs *c,
66                                     struct bch_inode_info *inode,
67                                     inode_set_fn set,
68                                     void *p)
69 {
70         struct btree_iter iter;
71         struct bch_inode_unpacked inode_u;
72         struct bkey_inode_buf inode_p;
73         u64 inum = inode->v.i_ino;
74         unsigned i_nlink = READ_ONCE(inode->v.i_nlink);
75         int ret;
76
77         /*
78          * We can't write an inode with i_nlink == 0 because it's stored biased;
79          * however, we don't need to because if i_nlink is 0 the inode is
80          * getting deleted when it's evicted.
81          */
82         if (!i_nlink)
83                 return 0;
84
85         lockdep_assert_held(&inode->ei_update_lock);
86
87         bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0),
88                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
89
90         do {
91                 struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
92
93                 if ((ret = btree_iter_err(k)))
94                         goto out;
95
96                 if (WARN_ONCE(k.k->type != BCH_INODE_FS,
97                               "inode %llu not found when updating", inum)) {
98                         bch2_btree_iter_unlock(&iter);
99                         return -ENOENT;
100                 }
101
102                 ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
103                 if (WARN_ONCE(ret,
104                               "error %i unpacking inode %llu", ret, inum)) {
105                         ret = -ENOENT;
106                         break;
107                 }
108
109                 if (set) {
110                         ret = set(inode, &inode_u, p);
111                         if (ret)
112                                 goto out;
113                 }
114
115                 BUG_ON(i_nlink < nlink_bias(inode->v.i_mode));
116
117                 inode_u.bi_mode = inode->v.i_mode;
118                 inode_u.bi_uid  = i_uid_read(&inode->v);
119                 inode_u.bi_gid  = i_gid_read(&inode->v);
120                 inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ];
121                 inode_u.bi_nlink= i_nlink - nlink_bias(inode->v.i_mode);
122                 inode_u.bi_dev  = inode->v.i_rdev;
123                 inode_u.bi_atime= timespec_to_bch2_time(c, inode->v.i_atime);
124                 inode_u.bi_mtime= timespec_to_bch2_time(c, inode->v.i_mtime);
125                 inode_u.bi_ctime= timespec_to_bch2_time(c, inode->v.i_ctime);
126
127                 bch2_inode_pack(&inode_p, &inode_u);
128
129                 ret = bch2_btree_insert_at(c, NULL, NULL,
130                                 &inode->ei_journal_seq,
131                                 BTREE_INSERT_ATOMIC|
132                                 BTREE_INSERT_NOFAIL,
133                                 BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
134         } while (ret == -EINTR);
135
136         if (!ret) {
137                 inode->ei_inode = inode_u;
138                 inode->ei_qid   = bch_qid(&inode_u);
139         }
140 out:
141         bch2_btree_iter_unlock(&iter);
142
143         return ret < 0 ? ret : 0;
144 }
145
146 int __must_check bch2_write_inode(struct bch_fs *c,
147                                   struct bch_inode_info *inode)
148 {
149         return __bch2_write_inode(c, inode, NULL, NULL);
150 }
151
152 static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
153 {
154         int ret;
155
156         mutex_lock(&inode->ei_update_lock);
157         inc_nlink(&inode->v);
158         ret = bch2_write_inode(c, inode);
159         mutex_unlock(&inode->ei_update_lock);
160
161         return ret;
162 }
163
164 static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
165 {
166         int ret = 0;
167
168         mutex_lock(&inode->ei_update_lock);
169         drop_nlink(&inode->v);
170         ret = bch2_write_inode(c, inode);
171         mutex_unlock(&inode->ei_update_lock);
172
173         return ret;
174 }
175
176 static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
177 {
178         struct bch_inode_unpacked inode_u;
179         struct bch_inode_info *inode;
180         int ret;
181
182         inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
183         if (unlikely(!inode))
184                 return ERR_PTR(-ENOMEM);
185         if (!(inode->v.i_state & I_NEW))
186                 return &inode->v;
187
188         ret = bch2_inode_find_by_inum(c, inum, &inode_u);
189         if (ret) {
190                 iget_failed(&inode->v);
191                 return ERR_PTR(ret);
192         }
193
194         bch2_vfs_inode_init(c, inode, &inode_u);
195
196         inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
197
198         unlock_new_inode(&inode->v);
199
200         return &inode->v;
201 }
202
203 static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
204                                                     struct bch_inode_info *dir,
205                                                     umode_t mode, dev_t rdev)
206 {
207         struct posix_acl *default_acl = NULL, *acl = NULL;
208         struct bch_inode_info *inode;
209         struct bch_inode_unpacked inode_u;
210         int ret;
211
212         inode = to_bch_ei(new_inode(c->vfs_sb));
213         if (unlikely(!inode))
214                 return ERR_PTR(-ENOMEM);
215
216         inode_init_owner(&inode->v, &dir->v, mode);
217
218 #ifdef CONFIG_BCACHEFS_POSIX_ACL
219         ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl);
220         if (ret)
221                 goto err_make_bad;
222 #endif
223
224         bch2_inode_init(c, &inode_u,
225                         i_uid_read(&inode->v),
226                         i_gid_read(&inode->v),
227                         inode->v.i_mode, rdev,
228                         &dir->ei_inode);
229
230         inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ];
231
232         ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
233         if (ret)
234                 goto err_make_bad;
235
236         ret = bch2_inode_create(c, &inode_u,
237                                 BLOCKDEV_INODE_MAX, 0,
238                                 &c->unused_inode_hint);
239         if (unlikely(ret))
240                 goto err_acct_quota;
241
242         bch2_vfs_inode_init(c, inode, &inode_u);
243         atomic_long_inc(&c->nr_inodes);
244
245         if (default_acl) {
246                 ret = bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
247                 if (unlikely(ret))
248                         goto err;
249         }
250
251         if (acl) {
252                 ret = bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
253                 if (unlikely(ret))
254                         goto err;
255         }
256
257         insert_inode_hash(&inode->v);
258 out:
259         posix_acl_release(default_acl);
260         posix_acl_release(acl);
261         return inode;
262 err_acct_quota:
263         bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
264 err_make_bad:
265         /*
266          * indicate to bch_evict_inode that the inode was never actually
267          * created:
268          */
269         make_bad_inode(&inode->v);
270 err:
271         clear_nlink(&inode->v);
272         iput(&inode->v);
273         inode = ERR_PTR(ret);
274         goto out;
275 }
276
277 static int bch2_vfs_dirent_create(struct bch_fs *c,
278                                   struct bch_inode_info *dir,
279                                   u8 type, const struct qstr *name,
280                                   u64 dst)
281 {
282         int ret;
283
284         ret = bch2_dirent_create(c, dir->v.i_ino, &dir->ei_str_hash,
285                                 type, name, dst,
286                                 &dir->ei_journal_seq,
287                                 BCH_HASH_SET_MUST_CREATE);
288         if (unlikely(ret))
289                 return ret;
290
291         dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
292         mark_inode_dirty_sync(&dir->v);
293         return 0;
294 }
295
296 static int __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
297                          umode_t mode, dev_t rdev)
298 {
299         struct bch_fs *c = dir->v.i_sb->s_fs_info;
300         struct bch_inode_info *inode;
301         int ret;
302
303         inode = bch2_vfs_inode_create(c, dir, mode, rdev);
304         if (unlikely(IS_ERR(inode)))
305                 return PTR_ERR(inode);
306
307         ret = bch2_vfs_dirent_create(c, dir, mode_to_type(mode),
308                                      &dentry->d_name, inode->v.i_ino);
309         if (unlikely(ret)) {
310                 clear_nlink(&inode->v);
311                 iput(&inode->v);
312                 return ret;
313         }
314
315         if (dir->ei_journal_seq > inode->ei_journal_seq)
316                 inode->ei_journal_seq = dir->ei_journal_seq;
317
318         d_instantiate(dentry, &inode->v);
319         return 0;
320 }
321
322 /* methods */
323
324 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
325                                   unsigned int flags)
326 {
327         struct bch_fs *c = vdir->i_sb->s_fs_info;
328         struct bch_inode_info *dir = to_bch_ei(vdir);
329         struct inode *vinode = NULL;
330         u64 inum;
331
332         inum = bch2_dirent_lookup(c, dir->v.i_ino,
333                                   &dir->ei_str_hash,
334                                   &dentry->d_name);
335
336         if (inum)
337                 vinode = bch2_vfs_inode_get(c, inum);
338
339         return d_splice_alias(vinode, dentry);
340 }
341
342 static int bch2_create(struct inode *vdir, struct dentry *dentry,
343                        umode_t mode, bool excl)
344 {
345         return __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0);
346 }
347
348 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
349                      struct dentry *dentry)
350 {
351         struct bch_fs *c = vdir->i_sb->s_fs_info;
352         struct bch_inode_info *dir = to_bch_ei(vdir);
353         struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
354         int ret;
355
356         lockdep_assert_held(&inode->v.i_rwsem);
357
358         inode->v.i_ctime = current_time(&dir->v);
359
360         ret = bch2_inc_nlink(c, inode);
361         if (ret)
362                 return ret;
363
364         ihold(&inode->v);
365
366         ret = bch2_vfs_dirent_create(c, dir, mode_to_type(inode->v.i_mode),
367                                      &dentry->d_name, inode->v.i_ino);
368         if (unlikely(ret)) {
369                 bch2_dec_nlink(c, inode);
370                 iput(&inode->v);
371                 return ret;
372         }
373
374         d_instantiate(dentry, &inode->v);
375         return 0;
376 }
377
378 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
379 {
380         struct bch_fs *c = vdir->i_sb->s_fs_info;
381         struct bch_inode_info *dir = to_bch_ei(vdir);
382         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
383         int ret;
384
385         lockdep_assert_held(&inode->v.i_rwsem);
386
387         ret = bch2_dirent_delete(c, dir->v.i_ino, &dir->ei_str_hash,
388                                  &dentry->d_name, &dir->ei_journal_seq);
389         if (ret)
390                 return ret;
391
392         if (dir->ei_journal_seq > inode->ei_journal_seq)
393                 inode->ei_journal_seq = dir->ei_journal_seq;
394
395         inode->v.i_ctime = dir->v.i_ctime;
396
397         if (S_ISDIR(inode->v.i_mode)) {
398                 bch2_dec_nlink(c, dir);
399                 drop_nlink(&inode->v);
400         }
401
402         bch2_dec_nlink(c, inode);
403
404         return 0;
405 }
406
407 static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
408                         const char *symname)
409 {
410         struct bch_fs *c = vdir->i_sb->s_fs_info;
411         struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
412         int ret;
413
414         inode = bch2_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
415         if (unlikely(IS_ERR(inode)))
416                 return PTR_ERR(inode);
417
418         inode_lock(&inode->v);
419         ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
420         inode_unlock(&inode->v);
421
422         if (unlikely(ret))
423                 goto err;
424
425         ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
426         if (unlikely(ret))
427                 goto err;
428
429         /* XXX: racy */
430         if (dir->ei_journal_seq < inode->ei_journal_seq)
431                 dir->ei_journal_seq = inode->ei_journal_seq;
432
433         ret = bch2_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name,
434                                      inode->v.i_ino);
435         if (unlikely(ret))
436                 goto err;
437
438         d_instantiate(dentry, &inode->v);
439         return 0;
440 err:
441         clear_nlink(&inode->v);
442         iput(&inode->v);
443         return ret;
444 }
445
446 static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
447 {
448         struct bch_fs *c = vdir->i_sb->s_fs_info;
449         struct bch_inode_info *dir = to_bch_ei(vdir);
450         int ret;
451
452         lockdep_assert_held(&dir->v.i_rwsem);
453
454         ret = __bch2_create(dir, dentry, mode|S_IFDIR, 0);
455         if (unlikely(ret))
456                 return ret;
457
458         bch2_inc_nlink(c, dir);
459
460         return 0;
461 }
462
463 static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
464 {
465         struct bch_fs *c = vdir->i_sb->s_fs_info;
466
467         if (bch2_empty_dir(c, dentry->d_inode->i_ino))
468                 return -ENOTEMPTY;
469
470         return bch2_unlink(vdir, dentry);
471 }
472
473 static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
474                       umode_t mode, dev_t rdev)
475 {
476         return __bch2_create(to_bch_ei(vdir), dentry, mode, rdev);
477 }
478
479 static int bch2_rename(struct bch_fs *c,
480                        struct bch_inode_info *old_dir,
481                        struct dentry *old_dentry,
482                        struct bch_inode_info *new_dir,
483                        struct dentry *new_dentry)
484 {
485         struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
486         struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
487         struct timespec now = current_time(&old_dir->v);
488         int ret;
489
490         lockdep_assert_held(&old_dir->v.i_rwsem);
491         lockdep_assert_held(&new_dir->v.i_rwsem);
492
493         if (new_inode)
494                 filemap_write_and_wait_range(old_inode->v.i_mapping,
495                                              0, LLONG_MAX);
496
497         if (new_inode && S_ISDIR(old_inode->v.i_mode)) {
498                 lockdep_assert_held(&new_inode->v.i_rwsem);
499
500                 if (!S_ISDIR(new_inode->v.i_mode))
501                         return -ENOTDIR;
502
503                 if (bch2_empty_dir(c, new_inode->v.i_ino))
504                         return -ENOTEMPTY;
505
506                 ret = bch2_dirent_rename(c,
507                                 old_dir, &old_dentry->d_name,
508                                 new_dir, &new_dentry->d_name,
509                                 &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
510                 if (unlikely(ret))
511                         return ret;
512
513                 clear_nlink(&new_inode->v);
514                 bch2_dec_nlink(c, old_dir);
515         } else if (new_inode) {
516                 lockdep_assert_held(&new_inode->v.i_rwsem);
517
518                 ret = bch2_dirent_rename(c,
519                                 old_dir, &old_dentry->d_name,
520                                 new_dir, &new_dentry->d_name,
521                                 &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
522                 if (unlikely(ret))
523                         return ret;
524
525                 new_inode->v.i_ctime = now;
526                 bch2_dec_nlink(c, new_inode);
527         } else if (S_ISDIR(old_inode->v.i_mode)) {
528                 ret = bch2_dirent_rename(c,
529                                 old_dir, &old_dentry->d_name,
530                                 new_dir, &new_dentry->d_name,
531                                 &old_inode->ei_journal_seq, BCH_RENAME);
532                 if (unlikely(ret))
533                         return ret;
534
535                 bch2_inc_nlink(c, new_dir);
536                 bch2_dec_nlink(c, old_dir);
537         } else {
538                 ret = bch2_dirent_rename(c,
539                                 old_dir, &old_dentry->d_name,
540                                 new_dir, &new_dentry->d_name,
541                                 &old_inode->ei_journal_seq, BCH_RENAME);
542                 if (unlikely(ret))
543                         return ret;
544         }
545
546         old_dir->v.i_ctime = old_dir->v.i_mtime = now;
547         new_dir->v.i_ctime = new_dir->v.i_mtime = now;
548         mark_inode_dirty_sync(&old_dir->v);
549         mark_inode_dirty_sync(&new_dir->v);
550
551         old_inode->v.i_ctime = now;
552         mark_inode_dirty_sync(&old_inode->v);
553
554         return 0;
555 }
556
557 static int bch2_rename_exchange(struct bch_fs *c,
558                                 struct bch_inode_info *old_dir,
559                                 struct dentry *old_dentry,
560                                 struct bch_inode_info *new_dir,
561                                 struct dentry *new_dentry)
562 {
563         struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
564         struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
565         struct timespec now = current_time(&old_dir->v);
566         int ret;
567
568         ret = bch2_dirent_rename(c,
569                                  old_dir, &old_dentry->d_name,
570                                  new_dir, &new_dentry->d_name,
571                                  &old_inode->ei_journal_seq, BCH_RENAME_EXCHANGE);
572         if (unlikely(ret))
573                 return ret;
574
575         if (S_ISDIR(old_inode->v.i_mode) !=
576             S_ISDIR(new_inode->v.i_mode)) {
577                 if (S_ISDIR(old_inode->v.i_mode)) {
578                         bch2_inc_nlink(c, new_dir);
579                         bch2_dec_nlink(c, old_dir);
580                 } else {
581                         bch2_dec_nlink(c, new_dir);
582                         bch2_inc_nlink(c, old_dir);
583                 }
584         }
585
586         old_dir->v.i_ctime = old_dir->v.i_mtime = now;
587         new_dir->v.i_ctime = new_dir->v.i_mtime = now;
588         mark_inode_dirty_sync(&old_dir->v);
589         mark_inode_dirty_sync(&new_dir->v);
590
591         old_inode->v.i_ctime = now;
592         new_inode->v.i_ctime = now;
593         mark_inode_dirty_sync(&old_inode->v);
594         mark_inode_dirty_sync(&new_inode->v);
595
596         return 0;
597 }
598
599 static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry,
600                         struct inode *new_vdir, struct dentry *new_dentry,
601                         unsigned flags)
602 {
603         struct bch_fs *c = old_vdir->i_sb->s_fs_info;
604         struct bch_inode_info *old_dir = to_bch_ei(old_vdir);
605         struct bch_inode_info *new_dir = to_bch_ei(new_vdir);
606
607         if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
608                 return -EINVAL;
609
610         if (flags & RENAME_EXCHANGE)
611                 return bch2_rename_exchange(c, old_dir, old_dentry,
612                                             new_dir, new_dentry);
613
614         return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry);
615 }
616
617 static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
618 {
619         struct bch_fs *c = inode->v.i_sb->s_fs_info;
620         struct bch_qid qid = inode->ei_qid;
621         unsigned qtypes = 0;
622         int ret;
623
624         mutex_lock(&inode->ei_update_lock);
625
626         if (c->opts.usrquota &&
627             (iattr->ia_valid & ATTR_UID) &&
628             !uid_eq(iattr->ia_uid, inode->v.i_uid)) {
629                 qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid),
630                 qtypes |= 1 << QTYP_USR;
631         }
632
633         if (c->opts.grpquota &&
634             (iattr->ia_valid & ATTR_GID) &&
635             !gid_eq(iattr->ia_gid, inode->v.i_gid)) {
636                 qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid);
637                 qtypes |= 1 << QTYP_GRP;
638         }
639
640         if (qtypes) {
641                 ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid,
642                                           inode->v.i_blocks +
643                                           inode->ei_quota_reserved);
644                 if (ret)
645                         goto out_unlock;
646         }
647
648         setattr_copy(&inode->v, iattr);
649
650         ret = bch2_write_inode(c, inode);
651 out_unlock:
652         mutex_unlock(&inode->ei_update_lock);
653
654         if (!ret &&
655             iattr->ia_valid & ATTR_MODE)
656                 ret = posix_acl_chmod(&inode->v, inode->v.i_mode);
657
658         return ret;
659 }
660
661 static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
662 {
663         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
664         int ret;
665
666         lockdep_assert_held(&inode->v.i_rwsem);
667
668         ret = setattr_prepare(dentry, iattr);
669         if (ret)
670                 return ret;
671
672         return iattr->ia_valid & ATTR_SIZE
673                 ? bch2_truncate(inode, iattr)
674                 : bch2_setattr_nonsize(inode, iattr);
675 }
676
677 static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
678 {
679         struct bch_fs *c = vdir->i_sb->s_fs_info;
680         struct bch_inode_info *dir = to_bch_ei(vdir);
681         struct bch_inode_info *inode;
682
683         /* XXX: i_nlink should be 0? */
684         inode = bch2_vfs_inode_create(c, dir, mode, 0);
685         if (unlikely(IS_ERR(inode)))
686                 return PTR_ERR(inode);
687
688         d_tmpfile(dentry, &inode->v);
689         return 0;
690 }
691
692 static int bch2_fill_extent(struct fiemap_extent_info *info,
693                             const struct bkey_i *k, unsigned flags)
694 {
695         if (bkey_extent_is_data(&k->k)) {
696                 struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
697                 const struct bch_extent_ptr *ptr;
698                 struct bch_extent_crc_unpacked crc;
699                 int ret;
700
701                 extent_for_each_ptr_crc(e, ptr, crc) {
702                         int flags2 = 0;
703                         u64 offset = ptr->offset;
704
705                         if (crc.compression_type)
706                                 flags2 |= FIEMAP_EXTENT_ENCODED;
707                         else
708                                 offset += crc.offset;
709
710                         if ((offset & (PAGE_SECTORS - 1)) ||
711                             (e.k->size & (PAGE_SECTORS - 1)))
712                                 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
713
714                         ret = fiemap_fill_next_extent(info,
715                                                       bkey_start_offset(e.k) << 9,
716                                                       offset << 9,
717                                                       e.k->size << 9, flags|flags2);
718                         if (ret)
719                                 return ret;
720                 }
721
722                 return 0;
723         } else if (k->k.type == BCH_RESERVATION) {
724                 return fiemap_fill_next_extent(info,
725                                                bkey_start_offset(&k->k) << 9,
726                                                0, k->k.size << 9,
727                                                flags|
728                                                FIEMAP_EXTENT_DELALLOC|
729                                                FIEMAP_EXTENT_UNWRITTEN);
730         } else {
731                 BUG();
732         }
733 }
734
735 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
736                        u64 start, u64 len)
737 {
738         struct bch_fs *c = vinode->i_sb->s_fs_info;
739         struct bch_inode_info *ei = to_bch_ei(vinode);
740         struct btree_iter iter;
741         struct bkey_s_c k;
742         BKEY_PADDED(k) tmp;
743         bool have_extent = false;
744         int ret = 0;
745
746         if (start + len < start)
747                 return -EINVAL;
748
749         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
750                            POS(ei->v.i_ino, start >> 9), 0, k)
751                 if (bkey_extent_is_data(k.k) ||
752                     k.k->type == BCH_RESERVATION) {
753                         if (bkey_cmp(bkey_start_pos(k.k),
754                                      POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
755                                 break;
756
757                         if (have_extent) {
758                                 ret = bch2_fill_extent(info, &tmp.k, 0);
759                                 if (ret)
760                                         goto out;
761                         }
762
763                         bkey_reassemble(&tmp.k, k);
764                         have_extent = true;
765                 }
766
767         if (have_extent)
768                 ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
769 out:
770         bch2_btree_iter_unlock(&iter);
771         return ret < 0 ? ret : 0;
772 }
773
774 static const struct vm_operations_struct bch_vm_ops = {
775         .fault          = filemap_fault,
776         .map_pages      = filemap_map_pages,
777         .page_mkwrite   = bch2_page_mkwrite,
778 };
779
780 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
781 {
782         file_accessed(file);
783
784         vma->vm_ops = &bch_vm_ops;
785         return 0;
786 }
787
788 /* Directories: */
789
790 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
791 {
792         return generic_file_llseek_size(file, offset, whence,
793                                         S64_MAX, S64_MAX);
794 }
795
796 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
797 {
798         struct bch_fs *c = file_inode(file)->i_sb->s_fs_info;
799
800         return bch2_readdir(c, file, ctx);
801 }
802
803 static const struct file_operations bch_file_operations = {
804         .llseek         = bch2_llseek,
805         .read_iter      = generic_file_read_iter,
806         .write_iter     = bch2_write_iter,
807         .mmap           = bch2_mmap,
808         .open           = generic_file_open,
809         .fsync          = bch2_fsync,
810         .splice_read    = generic_file_splice_read,
811         .splice_write   = iter_file_splice_write,
812         .fallocate      = bch2_fallocate_dispatch,
813         .unlocked_ioctl = bch2_fs_file_ioctl,
814 #ifdef CONFIG_COMPAT
815         .compat_ioctl   = bch2_compat_fs_ioctl,
816 #endif
817 };
818
819 static const struct inode_operations bch_file_inode_operations = {
820         .setattr        = bch2_setattr,
821         .fiemap         = bch2_fiemap,
822         .listxattr      = bch2_xattr_list,
823 #ifdef CONFIG_BCACHEFS_POSIX_ACL
824         .get_acl        = bch2_get_acl,
825         .set_acl        = bch2_set_acl,
826 #endif
827 };
828
829 static const struct inode_operations bch_dir_inode_operations = {
830         .lookup         = bch2_lookup,
831         .create         = bch2_create,
832         .link           = bch2_link,
833         .unlink         = bch2_unlink,
834         .symlink        = bch2_symlink,
835         .mkdir          = bch2_mkdir,
836         .rmdir          = bch2_rmdir,
837         .mknod          = bch2_mknod,
838         .rename         = bch2_rename2,
839         .setattr        = bch2_setattr,
840         .tmpfile        = bch2_tmpfile,
841         .listxattr      = bch2_xattr_list,
842 #ifdef CONFIG_BCACHEFS_POSIX_ACL
843         .get_acl        = bch2_get_acl,
844         .set_acl        = bch2_set_acl,
845 #endif
846 };
847
848 static const struct file_operations bch_dir_file_operations = {
849         .llseek         = bch2_dir_llseek,
850         .read           = generic_read_dir,
851         .iterate        = bch2_vfs_readdir,
852         .fsync          = bch2_fsync,
853         .unlocked_ioctl = bch2_fs_file_ioctl,
854 #ifdef CONFIG_COMPAT
855         .compat_ioctl   = bch2_compat_fs_ioctl,
856 #endif
857 };
858
859 static const struct inode_operations bch_symlink_inode_operations = {
860         .get_link       = page_get_link,
861         .setattr        = bch2_setattr,
862         .listxattr      = bch2_xattr_list,
863 #ifdef CONFIG_BCACHEFS_POSIX_ACL
864         .get_acl        = bch2_get_acl,
865         .set_acl        = bch2_set_acl,
866 #endif
867 };
868
869 static const struct inode_operations bch_special_inode_operations = {
870         .setattr        = bch2_setattr,
871         .listxattr      = bch2_xattr_list,
872 #ifdef CONFIG_BCACHEFS_POSIX_ACL
873         .get_acl        = bch2_get_acl,
874         .set_acl        = bch2_set_acl,
875 #endif
876 };
877
878 static const struct address_space_operations bch_address_space_operations = {
879         .writepage      = bch2_writepage,
880         .readpage       = bch2_readpage,
881         .writepages     = bch2_writepages,
882         .readpages      = bch2_readpages,
883         .set_page_dirty = bch2_set_page_dirty,
884         .write_begin    = bch2_write_begin,
885         .write_end      = bch2_write_end,
886         .invalidatepage = bch2_invalidatepage,
887         .releasepage    = bch2_releasepage,
888         .direct_IO      = bch2_direct_IO,
889 #ifdef CONFIG_MIGRATION
890         .migratepage    = bch2_migrate_page,
891 #endif
892         .error_remove_page = generic_error_remove_page,
893 };
894
895 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
896                 u64 ino, u32 generation)
897 {
898         struct bch_fs *c = sb->s_fs_info;
899         struct inode *vinode;
900
901         if (ino < BCACHEFS_ROOT_INO)
902                 return ERR_PTR(-ESTALE);
903
904         vinode = bch2_vfs_inode_get(c, ino);
905         if (IS_ERR(vinode))
906                 return ERR_CAST(vinode);
907         if (generation && vinode->i_generation != generation) {
908                 /* we didn't find the right inode.. */
909                 iput(vinode);
910                 return ERR_PTR(-ESTALE);
911         }
912         return vinode;
913 }
914
915 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
916                 int fh_len, int fh_type)
917 {
918         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
919                                     bch2_nfs_get_inode);
920 }
921
922 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
923                 int fh_len, int fh_type)
924 {
925         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
926                                     bch2_nfs_get_inode);
927 }
928
929 static const struct export_operations bch_export_ops = {
930         .fh_to_dentry   = bch2_fh_to_dentry,
931         .fh_to_parent   = bch2_fh_to_parent,
932         //.get_parent   = bch2_get_parent,
933 };
934
935 static void bch2_vfs_inode_init(struct bch_fs *c,
936                                 struct bch_inode_info *inode,
937                                 struct bch_inode_unpacked *bi)
938 {
939         inode->v.i_mode         = bi->bi_mode;
940         i_uid_write(&inode->v, bi->bi_uid);
941         i_gid_write(&inode->v, bi->bi_gid);
942         inode->v.i_blocks       = bi->bi_sectors;
943         inode->v.i_ino          = bi->bi_inum;
944         set_nlink(&inode->v, bi->bi_nlink + nlink_bias(inode->v.i_mode));
945         inode->v.i_rdev         = bi->bi_dev;
946         inode->v.i_generation   = bi->bi_generation;
947         inode->v.i_size         = bi->bi_size;
948         inode->v.i_atime        = bch2_time_to_timespec(c, bi->bi_atime);
949         inode->v.i_mtime        = bch2_time_to_timespec(c, bi->bi_mtime);
950         inode->v.i_ctime        = bch2_time_to_timespec(c, bi->bi_ctime);
951
952         inode->ei_journal_seq   = 0;
953         inode->ei_quota_reserved = 0;
954         inode->ei_qid           = bch_qid(bi);
955         inode->ei_str_hash      = bch2_hash_info_init(c, bi);
956         inode->ei_inode         = *bi;
957
958         bch2_inode_flags_to_vfs(inode);
959
960         inode->v.i_mapping->a_ops = &bch_address_space_operations;
961
962         switch (inode->v.i_mode & S_IFMT) {
963         case S_IFREG:
964                 inode->v.i_op   = &bch_file_inode_operations;
965                 inode->v.i_fop  = &bch_file_operations;
966                 break;
967         case S_IFDIR:
968                 inode->v.i_op   = &bch_dir_inode_operations;
969                 inode->v.i_fop  = &bch_dir_file_operations;
970                 break;
971         case S_IFLNK:
972                 inode_nohighmem(&inode->v);
973                 inode->v.i_op   = &bch_symlink_inode_operations;
974                 break;
975         default:
976                 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
977                 inode->v.i_op   = &bch_special_inode_operations;
978                 break;
979         }
980 }
981
982 static struct inode *bch2_alloc_inode(struct super_block *sb)
983 {
984         struct bch_inode_info *inode;
985
986         inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
987         if (!inode)
988                 return NULL;
989
990         inode_init_once(&inode->v);
991         mutex_init(&inode->ei_update_lock);
992         inode->ei_journal_seq = 0;
993
994         return &inode->v;
995 }
996
997 static void bch2_i_callback(struct rcu_head *head)
998 {
999         struct inode *vinode = container_of(head, struct inode, i_rcu);
1000         struct bch_inode_info *inode = to_bch_ei(vinode);
1001
1002         kmem_cache_free(bch2_inode_cache, inode);
1003 }
1004
1005 static void bch2_destroy_inode(struct inode *vinode)
1006 {
1007         call_rcu(&vinode->i_rcu, bch2_i_callback);
1008 }
1009
1010 static int bch2_vfs_write_inode(struct inode *vinode,
1011                                 struct writeback_control *wbc)
1012 {
1013         struct bch_fs *c = vinode->i_sb->s_fs_info;
1014         struct bch_inode_info *inode = to_bch_ei(vinode);
1015         int ret;
1016
1017         mutex_lock(&inode->ei_update_lock);
1018         ret = bch2_write_inode(c, inode);
1019         mutex_unlock(&inode->ei_update_lock);
1020
1021         if (c->opts.journal_flush_disabled)
1022                 return ret;
1023
1024         if (!ret && wbc->sync_mode == WB_SYNC_ALL)
1025                 ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
1026
1027         return ret;
1028 }
1029
1030 static void bch2_evict_inode(struct inode *vinode)
1031 {
1032         struct bch_fs *c = vinode->i_sb->s_fs_info;
1033         struct bch_inode_info *inode = to_bch_ei(vinode);
1034
1035         truncate_inode_pages_final(&inode->v.i_data);
1036
1037         clear_inode(&inode->v);
1038
1039         BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1040
1041         if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1042                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1043                                 BCH_QUOTA_WARN);
1044                 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1045                                 BCH_QUOTA_WARN);
1046                 bch2_inode_rm(c, inode->v.i_ino);
1047                 atomic_long_dec(&c->nr_inodes);
1048         }
1049 }
1050
1051 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1052 {
1053         struct super_block *sb = dentry->d_sb;
1054         struct bch_fs *c = sb->s_fs_info;
1055         u64 fsid;
1056
1057         buf->f_type     = BCACHEFS_STATFS_MAGIC;
1058         buf->f_bsize    = sb->s_blocksize;
1059         buf->f_blocks   = c->capacity >> PAGE_SECTOR_SHIFT;
1060         buf->f_bfree    = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >>
1061                            PAGE_SECTOR_SHIFT;
1062         buf->f_bavail   = buf->f_bfree;
1063         buf->f_files    = atomic_long_read(&c->nr_inodes);
1064         buf->f_ffree    = U64_MAX;
1065
1066         fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
1067                le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
1068         buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1069         buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1070         buf->f_namelen  = NAME_MAX;
1071
1072         return 0;
1073 }
1074
1075 static int bch2_sync_fs(struct super_block *sb, int wait)
1076 {
1077         struct bch_fs *c = sb->s_fs_info;
1078
1079         if (!wait) {
1080                 bch2_journal_flush_async(&c->journal, NULL);
1081                 return 0;
1082         }
1083
1084         return bch2_journal_flush(&c->journal);
1085 }
1086
1087 static struct bch_fs *bch2_path_to_fs(const char *dev)
1088 {
1089         struct bch_fs *c;
1090         struct block_device *bdev = lookup_bdev(dev);
1091
1092         if (IS_ERR(bdev))
1093                 return ERR_CAST(bdev);
1094
1095         c = bch2_bdev_to_fs(bdev);
1096         bdput(bdev);
1097         return c ?: ERR_PTR(-ENOENT);
1098 }
1099
1100 static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
1101                                                unsigned nr_devs, struct bch_opts opts)
1102 {
1103         struct bch_fs *c, *c1, *c2;
1104         size_t i;
1105
1106         if (!nr_devs)
1107                 return ERR_PTR(-EINVAL);
1108
1109         c = bch2_fs_open(devs, nr_devs, opts);
1110
1111         if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
1112                 /*
1113                  * Already open?
1114                  * Look up each block device, make sure they all belong to a
1115                  * filesystem and they all belong to the _same_ filesystem
1116                  */
1117
1118                 c1 = bch2_path_to_fs(devs[0]);
1119                 if (!c1)
1120                         return c;
1121
1122                 for (i = 1; i < nr_devs; i++) {
1123                         c2 = bch2_path_to_fs(devs[i]);
1124                         if (!IS_ERR(c2))
1125                                 closure_put(&c2->cl);
1126
1127                         if (c1 != c2) {
1128                                 closure_put(&c1->cl);
1129                                 return c;
1130                         }
1131                 }
1132
1133                 c = c1;
1134         }
1135
1136         if (IS_ERR(c))
1137                 return c;
1138
1139         mutex_lock(&c->state_lock);
1140
1141         if (!bch2_fs_running(c)) {
1142                 mutex_unlock(&c->state_lock);
1143                 closure_put(&c->cl);
1144                 pr_err("err mounting %s: incomplete filesystem", dev_name);
1145                 return ERR_PTR(-EINVAL);
1146         }
1147
1148         mutex_unlock(&c->state_lock);
1149
1150         set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
1151         return c;
1152 }
1153
1154 static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
1155                                              struct bch_opts opts)
1156 {
1157         char *dev_name = NULL, **devs = NULL, *s;
1158         struct bch_fs *c = ERR_PTR(-ENOMEM);
1159         size_t i, nr_devs = 0;
1160
1161         dev_name = kstrdup(_dev_name, GFP_KERNEL);
1162         if (!dev_name)
1163                 goto err;
1164
1165         for (s = dev_name; s; s = strchr(s + 1, ':'))
1166                 nr_devs++;
1167
1168         devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
1169         if (!devs)
1170                 goto err;
1171
1172         for (i = 0, s = dev_name;
1173              s;
1174              (s = strchr(s, ':')) && (*s++ = '\0'))
1175                 devs[i++] = s;
1176
1177         c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
1178 err:
1179         kfree(devs);
1180         kfree(dev_name);
1181         return c;
1182 }
1183
1184 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1185 {
1186         struct bch_fs *c = sb->s_fs_info;
1187         struct bch_opts opts = bch2_opts_empty();
1188         int ret;
1189
1190         opt_set(opts, read_only, (*flags & MS_RDONLY) != 0);
1191
1192         ret = bch2_parse_mount_opts(&opts, data);
1193         if (ret)
1194                 return ret;
1195
1196         if (opts.read_only != c->opts.read_only) {
1197                 const char *err = NULL;
1198
1199                 mutex_lock(&c->state_lock);
1200
1201                 if (opts.read_only) {
1202                         bch2_fs_read_only(c);
1203
1204                         sb->s_flags |= MS_RDONLY;
1205                 } else {
1206                         err = bch2_fs_read_write(c);
1207                         if (err) {
1208                                 bch_err(c, "error going rw: %s", err);
1209                                 return -EINVAL;
1210                         }
1211
1212                         sb->s_flags &= ~MS_RDONLY;
1213                 }
1214
1215                 c->opts.read_only = opts.read_only;
1216
1217                 mutex_unlock(&c->state_lock);
1218         }
1219
1220         if (opts.errors >= 0)
1221                 c->opts.errors = opts.errors;
1222
1223         return ret;
1224 }
1225
1226 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1227 {
1228         struct bch_fs *c = root->d_sb->s_fs_info;
1229         enum bch_opt_id i;
1230
1231         for (i = 0; i < bch2_opts_nr; i++) {
1232                 const struct bch_option *opt = &bch2_opt_table[i];
1233                 u64 v = bch2_opt_get_by_id(&c->opts, i);
1234
1235                 if (opt->mode < OPT_MOUNT)
1236                         continue;
1237
1238                 if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1239                         continue;
1240
1241                 switch (opt->type) {
1242                 case BCH_OPT_BOOL:
1243                         seq_printf(seq, ",%s%s", v ? "" : "no", opt->attr.name);
1244                         break;
1245                 case BCH_OPT_UINT:
1246                         seq_printf(seq, ",%s=%llu", opt->attr.name, v);
1247                         break;
1248                 case BCH_OPT_STR:
1249                         seq_printf(seq, ",%s=%s", opt->attr.name, opt->choices[v]);
1250                         break;
1251                 }
1252         }
1253
1254         return 0;
1255
1256 }
1257
1258 static const struct super_operations bch_super_operations = {
1259         .alloc_inode    = bch2_alloc_inode,
1260         .destroy_inode  = bch2_destroy_inode,
1261         .write_inode    = bch2_vfs_write_inode,
1262         .evict_inode    = bch2_evict_inode,
1263         .sync_fs        = bch2_sync_fs,
1264         .statfs         = bch2_statfs,
1265         .show_options   = bch2_show_options,
1266         .remount_fs     = bch2_remount,
1267 #if 0
1268         .put_super      = bch2_put_super,
1269         .freeze_fs      = bch2_freeze,
1270         .unfreeze_fs    = bch2_unfreeze,
1271 #endif
1272 };
1273
1274 static int bch2_test_super(struct super_block *s, void *data)
1275 {
1276         return s->s_fs_info == data;
1277 }
1278
1279 static int bch2_set_super(struct super_block *s, void *data)
1280 {
1281         s->s_fs_info = data;
1282         return 0;
1283 }
1284
1285 static struct dentry *bch2_mount(struct file_system_type *fs_type,
1286                                  int flags, const char *dev_name, void *data)
1287 {
1288         struct bch_fs *c;
1289         struct bch_dev *ca;
1290         struct super_block *sb;
1291         struct inode *vinode;
1292         struct bch_opts opts = bch2_opts_empty();
1293         unsigned i;
1294         int ret;
1295
1296         opt_set(opts, read_only, (flags & MS_RDONLY) != 0);
1297
1298         ret = bch2_parse_mount_opts(&opts, data);
1299         if (ret)
1300                 return ERR_PTR(ret);
1301
1302         c = bch2_open_as_blockdevs(dev_name, opts);
1303         if (IS_ERR(c))
1304                 return ERR_CAST(c);
1305
1306         sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c);
1307         if (IS_ERR(sb)) {
1308                 closure_put(&c->cl);
1309                 return ERR_CAST(sb);
1310         }
1311
1312         BUG_ON(sb->s_fs_info != c);
1313
1314         if (sb->s_root) {
1315                 closure_put(&c->cl);
1316
1317                 if ((flags ^ sb->s_flags) & MS_RDONLY) {
1318                         ret = -EBUSY;
1319                         goto err_put_super;
1320                 }
1321                 goto out;
1322         }
1323
1324         /* XXX: blocksize */
1325         sb->s_blocksize         = PAGE_SIZE;
1326         sb->s_blocksize_bits    = PAGE_SHIFT;
1327         sb->s_maxbytes          = MAX_LFS_FILESIZE;
1328         sb->s_op                = &bch_super_operations;
1329         sb->s_export_op         = &bch_export_ops;
1330 #ifdef CONFIG_BCACHEFS_QUOTA
1331         sb->s_qcop              = &bch2_quotactl_operations;
1332         sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
1333 #endif
1334         sb->s_xattr             = bch2_xattr_handlers;
1335         sb->s_magic             = BCACHEFS_STATFS_MAGIC;
1336         sb->s_time_gran         = c->sb.time_precision;
1337         c->vfs_sb               = sb;
1338         strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
1339
1340         ret = super_setup_bdi(sb);
1341         if (ret)
1342                 goto err_put_super;
1343
1344         sb->s_bdi->congested_fn         = bch2_congested;
1345         sb->s_bdi->congested_data       = c;
1346         sb->s_bdi->ra_pages             = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
1347
1348         for_each_online_member(ca, c, i) {
1349                 struct block_device *bdev = ca->disk_sb.bdev;
1350
1351                 /* XXX: create an anonymous device for multi device filesystems */
1352                 sb->s_bdev      = bdev;
1353                 sb->s_dev       = bdev->bd_dev;
1354                 percpu_ref_put(&ca->io_ref);
1355                 break;
1356         }
1357
1358 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1359         if (c->opts.acl)
1360                 sb->s_flags     |= MS_POSIXACL;
1361 #endif
1362
1363         vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
1364         if (IS_ERR(vinode)) {
1365                 ret = PTR_ERR(vinode);
1366                 goto err_put_super;
1367         }
1368
1369         sb->s_root = d_make_root(vinode);
1370         if (!sb->s_root) {
1371                 ret = -ENOMEM;
1372                 goto err_put_super;
1373         }
1374
1375         sb->s_flags |= MS_ACTIVE;
1376 out:
1377         return dget(sb->s_root);
1378
1379 err_put_super:
1380         deactivate_locked_super(sb);
1381         return ERR_PTR(ret);
1382 }
1383
1384 static void bch2_kill_sb(struct super_block *sb)
1385 {
1386         struct bch_fs *c = sb->s_fs_info;
1387
1388         generic_shutdown_super(sb);
1389
1390         if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
1391                 bch2_fs_stop(c);
1392         else
1393                 closure_put(&c->cl);
1394 }
1395
1396 static struct file_system_type bcache_fs_type = {
1397         .owner          = THIS_MODULE,
1398         .name           = "bcachefs",
1399         .mount          = bch2_mount,
1400         .kill_sb        = bch2_kill_sb,
1401         .fs_flags       = FS_REQUIRES_DEV,
1402 };
1403
1404 MODULE_ALIAS_FS("bcachefs");
1405
1406 void bch2_vfs_exit(void)
1407 {
1408         unregister_filesystem(&bcache_fs_type);
1409         if (bch2_inode_cache)
1410                 kmem_cache_destroy(bch2_inode_cache);
1411 }
1412
1413 int __init bch2_vfs_init(void)
1414 {
1415         int ret = -ENOMEM;
1416
1417         bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
1418         if (!bch2_inode_cache)
1419                 goto err;
1420
1421         ret = register_filesystem(&bcache_fs_type);
1422         if (ret)
1423                 goto err;
1424
1425         return 0;
1426 err:
1427         bch2_vfs_exit();
1428         return ret;
1429 }
1430
1431 #endif /* NO_BCACHEFS_FS */