#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "acl.h"
#include "btree_update.h"
#include "buckets.h"
#include "chardev.h"
#include "dirent.h"
#include "extents.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-ioctl.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
#include "journal.h"
#include "keylist.h"
#include "quota.h"
#include "super.h"
#include "xattr.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/module.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/statfs.h>
#include <linux/xattr.h>

static struct kmem_cache *bch2_inode_cache;

static void bch2_vfs_inode_init(struct bch_fs *,
                                struct bch_inode_info *,
                                struct bch_inode_unpacked *);

/*
 * I_SIZE_DIRTY requires special handling:
 *
 * To the recovery code, the flag means that there is stale data past i_size
 * that needs to be deleted; it's used for implementing atomic appends and
 * truncates.
 *
 * On append, we set I_SIZE_DIRTY before doing the write, then after the write
 * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
 * that exposes the data we just wrote.
 *
 * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
 * i_size to the new smaller size, then we delete the data that we just made
 * invisible, and then we clear I_SIZE_DIRTY.
 *
 * Because there can be multiple appends in flight at a time, we need a refcount
 * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
 * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
 *
 * Because write_inode() can be called at any time, i_size_dirty_count means
 * something different to the runtime code - it means to write_inode() "don't
 * update i_size yet".
 *
 * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
 * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
 * be set explicitly.
 */

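/*
 * Write the in-memory VFS inode back to the btree: re-read and unpack the
 * current inode key, apply the optional @set callback, copy ownership, link
 * count and timestamps from the VFS inode, then repack and do an atomic
 * btree update, retrying on -EINTR.
 */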
int __must_check __bch2_write_inode(struct bch_fs *c,
                                    struct bch_inode_info *inode,
                                    inode_set_fn set,
                                    void *p)
{
        struct btree_iter iter;
        struct bch_inode_unpacked inode_u;
        struct bkey_inode_buf inode_p;
        u64 inum = inode->v.i_ino;
        unsigned i_nlink = READ_ONCE(inode->v.i_nlink);
        int ret;

        /*
         * We can't write an inode with i_nlink == 0 because it's stored biased;
         * however, we don't need to because if i_nlink is 0 the inode is
         * getting deleted when it's evicted.
         */
        if (!i_nlink)
                return 0;

        lockdep_assert_held(&inode->ei_update_lock);

        bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0),
                             BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

        do {
                struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);

                if ((ret = btree_iter_err(k)))
                        goto out;

                if (WARN_ONCE(k.k->type != BCH_INODE_FS,
                              "inode %llu not found when updating", inum)) {
                        bch2_btree_iter_unlock(&iter);
                        return -ENOENT;
                }

                ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
                if (WARN_ONCE(ret,
                              "error %i unpacking inode %llu", ret, inum)) {
                        ret = -ENOENT;
                        break;
                }

                BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size);

                if (set) {
                        ret = set(inode, &inode_u, p);
                        if (ret)
                                goto out;
                }

                BUG_ON(i_nlink < nlink_bias(inode->v.i_mode));

                BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size &&
                       !(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
                       inode_u.bi_size > i_size_read(&inode->v));

                inode_u.bi_mode = inode->v.i_mode;
                inode_u.bi_uid  = i_uid_read(&inode->v);
                inode_u.bi_gid  = i_gid_read(&inode->v);
                inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ];
                inode_u.bi_nlink = i_nlink - nlink_bias(inode->v.i_mode);
                inode_u.bi_dev  = inode->v.i_rdev;
                inode_u.bi_atime = timespec_to_bch2_time(c, inode->v.i_atime);
                inode_u.bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime);
                inode_u.bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime);

                bch2_inode_pack(&inode_p, &inode_u);

                ret = bch2_btree_insert_at(c, NULL, NULL,
                                &inode->ei_journal_seq,
                                BTREE_INSERT_ATOMIC|
                                BTREE_INSERT_NOUNLOCK|
                                BTREE_INSERT_NOFAIL,
                                BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
        } while (ret == -EINTR);

        if (!ret) {
                /*
                 * the btree node lock protects inode->ei_inode, not
                 * ei_update_lock; this is important for inode updates via
                 * bchfs_write_index_update
                 */
                inode->ei_inode = inode_u;
                inode->ei_qid   = bch_qid(&inode_u);
        }
out:
        bch2_btree_iter_unlock(&iter);

        return ret < 0 ? ret : 0;
}

int __must_check bch2_write_inode(struct bch_fs *c,
                                  struct bch_inode_info *inode)
{
        return __bch2_write_inode(c, inode, NULL, NULL);
}

static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
        int ret;

        mutex_lock(&inode->ei_update_lock);
        inc_nlink(&inode->v);
        ret = bch2_write_inode(c, inode);
        mutex_unlock(&inode->ei_update_lock);

        return ret;
}

static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
        int ret = 0;

        mutex_lock(&inode->ei_update_lock);
        drop_nlink(&inode->v);
        ret = bch2_write_inode(c, inode);
        mutex_unlock(&inode->ei_update_lock);

        return ret;
}

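/*
 * Look up the VFS inode for @inum, reading and unpacking the on-disk inode
 * and initializing a new in-core inode if it wasn't already cached.
 */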
static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
{
        struct bch_inode_unpacked inode_u;
        struct bch_inode_info *inode;
        int ret;

        inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
        if (unlikely(!inode))
                return ERR_PTR(-ENOMEM);
        if (!(inode->v.i_state & I_NEW))
                return &inode->v;

        ret = bch2_inode_find_by_inum(c, inum, &inode_u);
        if (ret) {
                iget_failed(&inode->v);
                return ERR_PTR(ret);
        }

        bch2_vfs_inode_init(c, inode, &inode_u);

        inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);

        unlock_new_inode(&inode->v);

        return &inode->v;
}

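/*
 * Allocate and initialize a new inode: set up ownership and POSIX ACLs from
 * the parent directory, account the inode against quotas, and create the
 * on-disk inode before hashing the VFS inode.
 */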
static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
                                                    struct bch_inode_info *dir,
                                                    umode_t mode, dev_t rdev)
{
        struct posix_acl *default_acl = NULL, *acl = NULL;
        struct bch_inode_info *inode;
        struct bch_inode_unpacked inode_u;
        int ret;

        inode = to_bch_ei(new_inode(c->vfs_sb));
        if (unlikely(!inode))
                return ERR_PTR(-ENOMEM);

        inode_init_owner(&inode->v, &dir->v, mode);

#ifdef CONFIG_BCACHEFS_POSIX_ACL
        ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl);
        if (ret)
                goto err_make_bad;
#endif

        bch2_inode_init(c, &inode_u,
                        i_uid_read(&inode->v),
                        i_gid_read(&inode->v),
                        inode->v.i_mode, rdev,
                        &dir->ei_inode);

        inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ];

        ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
        if (ret)
                goto err_make_bad;

        ret = bch2_inode_create(c, &inode_u,
                                BLOCKDEV_INODE_MAX, 0,
                                &c->unused_inode_hint);
        if (unlikely(ret))
                goto err_acct_quota;

        bch2_vfs_inode_init(c, inode, &inode_u);
        atomic_long_inc(&c->nr_inodes);

        if (default_acl) {
                ret = __bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
                if (unlikely(ret))
                        goto err;
        }

        if (acl) {
                ret = __bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
                if (unlikely(ret))
                        goto err;
        }

        insert_inode_hash(&inode->v);
out:
        posix_acl_release(default_acl);
        posix_acl_release(acl);
        return inode;
err_acct_quota:
        bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
err_make_bad:
        /*
         * indicate to bch_evict_inode that the inode was never actually
         * created:
         */
        make_bad_inode(&inode->v);
err:
        clear_nlink(&inode->v);
        iput(&inode->v);
        inode = ERR_PTR(ret);
        goto out;
}

static int bch2_vfs_dirent_create(struct bch_fs *c,
                                  struct bch_inode_info *dir,
                                  u8 type, const struct qstr *name,
                                  u64 dst)
{
        int ret;

        ret = bch2_dirent_create(c, dir->v.i_ino, &dir->ei_str_hash,
                                type, name, dst,
                                &dir->ei_journal_seq,
                                BCH_HASH_SET_MUST_CREATE);
        if (unlikely(ret))
                return ret;

        dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
        mark_inode_dirty_sync(&dir->v);
        return 0;
}

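/*
 * Common helper for create/mkdir/mknod: allocate the new inode, then link it
 * into the parent directory with a dirent of the appropriate type.
 */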
static int __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
                         umode_t mode, dev_t rdev)
{
        struct bch_fs *c = dir->v.i_sb->s_fs_info;
        struct bch_inode_info *inode;
        int ret;

        inode = bch2_vfs_inode_create(c, dir, mode, rdev);
        if (unlikely(IS_ERR(inode)))
                return PTR_ERR(inode);

        ret = bch2_vfs_dirent_create(c, dir, mode_to_type(mode),
                                     &dentry->d_name, inode->v.i_ino);
        if (unlikely(ret)) {
                clear_nlink(&inode->v);
                iput(&inode->v);
                return ret;
        }

        if (dir->ei_journal_seq > inode->ei_journal_seq)
                inode->ei_journal_seq = dir->ei_journal_seq;

        d_instantiate(dentry, &inode->v);
        return 0;
}

/* methods */

static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
                                  unsigned int flags)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct inode *vinode = NULL;
        u64 inum;

        inum = bch2_dirent_lookup(c, dir->v.i_ino,
                                  &dir->ei_str_hash,
                                  &dentry->d_name);

        if (inum)
                vinode = bch2_vfs_inode_get(c, inum);

        return d_splice_alias(vinode, dentry);
}

static int bch2_create(struct inode *vdir, struct dentry *dentry,
                       umode_t mode, bool excl)
{
        return __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0);
}

static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
                     struct dentry *dentry)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
        int ret;

        lockdep_assert_held(&inode->v.i_rwsem);

        inode->v.i_ctime = current_time(&dir->v);

        ret = bch2_inc_nlink(c, inode);
        if (ret)
                return ret;

        ihold(&inode->v);

        ret = bch2_vfs_dirent_create(c, dir, mode_to_type(inode->v.i_mode),
                                     &dentry->d_name, inode->v.i_ino);
        if (unlikely(ret)) {
                bch2_dec_nlink(c, inode);
                iput(&inode->v);
                return ret;
        }

        d_instantiate(dentry, &inode->v);
        return 0;
}

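/*
 * Remove a directory entry and drop the target's link count; for directories,
 * also drop the extra links held for "." and the parent's "..".
 */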
static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        int ret;

        lockdep_assert_held(&inode->v.i_rwsem);

        ret = bch2_dirent_delete(c, dir->v.i_ino, &dir->ei_str_hash,
                                 &dentry->d_name, &dir->ei_journal_seq);
        if (ret)
                return ret;

        if (dir->ei_journal_seq > inode->ei_journal_seq)
                inode->ei_journal_seq = dir->ei_journal_seq;

        inode->v.i_ctime = dir->v.i_ctime;

        if (S_ISDIR(inode->v.i_mode)) {
                bch2_dec_nlink(c, dir);
                drop_nlink(&inode->v);
        }

        bch2_dec_nlink(c, inode);

        return 0;
}

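/*
 * Create a symlink: the target is written through the page cache and flushed
 * to disk before the dirent is created, so the link is never visible before
 * its contents.
 */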
static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
                        const char *symname)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
        int ret;

        inode = bch2_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
        if (unlikely(IS_ERR(inode)))
                return PTR_ERR(inode);

        inode_lock(&inode->v);
        ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
        inode_unlock(&inode->v);

        if (unlikely(ret))
                goto err;

        ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
        if (unlikely(ret))
                goto err;

        /* XXX: racy */
        if (dir->ei_journal_seq < inode->ei_journal_seq)
                dir->ei_journal_seq = inode->ei_journal_seq;

        ret = bch2_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name,
                                     inode->v.i_ino);
        if (unlikely(ret))
                goto err;

        d_instantiate(dentry, &inode->v);
        return 0;
err:
        clear_nlink(&inode->v);
        iput(&inode->v);
        return ret;
}

static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        int ret;

        lockdep_assert_held(&dir->v.i_rwsem);

        ret = __bch2_create(dir, dentry, mode|S_IFDIR, 0);
        if (unlikely(ret))
                return ret;

        bch2_inc_nlink(c, dir);

        return 0;
}

static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;

        if (bch2_empty_dir(c, dentry->d_inode->i_ino))
                return -ENOTEMPTY;

        return bch2_unlink(vdir, dentry);
}

static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
                      umode_t mode, dev_t rdev)
{
        return __bch2_create(to_bch_ei(vdir), dentry, mode, rdev);
}

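/*
 * Rename without RENAME_EXCHANGE. Four cases are handled: overwriting a
 * directory, overwriting a non-directory, moving a directory to a new name,
 * and moving a non-directory; link counts on the parent directories are
 * adjusted accordingly.
 */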
static int bch2_rename(struct bch_fs *c,
                       struct bch_inode_info *old_dir,
                       struct dentry *old_dentry,
                       struct bch_inode_info *new_dir,
                       struct dentry *new_dentry)
{
        struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
        struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
        struct timespec now = current_time(&old_dir->v);
        int ret;

        lockdep_assert_held(&old_dir->v.i_rwsem);
        lockdep_assert_held(&new_dir->v.i_rwsem);

        if (new_inode)
                filemap_write_and_wait_range(old_inode->v.i_mapping,
                                             0, LLONG_MAX);

        if (new_inode && S_ISDIR(old_inode->v.i_mode)) {
                lockdep_assert_held(&new_inode->v.i_rwsem);

                if (!S_ISDIR(new_inode->v.i_mode))
                        return -ENOTDIR;

                if (bch2_empty_dir(c, new_inode->v.i_ino))
                        return -ENOTEMPTY;

                ret = bch2_dirent_rename(c,
                                old_dir, &old_dentry->d_name,
                                new_dir, &new_dentry->d_name,
                                &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
                if (unlikely(ret))
                        return ret;

                clear_nlink(&new_inode->v);
                bch2_dec_nlink(c, old_dir);
        } else if (new_inode) {
                lockdep_assert_held(&new_inode->v.i_rwsem);

                ret = bch2_dirent_rename(c,
                                old_dir, &old_dentry->d_name,
                                new_dir, &new_dentry->d_name,
                                &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
                if (unlikely(ret))
                        return ret;

                new_inode->v.i_ctime = now;
                bch2_dec_nlink(c, new_inode);
        } else if (S_ISDIR(old_inode->v.i_mode)) {
                ret = bch2_dirent_rename(c,
                                old_dir, &old_dentry->d_name,
                                new_dir, &new_dentry->d_name,
                                &old_inode->ei_journal_seq, BCH_RENAME);
                if (unlikely(ret))
                        return ret;

                bch2_inc_nlink(c, new_dir);
                bch2_dec_nlink(c, old_dir);
        } else {
                ret = bch2_dirent_rename(c,
                                old_dir, &old_dentry->d_name,
                                new_dir, &new_dentry->d_name,
                                &old_inode->ei_journal_seq, BCH_RENAME);
                if (unlikely(ret))
                        return ret;
        }

        old_dir->v.i_ctime = old_dir->v.i_mtime = now;
        new_dir->v.i_ctime = new_dir->v.i_mtime = now;
        mark_inode_dirty_sync(&old_dir->v);
        mark_inode_dirty_sync(&new_dir->v);

        old_inode->v.i_ctime = now;
        mark_inode_dirty_sync(&old_inode->v);

        return 0;
}

static int bch2_rename_exchange(struct bch_fs *c,
                                struct bch_inode_info *old_dir,
                                struct dentry *old_dentry,
                                struct bch_inode_info *new_dir,
                                struct dentry *new_dentry)
{
        struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
        struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
        struct timespec now = current_time(&old_dir->v);
        int ret;

        ret = bch2_dirent_rename(c,
                                 old_dir, &old_dentry->d_name,
                                 new_dir, &new_dentry->d_name,
                                 &old_inode->ei_journal_seq, BCH_RENAME_EXCHANGE);
        if (unlikely(ret))
                return ret;

        if (S_ISDIR(old_inode->v.i_mode) !=
            S_ISDIR(new_inode->v.i_mode)) {
                if (S_ISDIR(old_inode->v.i_mode)) {
                        bch2_inc_nlink(c, new_dir);
                        bch2_dec_nlink(c, old_dir);
                } else {
                        bch2_dec_nlink(c, new_dir);
                        bch2_inc_nlink(c, old_dir);
                }
        }

        old_dir->v.i_ctime = old_dir->v.i_mtime = now;
        new_dir->v.i_ctime = new_dir->v.i_mtime = now;
        mark_inode_dirty_sync(&old_dir->v);
        mark_inode_dirty_sync(&new_dir->v);

        old_inode->v.i_ctime = now;
        new_inode->v.i_ctime = now;
        mark_inode_dirty_sync(&old_inode->v);
        mark_inode_dirty_sync(&new_inode->v);

        return 0;
}

static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry,
                        struct inode *new_vdir, struct dentry *new_dentry,
                        unsigned flags)
{
        struct bch_fs *c = old_vdir->i_sb->s_fs_info;
        struct bch_inode_info *old_dir = to_bch_ei(old_vdir);
        struct bch_inode_info *new_dir = to_bch_ei(new_vdir);

        if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
                return -EINVAL;

        if (flags & RENAME_EXCHANGE)
                return bch2_rename_exchange(c, old_dir, old_dentry,
                                            new_dir, new_dentry);

        return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry);
}

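/*
 * Attribute changes other than size: transfer quota usage if the uid or gid
 * is changing, copy the new attributes into the VFS inode and write the
 * inode back; ACLs are updated afterwards if the mode changed.
 */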
static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_qid qid = inode->ei_qid;
        unsigned qtypes = 0;
        int ret;

        mutex_lock(&inode->ei_update_lock);

        if (c->opts.usrquota &&
            (iattr->ia_valid & ATTR_UID) &&
            !uid_eq(iattr->ia_uid, inode->v.i_uid)) {
                qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid);
                qtypes |= 1 << QTYP_USR;
        }

        if (c->opts.grpquota &&
            (iattr->ia_valid & ATTR_GID) &&
            !gid_eq(iattr->ia_gid, inode->v.i_gid)) {
                qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid);
                qtypes |= 1 << QTYP_GRP;
        }

        if (qtypes) {
                ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid,
                                          inode->v.i_blocks +
                                          inode->ei_quota_reserved);
                if (ret)
                        goto out_unlock;
        }

        setattr_copy(&inode->v, iattr);

        ret = bch2_write_inode(c, inode);
out_unlock:
        mutex_unlock(&inode->ei_update_lock);

        if (!ret &&
            iattr->ia_valid & ATTR_MODE)
                ret = posix_acl_chmod(&inode->v, inode->v.i_mode);

        return ret;
}

static int bch2_getattr(const struct path *path, struct kstat *stat,
                        u32 request_mask, unsigned query_flags)
{
        struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
        struct bch_fs *c = inode->v.i_sb->s_fs_info;

        stat->dev       = inode->v.i_sb->s_dev;
        stat->ino       = inode->v.i_ino;
        stat->mode      = inode->v.i_mode;
        stat->nlink     = inode->v.i_nlink;
        stat->uid       = inode->v.i_uid;
        stat->gid       = inode->v.i_gid;
        stat->rdev      = inode->v.i_rdev;
        stat->size      = i_size_read(&inode->v);
        stat->atime     = inode->v.i_atime;
        stat->mtime     = inode->v.i_mtime;
        stat->ctime     = inode->v.i_ctime;
        stat->blksize   = block_bytes(c);
        stat->blocks    = inode->v.i_blocks;

        if (request_mask & STATX_BTIME) {
                stat->result_mask |= STATX_BTIME;
                stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
        }

        if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
                stat->attributes |= STATX_ATTR_APPEND;
        if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
                stat->attributes |= STATX_ATTR_NODUMP;

        return 0;
}

static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        int ret;

        lockdep_assert_held(&inode->v.i_rwsem);

        ret = setattr_prepare(dentry, iattr);
        if (ret)
                return ret;

        return iattr->ia_valid & ATTR_SIZE
                ? bch2_truncate(inode, iattr)
                : bch2_setattr_nonsize(inode, iattr);
}

static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode;

        /* XXX: i_nlink should be 0? */
        inode = bch2_vfs_inode_create(c, dir, mode, 0);
        if (unlikely(IS_ERR(inode)))
                return PTR_ERR(inode);

        d_tmpfile(dentry, &inode->v);
        return 0;
}

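/*
 * Translate one bcachefs extent (or reservation) key into fiemap extents,
 * one per pointer, setting ENCODED for compressed data and NOT_ALIGNED when
 * the extent isn't page aligned.
 */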
static int bch2_fill_extent(struct fiemap_extent_info *info,
                            const struct bkey_i *k, unsigned flags)
{
        if (bkey_extent_is_data(&k->k)) {
                struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
                const struct bch_extent_ptr *ptr;
                struct bch_extent_crc_unpacked crc;
                int ret;

                extent_for_each_ptr_crc(e, ptr, crc) {
                        int flags2 = 0;
                        u64 offset = ptr->offset;

                        if (crc.compression_type)
                                flags2 |= FIEMAP_EXTENT_ENCODED;
                        else
                                offset += crc.offset;

                        if ((offset & (PAGE_SECTORS - 1)) ||
                            (e.k->size & (PAGE_SECTORS - 1)))
                                flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;

                        ret = fiemap_fill_next_extent(info,
                                                      bkey_start_offset(e.k) << 9,
                                                      offset << 9,
                                                      e.k->size << 9, flags|flags2);
                        if (ret)
                                return ret;
                }

                return 0;
        } else if (k->k.type == BCH_RESERVATION) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(&k->k) << 9,
                                               0, k->k.size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DELALLOC|
                                               FIEMAP_EXTENT_UNWRITTEN);
        } else {
                BUG();
        }
}

static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
                       u64 start, u64 len)
{
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *ei = to_bch_ei(vinode);
        struct btree_iter iter;
        struct bkey_s_c k;
        BKEY_PADDED(k) tmp;
        bool have_extent = false;
        int ret = 0;

        if (start + len < start)
                return -EINVAL;

        for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
                           POS(ei->v.i_ino, start >> 9), 0, k)
                if (bkey_extent_is_data(k.k) ||
                    k.k->type == BCH_RESERVATION) {
                        if (bkey_cmp(bkey_start_pos(k.k),
                                     POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
                                break;

                        if (have_extent) {
                                ret = bch2_fill_extent(info, &tmp.k, 0);
                                if (ret)
                                        goto out;
                        }

                        bkey_reassemble(&tmp.k, k);
                        have_extent = true;
                }

        if (have_extent)
                ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
out:
        bch2_btree_iter_unlock(&iter);
        return ret < 0 ? ret : 0;
}

static const struct vm_operations_struct bch_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = bch2_page_mkwrite,
};

static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
{
        file_accessed(file);

        vma->vm_ops = &bch_vm_ops;
        return 0;
}

/* Directories: */

static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
        return generic_file_llseek_size(file, offset, whence,
                                        S64_MAX, S64_MAX);
}

static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
        struct bch_fs *c = file_inode(file)->i_sb->s_fs_info;

        return bch2_readdir(c, file, ctx);
}

static const struct file_operations bch_file_operations = {
        .llseek         = bch2_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = bch2_write_iter,
        .mmap           = bch2_mmap,
        .open           = generic_file_open,
        .fsync          = bch2_fsync,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = bch2_fallocate_dispatch,
        .unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = bch2_compat_fs_ioctl,
#endif
};

static const struct inode_operations bch_file_inode_operations = {
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .fiemap         = bch2_fiemap,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_acl        = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct inode_operations bch_dir_inode_operations = {
        .lookup         = bch2_lookup,
        .create         = bch2_create,
        .link           = bch2_link,
        .unlink         = bch2_unlink,
        .symlink        = bch2_symlink,
        .mkdir          = bch2_mkdir,
        .rmdir          = bch2_rmdir,
        .mknod          = bch2_mknod,
        .rename         = bch2_rename2,
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .tmpfile        = bch2_tmpfile,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_acl        = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct file_operations bch_dir_file_operations = {
        .llseek         = bch2_dir_llseek,
        .read           = generic_read_dir,
        .iterate        = bch2_vfs_readdir,
        .fsync          = bch2_fsync,
        .unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = bch2_compat_fs_ioctl,
#endif
};

static const struct inode_operations bch_symlink_inode_operations = {
        .get_link       = page_get_link,
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_acl        = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct inode_operations bch_special_inode_operations = {
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_acl        = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct address_space_operations bch_address_space_operations = {
        .writepage      = bch2_writepage,
        .readpage       = bch2_readpage,
        .writepages     = bch2_writepages,
        .readpages      = bch2_readpages,
        .set_page_dirty = bch2_set_page_dirty,
        .write_begin    = bch2_write_begin,
        .write_end      = bch2_write_end,
        .invalidatepage = bch2_invalidatepage,
        .releasepage    = bch2_releasepage,
        .direct_IO      = bch2_direct_IO,
#ifdef CONFIG_MIGRATION
        .migratepage    = bch2_migrate_page,
#endif
        .error_remove_page = generic_error_remove_page,
};

static struct inode *bch2_nfs_get_inode(struct super_block *sb,
                u64 ino, u32 generation)
{
        struct bch_fs *c = sb->s_fs_info;
        struct inode *vinode;

        if (ino < BCACHEFS_ROOT_INO)
                return ERR_PTR(-ESTALE);

        vinode = bch2_vfs_inode_get(c, ino);
        if (IS_ERR(vinode))
                return ERR_CAST(vinode);
        if (generation && vinode->i_generation != generation) {
                /* we didn't find the right inode.. */
                iput(vinode);
                return ERR_PTR(-ESTALE);
        }
        return vinode;
}

static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type)
{
        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
                                    bch2_nfs_get_inode);
}

static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type)
{
        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                    bch2_nfs_get_inode);
}

static const struct export_operations bch_export_ops = {
        .fh_to_dentry   = bch2_fh_to_dentry,
        .fh_to_parent   = bch2_fh_to_parent,
        //.get_parent   = bch2_get_parent,
};

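/*
 * Populate a VFS inode from an unpacked bcachefs inode: copy mode, ownership,
 * size, timestamps and flags, then pick the inode/file/address_space
 * operations based on the file type.
 */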
static void bch2_vfs_inode_init(struct bch_fs *c,
                                struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi)
{
        inode->v.i_mode         = bi->bi_mode;
        i_uid_write(&inode->v, bi->bi_uid);
        i_gid_write(&inode->v, bi->bi_gid);
        inode->v.i_blocks       = bi->bi_sectors;
        inode->v.i_ino          = bi->bi_inum;
        set_nlink(&inode->v, bi->bi_nlink + nlink_bias(inode->v.i_mode));
        inode->v.i_rdev         = bi->bi_dev;
        inode->v.i_generation   = bi->bi_generation;
        inode->v.i_size         = bi->bi_size;
        inode->v.i_atime        = bch2_time_to_timespec(c, bi->bi_atime);
        inode->v.i_mtime        = bch2_time_to_timespec(c, bi->bi_mtime);
        inode->v.i_ctime        = bch2_time_to_timespec(c, bi->bi_ctime);

        inode->ei_journal_seq   = 0;
        inode->ei_quota_reserved = 0;
        inode->ei_qid           = bch_qid(bi);
        inode->ei_str_hash      = bch2_hash_info_init(c, bi);
        inode->ei_inode         = *bi;

        bch2_inode_flags_to_vfs(inode);

        inode->v.i_mapping->a_ops = &bch_address_space_operations;

        switch (inode->v.i_mode & S_IFMT) {
        case S_IFREG:
                inode->v.i_op   = &bch_file_inode_operations;
                inode->v.i_fop  = &bch_file_operations;
                break;
        case S_IFDIR:
                inode->v.i_op   = &bch_dir_inode_operations;
                inode->v.i_fop  = &bch_dir_file_operations;
                break;
        case S_IFLNK:
                inode_nohighmem(&inode->v);
                inode->v.i_op   = &bch_symlink_inode_operations;
                break;
        default:
                init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
                inode->v.i_op   = &bch_special_inode_operations;
                break;
        }
}

static struct inode *bch2_alloc_inode(struct super_block *sb)
{
        struct bch_inode_info *inode;

        inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
        if (!inode)
                return NULL;

        inode_init_once(&inode->v);
        mutex_init(&inode->ei_update_lock);
        mutex_init(&inode->ei_quota_lock);
        inode->ei_journal_seq = 0;

        return &inode->v;
}

static void bch2_i_callback(struct rcu_head *head)
{
        struct inode *vinode = container_of(head, struct inode, i_rcu);
        struct bch_inode_info *inode = to_bch_ei(vinode);

        kmem_cache_free(bch2_inode_cache, inode);
}

static void bch2_destroy_inode(struct inode *vinode)
{
        call_rcu(&vinode->i_rcu, bch2_i_callback);
}

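/*
 * ->write_inode: flush the in-core inode back to the btree; for WB_SYNC_ALL
 * writeback also flush the journal up to the inode's last journal sequence
 * number, unless journal flushing is disabled.
 */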
static int bch2_vfs_write_inode(struct inode *vinode,
                                struct writeback_control *wbc)
{
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *inode = to_bch_ei(vinode);
        int ret;

        mutex_lock(&inode->ei_update_lock);
        ret = bch2_write_inode(c, inode);
        mutex_unlock(&inode->ei_update_lock);

        if (c->opts.journal_flush_disabled)
                return ret;

        if (!ret && wbc->sync_mode == WB_SYNC_ALL)
                ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);

        return ret;
}

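/*
 * ->evict_inode: drop the page cache; if the inode has no links left (and
 * was actually created), release its quota and delete it from the btree.
 */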
static void bch2_evict_inode(struct inode *vinode)
{
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *inode = to_bch_ei(vinode);

        truncate_inode_pages_final(&inode->v.i_data);

        clear_inode(&inode->v);

        BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

        if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
                bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
                                BCH_QUOTA_WARN);
                bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                BCH_QUOTA_WARN);
                bch2_inode_rm(c, inode->v.i_ino);
                atomic_long_dec(&c->nr_inodes);
        }
}

static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        struct bch_fs *c = sb->s_fs_info;
        u64 fsid;

        buf->f_type     = BCACHEFS_STATFS_MAGIC;
        buf->f_bsize    = sb->s_blocksize;
        buf->f_blocks   = c->capacity >> PAGE_SECTOR_SHIFT;
        buf->f_bfree    = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >>
                           PAGE_SECTOR_SHIFT;
        buf->f_bavail   = buf->f_bfree;
        buf->f_files    = atomic_long_read(&c->nr_inodes);
        buf->f_ffree    = U64_MAX;

        fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
               le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
        buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
        buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
        buf->f_namelen  = BCH_NAME_MAX;

        return 0;
}

static int bch2_sync_fs(struct super_block *sb, int wait)
{
        struct bch_fs *c = sb->s_fs_info;

        if (!wait) {
                bch2_journal_flush_async(&c->journal, NULL);
                return 0;
        }

        return bch2_journal_flush(&c->journal);
}

static struct bch_fs *bch2_path_to_fs(const char *dev)
{
        struct bch_fs *c;
        struct block_device *bdev = lookup_bdev(dev);

        if (IS_ERR(bdev))
                return ERR_CAST(bdev);

        c = bch2_bdev_to_fs(bdev);
        bdput(bdev);
        return c ?: ERR_PTR(-ENOENT);
}

static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
                                               unsigned nr_devs, struct bch_opts opts)
{
        struct bch_fs *c, *c1, *c2;
        size_t i;

        if (!nr_devs)
                return ERR_PTR(-EINVAL);

        c = bch2_fs_open(devs, nr_devs, opts);

        if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
                /*
                 * Already open?
                 * Look up each block device, make sure they all belong to a
                 * filesystem and they all belong to the _same_ filesystem
                 */

                c1 = bch2_path_to_fs(devs[0]);
                if (!c1)
                        return c;

                for (i = 1; i < nr_devs; i++) {
                        c2 = bch2_path_to_fs(devs[i]);
                        if (!IS_ERR(c2))
                                closure_put(&c2->cl);

                        if (c1 != c2) {
                                closure_put(&c1->cl);
                                return c;
                        }
                }

                c = c1;
        }

        if (IS_ERR(c))
                return c;

        mutex_lock(&c->state_lock);

        if (!bch2_fs_running(c)) {
                mutex_unlock(&c->state_lock);
                closure_put(&c->cl);
                pr_err("err mounting %s: incomplete filesystem", dev_name);
                return ERR_PTR(-EINVAL);
        }

        mutex_unlock(&c->state_lock);

        set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
        return c;
}

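/*
 * Split the colon-separated device list given at mount time into individual
 * paths, then open them (or find the already-open filesystem they belong to).
 */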
static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
                                             struct bch_opts opts)
{
        char *dev_name = NULL, **devs = NULL, *s;
        struct bch_fs *c = ERR_PTR(-ENOMEM);
        size_t i, nr_devs = 0;

        dev_name = kstrdup(_dev_name, GFP_KERNEL);
        if (!dev_name)
                goto err;

        for (s = dev_name; s; s = strchr(s + 1, ':'))
                nr_devs++;

        devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
        if (!devs)
                goto err;

        for (i = 0, s = dev_name;
             s;
             (s = strchr(s, ':')) && (*s++ = '\0'))
                devs[i++] = s;

        c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
err:
        kfree(devs);
        kfree(dev_name);
        return c;
}

static int bch2_remount(struct super_block *sb, int *flags, char *data)
{
        struct bch_fs *c = sb->s_fs_info;
        struct bch_opts opts = bch2_opts_empty();
        int ret;

        opt_set(opts, read_only, (*flags & MS_RDONLY) != 0);

        ret = bch2_parse_mount_opts(&opts, data);
        if (ret)
                return ret;

        if (opts.read_only != c->opts.read_only) {
                const char *err = NULL;

                mutex_lock(&c->state_lock);

                if (opts.read_only) {
                        bch2_fs_read_only(c);

                        sb->s_flags |= MS_RDONLY;
                } else {
                        err = bch2_fs_read_write(c);
                        if (err) {
                                bch_err(c, "error going rw: %s", err);
                                mutex_unlock(&c->state_lock);
                                return -EINVAL;
                        }

                        sb->s_flags &= ~MS_RDONLY;
                }

                c->opts.read_only = opts.read_only;

                mutex_unlock(&c->state_lock);
        }

        if (opts.errors >= 0)
                c->opts.errors = opts.errors;

        return ret;
}

static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
        struct bch_fs *c = root->d_sb->s_fs_info;
        enum bch_opt_id i;
        char buf[512];

        for (i = 0; i < bch2_opts_nr; i++) {
                const struct bch_option *opt = &bch2_opt_table[i];
                u64 v = bch2_opt_get_by_id(&c->opts, i);

                if (opt->mode < OPT_MOUNT)
                        continue;

                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
                        continue;

                bch2_opt_to_text(c, buf, sizeof(buf), opt, v,
                                 OPT_SHOW_MOUNT_STYLE);
                seq_putc(seq, ',');
                seq_puts(seq, buf);
        }

        return 0;
}

static const struct super_operations bch_super_operations = {
        .alloc_inode    = bch2_alloc_inode,
        .destroy_inode  = bch2_destroy_inode,
        .write_inode    = bch2_vfs_write_inode,
        .evict_inode    = bch2_evict_inode,
        .sync_fs        = bch2_sync_fs,
        .statfs         = bch2_statfs,
        .show_options   = bch2_show_options,
        .remount_fs     = bch2_remount,
#if 0
        .put_super      = bch2_put_super,
        .freeze_fs      = bch2_freeze,
        .unfreeze_fs    = bch2_unfreeze,
#endif
};

static int bch2_test_super(struct super_block *s, void *data)
{
        return s->s_fs_info == data;
}

static int bch2_set_super(struct super_block *s, void *data)
{
        s->s_fs_info = data;
        return 0;
}

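/*
 * Mount entry point: parse options, open the (possibly multi-device)
 * filesystem, set up the superblock and bdi, and instantiate the root inode.
 */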
static struct dentry *bch2_mount(struct file_system_type *fs_type,
                                 int flags, const char *dev_name, void *data)
{
        struct bch_fs *c;
        struct bch_dev *ca;
        struct super_block *sb;
        struct inode *vinode;
        struct bch_opts opts = bch2_opts_empty();
        unsigned i;
        int ret;

        opt_set(opts, read_only, (flags & MS_RDONLY) != 0);

        ret = bch2_parse_mount_opts(&opts, data);
        if (ret)
                return ERR_PTR(ret);

        c = bch2_open_as_blockdevs(dev_name, opts);
        if (IS_ERR(c))
                return ERR_CAST(c);

        sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c);
        if (IS_ERR(sb)) {
                closure_put(&c->cl);
                return ERR_CAST(sb);
        }

        BUG_ON(sb->s_fs_info != c);

        if (sb->s_root) {
                closure_put(&c->cl);

                if ((flags ^ sb->s_flags) & MS_RDONLY) {
                        ret = -EBUSY;
                        goto err_put_super;
                }
                goto out;
        }

        /* XXX: blocksize */
        sb->s_blocksize         = PAGE_SIZE;
        sb->s_blocksize_bits    = PAGE_SHIFT;
        sb->s_maxbytes          = MAX_LFS_FILESIZE;
        sb->s_op                = &bch_super_operations;
        sb->s_export_op         = &bch_export_ops;
#ifdef CONFIG_BCACHEFS_QUOTA
        sb->s_qcop              = &bch2_quotactl_operations;
        sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
#endif
        sb->s_xattr             = bch2_xattr_handlers;
        sb->s_magic             = BCACHEFS_STATFS_MAGIC;
        sb->s_time_gran         = c->sb.time_precision;
        c->vfs_sb               = sb;
        strlcpy(sb->s_id, c->name, sizeof(sb->s_id));

        ret = super_setup_bdi(sb);
        if (ret)
                goto err_put_super;

        sb->s_bdi->congested_fn         = bch2_congested;
        sb->s_bdi->congested_data       = c;
        sb->s_bdi->ra_pages             = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;

        for_each_online_member(ca, c, i) {
                struct block_device *bdev = ca->disk_sb.bdev;

                /* XXX: create an anonymous device for multi device filesystems */
                sb->s_bdev      = bdev;
                sb->s_dev       = bdev->bd_dev;
                percpu_ref_put(&ca->io_ref);
                break;
        }

#ifdef CONFIG_BCACHEFS_POSIX_ACL
        if (c->opts.acl)
                sb->s_flags     |= MS_POSIXACL;
#endif

        vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
        if (IS_ERR(vinode)) {
                ret = PTR_ERR(vinode);
                goto err_put_super;
        }

        sb->s_root = d_make_root(vinode);
        if (!sb->s_root) {
                ret = -ENOMEM;
                goto err_put_super;
        }

        sb->s_flags |= MS_ACTIVE;
out:
        return dget(sb->s_root);

err_put_super:
        deactivate_locked_super(sb);
        return ERR_PTR(ret);
}

static void bch2_kill_sb(struct super_block *sb)
{
        struct bch_fs *c = sb->s_fs_info;

        generic_shutdown_super(sb);

        if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
                bch2_fs_stop(c);
        else
                closure_put(&c->cl);
}

static struct file_system_type bcache_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "bcachefs",
        .mount          = bch2_mount,
        .kill_sb        = bch2_kill_sb,
        .fs_flags       = FS_REQUIRES_DEV,
};

MODULE_ALIAS_FS("bcachefs");

void bch2_vfs_exit(void)
{
        unregister_filesystem(&bcache_fs_type);
        if (bch2_inode_cache)
                kmem_cache_destroy(bch2_inode_cache);
}

int __init bch2_vfs_init(void)
{
        int ret = -ENOMEM;

        bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
        if (!bch2_inode_cache)
                goto err;

        ret = register_filesystem(&bcache_fs_type);
        if (ret)
                goto err;

        return 0;
err:
        bch2_vfs_exit();
        return ret;
}

#endif /* NO_BCACHEFS_FS */