/*
 * Source: git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/inode.c
 * Commit: Update bcachefs sources to b9bd69421f73 bcachefs: x-macro-ify inode flags enum
 * Path:   [bcachefs-tools-debian] / libbcachefs / inode.c
 */
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_write_buffer.h"
6 #include "bkey_methods.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "compress.h"
10 #include "error.h"
11 #include "extents.h"
12 #include "extent_update.h"
13 #include "inode.h"
14 #include "str_hash.h"
15 #include "snapshot.h"
16 #include "subvolume.h"
17 #include "varint.h"
18
19 #include <linux/random.h>
20
21 #include <asm/unaligned.h>
22
23 #define x(name, ...)    #name,
24 const char * const bch2_inode_opts[] = {
25         BCH_INODE_OPTS()
26         NULL,
27 };
28
29 static const char * const bch2_inode_flag_strs[] = {
30         BCH_INODE_FLAGS()
31         NULL
32 };
33 #undef  x
34
35 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
36
37 static int inode_decode_field(const u8 *in, const u8 *end,
38                               u64 out[2], unsigned *out_bits)
39 {
40         __be64 be[2] = { 0, 0 };
41         unsigned bytes, shift;
42         u8 *p;
43
44         if (in >= end)
45                 return -1;
46
47         if (!*in)
48                 return -1;
49
50         /*
51          * position of highest set bit indicates number of bytes:
52          * shift = number of bits to remove in high byte:
53          */
54         shift   = 8 - __fls(*in); /* 1 <= shift <= 8 */
55         bytes   = byte_table[shift - 1];
56
57         if (in + bytes > end)
58                 return -1;
59
60         p = (u8 *) be + 16 - bytes;
61         memcpy(p, in, bytes);
62         *p ^= (1 << 8) >> shift;
63
64         out[0] = be64_to_cpu(be[0]);
65         out[1] = be64_to_cpu(be[1]);
66         *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
67
68         return bytes;
69 }
70
71 static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
72                                            const struct bch_inode_unpacked *inode)
73 {
74         struct bkey_i_inode_v3 *k = &packed->inode;
75         u8 *out = k->v.fields;
76         u8 *end = (void *) &packed[1];
77         u8 *last_nonzero_field = out;
78         unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
79         unsigned bytes;
80         int ret;
81
82         bkey_inode_v3_init(&packed->inode.k_i);
83         packed->inode.k.p.offset        = inode->bi_inum;
84         packed->inode.v.bi_journal_seq  = cpu_to_le64(inode->bi_journal_seq);
85         packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
86         packed->inode.v.bi_flags        = cpu_to_le64(inode->bi_flags);
87         packed->inode.v.bi_sectors      = cpu_to_le64(inode->bi_sectors);
88         packed->inode.v.bi_size         = cpu_to_le64(inode->bi_size);
89         packed->inode.v.bi_version      = cpu_to_le64(inode->bi_version);
90         SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
91         SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
92
93
94 #define x(_name, _bits)                                                 \
95         nr_fields++;                                                    \
96                                                                         \
97         if (inode->_name) {                                             \
98                 ret = bch2_varint_encode_fast(out, inode->_name);       \
99                 out += ret;                                             \
100                                                                         \
101                 if (_bits > 64)                                         \
102                         *out++ = 0;                                     \
103                                                                         \
104                 last_nonzero_field = out;                               \
105                 last_nonzero_fieldnr = nr_fields;                       \
106         } else {                                                        \
107                 *out++ = 0;                                             \
108                                                                         \
109                 if (_bits > 64)                                         \
110                         *out++ = 0;                                     \
111         }
112
113         BCH_INODE_FIELDS_v3()
114 #undef  x
115         BUG_ON(out > end);
116
117         out = last_nonzero_field;
118         nr_fields = last_nonzero_fieldnr;
119
120         bytes = out - (u8 *) &packed->inode.v;
121         set_bkey_val_bytes(&packed->inode.k, bytes);
122         memset_u64s_tail(&packed->inode.v, 0, bytes);
123
124         SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
125
126         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
127                 struct bch_inode_unpacked unpacked;
128
129                 ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
130                 BUG_ON(ret);
131                 BUG_ON(unpacked.bi_inum         != inode->bi_inum);
132                 BUG_ON(unpacked.bi_hash_seed    != inode->bi_hash_seed);
133                 BUG_ON(unpacked.bi_sectors      != inode->bi_sectors);
134                 BUG_ON(unpacked.bi_size         != inode->bi_size);
135                 BUG_ON(unpacked.bi_version      != inode->bi_version);
136                 BUG_ON(unpacked.bi_mode         != inode->bi_mode);
137
138 #define x(_name, _bits) if (unpacked._name != inode->_name)             \
139                         panic("unpacked %llu should be %llu",           \
140                               (u64) unpacked._name, (u64) inode->_name);
141                 BCH_INODE_FIELDS_v3()
142 #undef  x
143         }
144 }
145
/* Out-of-line entry point for bch2_inode_pack_inlined(). */
void bch2_inode_pack(struct bkey_inode_buf *packed, const struct bch_inode_unpacked *inode)
{
	bch2_inode_pack_inlined(packed, inode);
}
151
152 static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
153                                 struct bch_inode_unpacked *unpacked)
154 {
155         const u8 *in = inode.v->fields;
156         const u8 *end = bkey_val_end(inode);
157         u64 field[2];
158         unsigned fieldnr = 0, field_bits;
159         int ret;
160
161 #define x(_name, _bits)                                 \
162         if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {                    \
163                 unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
164                 memset((void *) unpacked + offset, 0,                   \
165                        sizeof(*unpacked) - offset);                     \
166                 return 0;                                               \
167         }                                                               \
168                                                                         \
169         ret = inode_decode_field(in, end, field, &field_bits);          \
170         if (ret < 0)                                                    \
171                 return ret;                                             \
172                                                                         \
173         if (field_bits > sizeof(unpacked->_name) * 8)                   \
174                 return -1;                                              \
175                                                                         \
176         unpacked->_name = field[1];                                     \
177         in += ret;
178
179         BCH_INODE_FIELDS_v2()
180 #undef  x
181
182         /* XXX: signal if there were more fields than expected? */
183         return 0;
184 }
185
186 static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
187                                 const u8 *in, const u8 *end,
188                                 unsigned nr_fields)
189 {
190         unsigned fieldnr = 0;
191         int ret;
192         u64 v[2];
193
194 #define x(_name, _bits)                                                 \
195         if (fieldnr < nr_fields) {                                      \
196                 ret = bch2_varint_decode_fast(in, end, &v[0]);          \
197                 if (ret < 0)                                            \
198                         return ret;                                     \
199                 in += ret;                                              \
200                                                                         \
201                 if (_bits > 64) {                                       \
202                         ret = bch2_varint_decode_fast(in, end, &v[1]);  \
203                         if (ret < 0)                                    \
204                                 return ret;                             \
205                         in += ret;                                      \
206                 } else {                                                \
207                         v[1] = 0;                                       \
208                 }                                                       \
209         } else {                                                        \
210                 v[0] = v[1] = 0;                                        \
211         }                                                               \
212                                                                         \
213         unpacked->_name = v[0];                                         \
214         if (v[1] || v[0] != unpacked->_name)                            \
215                 return -1;                                              \
216         fieldnr++;
217
218         BCH_INODE_FIELDS_v2()
219 #undef  x
220
221         /* XXX: signal if there were more fields than expected? */
222         return 0;
223 }
224
225 static int bch2_inode_unpack_v3(struct bkey_s_c k,
226                                 struct bch_inode_unpacked *unpacked)
227 {
228         struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
229         const u8 *in = inode.v->fields;
230         const u8 *end = bkey_val_end(inode);
231         unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
232         unsigned fieldnr = 0;
233         int ret;
234         u64 v[2];
235
236         unpacked->bi_inum       = inode.k->p.offset;
237         unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
238         unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
239         unpacked->bi_flags      = le64_to_cpu(inode.v->bi_flags);
240         unpacked->bi_sectors    = le64_to_cpu(inode.v->bi_sectors);
241         unpacked->bi_size       = le64_to_cpu(inode.v->bi_size);
242         unpacked->bi_version    = le64_to_cpu(inode.v->bi_version);
243         unpacked->bi_mode       = INODEv3_MODE(inode.v);
244
245 #define x(_name, _bits)                                                 \
246         if (fieldnr < nr_fields) {                                      \
247                 ret = bch2_varint_decode_fast(in, end, &v[0]);          \
248                 if (ret < 0)                                            \
249                         return ret;                                     \
250                 in += ret;                                              \
251                                                                         \
252                 if (_bits > 64) {                                       \
253                         ret = bch2_varint_decode_fast(in, end, &v[1]);  \
254                         if (ret < 0)                                    \
255                                 return ret;                             \
256                         in += ret;                                      \
257                 } else {                                                \
258                         v[1] = 0;                                       \
259                 }                                                       \
260         } else {                                                        \
261                 v[0] = v[1] = 0;                                        \
262         }                                                               \
263                                                                         \
264         unpacked->_name = v[0];                                         \
265         if (v[1] || v[0] != unpacked->_name)                            \
266                 return -1;                                              \
267         fieldnr++;
268
269         BCH_INODE_FIELDS_v3()
270 #undef  x
271
272         /* XXX: signal if there were more fields than expected? */
273         return 0;
274 }
275
276 static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
277                                                struct bch_inode_unpacked *unpacked)
278 {
279         memset(unpacked, 0, sizeof(*unpacked));
280
281         switch (k.k->type) {
282         case KEY_TYPE_inode: {
283                 struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
284
285                 unpacked->bi_inum       = inode.k->p.offset;
286                 unpacked->bi_journal_seq= 0;
287                 unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
288                 unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
289                 unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
290
291                 if (INODE_NEW_VARINT(inode.v)) {
292                         return bch2_inode_unpack_v2(unpacked, inode.v->fields,
293                                                     bkey_val_end(inode),
294                                                     INODE_NR_FIELDS(inode.v));
295                 } else {
296                         return bch2_inode_unpack_v1(inode, unpacked);
297                 }
298                 break;
299         }
300         case KEY_TYPE_inode_v2: {
301                 struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
302
303                 unpacked->bi_inum       = inode.k->p.offset;
304                 unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
305                 unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
306                 unpacked->bi_flags      = le64_to_cpu(inode.v->bi_flags);
307                 unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
308
309                 return bch2_inode_unpack_v2(unpacked, inode.v->fields,
310                                             bkey_val_end(inode),
311                                             INODEv2_NR_FIELDS(inode.v));
312         }
313         default:
314                 BUG();
315         }
316 }
317
318 int bch2_inode_unpack(struct bkey_s_c k,
319                       struct bch_inode_unpacked *unpacked)
320 {
321         if (likely(k.k->type == KEY_TYPE_inode_v3))
322                 return bch2_inode_unpack_v3(k, unpacked);
323         return bch2_inode_unpack_slowpath(k, unpacked);
324 }
325
326 static int bch2_inode_peek_nowarn(struct btree_trans *trans,
327                     struct btree_iter *iter,
328                     struct bch_inode_unpacked *inode,
329                     subvol_inum inum, unsigned flags)
330 {
331         struct bkey_s_c k;
332         u32 snapshot;
333         int ret;
334
335         ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
336         if (ret)
337                 return ret;
338
339         k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
340                                SPOS(0, inum.inum, snapshot),
341                                flags|BTREE_ITER_CACHED);
342         ret = bkey_err(k);
343         if (ret)
344                 return ret;
345
346         ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
347         if (ret)
348                 goto err;
349
350         ret = bch2_inode_unpack(k, inode);
351         if (ret)
352                 goto err;
353
354         return 0;
355 err:
356         bch2_trans_iter_exit(trans, iter);
357         return ret;
358 }
359
360 int bch2_inode_peek(struct btree_trans *trans,
361                     struct btree_iter *iter,
362                     struct bch_inode_unpacked *inode,
363                     subvol_inum inum, unsigned flags)
364 {
365         int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
366         bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
367         return ret;
368 }
369
370 int bch2_inode_write(struct btree_trans *trans,
371                      struct btree_iter *iter,
372                      struct bch_inode_unpacked *inode)
373 {
374         struct bkey_inode_buf *inode_p;
375
376         inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
377         if (IS_ERR(inode_p))
378                 return PTR_ERR(inode_p);
379
380         bch2_inode_pack_inlined(inode_p, inode);
381         inode_p->inode.k.p.snapshot = iter->snapshot;
382         return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
383 }
384
385 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
386 {
387         struct bch_inode_unpacked u;
388         struct bkey_inode_buf *inode_p;
389         int ret;
390
391         if (!bkey_is_inode(&k->k))
392                 return ERR_PTR(-ENOENT);
393
394         inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
395         if (IS_ERR(inode_p))
396                 return ERR_CAST(inode_p);
397
398         ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
399         if (ret)
400                 return ERR_PTR(ret);
401
402         bch2_inode_pack(inode_p, &u);
403         return &inode_p->inode.k_i;
404 }
405
406 static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
407 {
408         struct bch_inode_unpacked unpacked;
409         int ret = 0;
410
411         bkey_fsck_err_on(k.k->p.inode, c, err,
412                          inode_pos_inode_nonzero,
413                          "nonzero k.p.inode");
414
415         bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
416                          inode_pos_blockdev_range,
417                          "fs inode in blockdev range");
418
419         bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
420                          inode_unpack_error,
421                          "invalid variable length fields");
422
423         bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
424                          inode_checksum_type_invalid,
425                          "invalid data checksum type (%u >= %u",
426                          unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
427
428         bkey_fsck_err_on(unpacked.bi_compression &&
429                          !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
430                          inode_compression_type_invalid,
431                          "invalid compression opt %u", unpacked.bi_compression - 1);
432
433         bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
434                          unpacked.bi_nlink != 0, c, err,
435                          inode_unlinked_but_nlink_nonzero,
436                          "flagged as unlinked but bi_nlink != 0");
437
438         bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
439                          inode_subvol_root_but_not_dir,
440                          "subvolume root but not a directory");
441 fsck_err:
442         return ret;
443 }
444
445 int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
446                        enum bkey_invalid_flags flags,
447                        struct printbuf *err)
448 {
449         struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
450         int ret = 0;
451
452         bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
453                          inode_str_hash_invalid,
454                          "invalid str hash type (%llu >= %u)",
455                          INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
456
457         ret = __bch2_inode_invalid(c, k, err);
458 fsck_err:
459         return ret;
460 }
461
462 int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
463                           enum bkey_invalid_flags flags,
464                           struct printbuf *err)
465 {
466         struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
467         int ret = 0;
468
469         bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
470                          inode_str_hash_invalid,
471                          "invalid str hash type (%llu >= %u)",
472                          INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
473
474         ret = __bch2_inode_invalid(c, k, err);
475 fsck_err:
476         return ret;
477 }
478
479 int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
480                           enum bkey_invalid_flags flags,
481                           struct printbuf *err)
482 {
483         struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
484         int ret = 0;
485
486         bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
487                          INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
488                          inode_v3_fields_start_bad,
489                          "invalid fields_start (got %llu, min %u max %zu)",
490                          INODEv3_FIELDS_START(inode.v),
491                          INODEv3_FIELDS_START_INITIAL,
492                          bkey_val_u64s(inode.k));
493
494         bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
495                          inode_str_hash_invalid,
496                          "invalid str hash type (%llu >= %u)",
497                          INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
498
499         ret = __bch2_inode_invalid(c, k, err);
500 fsck_err:
501         return ret;
502 }
503
504 static void __bch2_inode_unpacked_to_text(struct printbuf *out,
505                                           struct bch_inode_unpacked *inode)
506 {
507         prt_printf(out, "mode=%o ", inode->bi_mode);
508
509         prt_str(out, "flags=");
510         prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
511         prt_printf(out, " (%x)", inode->bi_flags);
512
513         prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
514                inode->bi_journal_seq,
515                inode->bi_size,
516                inode->bi_sectors,
517                inode->bi_version);
518
519 #define x(_name, _bits)                                         \
520         prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
521         BCH_INODE_FIELDS_v3()
522 #undef  x
523 }
524
525 void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
526 {
527         prt_printf(out, "inum: %llu ", inode->bi_inum);
528         __bch2_inode_unpacked_to_text(out, inode);
529 }
530
531 void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
532 {
533         struct bch_inode_unpacked inode;
534
535         if (bch2_inode_unpack(k, &inode)) {
536                 prt_printf(out, "(unpack error)");
537                 return;
538         }
539
540         __bch2_inode_unpacked_to_text(out, &inode);
541 }
542
543 static inline u64 bkey_inode_flags(struct bkey_s_c k)
544 {
545         switch (k.k->type) {
546         case KEY_TYPE_inode:
547                 return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
548         case KEY_TYPE_inode_v2:
549                 return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
550         case KEY_TYPE_inode_v3:
551                 return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
552         default:
553                 return 0;
554         }
555 }
556
557 static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
558 {
559         return bkey_inode_flags(k) & BCH_INODE_unlinked;
560 }
561
562 int bch2_trans_mark_inode(struct btree_trans *trans,
563                           enum btree_id btree_id, unsigned level,
564                           struct bkey_s_c old,
565                           struct bkey_i *new,
566                           unsigned flags)
567 {
568         int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
569         bool old_deleted = bkey_is_deleted_inode(old);
570         bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new));
571
572         if (nr) {
573                 int ret = bch2_replicas_deltas_realloc(trans, 0);
574                 struct replicas_delta_list *d = trans->fs_usage_deltas;
575
576                 if (ret)
577                         return ret;
578
579                 d->nr_inodes += nr;
580         }
581
582         if (old_deleted != new_deleted) {
583                 int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted);
584                 if (ret)
585                         return ret;
586         }
587
588         return 0;
589 }
590
591 int bch2_mark_inode(struct btree_trans *trans,
592                     enum btree_id btree_id, unsigned level,
593                     struct bkey_s_c old, struct bkey_s_c new,
594                     unsigned flags)
595 {
596         struct bch_fs *c = trans->c;
597         struct bch_fs_usage *fs_usage;
598         u64 journal_seq = trans->journal_res.seq;
599
600         if (flags & BTREE_TRIGGER_INSERT) {
601                 struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
602
603                 BUG_ON(!journal_seq);
604                 BUG_ON(new.k->type != KEY_TYPE_inode_v3);
605
606                 v->bi_journal_seq = cpu_to_le64(journal_seq);
607         }
608
609         if (flags & BTREE_TRIGGER_GC) {
610                 percpu_down_read(&c->mark_lock);
611                 preempt_disable();
612
613                 fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
614                 fs_usage->nr_inodes += bkey_is_inode(new.k);
615                 fs_usage->nr_inodes -= bkey_is_inode(old.k);
616
617                 preempt_enable();
618                 percpu_up_read(&c->mark_lock);
619         }
620         return 0;
621 }
622
623 int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
624                                   enum bkey_invalid_flags flags,
625                                   struct printbuf *err)
626 {
627         int ret = 0;
628
629         bkey_fsck_err_on(k.k->p.inode, c, err,
630                          inode_pos_inode_nonzero,
631                          "nonzero k.p.inode");
632 fsck_err:
633         return ret;
634 }
635
636 void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
637                                    struct bkey_s_c k)
638 {
639         struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
640
641         prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
642 }
643
644 void bch2_inode_init_early(struct bch_fs *c,
645                            struct bch_inode_unpacked *inode_u)
646 {
647         enum bch_str_hash_type str_hash =
648                 bch2_str_hash_opt_to_type(c, c->opts.str_hash);
649
650         memset(inode_u, 0, sizeof(*inode_u));
651
652         /* ick */
653         inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
654         get_random_bytes(&inode_u->bi_hash_seed,
655                          sizeof(inode_u->bi_hash_seed));
656 }
657
658 void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
659                           uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
660                           struct bch_inode_unpacked *parent)
661 {
662         inode_u->bi_mode        = mode;
663         inode_u->bi_uid         = uid;
664         inode_u->bi_gid         = gid;
665         inode_u->bi_dev         = rdev;
666         inode_u->bi_atime       = now;
667         inode_u->bi_mtime       = now;
668         inode_u->bi_ctime       = now;
669         inode_u->bi_otime       = now;
670
671         if (parent && parent->bi_mode & S_ISGID) {
672                 inode_u->bi_gid = parent->bi_gid;
673                 if (S_ISDIR(mode))
674                         inode_u->bi_mode |= S_ISGID;
675         }
676
677         if (parent) {
678 #define x(_name, ...)   inode_u->bi_##_name = parent->bi_##_name;
679                 BCH_INODE_OPTS()
680 #undef x
681         }
682 }
683
684 void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
685                      uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
686                      struct bch_inode_unpacked *parent)
687 {
688         bch2_inode_init_early(c, inode_u);
689         bch2_inode_init_late(inode_u, bch2_current_time(c),
690                              uid, gid, mode, rdev, parent);
691 }
692
693 static inline u32 bkey_generation(struct bkey_s_c k)
694 {
695         switch (k.k->type) {
696         case KEY_TYPE_inode:
697         case KEY_TYPE_inode_v2:
698                 BUG();
699         case KEY_TYPE_inode_generation:
700                 return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
701         default:
702                 return 0;
703         }
704 }
705
706 /*
707  * This just finds an empty slot:
708  */
709 int bch2_inode_create(struct btree_trans *trans,
710                       struct btree_iter *iter,
711                       struct bch_inode_unpacked *inode_u,
712                       u32 snapshot, u64 cpu)
713 {
714         struct bch_fs *c = trans->c;
715         struct bkey_s_c k;
716         u64 min, max, start, pos, *hint;
717         int ret = 0;
718         unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
719
720         if (c->opts.shard_inode_numbers) {
721                 bits -= c->inode_shard_bits;
722
723                 min = (cpu << bits);
724                 max = (cpu << bits) | ~(ULLONG_MAX << bits);
725
726                 min = max_t(u64, min, BLOCKDEV_INODE_MAX);
727                 hint = c->unused_inode_hints + cpu;
728         } else {
729                 min = BLOCKDEV_INODE_MAX;
730                 max = ~(ULLONG_MAX << bits);
731                 hint = c->unused_inode_hints;
732         }
733
734         start = READ_ONCE(*hint);
735
736         if (start >= max || start < min)
737                 start = min;
738
739         pos = start;
740         bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
741                              BTREE_ITER_ALL_SNAPSHOTS|
742                              BTREE_ITER_INTENT);
743 again:
744         while ((k = bch2_btree_iter_peek(iter)).k &&
745                !(ret = bkey_err(k)) &&
746                bkey_lt(k.k->p, POS(0, max))) {
747                 if (pos < iter->pos.offset)
748                         goto found_slot;
749
750                 /*
751                  * We don't need to iterate over keys in every snapshot once
752                  * we've found just one:
753                  */
754                 pos = iter->pos.offset + 1;
755                 bch2_btree_iter_set_pos(iter, POS(0, pos));
756         }
757
758         if (!ret && pos < max)
759                 goto found_slot;
760
761         if (!ret && start == min)
762                 ret = -BCH_ERR_ENOSPC_inode_create;
763
764         if (ret) {
765                 bch2_trans_iter_exit(trans, iter);
766                 return ret;
767         }
768
769         /* Retry from start */
770         pos = start = min;
771         bch2_btree_iter_set_pos(iter, POS(0, pos));
772         goto again;
773 found_slot:
774         bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
775         k = bch2_btree_iter_peek_slot(iter);
776         ret = bkey_err(k);
777         if (ret) {
778                 bch2_trans_iter_exit(trans, iter);
779                 return ret;
780         }
781
782         *hint                   = k.k->p.offset;
783         inode_u->bi_inum        = k.k->p.offset;
784         inode_u->bi_generation  = bkey_generation(k);
785         return 0;
786 }
787
788 static int bch2_inode_delete_keys(struct btree_trans *trans,
789                                   subvol_inum inum, enum btree_id id)
790 {
791         struct btree_iter iter;
792         struct bkey_s_c k;
793         struct bkey_i delete;
794         struct bpos end = POS(inum.inum, U64_MAX);
795         u32 snapshot;
796         int ret = 0;
797
798         /*
799          * We're never going to be deleting partial extents, no need to use an
800          * extent iterator:
801          */
802         bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
803                              BTREE_ITER_INTENT);
804
805         while (1) {
806                 bch2_trans_begin(trans);
807
808                 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
809                 if (ret)
810                         goto err;
811
812                 bch2_btree_iter_set_snapshot(&iter, snapshot);
813
814                 k = bch2_btree_iter_peek_upto(&iter, end);
815                 ret = bkey_err(k);
816                 if (ret)
817                         goto err;
818
819                 if (!k.k)
820                         break;
821
822                 bkey_init(&delete.k);
823                 delete.k.p = iter.pos;
824
825                 if (iter.flags & BTREE_ITER_IS_EXTENTS)
826                         bch2_key_resize(&delete.k,
827                                         bpos_min(end, k.k->p).offset -
828                                         iter.pos.offset);
829
830                 ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
831                       bch2_trans_commit(trans, NULL, NULL,
832                                         BTREE_INSERT_NOFAIL);
833 err:
834                 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
835                         break;
836         }
837
838         bch2_trans_iter_exit(trans, &iter);
839         return ret;
840 }
841
842 int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
843 {
844         struct btree_trans *trans = bch2_trans_get(c);
845         struct btree_iter iter = { NULL };
846         struct bkey_i_inode_generation delete;
847         struct bch_inode_unpacked inode_u;
848         struct bkey_s_c k;
849         u32 snapshot;
850         int ret;
851
852         /*
853          * If this was a directory, there shouldn't be any real dirents left -
854          * but there could be whiteouts (from hash collisions) that we should
855          * delete:
856          *
857          * XXX: the dirent could ideally would delete whiteouts when they're no
858          * longer needed
859          */
860         ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
861                 bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
862                 bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
863         if (ret)
864                 goto err;
865 retry:
866         bch2_trans_begin(trans);
867
868         ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
869         if (ret)
870                 goto err;
871
872         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
873                                SPOS(0, inum.inum, snapshot),
874                                BTREE_ITER_INTENT|BTREE_ITER_CACHED);
875         ret = bkey_err(k);
876         if (ret)
877                 goto err;
878
879         if (!bkey_is_inode(k.k)) {
880                 bch2_fs_inconsistent(c,
881                                      "inode %llu:%u not found when deleting",
882                                      inum.inum, snapshot);
883                 ret = -EIO;
884                 goto err;
885         }
886
887         bch2_inode_unpack(k, &inode_u);
888
889         bkey_inode_generation_init(&delete.k_i);
890         delete.k.p = iter.pos;
891         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
892
893         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
894                 bch2_trans_commit(trans, NULL, NULL,
895                                 BTREE_INSERT_NOFAIL);
896 err:
897         bch2_trans_iter_exit(trans, &iter);
898         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
899                 goto retry;
900
901         bch2_trans_put(trans);
902         return ret;
903 }
904
905 int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
906                                   subvol_inum inum,
907                                   struct bch_inode_unpacked *inode)
908 {
909         struct btree_iter iter;
910         int ret;
911
912         ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
913         if (!ret)
914                 bch2_trans_iter_exit(trans, &iter);
915         return ret;
916 }
917
918 int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
919                                   subvol_inum inum,
920                                   struct bch_inode_unpacked *inode)
921 {
922         struct btree_iter iter;
923         int ret;
924
925         ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
926         if (!ret)
927                 bch2_trans_iter_exit(trans, &iter);
928         return ret;
929 }
930
931 int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
932                             struct bch_inode_unpacked *inode)
933 {
934         return bch2_trans_do(c, NULL, NULL, 0,
935                 bch2_inode_find_by_inum_trans(trans, inum, inode));
936 }
937
938 int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
939 {
940         if (bi->bi_flags & BCH_INODE_unlinked)
941                 bi->bi_flags &= ~BCH_INODE_unlinked;
942         else {
943                 if (bi->bi_nlink == U32_MAX)
944                         return -EINVAL;
945
946                 bi->bi_nlink++;
947         }
948
949         return 0;
950 }
951
952 void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
953 {
954         if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
955                 bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
956                                         bi->bi_inum);
957                 return;
958         }
959
960         if (bi->bi_flags & BCH_INODE_unlinked) {
961                 bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
962                 return;
963         }
964
965         if (bi->bi_nlink)
966                 bi->bi_nlink--;
967         else
968                 bi->bi_flags |= BCH_INODE_unlinked;
969 }
970
971 struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
972 {
973         struct bch_opts ret = { 0 };
974 #define x(_name, _bits)                                                 \
975         if (inode->bi_##_name)                                          \
976                 opt_set(ret, _name, inode->bi_##_name - 1);
977         BCH_INODE_OPTS()
978 #undef x
979         return ret;
980 }
981
982 void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
983                          struct bch_inode_unpacked *inode)
984 {
985 #define x(_name, _bits)         opts->_name = inode_opt_get(c, inode, _name);
986         BCH_INODE_OPTS()
987 #undef x
988
989         if (opts->nocow)
990                 opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
991 }
992
993 int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
994 {
995         struct bch_inode_unpacked inode;
996         int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
997
998         if (ret)
999                 return ret;
1000
1001         bch2_inode_opts_get(opts, trans->c, &inode);
1002         return 0;
1003 }
1004
1005 int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
1006 {
1007         struct bch_fs *c = trans->c;
1008         struct btree_iter iter = { NULL };
1009         struct bkey_i_inode_generation delete;
1010         struct bch_inode_unpacked inode_u;
1011         struct bkey_s_c k;
1012         int ret;
1013
1014         do {
1015                 ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1016                                                       SPOS(inum, 0, snapshot),
1017                                                       SPOS(inum, U64_MAX, snapshot),
1018                                                       0, NULL) ?:
1019                         bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
1020                                                       SPOS(inum, 0, snapshot),
1021                                                       SPOS(inum, U64_MAX, snapshot),
1022                                                       0, NULL) ?:
1023                         bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
1024                                                       SPOS(inum, 0, snapshot),
1025                                                       SPOS(inum, U64_MAX, snapshot),
1026                                                       0, NULL);
1027         } while (ret == -BCH_ERR_transaction_restart_nested);
1028         if (ret)
1029                 goto err;
1030 retry:
1031         bch2_trans_begin(trans);
1032
1033         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
1034                                SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
1035         ret = bkey_err(k);
1036         if (ret)
1037                 goto err;
1038
1039         if (!bkey_is_inode(k.k)) {
1040                 bch2_fs_inconsistent(c,
1041                                      "inode %llu:%u not found when deleting",
1042                                      inum, snapshot);
1043                 ret = -EIO;
1044                 goto err;
1045         }
1046
1047         bch2_inode_unpack(k, &inode_u);
1048
1049         /* Subvolume root? */
1050         if (inode_u.bi_subvol)
1051                 bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
1052
1053         bkey_inode_generation_init(&delete.k_i);
1054         delete.k.p = iter.pos;
1055         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
1056
1057         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
1058                 bch2_trans_commit(trans, NULL, NULL,
1059                                 BTREE_INSERT_NOFAIL);
1060 err:
1061         bch2_trans_iter_exit(trans, &iter);
1062         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1063                 goto retry;
1064
1065         return ret ?: -BCH_ERR_transaction_restart_nested;
1066 }
1067
1068 static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
1069 {
1070         struct bch_fs *c = trans->c;
1071         struct btree_iter iter;
1072         struct bkey_s_c k;
1073         struct bch_inode_unpacked inode;
1074         int ret;
1075
1076         if (bch2_snapshot_is_internal_node(c, pos.snapshot))
1077                 return 0;
1078
1079         if (!fsck_err_on(c->sb.clean, c,
1080                          deleted_inode_but_clean,
1081                          "filesystem marked as clean but have deleted inode %llu:%u",
1082                          pos.offset, pos.snapshot))
1083                 return 0;
1084
1085         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
1086         ret = bkey_err(k);
1087         if (ret)
1088                 return ret;
1089
1090         ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
1091         if (fsck_err_on(!bkey_is_inode(k.k), c,
1092                         deleted_inode_missing,
1093                         "nonexistent inode %llu:%u in deleted_inodes btree",
1094                         pos.offset, pos.snapshot))
1095                 goto delete;
1096
1097         ret = bch2_inode_unpack(k, &inode);
1098         if (ret)
1099                 goto err;
1100
1101         if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
1102                         deleted_inode_is_dir,
1103                         "directory %llu:%u in deleted_inodes btree",
1104                         pos.offset, pos.snapshot))
1105                 goto delete;
1106
1107         if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
1108                         deleted_inode_not_unlinked,
1109                         "non-deleted inode %llu:%u in deleted_inodes btree",
1110                         pos.offset, pos.snapshot))
1111                 goto delete;
1112
1113         return 1;
1114 err:
1115 fsck_err:
1116         return ret;
1117 delete:
1118         return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
1119 }
1120
1121 int bch2_delete_dead_inodes(struct bch_fs *c)
1122 {
1123         struct btree_trans *trans = bch2_trans_get(c);
1124         struct btree_iter iter;
1125         struct bkey_s_c k;
1126         int ret;
1127
1128         ret = bch2_btree_write_buffer_flush_sync(trans);
1129         if (ret)
1130                 goto err;
1131
1132         /*
1133          * Weird transaction restart handling here because on successful delete,
1134          * bch2_inode_rm_snapshot() will return a nested transaction restart,
1135          * but we can't retry because the btree write buffer won't have been
1136          * flushed and we'd spin:
1137          */
1138         for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
1139                            BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
1140                 ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p));
1141                 if (ret < 0)
1142                         break;
1143
1144                 if (ret) {
1145                         if (!test_bit(BCH_FS_RW, &c->flags)) {
1146                                 bch2_trans_unlock(trans);
1147                                 bch2_fs_lazy_rw(c);
1148                         }
1149
1150                         ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
1151                         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1152                                 break;
1153                 }
1154         }
1155         bch2_trans_iter_exit(trans, &iter);
1156 err:
1157         bch2_trans_put(trans);
1158
1159         return ret;
1160 }