]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/inode.c
Update bcachefs sources to d464ec667b2b bcachefs: Add missing printk newlines
[bcachefs-tools-debian] / libbcachefs / inode.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_write_buffer.h"
6 #include "bkey_methods.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "compress.h"
10 #include "error.h"
11 #include "extents.h"
12 #include "extent_update.h"
13 #include "inode.h"
14 #include "str_hash.h"
15 #include "snapshot.h"
16 #include "subvolume.h"
17 #include "varint.h"
18
19 #include <linux/random.h>
20
21 #include <asm/unaligned.h>
22
/*
 * Name tables generated from x-macro lists: x() stringifies the first
 * argument of every list entry, and each array ends with a NULL sentinel.
 */
23 #define x(name, ...)    #name,
/* Names of the entries produced by BCH_INODE_OPTS(), in list order. */
24 const char * const bch2_inode_opts[] = {
25         BCH_INODE_OPTS()
26         NULL,
27 };
28
/* Names of the entries produced by BCH_INODE_FLAGS(); used when printing bi_flags. */
29 static const char * const bch2_inode_flag_strs[] = {
30         BCH_INODE_FLAGS()
31         NULL
32 };
33 #undef  x
34
35 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
36
37 static int inode_decode_field(const u8 *in, const u8 *end,
38                               u64 out[2], unsigned *out_bits)
39 {
40         __be64 be[2] = { 0, 0 };
41         unsigned bytes, shift;
42         u8 *p;
43
44         if (in >= end)
45                 return -1;
46
47         if (!*in)
48                 return -1;
49
50         /*
51          * position of highest set bit indicates number of bytes:
52          * shift = number of bits to remove in high byte:
53          */
54         shift   = 8 - __fls(*in); /* 1 <= shift <= 8 */
55         bytes   = byte_table[shift - 1];
56
57         if (in + bytes > end)
58                 return -1;
59
60         p = (u8 *) be + 16 - bytes;
61         memcpy(p, in, bytes);
62         *p ^= (1 << 8) >> shift;
63
64         out[0] = be64_to_cpu(be[0]);
65         out[1] = be64_to_cpu(be[1]);
66         *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
67
68         return bytes;
69 }
70
/*
 * Encode @inode into @packed as an inode_v3 key.
 *
 * The fixed members (inum, journal seq, hash seed, flags, sectors, size,
 * version, mode) are stored directly in the value; every entry of
 * BCH_INODE_FIELDS_v3() is then appended to the variable-length area.
 * Trailing fields that are zero are not stored: the value is cut back to the
 * end of the last nonzero field and NR_FIELDS records how many were kept.
 */
71 static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
72                                            const struct bch_inode_unpacked *inode)
73 {
74         struct bkey_inode_v3 *k = &packed->inode;
75         u8 *out = k->v.fields;
76         u8 *end = (void *) &packed[1];
77         u8 *last_nonzero_field = out;
78         unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
79         unsigned bytes;
80         int ret;
81
82         bkey_inode_v3_init(&packed->inode.k_i);
83         packed->inode.k.p.offset        = inode->bi_inum;
84         packed->inode.v.bi_journal_seq  = cpu_to_le64(inode->bi_journal_seq);
85         packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
86         packed->inode.v.bi_flags        = cpu_to_le64(inode->bi_flags);
87         packed->inode.v.bi_sectors      = cpu_to_le64(inode->bi_sectors);
88         packed->inode.v.bi_size         = cpu_to_le64(inode->bi_size);
89         packed->inode.v.bi_version      = cpu_to_le64(inode->bi_version);
90         SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
91         SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
92
93
/*
 * Per-field encoder: a nonzero field is written with
 * bch2_varint_encode_fast(), a zero field as a single 0 byte. Fields declared
 * wider than 64 bits get one extra 0 byte. The position just past the most
 * recent nonzero field is remembered so trailing zero fields can be trimmed.
 */
94 #define x(_name, _bits)                                                 \
95         nr_fields++;                                                    \
96                                                                         \
97         if (inode->_name) {                                             \
98                 ret = bch2_varint_encode_fast(out, inode->_name);       \
99                 out += ret;                                             \
100                                                                         \
101                 if (_bits > 64)                                         \
102                         *out++ = 0;                                     \
103                                                                         \
104                 last_nonzero_field = out;                               \
105                 last_nonzero_fieldnr = nr_fields;                       \
106         } else {                                                        \
107                 *out++ = 0;                                             \
108                                                                         \
109                 if (_bits > 64)                                         \
110                         *out++ = 0;                                     \
111         }
112
113         BCH_INODE_FIELDS_v3()
114 #undef  x
        /* end is one past the whole bkey_inode_buf; checked after all fields were written */
115         BUG_ON(out > end);
116
        /* Drop trailing zero fields. */
117         out = last_nonzero_field;
118         nr_fields = last_nonzero_fieldnr;
119
120         bytes = out - (u8 *) &packed->inode.v;
121         set_bkey_val_bytes(&packed->inode.k, bytes);
        /* NOTE(review): presumably zeroes the padding up to the next u64 boundary - confirm */
122         memset_u64s_tail(&packed->inode.v, 0, bytes);
123
124         SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
125
        /* Debug builds: unpack what was just packed and verify it round-trips. */
126         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
127                 struct bch_inode_unpacked unpacked;
128
129                 ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
130                 BUG_ON(ret);
131                 BUG_ON(unpacked.bi_inum         != inode->bi_inum);
132                 BUG_ON(unpacked.bi_hash_seed    != inode->bi_hash_seed);
133                 BUG_ON(unpacked.bi_sectors      != inode->bi_sectors);
134                 BUG_ON(unpacked.bi_size         != inode->bi_size);
135                 BUG_ON(unpacked.bi_version      != inode->bi_version);
136                 BUG_ON(unpacked.bi_mode         != inode->bi_mode);
137
138 #define x(_name, _bits) if (unpacked._name != inode->_name)             \
139                         panic("unpacked %llu should be %llu",           \
140                               (u64) unpacked._name, (u64) inode->_name);
141                 BCH_INODE_FIELDS_v3()
142 #undef  x
143         }
144 }
145
/*
 * Out-of-line entry point for packing an inode: forwards @inode to
 * bch2_inode_pack_inlined(), which fills in @packed.
 */
void bch2_inode_pack(struct bkey_inode_buf *packed,
		     const struct bch_inode_unpacked *inode)
{
	bch2_inode_pack_inlined(packed, inode);
}
151
/*
 * Decode the variable-length fields of an original-format inode, whose fields
 * use the encoding read by inode_decode_field().
 *
 * Fields are decoded in BCH_INODE_FIELDS_v2() order. Once INODE_NR_FIELDS()
 * fields have been consumed, the rest of @unpacked, from the current field to
 * the end of the struct, is zeroed and 0 is returned.
 *
 * Returns 0 on success, or a negative value if a field fails to decode or has
 * more significant bits than its destination member can hold.
 */
152 static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
153                                 struct bch_inode_unpacked *unpacked)
154 {
155         const u8 *in = inode.v->fields;
156         const u8 *end = bkey_val_end(inode);
157         u64 field[2];
158         unsigned fieldnr = 0, field_bits;
159         int ret;
160
/* Only field[1], the low 64 bits of the decoded value, is stored. */
161 #define x(_name, _bits)                                 \
162         if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {                    \
163                 unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
164                 memset((void *) unpacked + offset, 0,                   \
165                        sizeof(*unpacked) - offset);                     \
166                 return 0;                                               \
167         }                                                               \
168                                                                         \
169         ret = inode_decode_field(in, end, field, &field_bits);          \
170         if (ret < 0)                                                    \
171                 return ret;                                             \
172                                                                         \
173         if (field_bits > sizeof(unpacked->_name) * 8)                   \
174                 return -1;                                              \
175                                                                         \
176         unpacked->_name = field[1];                                     \
177         in += ret;
178
179         BCH_INODE_FIELDS_v2()
180 #undef  x
181
182         /* XXX: signal if there were more fields than expected? */
183         return 0;
184 }
185
186 static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
187                                 const u8 *in, const u8 *end,
188                                 unsigned nr_fields)
189 {
190         unsigned fieldnr = 0;
191         int ret;
192         u64 v[2];
193
194 #define x(_name, _bits)                                                 \
195         if (fieldnr < nr_fields) {                                      \
196                 ret = bch2_varint_decode_fast(in, end, &v[0]);          \
197                 if (ret < 0)                                            \
198                         return ret;                                     \
199                 in += ret;                                              \
200                                                                         \
201                 if (_bits > 64) {                                       \
202                         ret = bch2_varint_decode_fast(in, end, &v[1]);  \
203                         if (ret < 0)                                    \
204                                 return ret;                             \
205                         in += ret;                                      \
206                 } else {                                                \
207                         v[1] = 0;                                       \
208                 }                                                       \
209         } else {                                                        \
210                 v[0] = v[1] = 0;                                        \
211         }                                                               \
212                                                                         \
213         unpacked->_name = v[0];                                         \
214         if (v[1] || v[0] != unpacked->_name)                            \
215                 return -1;                                              \
216         fieldnr++;
217
218         BCH_INODE_FIELDS_v2()
219 #undef  x
220
221         /* XXX: signal if there were more fields than expected? */
222         return 0;
223 }
224
225 static int bch2_inode_unpack_v3(struct bkey_s_c k,
226                                 struct bch_inode_unpacked *unpacked)
227 {
228         struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
229         const u8 *in = inode.v->fields;
230         const u8 *end = bkey_val_end(inode);
231         unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
232         unsigned fieldnr = 0;
233         int ret;
234         u64 v[2];
235
236         unpacked->bi_inum       = inode.k->p.offset;
237         unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
238         unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
239         unpacked->bi_flags      = le64_to_cpu(inode.v->bi_flags);
240         unpacked->bi_sectors    = le64_to_cpu(inode.v->bi_sectors);
241         unpacked->bi_size       = le64_to_cpu(inode.v->bi_size);
242         unpacked->bi_version    = le64_to_cpu(inode.v->bi_version);
243         unpacked->bi_mode       = INODEv3_MODE(inode.v);
244
245 #define x(_name, _bits)                                                 \
246         if (fieldnr < nr_fields) {                                      \
247                 ret = bch2_varint_decode_fast(in, end, &v[0]);          \
248                 if (ret < 0)                                            \
249                         return ret;                                     \
250                 in += ret;                                              \
251                                                                         \
252                 if (_bits > 64) {                                       \
253                         ret = bch2_varint_decode_fast(in, end, &v[1]);  \
254                         if (ret < 0)                                    \
255                                 return ret;                             \
256                         in += ret;                                      \
257                 } else {                                                \
258                         v[1] = 0;                                       \
259                 }                                                       \
260         } else {                                                        \
261                 v[0] = v[1] = 0;                                        \
262         }                                                               \
263                                                                         \
264         unpacked->_name = v[0];                                         \
265         if (v[1] || v[0] != unpacked->_name)                            \
266                 return -1;                                              \
267         fieldnr++;
268
269         BCH_INODE_FIELDS_v3()
270 #undef  x
271
272         /* XXX: signal if there were more fields than expected? */
273         return 0;
274 }
275
276 static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
277                                                struct bch_inode_unpacked *unpacked)
278 {
279         memset(unpacked, 0, sizeof(*unpacked));
280
281         switch (k.k->type) {
282         case KEY_TYPE_inode: {
283                 struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
284
285                 unpacked->bi_inum       = inode.k->p.offset;
286                 unpacked->bi_journal_seq= 0;
287                 unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
288                 unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
289                 unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
290
291                 if (INODE_NEW_VARINT(inode.v)) {
292                         return bch2_inode_unpack_v2(unpacked, inode.v->fields,
293                                                     bkey_val_end(inode),
294                                                     INODE_NR_FIELDS(inode.v));
295                 } else {
296                         return bch2_inode_unpack_v1(inode, unpacked);
297                 }
298                 break;
299         }
300         case KEY_TYPE_inode_v2: {
301                 struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
302
303                 unpacked->bi_inum       = inode.k->p.offset;
304                 unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
305                 unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
306                 unpacked->bi_flags      = le64_to_cpu(inode.v->bi_flags);
307                 unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
308
309                 return bch2_inode_unpack_v2(unpacked, inode.v->fields,
310                                             bkey_val_end(inode),
311                                             INODEv2_NR_FIELDS(inode.v));
312         }
313         default:
314                 BUG();
315         }
316 }
317
318 int bch2_inode_unpack(struct bkey_s_c k,
319                       struct bch_inode_unpacked *unpacked)
320 {
321         if (likely(k.k->type == KEY_TYPE_inode_v3))
322                 return bch2_inode_unpack_v3(k, unpacked);
323         return bch2_inode_unpack_slowpath(k, unpacked);
324 }
325
326 static int bch2_inode_peek_nowarn(struct btree_trans *trans,
327                     struct btree_iter *iter,
328                     struct bch_inode_unpacked *inode,
329                     subvol_inum inum, unsigned flags)
330 {
331         struct bkey_s_c k;
332         u32 snapshot;
333         int ret;
334
335         ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
336         if (ret)
337                 return ret;
338
339         k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
340                                SPOS(0, inum.inum, snapshot),
341                                flags|BTREE_ITER_CACHED);
342         ret = bkey_err(k);
343         if (ret)
344                 return ret;
345
346         ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
347         if (ret)
348                 goto err;
349
350         ret = bch2_inode_unpack(k, inode);
351         if (ret)
352                 goto err;
353
354         return 0;
355 err:
356         bch2_trans_iter_exit(trans, iter);
357         return ret;
358 }
359
360 int bch2_inode_peek(struct btree_trans *trans,
361                     struct btree_iter *iter,
362                     struct bch_inode_unpacked *inode,
363                     subvol_inum inum, unsigned flags)
364 {
365         int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
366         bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
367         return ret;
368 }
369
370 int bch2_inode_write_flags(struct btree_trans *trans,
371                      struct btree_iter *iter,
372                      struct bch_inode_unpacked *inode,
373                      enum btree_update_flags flags)
374 {
375         struct bkey_inode_buf *inode_p;
376
377         inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
378         if (IS_ERR(inode_p))
379                 return PTR_ERR(inode_p);
380
381         bch2_inode_pack_inlined(inode_p, inode);
382         inode_p->inode.k.p.snapshot = iter->snapshot;
383         return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
384 }
385
386 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
387 {
388         struct bch_inode_unpacked u;
389         struct bkey_inode_buf *inode_p;
390         int ret;
391
392         if (!bkey_is_inode(&k->k))
393                 return ERR_PTR(-ENOENT);
394
395         inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
396         if (IS_ERR(inode_p))
397                 return ERR_CAST(inode_p);
398
399         ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
400         if (ret)
401                 return ERR_PTR(ret);
402
403         bch2_inode_pack(inode_p, &u);
404         return &inode_p->inode.k_i;
405 }
406
407 static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
408 {
409         struct bch_inode_unpacked unpacked;
410         int ret = 0;
411
412         bkey_fsck_err_on(k.k->p.inode, c, err,
413                          inode_pos_inode_nonzero,
414                          "nonzero k.p.inode");
415
416         bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
417                          inode_pos_blockdev_range,
418                          "fs inode in blockdev range");
419
420         bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
421                          inode_unpack_error,
422                          "invalid variable length fields");
423
424         bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
425                          inode_checksum_type_invalid,
426                          "invalid data checksum type (%u >= %u",
427                          unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
428
429         bkey_fsck_err_on(unpacked.bi_compression &&
430                          !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
431                          inode_compression_type_invalid,
432                          "invalid compression opt %u", unpacked.bi_compression - 1);
433
434         bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
435                          unpacked.bi_nlink != 0, c, err,
436                          inode_unlinked_but_nlink_nonzero,
437                          "flagged as unlinked but bi_nlink != 0");
438
439         bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
440                          inode_subvol_root_but_not_dir,
441                          "subvolume root but not a directory");
442 fsck_err:
443         return ret;
444 }
445
446 int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
447                        enum bkey_invalid_flags flags,
448                        struct printbuf *err)
449 {
450         struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
451         int ret = 0;
452
453         bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
454                          inode_str_hash_invalid,
455                          "invalid str hash type (%llu >= %u)",
456                          INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
457
458         ret = __bch2_inode_invalid(c, k, err);
459 fsck_err:
460         return ret;
461 }
462
463 int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
464                           enum bkey_invalid_flags flags,
465                           struct printbuf *err)
466 {
467         struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
468         int ret = 0;
469
470         bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
471                          inode_str_hash_invalid,
472                          "invalid str hash type (%llu >= %u)",
473                          INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
474
475         ret = __bch2_inode_invalid(c, k, err);
476 fsck_err:
477         return ret;
478 }
479
480 int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
481                           enum bkey_invalid_flags flags,
482                           struct printbuf *err)
483 {
484         struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
485         int ret = 0;
486
487         bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
488                          INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
489                          inode_v3_fields_start_bad,
490                          "invalid fields_start (got %llu, min %u max %zu)",
491                          INODEv3_FIELDS_START(inode.v),
492                          INODEv3_FIELDS_START_INITIAL,
493                          bkey_val_u64s(inode.k));
494
495         bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
496                          inode_str_hash_invalid,
497                          "invalid str hash type (%llu >= %u)",
498                          INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
499
500         ret = __bch2_inode_invalid(c, k, err);
501 fsck_err:
502         return ret;
503 }
504
505 static void __bch2_inode_unpacked_to_text(struct printbuf *out,
506                                           struct bch_inode_unpacked *inode)
507 {
508         prt_printf(out, "mode=%o ", inode->bi_mode);
509
510         prt_str(out, "flags=");
511         prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
512         prt_printf(out, " (%x)", inode->bi_flags);
513
514         prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
515                inode->bi_journal_seq,
516                inode->bi_size,
517                inode->bi_sectors,
518                inode->bi_version);
519
520 #define x(_name, _bits)                                         \
521         prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
522         BCH_INODE_FIELDS_v3()
523 #undef  x
524 }
525
526 void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
527 {
528         prt_printf(out, "inum: %llu ", inode->bi_inum);
529         __bch2_inode_unpacked_to_text(out, inode);
530 }
531
532 void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
533 {
534         struct bch_inode_unpacked inode;
535
536         if (bch2_inode_unpack(k, &inode)) {
537                 prt_printf(out, "(unpack error)");
538                 return;
539         }
540
541         __bch2_inode_unpacked_to_text(out, &inode);
542 }
543
544 static inline u64 bkey_inode_flags(struct bkey_s_c k)
545 {
546         switch (k.k->type) {
547         case KEY_TYPE_inode:
548                 return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
549         case KEY_TYPE_inode_v2:
550                 return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
551         case KEY_TYPE_inode_v3:
552                 return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
553         default:
554                 return 0;
555         }
556 }
557
558 static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
559 {
560         return bkey_inode_flags(k) & BCH_INODE_unlinked;
561 }
562
563 int bch2_trans_mark_inode(struct btree_trans *trans,
564                           enum btree_id btree_id, unsigned level,
565                           struct bkey_s_c old,
566                           struct bkey_i *new,
567                           unsigned flags)
568 {
569         int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
570         bool old_deleted = bkey_is_deleted_inode(old);
571         bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new));
572
573         if (nr) {
574                 int ret = bch2_replicas_deltas_realloc(trans, 0);
575                 struct replicas_delta_list *d = trans->fs_usage_deltas;
576
577                 if (ret)
578                         return ret;
579
580                 d->nr_inodes += nr;
581         }
582
583         if (old_deleted != new_deleted) {
584                 int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted);
585                 if (ret)
586                         return ret;
587         }
588
589         return 0;
590 }
591
592 int bch2_mark_inode(struct btree_trans *trans,
593                     enum btree_id btree_id, unsigned level,
594                     struct bkey_s_c old, struct bkey_s_c new,
595                     unsigned flags)
596 {
597         struct bch_fs *c = trans->c;
598         struct bch_fs_usage *fs_usage;
599         u64 journal_seq = trans->journal_res.seq;
600
601         if (flags & BTREE_TRIGGER_INSERT) {
602                 struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
603
604                 BUG_ON(!journal_seq);
605                 BUG_ON(new.k->type != KEY_TYPE_inode_v3);
606
607                 v->bi_journal_seq = cpu_to_le64(journal_seq);
608         }
609
610         if (flags & BTREE_TRIGGER_GC) {
611                 percpu_down_read(&c->mark_lock);
612                 preempt_disable();
613
614                 fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
615                 fs_usage->nr_inodes += bkey_is_inode(new.k);
616                 fs_usage->nr_inodes -= bkey_is_inode(old.k);
617
618                 preempt_enable();
619                 percpu_up_read(&c->mark_lock);
620         }
621         return 0;
622 }
623
624 int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
625                                   enum bkey_invalid_flags flags,
626                                   struct printbuf *err)
627 {
628         int ret = 0;
629
630         bkey_fsck_err_on(k.k->p.inode, c, err,
631                          inode_pos_inode_nonzero,
632                          "nonzero k.p.inode");
633 fsck_err:
634         return ret;
635 }
636
637 void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
638                                    struct bkey_s_c k)
639 {
640         struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
641
642         prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
643 }
644
645 void bch2_inode_init_early(struct bch_fs *c,
646                            struct bch_inode_unpacked *inode_u)
647 {
648         enum bch_str_hash_type str_hash =
649                 bch2_str_hash_opt_to_type(c, c->opts.str_hash);
650
651         memset(inode_u, 0, sizeof(*inode_u));
652
653         /* ick */
654         inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
655         get_random_bytes(&inode_u->bi_hash_seed,
656                          sizeof(inode_u->bi_hash_seed));
657 }
658
659 void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
660                           uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
661                           struct bch_inode_unpacked *parent)
662 {
663         inode_u->bi_mode        = mode;
664         inode_u->bi_uid         = uid;
665         inode_u->bi_gid         = gid;
666         inode_u->bi_dev         = rdev;
667         inode_u->bi_atime       = now;
668         inode_u->bi_mtime       = now;
669         inode_u->bi_ctime       = now;
670         inode_u->bi_otime       = now;
671
672         if (parent && parent->bi_mode & S_ISGID) {
673                 inode_u->bi_gid = parent->bi_gid;
674                 if (S_ISDIR(mode))
675                         inode_u->bi_mode |= S_ISGID;
676         }
677
678         if (parent) {
679 #define x(_name, ...)   inode_u->bi_##_name = parent->bi_##_name;
680                 BCH_INODE_OPTS()
681 #undef x
682         }
683 }
684
685 void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
686                      uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
687                      struct bch_inode_unpacked *parent)
688 {
689         bch2_inode_init_early(c, inode_u);
690         bch2_inode_init_late(inode_u, bch2_current_time(c),
691                              uid, gid, mode, rdev, parent);
692 }
693
694 static inline u32 bkey_generation(struct bkey_s_c k)
695 {
696         switch (k.k->type) {
697         case KEY_TYPE_inode:
698         case KEY_TYPE_inode_v2:
699                 BUG();
700         case KEY_TYPE_inode_generation:
701                 return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
702         default:
703                 return 0;
704         }
705 }
706
707 /*
708  * This just finds an empty slot:
709  */
/*
 * Searches the inodes btree for an unused inode number and, on success,
 * leaves @iter positioned on that slot in @snapshot and fills in
 * inode_u->bi_inum and inode_u->bi_generation.
 *
 * The search range is [min, max): bounded below by BLOCKDEV_INODE_MAX and
 * above by 31 or 63 bits depending on the inodes_32bit option. With
 * shard_inode_numbers, @cpu selects a sub-range and its own hint slot.
 *
 * Returns 0 on success, -BCH_ERR_ENOSPC_inode_create if the whole range was
 * scanned without finding a free number, or a btree iteration error. @iter is
 * exited on every error return.
 */
710 int bch2_inode_create(struct btree_trans *trans,
711                       struct btree_iter *iter,
712                       struct bch_inode_unpacked *inode_u,
713                       u32 snapshot, u64 cpu)
714 {
715         struct bch_fs *c = trans->c;
716         struct bkey_s_c k;
717         u64 min, max, start, pos, *hint;
718         int ret = 0;
719         unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
720
721         if (c->opts.shard_inode_numbers) {
722                 bits -= c->inode_shard_bits;
723
724                 min = (cpu << bits);
725                 max = (cpu << bits) | ~(ULLONG_MAX << bits);
726
727                 min = max_t(u64, min, BLOCKDEV_INODE_MAX);
728                 hint = c->unused_inode_hints + cpu;
729         } else {
730                 min = BLOCKDEV_INODE_MAX;
731                 max = ~(ULLONG_MAX << bits);
732                 hint = c->unused_inode_hints;
733         }
734
        /* Start from the last number handed out, if it is still inside the range. */
735         start = READ_ONCE(*hint);
736
737         if (start >= max || start < min)
738                 start = min;
739
740         pos = start;
741         bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
742                              BTREE_ITER_ALL_SNAPSHOTS|
743                              BTREE_ITER_INTENT);
744 again:
        /* A gap between pos and the next existing key means pos is unused. */
745         while ((k = bch2_btree_iter_peek(iter)).k &&
746                !(ret = bkey_err(k)) &&
747                bkey_lt(k.k->p, POS(0, max))) {
748                 if (pos < iter->pos.offset)
749                         goto found_slot;
750
751                 /*
752                  * We don't need to iterate over keys in every snapshot once
753                  * we've found just one:
754                  */
755                 pos = iter->pos.offset + 1;
756                 bch2_btree_iter_set_pos(iter, POS(0, pos));
757         }
758
759         if (!ret && pos < max)
760                 goto found_slot;
761
        /* Already scanned from min: the range is full. */
762         if (!ret && start == min)
763                 ret = -BCH_ERR_ENOSPC_inode_create;
764
765         if (ret) {
766                 bch2_trans_iter_exit(trans, iter);
767                 return ret;
768         }
769
770         /* Retry from start */
771         pos = start = min;
772         bch2_btree_iter_set_pos(iter, POS(0, pos));
773         goto again;
774 found_slot:
775         bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
776         k = bch2_btree_iter_peek_slot(iter);
777         ret = bkey_err(k);
778         if (ret) {
779                 bch2_trans_iter_exit(trans, iter);
780                 return ret;
781         }
782
        /* Remember this number as the starting point for the next search. */
783         *hint                   = k.k->p.offset;
784         inode_u->bi_inum        = k.k->p.offset;
785         inode_u->bi_generation  = bkey_generation(k);
786         return 0;
787 }
788
789 static int bch2_inode_delete_keys(struct btree_trans *trans,
790                                   subvol_inum inum, enum btree_id id)
791 {
792         struct btree_iter iter;
793         struct bkey_s_c k;
794         struct bkey_i delete;
795         struct bpos end = POS(inum.inum, U64_MAX);
796         u32 snapshot;
797         int ret = 0;
798
799         /*
800          * We're never going to be deleting partial extents, no need to use an
801          * extent iterator:
802          */
803         bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
804                              BTREE_ITER_INTENT);
805
806         while (1) {
807                 bch2_trans_begin(trans);
808
809                 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
810                 if (ret)
811                         goto err;
812
813                 bch2_btree_iter_set_snapshot(&iter, snapshot);
814
815                 k = bch2_btree_iter_peek_upto(&iter, end);
816                 ret = bkey_err(k);
817                 if (ret)
818                         goto err;
819
820                 if (!k.k)
821                         break;
822
823                 bkey_init(&delete.k);
824                 delete.k.p = iter.pos;
825
826                 if (iter.flags & BTREE_ITER_IS_EXTENTS)
827                         bch2_key_resize(&delete.k,
828                                         bpos_min(end, k.k->p).offset -
829                                         iter.pos.offset);
830
831                 ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
832                       bch2_trans_commit(trans, NULL, NULL,
833                                         BTREE_INSERT_NOFAIL);
834 err:
835                 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
836                         break;
837         }
838
839         bch2_trans_iter_exit(trans, &iter);
840         return ret;
841 }
842
843 int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
844 {
845         struct btree_trans *trans = bch2_trans_get(c);
846         struct btree_iter iter = { NULL };
847         struct bkey_i_inode_generation delete;
848         struct bch_inode_unpacked inode_u;
849         struct bkey_s_c k;
850         u32 snapshot;
851         int ret;
852
853         /*
854          * If this was a directory, there shouldn't be any real dirents left -
855          * but there could be whiteouts (from hash collisions) that we should
856          * delete:
857          *
858          * XXX: the dirent could ideally would delete whiteouts when they're no
859          * longer needed
860          */
861         ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
862                 bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
863                 bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
864         if (ret)
865                 goto err;
866 retry:
867         bch2_trans_begin(trans);
868
869         ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
870         if (ret)
871                 goto err;
872
873         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
874                                SPOS(0, inum.inum, snapshot),
875                                BTREE_ITER_INTENT|BTREE_ITER_CACHED);
876         ret = bkey_err(k);
877         if (ret)
878                 goto err;
879
880         if (!bkey_is_inode(k.k)) {
881                 bch2_fs_inconsistent(c,
882                                      "inode %llu:%u not found when deleting",
883                                      inum.inum, snapshot);
884                 ret = -EIO;
885                 goto err;
886         }
887
888         bch2_inode_unpack(k, &inode_u);
889
890         bkey_inode_generation_init(&delete.k_i);
891         delete.k.p = iter.pos;
892         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
893
894         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
895                 bch2_trans_commit(trans, NULL, NULL,
896                                 BTREE_INSERT_NOFAIL);
897 err:
898         bch2_trans_iter_exit(trans, &iter);
899         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
900                 goto retry;
901
902         bch2_trans_put(trans);
903         return ret;
904 }
905
906 int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
907                                   subvol_inum inum,
908                                   struct bch_inode_unpacked *inode)
909 {
910         struct btree_iter iter;
911         int ret;
912
913         ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
914         if (!ret)
915                 bch2_trans_iter_exit(trans, &iter);
916         return ret;
917 }
918
919 int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
920                                   subvol_inum inum,
921                                   struct bch_inode_unpacked *inode)
922 {
923         struct btree_iter iter;
924         int ret;
925
926         ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
927         if (!ret)
928                 bch2_trans_iter_exit(trans, &iter);
929         return ret;
930 }
931
932 int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
933                             struct bch_inode_unpacked *inode)
934 {
935         return bch2_trans_do(c, NULL, NULL, 0,
936                 bch2_inode_find_by_inum_trans(trans, inum, inode));
937 }
938
939 int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
940 {
941         if (bi->bi_flags & BCH_INODE_unlinked)
942                 bi->bi_flags &= ~BCH_INODE_unlinked;
943         else {
944                 if (bi->bi_nlink == U32_MAX)
945                         return -EINVAL;
946
947                 bi->bi_nlink++;
948         }
949
950         return 0;
951 }
952
953 void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
954 {
955         if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
956                 bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
957                                         bi->bi_inum);
958                 return;
959         }
960
961         if (bi->bi_flags & BCH_INODE_unlinked) {
962                 bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
963                 return;
964         }
965
966         if (bi->bi_nlink)
967                 bi->bi_nlink--;
968         else
969                 bi->bi_flags |= BCH_INODE_unlinked;
970 }
971
972 struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
973 {
974         struct bch_opts ret = { 0 };
975 #define x(_name, _bits)                                                 \
976         if (inode->bi_##_name)                                          \
977                 opt_set(ret, _name, inode->bi_##_name - 1);
978         BCH_INODE_OPTS()
979 #undef x
980         return ret;
981 }
982
983 void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
984                          struct bch_inode_unpacked *inode)
985 {
986 #define x(_name, _bits)         opts->_name = inode_opt_get(c, inode, _name);
987         BCH_INODE_OPTS()
988 #undef x
989
990         if (opts->nocow)
991                 opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
992 }
993
994 int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
995 {
996         struct bch_inode_unpacked inode;
997         int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
998
999         if (ret)
1000                 return ret;
1001
1002         bch2_inode_opts_get(opts, trans->c, &inode);
1003         return 0;
1004 }
1005
1006 int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
1007 {
1008         struct bch_fs *c = trans->c;
1009         struct btree_iter iter = { NULL };
1010         struct bkey_i_inode_generation delete;
1011         struct bch_inode_unpacked inode_u;
1012         struct bkey_s_c k;
1013         int ret;
1014
1015         do {
1016                 ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1017                                                       SPOS(inum, 0, snapshot),
1018                                                       SPOS(inum, U64_MAX, snapshot),
1019                                                       0, NULL) ?:
1020                         bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
1021                                                       SPOS(inum, 0, snapshot),
1022                                                       SPOS(inum, U64_MAX, snapshot),
1023                                                       0, NULL) ?:
1024                         bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
1025                                                       SPOS(inum, 0, snapshot),
1026                                                       SPOS(inum, U64_MAX, snapshot),
1027                                                       0, NULL);
1028         } while (ret == -BCH_ERR_transaction_restart_nested);
1029         if (ret)
1030                 goto err;
1031 retry:
1032         bch2_trans_begin(trans);
1033
1034         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
1035                                SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
1036         ret = bkey_err(k);
1037         if (ret)
1038                 goto err;
1039
1040         if (!bkey_is_inode(k.k)) {
1041                 bch2_fs_inconsistent(c,
1042                                      "inode %llu:%u not found when deleting",
1043                                      inum, snapshot);
1044                 ret = -EIO;
1045                 goto err;
1046         }
1047
1048         bch2_inode_unpack(k, &inode_u);
1049
1050         /* Subvolume root? */
1051         if (inode_u.bi_subvol)
1052                 bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
1053
1054         bkey_inode_generation_init(&delete.k_i);
1055         delete.k.p = iter.pos;
1056         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
1057
1058         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
1059                 bch2_trans_commit(trans, NULL, NULL,
1060                                 BTREE_INSERT_NOFAIL);
1061 err:
1062         bch2_trans_iter_exit(trans, &iter);
1063         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1064                 goto retry;
1065
1066         return ret ?: -BCH_ERR_transaction_restart_nested;
1067 }
1068
1069 static int may_delete_deleted_inode(struct btree_trans *trans,
1070                                     struct btree_iter *iter,
1071                                     struct bpos pos,
1072                                     bool *need_another_pass)
1073 {
1074         struct bch_fs *c = trans->c;
1075         struct btree_iter inode_iter;
1076         struct bkey_s_c k;
1077         struct bch_inode_unpacked inode;
1078         int ret;
1079
1080         k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
1081         ret = bkey_err(k);
1082         if (ret)
1083                 return ret;
1084
1085         ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
1086         if (fsck_err_on(!bkey_is_inode(k.k), c,
1087                         deleted_inode_missing,
1088                         "nonexistent inode %llu:%u in deleted_inodes btree",
1089                         pos.offset, pos.snapshot))
1090                 goto delete;
1091
1092         ret = bch2_inode_unpack(k, &inode);
1093         if (ret)
1094                 goto err;
1095
1096         if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
1097                         deleted_inode_is_dir,
1098                         "directory %llu:%u in deleted_inodes btree",
1099                         pos.offset, pos.snapshot))
1100                 goto delete;
1101
1102         if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
1103                         deleted_inode_not_unlinked,
1104                         "non-deleted inode %llu:%u in deleted_inodes btree",
1105                         pos.offset, pos.snapshot))
1106                 goto delete;
1107
1108         if (c->sb.clean &&
1109             !fsck_err(c,
1110                       deleted_inode_but_clean,
1111                       "filesystem marked as clean but have deleted inode %llu:%u",
1112                       pos.offset, pos.snapshot))
1113                 return 0;
1114
1115         if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
1116                 struct bpos new_min_pos;
1117
1118                 ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
1119                 if (ret)
1120                         goto err;
1121
1122                 inode.bi_flags &= ~BCH_INODE_unlinked;
1123
1124                 ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
1125                                              BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1126                 bch_err_msg(c, ret, "clearing inode unlinked flag");
1127                 if (ret)
1128                         return ret;
1129
1130                 /*
1131                  * We'll need another write buffer flush to pick up the new
1132                  * unlinked inodes in the snapshot leaves:
1133                  */
1134                 *need_another_pass = true;
1135                 return 0;
1136         }
1137
1138         return 1;
1139 err:
1140 fsck_err:
1141         return ret;
1142 delete:
1143         return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
1144 }
1145
1146 int bch2_delete_dead_inodes(struct bch_fs *c)
1147 {
1148         struct btree_trans *trans = bch2_trans_get(c);
1149         struct btree_iter iter;
1150         struct bkey_s_c k;
1151         bool need_another_pass;
1152         int ret;
1153 again:
1154         need_another_pass = false;
1155
1156         ret = bch2_btree_write_buffer_flush_sync(trans);
1157         if (ret)
1158                 goto err;
1159
1160         /*
1161          * Weird transaction restart handling here because on successful delete,
1162          * bch2_inode_rm_snapshot() will return a nested transaction restart,
1163          * but we can't retry because the btree write buffer won't have been
1164          * flushed and we'd spin:
1165          */
1166         for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
1167                            BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
1168                 ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p,
1169                                                                      &need_another_pass));
1170                 if (ret < 0)
1171                         break;
1172
1173                 if (ret) {
1174                         if (!test_bit(BCH_FS_RW, &c->flags)) {
1175                                 bch2_trans_unlock(trans);
1176                                 bch2_fs_lazy_rw(c);
1177                         }
1178
1179                         bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
1180
1181                         ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
1182                         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1183                                 break;
1184                 }
1185         }
1186         bch2_trans_iter_exit(trans, &iter);
1187
1188         if (!ret && need_another_pass)
1189                 goto again;
1190 err:
1191         bch2_trans_put(trans);
1192
1193         return ret;
1194 }