Update bcachefs sources to f026e4e024
author    Kent Overstreet <kent.overstreet@gmail.com>
          Sat, 15 Apr 2017 04:38:49 +0000 (20:38 -0800)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Sat, 15 Apr 2017 04:40:31 +0000 (20:40 -0800)
36 files changed:
.bcachefs_revision
Makefile
include/linux/bio.h
libbcachefs/bcachefs.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_types.h
libbcachefs/btree_update.c
libbcachefs/btree_update.h
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/error.c
libbcachefs/error.h
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/fsck.c [moved from libbcachefs/fs-gc.c with 75% similarity]
libbcachefs/fsck.h [moved from libbcachefs/fs-gc.h with 100% similarity]
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/str_hash.h
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/xattr.c
libbcachefs/xattr.h
linux/bio.c

index 9a3f68733a7ab51a15fc1d721266457bf11f97f8..2a5e858f5bdb9dffaf112ef11e90d3ee3ebfdb26 100644 (file)
@@ -1 +1 @@
-3b4024f94489e4d8dc8eb7f1278754a2545f8026
+f026e4e0243cc10e721504a8bfaa131ea8aa4c91
index e8a80c7c316246f687402163a70f99548f623678..327fec25fe248fe4ff79d6cd31fbeb065edd0cd8 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -78,7 +78,7 @@ SRCS=bcachefs.c                               \
      libbcachefs/dirent.c              \
      libbcachefs/error.c               \
      libbcachefs/extents.c             \
-     libbcachefs/fs-gc.c               \
+     libbcachefs/fsck.c                        \
      libbcachefs/inode.c               \
      libbcachefs/io.c                  \
      libbcachefs/journal.c             \
index 94e9048d3238cc4ae11d3221d23a8a1541543553..49d26b53fa103d54bc87acf892c7648599f9fcae 100644 (file)
@@ -288,8 +288,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 {
 }
 
-extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
-                              struct bio *src, struct bvec_iter src_iter);
+extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+                              struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 
index c170e85a34b22c0b432a8d3b112fb4b0a7be5934..b1f2528a329bb5b0234e01fbbe8e1362b0923c75 100644 (file)
@@ -458,6 +458,7 @@ enum {
        BCH_FS_BDEV_MOUNTED,
        BCH_FS_ERROR,
        BCH_FS_FSCK_FIXED_ERRORS,
+       BCH_FS_FSCK_DONE,
        BCH_FS_FIXED_GENS,
 };
 
@@ -724,6 +725,11 @@ struct bch_fs {
        struct work_struct      read_retry_work;
        spinlock_t              read_retry_lock;
 
+       /* ERRORS */
+       struct list_head        fsck_errors;
+       struct mutex            fsck_error_lock;
+       bool                    fsck_alloc_err;
+
        /* FILESYSTEM */
        wait_queue_head_t       writeback_wait;
        atomic_t                writeback_pages;
index 51a13fca2e7d6ee2ed2720051ca4073e63642c85..cd9a60c1144a7fed0ea610dcc5c89506ba72d043 100644 (file)
@@ -89,18 +89,20 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
                ops->key_debugcheck(c, b, k);
 }
 
-void bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
-                    char *buf, size_t size, struct bkey_s_c k)
+char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+                      char *buf, size_t size, struct bkey_s_c k)
 {
        const struct bkey_ops *ops = bch2_bkey_ops[type];
 
        if (k.k->type >= KEY_TYPE_GENERIC_NR &&
            ops->val_to_text)
                ops->val_to_text(c, buf, size, k);
+
+       return buf;
 }
 
-void bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
-                         char *buf, size_t size, struct bkey_s_c k)
+char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+                           char *buf, size_t size, struct bkey_s_c k)
 {
        const struct bkey_ops *ops = bch2_bkey_ops[type];
        char *out = buf, *end = buf + size;
@@ -109,9 +111,11 @@ void bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
 
        if (k.k->type >= KEY_TYPE_GENERIC_NR &&
            ops->val_to_text) {
-               out += scnprintf(out, end - out, " -> ");
+               out += scnprintf(out, end - out, ": ");
                ops->val_to_text(c, out, end - out, k);
        }
+
+       return buf;
 }
 
 void bch2_bkey_swab(enum bkey_type type,
index f795db6dc0b3e770d2e37d1bf7ce1677772acd42..2d526f560caf2bd8f9c15a038c804bbd86958c4e 100644 (file)
@@ -67,10 +67,10 @@ const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
                                    struct bkey_s_c);
 
 void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-void bch2_val_to_text(struct bch_fs *, enum bkey_type,
-                     char *, size_t, struct bkey_s_c);
-void bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
-                          char *, size_t, struct bkey_s_c);
+char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
+                      char *, size_t, struct bkey_s_c);
+char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+                           char *, size_t, struct bkey_s_c);
 
 void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
                    struct bkey_packed *);
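Making these *_to_text helpers return the buffer (rather than void) lets callers pass the formatted key straight into a message instead of filling the buffer in a separate statement; the fsck code added later in this commit relies on that. A minimal sketch of the calling pattern, assuming a struct bch_fs *c, a struct btree *b and a struct bkey_s_c k are already in scope:

	char buf[160];

	bch_err(c, "invalid bkey %s",
		bch2_bkey_val_to_text(c, btree_node_type(b),
				      buf, sizeof(buf), k));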
index bd47aecf7b57b177b13e6dadd828a9bbf7ceef85..c37c8959d02ba511c277174481a363d5f6e045da 100644 (file)
@@ -91,6 +91,7 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
        six_lock_init(&b->lock);
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
+       INIT_LIST_HEAD(&b->reachable);
 
        mca_data_alloc(c, b, gfp);
        return b->data ? b : NULL;
index fc06a63a4ee856b2978c91faf3f1fa4067b46254..88ae396782d6a1a8d1d6b77a974668081774e81f 100644 (file)
@@ -605,10 +605,12 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
                bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]);
 
        /* Repack everything with @new_format and sort down to one bset */
-       for (i = 0; i < nr_old_nodes; i++)
+       for (i = 0; i < nr_old_nodes; i++) {
                new_nodes[i] =
                        __bch2_btree_node_alloc_replacement(c, old_nodes[i],
                                                            new_format, res);
+               list_add(&new_nodes[i]->reachable, &as->reachable_list);
+       }
 
        /*
         * Conceptually we concatenate the nodes together and slice them
@@ -645,6 +647,7 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
 
                        set_btree_bset_end(n1, n1->set);
 
+                       list_del_init(&n2->reachable);
                        six_unlock_write(&n2->lock);
                        bch2_btree_node_free_never_inserted(c, n2);
                        six_unlock_intent(&n2->lock);
index 8152dc4b90d738df845b508f71c940997812c480..82dd196d6cddd12ed78f3fa298904d478e3312de 100644 (file)
@@ -872,32 +872,57 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce)
                    vstruct_end(i) - (void *) i->_data);
 }
 
-#define btree_node_error(b, c, ptr, fmt, ...)                          \
-       bch2_fs_inconsistent(c,                                         \
-               "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
-               (b)->btree_id, (b)->level, btree_node_root(c, b)        \
-                           ? btree_node_root(c, b)->level : -1,        \
-               PTR_BUCKET_NR(ca, ptr), (b)->written,                   \
-               le16_to_cpu((i)->u64s), ##__VA_ARGS__)
-
-static const char *validate_bset(struct bch_fs *c, struct btree *b,
-                                struct bch_dev *ca,
-                                const struct bch_extent_ptr *ptr,
-                                struct bset *i, unsigned sectors,
-                                unsigned *whiteout_u64s)
+#define btree_node_error(c, b, ptr, msg, ...)                          \
+do {                                                                   \
+       if (write == READ &&                                            \
+           !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
+               mustfix_fsck_err(c,                                     \
+                        "btree node read error at btree %u level %u/%u\n"\
+                       "sector %llu node offset %u bset u64s %u: " msg,\
+                       (b)->btree_id, (b)->level,                      \
+                       (c)->btree_roots[(b)->btree_id].level,          \
+                       (u64) ptr->offset, (b)->written,                \
+                       le16_to_cpu((i)->u64s), ##__VA_ARGS__);         \
+       } else {                                                        \
+               bch_err(c, "%s at btree %u level %u/%u\n"               \
+                       "sector %llu node offset %u bset u64s %u: " msg,\
+                       write == WRITE                                  \
+                       ? "corrupt metadata in btree node write"        \
+                       : "btree node error",                           \
+                       (b)->btree_id, (b)->level,                      \
+                       (c)->btree_roots[(b)->btree_id].level,          \
+                       (u64) ptr->offset, (b)->written,                \
+                       le16_to_cpu((i)->u64s), ##__VA_ARGS__);         \
+               ret = BCH_FSCK_ERRORS_NOT_FIXED;                        \
+               goto fsck_err;                                          \
+       }                                                               \
+} while (0)
+
+static int validate_bset(struct bch_fs *c, struct btree *b,
+                        const struct bch_extent_ptr *ptr,
+                        struct bset *i, unsigned sectors,
+                        unsigned *whiteout_u64s,
+                        int write)
 {
        struct bkey_packed *k, *prev = NULL;
        struct bpos prev_pos = POS_MIN;
        bool seen_non_whiteout = false;
+       int ret = 0;
 
-       if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
-               return "unsupported bset version";
+       if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) {
+               btree_node_error(c, b, ptr, "unsupported bset version");
+               i->u64s = 0;
+               return 0;
+       }
 
-       if (b->written + sectors > c->sb.btree_node_size)
-               return  "bset past end of btree node";
+       if (b->written + sectors > c->sb.btree_node_size) {
+               btree_node_error(c, b, ptr, "bset past end of btree node");
+               i->u64s = 0;
+               return 0;
+       }
 
-       if (i != &b->data->keys && !i->u64s)
-               btree_node_error(b, c, ptr, "empty set");
+       if (b->written && !i->u64s)
+               btree_node_error(c, b, ptr, "empty set");
 
        if (!BSET_SEPARATE_WHITEOUTS(i)) {
                seen_non_whiteout = true;
@@ -911,7 +936,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                const char *invalid;
 
                if (!k->u64s) {
-                       btree_node_error(b, c, ptr,
+                       btree_node_error(c, b, ptr,
                                "KEY_U64s 0: %zu bytes of metadata lost",
                                vstruct_end(i) - (void *) k);
 
@@ -920,7 +945,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                }
 
                if (bkey_next(k) > vstruct_last(i)) {
-                       btree_node_error(b, c, ptr,
+                       btree_node_error(c, b, ptr,
                                         "key extends past end of bset");
 
                        i->u64s = cpu_to_le16((u64 *) k - i->_data);
@@ -928,7 +953,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                }
 
                if (k->format > KEY_FORMAT_CURRENT) {
-                       btree_node_error(b, c, ptr,
+                       btree_node_error(c, b, ptr,
                                         "invalid bkey format %u", k->format);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -947,8 +972,8 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                        char buf[160];
 
                        bch2_bkey_val_to_text(c, btree_node_type(b),
-                                            buf, sizeof(buf), u);
-                       btree_node_error(b, c, ptr,
+                                             buf, sizeof(buf), u);
+                       btree_node_error(c, b, ptr,
                                         "invalid bkey %s: %s", buf, invalid);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -969,7 +994,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                        *whiteout_u64s = k->_data - i->_data;
                        seen_non_whiteout = true;
                } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
-                       btree_node_error(b, c, ptr,
+                       btree_node_error(c, b, ptr,
                                         "keys out of order: %llu:%llu > %llu:%llu",
                                         prev_pos.inode,
                                         prev_pos.offset,
@@ -984,7 +1009,8 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
        }
 
        SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-       return NULL;
+fsck_err:
+       return ret;
 }
 
 static bool extent_contains_ptr(struct bkey_s_c_extent e,
@@ -1012,7 +1038,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
        const char *err;
        struct bch_csum csum;
        struct nonce nonce;
-       int ret;
+       int ret, write = READ;
 
        iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
        __bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
@@ -1115,9 +1141,10 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
                        sectors = vstruct_sectors(bne, c->block_bits);
                }
 
-               err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
-               if (err)
-                       goto err;
+               ret = validate_bset(c, b, ptr, i, sectors,
+                                   &whiteout_u64s, READ);
+               if (ret)
+                       goto fsck_err;
 
                b->written += sectors;
 
@@ -1172,8 +1199,10 @@ out:
        mempool_free(iter, &c->fill_iter);
        return;
 err:
+       btree_node_error(c, b, ptr, "%s", err);
+fsck_err:
+       bch2_inconsistent_error(c);
        set_btree_node_read_error(b);
-       btree_node_error(b, c, ptr, "%s", err);
        goto out;
 }
 
@@ -1309,6 +1338,23 @@ static void btree_node_write_endio(struct bio *bio)
        }
 }
 
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+                                  struct bset *i, unsigned sectors)
+{
+       const struct bch_extent_ptr *ptr;
+       unsigned whiteout_u64s = 0;
+       int ret;
+
+       extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
+               break;
+
+       ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
+       if (ret)
+               bch2_fatal_error(c);
+
+       return ret;
+}
+
 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                            struct closure *parent,
                            enum six_lock_type lock_type_held)
@@ -1343,18 +1389,24 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (!(old & (1 << BTREE_NODE_dirty)))
                        return;
 
+               if (b->written &&
+                   !btree_node_may_write(b))
+                       return;
+
                if (old & (1 << BTREE_NODE_write_in_flight)) {
                        btree_node_wait_on_io(b);
                        continue;
                }
 
                new &= ~(1 << BTREE_NODE_dirty);
+               new &= ~(1 << BTREE_NODE_need_write);
                new |=  (1 << BTREE_NODE_write_in_flight);
                new |=  (1 << BTREE_NODE_just_written);
                new ^=  (1 << BTREE_NODE_write_idx);
        } while (cmpxchg_acquire(&b->flags, old, new) != old);
 
        BUG_ON(!list_empty(&b->write_blocked));
+       BUG_ON(!list_empty_careful(&b->reachable) != !b->written);
 
        BUG_ON(b->written >= c->sb.btree_node_size);
        BUG_ON(bset_written(b, btree_bset_last(b)));
@@ -1430,13 +1482,17 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 
        clear_needs_whiteout(i);
 
-       if (b->written && !i->u64s) {
-               /* Nothing to write: */
-               btree_bounce_free(c, order, used_mempool, data);
-               btree_node_write_done(c, b);
-               return;
-       }
+       /* do we have data to write? */
+       if (b->written && !i->u64s)
+               goto nowrite;
+
+       bytes_to_write = vstruct_end(i) - data;
+       sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+       memset(data + bytes_to_write, 0,
+              (sectors_to_write << 9) - bytes_to_write);
 
+       BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
        BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
        BUG_ON(i->seq != b->data->keys.seq);
 
@@ -1445,6 +1501,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 
        nonce = btree_nonce(b, i, b->written << 9);
 
+       /* if we're going to be encrypting, check metadata validity first: */
+       if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+           validate_bset_for_write(c, b, i, sectors_to_write))
+               goto err;
+
        if (bn) {
                bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
                            &bn->flags,
@@ -1464,15 +1525,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
        }
 
-       bytes_to_write = vstruct_end(i) - data;
-       sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
-       memset(data + bytes_to_write, 0,
-              (sectors_to_write << 9) - bytes_to_write);
-
-       BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
-
-       trace_btree_write(b, bytes_to_write, sectors_to_write);
+       /* if we're not encrypting, check metadata after checksumming: */
+       if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+           validate_bset_for_write(c, b, i, sectors_to_write))
+               goto err;
 
        /*
         * We handle btree write errors by immediately halting the journal -
@@ -1488,14 +1544,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
         * break:
         */
        if (bch2_journal_error(&c->journal) ||
-           c->opts.nochanges) {
-               set_btree_node_noevict(b);
-               b->written += sectors_to_write;
+           c->opts.nochanges)
+               goto err;
 
-               btree_bounce_free(c, order, used_mempool, data);
-               btree_node_write_done(c, b);
-               return;
-       }
+       trace_btree_write(b, bytes_to_write, sectors_to_write);
 
        bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
 
@@ -1543,6 +1595,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        b->written += sectors_to_write;
 
        bch2_submit_wbio_replicas(wbio, c, &k.key);
+       return;
+err:
+       set_btree_node_noevict(b);
+       b->written += sectors_to_write;
+nowrite:
+       btree_bounce_free(c, order, used_mempool, data);
+       btree_node_write_done(c, b);
 }
 
 /*
index 3014b5f069424aa48984a2809e93ddc05be58ea3..d023dfae6d9786b5ee47c789a74000b907274fc5 100644 (file)
@@ -27,7 +27,8 @@ static inline void btree_node_wait_on_io(struct btree *b)
 
 static inline bool btree_node_may_write(struct btree *b)
 {
-       return list_empty_careful(&b->write_blocked);
+       return list_empty_careful(&b->write_blocked) &&
+               list_empty_careful(&b->reachable);
 }
 
 enum compact_mode {
@@ -80,6 +81,8 @@ void bch2_btree_node_write(struct bch_fs *, struct btree *,
 #define bch2_btree_node_write_dirty(_c, _b, _cl, cond)                 \
 do {                                                                   \
        while ((_b)->written && btree_node_dirty(_b) && (cond)) {       \
+               set_btree_node_need_write(_b);                          \
+                                                                       \
                if (!btree_node_may_write(_b))                          \
                        break;                                          \
                                                                        \
index 55303f09e51517052fea562dc530c9adb8f4c2ba..0b28082e670ceb66baf80043a528f6975931671f 100644 (file)
@@ -1109,6 +1109,26 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
        prefetch(c->btree_roots[btree_id].b);
 }
 
+void bch2_btree_iter_unlink(struct btree_iter *iter)
+{
+       struct btree_iter *linked;
+
+       __bch2_btree_iter_unlock(iter);
+
+       if (!btree_iter_linked(iter))
+               return;
+
+       for_each_linked_btree_iter(iter, linked) {
+
+               if (linked->next == iter) {
+                       linked->next = iter->next;
+                       return;
+               }
+       }
+
+       BUG();
+}
+
 void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
 {
        BUG_ON(btree_iter_linked(new));
@@ -1128,7 +1148,7 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
 
 void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
 {
-       bch2_btree_iter_unlock(dst);
+       __bch2_btree_iter_unlock(dst);
        memcpy(dst, src, offsetof(struct btree_iter, next));
        dst->nodes_locked = dst->nodes_intent_locked = 0;
 }
index 39731f0bcbc61c4c7061fe2602f85c0f023b1f2c..7cf9bd633e3d7da08a0d4086a4ce85f748b7459a 100644 (file)
@@ -185,6 +185,7 @@ static inline void bch2_btree_iter_init_intent(struct btree_iter *iter,
 }
 
 void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
+void bch2_btree_iter_unlink(struct btree_iter *);
 void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
 
 static inline struct bpos btree_type_successor(enum btree_id id,
index 915e42c2f1857deb046640d9cd414172f8f45afd..a0f5b579fe2a92f50ac69160aa9f81fb12ec25b2 100644 (file)
@@ -110,6 +110,14 @@ struct btree {
         */
        struct list_head        write_blocked;
 
+       /*
+        * Also for asynchronous splits/interior node updates:
+        * If a btree node isn't reachable yet, we don't want to kick off
+        * another write - because that write also won't yet be reachable and
+        * marking it as completed before it's reachable would be incorrect:
+        */
+       struct list_head        reachable;
+
        struct open_bucket      *ob;
 
        /* lru list */
@@ -136,6 +144,7 @@ enum btree_flags {
        BTREE_NODE_read_error,
        BTREE_NODE_write_error,
        BTREE_NODE_dirty,
+       BTREE_NODE_need_write,
        BTREE_NODE_noevict,
        BTREE_NODE_write_idx,
        BTREE_NODE_accessed,
@@ -146,6 +155,7 @@ enum btree_flags {
 BTREE_FLAG(read_error);
 BTREE_FLAG(write_error);
 BTREE_FLAG(dirty);
+BTREE_FLAG(need_write);
 BTREE_FLAG(noevict);
 BTREE_FLAG(write_idx);
 BTREE_FLAG(accessed);
index 196b74238503b1a77a21a366abb8c2d2aae3988e..cfd2a455fffe6ee58efce84b827f09050faa35cf 100644 (file)
@@ -162,9 +162,11 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
        trace_btree_node_free(c, b);
 
        BUG_ON(btree_node_dirty(b));
+       BUG_ON(btree_node_need_write(b));
        BUG_ON(b == btree_node_root(c, b));
        BUG_ON(b->ob);
        BUG_ON(!list_empty(&b->write_blocked));
+       BUG_ON(!list_empty(&b->reachable));
 
        clear_btree_node_noevict(b);
 
@@ -589,7 +591,6 @@ struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
        unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes;
 
        return __bch2_btree_reserve_get(c, nr_nodes, flags, cl);
-
 }
 
 int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
@@ -598,6 +599,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
        struct closure cl;
        struct btree_reserve *reserve;
        struct btree *b;
+       LIST_HEAD(reachable_list);
 
        closure_init_stack(&cl);
 
@@ -614,11 +616,14 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
        }
 
        b = __btree_root_alloc(c, 0, id, reserve);
+       list_add(&b->reachable, &reachable_list);
 
        bch2_btree_node_write(c, b, writes, SIX_LOCK_intent);
 
        bch2_btree_set_root_initial(c, b, reserve);
        bch2_btree_open_bucket_put(c, b);
+
+       list_del_init(&b->reachable);
        six_unlock_intent(&b->lock);
 
        bch2_btree_reserve_put(c, reserve);
@@ -659,6 +664,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_iter *iter,
 
        bch2_btree_bset_insert_key(iter, b, node_iter, insert);
        set_btree_node_dirty(b);
+       set_btree_node_need_write(b);
 }
 
 /* Inserting into a given leaf node (last stage of insert): */
@@ -798,12 +804,6 @@ void bch2_btree_journal_key(struct btree_insert *trans,
                u64 seq = trans->journal_res.seq;
                bool needs_whiteout = insert->k.needs_whiteout;
 
-               /*
-                * have a bug where we're seeing an extent with an invalid crc
-                * entry in the journal, trying to track it down:
-                */
-               BUG_ON(bch2_bkey_invalid(c, b->btree_id, bkey_i_to_s_c(insert)));
-
                /* ick */
                insert->k.needs_whiteout = false;
                bch2_journal_add_keys(j, &trans->journal_res,
@@ -878,6 +878,8 @@ bch2_btree_interior_update_alloc(struct bch_fs *c)
        closure_init(&as->cl, &c->cl);
        as->c           = c;
        as->mode        = BTREE_INTERIOR_NO_UPDATE;
+       INIT_LIST_HEAD(&as->write_blocked_list);
+       INIT_LIST_HEAD(&as->reachable_list);
 
        bch2_keylist_init(&as->parent_keys, as->inline_keys,
                         ARRAY_SIZE(as->inline_keys));
@@ -908,6 +910,18 @@ static void btree_interior_update_nodes_reachable(struct closure *cl)
 
        mutex_lock(&c->btree_interior_update_lock);
 
+       while (!list_empty(&as->reachable_list)) {
+               struct btree *b = list_first_entry(&as->reachable_list,
+                                                  struct btree, reachable);
+               list_del_init(&b->reachable);
+               mutex_unlock(&c->btree_interior_update_lock);
+
+               six_lock_read(&b->lock);
+               bch2_btree_node_write_dirty(c, b, NULL, btree_node_need_write(b));
+               six_unlock_read(&b->lock);
+               mutex_lock(&c->btree_interior_update_lock);
+       }
+
        for (i = 0; i < as->nr_pending; i++)
                bch2_btree_node_free_ondisk(c, &as->pending[i]);
        as->nr_pending = 0;
@@ -929,6 +943,7 @@ static void btree_interior_update_nodes_written(struct closure *cl)
 
        if (bch2_journal_error(&c->journal)) {
                /* XXX what? */
+               /* we don't want to free the nodes on disk, that's what */
        }
 
        /* XXX: missing error handling, damnit */
@@ -962,7 +977,8 @@ retry:
                list_del(&as->write_blocked_list);
                mutex_unlock(&c->btree_interior_update_lock);
 
-               bch2_btree_node_write_dirty(c, b, NULL, true);
+               bch2_btree_node_write_dirty(c, b, NULL,
+                                           btree_node_need_write(b));
                six_unlock_read(&b->lock);
                break;
 
@@ -1135,6 +1151,7 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
        }
 
        clear_btree_node_dirty(b);
+       clear_btree_node_need_write(b);
        w = btree_current_write(b);
 
        llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
@@ -1152,6 +1169,8 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
                                      &as->journal, interior_update_flush);
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
+       if (!list_empty(&b->reachable))
+               list_del_init(&b->reachable);
 
        mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -1265,7 +1284,8 @@ bch2_btree_insert_keys_interior(struct btree *b,
  * node)
  */
 static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1,
-                                       struct btree_reserve *reserve)
+                                       struct btree_reserve *reserve,
+                                       struct btree_interior_update *as)
 {
        size_t nr_packed = 0, nr_unpacked = 0;
        struct btree *n2;
@@ -1273,6 +1293,8 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
        struct bkey_packed *k, *prev = NULL;
 
        n2 = bch2_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve);
+       list_add(&n2->reachable, &as->reachable_list);
+
        n2->data->max_key       = n1->data->max_key;
        n2->data->format        = n1->format;
        n2->key.k.p = n1->key.k.p;
@@ -1421,13 +1443,15 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
        bch2_btree_interior_update_will_free_node(c, as, b);
 
        n1 = bch2_btree_node_alloc_replacement(c, b, reserve);
+       list_add(&n1->reachable, &as->reachable_list);
+
        if (b->level)
                btree_split_insert_keys(iter, n1, insert_keys, reserve);
 
        if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
                trace_btree_node_split(c, b, b->nr.live_u64s);
 
-               n2 = __btree_split_node(iter, n1, reserve);
+               n2 = __btree_split_node(iter, n1, reserve, as);
 
                bch2_btree_build_aux_trees(n2);
                bch2_btree_build_aux_trees(n1);
@@ -1449,6 +1473,8 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
                        n3 = __btree_root_alloc(c, b->level + 1,
                                                iter->btree_id,
                                                reserve);
+                       list_add(&n3->reachable, &as->reachable_list);
+
                        n3->sib_u64s[0] = U16_MAX;
                        n3->sib_u64s[1] = U16_MAX;
 
@@ -1748,6 +1774,8 @@ retry:
        bch2_btree_interior_update_will_free_node(c, as, m);
 
        n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve);
+       list_add(&n->reachable, &as->reachable_list);
+
        n->data->min_key        = prev->data->min_key;
        n->data->max_key        = next->data->max_key;
        n->data->format         = new_f;
@@ -1914,8 +1942,8 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
        int ret;
 
        trans_for_each_entry(trans, i) {
-               EBUG_ON(i->iter->level);
-               EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+               BUG_ON(i->iter->level);
+               BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
        }
 
        sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
@@ -2076,6 +2104,19 @@ err:
        goto out;
 }
 
+int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
+{
+       struct bkey_i k;
+
+       bkey_init(&k.k);
+       k.k.p = iter->pos;
+
+       return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
+                                   BTREE_INSERT_NOFAIL|
+                                   BTREE_INSERT_USE_RESERVE|flags,
+                                   BTREE_INSERT_ENTRY(iter, &k));
+}
+
 int bch2_btree_insert_list_at(struct btree_iter *iter,
                             struct keylist *keys,
                             struct disk_reservation *disk_res,
@@ -2104,45 +2145,6 @@ int bch2_btree_insert_list_at(struct btree_iter *iter,
        return 0;
 }
 
-/**
- * bch_btree_insert_check_key - insert dummy key into btree
- *
- * We insert a random key on a cache miss, then compare exchange on it
- * once the cache promotion or backing device read completes. This
- * ensures that if this key is written to after the read, the read will
- * lose and not overwrite the key with stale data.
- *
- * Return values:
- * -EAGAIN: @iter->cl was put on a waitlist waiting for btree node allocation
- * -EINTR: btree node was changed while upgrading to write lock
- */
-int bch2_btree_insert_check_key(struct btree_iter *iter,
-                              struct bkey_i *check_key)
-{
-       struct bpos saved_pos = iter->pos;
-       struct bkey_i_cookie *cookie;
-       BKEY_PADDED(key) tmp;
-       int ret;
-
-       BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&check_key->k)));
-
-       check_key->k.type = KEY_TYPE_COOKIE;
-       set_bkey_val_bytes(&check_key->k, sizeof(struct bch_cookie));
-
-       cookie = bkey_i_to_cookie(check_key);
-       get_random_bytes(&cookie->v, sizeof(cookie->v));
-
-       bkey_copy(&tmp.key, check_key);
-
-       ret = bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
-                                 BTREE_INSERT_ATOMIC,
-                                 BTREE_INSERT_ENTRY(iter, &tmp.key));
-
-       bch2_btree_iter_rewind(iter, saved_pos);
-
-       return ret;
-}
-
 /**
  * bch_btree_insert - insert keys into the extent btree
  * @c:                 pointer to struct bch_fs
@@ -2310,6 +2312,7 @@ int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
        bch2_btree_interior_update_will_free_node(c, as, b);
 
        n = bch2_btree_node_alloc_replacement(c, b, reserve);
+       list_add(&n->reachable, &as->reachable_list);
 
        bch2_btree_build_aux_trees(n);
        six_unlock_write(&n->lock);
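The bch2_btree_delete_at() helper added above whites out whatever key the iterator currently points at by inserting an empty key at iter->pos; the new fsck code uses it to drop invalid dirents and extents. A rough sketch of the intended calling pattern while walking a btree, with the badness check purely illustrative:

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) {
		if (key_looks_bad(k)) {		/* hypothetical predicate */
			ret = bch2_btree_delete_at(&iter, 0);
			if (ret)
				break;
		}
	}
	return bch2_btree_iter_unlock(&iter) ?: ret;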
index b18c44c744449537c5cf0bda8a3637c41c72fd33..a933d5a96d11d0e94dab6899fc2a782f53803b19 100644 (file)
@@ -64,7 +64,7 @@ struct pending_btree_node_free {
  */
 struct btree_interior_update {
        struct closure                  cl;
-       struct bch_fs           *c;
+       struct bch_fs                   *c;
 
        struct list_head                list;
 
@@ -86,6 +86,7 @@ struct btree_interior_update {
         */
        struct btree                    *b;
        struct list_head                write_blocked_list;
+       struct list_head                reachable_list;
 
        /*
         * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
@@ -317,7 +318,6 @@ struct btree_insert {
 
 int __bch2_btree_insert_at(struct btree_insert *);
 
-
 #define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...)   N
 #define COUNT_ARGS(...)  _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
 
@@ -380,6 +380,8 @@ int __bch2_btree_insert_at(struct btree_insert *);
  */
 #define BTREE_INSERT_JOURNAL_REPLAY    (1 << 3)
 
+int bch2_btree_delete_at(struct btree_iter *, unsigned);
+
 int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
                             struct disk_reservation *,
                             struct extent_insert_hook *, u64 *, unsigned);
@@ -403,7 +405,6 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
        return u64s <= trans->journal_res.u64s;
 }
 
-int bch2_btree_insert_check_key(struct btree_iter *, struct bkey_i *);
 int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
                     struct disk_reservation *,
                     struct extent_insert_hook *, u64 *, int flags);
index 248bc7a16b4750374d3296c5e2e3efd94e4c1a82..bf160e0b93daf2a19f32d27661700258d14610f9 100644 (file)
@@ -15,7 +15,7 @@
 #include "debug.h"
 #include "error.h"
 #include "extents.h"
-#include "fs-gc.h"
+#include "fsck.h"
 #include "inode.h"
 #include "io.h"
 #include "super.h"
index 503f0dc4bb086a7f3c1bc94673acf1920614c803..e2978bab46c92594800fede5ec65b795c25da58a 100644 (file)
@@ -20,6 +20,11 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
        return len;
 }
 
+static unsigned dirent_val_u64s(unsigned len)
+{
+       return DIV_ROUND_UP(sizeof(struct bch_dirent) + len, sizeof(u64));
+}
+
 static u64 bch2_dirent_hash(const struct bch_hash_info *info,
                            const struct qstr *name)
 {
@@ -64,7 +69,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
        return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
 }
 
-static const struct bch_hash_desc dirent_hash_desc = {
+const struct bch_hash_desc bch2_dirent_hash_desc = {
        .btree_id       = BTREE_ID_DIRENTS,
        .key_type       = BCH_DIRENT,
        .whiteout_type  = BCH_DIRENT_WHITEOUT,
@@ -77,12 +82,30 @@ static const struct bch_hash_desc dirent_hash_desc = {
 static const char *bch2_dirent_invalid(const struct bch_fs *c,
                                       struct bkey_s_c k)
 {
+       struct bkey_s_c_dirent d;
+       unsigned len;
+
        switch (k.k->type) {
        case BCH_DIRENT:
-               return bkey_val_bytes(k.k) < sizeof(struct bch_dirent)
-                       ? "value too small"
-                       : NULL;
+               if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
+                       return "value too small";
+
+               d = bkey_s_c_to_dirent(k);
+               len = bch2_dirent_name_bytes(d);
+
+               if (!len)
+                       return "empty name";
+
+               if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
+                       return "value too big";
+
+               if (len > NAME_MAX)
+                       return "dirent name too big";
 
+               if (memchr(d.v->d_name, '/', len))
+                       return "dirent name has invalid characters";
+
+               return NULL;
        case BCH_DIRENT_WHITEOUT:
                return bkey_val_bytes(k.k) != 0
                        ? "value size should be zero"
@@ -97,21 +120,15 @@ static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
                                size_t size, struct bkey_s_c k)
 {
        struct bkey_s_c_dirent d;
+       size_t n = 0;
 
        switch (k.k->type) {
        case BCH_DIRENT:
                d = bkey_s_c_to_dirent(k);
 
-               if (size) {
-                       unsigned n = min_t(unsigned, size,
-                                          bch2_dirent_name_bytes(d));
-                       memcpy(buf, d.v->d_name, n);
-                       buf[size - 1] = '\0';
-                       buf += n;
-                       size -= n;
-               }
-
-               scnprintf(buf, size, " -> %llu", d.v->d_inum);
+               n += bch_scnmemcpy(buf + n, size - n, d.v->d_name,
+                                  bch2_dirent_name_bytes(d));
+               n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum);
                break;
        case BCH_DIRENT_WHITEOUT:
                scnprintf(buf, size, "whiteout");
@@ -128,9 +145,7 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
                                const struct qstr *name, u64 dst)
 {
        struct bkey_i_dirent *dirent;
-       unsigned u64s = BKEY_U64s +
-               DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len,
-                            sizeof(u64));
+       unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
 
        dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
        if (!dirent)
@@ -163,7 +178,7 @@ int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
        if (!dirent)
                return -ENOMEM;
 
-       ret = bch2_hash_set(dirent_hash_desc, hash_info, c, dir_inum,
+       ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum,
                           journal_seq, &dirent->k_i, flags);
        kfree(dirent);
 
@@ -223,13 +238,13 @@ retry:
         * from the original hashed position (like we do when creating dirents,
         * in bch_hash_set) -  we never move existing dirents to different slot:
         */
-       old_src = bch2_hash_lookup_at(dirent_hash_desc,
+       old_src = bch2_hash_lookup_at(bch2_dirent_hash_desc,
                                     &src_ei->str_hash,
                                     &src_iter, src_name);
        if ((ret = btree_iter_err(old_src)))
                goto err;
 
-       ret = bch2_hash_needs_whiteout(dirent_hash_desc,
+       ret = bch2_hash_needs_whiteout(bch2_dirent_hash_desc,
                                &src_ei->str_hash,
                                &whiteout_iter, &src_iter);
        if (ret < 0)
@@ -242,8 +257,8 @@ retry:
         * to do that check for us for correctness:
         */
        old_dst = mode == BCH_RENAME
-               ? bch2_hash_hole_at(dirent_hash_desc, &dst_iter)
-               : bch2_hash_lookup_at(dirent_hash_desc,
+               ? bch2_hash_hole_at(bch2_dirent_hash_desc, &dst_iter)
+               : bch2_hash_lookup_at(bch2_dirent_hash_desc,
                                     &dst_ei->str_hash,
                                     &dst_iter, dst_name);
        if ((ret = btree_iter_err(old_dst)))
@@ -330,7 +345,7 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
                       const struct qstr *name,
                       u64 *journal_seq)
 {
-       return bch2_hash_delete(dirent_hash_desc, hash_info,
+       return bch2_hash_delete(bch2_dirent_hash_desc, hash_info,
                               c, dir_inum, journal_seq, name);
 }
 
@@ -342,7 +357,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
        struct bkey_s_c k;
        u64 inum;
 
-       k = bch2_hash_lookup(dirent_hash_desc, hash_info, c,
+       k = bch2_hash_lookup(bch2_dirent_hash_desc, hash_info, c,
                            dir_inum, &iter, name);
        if (IS_ERR(k.k)) {
                bch2_btree_iter_unlock(&iter);
index b1a30bda10af6ff2e30615e7040b1541504cc39d..fb2950a377b6f4c195bf7ff5f150f7e910516dca 100644 (file)
@@ -1,6 +1,9 @@
 #ifndef _BCACHE_DIRENT_H
 #define _BCACHE_DIRENT_H
 
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_dirent_hash_desc;
 extern const struct bkey_ops bch2_bkey_dirent_ops;
 
 struct qstr;
index 8babf196b5c2638ec373d02b8e7bbe5f99e2b666..5b7316de1077453205fd9b9a7cf86a770bce5dac 100644 (file)
@@ -49,3 +49,102 @@ void bch2_nonfatal_io_error(struct bch_dev *ca)
 {
        queue_work(system_long_wq, &ca->io_error_work);
 }
+
+#ifdef __KERNEL__
+#define ask_yn()       false
+#else
+#include "tools-util.h"
+#endif
+
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
+                               const char *fmt, ...)
+{
+       struct fsck_err_state *s;
+       va_list args;
+       bool fix = false, print = true, suppressing = false;
+       char _buf[sizeof(s->buf)], *buf = _buf;
+
+       mutex_lock(&c->fsck_error_lock);
+
+       if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+               goto print;
+
+       list_for_each_entry(s, &c->fsck_errors, list)
+               if (s->fmt == fmt)
+                       goto found;
+
+       s = kzalloc(sizeof(*s), GFP_KERNEL);
+       if (!s) {
+               if (!c->fsck_alloc_err)
+                       bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
+               c->fsck_alloc_err = true;
+               buf = _buf;
+               goto print;
+       }
+
+       INIT_LIST_HEAD(&s->list);
+       s->fmt = fmt;
+found:
+       list_move(&s->list, &c->fsck_errors);
+       s->nr++;
+       suppressing     = s->nr == 10;
+       print           = s->nr <= 10;
+       buf             = s->buf;
+print:
+       va_start(args, fmt);
+       vscnprintf(buf, sizeof(_buf), fmt, args);
+       va_end(args);
+
+       if (flags & FSCK_CAN_FIX) {
+               if (c->opts.fix_errors == FSCK_ERR_ASK) {
+                       printk(KERN_ERR "%s: fix?", buf);
+                       fix = ask_yn();
+               } else if (c->opts.fix_errors == FSCK_ERR_YES ||
+                          (c->opts.nochanges &&
+                           !(flags & FSCK_CAN_IGNORE))) {
+                       if (print)
+                               bch_err(c, "%s, fixing", buf);
+                       fix = true;
+               } else {
+                       if (print)
+                               bch_err(c, "%s, not fixing", buf);
+                       fix = false;
+               }
+       } else if (flags & FSCK_NEED_FSCK) {
+               if (print)
+                       bch_err(c, "%s (run fsck to correct)", buf);
+       } else {
+               if (print)
+                       bch_err(c, "%s (repair unimplemented)", buf);
+       }
+
+       if (suppressing)
+               bch_err(c, "Ratelimiting new instances of previous error");
+
+       mutex_unlock(&c->fsck_error_lock);
+
+       if (fix)
+               set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags);
+
+       return fix                              ? FSCK_ERR_FIX
+               : flags & FSCK_CAN_IGNORE       ? FSCK_ERR_IGNORE
+                                               : FSCK_ERR_EXIT;
+}
+
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+       struct fsck_err_state *s, *n;
+
+       mutex_lock(&c->fsck_error_lock);
+       set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+       list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
+               if (s->nr > 10)
+                       bch_err(c, "Saw %llu errors like:\n    %s", s->nr, s->buf);
+
+               list_del(&s->list);
+               kfree(s);
+       }
+
+       mutex_unlock(&c->fsck_error_lock);
+}
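bch2_fsck_err() ratelimits repeated instances of the same error (keyed by the format string) and decides whether to repair based on c->opts.fix_errors; the fsck_err_on()/mustfix_fsck_err() wrappers reworked in error.h below turn its FSCK_ERR_EXIT return into a jump to an fsck_err: label. A minimal sketch of the calling convention a user of these macros follows, with the condition and message purely illustrative:

	static int check_something(struct bch_fs *c)
	{
		int ret = 0;

		if (fsck_err_on(found_inconsistency(c), c,	/* hypothetical check */
				"describe the inconsistency here")) {
			/* returned true: caller should repair the inconsistency */
		}
	fsck_err:
		return ret;
	}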
index 5f81c346db9152e97c9361870dd98a10098d6b2b..750c676a65d60c647455ef0753ce888c4d23086b 100644 (file)
@@ -95,62 +95,38 @@ enum {
        BCH_FSCK_UNKNOWN_VERSION        = 4,
 };
 
-/* These macros return true if error should be fixed: */
-
-/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-
 enum fsck_err_opts {
        FSCK_ERR_NO,
        FSCK_ERR_YES,
        FSCK_ERR_ASK,
 };
 
-#ifdef __KERNEL__
-#define __fsck_err_should_fix(c, msg, ...)                             \
-({                                                                     \
-       bool _fix = (c)->opts.fix_errors;                               \
-       bch_err(c, msg ", %sfixing", ##__VA_ARGS__, _fix ? "" : "not ");\
-       _fix;                                                           \
-})
-#else
-#include "tools-util.h"
+enum fsck_err_ret {
+       FSCK_ERR_IGNORE = 0,
+       FSCK_ERR_FIX    = 1,
+       FSCK_ERR_EXIT   = 2,
+};
 
-#define __fsck_err_should_fix(c, msg, ...)                             \
-({                                                                     \
-       bool _fix = false;                                              \
-       switch ((c)->opts.fix_errors) {                                 \
-       case FSCK_ERR_ASK:                                              \
-               printf(msg ": fix?", ##__VA_ARGS__);                    \
-               _fix = ask_yn();                                        \
-               break;                                                  \
-       case FSCK_ERR_YES:                                              \
-               bch_err(c, msg ", fixing", ##__VA_ARGS__);              \
-               _fix = true;                                            \
-               break;                                                  \
-       case FSCK_ERR_NO:                                               \
-               bch_err(c, msg, ##__VA_ARGS__);                         \
-               _fix = false;                                           \
-               break;                                                  \
-       }                                                               \
-       _fix;                                                           \
-})
-#endif
+struct fsck_err_state {
+       struct list_head        list;
+       const char              *fmt;
+       u64                     nr;
+       char                    buf[512];
+};
+
+#define FSCK_CAN_FIX           (1 << 0)
+#define FSCK_CAN_IGNORE                (1 << 1)
+#define FSCK_NEED_FSCK         (1 << 2)
 
-#define __fsck_err(c, _can_fix, _can_ignore, _nofix_msg, msg, ...)     \
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
+                               unsigned, const char *, ...);
+void bch2_flush_fsck_errs(struct bch_fs *);
+
+#define __fsck_err(c, _flags, msg, ...)                                        \
 ({                                                                     \
-       bool _fix;                                                      \
-                                                                       \
-       if (_can_fix) {                                                 \
-               _fix = __fsck_err_should_fix(c, msg, ##__VA_ARGS__);    \
-       } else {                                                        \
-               bch_err(c, msg " ("_nofix_msg")", ##__VA_ARGS__);       \
-               _fix = false;                                           \
-       }                                                               \
+       int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
                                                                        \
-       if (_fix)                                                       \
-               set_bit(BCH_FS_FSCK_FIXED_ERRORS, &(c)->flags);         \
-                                                                       \
-       if (!_fix && !_can_ignore) {                                    \
+       if (_fix == FSCK_ERR_EXIT) {                                    \
                bch_err(c, "Unable to continue, halting");              \
                ret = BCH_FSCK_ERRORS_NOT_FIXED;                        \
                goto fsck_err;                                          \
@@ -159,24 +135,27 @@ enum fsck_err_opts {
        _fix;                                                           \
 })
 
-#define __fsck_err_on(cond, c, _can_fix, _can_ignore, _nofix_msg, ...) \
-       ((cond) ? __fsck_err(c, _can_fix, _can_ignore,                  \
-                            _nofix_msg, ##__VA_ARGS__) : false)
+/* These macros return true if error should be fixed: */
+
+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
+
+#define __fsck_err_on(cond, c, _flags, ...)                            \
+       ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
 
 #define unfixable_fsck_err_on(cond, c, ...)                            \
-       __fsck_err_on(cond, c, false, true, "repair unimplemented", ##__VA_ARGS__)
+       __fsck_err_on(cond, c, FSCK_CAN_IGNORE, ##__VA_ARGS__)
 
 #define need_fsck_err_on(cond, c, ...)                                 \
-       __fsck_err_on(cond, c, false, true, "run fsck to correct", ##__VA_ARGS__)
+       __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
 
 #define mustfix_fsck_err(c, ...)                                       \
-       __fsck_err(c, true, false, "not fixing", ##__VA_ARGS__)
+       __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
 
 #define mustfix_fsck_err_on(cond, c, ...)                              \
-       __fsck_err_on(cond, c, true, false, "not fixing", ##__VA_ARGS__)
+       __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
 
 #define fsck_err_on(cond, c, ...)                                      \
-       __fsck_err_on(cond, c, true, true, "not fixing", ##__VA_ARGS__)
+       __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
 
 /*
  * Fatal errors: these don't indicate a bug, but we can't continue running in RW
index 8ad192c4b32cd34e841d38b1c472fe192002129b..dc5c7f4cdbc668af73371f1741d66083a5dda165 100644 (file)
@@ -5,8 +5,8 @@
 #include "clock.h"
 #include "error.h"
 #include "fs.h"
-#include "fs-gc.h"
 #include "fs-io.h"
+#include "fsck.h"
 #include "inode.h"
 #include "journal.h"
 #include "io.h"
index 94c5a9e6bdd7afa2a74691d21a06f4bea83f058d..3c02b0c6eb744a8d547265881e63e5f5dffba0b8 100644 (file)
@@ -7,8 +7,8 @@
 #include "dirent.h"
 #include "extents.h"
 #include "fs.h"
-#include "fs-gc.h"
 #include "fs-io.h"
+#include "fsck.h"
 #include "inode.h"
 #include "journal.h"
 #include "keylist.h"
similarity index 75%
rename from libbcachefs/fs-gc.c
rename to libbcachefs/fsck.c
index 03370c0ef11ac903895d583febef6948dfda00e1..3fe0387239d1fe3bfce053298edb158550e90661 100644 (file)
@@ -4,10 +4,11 @@
 #include "dirent.h"
 #include "error.h"
 #include "fs.h"
-#include "fs-gc.h"
+#include "fsck.h"
 #include "inode.h"
 #include "keylist.h"
 #include "super.h"
+#include "xattr.h"
 
 #include <linux/dcache.h> /* struct qstr */
 #include <linux/generic-radix-tree.h>
@@ -37,12 +38,16 @@ static int remove_dirent(struct bch_fs *c, struct btree_iter *iter,
        bch2_btree_iter_unlock(iter);
 
        ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode);
-       if (ret)
+       if (ret) {
+               bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
                goto err;
+       }
 
        dir_hash_info = bch2_hash_info_init(c, &dir_inode);
 
        ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
+       if (ret)
+               bch_err(c, "remove_dirent: err %i deleting dirent", ret);
 err:
        kfree(buf);
        return ret;
@@ -108,6 +113,118 @@ static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum)
        return 0;
 }
 
+struct hash_check {
+       struct bch_hash_info    info;
+       struct btree_iter       chain;
+       struct btree_iter       iter;
+       u64                     next;
+};
+
+static void hash_check_init(const struct bch_hash_desc desc,
+                           struct hash_check *h, struct bch_fs *c)
+{
+       bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN);
+       bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN);
+}
+
+static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c,
+                                const struct bch_inode_unpacked *bi)
+{
+       h->info = bch2_hash_info_init(c, bi);
+       h->next = -1;
+}
+
+static int hash_redo_key(const struct bch_hash_desc desc,
+                        struct hash_check *h, struct bch_fs *c,
+                        struct btree_iter *k_iter, struct bkey_s_c k,
+                        u64 hashed)
+{
+       struct bkey_i *tmp;
+       int ret = 0;
+
+       tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+       if (!tmp)
+               return -ENOMEM;
+
+       bkey_reassemble(tmp, k);
+
+       ret = bch2_btree_delete_at(k_iter, 0);
+       if (ret)
+               goto err;
+
+       bch2_btree_iter_unlock(k_iter);
+
+       bch2_hash_set(desc, &h->info, c, k_iter->pos.inode, NULL,
+                     tmp, BCH_HASH_SET_MUST_CREATE);
+err:
+       kfree(tmp);
+       return ret;
+}
+
+static int hash_check_key(const struct bch_hash_desc desc,
+                         struct hash_check *h, struct bch_fs *c,
+                         struct btree_iter *k_iter, struct bkey_s_c k)
+{
+       char buf[200];
+       u64 hashed;
+       int ret = 0;
+
+       if (k.k->type != desc.whiteout_type &&
+           k.k->type != desc.key_type)
+               return 0;
+
+       if (k.k->p.offset != h->next) {
+               if (!btree_iter_linked(&h->chain)) {
+                       bch2_btree_iter_link(k_iter, &h->chain);
+                       bch2_btree_iter_link(k_iter, &h->iter);
+               }
+               bch2_btree_iter_copy(&h->chain, k_iter);
+       }
+       h->next = k.k->p.offset + 1;
+
+       if (k.k->type != desc.key_type)
+               return 0;
+
+       hashed = desc.hash_bkey(&h->info, k);
+
+       if (fsck_err_on(hashed < h->chain.pos.offset ||
+                       hashed > k.k->p.offset, c,
+                       "hash table key at wrong offset: %llu, "
+                       "hashed to %llu chain starts at %llu\n%s",
+                       k.k->p.offset, hashed, h->chain.pos.offset,
+                       bch2_bkey_val_to_text(c, desc.btree_id,
+                                             buf, sizeof(buf), k))) {
+               ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
+               if (ret) {
+                       bch_err(c, "hash_redo_key err %i", ret);
+                       return ret;
+               }
+               return 1;
+       }
+
+       if (!bkey_cmp(h->chain.pos, k_iter->pos))
+               return 0;
+
+       bch2_btree_iter_copy(&h->iter, &h->chain);
+       while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) {
+               struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter);
+
+               if (fsck_err_on(k2.k->type == desc.key_type &&
+                               !desc.cmp_bkey(k, k2), c,
+                               "duplicate hash table keys:\n%s",
+                               bch2_bkey_val_to_text(c, desc.btree_id,
+                                                     buf, sizeof(buf), k))) {
+                       ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
+                       if (ret)
+                               return ret;
+                       return 1;
+               }
+               bch2_btree_iter_advance_pos(&h->iter);
+       }
+fsck_err:
+       return ret;
+}
+
 /*
  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
  * that i_size and i_sectors are consistent
@@ -130,14 +247,18 @@ static int check_extents(struct bch_fs *c)
                if (ret)
                        break;
 
-               unfixable_fsck_err_on(!w.have_inode, c,
+               if (fsck_err_on(!w.have_inode, c,
                        "extent type %u for missing inode %llu",
-                       k.k->type, k.k->p.inode);
-
-               unfixable_fsck_err_on(w.have_inode &&
+                       k.k->type, k.k->p.inode) ||
+                   fsck_err_on(w.have_inode &&
                        !S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c,
                        "extent type %u for non regular file, inode %llu mode %o",
-                       k.k->type, k.k->p.inode, w.inode.i_mode);
+                       k.k->type, k.k->p.inode, w.inode.i_mode)) {
+                       ret = bch2_btree_delete_at(&iter, 0);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
 
                unfixable_fsck_err_on(w.first_this_inode &&
                        w.have_inode &&
@@ -154,6 +275,7 @@ static int check_extents(struct bch_fs *c)
                        "extent type %u offset %llu past end of inode %llu, i_size %llu",
                        k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size);
        }
+err:
 fsck_err:
        return bch2_btree_iter_unlock(&iter) ?: ret;
 }
@@ -166,10 +288,15 @@ noinline_for_stack
 static int check_dirents(struct bch_fs *c)
 {
        struct inode_walker w = inode_walker_init();
+       struct hash_check h;
        struct btree_iter iter;
        struct bkey_s_c k;
+       unsigned name_len;
+       char buf[200];
        int ret = 0;
 
+       hash_check_init(bch2_dirent_hash_desc, &h, c);
+
        for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
                           POS(BCACHE_ROOT_INO, 0), k) {
                struct bkey_s_c_dirent d;
@@ -181,13 +308,32 @@ static int check_dirents(struct bch_fs *c)
                if (ret)
                        break;
 
-               unfixable_fsck_err_on(!w.have_inode, c,
-                                     "dirent in nonexisting directory %llu",
-                                     k.k->p.inode);
+               if (fsck_err_on(!w.have_inode, c,
+                               "dirent in nonexisting directory:\n%s",
+                               bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+                                                     buf, sizeof(buf), k)) ||
+                   fsck_err_on(!S_ISDIR(w.inode.i_mode), c,
+                               "dirent in non directory inode type %u:\n%s",
+                               mode_to_type(w.inode.i_mode),
+                               bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+                                                     buf, sizeof(buf), k))) {
+                       ret = bch2_btree_delete_at(&iter, 0);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+
+               if (w.first_this_inode && w.have_inode)
+                       hash_check_set_inode(&h, c, &w.inode);
+
+               ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k);
+               if (ret > 0) {
+                       ret = 0;
+                       continue;
+               }
 
-               unfixable_fsck_err_on(!S_ISDIR(w.inode.i_mode), c,
-                                     "dirent in non directory inode %llu, type %u",
-                                     k.k->p.inode, mode_to_type(w.inode.i_mode));
+               if (ret)
+                       goto fsck_err;
 
                if (k.k->type != BCH_DIRENT)
                        continue;
@@ -195,8 +341,25 @@ static int check_dirents(struct bch_fs *c)
                d = bkey_s_c_to_dirent(k);
                d_inum = le64_to_cpu(d.v->d_inum);
 
+               name_len = bch2_dirent_name_bytes(d);
+
+               if (fsck_err_on(!name_len, c, "empty dirent") ||
+                   fsck_err_on(name_len == 1 &&
+                               !memcmp(d.v->d_name, ".", 1), c,
+                               ". dirent") ||
+                   fsck_err_on(name_len == 2 &&
+                               !memcmp(d.v->d_name, "..", 2), c,
+                               ".. dirent")) {
+                       ret = remove_dirent(c, &iter, d);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+
                if (fsck_err_on(d_inum == d.k->p.inode, c,
-                               "dirent points to own directory")) {
+                               "dirent points to own directory:\n%s",
+                               bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+                                                     buf, sizeof(buf), k))) {
                        ret = remove_dirent(c, &iter, d);
                        if (ret)
                                goto err;
@@ -211,8 +374,9 @@ static int check_dirents(struct bch_fs *c)
                ret = 0;
 
                if (fsck_err_on(!have_target, c,
-                               "dirent points to missing inode %llu, type %u filename %s",
-                               d_inum, d.v->d_type, d.v->d_name)) {
+                               "dirent points to missing inode:\n%s",
+                               bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+                                                     buf, sizeof(buf), k))) {
                        ret = remove_dirent(c, &iter, d);
                        if (ret)
                                goto err;
@@ -222,10 +386,10 @@ static int check_dirents(struct bch_fs *c)
                if (fsck_err_on(have_target &&
                                d.v->d_type !=
                                mode_to_type(le16_to_cpu(target.i_mode)), c,
-                               "incorrect d_type: got %u should be %u, filename %s",
-                               d.v->d_type,
+                               "incorrect d_type: should be %u:\n%s",
                                mode_to_type(le16_to_cpu(target.i_mode)),
-                               d.v->d_name)) {
+                               bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+                                                     buf, sizeof(buf), k))) {
                        struct bkey_i_dirent *n;
 
                        n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
@@ -248,6 +412,8 @@ static int check_dirents(struct bch_fs *c)
        }
 err:
 fsck_err:
+       bch2_btree_iter_unlock(&h.chain);
+       bch2_btree_iter_unlock(&h.iter);
        return bch2_btree_iter_unlock(&iter) ?: ret;
 }
 
@@ -258,21 +424,39 @@ noinline_for_stack
 static int check_xattrs(struct bch_fs *c)
 {
        struct inode_walker w = inode_walker_init();
+       struct hash_check h;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
+       hash_check_init(bch2_xattr_hash_desc, &h, c);
+
        for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
                           POS(BCACHE_ROOT_INO, 0), k) {
                ret = walk_inode(c, &w, k.k->p.inode);
                if (ret)
                        break;
 
-               unfixable_fsck_err_on(!w.have_inode, c,
-                       "xattr for missing inode %llu",
-                       k.k->p.inode);
+               if (fsck_err_on(!w.have_inode, c,
+                               "xattr for missing inode %llu",
+                               k.k->p.inode)) {
+                       ret = bch2_btree_delete_at(&iter, 0);
+                       if (ret)
+                               goto err;
+                       continue;
+               }
+
+               if (w.first_this_inode && w.have_inode)
+                       hash_check_set_inode(&h, c, &w.inode);
+
+               ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k);
+               if (ret)
+                       goto fsck_err;
        }
+err:
 fsck_err:
+       bch2_btree_iter_unlock(&h.chain);
+       bch2_btree_iter_unlock(&h.iter);
        return bch2_btree_iter_unlock(&iter) ?: ret;
 }
 
@@ -445,6 +629,8 @@ static int check_directory_structure(struct bch_fs *c,
 
        /* DFS: */
 restart_dfs:
+       had_unreachable = false;
+
        ret = inode_bitmap_set(&dirs_done, BCACHE_ROOT_INO);
        if (ret)
                goto err;
@@ -478,7 +664,8 @@ next:
                        d_inum = le64_to_cpu(dirent.v->d_inum);
 
                        if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c,
-                                       "directory with multiple hardlinks")) {
+                                       "directory %llu has multiple hardlinks",
+                                       d_inum)) {
                                ret = remove_dirent(c, &iter, dirent);
                                if (ret)
                                        goto err;
@@ -503,8 +690,6 @@ up:
                path.nr--;
        }
 
-       had_unreachable = false;
-
        for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
                if (k.k->type != BCH_INODE_FS ||
                    !S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->i_mode)))
@@ -640,7 +825,7 @@ static int bch2_gc_do_inode(struct bch_fs *c,
 
        ret = bch2_inode_unpack(inode, &u);
        if (bch2_fs_inconsistent_on(ret, c,
-                        "error unpacking inode %llu in fs-gc",
+                        "error unpacking inode %llu in fsck",
                         inode.k->p.inode))
                return ret;
 
@@ -894,36 +1079,59 @@ int bch2_fsck(struct bch_fs *c, bool full_fsck)
        struct bch_inode_unpacked root_inode, lostfound_inode;
        int ret;
 
-       ret = check_root(c, &root_inode);
-       if (ret)
-               return ret;
+       if (full_fsck) {
+               bch_verbose(c, "checking extents");
+               ret = check_extents(c);
+               if (ret)
+                       return ret;
 
-       ret = check_lostfound(c, &root_inode, &lostfound_inode);
-       if (ret)
-               return ret;
+               bch_verbose(c, "checking dirents");
+               ret = check_dirents(c);
+               if (ret)
+                       return ret;
 
-       if (!full_fsck)
-               goto check_nlinks;
+               bch_verbose(c, "checking xattrs");
+               ret = check_xattrs(c);
+               if (ret)
+                       return ret;
 
-       ret = check_extents(c);
-       if (ret)
-               return ret;
+               bch_verbose(c, "checking root directory");
+               ret = check_root(c, &root_inode);
+               if (ret)
+                       return ret;
 
-       ret = check_dirents(c);
-       if (ret)
-               return ret;
+               bch_verbose(c, "checking lost+found");
+               ret = check_lostfound(c, &root_inode, &lostfound_inode);
+               if (ret)
+                       return ret;
 
-       ret = check_xattrs(c);
-       if (ret)
-               return ret;
+               bch_verbose(c, "checking directory structure");
+               ret = check_directory_structure(c, &lostfound_inode);
+               if (ret)
+                       return ret;
 
-       ret = check_directory_structure(c, &lostfound_inode);
-       if (ret)
-               return ret;
-check_nlinks:
-       ret = check_inode_nlinks(c, &lostfound_inode);
-       if (ret)
-               return ret;
+               bch_verbose(c, "checking inode nlinks");
+               ret = check_inode_nlinks(c, &lostfound_inode);
+               if (ret)
+                       return ret;
+       } else {
+               bch_verbose(c, "checking root directory");
+               ret = check_root(c, &root_inode);
+               if (ret)
+                       return ret;
+
+               bch_verbose(c, "checking lost+found");
+               ret = check_lostfound(c, &root_inode, &lostfound_inode);
+               if (ret)
+                       return ret;
+
+               bch_verbose(c, "checking inode nlinks");
+               ret = check_inode_nlinks(c, &lostfound_inode);
+               if (ret)
+                       return ret;
+       }
+
+       bch2_flush_fsck_errs(c);
 
        return 0;
 }
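
The new hash_check_key() above enforces the linear-probing invariant for the dirent and xattr btrees: a key must hash to somewhere between the start of the contiguous run of keys containing it and its own position, and that run must not hold two equal keys. The following is a minimal standalone sketch of that invariant on a toy linear-probing array; the table, hash function and all names are invented for illustration and are not bcachefs code.

/*
 * Toy linear-probing table, used only to illustrate the invariant that
 * hash_check_key() checks: a key must hash to a slot no later than the
 * one it occupies, the slots in between must be occupied, and the chain
 * must not contain duplicates.  Everything here is invented.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

#define TABLE_SIZE 16
static int table[TABLE_SIZE];           /* 0 means "empty slot" */

static size_t toy_hash(int key)
{
        return (size_t) key % TABLE_SIZE;
}

static bool slot_is_consistent(size_t i)
{
        size_t h, j;

        if (!table[i])
                return true;                    /* empty slots are fine */

        h = toy_hash(table[i]);
        if (h > i)
                return false;                   /* stored before its hash position */

        for (j = h; j < i; j++) {
                if (!table[j])
                        return false;           /* hole inside the probe chain */
                if (table[j] == table[i])
                        return false;           /* duplicate key in the chain */
        }
        return true;
}

int main(void)
{
        size_t i;

        table[3] = 19;          /* 19 % 16 == 3: sits at its hash position */
        table[4] = 35;          /* 35 % 16 == 3: probed forward, still fine */
        table[6] = 51;          /* 51 % 16 == 3: hole at slot 5, inconsistent */

        for (i = 0; i < TABLE_SIZE; i++)
                assert(slot_is_consistent(i) == (i != 6));
        return 0;
}

In the real code a key found at the wrong offset is repaired by hash_redo_key(), which deletes it and re-inserts it through bch2_hash_set(), and a duplicate in the chain is dropped with the new bch2_hash_delete_at().
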
similarity index 100%
rename from libbcachefs/fs-gc.h
rename to libbcachefs/fsck.h
index 7a8467c4580ec1030201eb696f1dcf85ef15c9de..5b56a628a77bac67211dc463608908736c930658 100644 (file)
@@ -25,14 +25,12 @@ static const u8 bits_table[8] = {
        13 * 8 - 8,
 };
 
-static int inode_encode_field(u8 *out, u8 *end, const u64 in[2])
+static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
 {
-       unsigned bytes, bits, shift;
-
-       if (likely(!in[1]))
-               bits = fls64(in[0]);
-       else
-               bits = fls64(in[1]) + 64;
+       __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
+       unsigned shift, bytes, bits = likely(!hi)
+               ? fls64(lo)
+               : fls64(hi) + 64;
 
        for (shift = 1; shift <= 8; shift++)
                if (bits < bits_table[shift - 1])
@@ -44,17 +42,7 @@ got_shift:
 
        BUG_ON(out + bytes > end);
 
-       if (likely(bytes <= 8)) {
-               u64 b = cpu_to_be64(in[0]);
-
-               memcpy(out, (void *) &b + 8 - bytes, bytes);
-       } else {
-               u64 b = cpu_to_be64(in[1]);
-
-               memcpy(out, (void *) &b + 16 - bytes, bytes);
-               put_unaligned_be64(in[0], out + bytes - 8);
-       }
-
+       memcpy(out, (u8 *) in + 16 - bytes, bytes);
        *out |= (1 << 8) >> shift;
 
        return bytes;
@@ -63,7 +51,9 @@ got_shift:
 static int inode_decode_field(const u8 *in, const u8 *end,
                              u64 out[2], unsigned *out_bits)
 {
-       unsigned bytes, bits, shift;
+       __be64 be[2] = { 0, 0 };
+       unsigned bytes, shift;
+       u8 *p;
 
        if (in >= end)
                return -1;
@@ -77,29 +67,18 @@ static int inode_decode_field(const u8 *in, const u8 *end,
         */
        shift   = 8 - __fls(*in); /* 1 <= shift <= 8 */
        bytes   = byte_table[shift - 1];
-       bits    = bytes * 8 - shift;
 
        if (in + bytes > end)
                return -1;
 
-       /*
-        * we're assuming it's safe to deref up to 7 bytes < in; this will work
-        * because keys always start quite a bit more than 7 bytes after the
-        * start of the btree node header:
-        */
-       if (likely(bytes <= 8)) {
-               out[0] = get_unaligned_be64(in + bytes - 8);
-               out[0] <<= 64 - bits;
-               out[0] >>= 64 - bits;
-               out[1] = 0;
-       } else {
-               out[0] = get_unaligned_be64(in + bytes - 8);
-               out[1] = get_unaligned_be64(in + bytes - 16);
-               out[1] <<= 128 - bits;
-               out[1] >>= 128 - bits;
-       }
+       p = (u8 *) be + 16 - bytes;
+       memcpy(p, in, bytes);
+       *p ^= (1 << 8) >> shift;
+
+       out[0] = be64_to_cpu(be[0]);
+       out[1] = be64_to_cpu(be[1]);
+       *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
 
-       *out_bits = out[1] ? 64 + fls64(out[1]) : fls64(out[0]);
        return bytes;
 }
 
@@ -109,7 +88,6 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
        u8 *out = packed->inode.v.fields;
        u8 *end = (void *) &packed[1];
        u8 *last_nonzero_field = out;
-       u64 field[2];
        unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
 
        bkey_inode_init(&packed->inode.k_i);
@@ -119,12 +97,10 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
        packed->inode.v.i_mode          = cpu_to_le16(inode->i_mode);
 
 #define BCH_INODE_FIELD(_name, _bits)                                  \
-       field[0] = inode->_name;                                        \
-       field[1] = 0;                                                   \
-       out += inode_encode_field(out, end, field);                     \
+       out += inode_encode_field(out, end, 0, inode->_name);           \
        nr_fields++;                                                    \
                                                                        \
-       if (field[0] | field[1]) {                                      \
+       if (inode->_name) {                                             \
                last_nonzero_field = out;                               \
                last_nonzero_fieldnr = nr_fields;                       \
        }
@@ -187,7 +163,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
        if (field_bits > sizeof(unpacked->_name) * 8)                   \
                return -1;                                              \
                                                                        \
-       unpacked->_name = field[0];                                     \
+       unpacked->_name = field[1];                                     \
        in += ret;
 
        BCH_INODE_FIELDS()
@@ -449,3 +425,32 @@ int bch2_cached_dev_inode_find_by_uuid(struct bch_fs *c, uuid_le *uuid,
        bch2_btree_iter_unlock(&iter);
        return -ENOENT;
 }
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_inode_pack_test(void)
+{
+       struct bch_inode_unpacked *u, test_inodes[] = {
+               {
+                       .i_atime        = U64_MAX,
+                       .i_ctime        = U64_MAX,
+                       .i_mtime        = U64_MAX,
+                       .i_otime        = U64_MAX,
+                       .i_size         = U64_MAX,
+                       .i_sectors      = U64_MAX,
+                       .i_uid          = U32_MAX,
+                       .i_gid          = U32_MAX,
+                       .i_nlink        = U32_MAX,
+                       .i_generation   = U32_MAX,
+                       .i_dev          = U32_MAX,
+               },
+       };
+
+       for (u = test_inodes;
+            u < test_inodes + ARRAY_SIZE(test_inodes);
+            u++) {
+               struct bkey_inode_buf p;
+
+               bch2_inode_pack(&p, u);
+       }
+}
+#endif
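
bch2_inode_pack_test() above only exercises the encode side of the reworked field codec. A natural companion check, sketched below but not part of this commit, is to unpack the packed inode again and compare a few fields; the bkey_i_to_s_c_inode() accessor used to feed bch2_inode_unpack() is assumed to exist and is not shown in this diff.

#ifdef CONFIG_BCACHEFS_DEBUG
/*
 * Hypothetical round-trip check (not in this commit): pack a test inode,
 * unpack it again and compare fields.  bkey_i_to_s_c_inode() is assumed
 * here; bch2_inode_pack()/bch2_inode_unpack() are the functions above.
 */
static void inode_pack_roundtrip_check(struct bch_inode_unpacked *u)
{
        struct bkey_inode_buf p;
        struct bch_inode_unpacked v;

        bch2_inode_pack(&p, u);
        BUG_ON(bch2_inode_unpack(bkey_i_to_s_c_inode(&p.inode.k_i), &v));
        BUG_ON(v.i_size    != u->i_size ||
               v.i_sectors != u->i_sectors ||
               v.i_nlink   != u->i_nlink);
}
#endif
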
index d1d64a7fba5163044a8c794af79733f8734bf67a..06e2ffdac8f104aa3f9686aef06edd9925f5fa58 100644 (file)
@@ -54,4 +54,10 @@ static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
        return div_s64(ns, c->sb.time_precision);
 }
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_inode_pack_test(void);
+#else
+static inline void bch2_inode_pack_test(void) {}
+#endif
+
 #endif
index 039dd044153ecf36123ad571a0baafb36eb0e6c4..0f27eaf6f63d48192eb97f77a64a9ea02d271b12 100644 (file)
@@ -910,8 +910,8 @@ static int bio_checksum_uncompress(struct bch_fs *c,
                bch2_encrypt_bio(c, rbio->crc.csum_type,
                                nonce, src);
 
-               bio_copy_data_iter(dst, dst_iter,
-                                  src, src->bi_iter);
+               bio_copy_data_iter(dst, &dst_iter,
+                                  src, &src->bi_iter);
        } else {
                bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
        }
index f6203f1edf0261ff5811bf3724b87cf5c335031c..ca96330ce27f436c4b92803fcd40d5b1d4e9b78f 100644 (file)
@@ -527,62 +527,34 @@ fsck_err:
 #define JOURNAL_ENTRY_NONE     6
 #define JOURNAL_ENTRY_BAD      7
 
-static int journal_entry_validate(struct bch_fs *c,
-                                 struct jset *j, u64 sector,
-                                 unsigned bucket_sectors_left,
-                                 unsigned sectors_read)
+#define journal_entry_err(c, msg, ...)                                 \
+({                                                                     \
+       if (write == READ) {                                            \
+               mustfix_fsck_err(c, msg, ##__VA_ARGS__);                \
+       } else {                                                        \
+               bch_err(c, "detected corrupt metadata before write:\n"  \
+                       msg, ##__VA_ARGS__);                            \
+               ret = BCH_FSCK_ERRORS_NOT_FIXED;                        \
+               goto fsck_err;                                          \
+       }                                                               \
+       true;                                                           \
+})
+
+#define journal_entry_err_on(cond, c, msg, ...)                                \
+       ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+
+static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
+                                   int write)
 {
        struct jset_entry *entry;
-       size_t bytes = vstruct_bytes(j);
-       struct bch_csum csum;
        int ret = 0;
 
-       if (le64_to_cpu(j->magic) != jset_magic(c))
-               return JOURNAL_ENTRY_NONE;
-
-       if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
-               bch_err(c, "unknown journal entry version %u",
-                       le32_to_cpu(j->version));
-               return BCH_FSCK_UNKNOWN_VERSION;
-       }
-
-       if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c,
-                       "journal entry too big (%zu bytes), sector %lluu",
-                       bytes, sector)) {
-               /* XXX: note we might have missing journal entries */
-               return JOURNAL_ENTRY_BAD;
-       }
-
-       if (bytes > sectors_read << 9)
-               return JOURNAL_ENTRY_REREAD;
-
-       if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
-                       "journal entry with unknown csum type %llu sector %lluu",
-                       JSET_CSUM_TYPE(j), sector))
-               return JOURNAL_ENTRY_BAD;
-
-       csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
-       if (mustfix_fsck_err_on(bch2_crc_cmp(csum, j->csum), c,
-                       "journal checksum bad, sector %llu", sector)) {
-               /* XXX: retry IO, when we start retrying checksum errors */
-               /* XXX: note we might have missing journal entries */
-               return JOURNAL_ENTRY_BAD;
-       }
-
-       bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
-                   j->encrypted_start,
-                   vstruct_end(j) - (void *) j->encrypted_start);
-
-       if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
-                       "invalid journal entry: last_seq > seq"))
-               j->last_seq = j->seq;
-
        vstruct_for_each(j, entry) {
                struct bkey_i *k;
 
-               if (mustfix_fsck_err_on(vstruct_next(entry) >
-                                       vstruct_last(j), c,
-                               "journal entry extents past end of jset")) {
+               if (journal_entry_err_on(vstruct_next(entry) >
+                                        vstruct_last(j), c,
+                               "journal entry extends past end of jset")) {
                        j->u64s = cpu_to_le64((u64 *) entry - j->_data);
                        break;
                }
@@ -602,7 +574,7 @@ static int journal_entry_validate(struct bch_fs *c,
                case JOURNAL_ENTRY_BTREE_ROOT:
                        k = entry->start;
 
-                       if (mustfix_fsck_err_on(!entry->u64s ||
+                       if (journal_entry_err_on(!entry->u64s ||
                                        le16_to_cpu(entry->u64s) != k->k.u64s, c,
                                        "invalid btree root journal entry: wrong number of keys")) {
                                journal_entry_null_range(entry,
@@ -620,7 +592,7 @@ static int journal_entry_validate(struct bch_fs *c,
                        break;
 
                case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED:
-                       if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
+                       if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
                                "invalid journal seq blacklist entry: bad size")) {
                                journal_entry_null_range(entry,
                                                vstruct_next(entry));
@@ -628,7 +600,7 @@ static int journal_entry_validate(struct bch_fs *c,
 
                        break;
                default:
-                       mustfix_fsck_err(c, "invalid journal entry type %llu",
+                       journal_entry_err(c, "invalid journal entry type %llu",
                                 JOURNAL_ENTRY_TYPE(entry));
                        journal_entry_null_range(entry, vstruct_next(entry));
                        break;
@@ -639,6 +611,61 @@ fsck_err:
        return ret;
 }
 
+static int journal_entry_validate(struct bch_fs *c,
+                                 struct jset *j, u64 sector,
+                                 unsigned bucket_sectors_left,
+                                 unsigned sectors_read,
+                                 int write)
+{
+       size_t bytes = vstruct_bytes(j);
+       struct bch_csum csum;
+       int ret = 0;
+
+       if (le64_to_cpu(j->magic) != jset_magic(c))
+               return JOURNAL_ENTRY_NONE;
+
+       if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
+               bch_err(c, "unknown journal entry version %u",
+                       le32_to_cpu(j->version));
+               return BCH_FSCK_UNKNOWN_VERSION;
+       }
+
+       if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
+                       "journal entry too big (%zu bytes), sector %lluu",
+                       bytes, sector)) {
+               /* XXX: note we might have missing journal entries */
+               return JOURNAL_ENTRY_BAD;
+       }
+
+       if (bytes > sectors_read << 9)
+               return JOURNAL_ENTRY_REREAD;
+
+       if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
+                       "journal entry with unknown csum type %llu sector %lluu",
+                       JSET_CSUM_TYPE(j), sector))
+               return JOURNAL_ENTRY_BAD;
+
+       csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+       if (journal_entry_err_on(bch2_crc_cmp(csum, j->csum), c,
+                       "journal checksum bad, sector %llu", sector)) {
+               /* XXX: retry IO, when we start retrying checksum errors */
+               /* XXX: note we might have missing journal entries */
+               return JOURNAL_ENTRY_BAD;
+       }
+
+       bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+                   j->encrypted_start,
+                   vstruct_end(j) - (void *) j->encrypted_start);
+
+       if (journal_entry_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+                       "invalid journal entry: last_seq > seq"))
+               j->last_seq = j->seq;
+
+       return __journal_entry_validate(c, j, write);
+fsck_err:
+       return ret;
+}
+
 struct journal_read_buf {
        void            *data;
        size_t          size;
@@ -705,7 +732,8 @@ reread:                     sectors_read = min_t(unsigned,
                }
 
                ret = journal_entry_validate(c, j, offset,
-                                       end - offset, sectors_read);
+                                       end - offset, sectors_read,
+                                       READ);
                switch (ret) {
                case BCH_FSCK_OK:
                        break;
@@ -2274,6 +2302,10 @@ static void journal_write(struct closure *cl)
        SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
        SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
+       if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+           __journal_entry_validate(c, jset, WRITE))
+               goto err;
+
        bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                    jset->encrypted_start,
                    vstruct_end(jset) - (void *) jset->encrypted_start);
@@ -2281,6 +2313,10 @@ static void journal_write(struct closure *cl)
        jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
                                  journal_nonce(jset), jset);
 
+       if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+           __journal_entry_validate(c, jset, WRITE))
+               goto err;
+
        sectors = vstruct_sectors(jset, c->block_bits);
        BUG_ON(sectors > j->prev_buf_sectors);
 
@@ -2349,6 +2385,9 @@ no_io:
                ptr->offset += sectors;
 
        closure_return_with_destructor(cl, journal_write_done);
+err:
+       bch2_fatal_error(c);
+       closure_return_with_destructor(cl, journal_write_done);
 }
 
 static void journal_write_work(struct work_struct *work)
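
The journal changes split the per-entry checks out into __journal_entry_validate() so they can also run in the write path, and journal_entry_err() keys its behaviour off the write argument: on READ a bad entry goes through mustfix_fsck_err() and can be repaired, on WRITE it is reported as corrupt metadata and the caller bails out. Below is a self-contained toy sketch of that read/write-parameterised error-macro shape; the names and messages are invented and this is not the bcachefs implementation.

/*
 * Toy sketch of a validator whose error macro changes behaviour with the
 * direction: in the read path a problem is repaired (the real code goes
 * through mustfix_fsck_err()), while just before a write it is treated as
 * corrupt metadata and the operation is aborted.  Uses the same GNU
 * statement-expression idiom as the real macro.
 */
#include <stdbool.h>
#include <stdio.h>

#define READ  0
#define WRITE 1

#define entry_err(msg)                                                  \
({                                                                      \
        if (write == READ) {                                            \
                fprintf(stderr, "repairing on read: %s\n", msg);        \
        } else {                                                        \
                fprintf(stderr, "corrupt metadata before write: %s\n",  \
                        msg);                                           \
                ret = -1;                                               \
                goto err;                                               \
        }                                                               \
        true;                                                           \
})

#define entry_err_on(cond, msg) ((cond) ? entry_err(msg) : false)

static int validate_entry(int nr_keys, int write)
{
        int ret = 0;

        if (entry_err_on(nr_keys < 0, "negative key count"))
                nr_keys = 0;            /* repaired, read path only */

        printf("entry validated with %d keys\n", nr_keys);
err:
        return ret;
}

int main(void)
{
        validate_entry(-1, READ);       /* logs, repairs, carries on */
        validate_entry(-1, WRITE);      /* logs and bails out with an error */
        return 0;
}

With this in place, journal_write() calls __journal_entry_validate(c, jset, WRITE) twice so the entries are checked on the unencrypted payload regardless of whether the checksum type encrypts, and aborts via the new err: path with bch2_fatal_error() if the buffer it is about to write is already inconsistent.
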
index 6eac6fc0ac928b436fb92a25aab4ac296d188062..8b31c7d7d1f2bf8b779196575b594de6357d81bd 100644 (file)
@@ -2,7 +2,9 @@
 #define _BCACHE_STR_HASH_H
 
 #include "btree_iter.h"
+#include "btree_update.h"
 #include "checksum.h"
+#include "error.h"
 #include "inode.h"
 #include "siphash.h"
 #include "super.h"
@@ -341,6 +343,36 @@ err:
        return ret;
 }
 
+static inline int bch2_hash_delete_at(const struct bch_hash_desc desc,
+                                     const struct bch_hash_info *info,
+                                     struct btree_iter *iter,
+                                     u64 *journal_seq)
+{
+       struct btree_iter whiteout_iter;
+       struct bkey_i delete;
+       int ret = -ENOENT;
+
+       bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id,
+                            iter->pos);
+       bch2_btree_iter_link(iter, &whiteout_iter);
+
+       ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter);
+       if (ret < 0)
+               goto err;
+
+       bkey_init(&delete.k);
+       delete.k.p = iter->pos;
+       delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+
+       ret = bch2_btree_insert_at(iter->c, NULL, NULL, journal_seq,
+                                 BTREE_INSERT_NOFAIL|
+                                 BTREE_INSERT_ATOMIC,
+                                 BTREE_INSERT_ENTRY(iter, &delete));
+err:
+       bch2_btree_iter_unlink(&whiteout_iter);
+       return ret;
+}
+
 static inline int bch2_hash_delete(const struct bch_hash_desc desc,
                                  const struct bch_hash_info *info,
                                  struct bch_fs *c, u64 inode,
@@ -348,7 +380,6 @@ static inline int bch2_hash_delete(const struct bch_hash_desc desc,
 {
        struct btree_iter iter, whiteout_iter;
        struct bkey_s_c k;
-       struct bkey_i delete;
        int ret = -ENOENT;
 
        bch2_btree_iter_init_intent(&iter, c, desc.btree_id,
@@ -361,18 +392,7 @@ retry:
        if ((ret = btree_iter_err(k)))
                goto err;
 
-       ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, &iter);
-       if (ret < 0)
-               goto err;
-
-       bkey_init(&delete.k);
-       delete.k.p = k.k->p;
-       delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
-
-       ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_ATOMIC,
-                                 BTREE_INSERT_ENTRY(&iter, &delete));
+       ret = bch2_hash_delete_at(desc, info, &iter, journal_seq);
 err:
        if (ret == -EINTR)
                goto retry;
index f5ee2de3fb1f06ea91a8e9acc61c4a7d0994e2d8..7a98136047d5757da2a52704895bf341b69319eb 100644 (file)
@@ -19,7 +19,7 @@
 #include "debug.h"
 #include "error.h"
 #include "fs.h"
-#include "fs-gc.h"
+#include "fsck.h"
 #include "inode.h"
 #include "io.h"
 #include "journal.h"
@@ -513,6 +513,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        INIT_WORK(&c->read_retry_work, bch2_read_retry_work);
        mutex_init(&c->zlib_workspace_lock);
 
+       INIT_LIST_HEAD(&c->fsck_errors);
+       mutex_init(&c->fsck_error_lock);
+
        seqcount_init(&c->gc_pos_lock);
 
        c->prio_clock[READ].hand = 1;
@@ -875,12 +878,12 @@ err:
        switch (ret) {
        case BCH_FSCK_ERRORS_NOT_FIXED:
                bch_err(c, "filesystem contains errors: please report this to the developers");
-               pr_cont("mount with -o fix_errors to repair");
+               pr_cont("mount with -o fix_errors to repair\n");
                err = "fsck error";
                break;
        case BCH_FSCK_REPAIR_UNIMPLEMENTED:
                bch_err(c, "filesystem contains errors: please report this to the developers");
-               pr_cont("repair unimplemented: inform the developers so that it can be added");
+               pr_cont("repair unimplemented: inform the developers so that it can be added\n");
                err = "fsck error";
                break;
        case BCH_FSCK_REPAIR_IMPOSSIBLE:
@@ -979,8 +982,8 @@ static void bch2_dev_free(struct bch_dev *ca)
        kvpfree(ca->disk_buckets, bucket_bytes(ca));
        kfree(ca->prio_buckets);
        kfree(ca->bio_prio);
-       vfree(ca->buckets);
-       vfree(ca->oldest_gens);
+       kvpfree(ca->buckets,     ca->mi.nbuckets * sizeof(struct bucket));
+       kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
        free_heap(&ca->heap);
        free_fifo(&ca->free_inc);
 
@@ -1140,10 +1143,12 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
            !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
            !init_fifo(&ca->free_inc,   free_inc_reserve, GFP_KERNEL) ||
            !init_heap(&ca->heap,       heap_size, GFP_KERNEL) ||
-           !(ca->oldest_gens   = vzalloc(sizeof(u8) *
-                                         ca->mi.nbuckets)) ||
-           !(ca->buckets       = vzalloc(sizeof(struct bucket) *
-                                         ca->mi.nbuckets)) ||
+           !(ca->oldest_gens   = kvpmalloc(ca->mi.nbuckets *
+                                           sizeof(u8),
+                                           GFP_KERNEL|__GFP_ZERO)) ||
+           !(ca->buckets       = kvpmalloc(ca->mi.nbuckets *
+                                           sizeof(struct bucket),
+                                           GFP_KERNEL|__GFP_ZERO)) ||
            !(ca->prio_buckets  = kzalloc(sizeof(u64) * prio_buckets(ca) *
                                          2, GFP_KERNEL)) ||
            !(ca->disk_buckets  = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
@@ -1871,6 +1876,7 @@ static void bcachefs_exit(void)
 static int __init bcachefs_init(void)
 {
        bch2_bkey_pack_test();
+       bch2_inode_pack_test();
 
        if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
            bch2_chardev_init() ||
index ba04bbad9af98af77ea1069d61c091b8b0bf9066..c34048a32b56fad176e3b1ec4882fe0b66d0afee 100644 (file)
@@ -512,7 +512,7 @@ STORE(bch2_fs_opts_dir)
 {
        struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
        const struct bch_option *opt;
-       enum bch_opt_id id;
+       int id;
        u64 v;
 
        id = bch2_parse_sysfs_opt(attr->name, buf, &v);
index f57224a6f00d80d3ec4d3ad448bbe7c3dcf60e4d..79a2aeb1a8469825c15963a31e102a52e3f4787d 100644 (file)
@@ -417,3 +417,17 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
                dst += bv.bv_len;
        }
 }
+
+size_t bch_scnmemcpy(char *buf, size_t size, const char *src, size_t len)
+{
+       size_t n;
+
+       if (!size)
+               return 0;
+
+       n = min(size - 1, len);
+       memcpy(buf, src, n);
+       buf[n] = '\0';
+
+       return n;
+}
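
bch_scnmemcpy() is a scnprintf()-style helper for byte ranges that carry an explicit length instead of a NUL terminator (xattr names and values, dirent names): it copies at most size - 1 bytes, always terminates the buffer, and returns the number of bytes written so calls can be chained with n += ... exactly like scnprintf(). A small usage sketch, mirroring how bch2_xattr_to_text() uses it in the xattr.c hunk further down; the sample bytes and the helper name are invented.

/*
 * Usage sketch for bch_scnmemcpy(): build "name:value" output from byte
 * ranges that have explicit lengths rather than NUL terminators.
 */
static void to_text_example(char *buf, size_t size)
{
        static const char name[4] = { 'u', 's', 'e', 'r' };     /* no NUL */
        static const char val[3]  = { 'a', 'b', 'c' };
        size_t n = 0;

        n += bch_scnmemcpy(buf + n, size - n, name, sizeof(name));
        n += scnprintf(buf + n, size - n, ":");
        n += bch_scnmemcpy(buf + n, size - n, val, sizeof(val));
        /* buf now holds "user:abc", safely truncated if size is small */
}
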
index 5669cb8abf42766b954be91076cc82063159ddc9..8aa5c34b456dbf8f8e35aaa04ac5e50102b78cc8 100644 (file)
@@ -93,7 +93,8 @@ static inline void kvpfree(void *p, size_t size)
 static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
 {
        return size < PAGE_SIZE ? kmalloc(size, gfp_mask)
-               :  (void *) __get_free_pages(gfp_mask, get_order(size))
+               :  (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+                                            get_order(size))
                ?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
 }
 
@@ -750,4 +751,6 @@ static inline struct bio_vec next_contig_bvec(struct bio *bio,
 #define bio_for_each_contig_segment(bv, bio, iter)                     \
        __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
 
+size_t bch_scnmemcpy(char *, size_t, const char *, size_t);
+
 #endif /* _BCACHE_UTIL_H */
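
kvpmalloc() picks an allocation strategy by size: kmalloc() below a page, __get_free_pages() (now with __GFP_NOWARN so the fallback stays quiet) for larger requests, and __vmalloc() as the last resort; kvpfree() needs the original size back to know how the memory was obtained. That is why the super.c hunk above converts the per-device bucket arrays from vzalloc()/vfree() to this pair. A minimal usage sketch; the struct and helper names are invented for the example.

/*
 * Usage sketch for the kvpmalloc()/kvpfree() pair: the free side must be
 * given the same size that was allocated so it can tell which strategy
 * (kmalloc, page allocation, vmalloc) was used.
 */
struct toy_bucket {
        u64     gen;
};

static struct toy_bucket *alloc_buckets(size_t nbuckets)
{
        return kvpmalloc(nbuckets * sizeof(struct toy_bucket),
                         GFP_KERNEL|__GFP_ZERO);
}

static void free_buckets(struct toy_bucket *b, size_t nbuckets)
{
        kvpfree(b, nbuckets * sizeof(struct toy_bucket));
}
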
index 488d5369e1f55d4830cd8f165d7b626fe7d94f45..4e82e42cec14c808eaf3482cce253d953f8b7c7c 100644 (file)
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
 
+static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+{
+       return DIV_ROUND_UP(sizeof(struct bch_xattr) +
+                           name_len + val_len, sizeof(u64));
+}
+
+#define xattr_val(_xattr)      ((_xattr)->x_name + (_xattr)->x_name_len)
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
+
 struct xattr_search_key {
        u8              type;
        struct qstr     name;
@@ -31,8 +41,6 @@ static u64 bch2_xattr_hash(const struct bch_hash_info *info,
        return bch2_str_hash_end(&ctx, info);
 }
 
-#define xattr_val(_xattr)      ((_xattr)->x_name + (_xattr)->x_name_len)
-
 static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
 {
        return bch2_xattr_hash(info, key);
@@ -66,7 +74,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
                memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
 }
 
-static const struct bch_hash_desc xattr_hash_desc = {
+const struct bch_hash_desc bch2_xattr_hash_desc = {
        .btree_id       = BTREE_ID_XATTRS,
        .key_type       = BCH_XATTR,
        .whiteout_type  = BCH_XATTR_WHITEOUT,
@@ -79,12 +87,33 @@ static const struct bch_hash_desc xattr_hash_desc = {
 static const char *bch2_xattr_invalid(const struct bch_fs *c,
                                     struct bkey_s_c k)
 {
+       const struct xattr_handler *handler;
+       struct bkey_s_c_xattr xattr;
+       unsigned u64s;
+
        switch (k.k->type) {
        case BCH_XATTR:
-               return bkey_val_bytes(k.k) < sizeof(struct bch_xattr)
-                       ? "value too small"
-                       : NULL;
+               if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
+                       return "value too small";
 
+               xattr = bkey_s_c_to_xattr(k);
+               u64s = xattr_val_u64s(xattr.v->x_name_len,
+                                     le16_to_cpu(xattr.v->x_val_len));
+
+               if (bkey_val_u64s(k.k) < u64s)
+                       return "value too small";
+
+               if (bkey_val_u64s(k.k) > u64s)
+                       return "value too big";
+
+               handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+               if (!handler)
+                       return "invalid type";
+
+               if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
+                       return "xattr name has invalid characters";
+
+               return NULL;
        case BCH_XATTR_WHITEOUT:
                return bkey_val_bytes(k.k) != 0
                        ? "value size should be zero"
@@ -98,34 +127,29 @@ static const char *bch2_xattr_invalid(const struct bch_fs *c,
 static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
                              size_t size, struct bkey_s_c k)
 {
+       const struct xattr_handler *handler;
        struct bkey_s_c_xattr xattr;
-       int n;
+       size_t n = 0;
 
        switch (k.k->type) {
        case BCH_XATTR:
                xattr = bkey_s_c_to_xattr(k);
 
-               if (size) {
-                       n = min_t(unsigned, size, xattr.v->x_name_len);
-                       memcpy(buf, xattr.v->x_name, n);
-                       buf[size - 1] = '\0';
-                       buf += n;
-                       size -= n;
-               }
-
-               n = scnprintf(buf, size, " -> ");
-               buf += n;
-               size -= n;
-
-               if (size) {
-                       n = min_t(unsigned, size,
-                                 le16_to_cpu(xattr.v->x_val_len));
-                       memcpy(buf, xattr_val(xattr.v), n);
-                       buf[size - 1] = '\0';
-                       buf += n;
-                       size -= n;
-               }
-
+               handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+               if (handler && handler->prefix)
+                       n += scnprintf(buf + n, size - n, "%s", handler->prefix);
+               else if (handler)
+                       n += scnprintf(buf + n, size - n, "(type %u)",
+                                      xattr.v->x_type);
+               else
+                       n += scnprintf(buf + n, size - n, "(unknown type %u)",
+                                      xattr.v->x_type);
+
+               n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name,
+                                  xattr.v->x_name_len);
+               n += scnprintf(buf + n, size - n, ":");
+               n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v),
+                                  le16_to_cpu(xattr.v->x_val_len));
                break;
        case BCH_XATTR_WHITEOUT:
                scnprintf(buf, size, "whiteout");
@@ -147,7 +171,7 @@ int bch2_xattr_get(struct bch_fs *c, struct inode *inode,
        struct bkey_s_c_xattr xattr;
        int ret;
 
-       k = bch2_hash_lookup(xattr_hash_desc, &ei->str_hash, c,
+       k = bch2_hash_lookup(bch2_xattr_hash_desc, &ei->str_hash, c,
                            ei->vfs_inode.i_ino, &iter,
                            &X_SEARCH(type, name, strlen(name)));
        if (IS_ERR(k.k))
@@ -175,15 +199,13 @@ int __bch2_xattr_set(struct bch_fs *c, u64 inum,
        int ret;
 
        if (!value) {
-               ret = bch2_hash_delete(xattr_hash_desc, hash_info,
+               ret = bch2_hash_delete(bch2_xattr_hash_desc, hash_info,
                                      c, inum,
                                      journal_seq, &search);
        } else {
                struct bkey_i_xattr *xattr;
                unsigned u64s = BKEY_U64s +
-                       DIV_ROUND_UP(sizeof(struct bch_xattr) +
-                                    search.name.len + size,
-                                    sizeof(u64));
+                       xattr_val_u64s(search.name.len, size);
 
                if (u64s > U8_MAX)
                        return -ERANGE;
@@ -200,7 +222,7 @@ int __bch2_xattr_set(struct bch_fs *c, u64 inum,
                memcpy(xattr->v.x_name, search.name.name, search.name.len);
                memcpy(xattr_val(&xattr->v), value, size);
 
-               ret = bch2_hash_set(xattr_hash_desc, hash_info, c,
+               ret = bch2_hash_set(bch2_xattr_hash_desc, hash_info, c,
                                inum, journal_seq,
                                &xattr->k_i,
                                (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
@@ -225,8 +247,6 @@ int bch2_xattr_set(struct bch_fs *c, struct inode *inode,
                               &ei->journal_seq);
 }
 
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
-
 static size_t bch2_xattr_emit(struct dentry *dentry,
                             const struct bch_xattr *xattr,
                             char *buffer, size_t buffer_size)
index 14eba241869a87f05c7206be81bac0b376f9d87e..9bc5376fa77154bda4cc857f3fb94517eaa10004 100644 (file)
@@ -1,6 +1,9 @@
 #ifndef _BCACHE_XATTR_H
 #define _BCACHE_XATTR_H
 
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_xattr_hash_desc;
 extern const struct bkey_ops bch2_bkey_xattr_ops;
 
 struct dentry;
index 966f22713544e8d0b180cb40ee5eff553ec922d5..8fb10ce4a46d039b89076979ee4f9c2357ce6625 100644 (file)
 #include <linux/kernel.h>
 #include <linux/export.h>
 
-void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
-                       struct bio *src, struct bvec_iter src_iter)
+void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+                       struct bio *src, struct bvec_iter *src_iter)
 {
        struct bio_vec src_bv, dst_bv;
        void *src_p, *dst_p;
        unsigned bytes;
 
-       while (1) {
-               if (!src_iter.bi_size) {
-                       src = src->bi_next;
-                       if (!src)
-                               break;
-
-                       src_iter = src->bi_iter;
-               }
-
-               if (!dst_iter.bi_size) {
-                       dst = dst->bi_next;
-                       if (!dst)
-                               break;
-
-                       dst_iter = dst->bi_iter;
-               }
-
-               src_bv = bio_iter_iovec(src, src_iter);
-               dst_bv = bio_iter_iovec(dst, dst_iter);
+       while (src_iter->bi_size && dst_iter->bi_size) {
+               src_bv = bio_iter_iovec(src, *src_iter);
+               dst_bv = bio_iter_iovec(dst, *dst_iter);
 
                bytes = min(src_bv.bv_len, dst_bv.bv_len);
 
@@ -60,15 +44,27 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
                kunmap_atomic(dst_p);
                kunmap_atomic(src_p);
 
-               bio_advance_iter(src, &src_iter, bytes);
-               bio_advance_iter(dst, &dst_iter, bytes);
+               flush_dcache_page(dst_bv.bv_page);
+
+               bio_advance_iter(src, src_iter, bytes);
+               bio_advance_iter(dst, dst_iter, bytes);
        }
 }
 
+/**
+ * bio_copy_data - copy contents of data buffers from one bio to another
+ * @src: source bio
+ * @dst: destination bio
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
 void bio_copy_data(struct bio *dst, struct bio *src)
 {
-       bio_copy_data_iter(dst, dst->bi_iter,
-                          src, src->bi_iter);
+       struct bvec_iter src_iter = src->bi_iter;
+       struct bvec_iter dst_iter = dst->bi_iter;
+
+       bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
 }
 
 void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
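
With the signature change, bio_copy_data_iter() advances the caller's iterators in place and stops as soon as either side runs out, instead of chasing bi_next chains; bio_copy_data() keeps its old behaviour by copying the iterators onto its own stack first. One consequence, sketched below, is that a caller can reuse an advanced source iterator to keep copying into another bio. Bio setup is assumed to happen elsewhere and the helper name is invented.

/*
 * Sketch of the new calling convention: the iterators are advanced in
 * place, so a second call picks up where the first one stopped.
 */
static void copy_src_to_two_bios(struct bio *src,
                                 struct bio *dst1, struct bio *dst2)
{
        struct bvec_iter src_iter  = src->bi_iter;
        struct bvec_iter dst1_iter = dst1->bi_iter;
        struct bvec_iter dst2_iter = dst2->bi_iter;

        /* copies until dst1 or src runs out, advancing src_iter as it goes */
        bio_copy_data_iter(dst1, &dst1_iter, src, &src_iter);

        /* continues from wherever the first call left src_iter */
        bio_copy_data_iter(dst2, &dst2_iter, src, &src_iter);
}
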