Update bcachefs sources to da037866e6
author     Kent Overstreet <kent.overstreet@gmail.com>
           Mon, 10 Apr 2017 04:11:10 +0000 (20:11 -0800)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Mon, 10 Apr 2017 04:12:37 +0000 (20:12 -0800)
22 files changed:
.bcachefs_revision
cmd_format.c
include/linux/bitops.h
include/linux/log2.h
libbcachefs.c
libbcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bset.h
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_update.c
libbcachefs/chardev.c
libbcachefs/fifo.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_types.h
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/util.h

index f4cee9aaf7e64ecdfe7effef8016d9837c1eac92..35e8c14b9e58db60a0273821940725fb3097e771 100644 (file)
@@ -1 +1 @@
-297c81ae4d608707fdabedc60158ff1f4fbec257
+da037866e669b09edc6b049ce09535d3456474cb
index ae6dd33d6bf638dd7f25890101dc7974fb2093f5..a4d12d77d5705dc441e3b367790dc3b94a4810d1 100644 (file)
@@ -41,7 +41,6 @@ x(0,  metadata_replicas,      "#",                    NULL)                   \
 x(0,   encrypted,              NULL,                   "Enable whole filesystem encryption (chacha20/poly1305)")\
 x(0,   no_passphrase,          NULL,                   "Don't encrypt master encryption key")\
 x('e', error_action,           "(continue|readonly|panic)", NULL)              \
-x(0,   max_journal_entry_size, "size",                 NULL)                   \
 x('L', label,                  "label",                NULL)                   \
 x('U', uuid,                   "uuid",                 NULL)                   \
 x('f', force,                  NULL,                   NULL)                   \
@@ -80,7 +79,6 @@ static void usage(void)
             "      --no_passphrase         Don't encrypt master encryption key\n"
             "      --error_action=(continue|readonly|panic)\n"
             "                              Action to take on filesystem error\n"
-            "      --max_journal_entry_size=size\n"
             "  -l, --label=label\n"
             "      --uuid=uuid\n"
             "  -f, --force\n"
@@ -185,10 +183,6 @@ int cmd_format(int argc, char *argv[])
                                read_string_list_or_die(optarg,
                                                bch2_error_actions, "error action");
                        break;
-               case O_max_journal_entry_size:
-                       opts.max_journal_entry_size =
-                               hatoi_validate(optarg, "journal entry size");
-                       break;
                case O_label:
                case 'L':
                        opts.label = strdup(optarg);
index a0c6508cc8a4ee2ab541d7fad0b034297829d497..47fffb79bb4cf0f98b7c390a46b636a311207e5b 100644 (file)
@@ -247,29 +247,4 @@ unsigned long rounddown_pow_of_two(unsigned long n)
        return 1UL << (fls_long(n) - 1);
 }
 
-static inline __attribute_const__
-int __get_order(unsigned long size)
-{
-       int order;
-
-       size--;
-       size >>= PAGE_SHIFT;
-#if BITS_PER_LONG == 32
-       order = fls(size);
-#else
-       order = fls64(size);
-#endif
-       return order;
-}
-
-#define get_order(n)                                           \
-(                                                              \
-       __builtin_constant_p(n) ? (                             \
-               ((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT :     \
-               (((n) < (1UL << PAGE_SHIFT)) ? 0 :              \
-                ilog2((n) - 1) - PAGE_SHIFT + 1)               \
-       ) :                                                     \
-       __get_order(n)                                          \
-)
-
 #endif
index 395cda29180447f74279cc32999d19bde0caeefd..6fecd39325601717a368e071542d5f00db96940c 100644 (file)
@@ -184,4 +184,29 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
        __rounddown_pow_of_two(n)               \
  )
 
+static inline __attribute_const__
+int __get_order(unsigned long size)
+{
+       int order;
+
+       size--;
+       size >>= PAGE_SHIFT;
+#if BITS_PER_LONG == 32
+       order = fls(size);
+#else
+       order = fls64(size);
+#endif
+       return order;
+}
+
+#define get_order(n)                                           \
+(                                                              \
+       __builtin_constant_p(n) ? (                             \
+               ((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT :     \
+               (((n) < (1UL << PAGE_SHIFT)) ? 0 :              \
+                ilog2((n) - 1) - PAGE_SHIFT + 1)               \
+       ) :                                                     \
+       __get_order(n)                                          \
+)
+
 #endif /* _TOOLS_LINUX_LOG2_H */
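
The get_order() helper relocated above maps a byte count to the smallest
page-allocation order that covers it. A quick sanity check of the
constant-folded branch (assuming the common PAGE_SHIFT of 12, i.e. 4 KiB
pages), where all of these hypothetical assertions hold:

	/* hypothetical checks, assuming PAGE_SHIFT == 12 (4 KiB pages): */
	BUILD_BUG_ON(get_order(1)       != 0);  /* <= one page        -> order 0 */
	BUILD_BUG_ON(get_order(4096)    != 0);  /* exactly one page   -> order 0 */
	BUILD_BUG_ON(get_order(4097)    != 1);  /* just over one page -> order 1 */
	BUILD_BUG_ON(get_order(2 << 20) != 9);  /* 2 MiB = 512 pages  -> order 9 */

Moving the definition out of bitops.h also keeps it next to ilog2(), which
the constant branch depends on.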
index 0fdf5da4d6f9c460d99288e8b29a6c4be5a5408c..16bcd0c6e836432aad00ced8e808048bd60664a9 100644 (file)
@@ -149,14 +149,6 @@ struct bch_sb *bch2_format(struct format_opts opts,
                                min(opts.btree_node_size, i->bucket_size);
        }
 
-       if (!opts.max_journal_entry_size) {
-               /* 2 MB default: */
-               opts.max_journal_entry_size = 4096;
-       }
-
-       opts.max_journal_entry_size =
-               roundup_pow_of_two(opts.max_journal_entry_size);
-
        if (uuid_is_null(opts.uuid.b))
                uuid_generate(opts.uuid.b);
 
@@ -191,7 +183,6 @@ struct bch_sb *bch2_format(struct format_opts opts,
        SET_BCH_SB_DATA_REPLICAS_REQ(sb,        opts.data_replicas_required);
        SET_BCH_SB_ERROR_ACTION(sb,             opts.on_error_action);
        SET_BCH_SB_STR_HASH_TYPE(sb,            BCH_STR_HASH_SIPHASH);
-       SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb,       ilog2(opts.max_journal_entry_size));
 
        struct timespec now;
        if (clock_gettime(CLOCK_REALTIME, &now))
@@ -319,7 +310,6 @@ void bch2_super_print(struct bch_sb *sb, int units)
               "Version:                        %llu\n"
               "Block_size:                     %s\n"
               "Btree node size:                %s\n"
-              "Max journal entry size:         %s\n"
               "Error action:                   %s\n"
               "Clean:                          %llu\n"
 
@@ -342,7 +332,6 @@ void bch2_super_print(struct bch_sb *sb, int units)
               le64_to_cpu(sb->version),
               pr_units(le16_to_cpu(sb->block_size), units),
               pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units),
-              pr_units(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb), units),
 
               BCH_SB_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS
               ? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)]
index e5f3b8678c6c7015ce452d8bbc8e4c9916b79019..35ff73b281a77c5f0cc24cc40053a4f4e35ec26b 100644 (file)
@@ -13,7 +13,6 @@ struct format_opts {
        uuid_le         uuid;
 
        unsigned        on_error_action;
-       unsigned        max_journal_entry_size; /* will be removed */
 
        unsigned        block_size;
        unsigned        btree_node_size;
index 8d780d271864cc32882fbc690089839c31835c80..a99d96cd6297a92f251a7ce26318a96babc3b212 100644 (file)
@@ -971,7 +971,7 @@ LE64_BITMASK(BCH_SB_INODE_32BIT,    struct bch_sb, flags[1],  8,  9);
 
 LE64_BITMASK(BCH_SB_128_BIT_MACS,      struct bch_sb, flags[1],  9, 10);
 LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,   struct bch_sb, flags[1], 10, 14);
-LE64_BITMASK(BCH_SB_JOURNAL_ENTRY_SIZE,        struct bch_sb, flags[1], 14, 20);
+/* 14-20 unused, was JOURNAL_ENTRY_SIZE */
 
 LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
 LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
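
Retiring BCH_SB_JOURNAL_ENTRY_SIZE this way keeps the on-disk format
compatible: bits 14-20 of flags[1] are simply ignored in existing
superblocks rather than rejected. For reference, a LE64_BITMASK()
declaration generates a getter/setter pair over the named bit range,
roughly like the following (a sketch, not the exact macro expansion):

	static inline __u64 BCH_SB_JOURNAL_ENTRY_SIZE(const struct bch_sb *sb)
	{
		return (le64_to_cpu(sb->flags[1]) >> 14) & ~(~0ULL << (20 - 14));
	}

	static inline void SET_BCH_SB_JOURNAL_ENTRY_SIZE(struct bch_sb *sb, __u64 v)
	{
		__u64 f = le64_to_cpu(sb->flags[1]);

		f &= ~(~(~0ULL << (20 - 14)) << 14);    /* clear bits 14-19 */
		f |= (v & ~(~0ULL << (20 - 14))) << 14; /* store new value  */
		sb->flags[1] = cpu_to_le64(f);
	}

Deleting the declaration deletes both accessors, which is why the
SET_BCH_SB_JOURNAL_ENTRY_SIZE() call had to go from libbcachefs.c above.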
index 76a83fcb92bd4c9153c5e080f55b95cb6edee791..660a72837389ea591629ea3c7cc58777495fe7eb 100644 (file)
@@ -191,6 +191,12 @@ bkey_unpack_key_format_checked(const struct btree *b,
                if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                        struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
 
+                       /*
+                        * hack around a harmless race when compacting whiteouts
+                        * for a write:
+                        */
+                       dst2.needs_whiteout = dst.needs_whiteout;
+
                        BUG_ON(memcmp(&dst, &dst2, sizeof(dst)));
                }
        }
index e98d9c16c9e8cfe759982153b60623638e3d08fb..bd47aecf7b57b177b13e6dadd828a9bbf7ceef85 100644 (file)
@@ -87,6 +87,7 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
        if (!b)
                return NULL;
 
+       bkey_extent_init(&b->key);
        six_lock_init(&b->lock);
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
@@ -141,8 +142,10 @@ static inline struct btree *mca_find(struct bch_fs *c,
  * this version is for btree nodes that have already been freed (we're not
  * reaping a real btree node)
  */
-static int mca_reap_notrace(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 {
+       int ret = 0;
+
        lockdep_assert_held(&c->btree_cache_lock);
 
        if (!six_trylock_intent(&b->lock))
@@ -155,45 +158,48 @@ static int mca_reap_notrace(struct bch_fs *c, struct btree *b, bool flush)
            btree_node_noevict(b))
                goto out_unlock;
 
-       if (!list_empty(&b->write_blocked))
+       if (!btree_node_may_write(b))
                goto out_unlock;
 
-       if (!flush &&
-           (btree_node_dirty(b) ||
-            btree_node_write_in_flight(b)))
-               goto out_unlock;
+       if (btree_node_dirty(b) ||
+           btree_node_write_in_flight(b)) {
+               if (!flush)
+                       goto out_unlock;
 
-       /*
-        * Using the underscore version because we don't want to compact bsets
-        * after the write, since this node is about to be evicted - unless
-        * btree verify mode is enabled, since it runs out of the post write
-        * cleanup:
-        */
-       if (btree_node_dirty(b)) {
+               /*
+                * Using the underscore version because we don't want to compact
+                * bsets after the write, since this node is about to be evicted
+                * - unless btree verify mode is enabled, since it runs out of
+                * the post write cleanup:
+                */
                if (verify_btree_ondisk(c))
-                       bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent, -1);
+                       bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent);
                else
-                       __bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
-       }
+                       __bch2_btree_node_write(c, b, NULL, SIX_LOCK_read);
 
-       /* wait for any in flight btree write */
-       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
-                      TASK_UNINTERRUPTIBLE);
-
-       return 0;
+               /* wait for any in flight btree write */
+               btree_node_wait_on_io(b);
+       }
+out:
+       if (PTR_HASH(&b->key))
+               trace_btree_node_reap(c, b, ret);
+       return ret;
 out_unlock:
        six_unlock_write(&b->lock);
 out_unlock_intent:
        six_unlock_intent(&b->lock);
-       return -ENOMEM;
+       ret = -ENOMEM;
+       goto out;
 }
 
-static int mca_reap(struct bch_fs *c, struct btree *b, bool flush)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
 {
-       int ret = mca_reap_notrace(c, b, flush);
+       return __btree_node_reclaim(c, b, false);
+}
 
-       trace_btree_node_reap(c, b, ret);
-       return ret;
+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
+{
+       return __btree_node_reclaim(c, b, true);
 }
 
 static unsigned long bch2_mca_scan(struct shrinker *shrink,
@@ -239,7 +245,7 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink,
                        break;
 
                if (++i > 3 &&
-                   !mca_reap_notrace(c, b, false)) {
+                   !btree_node_reclaim(c, b)) {
                        mca_data_free(c, b);
                        six_unlock_write(&b->lock);
                        six_unlock_intent(&b->lock);
@@ -258,7 +264,7 @@ restart:
                }
 
                if (!btree_node_accessed(b) &&
-                   !mca_reap(c, b, false)) {
+                   !btree_node_reclaim(c, b)) {
                        /* can't call bch2_btree_node_hash_remove under btree_cache_lock  */
                        freed++;
                        if (&t->list != &c->btree_cache)
@@ -445,12 +451,12 @@ static struct btree *mca_cannibalize(struct bch_fs *c)
        struct btree *b;
 
        list_for_each_entry_reverse(b, &c->btree_cache, list)
-               if (!mca_reap(c, b, false))
+               if (!btree_node_reclaim(c, b))
                        return b;
 
        while (1) {
                list_for_each_entry_reverse(b, &c->btree_cache, list)
-                       if (!mca_reap(c, b, true))
+                       if (!btree_node_write_and_reclaim(c, b))
                                return b;
 
                /*
@@ -474,7 +480,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
         * the list. Check if there's any freed nodes there:
         */
        list_for_each_entry(b, &c->btree_cache_freeable, list)
-               if (!mca_reap_notrace(c, b, false))
+               if (!btree_node_reclaim(c, b))
                        goto out_unlock;
 
        /*
@@ -482,7 +488,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
         * disk node. Check the freed list before allocating a new one:
         */
        list_for_each_entry(b, &c->btree_cache_freed, list)
-               if (!mca_reap_notrace(c, b, false)) {
+               if (!btree_node_reclaim(c, b)) {
                        mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
                        if (b->data)
                                goto out_unlock;
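
Collapsing the mca_reap()/mca_reap_notrace() pair into __btree_node_reclaim()
also pins down the calling convention every caller above relies on: 0 means
the node was handed over with its intent and write locks held; -ENOMEM means
it was skipped (locked elsewhere, noevict, pinned by write_blocked, or dirty
when flushing wasn't allowed). Restated as a caller-side sketch:

	/* sketch of the contract the shrinker and cannibalize paths use: */
	if (!btree_node_reclaim(c, b)) {
		/* success: b is ours, intent + write locks held */
		mca_data_free(c, b);
		six_unlock_write(&b->lock);
		six_unlock_intent(&b->lock);
	}
	/* on -ENOMEM no locks are held; just try the next node */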
index e07a3f97c1131998348f09474d6c1509783fd4a1..fc06a63a4ee856b2978c91faf3f1fa4067b46254 100644 (file)
@@ -685,7 +685,7 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
                bch2_btree_build_aux_trees(n);
                six_unlock_write(&n->lock);
 
-               bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+               bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);
        }
 
        /*
index d827692bf9963027ab148b545fc7f8fa7612d1b8..b56b17350d3e9d7aa1e05e9f4b105fc78a1ca7ec 100644 (file)
@@ -1311,8 +1311,7 @@ static void btree_node_write_endio(struct bio *bio)
 
 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                            struct closure *parent,
-                           enum six_lock_type lock_type_held,
-                           int idx_to_write)
+                           enum six_lock_type lock_type_held)
 {
        struct bio *bio;
        struct bch_write_bio *wbio;
@@ -1344,14 +1343,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (!(old & (1 << BTREE_NODE_dirty)))
                        return;
 
-               if (idx_to_write >= 0 &&
-                   idx_to_write != !!(old & (1 << BTREE_NODE_write_idx)))
-                       return;
-
                if (old & (1 << BTREE_NODE_write_in_flight)) {
-                       wait_on_bit_io(&b->flags,
-                                      BTREE_NODE_write_in_flight,
-                                      TASK_UNINTERRUPTIBLE);
+                       btree_node_wait_on_io(b);
                        continue;
                }
 
@@ -1614,37 +1607,29 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
  */
 void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                          struct closure *parent,
-                         enum six_lock_type lock_type_held,
-                         int idx_to_write)
+                         enum six_lock_type lock_type_held)
 {
        BUG_ON(lock_type_held == SIX_LOCK_write);
 
        if (lock_type_held == SIX_LOCK_intent ||
            six_trylock_convert(&b->lock, SIX_LOCK_read,
                                SIX_LOCK_intent)) {
-               __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write);
+               __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent);
 
-               six_lock_write(&b->lock);
-               bch2_btree_post_write_cleanup(c, b);
-               six_unlock_write(&b->lock);
+               /* don't cycle lock unnecessarily: */
+               if (btree_node_just_written(b)) {
+                       six_lock_write(&b->lock);
+                       bch2_btree_post_write_cleanup(c, b);
+                       six_unlock_write(&b->lock);
+               }
 
                if (lock_type_held == SIX_LOCK_read)
                        six_lock_downgrade(&b->lock);
        } else {
-               __bch2_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write);
+               __bch2_btree_node_write(c, b, parent, SIX_LOCK_read);
        }
 }
 
-static void bch2_btree_node_write_dirty(struct bch_fs *c, struct btree *b,
-                                      struct closure *parent)
-{
-       six_lock_read(&b->lock);
-       BUG_ON(b->level);
-
-       bch2_btree_node_write(c, b, parent, SIX_LOCK_read, -1);
-       six_unlock_read(&b->lock);
-}
-
 /*
  * Write all dirty btree nodes to disk, including roots
  */
@@ -1654,7 +1639,7 @@ void bch2_btree_flush(struct bch_fs *c)
        struct btree *b;
        struct bucket_table *tbl;
        struct rhash_head *pos;
-       bool dropped_lock;
+       bool saw_dirty;
        unsigned i;
 
        closure_init_stack(&cl);
@@ -1662,26 +1647,27 @@ void bch2_btree_flush(struct bch_fs *c)
        rcu_read_lock();
 
        do {
-               dropped_lock = false;
+               saw_dirty = false;
                i = 0;
 restart:
                tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
                                          &c->btree_cache_table);
 
                for (; i < tbl->size; i++)
-                       rht_for_each_entry_rcu(b, pos, tbl, i, hash)
-                               /*
-                                * XXX - locking for b->level, when called from
-                                * bch2_journal_move()
-                                */
-                               if (!b->level && btree_node_dirty(b)) {
+                       rht_for_each_entry_rcu(b, pos, tbl, i, hash) {
+                               saw_dirty |= btree_node_dirty(b);
+
+                               if (btree_node_dirty(b) &&
+                                   btree_node_may_write(b)) {
                                        rcu_read_unlock();
-                                       bch2_btree_node_write_dirty(c, b, &cl);
-                                       dropped_lock = true;
+                                       six_lock_read(&b->lock);
+                                       bch2_btree_node_write_dirty(c, b, &cl, 1);
+                                       six_unlock_read(&b->lock);
                                        rcu_read_lock();
                                        goto restart;
                                }
-       } while (dropped_lock);
+                       }
+       } while (saw_dirty);
 
        rcu_read_unlock();
 
index 290fb5d718d46a4b86fb9ef2fe15ca6794b01aae..84731144b5e5d93362bf561ebe33f58193041753 100644 (file)
@@ -19,6 +19,17 @@ static inline void btree_node_io_lock(struct btree *b)
                            TASK_UNINTERRUPTIBLE);
 }
 
+static inline void btree_node_wait_on_io(struct btree *b)
+{
+       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+}
+
+static inline bool btree_node_may_write(struct btree *b)
+{
+       return list_empty_careful(&b->write_blocked);
+}
+
 enum compact_mode {
        COMPACT_LAZY,
        COMPACT_WRITTEN,
@@ -60,11 +71,28 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *,
                              struct btree_write *);
 
 void __bch2_btree_node_write(struct bch_fs *, struct btree *,
-                           struct closure *, enum six_lock_type, int);
+                           struct closure *, enum six_lock_type);
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
-                         struct closure *, enum six_lock_type, int);
+                         struct closure *, enum six_lock_type);
+
+#define bch2_btree_node_write_dirty(_c, _b, _cl, cond)                 \
+do {                                                                   \
+       while ((_b)->written && btree_node_dirty(_b) && (cond)) {       \
+               if (!btree_node_may_write(_b))                          \
+                       break;                                          \
+                                                                       \
+               if (!btree_node_write_in_flight(_b)) {                  \
+                       bch2_btree_node_write(_c, _b, _cl, SIX_LOCK_read);\
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               six_unlock_read(&(_b)->lock);                           \
+               btree_node_wait_on_io(_b);                              \
+               six_lock_read(&(_b)->lock);                             \
+       }                                                               \
+} while (0)
 
 void bch2_btree_flush(struct bch_fs *);
 void bch2_btree_node_flush_journal_entries(struct bch_fs *, struct btree *,
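
bch2_btree_node_write_dirty() needs to be a macro rather than a function: it
drops and retakes the caller's read lock around btree_node_wait_on_io(), so
the cond expression must be re-evaluated on every pass, since whatever it
tested can change while the lock is released. A minimal caller sketch,
mirroring __btree_node_flush() in btree_update.c below:

	/* caller must already hold a read lock on b: */
	six_lock_read(&b->lock);
	bch2_btree_node_write_dirty(c, b, NULL, true);
	six_unlock_read(&b->lock);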
index 51dff1b7dc77ead3067fa3740d4b264602966003..cdbc0de40a18443dfc37ccf5e9f183a7f7e7ab8d 100644 (file)
@@ -614,7 +614,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
 
        b = __btree_root_alloc(c, 0, id, reserve);
 
-       bch2_btree_node_write(c, b, writes, SIX_LOCK_intent, -1);
+       bch2_btree_node_write(c, b, writes, SIX_LOCK_intent);
 
        bch2_btree_set_root_initial(c, b, reserve);
        bch2_btree_open_bucket_put(c, b);
@@ -750,39 +750,27 @@ overwrite:
 }
 
 static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
-                              unsigned i)
+                              unsigned i, u64 seq)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct btree_write *w = container_of(pin, struct btree_write, journal);
        struct btree *b = container_of(w, struct btree, writes[i]);
 
        six_lock_read(&b->lock);
-       /*
-        * Reusing a btree node can race with the journal reclaim code calling
-        * the journal pin flush fn, and there's no good fix for this: we don't
-        * really want journal_pin_drop() to block until the flush fn is no
-        * longer running, because journal_pin_drop() is called from the btree
-        * node write endio function, and we can't wait on the flush fn to
-        * finish running in mca_reap() - where we make reused btree nodes ready
-        * to use again - because there, we're holding the lock this function
-        * needs - deadlock.
-        *
-        * So, the b->level check is a hack so we don't try to write nodes we
-        * shouldn't:
-        */
-       if (!b->level)
-               bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, i);
+       bch2_btree_node_write_dirty(c, b, NULL,
+                       (btree_current_write(b) == w &&
+                        w->journal.pin_list == journal_seq_pin(j, seq)));
        six_unlock_read(&b->lock);
 }
 
-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin)
+static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
-       return __btree_node_flush(j, pin, 0);
+       return __btree_node_flush(j, pin, 0, seq);
 }
 
-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin)
+static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
-       return __btree_node_flush(j, pin, 1);
+       return __btree_node_flush(j, pin, 1, seq);
 }
 
 void bch2_btree_journal_key(struct btree_insert *trans,
@@ -799,10 +787,11 @@ void bch2_btree_journal_key(struct btree_insert *trans,
                test_bit(JOURNAL_REPLAY_DONE, &j->flags));
 
        if (!journal_pin_active(&w->journal))
-               bch2_journal_pin_add(j, &w->journal,
-                                   btree_node_write_idx(b) == 0
-                                   ? btree_node_flush0
-                                   : btree_node_flush1);
+               bch2_journal_pin_add(j, &trans->journal_res,
+                                    &w->journal,
+                                    btree_node_write_idx(b) == 0
+                                    ? btree_node_flush0
+                                    : btree_node_flush1);
 
        if (trans->journal_res.ref) {
                u64 seq = trans->journal_res.seq;
@@ -972,9 +961,9 @@ retry:
                closure_wait(&btree_current_write(b)->wait, cl);
 
                list_del(&as->write_blocked_list);
+               mutex_unlock(&c->btree_interior_update_lock);
 
-               if (list_empty(&b->write_blocked))
-                       bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
+               bch2_btree_node_write_dirty(c, b, NULL, true);
                six_unlock_read(&b->lock);
                break;
 
@@ -991,6 +980,7 @@ retry:
                 * and then we have to wait on that btree_interior_update to finish:
                 */
                closure_wait(&as->parent_as->wait, cl);
+               mutex_unlock(&c->btree_interior_update_lock);
                break;
 
        case BTREE_INTERIOR_UPDATING_ROOT:
@@ -1017,8 +1007,9 @@ retry:
                 * can reuse the old nodes it'll have to do a journal commit:
                 */
                six_unlock_read(&b->lock);
+               mutex_unlock(&c->btree_interior_update_lock);
+               break;
        }
-       mutex_unlock(&c->btree_interior_update_lock);
 
        continue_at(cl, btree_interior_update_nodes_reachable, system_wq);
 }
@@ -1083,7 +1074,8 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
                    system_freezable_wq);
 }
 
-static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin)
+static void interior_update_flush(struct journal *j,
+                       struct journal_entry_pin *pin, u64 seq)
 {
        struct btree_interior_update *as =
                container_of(pin, struct btree_interior_update, journal);
@@ -1441,7 +1433,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
                six_unlock_write(&n2->lock);
                six_unlock_write(&n1->lock);
 
-               bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1);
+               bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent);
 
                /*
                 * Note that on recursive parent_keys == insert_keys, so we
@@ -1461,7 +1453,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
 
                        btree_split_insert_keys(iter, n3, &as->parent_keys,
                                                reserve);
-                       bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1);
+                       bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent);
                }
        } else {
                trace_btree_node_compact(c, b, b->nr.live_u64s);
@@ -1472,7 +1464,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
                bch2_keylist_add(&as->parent_keys, &n1->key);
        }
 
-       bch2_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, -1);
+       bch2_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent);
 
        /* New nodes all written, now make them visible: */
 
@@ -1773,7 +1765,7 @@ retry:
        bch2_keylist_add(&as->parent_keys, &delete);
        bch2_keylist_add(&as->parent_keys, &n->key);
 
-       bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+       bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);
 
        bch2_btree_insert_node(parent, iter, &as->parent_keys, reserve, as);
 
@@ -2323,7 +2315,7 @@ int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
 
        trace_btree_gc_rewrite_node(c, b);
 
-       bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+       bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);
 
        if (parent) {
                bch2_btree_insert_node(parent, iter,
index d3cfb00b3024f9309d6902da066ace1c872d244a..2d20061da6422065f84c239a38d02f2fa0548cd2 100644 (file)
@@ -49,7 +49,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
                        if (ca->disk_sb.bdev == bdev)
                                goto found;
 
-               ca = NULL;
+               ca = ERR_PTR(-ENOENT);
 found:
                bdput(bdev);
        }
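
Returning ERR_PTR(-ENOENT) instead of NULL changes the contract for every
caller of bch2_device_lookup(): a NULL test no longer catches the not-found
case, and the error code can propagate directly. Caller-side handling now
looks roughly like this (argument names hypothetical):

	ca = bch2_device_lookup(c, arg.dev, arg.flags);
	if (IS_ERR(ca))
		return PTR_ERR(ca);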
index a391277e7b7a5717c99fafcd30847fa03002b150..853815f87b3d74a5f0438b9b94a39e1399615475 100644 (file)
@@ -1,45 +1,30 @@
 #ifndef _BCACHE_FIFO_H
 #define _BCACHE_FIFO_H
 
+#include "util.h"
+
 #define DECLARE_FIFO(type, name)                                       \
        struct {                                                        \
                size_t front, back, size, mask;                         \
                type *data;                                             \
        } name
 
+#define fifo_buf_size(fifo)                                            \
+       (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]))
+
 #define init_fifo(fifo, _size, _gfp)                                   \
 ({                                                                     \
-       bool _ret = true;                                               \
-       gfp_t gfp_flags = (_gfp);                                       \
-                                                                       \
-       if (gfp_flags & GFP_KERNEL)                                     \
-               gfp_flags |= __GFP_NOWARN;                              \
-                                                                       \
-       (fifo)->size    = (_size);                                      \
        (fifo)->front   = (fifo)->back = 0;                             \
-       (fifo)->data    = NULL;                                         \
-                                                                       \
-       if ((fifo)->size) {                                             \
-               size_t _allocated_size, _bytes;                         \
-                                                                       \
-               _allocated_size = roundup_pow_of_two((fifo)->size);     \
-               _bytes = _allocated_size * sizeof(*(fifo)->data);       \
-                                                                       \
-               (fifo)->mask = _allocated_size - 1;                     \
-                                                                       \
-               if (_bytes < KMALLOC_MAX_SIZE)                          \
-                       (fifo)->data = kmalloc(_bytes, gfp_flags);      \
-               if ((!(fifo)->data) && (gfp_flags & GFP_KERNEL))        \
-                       (fifo)->data = vmalloc(_bytes);                 \
-               if ((!(fifo)->data))                                    \
-                       _ret = false;                                   \
-       }                                                               \
-       _ret;                                                           \
+       (fifo)->size    = (_size);                                      \
+       (fifo)->mask    = (fifo)->size                                  \
+               ? roundup_pow_of_two((fifo)->size) - 1                  \
+               : 0;                                                    \
+       (fifo)->data    = kvpmalloc(fifo_buf_size(fifo), (_gfp));       \
 })
 
 #define free_fifo(fifo)                                                        \
 do {                                                                   \
-       kvfree((fifo)->data);                                           \
+       kvpfree((fifo)->data, fifo_buf_size(fifo));                     \
        (fifo)->data = NULL;                                            \
 } while (0)
 
index 0fc680b414a461221fb48fb8498a3135886e53f7..9e2906181776f635bbefa298342743de5e8791df 100644 (file)
@@ -138,7 +138,7 @@ static inline void bch2_journal_add_prios(struct journal *j,
 }
 
 static void journal_seq_blacklist_flush(struct journal *j,
-                                       struct journal_entry_pin *pin)
+                               struct journal_entry_pin *pin, u64 seq)
 {
        struct bch_fs *c =
                container_of(j, struct bch_fs, journal);
@@ -406,7 +406,8 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
                if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                        break;
                list_del(&i->list);
-               kfree(i);
+               kvpfree(i, offsetof(struct journal_replay, j) +
+                       vstruct_bytes(&i->j));
        }
 
        list_for_each_entry_reverse(i, jlist->head, list) {
@@ -429,7 +430,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
 
        where = jlist->head;
 add:
-       i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+       i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
        if (!i) {
                ret = -ENOMEM;
                goto out;
@@ -646,12 +647,16 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
 {
        void *n;
 
+       /* the bios are sized for this many pages, max: */
+       if (new_size > JOURNAL_ENTRY_SIZE_MAX)
+               return -ENOMEM;
+
        new_size = roundup_pow_of_two(new_size);
-       n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size));
+       n = kvpmalloc(new_size, GFP_KERNEL);
        if (!n)
                return -ENOMEM;
 
-       free_pages((unsigned long) b->data, get_order(b->size));
+       kvpfree(b->data, b->size);
        b->data = n;
        b->size = new_size;
        return 0;
@@ -894,7 +899,7 @@ search_done:
                    !read_bucket(i))
                        break;
 out:
-       free_pages((unsigned long) buf.data, get_order(buf.size));
+       kvpfree(buf.data, buf.size);
        percpu_ref_put(&ca->io_ref);
        closure_return(cl);
 err:
@@ -912,7 +917,8 @@ void bch2_journal_entries_free(struct list_head *list)
                struct journal_replay *i =
                        list_first_entry(list, struct journal_replay, list);
                list_del(&i->list);
-               kvfree(i);
+               kvpfree(i, offsetof(struct journal_replay, j) +
+                       vstruct_bytes(&i->j));
        }
 }
 
@@ -958,14 +964,14 @@ static inline bool journal_has_keys(struct list_head *list)
 
 int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 {
+       struct journal *j = &c->journal;
        struct jset_entry *prio_ptrs;
        struct journal_list jlist;
        struct journal_replay *i;
-       struct jset *j;
        struct journal_entry_pin_list *p;
        struct bch_dev *ca;
        u64 cur_seq, end_seq;
-       unsigned iter;
+       unsigned iter, keys = 0, entries = 0;
        int ret = 0;
 
        closure_init_stack(&jlist.cl);
@@ -994,63 +1000,59 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        fsck_err_on(c->sb.clean && journal_has_keys(list), c,
                    "filesystem marked clean but journal has keys to replay");
 
-       j = &list_entry(list->prev, struct journal_replay, list)->j;
+       i = list_last_entry(list, struct journal_replay, list);
 
-       unfixable_fsck_err_on(le64_to_cpu(j->seq) -
-                       le64_to_cpu(j->last_seq) + 1 >
-                       c->journal.pin.size, c,
+       unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
+                       le64_to_cpu(i->j.last_seq) + 1 > j->pin.size, c,
                        "too many journal entries open for refcount fifo");
 
-       c->journal.pin.back = le64_to_cpu(j->seq) -
-               le64_to_cpu(j->last_seq) + 1;
+       atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
+       j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
 
-       atomic64_set(&c->journal.seq, le64_to_cpu(j->seq));
-       c->journal.last_seq_ondisk = le64_to_cpu(j->last_seq);
+       j->pin.front    = le64_to_cpu(i->j.last_seq);
+       j->pin.back     = le64_to_cpu(i->j.seq) + 1;
 
-       BUG_ON(last_seq(&c->journal) != le64_to_cpu(j->last_seq));
-
-       i = list_first_entry(list, struct journal_replay, list);
-
-       mutex_lock(&c->journal.blacklist_lock);
-
-       fifo_for_each_entry_ptr(p, &c->journal.pin, iter) {
-               u64 seq = journal_pin_seq(&c->journal, p);
+       BUG_ON(last_seq(j) != le64_to_cpu(i->j.last_seq));
+       BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
+              &fifo_peek_back(&j->pin));
 
+       fifo_for_each_entry_ptr(p, &j->pin, iter) {
                INIT_LIST_HEAD(&p->list);
+               atomic_set(&p->count, 0);
+       }
 
-               if (i && le64_to_cpu(i->j.seq) == seq) {
-                       atomic_set(&p->count, 1);
+       mutex_lock(&j->blacklist_lock);
 
-                       if (journal_seq_blacklist_read(&c->journal, i, p)) {
-                               mutex_unlock(&c->journal.blacklist_lock);
-                               return -ENOMEM;
-                       }
+       list_for_each_entry(i, list, list) {
+               p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
 
-                       i = list_is_last(&i->list, list)
-                               ? NULL
-                               : list_next_entry(i, list);
-               } else {
-                       atomic_set(&p->count, 0);
+               atomic_set(&p->count, 1);
+
+               if (journal_seq_blacklist_read(j, i, p)) {
+                       mutex_unlock(&j->blacklist_lock);
+                       return -ENOMEM;
                }
        }
 
-       mutex_unlock(&c->journal.blacklist_lock);
+       mutex_unlock(&j->blacklist_lock);
 
-       cur_seq = last_seq(&c->journal);
+       cur_seq = last_seq(j);
        end_seq = le64_to_cpu(list_last_entry(list,
                                struct journal_replay, list)->j.seq);
 
        list_for_each_entry(i, list, list) {
+               struct jset_entry *entry;
+               struct bkey_i *k, *_n;
                bool blacklisted;
 
-               mutex_lock(&c->journal.blacklist_lock);
+               mutex_lock(&j->blacklist_lock);
                while (cur_seq < le64_to_cpu(i->j.seq) &&
-                      journal_seq_blacklist_find(&c->journal, cur_seq))
+                      journal_seq_blacklist_find(j, cur_seq))
                        cur_seq++;
 
-               blacklisted = journal_seq_blacklist_find(&c->journal,
+               blacklisted = journal_seq_blacklist_find(j,
                                                         le64_to_cpu(i->j.seq));
-               mutex_unlock(&c->journal.blacklist_lock);
+               mutex_unlock(&j->blacklist_lock);
 
                fsck_err_on(blacklisted, c,
                            "found blacklisted journal entry %llu",
@@ -1059,17 +1061,25 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
                        "journal entries %llu-%llu missing! (replaying %llu-%llu)",
                        cur_seq, le64_to_cpu(i->j.seq) - 1,
-                       last_seq(&c->journal), end_seq);
+                       last_seq(j), end_seq);
 
                cur_seq = le64_to_cpu(i->j.seq) + 1;
+
+               for_each_jset_key(k, _n, entry, &i->j)
+                       keys++;
+               entries++;
        }
 
-       prio_ptrs = bch2_journal_find_entry(j, JOURNAL_ENTRY_PRIO_PTRS, 0);
+       bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
+                keys, entries, (u64) atomic64_read(&j->seq));
+
+       i = list_last_entry(list, struct journal_replay, list);
+       prio_ptrs = bch2_journal_find_entry(&i->j, JOURNAL_ENTRY_PRIO_PTRS, 0);
        if (prio_ptrs) {
-               memcpy_u64s(c->journal.prio_buckets,
+               memcpy_u64s(j->prio_buckets,
                            prio_ptrs->_data,
                            le16_to_cpu(prio_ptrs->u64s));
-               c->journal.nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
+               j->nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
        }
 fsck_err:
        return ret;
@@ -1105,6 +1115,9 @@ static bool journal_entry_is_open(struct journal *j)
 void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct journal_buf *w = journal_prev_buf(j);
+
+       atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
 
        if (!need_write_just_set &&
            test_bit(JOURNAL_NEED_WRITE, &j->flags))
@@ -1120,8 +1133,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
 #endif
 }
 
-static struct journal_entry_pin_list *
-__journal_entry_new(struct journal *j, int count)
+static void __journal_entry_new(struct journal *j, int count)
 {
        struct journal_entry_pin_list *p = fifo_push_ref(&j->pin);
 
@@ -1131,25 +1143,18 @@ __journal_entry_new(struct journal *j, int count)
         */
        atomic64_inc(&j->seq);
 
-       BUG_ON(journal_pin_seq(j, p) != atomic64_read(&j->seq));
+       BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
+              &fifo_peek_back(&j->pin));
 
        INIT_LIST_HEAD(&p->list);
        atomic_set(&p->count, count);
-
-       return p;
 }
 
 static void __bch2_journal_next_entry(struct journal *j)
 {
-       struct journal_entry_pin_list *p;
        struct journal_buf *buf;
 
-       p = __journal_entry_new(j, 1);
-
-       if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) {
-               smp_wmb();
-               j->cur_pin_list = p;
-       }
+       __journal_entry_new(j, 1);
 
        buf = journal_cur_buf(j);
        memset(buf->has_inode, 0, sizeof(buf->has_inode));
@@ -1181,6 +1186,8 @@ static enum {
        union journal_res_state old, new;
        u64 v = atomic64_read(&j->reservations.counter);
 
+       lockdep_assert_held(&j->lock);
+
        do {
                old.v = new.v = v;
                if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
@@ -1221,7 +1228,6 @@ static enum {
 
        BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
 
-       atomic_dec_bug(&fifo_peek_back(&j->pin).count);
        __bch2_journal_next_entry(j);
 
        cancel_delayed_work(&j->write_work);
@@ -1295,7 +1301,7 @@ static int journal_entry_sectors(struct journal *j)
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
-       unsigned sectors_available = j->entry_size_max >> 9;
+       unsigned sectors_available = UINT_MAX;
        unsigned i, nr_online = 0, nr_devs = 0;
 
        lockdep_assert_held(&j->lock);
@@ -1363,6 +1369,10 @@ static int journal_entry_open(struct journal *j)
        if (sectors <= 0)
                return sectors;
 
+       buf->disk_sectors       = sectors;
+
+       sectors = min_t(unsigned, sectors, buf->size >> 9);
+
        j->cur_buf_sectors      = sectors;
        buf->nr_prio_buckets    = j->nr_prio_buckets;
 
@@ -1464,18 +1474,15 @@ void bch2_journal_start(struct bch_fs *c)
 
 int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
 {
-       int ret = 0, keys = 0, entries = 0;
        struct journal *j = &c->journal;
        struct bkey_i *k, *_n;
        struct jset_entry *entry;
        struct journal_replay *i, *n;
+       int ret = 0, did_replay = 0;
 
        list_for_each_entry_safe(i, n, list, list) {
-               j->cur_pin_list =
-                       &j->pin.data[((j->pin.back - 1 -
-                                      (atomic64_read(&j->seq) -
-                                       le64_to_cpu(i->j.seq))) &
-                                     j->pin.mask)];
+               j->replay_pin_list =
+                       journal_seq_pin(j, le64_to_cpu(i->j.seq));
 
                for_each_jset_key(k, _n, entry, &i->j) {
                        struct disk_reservation disk_res;
@@ -1499,16 +1506,16 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
                        }
 
                        cond_resched();
-                       keys++;
+                       did_replay = true;
                }
 
-               if (atomic_dec_and_test(&j->cur_pin_list->count))
+               if (atomic_dec_and_test(&j->replay_pin_list->count))
                        wake_up(&j->wait);
-
-               entries++;
        }
 
-       if (keys) {
+       j->replay_pin_list = NULL;
+
+       if (did_replay) {
                bch2_btree_flush(c);
 
                /*
@@ -1517,17 +1524,14 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
                 * arbitrarily far in the future vs. the most recently written journal
                 * entry on disk, if we crash before writing the next journal entry:
                 */
-               ret = bch2_journal_meta(&c->journal);
+               ret = bch2_journal_meta(j);
                if (ret) {
                        bch_err(c, "journal replay: error %d flushing journal", ret);
                        goto err;
                }
        }
 
-       bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
-                keys, entries, (u64) atomic64_read(&j->seq));
-
-       bch2_journal_set_replay_done(&c->journal);
+       bch2_journal_set_replay_done(j);
 err:
        bch2_journal_entries_free(list);
        return ret;
@@ -1763,11 +1767,16 @@ static void journal_pin_add_entry(struct journal *j,
 }
 
 void bch2_journal_pin_add(struct journal *j,
-                        struct journal_entry_pin *pin,
-                        journal_pin_flush_fn flush_fn)
+                         struct journal_res *res,
+                         struct journal_entry_pin *pin,
+                         journal_pin_flush_fn flush_fn)
 {
+       struct journal_entry_pin_list *pin_list = res->ref
+               ? journal_seq_pin(j, res->seq)
+               : j->replay_pin_list;
+
        spin_lock_irq(&j->pin_lock);
-       __journal_pin_add(j, j->cur_pin_list, pin, flush_fn);
+       __journal_pin_add(j, pin_list, pin, flush_fn);
        spin_unlock_irq(&j->pin_lock);
 }
 
@@ -1828,7 +1837,7 @@ void bch2_journal_pin_add_if_older(struct journal *j,
 }
 
 static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush)
+journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;
@@ -1851,6 +1860,7 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush)
                if (ret) {
                        /* must be list_del_init(), see bch2_journal_pin_drop() */
                        list_del_init(&ret->list);
+                       *seq = journal_pin_seq(j, pin_list);
                        break;
                }
        }
@@ -1875,9 +1885,10 @@ static bool journal_has_pins(struct journal *j)
 void bch2_journal_flush_pins(struct journal *j)
 {
        struct journal_entry_pin *pin;
+       u64 seq;
 
-       while ((pin = journal_get_next_pin(j, U64_MAX)))
-               pin->flush(j, pin);
+       while ((pin = journal_get_next_pin(j, U64_MAX, &seq)))
+               pin->flush(j, pin, seq);
 
        wait_event(j->wait, !journal_has_pins(j) || bch2_journal_error(j));
 }
@@ -1920,7 +1931,7 @@ static void journal_reclaim_work(struct work_struct *work)
        struct journal *j = &c->journal;
        struct bch_dev *ca;
        struct journal_entry_pin *pin;
-       u64 seq_to_flush = 0;
+       u64 seq, seq_to_flush = 0;
        unsigned iter, bucket_to_flush;
        unsigned long next_flush;
        bool reclaim_lock_held = false, need_flush;
@@ -1994,9 +2005,9 @@ static void journal_reclaim_work(struct work_struct *work)
 
        while ((pin = journal_get_next_pin(j, need_flush
                                           ? U64_MAX
-                                          : seq_to_flush))) {
+                                          : seq_to_flush, &seq))) {
                __set_current_state(TASK_RUNNING);
-               pin->flush(j, pin);
+               pin->flush(j, pin, seq);
                need_flush = false;
 
                j->last_flushed = jiffies;
@@ -2196,17 +2207,39 @@ static void journal_write_done(struct closure *cl)
        mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
 }
 
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
+{
+       /* we aren't holding j->lock: */
+       unsigned new_size = READ_ONCE(j->buf_size_want);
+       void *new_buf;
+
+       if (buf->size >= new_size)
+               return;
+
+       new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
+       if (!new_buf)
+               return;
+
+       memcpy(new_buf, buf->data, buf->size);
+       kvpfree(buf->data, buf->size);
+       buf->data       = new_buf;
+       buf->size       = new_size;
+}
+
 static void journal_write(struct closure *cl)
 {
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_buf *w = journal_prev_buf(j);
-       struct jset *jset = w->data;
+       struct jset *jset;
        struct bio *bio;
        struct bch_extent_ptr *ptr;
        unsigned i, sectors, bytes;
 
+       journal_buf_realloc(j, w);
+       jset = w->data;
+
        j->write_start_time = local_clock();
 
        bch2_journal_add_prios(j, w);
@@ -2346,6 +2379,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
                              unsigned u64s_min, unsigned u64s_max)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct journal_buf *buf;
        int ret;
 retry:
        ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
@@ -2365,7 +2399,18 @@ retry:
        }
 
        /*
-        * Ok, no more room in the current journal entry - try to start a new
+        * If we couldn't get a reservation because the current buf filled up,
+        * and we had room for a bigger entry on disk, signal that we want to
+        * realloc the journal bufs:
+        */
+       buf = journal_cur_buf(j);
+       if (journal_entry_is_open(j) &&
+           buf->size >> 9 < buf->disk_sectors &&
+           buf->size < JOURNAL_ENTRY_SIZE_MAX)
+               j->buf_size_want = max(j->buf_size_want, buf->size << 1);
+
+       /*
+        * Close the current journal entry if necessary, then try to start a new
         * one:
         */
        switch (journal_buf_switch(j, false)) {
@@ -2765,11 +2810,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
        struct journal_device *ja = &ca->journal;
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(sb);
-       unsigned i, journal_entry_pages;
-
-       journal_entry_pages =
-               DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb),
-                            PAGE_SECTORS);
+       unsigned i;
 
        ja->nr = bch2_nr_journal_buckets(journal_buckets);
 
@@ -2777,7 +2818,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
        if (!ja->bucket_seq)
                return -ENOMEM;
 
-       ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages);
+       ca->journal.bio = bio_kmalloc(GFP_KERNEL,
+                       DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
        if (!ca->journal.bio)
                return -ENOMEM;
 
@@ -2793,17 +2835,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 void bch2_fs_journal_exit(struct journal *j)
 {
-       unsigned order = get_order(j->entry_size_max);
-
-       free_pages((unsigned long) j->buf[1].data, order);
-       free_pages((unsigned long) j->buf[0].data, order);
+       kvpfree(j->buf[1].data, j->buf[1].size);
+       kvpfree(j->buf[0].data, j->buf[0].size);
        free_fifo(&j->pin);
 }
 
-int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max)
+int bch2_fs_journal_init(struct journal *j)
 {
        static struct lock_class_key res_key;
-       unsigned order = get_order(entry_size_max);
 
        spin_lock_init(&j->lock);
        spin_lock_init(&j->pin_lock);
@@ -2817,7 +2856,8 @@ int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max)
 
        lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
 
-       j->entry_size_max       = entry_size_max;
+       j->buf[0].size          = JOURNAL_ENTRY_SIZE_MIN;
+       j->buf[1].size          = JOURNAL_ENTRY_SIZE_MIN;
        j->write_delay_ms       = 100;
        j->reclaim_delay_ms     = 100;
 
@@ -2828,9 +2868,11 @@ int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
 
        if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
-           !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
-           !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+           !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
+           !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL)))
                return -ENOMEM;
 
+       j->pin.front = j->pin.back = 1;
+
        return 0;
 }
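
kvpmalloc()/kvpfree() come from util.h, which this commit also changes,
though that hunk falls past the end of this excerpt. A plausible sketch of
the contract the call sites above assume (an assumption, not the actual
util.h code): small allocations go through kmalloc(), large ones through the
page allocator, and the caller passes the size back on free so no size
bookkeeping is needed.

	/* hedged sketch of the assumed kvpmalloc()/kvpfree() contract: */
	static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
	{
		return size < PAGE_SIZE
			? kmalloc(size, gfp_mask)
			: (void *) __get_free_pages(gfp_mask, get_order(size));
	}

	static inline void kvpfree(void *p, size_t size)
	{
		if (size < PAGE_SIZE)
			kfree(p);
		else
			free_pages((unsigned long) p, get_order(size));
	}

Note how a sketch like this would pair with the get_order() move earlier in
the commit.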
index 3825f0dc3acc25a64c4d87736788e36b89052bcc..9ad82c6081c18524a70fbdc3530e92cd09f46ac4 100644 (file)
@@ -121,15 +121,21 @@ struct journal_replay {
        struct jset             j;
 };
 
-#define JOURNAL_PIN    ((32 * 1024) - 1)
+#define JOURNAL_PIN    (32 * 1024)
 
 static inline bool journal_pin_active(struct journal_entry_pin *pin)
 {
        return pin->pin_list != NULL;
 }
 
-void bch2_journal_pin_add(struct journal *, struct journal_entry_pin *,
-                        journal_pin_flush_fn);
+static inline struct journal_entry_pin_list *
+journal_seq_pin(struct journal *j, u64 seq)
+{
+       return &j->pin.data[(size_t) seq & j->pin.mask];
+}
+
+void bch2_journal_pin_add(struct journal *, struct journal_res *,
+                         struct journal_entry_pin *, journal_pin_flush_fn);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
 void bch2_journal_pin_add_if_older(struct journal *,
                                  struct journal_entry_pin *,
@@ -343,12 +349,8 @@ int bch2_journal_replay(struct bch_fs *, struct list_head *);
 
 static inline void bch2_journal_set_replay_done(struct journal *j)
 {
-       spin_lock(&j->lock);
        BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
-
        set_bit(JOURNAL_REPLAY_DONE, &j->flags);
-       j->cur_pin_list = &fifo_peek_back(&j->pin);
-       spin_unlock(&j->lock);
 }
 
 ssize_t bch2_journal_print_debug(struct journal *, char *);
@@ -368,6 +370,6 @@ void bch2_fs_journal_stop(struct journal *);
 void bch2_dev_journal_exit(struct bch_dev *);
 int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
 void bch2_fs_journal_exit(struct journal *);
-int bch2_fs_journal_init(struct journal *, unsigned);
+int bch2_fs_journal_init(struct journal *);
 
 #endif /* _BCACHE_JOURNAL_H */
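
journal_seq_pin() is the pivot of the journal changes: pin lists are now
addressed directly by sequence number, relying on the fifo buffer being a
power of two (the JOURNAL_PIN bump from (32 * 1024) - 1 to 32 * 1024 makes
the nominal capacity match the allocation, so capacity equals mask + 1).
A worked illustration with hypothetical values:

	/* after bch2_journal_read(): front = last_seq, back = newest seq + 1 */
	j->pin.front = 1000;
	j->pin.back  = 1006;

	/* valid for front <= seq < back: */
	p = journal_seq_pin(j, 1003);   /* &j->pin.data[1003 & 32767] */

This also explains the new front = back = 1 initialization in
bch2_fs_journal_init(), presumably so the fifo indices line up with the
first live sequence number.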
index ebc340adbe2319218074e579b43c0eed58c0468c..75712aed8afed56883daf04583884fa00605cab1 100644 (file)
@@ -15,8 +15,12 @@ struct journal_res;
  */
 struct journal_buf {
        struct jset             *data;
+
        struct closure_waitlist wait;
 
+       unsigned                size;
+       unsigned                disk_sectors;
+
        /*
         * ugh, prio_buckets are stupid - need to convert them to new
         * transaction machinery when it arrives
@@ -39,7 +43,8 @@ struct journal_entry_pin_list {
 
 struct journal;
 struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *);
+typedef void (*journal_pin_flush_fn)(struct journal *j,
+                               struct journal_entry_pin *, u64);
 
 struct journal_entry_pin {
        struct list_head                list;
@@ -90,11 +95,13 @@ union journal_res_state {
        };
 };
 
-/* 4 mb, in bytes: */
-#define JOURNAL_ENTRY_SIZE_MAX         (4U << 20)
+/* bytes: */
+#define JOURNAL_ENTRY_SIZE_MIN         (64U << 10) /* 64k */
+#define JOURNAL_ENTRY_SIZE_MAX         (4U  << 20) /* 4M */
 
 /*
 * We stash some journal state as sentinel values in cur_entry_offset:
+ * note: cur_entry_offset is in units of u64s
  */
 #define JOURNAL_ENTRY_OFFSET_MAX       ((1U << 20) - 1)
 
@@ -123,7 +130,7 @@ struct journal {
        unsigned                cur_entry_u64s;
        unsigned                prev_buf_sectors;
        unsigned                cur_buf_sectors;
-       unsigned                entry_size_max; /* bytes */
+       unsigned                buf_size_want;
 
        /*
         * Two journal entries -- one is currently open for new entries, the
@@ -162,7 +169,7 @@ struct journal {
         * longer needed, the bucket can be discarded and reused.
         */
        DECLARE_FIFO(struct journal_entry_pin_list, pin);
-       struct journal_entry_pin_list *cur_pin_list;
+       struct journal_entry_pin_list *replay_pin_list;
 
        /*
         * Protects the pin lists - the fifo itself is still protected by
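Two things change in tandem here: entry_size_max becomes buf_size_want, the size the journal would like its buffers to grow to, and journal_pin_flush_fn gains the sequence number of the entry being flushed, so callbacks no longer have to recover it from the pin itself. A minimal sketch of a callback matching the new typedef; the function and its body are hypothetical:

	/* Hypothetical callback matching the new journal_pin_flush_fn
	 * signature; seq identifies the journal entry the pin belongs to: */
	static void example_pin_flush(struct journal *j,
				      struct journal_entry_pin *pin, u64 seq)
	{
		/* flush whatever state pins journal entry @seq, then: */
		bch2_journal_pin_drop(j, pin);
	}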
index 9f41d71d6c11a8753441b0405d33497b48240151..fa020af376baa81c489ca4a759930d3fd2d0c682 100644 (file)
@@ -377,13 +377,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
        if (BCH_SB_GC_RESERVE(sb) < 5)
                return "gc reserve percentage too small";
 
-       if (1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) < block_size)
-               return "max journal entry size too small";
-
-       /* 4 mb max: */
-       if (512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
-               return "max journal entry size too big";
-
        if (!sb->time_precision ||
            le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
                return "invalid time precision";
index 19f96921c6ee554a35125c1c73f8e058a248037b..6cbfc801b70b92e0a561423d03126511058054d5 100644 (file)
@@ -460,14 +460,11 @@ void bch2_fs_stop(struct bch_fs *c)
        bch2_fs_exit(c);
 }
 
-#define alloc_bucket_pages(gfp, ca)                    \
-       ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
-
 static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 {
        struct bch_sb_field_members *mi;
        struct bch_fs *c;
-       unsigned i, iter_size, journal_entry_bytes;
+       unsigned i, iter_size;
 
        c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL);
        if (!c)
@@ -555,8 +552,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        iter_size = (btree_blocks(c) + 1) * 2 *
                sizeof(struct btree_node_iter_set);
 
-       journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
-
        if (!(c->wq = alloc_workqueue("bcachefs",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcache_copygc",
@@ -583,7 +578,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bdi_setup_and_register(&c->bdi, "bcachefs") ||
            bch2_io_clock_init(&c->io_clock[READ]) ||
            bch2_io_clock_init(&c->io_clock[WRITE]) ||
-           bch2_fs_journal_init(&c->journal, journal_entry_bytes) ||
+           bch2_fs_journal_init(&c->journal) ||
            bch2_fs_btree_init(c) ||
            bch2_fs_encryption_init(c) ||
            bch2_fs_compress_init(c) ||
@@ -974,7 +969,7 @@ static void bch2_dev_free(struct bch_dev *ca)
        free_percpu(ca->sectors_written);
        bioset_exit(&ca->replica_set);
        free_percpu(ca->usage_percpu);
-       free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
+       kvpfree(ca->disk_buckets, bucket_bytes(ca));
        kfree(ca->prio_buckets);
        kfree(ca->bio_prio);
        vfree(ca->buckets);
@@ -1144,7 +1139,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
                                          ca->mi.nbuckets)) ||
            !(ca->prio_buckets  = kzalloc(sizeof(u64) * prio_buckets(ca) *
                                          2, GFP_KERNEL)) ||
-           !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)) ||
+           !(ca->disk_buckets  = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
            !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
            !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
            bioset_init(&ca->replica_set, 4,
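One detail worth flagging at this call site: the removed alloc_bucket_pages() macro passed __GFP_ZERO, whereas kvpmalloc() zeroes only if asked. If zeroed disk_buckets were still required, the flag would travel in gfp_mask; a hedged sketch (whether this commit relies on zeroing elsewhere is not visible in the diff):

	/* all three kvpmalloc() paths (kmalloc, __get_free_pages,
	 * __vmalloc) honour __GFP_ZERO, so zeroed memory is one flag away: */
	ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL|__GFP_ZERO);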
index a0b9faeb96b25db679604898cd49d7ac6b59e451..808b3089362d3a58bb311efdb997d2882aa4e0da 100644 (file)
@@ -175,7 +175,6 @@ read_attribute(cache_read_races);
 
 rw_attribute(journal_write_delay_ms);
 rw_attribute(journal_reclaim_delay_ms);
-read_attribute(journal_entry_size_max);
 
 rw_attribute(discard);
 rw_attribute(cache_replacement_policy);
@@ -406,7 +405,6 @@ SHOW(bch2_fs)
 
        sysfs_print(journal_write_delay_ms,     c->journal.write_delay_ms);
        sysfs_print(journal_reclaim_delay_ms,   c->journal.reclaim_delay_ms);
-       sysfs_hprint(journal_entry_size_max,    c->journal.entry_size_max);
 
        sysfs_hprint(block_size,                block_bytes(c));
        sysfs_print(block_size_bytes,           block_bytes(c));
@@ -561,7 +559,6 @@ SYSFS_OPS(bch2_fs);
 struct attribute *bch2_fs_files[] = {
        &sysfs_journal_write_delay_ms,
        &sysfs_journal_reclaim_delay_ms,
-       &sysfs_journal_entry_size_max,
 
        &sysfs_block_size,
        &sysfs_block_size_bytes,
index 5f13c8244a78784c1977c0ae894b45e71ec41d19..5669cb8abf42766b954be91076cc82063159ddc9 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/freezer.h>
 #include <linux/kernel.h>
 #include <linux/llist.h>
+#include <linux/log2.h>
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
@@ -78,16 +79,22 @@ do {                                                                        \
        (__builtin_types_compatible_p(typeof(_val), _type) ||           \
         __builtin_types_compatible_p(typeof(_val), const _type))
 
-static inline void *kvmalloc(size_t bytes, gfp_t gfp)
+static inline void kvpfree(void *p, size_t size)
 {
-       if (bytes <= PAGE_SIZE ||
-           !(gfp & GFP_KERNEL))
-               return kmalloc(bytes, gfp);
-
-       return ((bytes <= KMALLOC_MAX_SIZE)
-               ? kmalloc(bytes, gfp|__GFP_NOWARN)
-               : NULL) ?:
-               vmalloc(bytes);
+       if (size < PAGE_SIZE)
+               kfree(p);
+       else if (is_vmalloc_addr(p))
+               vfree(p);
+       else
+               free_pages((unsigned long) p, get_order(size));
+}
+
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+{
+       return size < PAGE_SIZE ? kmalloc(size, gfp_mask)
+               :  (void *) __get_free_pages(gfp_mask, get_order(size))
+               ?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
 }
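Unlike the kernel's kvfree(), kvpfree() cannot tell from the pointer alone how a page-allocator allocation was sized: free_pages() needs the order back. That is why every caller in this commit records the byte size (buf->size, bucket_bytes(ca), the heap size below) and passes it at free time. A small usage sketch under those assumptions:

	/* sketch: sub-page sizes go through kmalloc()/kfree(), page-sized
	 * and larger through the page allocator, with vmalloc as fallback;
	 * the size at free time must match the size at allocation time: */
	size_t bytes = 256 << 10;		/* 256k: page-allocator path */
	void *p = kvpmalloc(bytes, GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	/* ... */
	kvpfree(p, bytes);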
 
 #define DECLARE_HEAP(type, name)                                       \
@@ -98,17 +105,15 @@ static inline void *kvmalloc(size_t bytes, gfp_t gfp)
 
 #define init_heap(heap, _size, gfp)                                    \
 ({                                                                     \
-       size_t _bytes;                                                  \
        (heap)->used = 0;                                               \
        (heap)->size = (_size);                                         \
-       _bytes = (heap)->size * sizeof(*(heap)->data);                  \
-       (heap)->data = kvmalloc(_bytes, (gfp));                         \
-       (heap)->data;                                                   \
+       (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+                                (gfp));                                \
 })
 
 #define free_heap(heap)                                                        \
 do {                                                                   \
-       kvfree((heap)->data);                                           \
+       kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0]));  \
        (heap)->data = NULL;                                            \
 } while (0)
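Because free_heap() now computes the byte count for kvpfree() from (heap)->size, the size recorded by init_heap() must still be intact when the heap is freed. A usage sketch; the heap layout follows DECLARE_HEAP, while the element type and count are hypothetical:

	DECLARE_HEAP(u64, example_heap);

	/* init_heap() evaluates to (heap)->data, so the allocation can be
	 * checked directly: */
	if (!init_heap(&example_heap, 1024, GFP_KERNEL))
		return -ENOMEM;

	/* ... use example_heap.data[0..used) ... */

	free_heap(&example_heap);	/* kvpfree(data, 1024 * sizeof(u64)) */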