-297c81ae4d608707fdabedc60158ff1f4fbec257
+da037866e669b09edc6b049ce09535d3456474cb
x(0, encrypted, NULL, "Enable whole filesystem encryption (chacha20/poly1305)")\
x(0, no_passphrase, NULL, "Don't encrypt master encryption key")\
x('e', error_action, "(continue|readonly|panic)", NULL) \
-x(0, max_journal_entry_size, "size", NULL) \
x('L', label, "label", NULL) \
x('U', uuid, "uuid", NULL) \
x('f', force, NULL, NULL) \
" --no_passphrase Don't encrypt master encryption key\n"
" --error_action=(continue|readonly|panic)\n"
" Action to take on filesystem error\n"
- " --max_journal_entry_size=size\n"
" -l, --label=label\n"
" --uuid=uuid\n"
" -f, --force\n"
read_string_list_or_die(optarg,
bch2_error_actions, "error action");
break;
- case O_max_journal_entry_size:
- opts.max_journal_entry_size =
- hatoi_validate(optarg, "journal entry size");
- break;
case O_label:
case 'L':
opts.label = strdup(optarg);
return 1UL << (fls_long(n) - 1);
}
-static inline __attribute_const__
-int __get_order(unsigned long size)
-{
- int order;
-
- size--;
- size >>= PAGE_SHIFT;
-#if BITS_PER_LONG == 32
- order = fls(size);
-#else
- order = fls64(size);
-#endif
- return order;
-}
-
-#define get_order(n) \
-( \
- __builtin_constant_p(n) ? ( \
- ((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \
- (((n) < (1UL << PAGE_SHIFT)) ? 0 : \
- ilog2((n) - 1) - PAGE_SHIFT + 1) \
- ) : \
- __get_order(n) \
-)
-
#endif
__rounddown_pow_of_two(n) \
)
+static inline __attribute_const__
+int __get_order(unsigned long size)
+{
+ int order;
+
+ size--;
+ size >>= PAGE_SHIFT;
+#if BITS_PER_LONG == 32
+ order = fls(size);
+#else
+ order = fls64(size);
+#endif
+ return order;
+}
+
+#define get_order(n) \
+( \
+ __builtin_constant_p(n) ? ( \
+ ((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \
+ (((n) < (1UL << PAGE_SHIFT)) ? 0 : \
+ ilog2((n) - 1) - PAGE_SHIFT + 1) \
+ ) : \
+ __get_order(n) \
+)
+
#endif /* _TOOLS_LINUX_LOG2_H */
min(opts.btree_node_size, i->bucket_size);
}
- if (!opts.max_journal_entry_size) {
- /* 2 MB default: */
- opts.max_journal_entry_size = 4096;
- }
-
- opts.max_journal_entry_size =
- roundup_pow_of_two(opts.max_journal_entry_size);
-
if (uuid_is_null(opts.uuid.b))
uuid_generate(opts.uuid.b);
SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required);
SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action);
SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH);
- SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(opts.max_journal_entry_size));
struct timespec now;
if (clock_gettime(CLOCK_REALTIME, &now))
"Version: %llu\n"
"Block_size: %s\n"
"Btree node size: %s\n"
- "Max journal entry size: %s\n"
"Error action: %s\n"
"Clean: %llu\n"
le64_to_cpu(sb->version),
pr_units(le16_to_cpu(sb->block_size), units),
pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units),
- pr_units(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb), units),
BCH_SB_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS
? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)]
uuid_le uuid;
unsigned on_error_action;
- unsigned max_journal_entry_size; /* will be removed */
unsigned block_size;
unsigned btree_node_size;
LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
-LE64_BITMASK(BCH_SB_JOURNAL_ENTRY_SIZE, struct bch_sb, flags[1], 14, 20);
+/* 14-20 unused, was JOURNAL_ENTRY_SIZE */
LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+ /*
+ * hack around a harmless race when compacting whiteouts
+ * for a write:
+ */
+ dst2.needs_whiteout = dst.needs_whiteout;
+
BUG_ON(memcmp(&dst, &dst2, sizeof(dst)));
}
}
if (!b)
return NULL;
+ bkey_extent_init(&b->key);
six_lock_init(&b->lock);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
-static int mca_reap_notrace(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
+ int ret = 0;
+
lockdep_assert_held(&c->btree_cache_lock);
if (!six_trylock_intent(&b->lock))
btree_node_noevict(b))
goto out_unlock;
- if (!list_empty(&b->write_blocked))
+ if (!btree_node_may_write(b))
goto out_unlock;
- if (!flush &&
- (btree_node_dirty(b) ||
- btree_node_write_in_flight(b)))
- goto out_unlock;
+ if (btree_node_dirty(b) ||
+ btree_node_write_in_flight(b)) {
+ if (!flush)
+ goto out_unlock;
- /*
- * Using the underscore version because we don't want to compact bsets
- * after the write, since this node is about to be evicted - unless
- * btree verify mode is enabled, since it runs out of the post write
- * cleanup:
- */
- if (btree_node_dirty(b)) {
+ /*
+ * Using the underscore version because we don't want to compact
+ * bsets after the write, since this node is about to be evicted
+ * - unless btree verify mode is enabled, since it runs out of
+ * the post write cleanup:
+ */
if (verify_btree_ondisk(c))
- bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent, -1);
+ bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent);
else
- __bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
- }
+ __bch2_btree_node_write(c, b, NULL, SIX_LOCK_read);
- /* wait for any in flight btree write */
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-
- return 0;
+ /* wait for any in flight btree write */
+ btree_node_wait_on_io(b);
+ }
+out:
+ if (PTR_HASH(&b->key))
+ trace_btree_node_reap(c, b, ret);
+ return ret;
out_unlock:
six_unlock_write(&b->lock);
out_unlock_intent:
six_unlock_intent(&b->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
-static int mca_reap(struct bch_fs *c, struct btree *b, bool flush)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
- int ret = mca_reap_notrace(c, b, flush);
+ return __btree_node_reclaim(c, b, false);
+}
- trace_btree_node_reap(c, b, ret);
- return ret;
+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
+{
+ return __btree_node_reclaim(c, b, true);
}
static unsigned long bch2_mca_scan(struct shrinker *shrink,
break;
if (++i > 3 &&
- !mca_reap_notrace(c, b, false)) {
+ !btree_node_reclaim(c, b)) {
mca_data_free(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
}
if (!btree_node_accessed(b) &&
- !mca_reap(c, b, false)) {
+ !btree_node_reclaim(c, b)) {
/* can't call bch2_btree_node_hash_remove under btree_cache_lock */
freed++;
if (&t->list != &c->btree_cache)
struct btree *b;
list_for_each_entry_reverse(b, &c->btree_cache, list)
- if (!mca_reap(c, b, false))
+ if (!btree_node_reclaim(c, b))
return b;
while (1) {
list_for_each_entry_reverse(b, &c->btree_cache, list)
- if (!mca_reap(c, b, true))
+ if (!btree_node_write_and_reclaim(c, b))
return b;
/*
* the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b, &c->btree_cache_freeable, list)
- if (!mca_reap_notrace(c, b, false))
+ if (!btree_node_reclaim(c, b))
goto out_unlock;
/*
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, &c->btree_cache_freed, list)
- if (!mca_reap_notrace(c, b, false)) {
+ if (!btree_node_reclaim(c, b)) {
mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
if (b->data)
goto out_unlock;
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->lock);
- bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+ bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);
}
/*
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
struct closure *parent,
- enum six_lock_type lock_type_held,
- int idx_to_write)
+ enum six_lock_type lock_type_held)
{
struct bio *bio;
struct bch_write_bio *wbio;
if (!(old & (1 << BTREE_NODE_dirty)))
return;
- if (idx_to_write >= 0 &&
- idx_to_write != !!(old & (1 << BTREE_NODE_write_idx)))
- return;
-
if (old & (1 << BTREE_NODE_write_in_flight)) {
- wait_on_bit_io(&b->flags,
- BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
+ btree_node_wait_on_io(b);
continue;
}
*/
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
struct closure *parent,
- enum six_lock_type lock_type_held,
- int idx_to_write)
+ enum six_lock_type lock_type_held)
{
BUG_ON(lock_type_held == SIX_LOCK_write);
if (lock_type_held == SIX_LOCK_intent ||
six_trylock_convert(&b->lock, SIX_LOCK_read,
SIX_LOCK_intent)) {
- __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write);
+ __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent);
- six_lock_write(&b->lock);
- bch2_btree_post_write_cleanup(c, b);
- six_unlock_write(&b->lock);
+ /* don't cycle lock unnecessarily: */
+ if (btree_node_just_written(b)) {
+ six_lock_write(&b->lock);
+ bch2_btree_post_write_cleanup(c, b);
+ six_unlock_write(&b->lock);
+ }
if (lock_type_held == SIX_LOCK_read)
six_lock_downgrade(&b->lock);
} else {
- __bch2_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write);
+ __bch2_btree_node_write(c, b, parent, SIX_LOCK_read);
}
}
-static void bch2_btree_node_write_dirty(struct bch_fs *c, struct btree *b,
- struct closure *parent)
-{
- six_lock_read(&b->lock);
- BUG_ON(b->level);
-
- bch2_btree_node_write(c, b, parent, SIX_LOCK_read, -1);
- six_unlock_read(&b->lock);
-}
-
/*
* Write all dirty btree nodes to disk, including roots
*/
struct btree *b;
struct bucket_table *tbl;
struct rhash_head *pos;
- bool dropped_lock;
+ bool saw_dirty;
unsigned i;
closure_init_stack(&cl);
rcu_read_lock();
do {
- dropped_lock = false;
+ saw_dirty = false;
i = 0;
restart:
tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
&c->btree_cache_table);
for (; i < tbl->size; i++)
- rht_for_each_entry_rcu(b, pos, tbl, i, hash)
- /*
- * XXX - locking for b->level, when called from
- * bch2_journal_move()
- */
- if (!b->level && btree_node_dirty(b)) {
+ rht_for_each_entry_rcu(b, pos, tbl, i, hash) {
+ saw_dirty |= btree_node_dirty(b);
+
+ if (btree_node_dirty(b) &&
+ btree_node_may_write(b)) {
rcu_read_unlock();
- bch2_btree_node_write_dirty(c, b, &cl);
- dropped_lock = true;
+ six_lock_read(&b->lock);
+ bch2_btree_node_write_dirty(c, b, &cl, 1);
+ six_unlock_read(&b->lock);
rcu_read_lock();
goto restart;
}
- } while (dropped_lock);
+ }
+ } while (saw_dirty);
rcu_read_unlock();
TASK_UNINTERRUPTIBLE);
}
+static inline void btree_node_wait_on_io(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
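+/*
+ * a node's write is blocked while btree_interior_updates on ->write_blocked
+ * are still unfinished:
+ */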
+static inline bool btree_node_may_write(struct btree *b)
+{
+ return list_empty_careful(&b->write_blocked);
+}
+
enum compact_mode {
COMPACT_LAZY,
COMPACT_WRITTEN,
struct btree_write *);
void __bch2_btree_node_write(struct bch_fs *, struct btree *,
- struct closure *, enum six_lock_type, int);
+ struct closure *, enum six_lock_type);
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
- struct closure *, enum six_lock_type, int);
+ struct closure *, enum six_lock_type);
+
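+/*
+ * Write out a dirty node, waiting for any write already in flight (dropping
+ * and retaking the read lock while waiting); loops while the node already has
+ * data on disk (->written), is still dirty, and @cond holds:
+ */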
+#define bch2_btree_node_write_dirty(_c, _b, _cl, cond) \
+do { \
+ while ((_b)->written && btree_node_dirty(_b) && (cond)) { \
+ if (!btree_node_may_write(_b)) \
+ break; \
+ \
+ if (!btree_node_write_in_flight(_b)) { \
+ bch2_btree_node_write(_c, _b, _cl, SIX_LOCK_read);\
+ break; \
+ } \
+ \
+ six_unlock_read(&(_b)->lock); \
+ btree_node_wait_on_io(_b); \
+ six_lock_read(&(_b)->lock); \
+ } \
+} while (0)
void bch2_btree_flush(struct bch_fs *);
void bch2_btree_node_flush_journal_entries(struct bch_fs *, struct btree *,
b = __btree_root_alloc(c, 0, id, reserve);
- bch2_btree_node_write(c, b, writes, SIX_LOCK_intent, -1);
+ bch2_btree_node_write(c, b, writes, SIX_LOCK_intent);
bch2_btree_set_root_initial(c, b, reserve);
bch2_btree_open_bucket_put(c, b);
}
static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
- unsigned i)
+ unsigned i, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
six_lock_read(&b->lock);
- /*
- * Reusing a btree node can race with the journal reclaim code calling
- * the journal pin flush fn, and there's no good fix for this: we don't
- * really want journal_pin_drop() to block until the flush fn is no
- * longer running, because journal_pin_drop() is called from the btree
- * node write endio function, and we can't wait on the flush fn to
- * finish running in mca_reap() - where we make reused btree nodes ready
- * to use again - because there, we're holding the lock this function
- * needs - deadlock.
- *
- * So, the b->level check is a hack so we don't try to write nodes we
- * shouldn't:
- */
- if (!b->level)
- bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, i);
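+	/*
+	 * only write if this btree_write is still the node's current write and
+	 * still pins the journal sequence we were asked to flush:
+	 */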
+ bch2_btree_node_write_dirty(c, b, NULL,
+ (btree_current_write(b) == w &&
+ w->journal.pin_list == journal_seq_pin(j, seq)));
six_unlock_read(&b->lock);
}
-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin)
+static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
- return __btree_node_flush(j, pin, 0);
+ return __btree_node_flush(j, pin, 0, seq);
}
-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin)
+static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
- return __btree_node_flush(j, pin, 1);
+ return __btree_node_flush(j, pin, 1, seq);
}
void bch2_btree_journal_key(struct btree_insert *trans,
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
if (!journal_pin_active(&w->journal))
- bch2_journal_pin_add(j, &w->journal,
- btree_node_write_idx(b) == 0
- ? btree_node_flush0
- : btree_node_flush1);
+ bch2_journal_pin_add(j, &trans->journal_res,
+ &w->journal,
+ btree_node_write_idx(b) == 0
+ ? btree_node_flush0
+ : btree_node_flush1);
if (trans->journal_res.ref) {
u64 seq = trans->journal_res.seq;
closure_wait(&btree_current_write(b)->wait, cl);
list_del(&as->write_blocked_list);
+ mutex_unlock(&c->btree_interior_update_lock);
- if (list_empty(&b->write_blocked))
- bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
+ bch2_btree_node_write_dirty(c, b, NULL, true);
six_unlock_read(&b->lock);
break;
* and then we have to wait on that btree_interior_update to finish:
*/
closure_wait(&as->parent_as->wait, cl);
+ mutex_unlock(&c->btree_interior_update_lock);
break;
case BTREE_INTERIOR_UPDATING_ROOT:
* can reuse the old nodes it'll have to do a journal commit:
*/
six_unlock_read(&b->lock);
+ mutex_unlock(&c->btree_interior_update_lock);
+ break;
}
- mutex_unlock(&c->btree_interior_update_lock);
continue_at(cl, btree_interior_update_nodes_reachable, system_wq);
}
system_freezable_wq);
}
-static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin)
+static void interior_update_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
{
struct btree_interior_update *as =
container_of(pin, struct btree_interior_update, journal);
six_unlock_write(&n2->lock);
six_unlock_write(&n1->lock);
- bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1);
+ bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent);
/*
* Note that on recursive parent_keys == insert_keys, so we
btree_split_insert_keys(iter, n3, &as->parent_keys,
reserve);
- bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1);
+ bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent);
}
} else {
trace_btree_node_compact(c, b, b->nr.live_u64s);
bch2_keylist_add(&as->parent_keys, &n1->key);
}
- bch2_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, -1);
+ bch2_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent);
/* New nodes all written, now make them visible: */
bch2_keylist_add(&as->parent_keys, &delete);
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+ bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);
bch2_btree_insert_node(parent, iter, &as->parent_keys, reserve, as);
trace_btree_gc_rewrite_node(c, b);
- bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
+ bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);
if (parent) {
bch2_btree_insert_node(parent, iter,
if (ca->disk_sb.bdev == bdev)
goto found;
- ca = NULL;
+ ca = ERR_PTR(-ENOENT);
found:
bdput(bdev);
}
#ifndef _BCACHE_FIFO_H
#define _BCACHE_FIFO_H
+#include "util.h"
+
#define DECLARE_FIFO(type, name) \
struct { \
size_t front, back, size, mask; \
type *data; \
} name
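+/* size, in bytes, of the fifo's backing buffer: */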
+#define fifo_buf_size(fifo) \
+ (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]))
+
#define init_fifo(fifo, _size, _gfp) \
({ \
- bool _ret = true; \
- gfp_t gfp_flags = (_gfp); \
- \
- if (gfp_flags & GFP_KERNEL) \
- gfp_flags |= __GFP_NOWARN; \
- \
- (fifo)->size = (_size); \
(fifo)->front = (fifo)->back = 0; \
- (fifo)->data = NULL; \
- \
- if ((fifo)->size) { \
- size_t _allocated_size, _bytes; \
- \
- _allocated_size = roundup_pow_of_two((fifo)->size); \
- _bytes = _allocated_size * sizeof(*(fifo)->data); \
- \
- (fifo)->mask = _allocated_size - 1; \
- \
- if (_bytes < KMALLOC_MAX_SIZE) \
- (fifo)->data = kmalloc(_bytes, gfp_flags); \
- if ((!(fifo)->data) && (gfp_flags & GFP_KERNEL)) \
- (fifo)->data = vmalloc(_bytes); \
- if ((!(fifo)->data)) \
- _ret = false; \
- } \
- _ret; \
+ (fifo)->size = (_size); \
+ (fifo)->mask = (fifo)->size \
+ ? roundup_pow_of_two((fifo)->size) - 1 \
+ : 0; \
+ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
})
#define free_fifo(fifo) \
do { \
- kvfree((fifo)->data); \
+ kvpfree((fifo)->data, fifo_buf_size(fifo)); \
(fifo)->data = NULL; \
} while (0)
}
static void journal_seq_blacklist_flush(struct journal *j,
- struct journal_entry_pin *pin)
+ struct journal_entry_pin *pin, u64 seq)
{
struct bch_fs *c =
container_of(j, struct bch_fs, journal);
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
list_del(&i->list);
- kfree(i);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
}
list_for_each_entry_reverse(i, jlist->head, list) {
where = jlist->head;
add:
- i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i) {
ret = -ENOMEM;
goto out;
{
void *n;
+ /* the bios are sized for this many pages, max: */
+ if (new_size > JOURNAL_ENTRY_SIZE_MAX)
+ return -ENOMEM;
+
new_size = roundup_pow_of_two(new_size);
- n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size));
+ n = kvpmalloc(new_size, GFP_KERNEL);
if (!n)
return -ENOMEM;
- free_pages((unsigned long) b->data, get_order(b->size));
+ kvpfree(b->data, b->size);
b->data = n;
b->size = new_size;
return 0;
!read_bucket(i))
break;
out:
- free_pages((unsigned long) buf.data, get_order(buf.size));
+ kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
err:
struct journal_replay *i =
list_first_entry(list, struct journal_replay, list);
list_del(&i->list);
- kvfree(i);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
}
}
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
+ struct journal *j = &c->journal;
struct jset_entry *prio_ptrs;
struct journal_list jlist;
struct journal_replay *i;
- struct jset *j;
struct journal_entry_pin_list *p;
struct bch_dev *ca;
u64 cur_seq, end_seq;
- unsigned iter;
+ unsigned iter, keys = 0, entries = 0;
int ret = 0;
closure_init_stack(&jlist.cl);
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay");
- j = &list_entry(list->prev, struct journal_replay, list)->j;
+ i = list_last_entry(list, struct journal_replay, list);
- unfixable_fsck_err_on(le64_to_cpu(j->seq) -
- le64_to_cpu(j->last_seq) + 1 >
- c->journal.pin.size, c,
+ unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
+ le64_to_cpu(i->j.last_seq) + 1 > j->pin.size, c,
"too many journal entries open for refcount fifo");
- c->journal.pin.back = le64_to_cpu(j->seq) -
- le64_to_cpu(j->last_seq) + 1;
+ atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
+ j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
- atomic64_set(&c->journal.seq, le64_to_cpu(j->seq));
- c->journal.last_seq_ondisk = le64_to_cpu(j->last_seq);
+ j->pin.front = le64_to_cpu(i->j.last_seq);
+ j->pin.back = le64_to_cpu(i->j.seq) + 1;
- BUG_ON(last_seq(&c->journal) != le64_to_cpu(j->last_seq));
-
- i = list_first_entry(list, struct journal_replay, list);
-
- mutex_lock(&c->journal.blacklist_lock);
-
- fifo_for_each_entry_ptr(p, &c->journal.pin, iter) {
- u64 seq = journal_pin_seq(&c->journal, p);
+ BUG_ON(last_seq(j) != le64_to_cpu(i->j.last_seq));
+ BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
+ &fifo_peek_back(&j->pin));
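+
+	/*
+	 * clear every pin list; entries we actually read get their counts set
+	 * below:
+	 */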
+ fifo_for_each_entry_ptr(p, &j->pin, iter) {
INIT_LIST_HEAD(&p->list);
+ atomic_set(&p->count, 0);
+ }
- if (i && le64_to_cpu(i->j.seq) == seq) {
- atomic_set(&p->count, 1);
+ mutex_lock(&j->blacklist_lock);
- if (journal_seq_blacklist_read(&c->journal, i, p)) {
- mutex_unlock(&c->journal.blacklist_lock);
- return -ENOMEM;
- }
+ list_for_each_entry(i, list, list) {
+ p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
- i = list_is_last(&i->list, list)
- ? NULL
- : list_next_entry(i, list);
- } else {
- atomic_set(&p->count, 0);
+ atomic_set(&p->count, 1);
+
+ if (journal_seq_blacklist_read(j, i, p)) {
+ mutex_unlock(&j->blacklist_lock);
+ return -ENOMEM;
}
}
- mutex_unlock(&c->journal.blacklist_lock);
+ mutex_unlock(&j->blacklist_lock);
- cur_seq = last_seq(&c->journal);
+ cur_seq = last_seq(j);
end_seq = le64_to_cpu(list_last_entry(list,
struct journal_replay, list)->j.seq);
list_for_each_entry(i, list, list) {
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
bool blacklisted;
- mutex_lock(&c->journal.blacklist_lock);
+ mutex_lock(&j->blacklist_lock);
while (cur_seq < le64_to_cpu(i->j.seq) &&
- journal_seq_blacklist_find(&c->journal, cur_seq))
+ journal_seq_blacklist_find(j, cur_seq))
cur_seq++;
- blacklisted = journal_seq_blacklist_find(&c->journal,
+ blacklisted = journal_seq_blacklist_find(j,
le64_to_cpu(i->j.seq));
- mutex_unlock(&c->journal.blacklist_lock);
+ mutex_unlock(&j->blacklist_lock);
fsck_err_on(blacklisted, c,
"found blacklisted journal entry %llu",
fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
"journal entries %llu-%llu missing! (replaying %llu-%llu)",
cur_seq, le64_to_cpu(i->j.seq) - 1,
- last_seq(&c->journal), end_seq);
+ last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
+
+ for_each_jset_key(k, _n, entry, &i->j)
+ keys++;
+ entries++;
}
- prio_ptrs = bch2_journal_find_entry(j, JOURNAL_ENTRY_PRIO_PTRS, 0);
+ bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
+ keys, entries, (u64) atomic64_read(&j->seq));
+
+ i = list_last_entry(list, struct journal_replay, list);
+ prio_ptrs = bch2_journal_find_entry(&i->j, JOURNAL_ENTRY_PRIO_PTRS, 0);
if (prio_ptrs) {
- memcpy_u64s(c->journal.prio_buckets,
+ memcpy_u64s(j->prio_buckets,
prio_ptrs->_data,
le16_to_cpu(prio_ptrs->u64s));
- c->journal.nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
+ j->nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
}
fsck_err:
return ret;
void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *w = journal_prev_buf(j);
+
+ atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
#endif
}
-static struct journal_entry_pin_list *
-__journal_entry_new(struct journal *j, int count)
+static void __journal_entry_new(struct journal *j, int count)
{
struct journal_entry_pin_list *p = fifo_push_ref(&j->pin);
*/
atomic64_inc(&j->seq);
- BUG_ON(journal_pin_seq(j, p) != atomic64_read(&j->seq));
+ BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
+ &fifo_peek_back(&j->pin));
INIT_LIST_HEAD(&p->list);
atomic_set(&p->count, count);
-
- return p;
}
static void __bch2_journal_next_entry(struct journal *j)
{
- struct journal_entry_pin_list *p;
struct journal_buf *buf;
- p = __journal_entry_new(j, 1);
-
- if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) {
- smp_wmb();
- j->cur_pin_list = p;
- }
+ __journal_entry_new(j, 1);
buf = journal_cur_buf(j);
memset(buf->has_inode, 0, sizeof(buf->has_inode));
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
+ lockdep_assert_held(&j->lock);
+
do {
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
- atomic_dec_bug(&fifo_peek_back(&j->pin).count);
__bch2_journal_next_entry(j);
cancel_delayed_work(&j->write_work);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
- unsigned sectors_available = j->entry_size_max >> 9;
+ unsigned sectors_available = UINT_MAX;
unsigned i, nr_online = 0, nr_devs = 0;
lockdep_assert_held(&j->lock);
if (sectors <= 0)
return sectors;
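+	/*
+	 * remember how much room there is on disk for this entry, but don't
+	 * open an entry bigger than the current in memory buffer:
+	 */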
+ buf->disk_sectors = sectors;
+
+ sectors = min_t(unsigned, sectors, buf->size >> 9);
+
j->cur_buf_sectors = sectors;
buf->nr_prio_buckets = j->nr_prio_buckets;
int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
{
- int ret = 0, keys = 0, entries = 0;
struct journal *j = &c->journal;
struct bkey_i *k, *_n;
struct jset_entry *entry;
struct journal_replay *i, *n;
+ int ret = 0, did_replay = 0;
list_for_each_entry_safe(i, n, list, list) {
- j->cur_pin_list =
- &j->pin.data[((j->pin.back - 1 -
- (atomic64_read(&j->seq) -
- le64_to_cpu(i->j.seq))) &
- j->pin.mask)];
+ j->replay_pin_list =
+ journal_seq_pin(j, le64_to_cpu(i->j.seq));
for_each_jset_key(k, _n, entry, &i->j) {
struct disk_reservation disk_res;
}
cond_resched();
- keys++;
+ did_replay = true;
}
- if (atomic_dec_and_test(&j->cur_pin_list->count))
+ if (atomic_dec_and_test(&j->replay_pin_list->count))
wake_up(&j->wait);
-
- entries++;
}
- if (keys) {
+ j->replay_pin_list = NULL;
+
+ if (did_replay) {
bch2_btree_flush(c);
/*
* arbitrarily far in the future vs. the most recently written journal
* entry on disk, if we crash before writing the next journal entry:
*/
- ret = bch2_journal_meta(&c->journal);
+ ret = bch2_journal_meta(j);
if (ret) {
bch_err(c, "journal replay: error %d flushing journal", ret);
goto err;
}
}
- bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
- keys, entries, (u64) atomic64_read(&j->seq));
-
- bch2_journal_set_replay_done(&c->journal);
+ bch2_journal_set_replay_done(j);
err:
bch2_journal_entries_free(list);
return ret;
}
void bch2_journal_pin_add(struct journal *j,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+ struct journal_res *res,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
{
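+	/*
+	 * during journal replay there is no open journal reservation; pin
+	 * against the entry currently being replayed instead:
+	 */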
+ struct journal_entry_pin_list *pin_list = res->ref
+ ? journal_seq_pin(j, res->seq)
+ : j->replay_pin_list;
+
spin_lock_irq(&j->pin_lock);
- __journal_pin_add(j, j->cur_pin_list, pin, flush_fn);
+ __journal_pin_add(j, pin_list, pin, flush_fn);
spin_unlock_irq(&j->pin_lock);
}
}
static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush)
+journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
if (ret) {
/* must be list_del_init(), see bch2_journal_pin_drop() */
list_del_init(&ret->list);
+ *seq = journal_pin_seq(j, pin_list);
break;
}
}
void bch2_journal_flush_pins(struct journal *j)
{
struct journal_entry_pin *pin;
+ u64 seq;
- while ((pin = journal_get_next_pin(j, U64_MAX)))
- pin->flush(j, pin);
+ while ((pin = journal_get_next_pin(j, U64_MAX, &seq)))
+ pin->flush(j, pin, seq);
wait_event(j->wait, !journal_has_pins(j) || bch2_journal_error(j));
}
struct journal *j = &c->journal;
struct bch_dev *ca;
struct journal_entry_pin *pin;
- u64 seq_to_flush = 0;
+ u64 seq, seq_to_flush = 0;
unsigned iter, bucket_to_flush;
unsigned long next_flush;
bool reclaim_lock_held = false, need_flush;
while ((pin = journal_get_next_pin(j, need_flush
? U64_MAX
- : seq_to_flush))) {
+ : seq_to_flush, &seq))) {
__set_current_state(TASK_RUNNING);
- pin->flush(j, pin);
+ pin->flush(j, pin, seq);
need_flush = false;
j->last_flushed = jiffies;
mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
}
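+/*
+ * grow the journal buffer up to buf_size_want, if writers have been asking for
+ * bigger entries than it currently allows; contents are copied over:
+ */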
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
+{
+ /* we aren't holding j->lock: */
+ unsigned new_size = READ_ONCE(j->buf_size_want);
+ void *new_buf;
+
+ if (buf->size >= new_size)
+ return;
+
+ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
+ if (!new_buf)
+ return;
+
+ memcpy(new_buf, buf->data, buf->size);
+ kvpfree(buf->data, buf->size);
+ buf->data = new_buf;
+ buf->size = new_size;
+}
+
static void journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_prev_buf(j);
- struct jset *jset = w->data;
+ struct jset *jset;
struct bio *bio;
struct bch_extent_ptr *ptr;
unsigned i, sectors, bytes;
+ journal_buf_realloc(j, w);
+ jset = w->data;
+
j->write_start_time = local_clock();
bch2_journal_add_prios(j, w);
unsigned u64s_min, unsigned u64s_max)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *buf;
int ret;
retry:
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
}
/*
- * Ok, no more room in the current journal entry - try to start a new
+ * If we couldn't get a reservation because the current buf filled up,
+ * and we had room for a bigger entry on disk, signal that we want to
+ * realloc the journal bufs:
+ */
+ buf = journal_cur_buf(j);
+ if (journal_entry_is_open(j) &&
+ buf->size >> 9 < buf->disk_sectors &&
+ buf->size < JOURNAL_ENTRY_SIZE_MAX)
+ j->buf_size_want = max(j->buf_size_want, buf->size << 1);
+
+ /*
+ * Close the current journal entry if necessary, then try to start a new
* one:
*/
switch (journal_buf_switch(j, false)) {
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
bch2_sb_get_journal(sb);
- unsigned i, journal_entry_pages;
-
- journal_entry_pages =
- DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb),
- PAGE_SECTORS);
+ unsigned i;
ja->nr = bch2_nr_journal_buckets(journal_buckets);
if (!ja->bucket_seq)
return -ENOMEM;
- ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages);
+ ca->journal.bio = bio_kmalloc(GFP_KERNEL,
+ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
if (!ca->journal.bio)
return -ENOMEM;
void bch2_fs_journal_exit(struct journal *j)
{
- unsigned order = get_order(j->entry_size_max);
-
- free_pages((unsigned long) j->buf[1].data, order);
- free_pages((unsigned long) j->buf[0].data, order);
+ kvpfree(j->buf[1].data, j->buf[1].size);
+ kvpfree(j->buf[0].data, j->buf[0].size);
free_fifo(&j->pin);
}
-int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max)
+int bch2_fs_journal_init(struct journal *j)
{
static struct lock_class_key res_key;
- unsigned order = get_order(entry_size_max);
spin_lock_init(&j->lock);
spin_lock_init(&j->pin_lock);
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
- j->entry_size_max = entry_size_max;
+ j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN;
j->write_delay_ms = 100;
j->reclaim_delay_ms = 100;
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
- !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+ !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
+ !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL)))
return -ENOMEM;
+ j->pin.front = j->pin.back = 1;
+
return 0;
}
struct jset j;
};
-#define JOURNAL_PIN ((32 * 1024) - 1)
+#define JOURNAL_PIN (32 * 1024)
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
return pin->pin_list != NULL;
}
-void bch2_journal_pin_add(struct journal *, struct journal_entry_pin *,
- journal_pin_flush_fn);
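+/* map a journal sequence number to its pin list in the fifo: */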
+static inline struct journal_entry_pin_list *
+journal_seq_pin(struct journal *j, u64 seq)
+{
+ return &j->pin.data[(size_t) seq & j->pin.mask];
+}
+
+void bch2_journal_pin_add(struct journal *, struct journal_res *,
+ struct journal_entry_pin *, journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
static inline void bch2_journal_set_replay_done(struct journal *j)
{
- spin_lock(&j->lock);
BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
-
set_bit(JOURNAL_REPLAY_DONE, &j->flags);
- j->cur_pin_list = &fifo_peek_back(&j->pin);
- spin_unlock(&j->lock);
}
ssize_t bch2_journal_print_debug(struct journal *, char *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);
-int bch2_fs_journal_init(struct journal *, unsigned);
+int bch2_fs_journal_init(struct journal *);
#endif /* _BCACHE_JOURNAL_H */
*/
struct journal_buf {
struct jset *data;
+
struct closure_waitlist wait;
+ unsigned size;
+ unsigned disk_sectors;
+
/*
* ugh, prio_buckets are stupid - need to convert them to new
* transaction machinery when it arrives
struct journal;
struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *);
+typedef void (*journal_pin_flush_fn)(struct journal *j,
+ struct journal_entry_pin *, u64);
struct journal_entry_pin {
struct list_head list;
};
};
-/* 4 mb, in bytes: */
-#define JOURNAL_ENTRY_SIZE_MAX (4U << 20)
+/* bytes: */
+#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
/*
 * We stash some journal state as sentinel values in cur_entry_offset:
+ * note - cur_entry_offset is in units of u64s
*/
#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
unsigned cur_entry_u64s;
unsigned prev_buf_sectors;
unsigned cur_buf_sectors;
- unsigned entry_size_max; /* bytes */
+ unsigned buf_size_want;
/*
* Two journal entries -- one is currently open for new entries, the
* longer needed, the bucket can be discarded and reused.
*/
DECLARE_FIFO(struct journal_entry_pin_list, pin);
- struct journal_entry_pin_list *cur_pin_list;
+ struct journal_entry_pin_list *replay_pin_list;
/*
* Protects the pin lists - the fifo itself is still protected by
if (BCH_SB_GC_RESERVE(sb) < 5)
return "gc reserve percentage too small";
- if (1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) < block_size)
- return "max journal entry size too small";
-
- /* 4 mb max: */
- if (512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
- return "max journal entry size too big";
-
if (!sb->time_precision ||
le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
return "invalid time precision";
bch2_fs_exit(c);
}
-#define alloc_bucket_pages(gfp, ca) \
- ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
-
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
struct bch_sb_field_members *mi;
struct bch_fs *c;
- unsigned i, iter_size, journal_entry_bytes;
+ unsigned i, iter_size;
c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL);
if (!c)
iter_size = (btree_blocks(c) + 1) * 2 *
sizeof(struct btree_node_iter_set);
- journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
-
if (!(c->wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcache_copygc",
bdi_setup_and_register(&c->bdi, "bcachefs") ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
- bch2_fs_journal_init(&c->journal, journal_entry_bytes) ||
+ bch2_fs_journal_init(&c->journal) ||
bch2_fs_btree_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
free_percpu(ca->sectors_written);
bioset_exit(&ca->replica_set);
free_percpu(ca->usage_percpu);
- free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
+ kvpfree(ca->disk_buckets, bucket_bytes(ca));
kfree(ca->prio_buckets);
kfree(ca->bio_prio);
vfree(ca->buckets);
ca->mi.nbuckets)) ||
!(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
2, GFP_KERNEL)) ||
- !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
+ !(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
!(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
bioset_init(&ca->replica_set, 4,
rw_attribute(journal_write_delay_ms);
rw_attribute(journal_reclaim_delay_ms);
-read_attribute(journal_entry_size_max);
rw_attribute(discard);
rw_attribute(cache_replacement_policy);
sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
- sysfs_hprint(journal_entry_size_max, c->journal.entry_size_max);
sysfs_hprint(block_size, block_bytes(c));
sysfs_print(block_size_bytes, block_bytes(c));
struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
- &sysfs_journal_entry_size_max,
&sysfs_block_size,
&sysfs_block_size_bytes,
#include <linux/freezer.h>
#include <linux/kernel.h>
#include <linux/llist.h>
+#include <linux/log2.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
(__builtin_types_compatible_p(typeof(_val), _type) || \
__builtin_types_compatible_p(typeof(_val), const _type))
-static inline void *kvmalloc(size_t bytes, gfp_t gfp)
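+/* free a buffer allocated with kvpmalloc(); the caller passes the allocation size: */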
+static inline void kvpfree(void *p, size_t size)
{
- if (bytes <= PAGE_SIZE ||
- !(gfp & GFP_KERNEL))
- return kmalloc(bytes, gfp);
-
- return ((bytes <= KMALLOC_MAX_SIZE)
- ? kmalloc(bytes, gfp|__GFP_NOWARN)
- : NULL) ?:
- vmalloc(bytes);
+ if (size < PAGE_SIZE)
+ kfree(p);
+ else if (is_vmalloc_addr(p))
+ vfree(p);
+ else
+ free_pages((unsigned long) p, get_order(size));
+}
+
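+/*
+ * allocate by size: kmalloc for sub page sizes, otherwise whole pages via
+ * __get_free_pages(), falling back to vmalloc:
+ */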
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+{
+ return size < PAGE_SIZE ? kmalloc(size, gfp_mask)
+ : (void *) __get_free_pages(gfp_mask, get_order(size))
+ ?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
}
#define DECLARE_HEAP(type, name) \
#define init_heap(heap, _size, gfp) \
({ \
- size_t _bytes; \
(heap)->used = 0; \
(heap)->size = (_size); \
- _bytes = (heap)->size * sizeof(*(heap)->data); \
- (heap)->data = kvmalloc(_bytes, (gfp)); \
- (heap)->data; \
+ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+ (gfp)); \
})
#define free_heap(heap) \
do { \
- kvfree((heap)->data); \
+ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
(heap)->data = NULL; \
} while (0)