#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
#include "checksum.h"
#include "debug.h"
#include "dirent.h"
#include "extents.h"
#include "inode.h"
#include "journal.h"
+#include "super.h"
#include "super-io.h"
+#include "util.h"
#include "xattr.h"
#include <trace/events/bcachefs.h>
static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
struct bkey_i *, struct bkey_i *);
-static void sort_key_next(struct btree_node_iter *iter,
+static void sort_key_next(struct btree_node_iter_large *iter,
struct btree *b,
struct btree_node_iter_set *i)
{
* Necessary for btree_sort_fixup() - if there are multiple keys that compare
* equal in different sets, we have to process them newest to oldest.
*/
-#define key_sort_cmp(l, r) \
+#define key_sort_cmp(h, l, r) \
({ \
- int _c = bkey_cmp_packed(b, \
- __btree_node_offset_to_key(b, (l).k), \
- __btree_node_offset_to_key(b, (r).k)); \
+ bkey_cmp_packed(b, \
+ __btree_node_offset_to_key(b, (l).k), \
+ __btree_node_offset_to_key(b, (r).k)) \
\
- _c ? _c > 0 : (l).k > (r).k; \
+ ?: (l).k - (r).k; \
})
-static inline bool should_drop_next_key(struct btree_node_iter *iter,
+static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
struct btree *b)
{
struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
return false;
if (iter->used > 2 &&
- key_sort_cmp(r[0], r[1]))
+ key_sort_cmp(iter, r[0], r[1]) >= 0)
r++;
/*
}
struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
- struct btree *b,
- struct btree_node_iter *iter)
+ struct btree *b,
+ struct btree_node_iter_large *iter)
{
struct bkey_packed *out = dst->start;
struct btree_nr_keys nr;
heap_resort(iter, key_sort_cmp);
- while (!bch2_btree_node_iter_end(iter)) {
+ while (!bch2_btree_node_iter_large_end(iter)) {
if (!should_drop_next_key(iter, b)) {
struct bkey_packed *k =
__btree_node_offset_to_key(b, iter->data->k);
}
sort_key_next(iter, b, iter->data);
- heap_sift(iter, 0, key_sort_cmp);
+ heap_sift_down(iter, 0, key_sort_cmp);
}
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
return NULL;
}
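+/*
+ * Drop every pointer to device @dev from @e, cleaning up any crc entries made
+ * redundant; returns true if any pointers were dropped:
+ */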
+bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
+{
+ struct bch_extent_ptr *ptr;
+ bool dropped = false;
+
+ extent_for_each_ptr_backwards(e, ptr)
+ if (ptr->dev == dev) {
+ __bch2_extent_drop_ptr(e, ptr);
+ dropped = true;
+ }
+
+ if (dropped)
+ bch2_extent_drop_redundant_crcs(e);
+ return dropped;
+}
+
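+/* Returns @e's first pointer to a device in group @group, if any: */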
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+{
+ const struct bch_extent_ptr *ptr;
+
+ extent_for_each_ptr(e, ptr) {
+ struct bch_dev *ca = c->devs[ptr->dev];
+
+ if (ca->mi.group &&
+ ca->mi.group - 1 == group)
+ return ptr;
+ }
+
+ return NULL;
+}
+
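+/* Returns @e's first pointer to a device in target @target, if any: */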
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
+{
+ const struct bch_extent_ptr *ptr;
+
+ extent_for_each_ptr(e, ptr)
+ if (dev_in_target(c->devs[ptr->dev], target))
+ return ptr;
+
+ return NULL;
+}
+
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
return nr_ptrs;
}
-/* returns true if equal */
-static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
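+/*
+ * Durability of a single pointer: 0 if it's cached or on a failed device,
+ * otherwise the device's durability from the member info:
+ */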
+unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+ const struct bch_extent_ptr *ptr)
+{
+ struct bch_dev *ca;
+
+ if (ptr->cached)
+ return 0;
+
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
+ return 0;
+
+ return ca->mi.durability;
+}
+
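+/* Durability of an extent: the sum of its pointers' durability: */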
+unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
{
- return extent_crc_type(l) == extent_crc_type(r) &&
- !memcmp(l, r, extent_entry_bytes(to_entry(l)));
+ const struct bch_extent_ptr *ptr;
+ unsigned durability = 0;
+
+ extent_for_each_ptr(e, ptr)
+ durability += bch2_extent_ptr_durability(c, ptr);
+
+ return durability;
}
-/* Increment pointers after @crc by crc's offset until the next crc entry: */
-void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
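+/*
+ * Returns the largest compressed size of any non cached pointer to compressed
+ * data in @k, or 0 if there is none:
+ */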
+unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
- union bch_extent_entry *entry;
+ struct bkey_s_c_extent e;
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+ unsigned ret = 0;
- extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
- if (!extent_entry_is_ptr(entry))
- return;
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ e = bkey_s_c_to_extent(k);
- entry->ptr.offset += crc_offset(crc);
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (!ptr->cached &&
+ crc.compression_type != BCH_COMPRESSION_NONE &&
+ crc.compressed_size < crc.live_size)
+ ret = max_t(unsigned, ret, crc.compressed_size);
}
+
+ return ret;
+}
+
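+/*
+ * Returns true if @e has a pointer with the same device and generation as @m,
+ * pointing at the same on disk location that @m refers to at logical offset
+ * @offset:
+ */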
+bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
+ struct bch_extent_ptr m, u64 offset)
+{
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (ptr->dev == m.dev &&
+ ptr->gen == m.gen &&
+ (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
+ (s64) m.offset - offset)
+ return ptr;
+
+ return NULL;
+}
+
+/* Doesn't clean up redundant crcs */
+void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+ EBUG_ON(ptr < &e.v->start->ptr ||
+ ptr >= &extent_entry_last(e)->ptr);
+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+ memmove_u64s_down(ptr, ptr + 1,
+ (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
+ e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+}
+
+void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+ __bch2_extent_drop_ptr(e, ptr);
+ bch2_extent_drop_redundant_crcs(e);
+}
+
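+/*
+ * A crc entry @u can be narrowed to @n only if @u is uncompressed,
+ * checksummed, covers more than just the live range, and has the same
+ * encryption status as @n:
+ */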
+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
+ struct bch_extent_crc_unpacked n)
+{
+ return !u.compression_type &&
+ u.csum_type &&
+ u.uncompressed_size > u.live_size &&
+ bch2_csum_type_is_encryption(u.csum_type) ==
+ bch2_csum_type_is_encryption(n.csum_type);
+}
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
+ struct bch_extent_crc_unpacked n)
+{
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+
+ if (!n.csum_type)
+ return false;
+
+ extent_for_each_crc(e, crc, i)
+ if (can_narrow_crc(crc, n))
+ return true;
+
+ return false;
}
/*
* not compressed, we can modify them to point to only the data that is
* currently live (so that readers won't have to bounce) while we've got the
* checksum we need:
- *
- * XXX: to guard against data being corrupted while in memory, instead of
- * recomputing the checksum here, it would be better in the read path to instead
- * of computing the checksum of the entire extent:
- *
- * | extent |
- *
- * compute the checksums of the live and dead data separately
- * | dead data || live data || dead data |
- *
- * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
- * use crc_live here (that we verified was correct earlier)
- *
- * note: doesn't work with encryption
*/
-void bch2_extent_narrow_crcs(struct bkey_s_extent e)
+bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
+ struct bch_extent_crc_unpacked n)
{
- union bch_extent_crc *crc;
- bool have_wide = false, have_narrow = false;
- struct bch_csum csum = { 0 };
- unsigned csum_type = 0;
-
- extent_for_each_crc(e, crc) {
- if (crc_compression_type(crc) ||
- bch2_csum_type_is_encryption(crc_csum_type(crc)))
- continue;
-
- if (crc_uncompressed_size(e.k, crc) != e.k->size) {
- have_wide = true;
- } else {
- have_narrow = true;
- csum = crc_csum(crc);
- csum_type = crc_csum_type(crc);
- }
- }
-
- if (!have_wide || !have_narrow)
- return;
-
- extent_for_each_crc(e, crc) {
- if (crc_compression_type(crc))
- continue;
-
- if (crc_uncompressed_size(e.k, crc) != e.k->size) {
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- BUG();
- case BCH_EXTENT_CRC32:
- if (bch_crc_bytes[csum_type] > 4)
- continue;
-
- bch2_extent_crc_narrow_pointers(e, crc);
- crc->crc32._compressed_size = e.k->size - 1;
- crc->crc32._uncompressed_size = e.k->size - 1;
- crc->crc32.offset = 0;
- crc->crc32.csum_type = csum_type;
- crc->crc32.csum = csum.lo;
+ struct bch_extent_crc_unpacked u;
+ struct bch_extent_ptr *ptr;
+ union bch_extent_entry *i;
+
+ /* Find a checksum entry that covers only live data: */
+ if (!n.csum_type)
+ extent_for_each_crc(extent_i_to_s(e), u, i)
+ if (!u.compression_type &&
+ u.csum_type &&
+ u.live_size == u.uncompressed_size) {
+ n = u;
break;
- case BCH_EXTENT_CRC64:
- if (bch_crc_bytes[csum_type] > 10)
- continue;
+ }
- bch2_extent_crc_narrow_pointers(e, crc);
- crc->crc64._compressed_size = e.k->size - 1;
- crc->crc64._uncompressed_size = e.k->size - 1;
- crc->crc64.offset = 0;
- crc->crc64.csum_type = csum_type;
- crc->crc64.csum_lo = csum.lo;
- crc->crc64.csum_hi = csum.hi;
- break;
- case BCH_EXTENT_CRC128:
- if (bch_crc_bytes[csum_type] > 16)
- continue;
+ if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
+ return false;
- bch2_extent_crc_narrow_pointers(e, crc);
- crc->crc128._compressed_size = e.k->size - 1;
- crc->crc128._uncompressed_size = e.k->size - 1;
- crc->crc128.offset = 0;
- crc->crc128.csum_type = csum_type;
- crc->crc128.csum = csum;
- break;
- }
+ BUG_ON(n.compression_type);
+ BUG_ON(n.offset);
+ BUG_ON(n.live_size != e->k.size);
+
+ bch2_extent_crc_append(e, n);
+restart_narrow_pointers:
+ extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
+ if (can_narrow_crc(u, n)) {
+ ptr->offset += u.offset;
+ extent_ptr_append(e, *ptr);
+ __bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
+ goto restart_narrow_pointers;
}
- }
+
+ bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
+ return true;
}
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
{
union bch_extent_entry *entry = e.v->start;
union bch_extent_crc *crc, *prev = NULL;
+ struct bch_extent_crc_unpacked u, prev_u;
while (entry != extent_entry_last(e)) {
union bch_extent_entry *next = extent_entry_next(entry);
goto next;
crc = entry_to_crc(entry);
+ u = bch2_extent_crc_unpack(e.k, crc);
if (next == extent_entry_last(e)) {
/* crc entry with no pointers after it: */
goto drop;
}
- if (prev && crc_cmp(crc, prev)) {
+ if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
/* identical to previous crc entry: */
goto drop;
}
if (!prev &&
- !crc_csum_type(crc) &&
- !crc_compression_type(crc)) {
+ !u.csum_type &&
+ !u.compression_type) {
/* null crc entry: */
- bch2_extent_crc_narrow_pointers(e, crc);
+ union bch_extent_entry *e2;
+
+ extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
+ if (!extent_entry_is_ptr(e2))
+ break;
+
+ e2->ptr.offset += u.offset;
+ }
goto drop;
}
prev = crc;
+ prev_u = u;
next:
entry = next;
continue;
struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr)
{
- return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
+ return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
}
static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
break;
case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.csum.hi = swab64(entry->crc64.csum_hi);
- entry->crc128.csum.lo = swab64(entry->crc64.csum_lo);
+ entry->crc128.csum.hi = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.hi);
+ entry->crc128.csum.lo = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.lo);
break;
case BCH_EXTENT_ENTRY_ptr:
break;
const struct bch_extent_ptr *ptr2;
struct bch_dev *ca;
- if (ptr->dev >= c->sb.nr_devices)
+ if (ptr->dev >= c->sb.nr_devices ||
+ !c->devs[ptr->dev])
return "pointer to invalid device";
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (!ca)
return "pointer to invalid device";
if (ptr != ptr2 && ptr->dev == ptr2->dev)
return "multiple pointers to same device";
- if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets)
+ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
return "offset past end of device";
- if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket)
+ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
return "offset before first bucket";
- if ((ptr->offset & (ca->mi.bucket_size - 1)) +
+ if (bucket_remainder(ca, ptr->offset) +
size_ondisk > ca->mi.bucket_size)
return "spans multiple buckets";
- if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
- return "device not marked as containing data";
-
return NULL;
}
{
char *out = buf, *end = buf + size;
const union bch_extent_entry *entry;
- const union bch_extent_crc *crc;
+ struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
struct bch_dev *ca;
bool first = true;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
- crc = entry_to_crc(entry);
-
- p("crc: c_size %u size %u offset %u csum %u compress %u",
- crc_compressed_size(e.k, crc),
- crc_uncompressed_size(e.k, crc),
- crc_offset(crc), crc_csum_type(crc),
- crc_compression_type(crc));
+ crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
+
+ p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+ crc.compressed_size,
+ crc.uncompressed_size,
+ crc.offset, crc.nonce,
+ crc.csum_type,
+ crc.compression_type);
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
- ca = c->devs[ptr->dev];
+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
return out - buf;
}
+static inline bool dev_latency_better(struct bch_dev *dev1,
+ struct bch_dev *dev2)
+{
+ unsigned l1 = atomic_read(&dev1->latency[READ]);
+ unsigned l2 = atomic_read(&dev2->latency[READ]);
+
+ /* Pick at random, biased in favor of the faster device: */
+
+ return bch2_rand_range(l1 + l2) > l1;
+}
+
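+/*
+ * Scan @e's pointers and update @pick with the best one to read from: skip
+ * stale cached pointers, failed devices and devices in @avoid, prefer the
+ * device with the lower measured read latency, and take an io ref on the
+ * device picked:
+ */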
+static void extent_pick_read_device(struct bch_fs *c,
+ struct bkey_s_c_extent e,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *pick)
+{
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (ptr->cached && ptr_stale(ca, ptr))
+ continue;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
+ continue;
+
+ if (avoid) {
+ if (test_bit(ca->dev_idx, avoid->d))
+ continue;
+
+ if (pick->ca &&
+ test_bit(pick->ca->dev_idx, avoid->d))
+ goto use;
+ }
+
+ if (pick->ca && !dev_latency_better(ca, pick->ca))
+ continue;
+use:
+ if (!percpu_ref_tryget(&ca->io_ref))
+ continue;
+
+ if (pick->ca)
+ percpu_ref_put(&pick->ca->io_ref);
+
+ *pick = (struct extent_pick_ptr) {
+ .ptr = *ptr,
+ .crc = crc,
+ .ca = ca,
+ };
+ }
+}
+
/* Btree ptrs */
static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
const char *reason;
- extent_for_each_entry(e, entry)
+ extent_for_each_entry(e, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
- extent_for_each_ptr_crc(e, ptr, crc) {
+ if (extent_entry_is_crc(entry))
+ return "has crc field";
+ }
+
+ extent_for_each_ptr(e, ptr) {
reason = extent_ptr_invalid(c, e, ptr,
- c->sb.btree_node_size,
+ c->opts.btree_node_size,
true);
if (reason)
return reason;
}
- if (crc)
- return "has crc field";
-
return NULL;
}
unsigned seq;
const char *err;
char buf[160];
- struct bucket *g;
+ struct bucket_mark mark;
struct bch_dev *ca;
unsigned replicas = 0;
bool bad;
extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
- g = PTR_BUCKET(ca, ptr);
+ ca = bch_dev_bkey_exists(c, ptr->dev);
replicas++;
+ if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags))
+ continue;
+
err = "stale";
if (ptr_stale(ca, ptr))
goto err;
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
+ mark = ptr_bucket_mark(ca, ptr);
+
bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- g->mark.data_type != BUCKET_BTREE;
+ (mark.data_type != BCH_DATA_BTREE ||
+ mark.dirty_sectors < c->opts.btree_node_size);
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
err = "inconsistent";
goto err;
}
- if (replicas < c->sb.meta_replicas_have) {
+ if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), k);
bch2_fs_bug(c,
- "btree key bad (too few replicas, %u < %u): %s",
- replicas, c->sb.meta_replicas_have, buf);
+ "btree key bad (replicas not marked in superblock):\n%s",
+ buf);
return;
}
return;
err:
bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
- bch2_fs_bug(c, "%s btree pointer %s: bucket %zi prio %i "
- "gen %i last_gc %i mark %08x",
+ bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
+ "gen %i mark %08x",
err, buf, PTR_BUCKET_NR(ca, ptr),
- g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
- ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
- (unsigned) g->mark.counter);
+ mark.gen, (unsigned) mark.counter);
}
static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
}
struct extent_pick_ptr
-bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
+bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
+ struct bch_devs_mask *avoid)
{
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
- const union bch_extent_crc *crc;
- const struct bch_extent_ptr *ptr;
struct extent_pick_ptr pick = { .ca = NULL };
- extent_for_each_ptr_crc(e, ptr, crc) {
- struct bch_dev *ca = c->devs[ptr->dev];
- struct btree *root = btree_node_root(c, b);
-
- if (bch2_fs_inconsistent_on(crc, c,
- "btree node pointer with crc at btree %u level %u/%u bucket %zu",
- b->btree_id, b->level, root ? root->level : -1,
- PTR_BUCKET_NR(ca, ptr)))
- break;
-
- if (bch2_dev_inconsistent_on(ptr_stale(ca, ptr), ca,
- "stale btree node pointer at btree %u level %u/%u bucket %zu",
- b->btree_id, b->level, root ? root->level : -1,
- PTR_BUCKET_NR(ca, ptr)))
- continue;
-
- if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
- continue;
-
- if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
- continue;
-
- if (!percpu_ref_tryget(&ca->io_ref))
- continue;
-
- if (pick.ca)
- percpu_ref_put(&pick.ca->io_ref);
-
- pick.ca = ca;
- pick.ptr = *ptr;
- }
+ extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
+ avoid, &pick);
return pick;
}
__set_bkey_deleted(k.k);
else if (bkey_extent_is_data(k.k)) {
struct bkey_s_extent e = bkey_s_to_extent(k);
- struct bch_extent_ptr *ptr;
- union bch_extent_crc *crc, *prev_crc = NULL;
+ union bch_extent_entry *entry;
+ bool seen_crc = false;
- extent_for_each_ptr_crc(e, ptr, crc) {
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- ptr->offset += e.k->size - len;
+ extent_for_each_entry(e, entry) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ if (!seen_crc)
+ entry->ptr.offset += e.k->size - len;
break;
- case BCH_EXTENT_CRC32:
- if (prev_crc != crc)
- crc->crc32.offset += e.k->size - len;
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.offset += e.k->size - len;
break;
- case BCH_EXTENT_CRC64:
- if (prev_crc != crc)
- crc->crc64.offset += e.k->size - len;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.offset += e.k->size - len;
break;
- case BCH_EXTENT_CRC128:
- if (prev_crc != crc)
- crc->crc128.offset += e.k->size - len;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.offset += e.k->size - len;
break;
}
- prev_crc = crc;
+
+ if (extent_entry_is_crc(entry))
+ seen_crc = true;
}
}
}
/*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
+ * If keys compare equal, compare by pointer order:
*
* Necessary for sort_fix_overlapping() - if there are multiple keys that
* compare equal in different sets, we have to process them newest to oldest.
*/
-#define extent_sort_cmp(l, r) \
+#define extent_sort_cmp(h, l, r) \
({ \
struct bkey _ul = bkey_unpack_key(b, \
__btree_node_offset_to_key(b, (l).k)); \
struct bkey _ur = bkey_unpack_key(b, \
__btree_node_offset_to_key(b, (r).k)); \
\
- int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur)); \
- _c ? _c > 0 : (l).k < (r).k; \
+ bkey_cmp(bkey_start_pos(&_ul), \
+ bkey_start_pos(&_ur)) ?: (r).k - (l).k; \
})
-static inline void extent_sort_sift(struct btree_node_iter *iter,
+static inline void extent_sort_sift(struct btree_node_iter_large *iter,
struct btree *b, size_t i)
{
- heap_sift(iter, i, extent_sort_cmp);
+ heap_sift_down(iter, i, extent_sort_cmp);
}
-static inline void extent_sort_next(struct btree_node_iter *iter,
+static inline void extent_sort_next(struct btree_node_iter_large *iter,
struct btree *b,
struct btree_node_iter_set *i)
{
sort_key_next(iter, b, i);
- heap_sift(iter, i - iter->data, extent_sort_cmp);
+ heap_sift_down(iter, i - iter->data, extent_sort_cmp);
}
static void extent_sort_append(struct bch_fs *c,
struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct bset *dst,
struct btree *b,
- struct btree_node_iter *iter)
+ struct btree_node_iter_large *iter)
{
struct bkey_format *f = &b->format;
struct btree_node_iter_set *_l = iter->data, *_r;
heap_resort(iter, extent_sort_cmp);
- while (!bch2_btree_node_iter_end(iter)) {
+ while (!bch2_btree_node_iter_large_end(iter)) {
lk = __btree_node_offset_to_key(b, _l->k);
if (iter->used == 1) {
_r = iter->data + 1;
if (iter->used > 2 &&
- extent_sort_cmp(_r[0], _r[1]))
+ extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
_r++;
rk = __btree_node_offset_to_key(b, _r->k);
};
static void bch2_add_sectors(struct extent_insert_state *s,
- struct bkey_s_c k, u64 offset, s64 sectors)
+ struct bkey_s_c k, u64 offset, s64 sectors)
{
struct bch_fs *c = s->trans->c;
- struct btree *b = s->insert->iter->nodes[0];
+ struct btree *b = s->insert->iter->l[0].b;
EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
return;
bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
- &s->stats, s->trans->journal_res.seq);
+ &s->stats, s->trans->journal_res.seq, 0);
}
static void bch2_subtract_sectors(struct extent_insert_state *s,
static enum btree_insert_ret
extent_insert_should_stop(struct extent_insert_state *s)
{
- struct btree *b = s->insert->iter->nodes[0];
+ struct btree *b = s->insert->iter->l[0].b;
/*
* Check if we have sufficient space in both the btree node and the
static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
struct bkey_i *insert)
{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
- struct bset_tree *t = bset_tree_last(b);
+ struct btree_iter_level *l = &iter->l[0];
+ struct bset_tree *t = bset_tree_last(l->b);
struct bkey_packed *where =
- bch2_btree_node_iter_bset_pos(node_iter, b, t);
- struct bkey_packed *prev = bch2_bkey_prev(b, t, where);
+ bch2_btree_node_iter_bset_pos(&l->iter, l->b, t);
+ struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where);
struct bkey_packed *next_live_key = where;
unsigned clobber_u64s;
if (prev)
where = bkey_next(prev);
- while (next_live_key != btree_bkey_last(b, t) &&
+ while (next_live_key != btree_bkey_last(l->b, t) &&
bkey_deleted(next_live_key))
next_live_key = bkey_next(next_live_key);
bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
goto drop_deleted_keys;
- if (next_live_key != btree_bkey_last(b, t) &&
+ if (next_live_key != btree_bkey_last(l->b, t) &&
bch2_extent_merge_inline(c, iter, bkey_to_packed(insert),
next_live_key, false))
goto drop_deleted_keys;
- bch2_bset_insert(b, node_iter, where, insert, clobber_u64s);
- bch2_btree_node_iter_fix(iter, b, node_iter, t, where,
+ bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where,
clobber_u64s, where->u64s);
return;
drop_deleted_keys:
- bch2_bset_delete(b, where, clobber_u64s);
- bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0);
+ bch2_bset_delete(l->b, where, clobber_u64s);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
+ where, clobber_u64s, 0);
}
static void extent_insert_committed(struct extent_insert_state *s)
if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
bkey_cmp(s->committed, insert->k.p) &&
- bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
+ bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
/* XXX: possibly need to increase our reservation? */
bch2_cut_subtract_back(s, s->committed,
bkey_i_to_s(&split.k));
}
if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, iter->nodes[iter->level],
- bkey_i_to_s_c(&split.k));
+ bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k));
bch2_btree_journal_key(s->trans, iter, &split.k);
s->trans->did_work = true;
}
-static enum extent_insert_hook_ret
+static enum btree_insert_ret
__extent_insert_advance_pos(struct extent_insert_state *s,
struct bpos next_pos,
struct bkey_s_c k)
{
struct extent_insert_hook *hook = s->trans->hook;
- enum extent_insert_hook_ret ret;
-#if 0
- /*
- * Currently disabled for encryption - broken with fcollapse. Will have
- * to reenable when versions are exposed for send/receive - versions
- * will have to be monotonic then:
- */
- if (k.k && k.k->size &&
- !bversion_zero(s->insert->k->k.version) &&
- bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
- ret = BTREE_HOOK_NO_INSERT;
- } else
-#endif
+ enum btree_insert_ret ret;
+
if (hook)
ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
else
- ret = BTREE_HOOK_DO_INSERT;
+ ret = BTREE_INSERT_OK;
EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
- switch (ret) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- extent_insert_committed(s);
- bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
-
- bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
- break;
- case BTREE_HOOK_RESTART_TRANS:
- return ret;
- }
+ if (ret == BTREE_INSERT_OK)
+ s->committed = next_pos;
- s->committed = next_pos;
return ret;
}
* Update iter->pos, marking how much of @insert we've processed, and call hook
* fn:
*/
-static enum extent_insert_hook_ret
+static enum btree_insert_ret
extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
{
- struct btree *b = s->insert->iter->nodes[0];
+ struct btree *b = s->insert->iter->l[0].b;
struct bpos next_pos = bpos_min(s->insert->k->k.p,
k.k ? k.k->p : b->key.k.p);
+ enum btree_insert_ret ret;
+
+ if (race_fault())
+ return BTREE_INSERT_NEED_TRAVERSE;
/* hole? */
if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
- bool have_uncommitted = bkey_cmp(s->committed,
- bkey_start_pos(&s->insert->k->k)) > 0;
-
- switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
- bkey_s_c_null)) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- /*
- * we had to split @insert and insert the committed
- * part - need to bail out and recheck journal
- * reservation/btree node before we advance pos past @k:
- */
- if (have_uncommitted)
- return BTREE_HOOK_NO_INSERT;
- break;
- case BTREE_HOOK_RESTART_TRANS:
- return BTREE_HOOK_RESTART_TRANS;
- }
+ ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
+ bkey_s_c_null);
+ if (ret != BTREE_INSERT_OK)
+ return ret;
}
/* avoid redundant calls to hook fn: */
if (!bkey_cmp(s->committed, next_pos))
- return BTREE_HOOK_DO_INSERT;
+ return BTREE_INSERT_OK;
return __extent_insert_advance_pos(s, next_pos, k);
}
unsigned sectors;
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bkey_extent_is_compressed(k))) {
+ (sectors = bch2_extent_is_compressed(k))) {
int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
if (s->trans->flags & BTREE_INSERT_NOFAIL)
switch (bch2_disk_reservation_add(c,
s->trans->disk_res,
- sectors, flags)) {
+ sectors * bch2_extent_nr_dirty_ptrs(k),
+ flags)) {
case 0:
break;
case -ENOSPC:
{
struct bch_fs *c = s->trans->c;
struct btree_iter *iter = s->insert->iter;
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree *b = l->b;
+ struct btree_node_iter *node_iter = &l->iter;
+ enum btree_insert_ret ret;
switch (overlap) {
case BCH_EXTENT_OVERLAP_FRONT:
k.k->p = orig_pos;
extent_save(b, node_iter, _k, k.k);
- if (extent_insert_advance_pos(s, k.s_c) ==
- BTREE_HOOK_RESTART_TRANS)
- return BTREE_INSERT_NEED_TRAVERSE;
+ ret = extent_insert_advance_pos(s, k.s_c);
+ if (ret != BTREE_INSERT_OK)
+ return ret;
extent_insert_committed(s);
/*
{
struct bch_fs *c = s->trans->c;
struct btree_iter *iter = s->insert->iter;
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree *b = l->b;
+ struct btree_node_iter *node_iter = &l->iter;
struct bkey_packed *_k;
struct bkey unpacked;
struct bkey_i *insert = s->insert->k;
if (ret != BTREE_INSERT_OK)
goto stop;
- switch (extent_insert_advance_pos(s, k.s_c)) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- continue;
- case BTREE_HOOK_RESTART_TRANS:
- ret = BTREE_INSERT_NEED_TRAVERSE;
+ ret = extent_insert_advance_pos(s, k.s_c);
+		if (ret != BTREE_INSERT_OK)
goto stop;
- }
s->do_journal = true;
bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
}
- if (bkey_cmp(s->committed, insert->k.p) < 0 &&
- ret == BTREE_INSERT_OK &&
- extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
- ret = BTREE_INSERT_NEED_TRAVERSE;
+ if (ret == BTREE_INSERT_OK &&
+ bkey_cmp(s->committed, insert->k.p) < 0)
+ ret = extent_insert_advance_pos(s, bkey_s_c_null);
stop:
extent_insert_committed(s);
gc_pos_btree_node(b));
EBUG_ON(bkey_cmp(iter->pos, s->committed));
- EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+ EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
+ !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
bch2_cut_front(iter->pos, insert);
- if (insert->k.size && iter->at_end_of_leaf)
+ if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
ret = BTREE_INSERT_NEED_TRAVERSE;
EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
{
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree *b = l->b;
+ struct btree_node_iter *node_iter = &l->iter;
struct bkey_packed *_k;
struct bkey unpacked;
enum btree_insert_ret ret = BTREE_INSERT_OK;
/*
* Only call advance pos & call hook for nonzero size extents:
- * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
- * overlaps with @k:
*/
- switch (extent_insert_advance_pos(&s, k.s_c)) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- continue;
- case BTREE_HOOK_RESTART_TRANS:
- ret = BTREE_INSERT_NEED_TRAVERSE;
+ ret = extent_insert_advance_pos(&s, k.s_c);
+ if (ret != BTREE_INSERT_OK)
goto stop;
- }
if (k.k->size &&
(k.k->needs_whiteout || bset_written(b, bset(b, t))))
goto stop;
}
- if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
- ret == BTREE_INSERT_OK &&
- extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
- ret = BTREE_INSERT_NEED_TRAVERSE;
+ if (ret == BTREE_INSERT_OK &&
+ bkey_cmp(s.committed, insert->k->k.p) < 0)
+ ret = extent_insert_advance_pos(&s, bkey_s_c_null);
stop:
extent_insert_committed(&s);
/*
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
EBUG_ON(bkey_cmp(iter->pos, s.committed));
- EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
+ EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
+ !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
- if (insert->k->k.size && iter->at_end_of_leaf)
+ if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
ret = BTREE_INSERT_NEED_TRAVERSE;
EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
case BCH_EXTENT_CACHED: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
- const union bch_extent_crc *crc;
+ struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
unsigned size_ondisk = e.k->size;
const char *reason;
+ unsigned nonce = UINT_MAX;
extent_for_each_entry(e, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
if (extent_entry_is_crc(entry)) {
- crc = entry_to_crc(entry);
+ crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
- if (crc_offset(crc) + e.k->size >
- crc_uncompressed_size(e.k, crc))
+ if (crc.offset + e.k->size >
+ crc.uncompressed_size)
return "checksum offset + key size > uncompressed size";
- size_ondisk = crc_compressed_size(e.k, crc);
+ size_ondisk = crc.compressed_size;
- if (!bch2_checksum_type_valid(c, crc_csum_type(crc)))
+ if (!bch2_checksum_type_valid(c, crc.csum_type))
return "invalid checksum type";
- if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
+ if (crc.compression_type >= BCH_COMPRESSION_NR)
return "invalid compression type";
+
+ if (bch2_csum_type_is_encryption(crc.csum_type)) {
+ if (nonce == UINT_MAX)
+ nonce = crc.offset + crc.nonce;
+ else if (nonce != crc.offset + crc.nonce)
+ return "incorrect nonce";
+ }
} else {
ptr = entry_to_ptr(entry);
{
const struct bch_extent_ptr *ptr;
struct bch_dev *ca;
- struct bucket *g;
+ struct bucket_mark mark;
unsigned seq, stale;
char buf[160];
bool bad;
- unsigned ptrs_per_tier[BCH_TIER_MAX];
unsigned replicas = 0;
/*
* going to get overwritten during replay)
*/
- memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
-
extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
- g = PTR_BUCKET(ca, ptr);
+ ca = bch_dev_bkey_exists(c, ptr->dev);
replicas++;
- ptrs_per_tier[ca->mi.tier]++;
/*
* If journal replay hasn't finished, we might be seeing keys
stale = 0;
do {
- struct bucket_mark mark;
-
seq = read_seqcount_begin(&c->gc_pos_lock);
- mark = READ_ONCE(g->mark);
+ mark = ptr_bucket_mark(ca, ptr);
/* between mark and bucket gen */
smp_rmb();
if (stale)
break;
- bad = (mark.data_type != BUCKET_DATA ||
- (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- !mark.owned_by_allocator &&
- !(ptr->cached
- ? mark.cached_sectors
- : mark.dirty_sectors)));
+ bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+ (mark.data_type != BCH_DATA_USER ||
+ !(ptr->cached
+ ? mark.cached_sectors
+ : mark.dirty_sectors));
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
if (bad)
}
if (!bkey_extent_is_cached(e.k) &&
- replicas < c->sb.data_replicas_have) {
- bch2_bkey_val_to_text(c, btree_node_type(b), buf,
- sizeof(buf), e.s_c);
+ !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
+ bch2_bkey_val_to_text(c, btree_node_type(b),
+ buf, sizeof(buf), e.s_c);
bch2_fs_bug(c,
- "extent key bad (too few replicas, %u < %u): %s",
- replicas, c->sb.data_replicas_have, buf);
+ "extent key bad (replicas not marked in superblock):\n%s",
+ buf);
return;
}
bad_ptr:
bch2_bkey_val_to_text(c, btree_node_type(b), buf,
sizeof(buf), e.s_c);
- bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i "
- "gen %i last_gc %i mark 0x%08x",
- buf, PTR_BUCKET_NR(ca, ptr),
- g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
- ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
- (unsigned) g->mark.counter);
+ bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
+ "gen %i type %u", buf,
+ PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type);
return;
}
#undef p
}
-static unsigned PTR_TIER(struct bch_fs *c,
- const struct bch_extent_ptr *ptr)
-{
- return c->devs[ptr->dev]->mi.tier;
-}
-
static void bch2_extent_crc_init(union bch_extent_crc *crc,
- unsigned compressed_size,
- unsigned uncompressed_size,
- unsigned compression_type,
- unsigned nonce,
- struct bch_csum csum, unsigned csum_type)
-{
- if (bch_crc_bytes[csum_type] <= 4 &&
- uncompressed_size <= CRC32_SIZE_MAX &&
- nonce <= CRC32_NONCE_MAX) {
+ struct bch_extent_crc_unpacked new)
+{
+#define common_fields(_crc) \
+ .csum_type = _crc.csum_type, \
+ .compression_type = _crc.compression_type, \
+ ._compressed_size = _crc.compressed_size - 1, \
+ ._uncompressed_size = _crc.uncompressed_size - 1, \
+ .offset = _crc.offset
+
+ if (bch_crc_bytes[new.csum_type] <= 4 &&
+ new.uncompressed_size <= CRC32_SIZE_MAX &&
+ new.nonce <= CRC32_NONCE_MAX) {
crc->crc32 = (struct bch_extent_crc32) {
.type = 1 << BCH_EXTENT_ENTRY_crc32,
- ._compressed_size = compressed_size - 1,
- ._uncompressed_size = uncompressed_size - 1,
- .offset = 0,
- .compression_type = compression_type,
- .csum_type = csum_type,
- .csum = *((__le32 *) &csum.lo),
+ common_fields(new),
+ .csum = *((__le32 *) &new.csum.lo),
};
return;
}
- if (bch_crc_bytes[csum_type] <= 10 &&
- uncompressed_size <= CRC64_SIZE_MAX &&
- nonce <= CRC64_NONCE_MAX) {
+ if (bch_crc_bytes[new.csum_type] <= 10 &&
+ new.uncompressed_size <= CRC64_SIZE_MAX &&
+ new.nonce <= CRC64_NONCE_MAX) {
crc->crc64 = (struct bch_extent_crc64) {
.type = 1 << BCH_EXTENT_ENTRY_crc64,
- ._compressed_size = compressed_size - 1,
- ._uncompressed_size = uncompressed_size - 1,
- .offset = 0,
- .nonce = nonce,
- .compression_type = compression_type,
- .csum_type = csum_type,
- .csum_lo = csum.lo,
- .csum_hi = *((__le16 *) &csum.hi),
+ common_fields(new),
+ .nonce = new.nonce,
+ .csum_lo = new.csum.lo,
+ .csum_hi = *((__le16 *) &new.csum.hi),
};
return;
}
- if (bch_crc_bytes[csum_type] <= 16 &&
- uncompressed_size <= CRC128_SIZE_MAX &&
- nonce <= CRC128_NONCE_MAX) {
+ if (bch_crc_bytes[new.csum_type] <= 16 &&
+ new.uncompressed_size <= CRC128_SIZE_MAX &&
+ new.nonce <= CRC128_NONCE_MAX) {
crc->crc128 = (struct bch_extent_crc128) {
.type = 1 << BCH_EXTENT_ENTRY_crc128,
- ._compressed_size = compressed_size - 1,
- ._uncompressed_size = uncompressed_size - 1,
- .offset = 0,
- .nonce = nonce,
- .compression_type = compression_type,
- .csum_type = csum_type,
- .csum = csum,
+ common_fields(new),
+ .nonce = new.nonce,
+ .csum = new.csum,
};
return;
}
-
+#undef common_fields
BUG();
}
void bch2_extent_crc_append(struct bkey_i_extent *e,
- unsigned compressed_size,
- unsigned uncompressed_size,
- unsigned compression_type,
- unsigned nonce,
- struct bch_csum csum, unsigned csum_type)
+ struct bch_extent_crc_unpacked new)
{
- union bch_extent_crc *crc;
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
- BUG_ON(compressed_size > uncompressed_size);
- BUG_ON(uncompressed_size != e->k.size);
- BUG_ON(!compressed_size || !uncompressed_size);
+ BUG_ON(new.compressed_size > new.uncompressed_size);
+ BUG_ON(new.live_size != e->k.size);
+ BUG_ON(!new.compressed_size || !new.uncompressed_size);
/*
* Look up the last crc entry, so we can check if we need to add
* another:
*/
- extent_for_each_crc(extent_i_to_s(e), crc)
+ extent_for_each_crc(extent_i_to_s(e), crc, i)
;
- if (!crc && !csum_type && !compression_type)
- return;
-
- if (crc &&
- crc_compressed_size(&e->k, crc) == compressed_size &&
- crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
- crc_offset(crc) == 0 &&
- crc_nonce(crc) == nonce &&
- crc_csum_type(crc) == csum_type &&
- crc_compression_type(crc) == compression_type &&
- crc_csum(crc).lo == csum.lo &&
- crc_csum(crc).hi == csum.hi)
+ if (!memcmp(&crc, &new, sizeof(crc)))
return;
- bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
- compressed_size,
- uncompressed_size,
- compression_type,
- nonce, csum, csum_type);
+ bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
__extent_entry_push(e);
}
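+/*
+ * Mark enough pointers cached to bring the extent's durability down to
+ * @nr_desired_replicas, preferring to drop pointers to devices outside
+ * @target:
+ */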
void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e,
- unsigned nr_cached)
+ unsigned nr_desired_replicas,
+ unsigned target)
{
struct bch_extent_ptr *ptr;
- bool have_higher_tier;
- unsigned tier = 0;
+ int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
- if (!nr_cached)
+ if (extra <= 0)
return;
- do {
- have_higher_tier = false;
-
- extent_for_each_ptr(e, ptr) {
- if (!ptr->cached &&
- PTR_TIER(c, ptr) == tier) {
- ptr->cached = true;
- nr_cached--;
- if (!nr_cached)
- return;
- }
+ extent_for_each_ptr(e, ptr) {
+ int n = bch2_extent_ptr_durability(c, ptr);
- if (PTR_TIER(c, ptr) > tier)
- have_higher_tier = true;
+ if (n && n <= extra &&
+ !dev_in_target(c->devs[ptr->dev], target)) {
+ ptr->cached = true;
+ extra -= n;
}
+ }
+
+ extent_for_each_ptr(e, ptr) {
+ int n = bch2_extent_ptr_durability(c, ptr);
- tier++;
- } while (have_higher_tier);
+ if (n && n <= extra) {
+ ptr->cached = true;
+ extra -= n;
+ }
+ }
}
/*
- * This picks a non-stale pointer, preferabbly from a device other than
- * avoid. Avoid can be NULL, meaning pick any. If there are no non-stale
- * pointers to other devices, it will still pick a pointer from avoid.
- * Note that it prefers lowered-numbered pointers to higher-numbered pointers
- * as the pointers are sorted by tier, hence preferring pointers to tier 0
- * rather than pointers to tier 1.
+ * This picks a non-stale pointer, preferably from a device not in @avoid.
+ * @avoid may be NULL, in which case a pointer to any device may be picked.
*/
-void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
- struct bch_dev *avoid,
- struct extent_pick_ptr *ret)
+void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *ret)
{
struct bkey_s_c_extent e;
- const union bch_extent_crc *crc;
- const struct bch_extent_ptr *ptr;
switch (k.k->type) {
case KEY_TYPE_DELETED:
e = bkey_s_c_to_extent(k);
ret->ca = NULL;
- extent_for_each_ptr_crc(e, ptr, crc) {
- struct bch_dev *ca = c->devs[ptr->dev];
-
- if (ptr->cached && ptr_stale(ca, ptr))
- continue;
-
- if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
- continue;
-
- if (ret->ca &&
- (ca == avoid ||
- ret->ca->mi.tier < ca->mi.tier))
- continue;
-
- if (!percpu_ref_tryget(&ca->io_ref))
- continue;
-
- if (ret->ca)
- percpu_ref_put(&ret->ca->io_ref);
-
- *ret = (struct extent_pick_ptr) {
- .crc = crc_to_128(e.k, crc),
- .ptr = *ptr,
- .ca = ca,
- };
- }
+ extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
if (!ret->ca && !bkey_extent_is_cached(e.k))
ret->ca = ERR_PTR(-EIO);
extent_for_each_entry(el, en_l) {
struct bch_extent_ptr *lp, *rp;
- unsigned bucket_size;
+ struct bch_dev *ca;
en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
return BCH_MERGE_NOMERGE;
/* We don't allow extents to straddle buckets: */
- bucket_size = c->devs[lp->dev]->mi.bucket_size;
+ ca = bch_dev_bkey_exists(c, lp->dev);
- if ((lp->offset & ~((u64) bucket_size - 1)) !=
- (rp->offset & ~((u64) bucket_size - 1)))
+ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
return BCH_MERGE_NOMERGE;
}
struct bkey_packed *k, struct bkey uk,
bool check, bool could_pack)
{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
BUG_ON(!bkey_deleted(k));
return !bkey_packed(k) || could_pack;
} else {
uk.p = new_pos;
- extent_save(b, node_iter, k, &uk);
- bch2_bset_fix_invalidated_key(b, t, k);
- bch2_btree_node_iter_fix(iter, b, node_iter, t,
- k, k->u64s, k->u64s);
+ extent_save(l->b, &l->iter, k, &uk);
+ bch2_bset_fix_invalidated_key(l->b, t, k);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
+ k, k->u64s, k->u64s);
return true;
}
}
static bool extent_merge_do_overlapping(struct btree_iter *iter,
struct bkey *m, bool back_merge)
{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree *b = l->b;
+ struct btree_node_iter *node_iter = &l->iter;
struct bset_tree *t;
struct bkey_packed *k;
struct bkey uk;
struct bkey_packed *r,
bool back_merge)
{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree *b = iter->l[0].b;
+ struct btree_node_iter *node_iter = &iter->l[0].iter;
const struct bkey_format *f = &b->format;
struct bset_tree *t = bset_tree_last(b);
struct bkey_packed *m;
if (back_merge)
bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
- bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
- t, m, m->u64s, m->u64s);
+ bch2_btree_node_iter_fix(iter, b, node_iter,
+ t, m, m->u64s, m->u64s);
if (!back_merge)
bkey_copy(packed_to_bkey(l), &li.k);
extent_i_save(b, m, &li.k);
bch2_bset_fix_invalidated_key(b, t, m);
- bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
- t, m, m->u64s, m->u64s);
+ bch2_btree_node_iter_fix(iter, b, node_iter,
+ t, m, m->u64s, m->u64s);
return true;
default:
BUG();
}
}
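+/*
+ * Returns 0 if every extent in the range [@pos, @pos + @size) is fully
+ * allocated, -ENOSPC otherwise:
+ */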
+int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+{
+ struct btree_iter iter;
+ struct bpos end = pos;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ end.offset += size;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
+ BTREE_ITER_SLOTS, k) {
+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+ break;
+
+ if (!bch2_extent_is_fully_allocated(k)) {
+ ret = -ENOSPC;
+ break;
+ }
+ }
+ bch2_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
const struct bkey_ops bch2_bkey_extent_ops = {
.key_invalid = bch2_extent_invalid,
.key_debugcheck = bch2_extent_debugcheck,