X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=database-builder.cpp;h=75b96f8c02a6c4108f2b0497e2aa5881167db80a;hb=fe6f818642f912d3d3c3634f6346dbdfd314c433;hp=439e25a59e88980ef6a2f8f7bb9d38c43f7134ec;hpb=dfa0744dde7ac45840573a6d6835156ab62fd5ff;p=plocate diff --git a/database-builder.cpp b/database-builder.cpp index 439e25a..75b96f8 100644 --- a/database-builder.cpp +++ b/database-builder.cpp @@ -5,6 +5,9 @@ #include #include +#ifdef HAS_ENDIAN_H +#include +#endif #include #include #include @@ -26,20 +29,14 @@ constexpr unsigned num_overflow_slots = 16; string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf); -static inline uint32_t read_unigram(const string_view s, size_t idx) -{ - if (idx < s.size()) { - return (unsigned char)s[idx]; - } else { - return 0; - } -} - +// NOTE: Will read one byte past the end of the trigram, but it's OK, +// since we always call it from contexts where there's a terminating zero byte. static inline uint32_t read_trigram(const string_view s, size_t start) { - return read_unigram(s, start) | - (read_unigram(s, start + 1) << 8) | - (read_unigram(s, start + 2) << 16); + uint32_t trgm; + memcpy(&trgm, s.data() + start, sizeof(trgm)); + trgm = le32toh(trgm); + return trgm & 0xffffff; } class PostingListBuilder { @@ -48,7 +45,7 @@ public: inline void add_first_docid(uint32_t docid); void finish(); - string encoded; + vector encoded; size_t num_docids = 0; private: @@ -67,8 +64,6 @@ void PostingListBuilder::add_docid(uint32_t docid) return; } - assert(num_docids != 0); - pending_deltas.push_back(docid - last_docid - 1); last_docid = docid; if (pending_deltas.size() == 128) { @@ -96,7 +91,7 @@ void PostingListBuilder::finish() // No interleaving for partial blocks. unsigned char buf[P4NENC_BOUND(128)]; unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, buf); - encoded.append(reinterpret_cast(buf), reinterpret_cast(end)); + encoded.insert(encoded.end(), buf, end); } void PostingListBuilder::append_block() @@ -104,14 +99,14 @@ void PostingListBuilder::append_block() unsigned char buf[P4NENC_BOUND(128)]; assert(pending_deltas.size() == 128); unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), 128, /*interleaved=*/true, buf); - encoded.append(reinterpret_cast(buf), reinterpret_cast(end)); + encoded.insert(encoded.end(), buf, end); } void PostingListBuilder::write_header(uint32_t docid) { unsigned char buf[P4NENC_BOUND(1)]; unsigned char *end = write_baseval(docid, buf); - encoded.append(reinterpret_cast(buf), end - buf); + encoded.insert(encoded.end(), buf, end); } void DictionaryBuilder::add_file(string filename, dir_time) @@ -617,7 +612,7 @@ void DatabaseBuilder::finish_corpus() continue; } - const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded; + const vector &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded; offset += encoded.size(); } @@ -631,7 +626,7 @@ void DatabaseBuilder::finish_corpus() if (hashtable[i].num_docids == 0) { continue; } - const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded; + const vector &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded; fwrite(encoded.data(), encoded.size(), 1, outfp); }