#include <algorithm>
#include <assert.h>
+#ifdef HAS_ENDIAN_H
+#include <endian.h>
+#endif
#include <fcntl.h>
#include <string.h>
#include <string_view>
string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf);
-static inline uint32_t read_unigram(const string_view s, size_t idx)
-{
- if (idx < s.size()) {
- return (unsigned char)s[idx];
- } else {
- return 0;
- }
-}
-
+// NOTE: Loads four bytes starting at s[start] with no bounds check, i.e. one byte past the end of the trigram.
+// That is OK only because we always call it from contexts where there's a terminating zero byte after the data.
static inline uint32_t read_trigram(const string_view s, size_t start)
{
- return read_unigram(s, start) |
- (read_unigram(s, start + 1) << 8) |
- (read_unigram(s, start + 2) << 16);
+ uint32_t trgm;
+ memcpy(&trgm, s.data() + start, sizeof(trgm)); // unchecked copy of sizeof(trgm) bytes: the three trigram bytes plus one extra
+ trgm = le32toh(trgm); // s[start] lands in bits 0-7, as in the removed version. NOTE(review): <endian.h> is only included under HAS_ENDIAN_H; confirm le32toh is defined otherwise
+ return trgm & 0xffffff; // keep the low 24 bits, discarding the extra fourth byte
}
class PostingListBuilder {
inline void add_first_docid(uint32_t docid);
void finish();
- string encoded;
+ vector<unsigned char> encoded;
size_t num_docids = 0;
private:
return;
}
- assert(num_docids != 0);
-
pending_deltas.push_back(docid - last_docid - 1);
last_docid = docid;
if (pending_deltas.size() == 128) {
// No interleaving for partial blocks.
unsigned char buf[P4NENC_BOUND(128)];
unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, buf);
- encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
+ encoded.insert(encoded.end(), buf, end);
}
void PostingListBuilder::append_block()
unsigned char buf[P4NENC_BOUND(128)];
assert(pending_deltas.size() == 128);
unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), 128, /*interleaved=*/true, buf);
- encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
+ encoded.insert(encoded.end(), buf, end);
}
// Encodes `docid` with write_baseval() into a stack buffer and appends the
// bytes it produced to `encoded`.
void PostingListBuilder::write_header(uint32_t docid)
{
unsigned char buf[P4NENC_BOUND(1)]; // scratch buffer; presumably P4NENC_BOUND(1) is an upper bound for one encoded value (TODO confirm)
unsigned char *end = write_baseval(docid, buf); // `end` is used below as one past the last byte written
- encoded.append(reinterpret_cast<char *>(buf), end - buf);
+ encoded.insert(encoded.end(), buf, end); // append [buf, end); the element type is now unsigned char, so no cast is needed
}
void DictionaryBuilder::add_file(string filename, dir_time)
continue;
}
- const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
+ const vector<unsigned char> &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
offset += encoded.size();
}
if (hashtable[i].num_docids == 0) {
continue;
}
- const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
+ const vector<unsigned char> &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
fwrite(encoded.data(), encoded.size(), 1, outfp);
}