-string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf)
-{
- static ZSTD_CCtx *ctx = nullptr;
- if (ctx == nullptr) {
- ctx = ZSTD_createCCtx();
- }
-
- size_t max_size = ZSTD_compressBound(src.size());
- if (tempbuf->size() < max_size) {
- tempbuf->resize(max_size);
- }
- size_t size;
- if (cdict == nullptr) {
- size = ZSTD_compressCCtx(ctx, &(*tempbuf)[0], max_size, src.data(), src.size(), /*level=*/6);
- } else {
- size = ZSTD_compress_usingCDict(ctx, &(*tempbuf)[0], max_size, src.data(), src.size(), cdict);
- }
- return string(tempbuf->data(), size);
-}
-
// Returns whether <x> is a prime number, by trial division.
//
// Fixes over the previous version: 2 and 3 were reported as composite
// (they hit the %2/%3 test), and 0/1 were reported as prime (the factor
// loop never ran). Also steps over even factors and uses a 64-bit product
// instead of floating-point sqrt for the loop bound.
bool is_prime(uint32_t x)
{
	if (x < 2) {
		return false;  // 0 and 1 are not prime.
	}
	if (x == 2 || x == 3) {
		return true;
	}
	if ((x % 2) == 0 || (x % 3) == 0) {
		return false;
	}
	// Odd candidates only; the 64-bit multiply avoids uint32_t overflow
	// for x close to 2^32.
	for (uint32_t factor = 5; uint64_t(factor) * factor <= x; factor += 2) {
		if ((x % factor) == 0) {
			return false;
		}
	}
	return true;
}
-
-uint32_t next_prime(uint32_t x)
-{
- if ((x % 2) == 0) {
- ++x;
- }
- while (!is_prime(x)) {
- x += 2;
- }
- return x;
-}
-
// Builds the trigram hash table over <all_trigrams>, using Robin Hood hashing
// with linear probing. <ht_size> is the number of regular buckets;
// <num_overflow_slots> extra slots at the end give probing room to spill into.
// Posting-list offsets are not known yet, so every entry's offset is left at
// zero (filled in later by the caller). Returns nullptr if an insertion would
// probe past the overflow area; the caller is expected to retry with a larger
// <ht_size>.
unique_ptr<Trigram[]> create_hashtable(Corpus &corpus, const vector<uint32_t> &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots)
{
	unique_ptr<Trigram[]> ht(new Trigram[ht_size + num_overflow_slots + 1]);  // 1 for the sentinel element at the end.
	// Mark every slot as empty; num_docids == 0 is the "empty" marker used below.
	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
		ht[i].trgm = uint32_t(-1);
		ht[i].num_docids = 0;
		ht[i].offset = 0;
	}
	for (uint32_t trgm : all_trigrams) {
		// We don't know offset yet, so set it to zero.
		Trigram to_insert{ trgm, uint32_t(corpus.get_pl_builder(trgm).num_docids), 0 };

		uint32_t bucket = hash_trigram(trgm, ht_size);
		unsigned distance = 0;
		while (ht[bucket].num_docids != 0) {
			// Robin Hood hashing; reduces the longest distance by a lot.
			// If the current occupant sits closer to its home bucket than we
			// do to ours, evict it, take its slot, and continue probing on
			// its behalf instead.
			unsigned other_distance = bucket - hash_trigram(ht[bucket].trgm, ht_size);
			if (distance > other_distance) {
				swap(to_insert, ht[bucket]);
				distance = other_distance;
			}

			++bucket, ++distance;
			if (distance > num_overflow_slots) {
				// Ran out of probing room; signal the caller to grow the table.
				return nullptr;
			}
		}
		ht[bucket] = to_insert;
	}
	return ht;
}
-
// Writes out a plocate-style database file. The constructor immediately
// writes a placeholder header (marked broken via version = -1) plus the
// optional zstd dictionary; finish_corpus() writes the remaining sections
// and then rewrites the header as valid.
class DatabaseBuilder {
public:
	DatabaseBuilder(const char *outfile, int block_size, string dictionary);
	Corpus *start_corpus();
	void finish_corpus();

private:
	FILE *outfp;  // Output database file; closed by finish_corpus().
	Header hdr;  // In-memory header; rewritten at offset 0 when finished.
	const int block_size;  // Passed through to Corpus; presumably files per compressed block — confirm against Corpus.
	steady_clock::time_point corpus_start;  // Set by start_corpus(), used for timing printouts.
	Corpus *corpus = nullptr;  // Corpus currently being built (raw owning pointer).
	ZSTD_CDict *cdict = nullptr;  // Compression dictionary, if a dictionary was given.
};
-
// Opens <outfile> for writing and immediately writes a placeholder header
// (version = -1, offsets unknown) to reserve space; finish_corpus() seeks
// back and rewrites the real header. If <dictionary> is non-empty, it is
// written right after the header and a compression dictionary is built
// from it.
DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dictionary)
	: block_size(block_size)
{
	// NOTE(review): process-global side effect; restricts group/other
	// permissions on the newly created file.
	umask(0027);
	outfp = fopen(outfile, "wb");
	if (outfp == nullptr) {
		perror(outfile);
		exit(1);
	}

	// Write the header. This copy is only a placeholder to reserve the
	// space; version = -1 marks the file as broken until finish_corpus()
	// rewrites it with the real values.
	memcpy(hdr.magic, "\0plocate", 8);
	hdr.version = -1;  // Mark as broken.
	hdr.hashtable_size = 0;  // Not known yet.
	hdr.extra_ht_slots = num_overflow_slots;  // File-scope constant, defined outside this chunk.
	hdr.num_docids = 0;
	hdr.hash_table_offset_bytes = -1;  // We don't know these offsets yet.
	hdr.max_version = 1;
	hdr.filename_index_offset_bytes = -1;
	hdr.zstd_dictionary_length_bytes = -1;
	fwrite(&hdr, sizeof(hdr), 1, outfp);

	// The dictionary fields set below only update the in-memory header;
	// they reach the file when finish_corpus() rewrites it.
	if (dictionary.empty()) {
		hdr.zstd_dictionary_offset_bytes = 0;
		hdr.zstd_dictionary_length_bytes = 0;
	} else {
		hdr.zstd_dictionary_offset_bytes = ftell(outfp);
		fwrite(dictionary.data(), dictionary.size(), 1, outfp);
		hdr.zstd_dictionary_length_bytes = dictionary.size();
		cdict = ZSTD_createCDict(dictionary.data(), dictionary.size(), /*level=*/6);
	}
}
-
-Corpus *DatabaseBuilder::start_corpus()
-{
- corpus_start = steady_clock::now();
- corpus = new Corpus(outfp, block_size, cdict);
- return corpus;
-}
-
// Finalizes the database: flushes the last filename block, writes the
// filename index, finishes encoding the posting lists, builds and writes
// the trigram hash table followed by the posting lists, and finally seeks
// back and rewrites the header (marking the file as valid).
void DatabaseBuilder::finish_corpus()
{
	corpus->flush_block();
	hdr.num_docids = corpus->filename_blocks.size();

	// Stick an empty block at the end as sentinel.
	corpus->filename_blocks.push_back(ftell(outfp));
	const size_t bytes_for_filenames = corpus->filename_blocks.back() - corpus->filename_blocks.front();

	// Write the offsets to the filenames.
	hdr.filename_index_offset_bytes = ftell(outfp);
	const size_t bytes_for_filename_index = corpus->filename_blocks.size() * sizeof(uint64_t);
	// NOTE(review): size/count arguments are in the reverse of the usual
	// fwrite() convention; the byte count written is the same either way,
	// and the (unchecked) return value is not used.
	fwrite(corpus->filename_blocks.data(), corpus->filename_blocks.size(), sizeof(uint64_t), outfp);
	// Release the index memory; we are done with it.
	corpus->filename_blocks.clear();
	corpus->filename_blocks.shrink_to_fit();

	// Finish up encoding the posting lists, gathering statistics as we go.
	size_t trigrams = 0, longest_posting_list = 0;
	size_t bytes_for_posting_lists = 0;
	for (unsigned trgm = 0; trgm < NUM_TRIGRAMS; ++trgm) {
		if (!corpus->seen_trigram(trgm))
			continue;
		PostingListBuilder &pl_builder = corpus->get_pl_builder(trgm);
		pl_builder.finish();
		longest_posting_list = max(longest_posting_list, pl_builder.num_docids);
		trigrams += pl_builder.num_docids;
		bytes_for_posting_lists += pl_builder.encoded.size();
	}
	size_t num_trigrams = corpus->num_trigrams();
	dprintf("%zu files, %zu different trigrams, %zu entries, avg len %.2f, longest %zu\n",
	        corpus->num_files, num_trigrams, trigrams, double(trigrams) / num_trigrams, longest_posting_list);
	dprintf("%zu bytes used for posting lists (%.2f bits/entry)\n", bytes_for_posting_lists, 8 * bytes_for_posting_lists / double(trigrams));

	dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(steady_clock::now() - corpus_start).count());

	// Find the used trigrams.
	vector<uint32_t> all_trigrams;
	for (unsigned trgm = 0; trgm < NUM_TRIGRAMS; ++trgm) {
		if (corpus->seen_trigram(trgm)) {
			all_trigrams.push_back(trgm);
		}
	}

	// Create the hash table. If create_hashtable() runs out of overflow
	// slots (returns nullptr), retry with the next prime ~5% larger.
	unique_ptr<Trigram[]> hashtable;
	uint32_t ht_size = next_prime(all_trigrams.size());
	for (;;) {
		hashtable = create_hashtable(*corpus, all_trigrams, ht_size, num_overflow_slots);
		if (hashtable == nullptr) {
			dprintf("Failed creating hash table of size %u, increasing by 5%% and trying again.\n", ht_size);
			ht_size = next_prime(ht_size * 1.05);
		} else {
			dprintf("Created hash table of size %u.\n\n", ht_size);
			break;
		}
	}

	// Find the offsets for each posting list; the lists are laid out
	// immediately after the hash table, in slot order.
	size_t bytes_for_hashtable = (ht_size + num_overflow_slots + 1) * sizeof(Trigram);
	uint64_t offset = ftell(outfp) + bytes_for_hashtable;
	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
		hashtable[i].offset = offset;  // Needs to be there even for empty slots.
		if (hashtable[i].num_docids == 0) {
			continue;
		}

		const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
		offset += encoded.size();
	}

	// Write the hash table.
	hdr.hash_table_offset_bytes = ftell(outfp);
	hdr.hashtable_size = ht_size;
	fwrite(hashtable.get(), ht_size + num_overflow_slots + 1, sizeof(Trigram), outfp);

	// Write the actual posting lists, in the same slot order the offsets
	// above were assigned in.
	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
		if (hashtable[i].num_docids == 0) {
			continue;
		}
		const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
		fwrite(encoded.data(), encoded.size(), 1, outfp);
	}

	// Rewind, and write the updated header. Only now does version become 1,
	// so a crash before this point leaves the file marked as broken.
	hdr.version = 1;
	fseek(outfp, 0, SEEK_SET);
	fwrite(&hdr, sizeof(hdr), 1, outfp);
	fclose(outfp);

	// NOTE(review): <corpus> (allocated in start_corpus()) is never deleted
	// here — possibly deliberate for a short-lived process; confirm.
	size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);

	dprintf("Block size: %7d files\n", block_size);
	dprintf("Dictionary: %'7.1f MB\n", hdr.zstd_dictionary_length_bytes / 1048576.0);
	dprintf("Hash table: %'7.1f MB\n", bytes_for_hashtable / 1048576.0);
	dprintf("Posting lists: %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
	dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
	dprintf("Filenames: %'7.1f MB\n", bytes_for_filenames / 1048576.0);
	dprintf("Total: %'7.1f MB\n", total_bytes / 1048576.0);
	dprintf("\n");
}
-