#include "db.h"
+#include "dprintf.h"
#include "turbopfor-encode.h"
#include <algorithm>
#include <zstd.h>
// Evaluates to (n + 127) / 128 + (n + 32) * sizeof(uint32_t) for an element count n.
// NOTE(review): presumably the worst-case output size in bytes when encoding n
// 32-bit values with the TurboPFor p4nenc routines -- confirm against turbopfor-encode.h.
// Every use of the parameter is parenthesized so that an expression argument
// (e.g. `cond ? a : b` or `x | y`) is evaluated as a unit before the arithmetic.
#define P4NENC_BOUND(n) (((n) + 127) / 128 + ((n) + 32) * sizeof(uint32_t))
-#define dprintf(...)
-//#define dprintf(...) fprintf(stderr, __VA_ARGS__);
#define NUM_TRIGRAMS 16777216
string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf);
constexpr unsigned num_overflow_slots = 16;
+bool use_debug = false;
static inline uint32_t read_unigram(const string_view s, size_t idx)
{
string buf;
buf.resize(buf_size);
size_t ret = ZDICT_trainFromBuffer(&buf[0], buf_size, dictionary_buf.data(), lengths.data(), lengths.size());
- dprintf(stderr, "Sampled %zu bytes in %zu blocks, built a dictionary of size %zu\n", dictionary_buf.size(), lengths.size(), ret);
+ dprintf("Sampled %zu bytes in %zu blocks, built a dictionary of size %zu\n", dictionary_buf.size(), lengths.size(), ret);
buf.resize(ret);
sampled_blocks.clear();
}
return *invindex[trgm];
}
+ size_t num_trigrams() const;
private:
unique_ptr<PostingListBuilder *[]> invindex;
++num_blocks;
}
+size_t Corpus::num_trigrams() const  // Returns how many entries of invindex are non-null.
+{
+ size_t num = 0;  // running count of non-null entries
+ for (unsigned trgm = 0; trgm < NUM_TRIGRAMS; ++trgm) {  // visits every index below NUM_TRIGRAMS
+ if (invindex[trgm] != nullptr) {  // null entries are not counted
+ ++num;
+ }
+ }
+ return num;
+}
+
string read_cstr(FILE *fp)
{
string ret;
}
}
-void read_mlocate(const char *filename, DatabaseReceiver *receiver)
+void read_mlocate(FILE *fp, DatabaseReceiver *receiver)  // Seeks fp to offset 0, then calls handle_directory(fp, receiver) until feof(fp) reports end of file.
{
- FILE *fp = fopen(filename, "rb");
- if (fp == nullptr) {
- perror(filename);
+ if (fseek(fp, 0, SEEK_SET) != 0) {  // start from the beginning of the stream on every call, so one open FILE* can be passed to several calls
+ perror("fseek");  // report the failed seek on stderr
exit(1);  // a failed seek terminates the program
}
while (!feof(fp)) {  // NOTE(review): feof() only becomes true after a read has hit EOF; presumably handle_directory copes with that -- confirm
handle_directory(fp, receiver);
}
- fclose(fp);
}  // fp is left open here; closing it is up to the caller
string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf)
{
steady_clock::time_point start __attribute__((unused)) = steady_clock::now();
+ FILE *infp = fopen(infile, "rb");
+ if (infp == nullptr) {
+ perror(infile);
+ exit(1);
+ }
+
umask(0027);
FILE *outfp = fopen(outfile, "wb");
+ if (outfp == nullptr) {
+ perror(outfile);
+ exit(1);
+ }
// Write the header.
Header hdr;
// dictionary size is ~100 kB, but 1 kB seems to actually compress better for us,
// and decompress just as fast.
DictionaryBuilder builder(/*blocks_to_keep=*/1000, block_size);
- read_mlocate(infile, &builder);
+ read_mlocate(infp, &builder);
string dictionary = builder.train(1024);
ZSTD_CDict *cdict = ZSTD_createCDict(dictionary.data(), dictionary.size(), /*level=*/6);
hdr.zstd_dictionary_length_bytes = dictionary.size();
Corpus corpus(outfp, block_size, cdict);
- read_mlocate(infile, &corpus);
+ read_mlocate(infp, &corpus);
+ fclose(infp);
+
if (false) { // To read a plain text file.
FILE *fp = fopen(infile, "r");
while (!feof(fp)) {
trigrams += pl_builder.num_docids;
bytes_for_posting_lists += pl_builder.encoded.size();
}
+ size_t num_trigrams = corpus.num_trigrams();
dprintf("%zu files, %zu different trigrams, %zu entries, avg len %.2f, longest %zu\n",
- corpus.num_files, corpus.invindex.size(), trigrams, double(trigrams) / corpus.invindex.size(), longest_posting_list);
+ corpus.num_files, num_trigrams, trigrams, double(trigrams) / num_trigrams, longest_posting_list);
dprintf("%zu bytes used for posting lists (%.2f bits/entry)\n", bytes_for_posting_lists, 8 * bytes_for_posting_lists / double(trigrams));
dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(steady_clock::now() - start).count());
{ "block-size", required_argument, 0, 'b' },
{ "help", no_argument, 0, 'h' },
{ "version", no_argument, 0, 'V' },
+ { "debug", no_argument, 0, 'D' }, // Not documented.
{ 0, 0, 0, 0 }
};
setlocale(LC_ALL, "");
for (;;) {
int option_index = 0;
- int c = getopt_long(argc, argv, "b:hV", long_options, &option_index);
+ int c = getopt_long(argc, argv, "b:hVD", long_options, &option_index);
if (c == -1) {
break;
}
case 'h':
usage();
exit(0);
- case 'v':
+ case 'V':
version();
exit(0);
+ case 'D':
+ use_debug = true;
+ break;
default:
exit(1);
}