#include "db.h"
+#include "dprintf.h"
#include "turbopfor-encode.h"
#include <algorithm>
#include <zstd.h>
#define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t))
-#define dprintf(...)
-//#define dprintf(...) fprintf(stderr, __VA_ARGS__);
#define NUM_TRIGRAMS 16777216
string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf);
constexpr unsigned num_overflow_slots = 16;
+bool use_debug = false;
static inline uint32_t read_unigram(const string_view s, size_t idx)
{
string buf;
buf.resize(buf_size);
size_t ret = ZDICT_trainFromBuffer(&buf[0], buf_size, dictionary_buf.data(), lengths.data(), lengths.size());
- dprintf(stderr, "Sampled %zu bytes in %zu blocks, built a dictionary of size %zu\n", dictionary_buf.size(), lengths.size(), ret);
+ if (ret == size_t(-1)) {
+ return "";
+ }
+ dprintf("Sampled %zu bytes in %zu blocks, built a dictionary of size %zu\n", dictionary_buf.size(), lengths.size(), ret);
buf.resize(ret);
sampled_blocks.clear();
}
return *invindex[trgm];
}
+ size_t num_trigrams() const;
private:
unique_ptr<PostingListBuilder *[]> invindex;
++num_blocks;
}
+// Counts how many of the NUM_TRIGRAMS slots in invindex are non-null.
+size_t Corpus::num_trigrams() const
+{
+	size_t populated = 0;
+	unsigned slot = 0;
+	while (slot < NUM_TRIGRAMS) {
+		// A null slot does not contribute to the count.
+		populated += (invindex[slot] != nullptr) ? 1 : 0;
+		++slot;
+	}
+	return populated;
+}
+
string read_cstr(FILE *fp)
{
string ret;
}
}
-void read_mlocate(const char *filename, DatabaseReceiver *receiver)
+// Reads newline-separated entries from fp and hands each one, with its
+// trailing newline removed, to receiver->add_file().
+//
+// fp is rewound to the beginning first; if fseek() fails, the error is
+// printed with perror() and the process exits with status 1. A read error
+// from fgets() is reported the same way instead of being mistaken for end
+// of file. A final line without a trailing newline is still delivered.
+void read_plaintext(FILE *fp, DatabaseReceiver *receiver)
+{
+	if (fseek(fp, 0, SEEK_SET) != 0) {
+		perror("fseek");
+		exit(1);
+	}
+
+	char buf[1024];
+
+	// Fetches the next chunk into buf (fgets stops after a newline or when
+	// the buffer is full). Returns false at end of file; exits on a read error.
+	auto read_chunk = [fp, &buf]() -> bool {
+		if (fgets(buf, sizeof(buf), fp) != nullptr) {
+			return true;
+		}
+		if (ferror(fp)) {
+			perror("fgets");
+			exit(1);
+		}
+		return false;
+	};
+
+	while (!feof(fp) && read_chunk()) {
+		string s(buf);
+		// s is empty when the chunk starts with a NUL byte, so it must be
+		// checked before back() is called on it.
+		while (!s.empty() && s.back() != '\n' && !feof(fp) && read_chunk()) {
+			// The string was longer than the buffer, so read again.
+			s += buf;
+		}
+		if (!s.empty() && s.back() == '\n')
+			s.pop_back();
+		receiver->add_file(move(s));
+	}
+}
+
+// Feeds an already-open stream to receiver: rewinds fp to the start, then
+// calls handle_directory(fp, receiver) repeatedly until feof(fp) is set.
+// If fseek() fails, the error is printed with perror() and the process
+// exits with status 1. fp is neither opened nor closed here; that is left
+// to the caller.
+void read_mlocate(FILE *fp, DatabaseReceiver *receiver)
{
- FILE *fp = fopen(filename, "rb");
- if (fp == nullptr) {
- perror(filename);
+ if (fseek(fp, 0, SEEK_SET) != 0) {
+ perror("fseek");
exit(1);
}
+ // NOTE(review): handle_directory is defined outside this view; presumably
+ // it consumes one directory record per call — confirm.
while (!feof(fp)) {
handle_directory(fp, receiver);
}
- fclose(fp);
}
string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf)
return ht;
}
-void do_build(const char *infile, const char *outfile, int block_size)
+void do_build(const char *infile, const char *outfile, int block_size, bool plaintext)
{
- steady_clock::time_point start __attribute__((unused)) = steady_clock::now();
+ steady_clock::time_point start = steady_clock::now();
+
+ FILE *infp = fopen(infile, "rb");
+ if (infp == nullptr) {
+ perror(infile);
+ exit(1);
+ }
umask(0027);
FILE *outfp = fopen(outfile, "wb");
+ if (outfp == nullptr) {
+ perror(outfile);
+ exit(1);
+ }
// Write the header.
Header hdr;
// dictionary size is ~100 kB, but 1 kB seems to actually compress better for us,
// and decompress just as fast.
DictionaryBuilder builder(/*blocks_to_keep=*/1000, block_size);
- read_mlocate(infile, &builder);
+ if (plaintext) {
+ read_plaintext(infp, &builder);
+ } else {
+ read_mlocate(infp, &builder);
+ }
string dictionary = builder.train(1024);
ZSTD_CDict *cdict = ZSTD_createCDict(dictionary.data(), dictionary.size(), /*level=*/6);
hdr.zstd_dictionary_length_bytes = dictionary.size();
Corpus corpus(outfp, block_size, cdict);
- read_mlocate(infile, &corpus);
- if (false) { // To read a plain text file.
- FILE *fp = fopen(infile, "r");
- while (!feof(fp)) {
- char buf[1024];
- if (fgets(buf, 1024, fp) == nullptr || feof(fp)) {
- break;
- }
- string s(buf);
- if (s.back() == '\n')
- s.pop_back();
- corpus.add_file(move(s));
- }
- fclose(fp);
+ if (plaintext) {
+ read_plaintext(infp, &corpus);
+ } else {
+ read_mlocate(infp, &corpus);
}
+ fclose(infp);
+
corpus.flush_block();
dprintf("Read %zu files from %s\n", corpus.num_files, infile);
hdr.num_docids = corpus.filename_blocks.size();
trigrams += pl_builder.num_docids;
bytes_for_posting_lists += pl_builder.encoded.size();
}
+ size_t num_trigrams = corpus.num_trigrams();
dprintf("%zu files, %zu different trigrams, %zu entries, avg len %.2f, longest %zu\n",
- corpus.num_files, corpus.invindex.size(), trigrams, double(trigrams) / corpus.invindex.size(), longest_posting_list);
+ corpus.num_files, num_trigrams, trigrams, double(trigrams) / num_trigrams, longest_posting_list);
dprintf("%zu bytes used for posting lists (%.2f bits/entry)\n", bytes_for_posting_lists, 8 * bytes_for_posting_lists / double(trigrams));
dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(steady_clock::now() - start).count());
fwrite(&hdr, sizeof(hdr), 1, outfp);
fclose(outfp);
- size_t total_bytes __attribute__((unused)) = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+ size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
dprintf("Block size: %7d files\n", block_size);
dprintf("Dictionary: %'7.1f MB\n", dictionary.size() / 1048576.0);
"Normally, the destination should be /var/lib/mlocate/plocate.db.\n"
"\n"
" -b, --block-size SIZE number of filenames to store in each block (default 32)\n"
+ " -p, --plaintext input is a plaintext file, not an mlocate database\n"
" --help print this help\n"
" --version print version information\n");
}
{
static const struct option long_options[] = {
{ "block-size", required_argument, 0, 'b' },
+ { "plaintext", no_argument, 0, 'p' },
{ "help", no_argument, 0, 'h' },
{ "version", no_argument, 0, 'V' },
+ { "debug", no_argument, 0, 'D' }, // Not documented.
{ 0, 0, 0, 0 }
};
int block_size = 32;
+ bool plaintext = false;
setlocale(LC_ALL, "");
for (;;) {
int option_index = 0;
- int c = getopt_long(argc, argv, "b:hV", long_options, &option_index);
+ int c = getopt_long(argc, argv, "b:hpVD", long_options, &option_index);
if (c == -1) {
break;
}
case 'b':
block_size = atoi(optarg);
break;
+ case 'p':
+ plaintext = true;
+ break;
case 'h':
usage();
exit(0);
- case 'v':
+ case 'V':
version();
exit(0);
+ case 'D':
+ use_debug = true;
+ break;
default:
exit(1);
}
exit(1);
}
- do_build(argv[optind], argv[optind + 1], block_size);
+ do_build(argv[optind], argv[optind + 1], block_size, plaintext);
exit(EXIT_SUCCESS);
}