git.sesse.net Git - plocate/blobdiff - parse_trigrams.cpp
Proof-of-concept of using ICU for strength-zero searches.
[plocate] / parse_trigrams.cpp
index 678f520fd03cc7c11c63b3cae7f77e712196c56f..97734e18cbd81656eb2c439741b29d390e521ed7 100644 (file)
@@ -1,11 +1,15 @@
 #include "parse_trigrams.h"
 
+#include "dprintf.h"
 #include "unique_sort.h"
 
 #include <assert.h>
+#include <memory>
 #include <string.h>
 #include <wctype.h>
 
+#include <unicode/coll.h>
+
 using namespace std;
 
 string print_td(const TrigramDisjunction &td)
@@ -171,6 +175,7 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
        // involving ICU or the likes.
        mbtowc(nullptr, 0, 0);
        const char *ptr = needle.c_str();
+       unique_ptr<char[]> buf(new char[MB_CUR_MAX]);
        while (*ptr != '\0') {
                wchar_t ch;
                int ret = mbtowc(&ch, ptr, strlen(ptr));
@@ -179,17 +184,16 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
                        exit(1);
                }
 
-               char buf[MB_CUR_MAX];
                vector<string> alt;
                alt.push_back(string(ptr, ret));
                ptr += ret;
                if (towlower(ch) != wint_t(ch)) {
-                       ret = wctomb(buf, towlower(ch));
-                       alt.push_back(string(buf, ret));
+                       ret = wctomb(buf.get(), towlower(ch));
+                       alt.push_back(string(buf.get(), ret));
                }
                if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
-                       ret = wctomb(buf, towupper(ch));
-                       alt.push_back(string(buf, ret));
+                       ret = wctomb(buf.get(), towupper(ch));
+                       alt.push_back(string(buf.get(), ret));
                }
                alternatives_for_cp.push_back(move(alt));
        }
@@ -283,6 +287,38 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
 
 void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjunction> *trigram_groups)
 {
+       // ICU...
+       string needle2;
+       for (char ch : needle) {
+               if (ch != '*') needle2.push_back(ch);
+       }
+
+       dprintf("posix locale = %s\n", setlocale(LC_CTYPE, NULL));
+       icu::Locale locale = icu::Locale::createCanonical(setlocale(LC_CTYPE, NULL));
+       dprintf("icu locale = %s\n", locale.getName());
+       UErrorCode status = U_ZERO_ERROR;
+       icu::Collator *coll = icu::Collator::createInstance(locale, status);
+       // FIXME check for failure
+       uint8_t needlebuf[1024];  // FIXME
+       coll->setStrength(icu::Collator::PRIMARY);
+       int len = coll->getSortKey(icu::UnicodeString::fromUTF8(needle2), needlebuf, sizeof(needlebuf));
+       dprintf("needlelen = %d (from ascii %zu, needle '%s')\n", len, needle2.size(), needle2.c_str());
+       for (size_t i = 0; i < len; ++i) {
+               dprintf(" %02x", needlebuf[i]);
+       }
+       dprintf("\n");
+       for (size_t i = 0; i < len - 3; ++i) {
+               uint32_t trgm = needlebuf[i] | (needlebuf[i + 1] << 8) | (needlebuf[i + 2] << 16);
+               dprintf("trgm = %06x\n", trgm);
+
+               TrigramDisjunction new_pt;
+               new_pt.remaining_trigrams_to_read = 1;
+               new_pt.trigram_alternatives.push_back(trgm);
+               new_pt.max_num_docids = 0;
+               trigram_groups->push_back(move(new_pt));
+       }
+       return;
+
        if (ignore_case) {
                parse_trigrams_ignore_case(needle, trigram_groups);
                return;