X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=turbopfor-encode.h;h=66396fd93c5f44b0828279a1eb4d45a4cfe3cb5b;hb=fd6198891d6fd9642effc0843fef6f23b991af3e;hp=532aa19d07b01faf6643f54153ac019387907d38;hpb=0f7ca618fb8a2e501fe68e1760de9ee716e37c40;p=plocate

diff --git a/turbopfor-encode.h b/turbopfor-encode.h
index 532aa19..66396fd 100644
--- a/turbopfor-encode.h
+++ b/turbopfor-encode.h
@@ -5,17 +5,20 @@
 // for encoding. It is _much_ slower than the reference implementation, but we
 // encode only during build, and most time in build is spent in other things
 // than encoding posting lists, so it only costs ~5-10% overall. Does not use
-// any special character sets, and generally isn't optimized at all.
+// any special instruction sets, and generally isn't optimized at all.
 //
-// It encodes about 0.01% denser than the reference encoder (averaged over
-// a real plocate corpus), probably since it has a slower but more precise
-// method for estimating the cost of a PFOR + varbyte block.
+// It encodes roughly about as dense as the reference encoder.
 
 #include "turbopfor-common.h"
 
+#include <algorithm>
 #include <assert.h>
+#ifdef HAS_ENDIAN_H
+#include <endian.h>
+#endif
 #include <limits.h>
 #include <stdint.h>
+#include <string.h>
 
 template<class Docid>
 void write_le(Docid val, void *out)
@@ -27,7 +30,7 @@ void write_le(Docid val, void *out)
 	} else if constexpr (sizeof(Docid) == 2) {
 		val = htole16(val);
 	} else if constexpr (sizeof(Docid) == 1) {
-		val = val;
+		// No change.
 	} else {
 		assert(false);
 	}
@@ -50,6 +53,12 @@ unsigned char *write_baseval(Docid in, unsigned char *out)
 		out[1] = in & 0xff;
 		out[2] = (in >> 8) & 0xff;
 		return out + 3;
+	} else if (in < 0x10000000) {
+		out[0] = (in >> 24) | 0xe0;
+		out[1] = (in >> 16) & 0xff;
+		out[2] = (in >> 8) & 0xff;
+		out[3] = in & 0xff;
+		return out + 4;
 	} else {
 		assert(false);  // Not implemented.
 	}
@@ -86,11 +95,20 @@ unsigned char *write_vb(Docid val, unsigned char *out)
 template<class Docid>
 inline unsigned num_bits(Docid x)
 {
+#ifdef __GNUC__
 	if (x == 0) {
 		return 0;
 	} else {
 		return sizeof(Docid) * CHAR_BIT - __builtin_clz(x);
 	}
+#else
+	for (int i = sizeof(Docid) * CHAR_BIT; i-- > 0;) {
+		if (x & (Docid{ 1 } << i)) {
+			return i;
+		}
+	}
+	return 0;
+#endif
 }
 
 struct BitWriter {
@@ -291,27 +309,36 @@ BlockType decide_block_type(const Docid *in, unsigned num, unsigned *bit_width,
 		}
 	}
 
+	// Make the histogram cumulative; histogram[n] now means the the number of
+	// elements with n bits or more.
+	for (unsigned i = max_bits; i > 0; --i) {
+		histogram[i - 1] += histogram[i];
+	}
+
 	// Try PFOR with varbyte exceptions.
 	bool best_is_varbyte = false;
 	for (unsigned test_bit_width = 0; test_bit_width < max_bits; ++test_bit_width) {
 		// 1 byte for signaling number of exceptions, plus the base values,
-		// and then we count up the varbytes and indexes. (This is precise
-		// but very slow.)
+		// and then we estimate up the varbytes and indexes using histogram
+		// indexes. This isn't exact, but it only helps ~0.1% on the total
+		// plocate.db size.
 		unsigned cost = 1 + bytes_for_packed_bits(num, test_bit_width);
-		for (unsigned i = 0; i < num && cost < best_cost; ++i) {
-			unsigned val = in[i] >> test_bit_width;
-			if (val == 0) {
-				// Not stored, and then also no index.
-			} else if (val <= 176) {
-				cost += 2;
-			} else if (val <= 16560) {
-				cost += 3;
-			} else if (val <= 540848) {
-				cost += 4;
-			} else if (val <= 16777215) {
-				cost += 5;
-			} else {
-				cost += 6;
+		if (cost >= best_cost) {
+			break;
+		}
+		if (test_bit_width + 1 <= max_bits) {
+			cost += 2 * histogram[test_bit_width + 1];  // > 0.
+			if (test_bit_width + 7 <= max_bits) {
+				cost += histogram[test_bit_width + 7];  // > 176, very roughly.
+				if (test_bit_width + 14 <= max_bits) {
+					cost += histogram[test_bit_width + 14];  // > 16560, very roughly.
+					if (test_bit_width + 19 <= max_bits) {
+						cost += histogram[test_bit_width + 19];  // > 540848, very roughly.
+						if (test_bit_width + 24 <= max_bits) {
+							cost += histogram[test_bit_width + 24];  // > 16777215, very roughly.
+						}
+					}
+				}
 			}
 		}
 		if (cost < best_cost) {