From: Steinar H. Gunderson
Date: Wed, 7 Oct 2020 23:01:55 +0000 (+0200)
Subject: Switch to our own TurboPFor encoder.
X-Git-Tag: 1.0.0~37
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=0f7ca618fb8a2e501fe68e1760de9ee716e37c40;p=plocate

Switch to our own TurboPFor encoder.

This is much slower (plocate-build becomes roughly 6% slower), but allows us
to ditch the external TurboPFor dependency entirely, and with it, the SSE4.1
requirement. This should make us much more palatable to most distributions.

The benchmark program is extended with tests verifying that all posting lists
in plocate.db round-trip properly through our encoder; these found a lot of
bugs during development.
---
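Illustration only, not part of the patch below: a minimal sketch of the
round-trip check described above, using the in-tree headers this commit
touches. check_roundtrip is a hypothetical helper name; the encoding loop
mirrors what bench.cpp and plocate-build.cpp do (write_baseval() for the
first docid, then one encode_pfor_single_block<128>() per block of
delta-minus-1 values, interleaved only for full blocks), and
decode_pfor_delta1() reads it back.

  #include "turbopfor.h"         // decode_pfor_delta1()
  #include "turbopfor-encode.h"  // write_baseval(), encode_pfor_single_block()

  #include <algorithm>
  #include <assert.h>
  #include <stdint.h>
  #include <string.h>
  #include <vector>

  // Returns true if encoding + decoding reproduces the input exactly.
  // Assumes docids is non-empty, strictly increasing, and that the first
  // docid fits in write_baseval() (less than 0x200000 in this version).
  bool check_roundtrip(const std::vector<uint32_t> &docids)
  {
      assert(!docids.empty());
      std::vector<unsigned char> buf(docids.size() * 5 + 16384);  // Generous margin.
      unsigned char *ptr = write_baseval(docids[0], buf.data());
      for (unsigned i = 1; i < docids.size(); i += 128) {
          unsigned n = std::min<unsigned>(docids.size() - i, 128);
          uint32_t deltas[128];
          for (unsigned j = 0; j < n; ++j) {
              deltas[j] = docids[i + j] - docids[i + j - 1] - 1;
          }
          // Interleaved format only for full blocks, as plocate-build does.
          ptr = encode_pfor_single_block<128>(deltas, n, /*interleaved=*/(n == 128), ptr);
      }

      std::vector<uint32_t> out(docids.size() + 128);  // Decoder may write a bit past the end.
      decode_pfor_delta1<128>(buf.data(), docids.size(), /*interleaved=*/true, out.data());
      return memcmp(docids.data(), out.data(), docids.size() * sizeof(uint32_t)) == 0;
  }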
diff --git a/.gitmodules b/.gitmodules
index 80bc7e5..e69de29 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "TurboPFor-Integer-Compression"]
-	path = TurboPFor-Integer-Compression
-	url = https://github.com/powturbo/TurboPFor-Integer-Compression
diff --git a/Makefile b/Makefile
index 702c530..dc96d75 100644
--- a/Makefile
+++ b/Makefile
@@ -13,18 +13,15 @@ endif
 
 all: plocate plocate-build
 
-plocate: plocate.o io_uring_engine.o TurboPFor-Integer-Compression/libic.a
+plocate: plocate.o io_uring_engine.o
 	$(CXX) -o $@ $^ -lzstd $(URING_LIBS) $(LDFLAGS)
 
-plocate-build: plocate-build.o TurboPFor-Integer-Compression/libic.a
+plocate-build: plocate-build.o
 	$(CXX) -o $@ $^ -lzstd $(LDFLAGS)
 
-TurboPFor-Integer-Compression/libic.a:
-	cd TurboPFor-Integer-Compression/ && $(MAKE)
-
 clean:
 	$(RM) plocate.o plocate-build.o io_uring_engine.o bench.o plocate plocate-build bench
-	cd TurboPFor-Integer-Compression/ && $(MAKE) clean
+	! [ -d TurboPFor-Integer-Compression/ ] || ( cd TurboPFor-Integer-Compression/ && $(MAKE) clean )
 
 install: all
 	$(INSTALL) -m 2755 -g mlocate plocate $(PREFIX)/bin/
@@ -33,6 +30,9 @@ install: all
 
 bench.o: bench.cpp turbopfor.h
 
+TurboPFor-Integer-Compression/libic.a:
+	cd TurboPFor-Integer-Compression/ && $(MAKE)
+
 bench: bench.o io_uring_engine.o TurboPFor-Integer-Compression/libic.a
 	$(CXX) -o $@ $^ $(URING_LIBS) $(LDFLAGS)
 
diff --git a/README b/README
index 8202052..bae2170 100644
--- a/README
+++ b/README
@@ -2,10 +2,13 @@ plocate, a locate based on posting lists, consuming mlocate inputs and
 making a much faster index. Does not support querying by regex,
 case-insensitivity or really any options.
 
-Alpha stage; file format is subject to change. To build:
+Alpha stage; file format is subject to change. To build, run make.
 
-  git submodule init
-  make -j8
+If you wish to run some tests of the TurboPFor implementation against
+the reference implementation, you can run:
+
+  git clone https://github.com/powturbo/TurboPFor-Integer-Compression
+  make -j8 bench
 
 Copyright 2020 Steinar H. Gunderson .
 Licensed under the GNU General Public License, either version 2,
diff --git a/TurboPFor-Integer-Compression b/TurboPFor-Integer-Compression
deleted file mode 160000
index 4ab9f5b..0000000
--- a/TurboPFor-Integer-Compression
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4ab9f5b0e023e836c5d7f31aa67440916889570a
diff --git a/bench.cpp b/bench.cpp
index 78fa339..afec897 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -10,6 +10,7 @@
 #include "db.h"
 #include "io_uring_engine.h"
 #include "turbopfor.h"
+#include "turbopfor-encode.h"
 #include "vp4.h"
 
 using namespace std;
@@ -29,7 +30,7 @@ int main(void)
 	unique_ptr<Trigram[]> ht(new Trigram[hdr.hashtable_size + hdr.extra_ht_slots + 1]);
 	complete_pread(fd, ht.get(), (hdr.hashtable_size + hdr.extra_ht_slots + 1) * sizeof(Trigram), hdr.hash_table_offset_bytes);
 
-	size_t posting_list_bytes = 0, total_elements = 0;
+	size_t posting_list_bytes = 0, own_posting_list_bytes = 0, total_elements = 0, most_bytes_pl = 0;
 	uint32_t longest_pl = 0;
 	vector<pair<string, unsigned>> posting_lists;
 	for (unsigned i = 0; i < hdr.hashtable_size + hdr.extra_ht_slots; ++i) {
@@ -42,13 +43,17 @@ int main(void)
 		complete_pread(fd, &str[0], len, ht[i].offset);
 		posting_lists.emplace_back(move(str), ht[i].num_docids);
 		longest_pl = std::max(ht[i].num_docids, longest_pl);
+		most_bytes_pl = std::max(len, most_bytes_pl);
 		posting_list_bytes += len;
 		total_elements += ht[i].num_docids;
 	}
 	ht.reset();
 	fprintf(stderr, "Read %zu posting lists.\n", posting_lists.size());
 
-	size_t num_errors = 0;
+	string encoded_pl;
+	encoded_pl.resize(longest_pl * 2 + 16384); // Lots of margin.
+
+	size_t num_decode_errors = 0, num_encode_errors = 0;
 	for (auto &[pl, num_docids] : posting_lists) {
 		//fprintf(stderr, "%zu bytes, %u docids\n", pl.size(), num_docids);
 		vector<uint32_t> out1, out2;
@@ -59,7 +64,38 @@ int main(void)
 		decode_pfor_delta1<128>(pldata, num_docids, /*interleaved=*/true, &out2[0]);
 		for (unsigned i = 0; i < num_docids; ++i) {
 			if (out1[i] != out2[i]) {
-				if (++num_errors < 10) {
+				if (++num_decode_errors < 10) {
+					fprintf(stderr, "Decode error:\n");
 					for (unsigned j = 0; j < num_docids; ++j) {
 						fprintf(stderr, "%3u: reference=%u ours=%u (diff=%d)\n", j, out1[j], out2[j], out1[j] - out2[j]);
 					}
 				}
 				break;
 			}
 		}
+
+		// Test encoding, by encoding with our own implementation
+		// and checking that decoding with the reference gives
+		// the same result. We do not measure performance (we're slow).
+		uint32_t deltas[128];
+		unsigned char *ptr = reinterpret_cast<unsigned char *>(&encoded_pl[0]);
+		ptr = write_baseval(out1[0], ptr);
+		for (unsigned i = 1; i < num_docids; i += 128) {
+			unsigned num_docids_this_block = std::min(num_docids - i, 128u);
+			for (unsigned j = 0; j < num_docids_this_block; ++j) {
+				deltas[j] = out1[i + j] - out1[i + j - 1] - 1;
+			}
+			bool interleaved = (num_docids_this_block == 128);
+			ptr = encode_pfor_single_block<128>(deltas, num_docids_this_block, interleaved, ptr);
+		}
+		own_posting_list_bytes += ptr - reinterpret_cast<unsigned char *>(&encoded_pl[0]);
+
+		pldata = reinterpret_cast<unsigned char *>(&encoded_pl[0]);
+		p4nd1dec128v32(pldata, num_docids, &out2[0]);
+		for (unsigned i = 0; i < num_docids; ++i) {
+			if (out1[i] != out2[i]) {
+				if (++num_encode_errors < 10) {
+					fprintf(stderr, "Encode error:\n");
 					for (unsigned j = 0; j < num_docids; ++j) {
 						fprintf(stderr, "%3u: reference=%u ours=%u (diff=%d)\n", j, out1[j], out2[j], out1[j] - out2[j]);
 					}
@@ -68,8 +104,10 @@
 			}
 		}
 	}
-	fprintf(stderr, "%zu/%zu posting lists had errors in decoding.\n", num_errors, posting_lists.size());
+	fprintf(stderr, "%zu/%zu posting lists had errors in decoding.\n", num_decode_errors, posting_lists.size());
+	fprintf(stderr, "%zu/%zu posting lists had errors in encoding.\n", num_encode_errors, posting_lists.size());
 
+	// Benchmark.
 	vector<uint32_t> dummy;
 	dummy.resize(longest_pl + 128);
 	steady_clock::time_point start = steady_clock::now();
@@ -89,6 +127,7 @@
 	end = steady_clock::now();
 	double own_sec = duration<double>(end - start).count();
 	fprintf(stderr, "Decoding with own implementation: %.3f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec);
+	fprintf(stderr, "Size with own implementation: %.1f MB (%.2f%% of reference, %+d bytes)\n", own_posting_list_bytes / 1048576.0, 100.0 * own_posting_list_bytes / posting_list_bytes, int(own_posting_list_bytes) - int(posting_list_bytes));
 
 	// Three numbers giving rules of thumb for judging our own implementation:
 	//
diff --git a/plocate-build.cpp b/plocate-build.cpp
index 90141ff..2f8bf31 100644
--- a/plocate-build.cpp
+++ b/plocate-build.cpp
@@ -1,5 +1,4 @@
 #include "db.h"
-#include "vp4.h"
 
 #include
 #include
@@ -19,6 +18,8 @@
 #include
 #include
 
+#include "turbopfor-encode.h"
+
 #define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t))
 #define dprintf(...)
 //#define dprintf(...) fprintf(stderr, __VA_ARGS__);
@@ -122,7 +123,7 @@ void PostingListBuilder::finish()
 
 	// No interleaving for partial blocks.
 	unsigned char buf[P4NENC_BOUND(128)];
-	unsigned char *end = p4enc32(pending_deltas.data(), pending_deltas.size(), buf);
+	unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, buf);
 	encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
 }
 
@@ -130,15 +131,15 @@
 void PostingListBuilder::append_block()
 {
 	unsigned char buf[P4NENC_BOUND(128)];
 	assert(pending_deltas.size() == 128);
-	unsigned char *end = p4enc128v32(pending_deltas.data(), 128, buf);
+	unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), 128, /*interleaved=*/true, buf);
 	encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
 }
 
 void PostingListBuilder::write_header(uint32_t docid)
 {
 	unsigned char buf[P4NENC_BOUND(1)];
-	size_t bytes = p4nd1enc128v32(&docid, 1, buf);
-	encoded.append(reinterpret_cast<char *>(buf), bytes);
+	unsigned char *end = write_baseval(docid, buf);
+	encoded.append(reinterpret_cast<char *>(buf), end - buf);
 }
 
diff --git a/turbopfor-common.h b/turbopfor-common.h
new file mode 100644
index 0000000..0d493b5
--- /dev/null
+++ b/turbopfor-common.h
@@ -0,0 +1,36 @@
+#ifndef _TURBOPFOR_COMMON_H
+#define _TURBOPFOR_COMMON_H 1
+
+// Common definitions and utilities between turbopfor.h (decode)
+// and turbopfor-encode.h (encode).
+
+#include <limits.h>
+
+enum BlockType {
+	FOR = 0,
+	PFOR_VB = 1,
+	PFOR_BITMAP = 2,
+	CONSTANT = 3
+};
+
+// Does not properly account for overflow.
+inline unsigned div_round_up(unsigned val, unsigned div)
+{
+	return (val + div - 1) / div;
+}
+
+inline unsigned bytes_for_packed_bits(unsigned num, unsigned bit_width)
+{
+	return div_round_up(num * bit_width, CHAR_BIT);
+}
+
+constexpr uint32_t mask_for_bits(unsigned bit_width)
+{
+	if (bit_width == 32) {
+		return 0xFFFFFFFF;
+	} else {
+		return (1U << bit_width) - 1;
+	}
+}
+
+#endif // !defined(_TURBOPFOR_COMMON_H)
diff --git a/turbopfor-encode.h b/turbopfor-encode.h
new file mode 100644
index 0000000..532aa19
--- /dev/null
+++ b/turbopfor-encode.h
@@ -0,0 +1,379 @@
+#ifndef _TURBOPFOR_ENCODE_H
+#define _TURBOPFOR_ENCODE_H
+
+// Much like turbopfor.h (and shares all of the same caveats), except this is
+// for encoding. It is _much_ slower than the reference implementation, but we
+// encode only during build, and most time in build is spent in other things
+// than encoding posting lists, so it only costs ~5-10% overall. Does not use
+// any special instruction sets, and generally isn't optimized at all.
+//
+// It encodes about 0.01% denser than the reference encoder (averaged over
+// a real plocate corpus), probably since it has a slower but more precise
+// method for estimating the cost of a PFOR + varbyte block.
+
+#include "turbopfor-common.h"
+
+#include <assert.h>
+#include <endian.h>
+#include <string.h>
+
+template<class Docid>
+void write_le(Docid val, void *out)
+{
+	if constexpr (sizeof(Docid) == 8) {
+		val = htole64(val);
+	} else if constexpr (sizeof(Docid) == 4) {
+		val = htole32(val);
+	} else if constexpr (sizeof(Docid) == 2) {
+		val = htole16(val);
+	} else if constexpr (sizeof(Docid) == 1) {
+		val = val;
+	} else {
+		assert(false);
+	}
+	memcpy(out, &val, sizeof(val));
+}
+
+// Corresponds to read_baseval.
+template<class Docid>
+unsigned char *write_baseval(Docid in, unsigned char *out)
+{
+	if (in < 128) {
+		*out = in;
+		return out + 1;
+	} else if (in < 0x4000) {
+		out[0] = (in >> 8) | 0x80;
+		out[1] = in & 0xff;
+		return out + 2;
+	} else if (in < 0x200000) {
+		out[0] = (in >> 16) | 0xc0;
+		out[1] = in & 0xff;
+		out[2] = (in >> 8) & 0xff;
+		return out + 3;
+	} else {
+		assert(false); // Not implemented.
+	}
+}
+
+// Writes a varbyte-encoded exception.
+template<class Docid>
+unsigned char *write_vb(Docid val, unsigned char *out)
+{
+	if (val <= 176) {
+		*out++ = val;
+		return out;
+	} else if (val <= 16560) {
+		val -= 177;
+		*out++ = (val >> 8) + 177;
+		*out++ = val & 0xff;
+		return out;
+	} else if (val <= 540848) {
+		val -= 16561;
+		*out = (val >> 16) + 241;
+		write_le(val & 0xffff, out + 1);
+		return out + 3;
+	} else if (val <= 16777215) {
+		*out = 249;
+		write_le(val, out + 1);
+		return out + 4;
+	} else {
+		*out = 250;
+		write_le(val, out + 1);
+		return out + 5;
+	}
+}
+
+template<class Docid>
+inline unsigned num_bits(Docid x)
+{
+	if (x == 0) {
+		return 0;
+	} else {
+		return sizeof(Docid) * CHAR_BIT - __builtin_clz(x);
+	}
+}
+
+struct BitWriter {
+public:
+	BitWriter(unsigned char *out, unsigned bits)
+		: out(out), bits(bits) {}
+	void write(uint32_t val)
+	{
+		cur_val |= val << bits_used;
+		write_le(cur_val, out);
+
+		bits_used += bits;
+		cur_val >>= (bits_used / 8) * 8;
+		out += bits_used / 8;
+		bits_used %= 8;
+	}
+
+private:
+	unsigned char *out;
+	const unsigned bits;
+	unsigned bits_used = 0;
+	unsigned cur_val = 0;
+};
+
+template<unsigned NumStreams>
+struct InterleavedBitWriter {
+public:
+	InterleavedBitWriter(unsigned char *out, unsigned bits)
+		: out(out), bits(bits) {}
+	void write(uint32_t val)
+	{
+		cur_val |= uint64_t(val) << bits_used;
+		if (bits_used + bits >= 32) {
+			write_le<uint32_t>(cur_val & 0xffffffff, out);
+			out += Stride;
+			cur_val >>= 32;
+			bits_used -= 32; // Underflow, but will be fixed below.
+		}
+		write_le<uint32_t>(cur_val, out);
+		bits_used += bits;
+	}
+
+private:
+	static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
+	unsigned char *out;
+	const unsigned bits;
+	unsigned bits_used = 0;
+	uint64_t cur_val = 0;
+};
+
+// Bitpacks a set of values (making sure the top bits are lopped off).
+// If interleaved is set, makes SSE2-compatible interleaving (this is
+// only allowed for full blocks).
+template<class Docid>
+unsigned char *encode_bitmap(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
+{
+	unsigned mask = mask_for_bits(bit_width);
+	if (interleaved) {
+		InterleavedBitWriter<4> bs0(out + 0 * sizeof(uint32_t), bit_width);
+		InterleavedBitWriter<4> bs1(out + 1 * sizeof(uint32_t), bit_width);
+		InterleavedBitWriter<4> bs2(out + 2 * sizeof(uint32_t), bit_width);
+		InterleavedBitWriter<4> bs3(out + 3 * sizeof(uint32_t), bit_width);
+		assert(num % 4 == 0);
+		for (unsigned i = 0; i < num / 4; ++i) {
+			bs0.write(in[i * 4 + 0] & mask);
+			bs1.write(in[i * 4 + 1] & mask);
+			bs2.write(in[i * 4 + 2] & mask);
+			bs3.write(in[i * 4 + 3] & mask);
+		}
+	} else {
+		BitWriter bs(out, bit_width);
+		for (unsigned i = 0; i < num; ++i) {
+			bs.write(in[i] & mask);
+		}
+	}
+	return out + bytes_for_packed_bits(num, bit_width);
+}
+
+// See decode_for() for the format.
+template<class Docid>
+unsigned char *encode_for(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
+{
+	return encode_bitmap(in, num, bit_width, interleaved, out);
+}
+
+// See decode_pfor_bitmap() for the format.
+template<class Docid>
+unsigned char *encode_pfor_bitmap(const Docid *in, unsigned num, unsigned bit_width, unsigned exception_bit_width, bool interleaved, unsigned char *out)
+{
+	*out++ = exception_bit_width;
+
+	// Bitmap of exceptions.
+	{
+		BitWriter bs(out, 1);
+		for (unsigned i = 0; i < num; ++i) {
+			bs.write((in[i] >> bit_width) != 0);
+		}
+		out += bytes_for_packed_bits(num, 1);
+	}
+
+	// Exceptions.
+	{
+		BitWriter bs(out, exception_bit_width);
+		unsigned num_exceptions = 0;
+		for (unsigned i = 0; i < num; ++i) {
+			if ((in[i] >> bit_width) != 0) {
+				bs.write(in[i] >> bit_width);
+				++num_exceptions;
+			}
+		}
+		out += bytes_for_packed_bits(num_exceptions, exception_bit_width);
+	}
+
+	// Base values.
+	out = encode_bitmap(in, num, bit_width, interleaved, out);
+
+	return out;
+}
+
+// See decode_pfor_vb() for the format.
+template<class Docid>
+unsigned char *encode_pfor_vb(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
+{
+	unsigned num_exceptions = 0;
+	for (unsigned i = 0; i < num; ++i) {
+		if ((in[i] >> bit_width) != 0) {
+			++num_exceptions;
+		}
+	}
+	*out++ = num_exceptions;
+
+	// Base values.
+	out = encode_bitmap(in, num, bit_width, interleaved, out);
+
+	// Exceptions.
+	for (unsigned i = 0; i < num; ++i) {
+		unsigned val = in[i] >> bit_width;
+		if (val != 0) {
+			out = write_vb(val, out);
+		}
+	}
+
+	// Exception indexes.
+	for (unsigned i = 0; i < num; ++i) {
+		unsigned val = in[i] >> bit_width;
+		if (val != 0) {
+			*out++ = i;
+		}
+	}
+
+	return out;
+}
+
+// Find out which block type would be the smallest for the given data.
+template<class Docid>
+BlockType decide_block_type(const Docid *in, unsigned num, unsigned *bit_width, unsigned *exception_bit_width)
+{
+	// Check if the block is constant.
+	bool constant = true;
+	for (unsigned i = 1; i < num; ++i) {
+		if (in[i] != in[0]) {
+			constant = false;
+			break;
+		}
+	}
+	if (constant) {
+		*bit_width = num_bits(in[0]);
+		return BlockType::CONSTANT;
+	}
+
+	// Build up a histogram of bit sizes.
+	unsigned histogram[sizeof(Docid) * CHAR_BIT + 1] = { 0 };
+	unsigned max_bits = 0;
+	for (unsigned i = 0; i < num; ++i) {
+		unsigned bits = num_bits(in[i]);
+		++histogram[bits];
+		max_bits = std::max(max_bits, bits);
+	}
+
+	// Straight-up FOR.
+	unsigned best_cost = bytes_for_packed_bits(num, max_bits);
+	unsigned best_bit_width = max_bits;
+
+	// Try PFOR with bitmap exceptions.
+	const unsigned bitmap_cost = bytes_for_packed_bits(num, 1);
+	unsigned num_exceptions = 0;
+	for (unsigned exception_bit_width = 1; exception_bit_width <= max_bits; ++exception_bit_width) {
+		unsigned test_bit_width = max_bits - exception_bit_width;
+		num_exceptions += histogram[test_bit_width + 1];
+
+		// 1 byte for signaling exception bit width, then the bitmap,
+		// then the base values, then the exceptions.
+		unsigned cost = 1 + bitmap_cost + bytes_for_packed_bits(num, test_bit_width) + bytes_for_packed_bits(num_exceptions, exception_bit_width);
+		if (cost < best_cost) {
+			best_cost = cost;
+			best_bit_width = test_bit_width;
+		}
+	}
+
+	// Try PFOR with varbyte exceptions.
+	bool best_is_varbyte = false;
+	for (unsigned test_bit_width = 0; test_bit_width < max_bits; ++test_bit_width) {
+		// 1 byte for signaling number of exceptions, plus the base values,
+		// and then we count up the varbytes and indexes. (This is precise
+		// but very slow.)
+		unsigned cost = 1 + bytes_for_packed_bits(num, test_bit_width);
+		for (unsigned i = 0; i < num && cost < best_cost; ++i) {
+			unsigned val = in[i] >> test_bit_width;
+			if (val == 0) {
+				// Not stored, and then also no index.
+			} else if (val <= 176) {
+				cost += 2;
+			} else if (val <= 16560) {
+				cost += 3;
+			} else if (val <= 540848) {
+				cost += 4;
+			} else if (val <= 16777215) {
+				cost += 5;
+			} else {
+				cost += 6;
+			}
+		}
+		if (cost < best_cost) {
+			best_cost = cost;
+			best_bit_width = test_bit_width;
+			best_is_varbyte = true;
+		}
+	}
+
+	// TODO: Consider the last-resort option of just raw storage (255).
+
+	if (best_is_varbyte) {
+		*bit_width = best_bit_width;
+		return BlockType::PFOR_VB;
+	} else if (best_bit_width == max_bits) {
+		*bit_width = max_bits;
+		return BlockType::FOR;
+	} else {
+		*bit_width = best_bit_width;
+		*exception_bit_width = max_bits - best_bit_width;
+		return BlockType::PFOR_BITMAP;
+	}
+}
+
+// The basic entry point. Takes one block of integers (which already must
+// be delta-minus-1-encoded) and packs it into TurboPFor format.
+// interleaved corresponds to the interleaved parameter in decode_pfor_delta1()
+// or the “128v” infix in the reference code's function names; such formats
+// are much faster to decode, so for full blocks, you probably want it.
+// The interleaved flag isn't stored anywhere; it's implicit whether you
+// want to use it for full blocks or not.
+//
+// The first value must already be written using write_baseval() (so the delta
+// coding starts from the second value). Returns the end of the string.
+// May write 4 bytes past the end.
+template<unsigned BlockSize, class Docid>
+unsigned char *encode_pfor_single_block(const Docid *in, unsigned num, bool interleaved, unsigned char *out)
+{
+	assert(num > 0);
+	if (interleaved) {
+		assert(num == BlockSize);
+	}
+
+	unsigned bit_width, exception_bit_width;
+	BlockType block_type = decide_block_type(in, num, &bit_width, &exception_bit_width);
+	*out++ = (block_type << 6) | bit_width;
+
+	switch (block_type) {
+	case BlockType::CONSTANT: {
+		unsigned bit_width = num_bits(in[0]);
+		write_le(in[0], out);
+		return out + div_round_up(bit_width, 8);
+	}
+	case BlockType::FOR:
+		return encode_for(in, num, bit_width, interleaved, out);
+	case BlockType::PFOR_BITMAP:
+		return encode_pfor_bitmap(in, num, bit_width, exception_bit_width, interleaved, out);
+	case BlockType::PFOR_VB:
+		return encode_pfor_vb(in, num, bit_width, interleaved, out);
+	default:
+		assert(false);
+	}
+}
+
+#endif // !defined(_TURBOPFOR_ENCODE_H)
diff --git a/turbopfor.h b/turbopfor.h
index 1796708..a21727a 100644
--- a/turbopfor.h
+++ b/turbopfor.h
@@ -25,6 +25,8 @@
 #include
 #endif
 
+#include "turbopfor-common.h"
+
 // Forward declarations to declare to the template code below that they exist.
 // (These must seemingly be non-templates for function multiversioning to work.)
 __attribute__((target("default")))
@@ -49,15 +51,6 @@ const unsigned char *
 decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
 #endif
 
-constexpr uint32_t mask_for_bits(unsigned bit_width)
-{
-	if (bit_width == 32) {
-		return 0xFFFFFFFF;
-	} else {
-		return (1U << bit_width) - 1;
-	}
-}
-
 template<class Docid>
 Docid read_le(const void *in)
 {
@@ -206,17 +199,6 @@ private:
 };
 #endif
 
-// Does not properly account for overflow.
-inline unsigned div_round_up(unsigned val, unsigned div)
-{
-	return (val + div - 1) / div;
-}
-
-inline unsigned bytes_for_packed_bits(unsigned num, unsigned bit_width)
-{
-	return div_round_up(num * bit_width, CHAR_BIT);
-}
-
 // Constant block. Layout:
 //
 //  - Bit width (6 bits) | type << 6
@@ -727,13 +709,6 @@ decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
 	return in;
 }
 
-enum BlockType {
-	FOR = 0,
-	PFOR_VB = 1,
-	PFOR_BITMAP = 2,
-	CONSTANT = 3
-};
-
 template<unsigned BlockSize, class Docid>
 const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, bool interleaved, Docid *out)
 {
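
Also illustration only, not from the commit: a toy program showing how
decide_block_type() picks a representation and how the one-byte block header
((type << 6) | bit_width) comes out, per the format comments above. The input
values are made up; a block of small deltas with a couple of large outliers
should end up as one of the PFOR variants rather than plain FOR.

  #include "turbopfor-encode.h"

  #include <algorithm>
  #include <stdint.h>
  #include <stdio.h>

  int main()
  {
      uint32_t deltas[128];
      for (unsigned i = 0; i < 128; ++i) {
          deltas[i] = i % 7;  // Mostly small, 3-bit values...
      }
      deltas[5] = 100000;     // ...plus two large exceptions.
      deltas[90] = 250000;

      unsigned bit_width = 0, exception_bit_width = 0;
      BlockType type = decide_block_type(deltas, 128, &bit_width, &exception_bit_width);
      // exception_bit_width is only meaningful for PFOR_BITMAP.
      printf("type=%d bit_width=%u exception_bit_width=%u\n",
             int(type), bit_width, exception_bit_width);

      // The encoded block starts with (type << 6) | bit_width, then the payload.
      unsigned char buf[4096];
      unsigned char *end = encode_pfor_single_block<128>(deltas, 128, /*interleaved=*/true, buf);
      printf("header byte=0x%02x, encoded size=%d bytes (vs. %d for plain 18-bit FOR)\n",
             buf[0], int(end - buf), 1 + 128 * 18 / 8);
  }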