--- /dev/null
+#include <string.h>
+#include <assert.h>
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <smmintrin.h>
+
+#include "unstuff.h"
+
+#define MARKER_CHAR 0xff
+#define STUFF_MARKER 0x00
+
+int unstuff_reference(uint8_t* dst, const uint8_t* src, size_t len)
+{
+ size_t bytes_written = 0;
+
+ for (unsigned i = 0; i < len; ++i, ++dst, ++src, ++bytes_written) {
+ *dst = *src;
+ if (*src == MARKER_CHAR) {
+ if (i == len - 1 || src[1] != STUFF_MARKER) {
+ return -1;
+ }
+
+ // Skip the stuff byte.
+ ++src, ++i;
+ }
+ }
+
+ assert(bytes_written <= len);
+ return bytes_written;
+}
+
+int unstuff_fast(uint8_t* dst, const uint8_t* src, size_t len)
+{
+ size_t bytes_read = 0;
+ size_t bytes_written = 0;
+
+ while (len - bytes_read >= 0) {
+ // Find the first marker byte in the rest of the stream.
+ const uint8_t* ptr = memchr(src, MARKER_CHAR, len - bytes_read);
+ if (ptr == NULL) {
+ // No marker bytes left.
+ size_t len_to_copy = len - bytes_read;
+ memcpy(dst, src, len_to_copy);
+ bytes_written += len_to_copy;
+ break;
+ }
+
+ const size_t len_to_copy = ptr - src + 1;
+ memcpy(dst, src, len_to_copy);
+
+ src += len_to_copy;
+ dst += len_to_copy;
+ bytes_read += len_to_copy;
+ bytes_written += len_to_copy;
+
+ assert(bytes_read <= len);
+ if (bytes_read == len) {
+ // Partial marker.
+ return -1;
+ } else {
+ if (*src != STUFF_MARKER) {
+ return -1;
+ }
+ ++src;
+ ++bytes_read;
+ }
+ }
+
+ return bytes_written;
+}
+
+int unstuff_sse41(uint8_t* dst, const uint8_t* src, size_t len)
+{
+ __m128i marker_search = _mm_set1_epi8(MARKER_CHAR);
+
+ size_t bytes_read = 0;
+ size_t bytes_written = 0;
+ while (len - bytes_read >= 16) {
+ __m128i data = _mm_lddqu_si128((const __m128i*)src);
+
+ // The store here is safe (if there's stuff bytes, the data
+ // will simply get overwritten in the slow path); fire it off
+ // here so it can run in parallel with the compare.
+ _mm_storeu_si128((__m128i*)dst, data);
+
+ __m128i eq_mask = _mm_cmpeq_epi8(data, marker_search);
+ if (_mm_test_all_zeros(eq_mask, eq_mask)) {
+ // Fast path; no stuff byte found.
+ src += 16;
+ dst += 16;
+ bytes_read += 16;
+ bytes_written += 16;
+ continue;
+ }
+
+ // We found a stuff byte. If it was the last byte, we just
+ // defer that to the next chunk. Apart from that, we just keep
+ // going one by one byte. We could perhaps speed this up with
+ // the data from eq_mask(), but we're not doing that yet.
+ size_t len_this_chunk = (src[15] == 0xff ? 15 : 16);
+ for (unsigned j = 0; j < len_this_chunk; ++j, ++dst, ++src, ++bytes_written) {
+ *dst = *src;
+
+ if (*src == MARKER_CHAR) {
+ assert(j != 15);
+ if (src[1] != STUFF_MARKER) {
+ return -1;
+ }
+
+ // Skip the stuff byte.
+ ++src, ++j;
+ }
+ }
+ bytes_read += len_this_chunk;
+ }
+
+ // Do the final bytes via the reference path.
+ int ret = unstuff_reference(dst, src, len - bytes_read);
+ if (ret == -1) {
+ return -1;
+ } else {
+ return bytes_written + ret;
+ }
+}
--- /dev/null
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include "unstuff.h"
+
+void test_basic_unstuff(unstuff_func_t* unstuff)
+{
+ uint8_t bytes[] = { 1, 2, 3, 4, 5, 6, 0xff, 0x00, 7, 8, 9, 0xff, 0x00, 10, 11, 12, 13, 14 };
+ uint8_t ref[] = { 1, 2, 3, 4, 5, 6, 0xff, 7, 8, 9, 0xff, 10, 11, 12, 13, 14 };
+ uint8_t dst[sizeof(bytes)];
+ int ret = (*unstuff)(dst, bytes, sizeof(bytes));
+
+ assert(ret == sizeof(ref));
+ assert(memcmp(dst, ref, ret) == 0);
+}
+
+// Stuff byte crossing the 16-byte boundaries the SSE4.1 function uses.
+void test_edge_unstuff(unstuff_func_t* unstuff)
+{
+ uint8_t bytes[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0xff, 0x00, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 };
+ uint8_t ref[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0xff, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 };
+ uint8_t dst[sizeof(bytes)];
+ int ret = (*unstuff)(dst, bytes, sizeof(bytes));
+
+ assert(ret == sizeof(ref));
+ assert(memcmp(dst, ref, ret) == 0);
+}
+
+void test_marker(unstuff_func_t* unstuff)
+{
+ uint8_t bytes[] = { 1, 2, 3, 4, 5, 6, 0xff, 0x01, 7, 8, 9, 0xff, 0x00, 10, 11, 12, 13, 14 };
+ uint8_t dst[sizeof(bytes)];
+ int ret = (*unstuff)(dst, bytes, sizeof(bytes));
+
+ assert(ret == -1);
+}
+
+void test_marker_end(unstuff_func_t* unstuff)
+{
+ uint8_t bytes[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff };
+ uint8_t dst[sizeof(bytes)];
+ int ret = (*unstuff)(dst, bytes, sizeof(bytes));
+
+ assert(ret == -1);
+}
+
+void gen_random_stuffed_data(uint8_t* dst, size_t len)
+{
+ // Standard NR LCG (we avoid rand() to get consistent behavior across platforms).
+ uint32_t seed = 1234;
+ for (int i = 0; i < len; ++i) {
+ seed = seed * 1664525 + 1013904223;
+ uint8_t byte = (uint8_t)(seed & 0xff);
+ if (byte != 0xff) {
+ dst[i] = byte;
+ continue;
+ } else if (i == len - 1) {
+ // Don't make a partial marker.
+ dst[i] = 0x00;
+ } else {
+ dst[i] = 0xff;
+ dst[++i] = 0x00;
+ }
+ }
+}
+
+double timediff(const struct timeval* a, const struct timeval* b)
+{
+ return (double)(b->tv_sec - a->tv_sec) +
+ (double)(b->tv_usec - a->tv_usec) * 1e-6;
+}
+
+void test_performance(unstuff_func_t* unstuff)
+{
+ const size_t len = 4096;
+ const unsigned num_runs = 400000;
+
+ uint8_t src[len], dst[len];
+ gen_random_stuffed_data(src, len);
+
+ struct timeval start, now;
+ gettimeofday(&start, NULL);
+
+ for (int i = 0; i < num_runs; ++i) {
+ int ret = unstuff(dst, src, len);
+ assert(ret != -1);
+ }
+
+ gettimeofday(&now, NULL);
+
+ double diff = timediff(&start, &now);
+ double mb_sec = (len * num_runs) / (1048576.0 * diff);
+ printf("%u runs with %zu bytes in %.2f seconds = %.2f MB/sec\n",
+ num_runs, len, diff, mb_sec);
+}
+
+void test_all_unstuff(unstuff_func_t* unstuff)
+{
+ printf(" test_basic_unstuff()\n");
+ test_basic_unstuff(unstuff);
+
+ printf(" test_edge_unstuff()\n");
+ test_edge_unstuff(unstuff);
+
+ printf(" test_marker()\n");
+ test_marker(unstuff);
+
+ printf(" test_marker_end()\n");
+ test_marker_end(unstuff);
+
+ printf(" performance test: ");
+ test_performance(unstuff);
+}
+
+int main(void)
+{
+ printf("unstuff_reference:\n");
+ test_all_unstuff(unstuff_reference);
+
+ printf("unstuff_fast:\n");
+ test_all_unstuff(unstuff_fast);
+
+ printf("unstuff_sse41:\n");
+ test_all_unstuff(unstuff_sse41);
+
+ printf("All tests pass.\n");
+ return 0;
+}