// Copies `len` bytes from `src` to `dst`, dropping the byte that follows
// each MARKER_CHAR; that following byte must equal STUFF_MARKER.
//
// The input is scanned with memchr(), so runs without a marker byte are
// moved with a single memcpy() each.
//
// Returns the number of bytes written to `dst`, or -1 if a MARKER_CHAR is
// the last byte of the input or is followed by anything other than
// STUFF_MARKER. On the -1 paths, `dst` has already received everything up
// to and including the offending marker byte.
//
// Lines prefixed with '-' and '+' are the before/after of a refactor from
// bytes_read/bytes_written counters to moving cursors plus a shrinking
// `len`; both variants perform the same copies and return the same count.
int unstuff_fast(uint8_t* dst, const uint8_t* src, size_t len)
{
- size_t bytes_read = 0;
- size_t bytes_written = 0;
// Read/write cursors; `src` and `dst` keep pointing at the buffer starts so
// the output length can be computed as a pointer difference at the end.
+ const uint8_t* sptr = src;
+ uint8_t* dptr = dst;
- while (bytes_read < len) {
// `len` always holds the number of input bytes not yet consumed.
+ while (len > 0) {
// Find the first marker byte in the rest of the stream.
- const uint8_t* ptr = memchr(src, MARKER_CHAR, len - bytes_read);
+ const uint8_t* ptr = memchr(sptr, MARKER_CHAR, len);
if (ptr == NULL) {
// No marker bytes left.
// Everything that remains is plain data: copy it in one go and stop.
- size_t len_to_copy = len - bytes_read;
- memcpy(dst, src, len_to_copy);
- bytes_written += len_to_copy;
+ memcpy(dptr, sptr, len);
+ dptr += len;
break;
}
// Copy the data up to and including the marker byte itself (hence the +1);
// only the byte after the marker is dropped from the output.
- const size_t len_to_copy = ptr - src + 1;
- memcpy(dst, src, len_to_copy);
+ const size_t len_to_copy = ptr - sptr + 1;
+ memcpy(dptr, sptr, len_to_copy);
- src += len_to_copy;
- dst += len_to_copy;
- bytes_read += len_to_copy;
- bytes_written += len_to_copy;
+ sptr += len_to_copy;
+ dptr += len_to_copy;
+ len -= len_to_copy;
- assert(bytes_read <= len);
- if (bytes_read == len) {
// The marker was the very last input byte, so the byte that should follow
// it is missing.
+ if (len == 0) {
// Partial marker.
return -1;
} else {
// The byte right after a marker must be the stuffing byte; anything else
// is rejected.
- if (*src != STUFF_MARKER) {
+ if (*sptr != STUFF_MARKER) {
return -1;
}
// Consume the stuffing byte from the input without emitting it.
- ++src;
- ++bytes_read;
+ ++sptr;
+ --len;
}
}
// NOTE(review): the count is a size_t / pointer difference but the function
// returns int; outputs larger than INT_MAX would not be representable and
// could collide with the -1 error value. Confirm callers bound `len`.
- return bytes_written;
+ return dptr - dst;
}
// Vectorized variant of the unstuffing routine: copies `len` bytes from
// `src` to `dst`, dropping the byte after each MARKER_CHAR (which must be
// STUFF_MARKER).
//
// While at least 16 input bytes remain, a 16-byte chunk is loaded, stored
// to the output unconditionally, and compared against MARKER_CHAR. Chunks
// without a marker advance both cursors by 16; chunks with one are redone
// byte by byte. The remaining tail of fewer than 16 bytes is handed to
// unstuff_reference().
//
// Returns the total number of bytes written to `dst`, or -1 if a marker is
// not followed by STUFF_MARKER or if unstuff_reference() reports -1.
//
// Lines prefixed with '-' and '+' are the before/after of a refactor from
// bytes_read/bytes_written counters to moving cursors plus a shrinking
// `len`; both variants perform the same work.
int unstuff_sse41(uint8_t* dst, const uint8_t* src, size_t len)
{
// All 16 lanes hold MARKER_CHAR, for a byte-wise compare against each chunk.
__m128i marker_search = _mm_set1_epi8(MARKER_CHAR);
- size_t bytes_read = 0;
- size_t bytes_written = 0;
- while (len - bytes_read >= 16) {
- __m128i data = _mm_lddqu_si128((const __m128i*)src);
// Read/write cursors; `src` and `dst` stay at the buffer starts so the
// output length can be computed as a pointer difference at the end.
+ const uint8_t* sptr = src;
+ uint8_t* dptr = dst;
// `len` holds the number of input bytes not yet consumed; a full 16-byte
// load is only issued while that many bytes remain.
+ while (len >= 16) {
+ __m128i data = _mm_lddqu_si128((const __m128i*)sptr);
// The store here is safe (if there's stuff bytes, the data
// will simply get overwritten in the slow path); fire it off
// here so it can run in parallel with the compare.
// Note that this always writes 16 bytes at the current output position,
// even when the slow path ends up keeping fewer of them.
- _mm_storeu_si128((__m128i*)dst, data);
+ _mm_storeu_si128((__m128i*)dptr, data);
// eq_mask has a lane set for every byte equal to MARKER_CHAR; the
// test below is true only when no lane is set.
__m128i eq_mask = _mm_cmpeq_epi8(data, marker_search);
if (_mm_test_all_zeros(eq_mask, eq_mask)) {
// Fast path; no stuff byte found.
- src += 16;
- dst += 16;
- bytes_read += 16;
- bytes_written += 16;
+ sptr += 16;
+ dptr += 16;
+ len -= 16;
continue;
}
// Slow path: the chunk contains at least one marker byte. If the last
// byte of the chunk is 0xff, the byte that follows it lies outside this
// chunk, so only the first 15 bytes are handled here and we
// (NOTE(review): the original start of the comment below is not visible in
// this view; the lead-in above is reconstructed from the code.)
// defer that to the next chunk. Apart from that, we just keep
// going one by one byte. We could perhaps speed this up with
// the data from eq_mask(), but we're not doing that yet.
// NOTE(review): the literal 0xff is presumably the value of MARKER_CHAR;
// confirm, and consider spelling it MARKER_CHAR for consistency.
- size_t len_this_chunk = (src[15] == 0xff ? 15 : 16);
- for (unsigned j = 0; j < len_this_chunk; ++j, ++dst, ++src, ++bytes_written) {
- *dst = *src;
+ size_t len_this_chunk = (sptr[15] == 0xff ? 15 : 16);
// j counts consumed input bytes of this chunk; the output cursor only
// advances once per iteration, so skipped stuff bytes are not emitted.
+ for (unsigned j = 0; j < len_this_chunk; ++j, ++dptr, ++sptr) {
+ *dptr = *sptr;
- if (*src == MARKER_CHAR) {
+ if (*sptr == MARKER_CHAR) {
// A marker at index 15 is not expected here: the chunk is cut to 15 bytes
// when byte 15 is 0xff (assuming MARKER_CHAR is 0xff -- see note above).
// This keeps the [1] look-ahead below inside the current 16-byte chunk.
assert(j != 15);
- if (src[1] != STUFF_MARKER) {
+ if (sptr[1] != STUFF_MARKER) {
return -1;
}
// Skip the stuff byte.
// j is bumped together with the input cursor so the stuff byte still
// counts towards the bytes consumed from this chunk.
- ++src, ++j;
+ ++sptr, ++j;
}
}
// Account for the input bytes handled by the slow path.
- bytes_read += len_this_chunk;
+ len -= len_this_chunk;
}
// Do the final bytes via the reference path.
// NOTE(review): unstuff_reference() is defined elsewhere; this code assumes
// it returns the number of bytes it wrote, or -1 on error -- confirm.
- int ret = unstuff_reference(dst, src, len - bytes_read);
+ int ret = unstuff_reference(dptr, sptr, len);
if (ret == -1) {
return -1;
} else {
// NOTE(review): the sum is a pointer difference plus int, narrowed to the
// int return type; confirm callers bound `len` so this cannot overflow.
- return bytes_written + ret;
+ return (dptr - dst) + ret;
}
}