From: Steinar H. Gunderson Date: Sun, 31 May 2009 19:42:30 +0000 (+0200) Subject: Try to reduce the register pressure a bit in the unstuff code. X-Git-Url: https://git.sesse.net/?p=fjl;a=commitdiff_plain;h=2427a61d1b1db701b057eeeb269c1e4211b982d7;hp=e58d7fc6e17349a9a456632f9403a73262a428e5 Try to reduce the register pressure a bit in the unstuff code. --- diff --git a/unstuff.c b/unstuff.c index 997a562..84c38b9 100644 --- a/unstuff.c +++ b/unstuff.c @@ -31,65 +31,62 @@ int unstuff_reference(uint8_t* dst, const uint8_t* src, size_t len) int unstuff_fast(uint8_t* dst, const uint8_t* src, size_t len) { - size_t bytes_read = 0; size_t bytes_written = 0; + const uint8_t* sptr = src; + uint8_t* dptr = dst; - while (bytes_read < len) { + while (len > 0) { // Find the first marker byte in the rest of the stream. - const uint8_t* ptr = memchr(src, MARKER_CHAR, len - bytes_read); + const uint8_t* ptr = memchr(sptr, MARKER_CHAR, len); if (ptr == NULL) { // No marker bytes left. - size_t len_to_copy = len - bytes_read; - memcpy(dst, src, len_to_copy); - bytes_written += len_to_copy; + memcpy(dptr, sptr, len); + dptr += len; break; } - const size_t len_to_copy = ptr - src + 1; - memcpy(dst, src, len_to_copy); + const size_t len_to_copy = ptr - sptr + 1; + memcpy(dptr, sptr, len_to_copy); - src += len_to_copy; - dst += len_to_copy; - bytes_read += len_to_copy; - bytes_written += len_to_copy; + sptr += len_to_copy; + dptr += len_to_copy; + len -= len_to_copy; - assert(bytes_read <= len); - if (bytes_read == len) { + if (len == 0) { // Partial marker. return -1; } else { - if (*src != STUFF_MARKER) { + if (*sptr != STUFF_MARKER) { return -1; } - ++src; - ++bytes_read; + ++sptr; + --len; } } - return bytes_written; + return dptr - dst; } int unstuff_sse41(uint8_t* dst, const uint8_t* src, size_t len) { __m128i marker_search = _mm_set1_epi8(MARKER_CHAR); - size_t bytes_read = 0; - size_t bytes_written = 0; - while (len - bytes_read >= 16) { - __m128i data = _mm_lddqu_si128((const __m128i*)src); + const uint8_t* sptr = src; + uint8_t* dptr = dst; + while (len >= 16) { + __m128i data = _mm_lddqu_si128((const __m128i*)sptr); // The store here is safe (if there's stuff bytes, the data // will simply get overwritten in the slow path); fire it off // here so it can run in parallel with the compare. - _mm_storeu_si128((__m128i*)dst, data); + _mm_storeu_si128((__m128i*)dptr, data); __m128i eq_mask = _mm_cmpeq_epi8(data, marker_search); if (_mm_test_all_zeros(eq_mask, eq_mask)) { // Fast path; no stuff byte found. - src += 16; - dst += 16; - bytes_read += 16; - bytes_written += 16; + sptr += 16; + dptr += 16; + len -= 16; continue; } @@ -97,28 +94,28 @@ int unstuff_sse41(uint8_t* dst, const uint8_t* src, size_t len) // defer that to the next chunk. Apart from that, we just keep // going one by one byte. We could perhaps speed this up with // the data from eq_mask(), but we're not doing that yet. - size_t len_this_chunk = (src[15] == 0xff ? 15 : 16); - for (unsigned j = 0; j < len_this_chunk; ++j, ++dst, ++src, ++bytes_written) { - *dst = *src; + size_t len_this_chunk = (sptr[15] == 0xff ? 15 : 16); + for (unsigned j = 0; j < len_this_chunk; ++j, ++dptr, ++sptr) { + *dptr = *sptr; - if (*src == MARKER_CHAR) { + if (*sptr == MARKER_CHAR) { assert(j != 15); - if (src[1] != STUFF_MARKER) { + if (sptr[1] != STUFF_MARKER) { return -1; } // Skip the stuff byte. - ++src, ++j; + ++sptr, ++j; } } - bytes_read += len_this_chunk; + len -= len_this_chunk; } // Do the final bytes via the reference path. - int ret = unstuff_reference(dst, src, len - bytes_read); + int ret = unstuff_reference(dptr, sptr, len); if (ret == -1) { return -1; } else { - return bytes_written + ret; + return (dptr - dst) + ret; } }