Try to reduce the register pressure a bit in the unstuff code.
author Steinar H. Gunderson <sesse@debian.org>
Sun, 31 May 2009 19:42:30 +0000 (21:42 +0200)
committer Steinar H. Gunderson <sesse@debian.org>
Sun, 31 May 2009 19:42:30 +0000 (21:42 +0200)
unstuff.c

index 997a56241825016fd8a81a19800ddc98af812b05..84c38b9b47097b22780d3060fcc93d1635d0e6c4 100644 (file)
--- a/unstuff.c
+++ b/unstuff.c
@@ -31,65 +31,62 @@ int unstuff_reference(uint8_t* dst, const uint8_t* src, size_t len)
 
 int unstuff_fast(uint8_t* dst, const uint8_t* src, size_t len)
 {
-       size_t bytes_read = 0;
        size_t bytes_written = 0;
+       const uint8_t* sptr = src;
+       uint8_t* dptr = dst;
 
-       while (bytes_read < len) {
+       while (len > 0) {
                // Find the first marker byte in the rest of the stream.
-               const uint8_t* ptr = memchr(src, MARKER_CHAR, len - bytes_read);
+               const uint8_t* ptr = memchr(sptr, MARKER_CHAR, len);
                if (ptr == NULL) {
                        // No marker bytes left.
-                       size_t len_to_copy = len - bytes_read;
-                       memcpy(dst, src, len_to_copy);
-                       bytes_written += len_to_copy;
+                       memcpy(dptr, sptr, len);
+                       dptr += len;
                        break;
                }
 
-               const size_t len_to_copy = ptr - src + 1;
-               memcpy(dst, src, len_to_copy);
+               const size_t len_to_copy = ptr - sptr + 1;
+               memcpy(dptr, sptr, len_to_copy);
 
-               src += len_to_copy;
-               dst += len_to_copy;
-               bytes_read += len_to_copy;
-               bytes_written += len_to_copy;
+               sptr += len_to_copy;
+               dptr += len_to_copy;
+               len -= len_to_copy;
 
-               assert(bytes_read <= len);
-               if (bytes_read == len) {
+               if (len == 0) {
                        // Partial marker.
                        return -1;
                } else {
-                       if (*src != STUFF_MARKER) {
+                       if (*sptr != STUFF_MARKER) {
                                return -1;
                        }
-                       ++src;
-                       ++bytes_read;
+                       ++sptr;
+                       --len;
                }
        }
 
-       return bytes_written;   
+       return dptr - dst;      
 }
 
 int unstuff_sse41(uint8_t* dst, const uint8_t* src, size_t len)
 {
        __m128i marker_search = _mm_set1_epi8(MARKER_CHAR);
 
-       size_t bytes_read = 0;
-       size_t bytes_written = 0;
-       while (len - bytes_read >= 16) {
-               __m128i data = _mm_lddqu_si128((const __m128i*)src);
+       const uint8_t* sptr = src;
+       uint8_t* dptr = dst;
+       while (len >= 16) {
+               __m128i data = _mm_lddqu_si128((const __m128i*)sptr);
 
                // The store here is safe (if there's stuff bytes, the data
                // will simply get overwritten in the slow path); fire it off
                // here so it can run in parallel with the compare.
-               _mm_storeu_si128((__m128i*)dst, data);
+               _mm_storeu_si128((__m128i*)dptr, data);
 
                __m128i eq_mask = _mm_cmpeq_epi8(data, marker_search);
                if (_mm_test_all_zeros(eq_mask, eq_mask)) {
                        // Fast path; no stuff byte found.
-                       src += 16;
-                       dst += 16;
-                       bytes_read += 16;
-                       bytes_written += 16;
+                       sptr += 16;
+                       dptr += 16;
+                       len -= 16;
                        continue;
                }
 
@@ -97,28 +94,28 @@ int unstuff_sse41(uint8_t* dst, const uint8_t* src, size_t len)
                // defer that to the next chunk. Apart from that, we just keep
                // going one by one byte. We could perhaps speed this up with
                // the data from eq_mask(), but we're not doing that yet.
-               size_t len_this_chunk = (src[15] == 0xff ? 15 : 16);
-               for (unsigned j = 0; j < len_this_chunk; ++j, ++dst, ++src, ++bytes_written) {
-                       *dst = *src;
+               size_t len_this_chunk = (sptr[15] == 0xff ? 15 : 16);
+               for (unsigned j = 0; j < len_this_chunk; ++j, ++dptr, ++sptr) {
+                       *dptr = *sptr;
 
-                       if (*src == MARKER_CHAR) {
+                       if (*sptr == MARKER_CHAR) {
                                assert(j != 15);
-                               if (src[1] != STUFF_MARKER) {
+                               if (sptr[1] != STUFF_MARKER) {
                                        return -1;
                                }
                                        
                                // Skip the stuff byte.
-                               ++src, ++j;
+                               ++sptr, ++j;
                        }
                }
-               bytes_read += len_this_chunk;
+               len -= len_this_chunk;
        }
 
        // Do the final bytes via the reference path.
-       int ret = unstuff_reference(dst, src, len - bytes_read);
+       int ret = unstuff_reference(dptr, sptr, len);
        if (ret == -1) {
                return -1;
        } else {
-               return bytes_written + ret;
+               return (dptr - dst) + ret;
        }
 }