Try to reduce the register pressure a bit in the unstuff code.
[fjl] / unstuff.c
1 #include <string.h>
2 #include <assert.h>
3 #include <mmintrin.h>
4 #include <xmmintrin.h>
5 #include <smmintrin.h>
6
7 #include "unstuff.h"
8
// Byte value that must be followed by a stuff byte in the stuffed stream.
#define MARKER_CHAR 0xff
// Stuff byte expected after every marker byte; it is dropped when unstuffing.
#define STUFF_MARKER 0x00

/*
 * Straightforward byte-by-byte unstuffing.
 *
 * Copies `len` bytes from `src` to `dst`, dropping the STUFF_MARKER byte
 * that follows each MARKER_CHAR.
 *
 * Returns the number of bytes written to `dst`, or -1 if a MARKER_CHAR is
 * the last input byte or is followed by anything other than STUFF_MARKER.
 * On failure, the bytes up to and including the offending marker have
 * already been written to `dst`.
 */
int unstuff_reference(uint8_t* dst, const uint8_t* src, size_t len)
{
        size_t bytes_written = 0;

        // The index has the same type as `len`; a narrower counter would
        // wrap around before reaching `len` for very large inputs.
        for (size_t i = 0; i < len; ++i, ++dst, ++src, ++bytes_written) {
                *dst = *src;
                if (*src == MARKER_CHAR) {
                        // A marker needs a stuff byte right after it.
                        if (i == len - 1 || src[1] != STUFF_MARKER) {
                                return -1;
                        }

                        // Skip the stuff byte.
                        ++src, ++i;
                }
        }

        assert(bytes_written <= len);
        return (int)bytes_written;
}
31
32 int unstuff_fast(uint8_t* dst, const uint8_t* src, size_t len)
33 {
34         size_t bytes_written = 0;
35         const uint8_t* sptr = src;
36         uint8_t* dptr = dst;
37
38         while (len > 0) {
39                 // Find the first marker byte in the rest of the stream.
40                 const uint8_t* ptr = memchr(sptr, MARKER_CHAR, len);
41                 if (ptr == NULL) {
42                         // No marker bytes left.
43                         memcpy(dptr, sptr, len);
44                         dptr += len;
45                         break;
46                 }
47
48                 const size_t len_to_copy = ptr - sptr + 1;
49                 memcpy(dptr, sptr, len_to_copy);
50
51                 sptr += len_to_copy;
52                 dptr += len_to_copy;
53                 len -= len_to_copy;
54
55                 if (len == 0) {
56                         // Partial marker.
57                         return -1;
58                 } else {
59                         if (*sptr != STUFF_MARKER) {
60                                 return -1;
61                         }
62                         ++sptr;
63                         --len;
64                 }
65         }
66
67         return dptr - dst;      
68 }
69
70 int unstuff_sse41(uint8_t* dst, const uint8_t* src, size_t len)
71 {
72         __m128i marker_search = _mm_set1_epi8(MARKER_CHAR);
73
74         const uint8_t* sptr = src;
75         uint8_t* dptr = dst;
76         while (len >= 16) {
77                 __m128i data = _mm_lddqu_si128((const __m128i*)sptr);
78
79                 // The store here is safe (if there's stuff bytes, the data
80                 // will simply get overwritten in the slow path); fire it off
81                 // here so it can run in parallel with the compare.
82                 _mm_storeu_si128((__m128i*)dptr, data);
83
84                 __m128i eq_mask = _mm_cmpeq_epi8(data, marker_search);
85                 if (_mm_test_all_zeros(eq_mask, eq_mask)) {
86                         // Fast path; no stuff byte found.
87                         sptr += 16;
88                         dptr += 16;
89                         len -= 16;
90                         continue;
91                 }
92
93                 // We found a stuff byte. If it was the last byte, we just
94                 // defer that to the next chunk. Apart from that, we just keep
95                 // going one by one byte. We could perhaps speed this up with
96                 // the data from eq_mask(), but we're not doing that yet.
97                 size_t len_this_chunk = (sptr[15] == 0xff ? 15 : 16);
98                 for (unsigned j = 0; j < len_this_chunk; ++j, ++dptr, ++sptr) {
99                         *dptr = *sptr;
100
101                         if (*sptr == MARKER_CHAR) {
102                                 assert(j != 15);
103                                 if (sptr[1] != STUFF_MARKER) {
104                                         return -1;
105                                 }
106                                         
107                                 // Skip the stuff byte.
108                                 ++sptr, ++j;
109                         }
110                 }
111                 len -= len_this_chunk;
112         }
113
114         // Do the final bytes via the reference path.
115         int ret = unstuff_reference(dptr, sptr, len);
116         if (ret == -1) {
117                 return -1;
118         } else {
119                 return (dptr - dst) + ret;
120         }
121 }