X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=fake_capture.cpp;h=7d23d8cafdc70df47f123f06200b9730c5897742;hb=08a992d58c34da8d8bbd70226f7e85c9f30d9514;hp=5ad0b6205b41b26965a94a2fbb61afaa5e364670;hpb=884f2c044c4ab008ce9295edd343ee851eba4576;p=nageru

diff --git a/fake_capture.cpp b/fake_capture.cpp
index 5ad0b62..7d23d8c 100644
--- a/fake_capture.cpp
+++ b/fake_capture.cpp
@@ -10,6 +10,9 @@
 #include
 #include
 #include
+#if __SSE2__
+#include
+#endif
 #include
 
 #include "bmusb/bmusb.h"
@@ -26,13 +29,28 @@ constexpr uint8_t crs[NUM_COLORS] = { 240, 34, 110, 128 };
 
 using namespace std;
 
+namespace bmusb {
 namespace {
 
-// TODO: SSE2-optimize (or at least write full int64s) if speed becomes a problem.
-
 void memset2(uint8_t *s, const uint8_t c[2], size_t n)
 {
-	for (size_t i = 0; i < n; ++i) {
+	size_t i = 0;
+#if __SSE2__
+	const uint8_t c_expanded[16] = {
+		c[0], c[1], c[0], c[1], c[0], c[1], c[0], c[1],
+		c[0], c[1], c[0], c[1], c[0], c[1], c[0], c[1]
+	};
+	__m128i cc = *(__m128i *)c_expanded;
+	__m128i *out = (__m128i *)s;
+
+	for ( ; i < (n & ~15); i += 16) {
+		_mm_storeu_si128(out++, cc);
+		_mm_storeu_si128(out++, cc);
+	}
+
+	s = (uint8_t *)out;
+#endif
+	for ( ; i < n; ++i) {
 		*s++ = c[0];
 		*s++ = c[1];
 	}
@@ -40,7 +58,23 @@ void memset2(uint8_t *s, const uint8_t c[2], size_t n)
 
 void memset4(uint8_t *s, const uint8_t c[4], size_t n)
 {
-	for (size_t i = 0; i < n; ++i) {
+	size_t i = 0;
+#if __SSE2__
+	const uint8_t c_expanded[16] = {
+		c[0], c[1], c[2], c[3], c[0], c[1], c[2], c[3],
+		c[0], c[1], c[2], c[3], c[0], c[1], c[2], c[3]
+	};
+	__m128i cc = *(__m128i *)c_expanded;
+	__m128i *out = (__m128i *)s;
+
+	for ( ; i < (n & ~7); i += 8) {
+		_mm_storeu_si128(out++, cc);
+		_mm_storeu_si128(out++, cc);
+	}
+
+	s = (uint8_t *)out;
+#endif
+	for ( ; i < n; ++i) {
 		*s++ = c[0];
 		*s++ = c[1];
 		*s++ = c[2];
@@ -230,3 +264,5 @@ void FakeCapture::producer_thread_func()
 		dequeue_cleanup_callback();
 	}
 }
+
+} // namespace bmusb