-/*\r
-* copyright (c) 2010 Sveriges Television AB <info@casparcg.com>\r
-*\r
-* This file is part of CasparCG.\r
-*\r
-* CasparCG is free software: you can redistribute it and/or modify\r
-* it under the terms of the GNU General Public License as published by\r
-* the Free Software Foundation, either version 3 of the License, or\r
-* (at your option) any later version.\r
-*\r
-* CasparCG is distributed in the hope that it will be useful,\r
-* but WITHOUT ANY WARRANTY; without even the implied warranty of\r
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r
-* GNU General Public License for more details.\r
-\r
-* You should have received a copy of the GNU General Public License\r
-* along with CasparCG. If not, see <http://www.gnu.org/licenses/>.\r
-*\r
-*/\r
- \r
-#include "../stdafx.h"\r
-\r
-#include "shuffle.h"\r
-\r
-#include "../utility/types.h"\r
-\r
-#include "tbb/parallel_for.h"\r
-#include "tbb/blocked_range.h"\r
-\r
-#include <cassert>\r
-#include <intrin.h>\r
-#include <functional>\r
-\r
-using namespace std::tr1::placeholders;\r
-\r
-namespace caspar{ namespace common{ namespace image{\r
-\r
-static const size_t STRIDE = sizeof(__m128i)*4; /* size of four __m128i values: the chunk granularity used by shuffleParallel and the amount the SIMD kernels consume per outer-loop iteration */\r
-\r
-// Body executed by tbb::parallel_for (see shuffleParallel) for one sub-range of chunks.\r
-// r holds chunk indices; they are scaled by STRIDE into an offset and a length, and func\r
-// is invoked on that slice of dest/source with the channel selectors passed through.\r
-// The offset is applied to s8 pointers, so it is presumably a byte offset (s8 is declared elsewhere).\r
-void DoshuffleParallel(const tbb::blocked_range<size_t>& r, const std::tr1::function<void(void*, const void*, size_t, const u8, const u8, const u8, const u8)>& func, void* dest, const void* source, const u8 red, const u8 green, const u8 blue, const u8 alpha)\r
-{\r
-    const size_t slice_offset = r.begin()*STRIDE;\r
-    const size_t slice_length = r.size()*STRIDE;\r
-\r
-    s8* const       slice_dest   = reinterpret_cast<s8*>(dest) + slice_offset;\r
-    const s8* const slice_source = reinterpret_cast<const s8*>(source) + slice_offset;\r
-\r
-    func(slice_dest, slice_source, slice_length, red, green, blue, alpha);\r
-}\r
-\r
-// Runs func over the buffer in parallel: the buffer is viewed as size/STRIDE chunks and\r
-// tbb::parallel_for hands sub-ranges of those chunks to DoshuffleParallel.\r
-// The division truncates, so a trailing remainder shorter than STRIDE is never passed to func.\r
-void shuffleParallel(const std::tr1::function<void(void*, const void*, size_t, const u8, const u8, const u8, const u8)>& func, void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)\r
-{\r
-    const size_t chunk_count = size/STRIDE;\r
-    const tbb::blocked_range<size_t> chunks(0, chunk_count);\r
-\r
-    // _1 is the sub-range supplied by TBB; every other argument is bound by value.\r
-    tbb::parallel_for(chunks, std::bind(&DoshuffleParallel, std::placeholders::_1, func, dest, source, red, green, blue, alpha));\r
-}\r
-\r
-// Selects the shuffle entry point for the given SIMD level: the SSSE3 version when\r
-// simd >= SSSE3, otherwise the SSE2 version when simd >= SSE2, otherwise the scalar reference.\r
-shuffle_fun get_shuffle_fun(SIMD simd)\r
-{\r
-    if(simd >= SSSE3)\r
-        return shuffleParallel_SSSE3;\r
-\r
-    return simd >= SSE2 ? shuffleParallel_SSE2 : shuffleParallel_REF;\r
-}\r
-\r
-// Byte-shuffles a buffer of 4-byte pixels with the SSSE3 _mm_shuffle_epi8 instruction:\r
-// output bytes 0..3 of every pixel are input bytes red/green/blue/alpha of the same pixel.\r
-// size is expected to be a multiple of STRIDE (asserted). source is read with\r
-// _mm_load_si128 and dest is written with _mm_stream_si128, which both require\r
-// 16-byte aligned addresses.\r
-void shuffle_SSSE3(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)\r
-{\r
-    static const unsigned int PSD = 64; // prefetch distance, counted in __m128i elements\r
-\r
-    assert(source != NULL && dest != NULL);\r
-    assert(red > -1 && red < 4 && green > -1 && green < 4 && blue > -1 && blue < 4 && alpha > -1 && alpha < 4 && "Invalid mask");\r
-    assert(size % STRIDE == 0);\r
-\r
-    const __m128i* in  = reinterpret_cast<const __m128i*>(source);\r
-    __m128i*       out = reinterpret_cast<__m128i*>(dest);\r
-\r
-    // One selector byte per output byte (arguments run from byte 15 down to byte 0);\r
-    // the +4/+8/+12 offsets address the 2nd, 3rd and 4th pixel held in the vector.\r
-    const __m128i selector = _mm_set_epi8(alpha+12, blue+12, green+12, red+12, alpha+8, blue+8, green+8, red+8, alpha+4, blue+4, green+4, red+4, alpha, blue, green, red);\r
-\r
-    const size_t block_count = size/STRIDE;\r
-    for(size_t block = 0; block != block_count; ++block)\r
-    {\r
-        // TODO: put prefetch between calculations?(R.N)\r
-        _mm_prefetch(reinterpret_cast<const s8*>(in + PSD), _MM_HINT_NTA);\r
-\r
-        // Work on an entire STRIDE-sized block before the next prefetch.\r
-        // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/\r
-\r
-        // Loads and streaming stores are deliberately interleaved; keep this order.\r
-        const __m128i v0 = _mm_load_si128(in + 0);\r
-        const __m128i v1 = _mm_load_si128(in + 1);\r
-\r
-        _mm_stream_si128(out + 0, _mm_shuffle_epi8(v0, selector));\r
-\r
-        const __m128i v2 = _mm_load_si128(in + 2);\r
-\r
-        _mm_stream_si128(out + 1, _mm_shuffle_epi8(v1, selector));\r
-\r
-        const __m128i v3 = _mm_load_si128(in + 3);\r
-\r
-        _mm_stream_si128(out + 2, _mm_shuffle_epi8(v2, selector));\r
-        _mm_stream_si128(out + 3, _mm_shuffle_epi8(v3, selector));\r
-\r
-        in  += 4;\r
-        out += 4;\r
-    }\r
-    _mm_mfence(); // ensure the last write-combining buffers get flushed to memory\r
-}\r
-\r
-// Parallel front end for shuffle_SSSE3: forwards all arguments to shuffleParallel,\r
-// which splits the buffer into STRIDE-sized chunks and runs the kernel on them.\r
-void shuffleParallel_SSSE3(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)\r
-{\r
-    void (*const kernel)(void*, const void*, size_t, const u8, const u8, const u8, const u8) = &shuffle_SSSE3;\r
-\r
-    shuffleParallel(kernel, dest, source, size, red, green, blue, alpha);\r
-}\r
-\r
-// Byte-shuffles a buffer of 4-byte pixels using only SSE2 shifts and masks:\r
-// output bytes 0..3 of every pixel are input bytes red/green/blue/alpha of the same pixel.\r
-// size is expected to be a multiple of STRIDE (asserted). source is read with\r
-// _mm_load_si128 and dest is written with _mm_stream_si128, which both require\r
-// 16-byte aligned addresses.\r
-// TODO: should be optimized for different combinations (R.N)\r
-void shuffle_SSE2(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)\r
-{\r
-    static const u32 PSD = 64; // prefetch distance, counted in __m128i elements\r
-\r
-    assert(source != NULL && dest != NULL);\r
-    assert(red > -1 && red < 4 && green > -1 && green < 4 && blue > -1 && blue < 4 && alpha > -1 && alpha < 4);\r
-    assert(size % STRIDE == 0);\r
-\r
-    // Plain locals rather than function-local statics: this kernel is run concurrently by\r
-    // tbb::parallel_for (via shuffleParallel_SSE2), and the lazy initialization of local\r
-    // statics is not synchronized on pre-C++11 compilers.\r
-    const __m128i himask = _mm_set1_epi32(static_cast<int>(0xFF000000));\r
-    const __m128i lomask = _mm_set1_epi32(0x000000FF);\r
-\r
-    const __m128i* source128 = reinterpret_cast<const __m128i*>(source);\r
-    __m128i*       dest128   = reinterpret_cast<__m128i*>(dest);\r
-\r
-    // Right shifts bring the selected byte down to byte 0; left shifts bring it up to byte 3.\r
-    const int shft0 = (red)*8;\r
-    const int shft1 = (green)*8;\r
-    const int shft2 = (3-blue)*8;\r
-    const int shft3 = (3-alpha)*8;\r
-\r
-    const size_t block_count = size/STRIDE;\r
-    for(size_t k = 0; k < block_count; ++k)\r
-    {\r
-        // TODO: dynamic prefetch scheduling distance? needs to be optimized (R.N)\r
-        // TODO: put prefetch between calculations?(R.N)\r
-        _mm_prefetch(reinterpret_cast<const s8*>(source128 + PSD), _MM_HINT_NTA);\r
-\r
-        // Work on an entire STRIDE-sized block before the next prefetch.\r
-        // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/\r
-\r
-        for(size_t n = 0; n < 4; ++n, ++dest128, ++source128)\r
-        {\r
-            const __m128i s = _mm_load_si128(source128);\r
-\r
-            // output byte 0 <- input byte 'red'\r
-            const __m128i out0 = _mm_and_si128(_mm_srli_epi32(s, shft0), lomask);\r
-            // output byte 1 <- input byte 'green'\r
-            const __m128i out1 = _mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(s, shft1), lomask), 8);\r
-            // output byte 2 <- input byte 'blue'\r
-            const __m128i out2 = _mm_srli_epi32(_mm_and_si128(_mm_slli_epi32(s, shft2), himask), 8);\r
-            // output byte 3 <- input byte 'alpha'\r
-            const __m128i out3 = _mm_and_si128(_mm_slli_epi32(s, shft3), himask);\r
-\r
-            const __m128i r = _mm_or_si128(_mm_or_si128(out0, out1), _mm_or_si128(out2, out3));\r
-\r
-            _mm_stream_si128(dest128, r);\r
-        }\r
-    }\r
-    _mm_mfence(); // ensure the last write-combining buffers get flushed to memory\r
-}\r
-\r
-// Parallel front end for shuffle_SSE2: forwards all arguments to shuffleParallel,\r
-// which splits the buffer into STRIDE-sized chunks and runs the kernel on them.\r
-void shuffleParallel_SSE2(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)\r
-{\r
-    void (*const kernel)(void*, const void*, size_t, const u8, const u8, const u8, const u8) = &shuffle_SSE2;\r
-\r
-    shuffleParallel(kernel, dest, source, size, red, green, blue, alpha);\r
-}\r
-\r
-// Scalar reference implementation of the channel shuffle.\r
-// For every group of four u8 elements, output elements 0..3 are input elements\r
-// red/green/blue/alpha of the same group. size is expected to be a multiple of 4 (asserted).\r
-void shuffle_REF(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)\r
-{\r
-    assert(source != NULL && dest != NULL);\r
-    assert(red > -1 && red < 4 && green > -1 && green < 4 && blue > -1 && blue < 4 && alpha > -1 && alpha < 4);\r
-    assert(size % 4 == 0);\r
-\r
-    const u8* in  = reinterpret_cast<const u8*>(source);\r
-    u8*       out = reinterpret_cast<u8*>(dest);\r
-\r
-    for(size_t done = 0; done < size; done += 4, in += 4, out += 4)\r
-    {\r
-        // Read all four channels before writing any, so dest may be the same buffer as source.\r
-        const u8 picked[4] = { in[red], in[green], in[blue], in[alpha] };\r
-\r
-        out[0] = picked[0];\r
-        out[1] = picked[1];\r
-        out[2] = picked[2];\r
-        out[3] = picked[3];\r
-    }\r
-}\r
-\r
-// Parallel front end for the scalar reference shuffle.\r
-// shuffleParallel only dispatches whole STRIDE-sized chunks (size/STRIDE truncates), while\r
-// shuffle_REF itself only needs size to be a multiple of 4. Any whole pixels left in the\r
-// final partial chunk are therefore converted here, serially, after the parallel pass.\r
-void shuffleParallel_REF(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)\r
-{\r
-    shuffleParallel(&shuffle_REF, dest, source, size, red, green, blue, alpha);\r
-\r
-    const size_t leftover   = size % STRIDE;\r
-    const size_t tail_bytes = leftover - leftover % 4; // whole 4-byte pixels only, never past the buffer end\r
-    if(tail_bytes != 0)\r
-    {\r
-        const size_t tail_start = size - leftover;\r
-        shuffle_REF(reinterpret_cast<u8*>(dest) + tail_start, reinterpret_cast<const u8*>(source) + tail_start, tail_bytes, red, green, blue, alpha);\r
-    }\r
-}\r
-\r
-}}}\r