+++ /dev/null
-/*\r
-* Copyright (c) 2011 Sveriges Television AB <info@casparcg.com>\r
-*\r
-* This file is part of CasparCG (www.casparcg.com).\r
-*\r
-* CasparCG is free software: you can redistribute it and/or modify\r
-* it under the terms of the GNU General Public License as published by\r
-* the Free Software Foundation, either version 3 of the License, or\r
-* (at your option) any later version.\r
-*\r
-* CasparCG is distributed in the hope that it will be useful,\r
-* but WITHOUT ANY WARRANTY; without even the implied warranty of\r
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r
-* GNU General Public License for more details.\r
-*\r
-* You should have received a copy of the GNU General Public License\r
-* along with CasparCG. If not, see <http://www.gnu.org/licenses/>.\r
-*\r
-* Author: Robert Nagy, ronag89@gmail.com\r
-*/\r
-\r
-#pragma once\r
-\r
#include "../utility/assert.h"
#include "../memory/safe_ptr.h"

#include <assert.h>
#include <intrin.h>
#include <stdint.h>
#include <string.h>
-\r
-namespace caspar {\r
-\r
-namespace detail {\r
-\r
-struct true_type{};\r
-struct false_type{};\r
-\r
-template<typename D_A = false_type, typename S_A = false_type>\r
-struct memcpy_impl\r
-{\r
- void* operator()(void* dest, const void* source, size_t count)\r
- { \r
- CASPAR_ASSERT(dest != nullptr);\r
- CASPAR_ASSERT(source != nullptr);\r
- \r
- auto dest128 = reinterpret_cast<__m128i*>(dest);\r
- auto source128 = reinterpret_cast<const __m128i*>(source);\r
- \r
- size_t rest = count & 255;\r
- count &= ~255;\r
-\r
- for(int n = 0; n < count; n += 256)\r
- {\r
- auto xmm0 = _mm_loadu_si128(source128++);\r
- auto xmm1 = _mm_loadu_si128(source128++);\r
- auto xmm2 = _mm_loadu_si128(source128++);\r
- auto xmm3 = _mm_loadu_si128(source128++);\r
- auto xmm4 = _mm_loadu_si128(source128++);\r
- auto xmm5 = _mm_loadu_si128(source128++);\r
- auto xmm6 = _mm_loadu_si128(source128++);\r
- auto xmm7 = _mm_loadu_si128(source128++);\r
- auto xmm8 = _mm_loadu_si128(source128++);\r
- auto xmm9 = _mm_loadu_si128(source128++);\r
- auto xmm10 = _mm_loadu_si128(source128++);\r
- auto xmm11 = _mm_loadu_si128(source128++);\r
- auto xmm12 = _mm_loadu_si128(source128++);\r
- auto xmm13 = _mm_loadu_si128(source128++);\r
- auto xmm14 = _mm_loadu_si128(source128++);\r
- auto xmm15 = _mm_loadu_si128(source128++);\r
- auto xmm16 = _mm_loadu_si128(source128++);\r
-\r
- _mm_storeu_si128(dest128++, xmm0);\r
- _mm_storeu_si128(dest128++, xmm1);\r
- _mm_storeu_si128(dest128++, xmm2);\r
- _mm_storeu_si128(dest128++, xmm3);\r
- _mm_storeu_si128(dest128++, xmm4);\r
- _mm_storeu_si128(dest128++, xmm5);\r
- _mm_storeu_si128(dest128++, xmm6);\r
- _mm_storeu_si128(dest128++, xmm7);\r
- _mm_storeu_si128(dest128++, xmm8);\r
- _mm_storeu_si128(dest128++, xmm9);\r
- _mm_storeu_si128(dest128++, xmm10);\r
- _mm_storeu_si128(dest128++, xmm11);\r
- _mm_storeu_si128(dest128++, xmm12);\r
- _mm_storeu_si128(dest128++, xmm13);\r
- _mm_storeu_si128(dest128++, xmm14);\r
- _mm_storeu_si128(dest128++, xmm15);\r
- _mm_storeu_si128(dest128++, xmm16);\r
- }\r
- \r
- return memcpy(reinterpret_cast<int8_t*>(dest)+count, reinterpret_cast<const int8_t*>(source)+count, rest);\r
- }\r
-};\r
-\r
-template<>\r
-struct memcpy_impl<true_type, true_type>\r
-{\r
- void* operator()(void* dest, const void* source, size_t count)\r
- {\r
- CASPAR_ASSERT(dest != nullptr);\r
- CASPAR_ASSERT(source != nullptr);\r
- CASPAR_ASSERT(reinterpret_cast<int>(dest) % 16 == 0);\r
- CASPAR_ASSERT(reinterpret_cast<int>(source) % 16 == 0);\r
- \r
- auto dest128 = reinterpret_cast<__m128i*>(dest);\r
- auto source128 = reinterpret_cast<const __m128i*>(source);\r
- \r
- size_t rest = count % 256;\r
- count -= rest;\r
-\r
- for(int n = 0; n < count; n += 256)\r
- {\r
- auto xmm0 = _mm_load_si128(source128++);\r
- auto xmm1 = _mm_load_si128(source128++);\r
- auto xmm2 = _mm_load_si128(source128++);\r
- auto xmm3 = _mm_load_si128(source128++);\r
- auto xmm4 = _mm_load_si128(source128++);\r
- auto xmm5 = _mm_load_si128(source128++);\r
- auto xmm6 = _mm_load_si128(source128++);\r
- auto xmm7 = _mm_load_si128(source128++);\r
- auto xmm8 = _mm_load_si128(source128++);\r
- auto xmm9 = _mm_load_si128(source128++);\r
- auto xmm10 = _mm_load_si128(source128++);\r
- auto xmm11 = _mm_load_si128(source128++);\r
- auto xmm12 = _mm_load_si128(source128++);\r
- auto xmm13 = _mm_load_si128(source128++);\r
- auto xmm14 = _mm_load_si128(source128++);\r
- auto xmm15 = _mm_load_si128(source128++);\r
- auto xmm16 = _mm_load_si128(source128++);\r
-\r
- _mm_stream_si128(dest128++, xmm0);\r
- _mm_stream_si128(dest128++, xmm1);\r
- _mm_stream_si128(dest128++, xmm2);\r
- _mm_stream_si128(dest128++, xmm3);\r
- _mm_stream_si128(dest128++, xmm4);\r
- _mm_stream_si128(dest128++, xmm5);\r
- _mm_stream_si128(dest128++, xmm6);\r
- _mm_stream_si128(dest128++, xmm7);\r
- _mm_stream_si128(dest128++, xmm8);\r
- _mm_stream_si128(dest128++, xmm9);\r
- _mm_stream_si128(dest128++, xmm10);\r
- _mm_stream_si128(dest128++, xmm11);\r
- _mm_stream_si128(dest128++, xmm12);\r
- _mm_stream_si128(dest128++, xmm13);\r
- _mm_stream_si128(dest128++, xmm14);\r
- _mm_stream_si128(dest128++, xmm15);\r
- _mm_stream_si128(dest128++, xmm16);\r
- }\r
- \r
- return memcpy(reinterpret_cast<int8_t*>(dest)+count, reinterpret_cast<const int8_t*>(source)+count, rest);\r
- }\r
-};\r
-\r
-template<>\r
-struct memcpy_impl<true_type, false_type>\r
-{\r
- void* operator()(void* dest, const void* source, size_t count)\r
- {\r
- CASPAR_ASSERT(dest != nullptr);\r
- CASPAR_ASSERT(source != nullptr);\r
- CASPAR_ASSERT(reinterpret_cast<int>(dest) % 16 == 0);\r
- \r
- auto dest128 = reinterpret_cast<__m128i*>(dest);\r
- auto source128 = reinterpret_cast<const __m128i*>(source);\r
- \r
- size_t rest = count % 256;\r
- count -= rest;\r
-\r
- for(int n = 0; n < count; n += 256)\r
- {\r
- auto xmm0 = _mm_loadu_si128(source128++);\r
- auto xmm1 = _mm_loadu_si128(source128++);\r
- auto xmm2 = _mm_loadu_si128(source128++);\r
- auto xmm3 = _mm_loadu_si128(source128++);\r
- auto xmm4 = _mm_loadu_si128(source128++);\r
- auto xmm5 = _mm_loadu_si128(source128++);\r
- auto xmm6 = _mm_loadu_si128(source128++);\r
- auto xmm7 = _mm_loadu_si128(source128++);\r
- auto xmm8 = _mm_loadu_si128(source128++);\r
- auto xmm9 = _mm_loadu_si128(source128++);\r
- auto xmm10 = _mm_loadu_si128(source128++);\r
- auto xmm11 = _mm_loadu_si128(source128++);\r
- auto xmm12 = _mm_loadu_si128(source128++);\r
- auto xmm13 = _mm_loadu_si128(source128++);\r
- auto xmm14 = _mm_loadu_si128(source128++);\r
- auto xmm15 = _mm_loadu_si128(source128++);\r
- auto xmm16 = _mm_loadu_si128(source128++);\r
-\r
- _mm_stream_si128(dest128++, xmm0);\r
- _mm_stream_si128(dest128++, xmm1);\r
- _mm_stream_si128(dest128++, xmm2);\r
- _mm_stream_si128(dest128++, xmm3);\r
- _mm_stream_si128(dest128++, xmm4);\r
- _mm_stream_si128(dest128++, xmm5);\r
- _mm_stream_si128(dest128++, xmm6);\r
- _mm_stream_si128(dest128++, xmm7);\r
- _mm_stream_si128(dest128++, xmm8);\r
- _mm_stream_si128(dest128++, xmm9);\r
- _mm_stream_si128(dest128++, xmm10);\r
- _mm_stream_si128(dest128++, xmm11);\r
- _mm_stream_si128(dest128++, xmm12);\r
- _mm_stream_si128(dest128++, xmm13);\r
- _mm_stream_si128(dest128++, xmm14);\r
- _mm_stream_si128(dest128++, xmm15);\r
- _mm_stream_si128(dest128++, xmm16);\r
- }\r
- \r
- return memcpy(reinterpret_cast<int8_t*>(dest)+count, reinterpret_cast<const int8_t*>(source)+count, rest);\r
- }\r
-};\r
-\r
-template<>\r
-struct memcpy_impl<false_type, true_type>\r
-{\r
- void* operator()(void* dest, const void* source, size_t count)\r
- {\r
- CASPAR_ASSERT(dest != nullptr);\r
- CASPAR_ASSERT(source != nullptr);\r
- CASPAR_ASSERT(reinterpret_cast<int>(source) % 16 == 0);\r
- \r
- auto dest128 = reinterpret_cast<__m128i*>(dest);\r
- auto source128 = reinterpret_cast<const __m128i*>(source);\r
- \r
- size_t rest = count % 256;\r
- count -= rest;\r
-\r
- for(int n = 0; n < count; n += 256)\r
- {\r
- auto xmm0 = _mm_load_si128(source128++);\r
- auto xmm1 = _mm_load_si128(source128++);\r
- auto xmm2 = _mm_load_si128(source128++);\r
- auto xmm3 = _mm_load_si128(source128++);\r
- auto xmm4 = _mm_load_si128(source128++);\r
- auto xmm5 = _mm_load_si128(source128++);\r
- auto xmm6 = _mm_load_si128(source128++);\r
- auto xmm7 = _mm_load_si128(source128++);\r
- auto xmm8 = _mm_load_si128(source128++);\r
- auto xmm9 = _mm_load_si128(source128++);\r
- auto xmm10 = _mm_load_si128(source128++);\r
- auto xmm11 = _mm_load_si128(source128++);\r
- auto xmm12 = _mm_load_si128(source128++);\r
- auto xmm13 = _mm_load_si128(source128++);\r
- auto xmm14 = _mm_load_si128(source128++);\r
- auto xmm15 = _mm_load_si128(source128++);\r
- auto xmm16 = _mm_load_si128(source128++);\r
-\r
- _mm_storeu_si128(dest128++, xmm0);\r
- _mm_storeu_si128(dest128++, xmm1);\r
- _mm_storeu_si128(dest128++, xmm2);\r
- _mm_storeu_si128(dest128++, xmm3);\r
- _mm_storeu_si128(dest128++, xmm4);\r
- _mm_storeu_si128(dest128++, xmm5);\r
- _mm_storeu_si128(dest128++, xmm6);\r
- _mm_storeu_si128(dest128++, xmm7);\r
- _mm_storeu_si128(dest128++, xmm8);\r
- _mm_storeu_si128(dest128++, xmm9);\r
- _mm_storeu_si128(dest128++, xmm10);\r
- _mm_storeu_si128(dest128++, xmm11);\r
- _mm_storeu_si128(dest128++, xmm12);\r
- _mm_storeu_si128(dest128++, xmm13);\r
- _mm_storeu_si128(dest128++, xmm14);\r
- _mm_storeu_si128(dest128++, xmm15);\r
- _mm_storeu_si128(dest128++, xmm16);\r
- }\r
-\r
- return memcpy(reinterpret_cast<int8_t*>(dest)+count, reinterpret_cast<const int8_t*>(source)+count, rest);\r
- }\r
-};\r
-\r
-}\r
-\r
// Copies count bytes from source to dest and returns dest.
//
// dest   : destination buffer; must be writable for count bytes.
// source : source buffer; must be readable for count bytes.
// count  : number of bytes to copy.
//
// The copy is performed with plain memcpy. The alignment-based dispatch to
// detail::memcpy_impl is disabled and kept below only as reference.
template<typename T>
T* fast_memcpy(T* dest, const void* source, size_t count)
{
    //const bool s_aligned = (reinterpret_cast<uintptr_t>(source) & 15) == 0;
    //const bool d_aligned = (reinterpret_cast<uintptr_t>(dest) & 15) == 0;
    //
    //if(d_aligned && s_aligned)
    //    detail::memcpy_impl<detail::true_type, detail::true_type>()(dest, source, count);
    //else if(d_aligned)
    //    detail::memcpy_impl<detail::true_type, detail::false_type>()(dest, source, count);
    //else if(s_aligned)
    //    detail::memcpy_impl<detail::false_type, detail::true_type>()(dest, source, count);
    //else
    //    detail::memcpy_impl<detail::false_type, detail::false_type>()(dest, source, count);

    // memcpy returns its first argument, so dest can be returned directly without a cast.
    memcpy(dest, source, count);
    return dest;
}
-\r
-}\r