+++ /dev/null
-#define NOMINMAX\r
-\r
-#include <functional>\r
-#include "timer.h"\r
-\r
-#include <iostream>\r
-#include <limits>\r
-#include <numeric>\r
-\r
-#include <tbb/scalable_allocator.h>\r
-#include <tbb/parallel_for.h>\r
-#include <intrin.h>\r
- \r
-const unsigned int TEST_COUNT = 1000; /* number of timed copies performed per benchmarked routine */\r
-const unsigned int TEST_SIZE = 1920*1080*4; /* bytes per copy (8294400, a multiple of 128); presumably one 1920x1080 frame at 4 bytes per pixel - confirm */\r
-\r
-void test(const std::function<void*(void*, const void*, size_t)>& func)\r
-{\r
- size_t size = TEST_SIZE;\r
- void* src = scalable_aligned_malloc(size, 16);\r
- void* dest = scalable_aligned_malloc(size, 16);\r
-\r
- timer timer;\r
- double total_time = 0.0;\r
- for(int i = 0 ;i < TEST_COUNT; ++i) \r
- {\r
- timer.start();\r
- func(dest, src, size);\r
- total_time += timer.time();\r
- if(memcmp(dest, src, size) != 0)\r
- std::cout << "ERROR";\r
- memset(src, rand(), size); // flush\r
- }\r
-\r
- scalable_aligned_free(dest);\r
- scalable_aligned_free(src);\r
-\r
- double unit_time = total_time/static_cast<double>(TEST_COUNT);\r
- std::cout << 1.0/unit_time*static_cast<double>(TEST_SIZE)/1000000000.0 << " gb/s";\r
-}\r
-\r
-/// Copies memory in 128-byte blocks using SSE2 aligned loads and\r
-/// non-temporal (streaming) stores.\r
-///\r
-/// Preconditions: `dest` and `source` are 16-byte aligned (required by\r
-/// _mm_load_si128 / _mm_stream_si128) and do not overlap.\r
-/// Only size/128 whole blocks are copied; a trailing remainder of\r
-/// size % 128 bytes is left untouched.\r
-///\r
-/// @param dest    Destination buffer.\r
-/// @param source  Source buffer.\r
-/// @param size    Number of bytes to copy.\r
-/// @return `dest`, like memcpy.\r
-void* memcpy_SSE2_1(void* dest, const void* source, size_t size)\r
-{\r
-    __m128i* dest_128 = reinterpret_cast<__m128i*>(dest);\r
-    const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);\r
-\r
-    // Fix: count whole 128-byte blocks. The original bound\r
-    // (n < size/16; n += 8) rounded the block count up, so any size that was\r
-    // not a multiple of 128 read and wrote past the end of both buffers.\r
-    for(size_t blocks = size/128; blocks != 0; --blocks)\r
-    {\r
-        // Prefetch 128 and 192 bytes ahead of the current read position.\r
-        _mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);\r
-        _mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);\r
-\r
-        // First 64 bytes of the block.\r
-        __m128i xmm0 = _mm_load_si128(source_128++);\r
-        __m128i xmm1 = _mm_load_si128(source_128++);\r
-        __m128i xmm2 = _mm_load_si128(source_128++);\r
-        __m128i xmm3 = _mm_load_si128(source_128++);\r
-\r
-        _mm_stream_si128(dest_128++, xmm0);\r
-        _mm_stream_si128(dest_128++, xmm1);\r
-        _mm_stream_si128(dest_128++, xmm2);\r
-        _mm_stream_si128(dest_128++, xmm3);\r
-\r
-        // Second 64 bytes of the block.\r
-        __m128i xmm4 = _mm_load_si128(source_128++);\r
-        __m128i xmm5 = _mm_load_si128(source_128++);\r
-        __m128i xmm6 = _mm_load_si128(source_128++);\r
-        __m128i xmm7 = _mm_load_si128(source_128++);\r
-\r
-        _mm_stream_si128(dest_128++, xmm4);\r
-        _mm_stream_si128(dest_128++, xmm5);\r
-        _mm_stream_si128(dest_128++, xmm6);\r
-        _mm_stream_si128(dest_128++, xmm7);\r
-    }\r
-\r
-    // Fix: streaming stores are weakly ordered; fence so the copied data is\r
-    // globally visible before the caller (or another thread) reads it.\r
-    _mm_sfence();\r
-    return dest;\r
-}\r
-\r
-/// 128-byte-block copy in MSVC x86 inline assembly: aligned SSE2 loads\r
-/// (movdqa) and non-temporal stores (movntdq), with four prefetchnta hints\r
-/// per block at 128, 160, 192 and 224 bytes ahead of the read pointer.\r
-///\r
-/// Preconditions: 32-bit x86 build; `dest` and `source` are 16-byte aligned\r
-/// and do not overlap. Only num/128 whole blocks are copied; the remaining\r
-/// num % 128 bytes are left untouched.\r
-///\r
-/// @return `dest`, like memcpy.\r
-void* memcpy_SSE2_2(void* dest, const void* source, size_t num)\r
-{\r
-    __asm\r
-    {\r
-        mov esi, source;\r
-        mov edi, dest;\r
-\r
-        mov ebx, num;\r
-        shr ebx, 7;                 // ebx = number of 128-byte blocks (sets ZF when zero)\r
-        // Fix: with num < 128 the original entered the loop with ebx == 0 and\r
-        // dec/jnz wrapped around to 2^32 iterations.\r
-        jz done;\r
-\r
-        cpy:\r
-        prefetchnta [esi+80h];\r
-        prefetchnta [esi+0A0h];\r
-        prefetchnta [esi+0C0h];\r
-        prefetchnta [esi+0E0h];\r
-\r
-        movdqa xmm0, [esi+00h];\r
-        movdqa xmm1, [esi+10h];\r
-        movdqa xmm2, [esi+20h];\r
-        movdqa xmm3, [esi+30h];\r
-\r
-        movntdq [edi+00h], xmm0;\r
-        movntdq [edi+10h], xmm1;\r
-        movntdq [edi+20h], xmm2;\r
-        movntdq [edi+30h], xmm3;\r
-\r
-        movdqa xmm4, [esi+40h];\r
-        movdqa xmm5, [esi+50h];\r
-        movdqa xmm6, [esi+60h];\r
-        movdqa xmm7, [esi+70h];\r
-\r
-        movntdq [edi+40h], xmm4;\r
-        movntdq [edi+50h], xmm5;\r
-        movntdq [edi+60h], xmm6;\r
-        movntdq [edi+70h], xmm7;\r
-\r
-        lea edi,[edi+80h];          // lea advances the pointers without touching flags\r
-        lea esi,[esi+80h];\r
-        dec ebx;\r
-\r
-        jnz cpy;\r
-\r
-        done:\r
-        // Fix: make the weakly-ordered non-temporal stores globally visible.\r
-        sfence;\r
-    }\r
-    return dest;\r
-}\r
-\r
-/// 128-byte-block copy in MSVC x86 inline assembly: aligned SSE2 loads\r
-/// (movdqa) and non-temporal stores (movntdq). Identical to memcpy_SSE2_2\r
-/// except that it issues two prefetchnta hints per block (128 and 192 bytes\r
-/// ahead of the read pointer) instead of four.\r
-///\r
-/// Preconditions: 32-bit x86 build; `dest` and `source` are 16-byte aligned\r
-/// and do not overlap. Only num/128 whole blocks are copied; the remaining\r
-/// num % 128 bytes are left untouched.\r
-///\r
-/// @return `dest`, like memcpy.\r
-void* memcpy_SSE2_3(void* dest, const void* source, size_t num)\r
-{\r
-    __asm\r
-    {\r
-        mov esi, source;\r
-        mov edi, dest;\r
-\r
-        mov ebx, num;\r
-        shr ebx, 7;                 // ebx = number of 128-byte blocks (sets ZF when zero)\r
-        // Fix: with num < 128 the original entered the loop with ebx == 0 and\r
-        // dec/jnz wrapped around to 2^32 iterations.\r
-        jz done;\r
-\r
-        cpy:\r
-        prefetchnta [esi+80h];\r
-        prefetchnta [esi+0C0h];\r
-\r
-        movdqa xmm0, [esi+00h];\r
-        movdqa xmm1, [esi+10h];\r
-        movdqa xmm2, [esi+20h];\r
-        movdqa xmm3, [esi+30h];\r
-\r
-        movntdq [edi+00h], xmm0;\r
-        movntdq [edi+10h], xmm1;\r
-        movntdq [edi+20h], xmm2;\r
-        movntdq [edi+30h], xmm3;\r
-\r
-        movdqa xmm4, [esi+40h];\r
-        movdqa xmm5, [esi+50h];\r
-        movdqa xmm6, [esi+60h];\r
-        movdqa xmm7, [esi+70h];\r
-\r
-        movntdq [edi+40h], xmm4;\r
-        movntdq [edi+50h], xmm5;\r
-        movntdq [edi+60h], xmm6;\r
-        movntdq [edi+70h], xmm7;\r
-\r
-        lea edi, [edi+80h];         // lea advances the pointers without touching flags\r
-        lea esi, [esi+80h];\r
-        dec ebx;\r
-        jnz cpy;\r
-\r
-        done:\r
-        // Fix: make the weakly-ordered non-temporal stores globally visible.\r
-        sfence;\r
-    }\r
-    return dest;\r
-}\r
-\r
-/// Parallel variant of memcpy_SSE2_3: splits the num/128 whole 128-byte\r
-/// blocks into sub-ranges with tbb::parallel_for and copies each sub-range\r
-/// with memcpy_SSE2_3.\r
-///\r
-/// Preconditions are those of memcpy_SSE2_3 (16-byte-aligned,\r
-/// non-overlapping buffers); the trailing num % 128 bytes are not copied.\r
-///\r
-/// @return `dest`, like memcpy.\r
-void* memcpy_SSE2_3_tbb(void* dest, const void* source, size_t num)\r
-{\r
-    char* const dest_bytes = reinterpret_cast<char*>(dest);\r
-    const char* const source_bytes = reinterpret_cast<const char*>(source);\r
-\r
-    // Fix: parallel_for takes the affinity_partitioner by non-const reference;\r
-    // the original passed a temporary, which compiles only as an MSVC extension.\r
-    tbb::affinity_partitioner partitioner;\r
-\r
-    tbb::parallel_for(tbb::blocked_range<size_t>(0, num/128), [&](const tbb::blocked_range<size_t>& r)\r
-    {\r
-        // The range is expressed in 128-byte blocks; convert to byte offsets.\r
-        const size_t offset = r.begin()*128;\r
-        memcpy_SSE2_3(dest_bytes + offset, source_bytes + offset, r.size()*128);\r
-\r
-        // Fix: publish this worker's non-temporal stores before the task\r
-        // completes, so the calling thread observes the copied data.\r
-        _mm_sfence();\r
-    }, partitioner);\r
-\r
-    return dest;\r
-}\r
-\r
-/// 128-byte-block copy in MSVC x86 inline assembly: loads all eight xmm\r
-/// registers with aligned movdqa, then writes them out with non-temporal\r
-/// movntdq stores, prefetching 128, 160, 192 and 224 bytes ahead.\r
-///\r
-/// Preconditions: 32-bit x86 build; `dest` and `src` are 16-byte aligned and\r
-/// do not overlap. Only whole 128-byte blocks are copied; the remaining\r
-/// (byte count % 128) bytes are left untouched.\r
-///\r
-/// NOTE: the byte-count parameter is named `size_t`, which hides the type of\r
-/// the same name inside this function; the name is kept for compatibility.\r
-///\r
-/// @return `dest`, like memcpy.\r
-void* X_aligned_memcpy_sse2(void* dest, const void* src, size_t size_t)\r
-{\r
-    __asm\r
-    {\r
-        mov esi, src;               //src pointer\r
-        mov edi, dest;              //dest pointer\r
-\r
-        mov ebx, size_t;            //ebx is our counter\r
-        shr ebx, 7;                 //divide by 128 (8 * 128bit registers); sets ZF when zero\r
-        // Fix: with fewer than 128 bytes the original entered the loop with\r
-        // ebx == 0 and dec/jnz wrapped around to 2^32 iterations.\r
-        jz copy_done;\r
-\r
-        loop_copy:\r
-        prefetchnta 128[ESI];       //prefetch upcoming source data, non-temporal hint\r
-        prefetchnta 160[ESI];\r
-        prefetchnta 192[ESI];\r
-        prefetchnta 224[ESI];\r
-\r
-        movdqa xmm0, 0[ESI];        //move data from src to registers\r
-        movdqa xmm1, 16[ESI];\r
-        movdqa xmm2, 32[ESI];\r
-        movdqa xmm3, 48[ESI];\r
-        movdqa xmm4, 64[ESI];\r
-        movdqa xmm5, 80[ESI];\r
-        movdqa xmm6, 96[ESI];\r
-        movdqa xmm7, 112[ESI];\r
-\r
-        movntdq 0[EDI], xmm0;       //move data from registers to dest\r
-        movntdq 16[EDI], xmm1;\r
-        movntdq 32[EDI], xmm2;\r
-        movntdq 48[EDI], xmm3;\r
-        movntdq 64[EDI], xmm4;\r
-        movntdq 80[EDI], xmm5;\r
-        movntdq 96[EDI], xmm6;\r
-        movntdq 112[EDI], xmm7;\r
-\r
-        add esi, 128;\r
-        add edi, 128;\r
-        dec ebx;                    //dec is last, so jnz tests the block counter\r
-\r
-        jnz loop_copy;              //next block\r
-\r
-        copy_done:\r
-        // Fix: make the weakly-ordered non-temporal stores globally visible.\r
-        sfence;\r
-    }\r
-    return dest;\r
-}\r
-\r
-/// Benchmark driver: raises the current thread's priority, runs every copy\r
-/// routine through test(), prints one "<rate> gb/s <name>" line per routine,\r
-/// then waits for ENTER before exiting.\r
-///\r
-/// Fix: the original was declared main(int, wchar_t*[]). main receives narrow\r
-/// char* arguments (wide arguments require wmain), and neither argument was\r
-/// used, so the parameters are dropped.\r
-int main()\r
-{\r
-    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);\r
-\r
-    typedef void* (*copy_fn)(void*, const void*, size_t);\r
-    struct benchmark\r
-    {\r
-        const char* label; // printed right after the throughput figure\r
-        copy_fn     copy;  // routine under test\r
-    };\r
-\r
-    // Same routines, in the same order, as the original sequence of calls.\r
-    const benchmark benchmarks[] =\r
-    {\r
-        { " memcpy",                memcpy },\r
-        { " memcpy_SSE2_1",         memcpy_SSE2_1 },\r
-        { " memcpy_SSE2_2",         memcpy_SSE2_2 },\r
-        { " memcpy_SSE2_3",         memcpy_SSE2_3 },\r
-        { " memcpy_SSE2_3_tbb",     memcpy_SSE2_3_tbb },\r
-        { " X_aligned_memcpy_sse2", X_aligned_memcpy_sse2 },\r
-    };\r
-\r
-    for(size_t i = 0; i < sizeof(benchmarks)/sizeof(benchmarks[0]); ++i)\r
-    {\r
-        test(benchmarks[i].copy);\r
-        std::cout << benchmarks[i].label << std::endl; // flush so each result appears as soon as it is ready\r
-    }\r
-\r
-    std::cout << "Press ENTER to continue... " << std::endl;\r
-    std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n');\r
-\r
-    return 0;\r
-}\r
-\r
+++ /dev/null
-<?xml version="1.0" encoding="utf-8"?>\r
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"><!-- MSBuild C++ project that builds test.cpp (with timer.h) as a Win32 console application -->\r
- <ItemGroup Label="ProjectConfigurations"><!-- two configurations are declared: Debug|Win32 and Release|Win32 -->\r
- <ProjectConfiguration Include="Debug|Win32">\r
- <Configuration>Debug</Configuration>\r
- <Platform>Win32</Platform>\r
- </ProjectConfiguration>\r
- <ProjectConfiguration Include="Release|Win32">\r
- <Configuration>Release</Configuration>\r
- <Platform>Win32</Platform>\r
- </ProjectConfiguration>\r
- </ItemGroup>\r
- <PropertyGroup Label="Globals">\r
- <ProjectGuid>{CE1CD805-3904-4E58-824E-09C027585991}</ProjectGuid>\r
- <Keyword>Win32Proj</Keyword>\r
- <RootNamespace>test</RootNamespace>\r
- </PropertyGroup>\r
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />\r
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">\r
- <ConfigurationType>Application</ConfigurationType>\r
- <UseDebugLibraries>true</UseDebugLibraries>\r
- <CharacterSet>Unicode</CharacterSet>\r
- </PropertyGroup>\r
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">\r
- <ConfigurationType>Application</ConfigurationType>\r
- <UseDebugLibraries>false</UseDebugLibraries>\r
- <WholeProgramOptimization>true</WholeProgramOptimization>\r
- <CharacterSet>Unicode</CharacterSet>\r
- </PropertyGroup>\r
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />\r
- <ImportGroup Label="ExtensionSettings">\r
- </ImportGroup>\r
- <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">\r
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />\r
- </ImportGroup>\r
- <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">\r
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />\r
- </ImportGroup>\r
- <PropertyGroup Label="UserMacros" />\r
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">\r
- <LinkIncremental>true</LinkIncremental>\r
- </PropertyGroup>\r
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"><!-- the TBB include and library search paths below are set for Release only -->\r
- <LinkIncremental>false</LinkIncremental>\r
- <IncludePath>..\..\dependencies\tbb30_20100406oss\include;..\..\..\dependencies\tbb30_20100406oss\include;$(IncludePath)</IncludePath><!-- TBB headers, searched at two relative depths -->\r
- <LibraryPath>..\..\..\dependencies\tbb30_20100406oss\lib\ia32\vc10\;..\..\dependencies\tbb30_20100406oss\lib\ia32\vc10\;$(LibraryPath)</LibraryPath><!-- TBB ia32/vc10 import libraries, searched at two relative depths -->\r
- <OutDir>$(SolutionDir)\$(ProjectName)\$(Configuration)\</OutDir>\r
- </PropertyGroup>\r
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"><!-- Debug: optimisation disabled; no TBB library is listed for the linker here -->\r
- <ClCompile>\r
- <PrecompiledHeader>\r
- </PrecompiledHeader>\r
- <WarningLevel>Level3</WarningLevel>\r
- <Optimization>Disabled</Optimization>\r
- <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r
- </ClCompile>\r
- <Link>\r
- <SubSystem>Console</SubSystem>\r
- <GenerateDebugInformation>true</GenerateDebugInformation>\r
- </Link>\r
- </ItemDefinitionGroup>\r
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"><!-- Release: speed-optimised build with intrinsics enabled, linked against tbb.lib -->\r
- <ClCompile>\r
- <WarningLevel>Level3</WarningLevel>\r
- <PrecompiledHeader>\r
- </PrecompiledHeader>\r
- <Optimization>MaxSpeed</Optimization>\r
- <FunctionLevelLinking>true</FunctionLevelLinking>\r
- <IntrinsicFunctions>true</IntrinsicFunctions>\r
- <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r
- <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>\r
- <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>\r
- </ClCompile>\r
- <Link>\r
- <SubSystem>Console</SubSystem>\r
- <GenerateDebugInformation>true</GenerateDebugInformation>\r
- <EnableCOMDATFolding>true</EnableCOMDATFolding>\r
- <OptimizeReferences>true</OptimizeReferences>\r
- <AdditionalDependencies>tbb.lib; kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>\r
- </Link>\r
- </ItemDefinitionGroup>\r
- <ItemGroup>\r
- <ClCompile Include="test.cpp"><!-- the only translation unit; compiled without a precompiled header in Release -->\r
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>\r
- <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">\r
- </PrecompiledHeaderFile>\r
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild><!-- test.cpp is excluded from Debug builds, so only Release compiles the benchmark -->\r
- </ClCompile>\r
- </ItemGroup>\r
- <ItemGroup>\r
- <ClInclude Include="timer.h" />\r
- </ItemGroup>\r
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />\r
- <ImportGroup Label="ExtensionTargets">\r
- </ImportGroup>\r
-</Project>
\ No newline at end of file