#include <core/mixer/write_frame.h>\r
#include <core/producer/frame/frame_transform.h>\r
\r
-#include <tbb/parallel_for.h>\r
+#include <ppl.h>\r
\r
#include <safeint.h>\r
\r
\r
audio_buffer mix()\r
{ \r
+ // NOTE: audio data should be larger than format_desc_.audio_samples_per_frame to allow SSE to read/write beyond size.\r
+\r
auto intermediate = std::vector<float, tbb::cache_aligned_allocator<float>>(format_desc_.audio_samples_per_frame+128, 0.0f);\r
\r
std::map<const void*, core::frame_transform> next_frame_transforms;\r
-\r
+ \r
BOOST_FOREACH(auto& item, items)\r
- { \r
+ { \r
const auto next = item.transform;\r
auto prev = next;\r
\r
\r
const float prev_volume = static_cast<float>(prev.volume);\r
const float next_volume = static_cast<float>(next.volume);\r
- const float delta = 1.0f/static_cast<float>(format_desc_.audio_samples_per_frame/2);\r
+ const float delta = 1.0f/static_cast<float>(format_desc_.audio_samples_per_frame/format_desc_.audio_channels);\r
\r
- tbb::parallel_for\r
- (\r
- tbb::blocked_range<size_t>(0, format_desc_.audio_samples_per_frame/4),\r
- [&](const tbb::blocked_range<size_t>& r)\r
- {\r
- for(size_t n = r.begin(); n < r.end(); ++n)\r
- {\r
- const float alpha0 = (n*2+0) * delta;\r
- const float alpha1 = (n*2+1) * delta;\r
- const float volume0 = (prev_volume * (1.0f - alpha0)) + (next_volume * alpha0);\r
- const float volume1 = (prev_volume * (1.0f - alpha1)) + (next_volume * alpha1);\r
-\r
- auto sample_ps = _mm_cvtepi32_ps(_mm_load_si128(reinterpret_cast<__m128i*>(&item.audio_data[n*4])));\r
- auto res_sample_ps = _mm_load_ps(&intermediate[n*4]); \r
- sample_ps = _mm_mul_ps(sample_ps, _mm_setr_ps(volume1, volume1, volume0, volume0)); \r
- res_sample_ps = _mm_add_ps(sample_ps, res_sample_ps); \r
-\r
- _mm_store_ps(&intermediate[n*4], res_sample_ps);\r
- }\r
- }\r
- );\r
+ auto alpha_ps = _mm_setr_ps(delta, delta, 0.0f, 0.0f);\r
+ auto delta2_ps = _mm_set_ps1(delta*2.0f);\r
+ auto prev_ps = _mm_set_ps1(prev_volume);\r
+ auto next_ps = _mm_set_ps1(next_volume); \r
+\r
+ Concurrency::parallel_for<int>(0, format_desc_.audio_samples_per_frame/4,\r
+ [&](int n)\r
+ { \r
+ auto next2_ps = _mm_mul_ps(next_ps, alpha_ps);\r
+ auto prev2_ps = _mm_sub_ps(prev_ps, _mm_mul_ps(prev_ps, alpha_ps));\r
+ auto volume_ps = _mm_add_ps(next2_ps, prev2_ps);\r
+\r
+ auto sample_ps = _mm_cvtepi32_ps(_mm_load_si128(reinterpret_cast<__m128i*>(&item.audio_data[n*4])));\r
+ auto res_sample_ps = _mm_load_ps(&intermediate[n*4]); \r
+ sample_ps = _mm_mul_ps(sample_ps, volume_ps); \r
+ res_sample_ps = _mm_add_ps(sample_ps, res_sample_ps); \r
+\r
+ alpha_ps = _mm_add_ps(alpha_ps, delta2_ps);\r
+\r
+ _mm_store_ps(&intermediate[n*4], res_sample_ps);\r
+ });\r
}\r
\r
auto result = audio_buffer(format_desc_.audio_samples_per_frame+128, 0); \r
-\r
- auto intermediate_128 = reinterpret_cast<__m128i*>(intermediate.data());\r
- auto result_128 = reinterpret_cast<__m128i*>(result.data());\r
- for(size_t n = 0; n < format_desc_.audio_samples_per_frame/32; ++n)\r
- {\r
+ \r
+ Concurrency::parallel_for<int>(0, format_desc_.audio_samples_per_frame/32, \r
+ [&](int n)\r
+ { \r
+ auto intermediate_128 = reinterpret_cast<__m128i*>(intermediate.data()+n*32);\r
+ auto result_128 = reinterpret_cast<__m128i*>(result.data()+n*32);\r
+ \r
auto xmm0 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
auto xmm1 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
auto xmm2 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
_mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm5));\r
_mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm6));\r
_mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm7));\r
- }\r
+ });\r
\r
items.clear();\r
prev_frame_transforms_ = std::move(next_frame_transforms); \r