From fc492373c22aa2f7509e2f29165309968b7028f1 Mon Sep 17 00:00:00 2001 From: ronag Date: Sat, 3 Sep 2011 19:28:18 +0000 Subject: [PATCH] 2.0. audio_mixer: Optimized. git-svn-id: https://casparcg.svn.sourceforge.net/svnroot/casparcg/server/branches/2.0.0.2@1338 362d55ac-95cf-4e76-9f9a-cbaa9c17b72d --- core/mixer/audio/audio_mixer.cpp | 78 +++++++++++++++++++------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/core/mixer/audio/audio_mixer.cpp b/core/mixer/audio/audio_mixer.cpp index 659572210..9f4baa7f7 100644 --- a/core/mixer/audio/audio_mixer.cpp +++ b/core/mixer/audio/audio_mixer.cpp @@ -95,6 +95,8 @@ public: auto intermediate = std::vector>(format_desc_.audio_samples_per_frame+128, 0.0f); std::map next_frame_transforms; + + tbb::affinity_partitioner ap; BOOST_FOREACH(auto& item, items) { @@ -118,55 +120,69 @@ public: const float prev_volume = static_cast(prev.volume); const float next_volume = static_cast(next.volume); - const float delta = 1.0f/static_cast(format_desc_.audio_samples_per_frame/2); + const float delta = 1.0f/static_cast(format_desc_.audio_samples_per_frame/format_desc_.audio_channels); tbb::parallel_for ( tbb::blocked_range(0, format_desc_.audio_samples_per_frame/4), [&](const tbb::blocked_range& r) - { + { + auto alpha_ps = _mm_setr_ps(delta, delta, 0.0f, 0.0f); + auto delta2_ps = _mm_set_ps1(delta*2.0f); + auto prev_ps = _mm_set_ps1(prev_volume); + auto next_ps = _mm_set_ps1(next_volume); + for(size_t n = r.begin(); n < r.end(); ++n) - { - const float alpha0 = (n*2+0) * delta; - const float alpha1 = (n*2+1) * delta; - const float volume0 = (prev_volume * (1.0f - alpha0)) + (next_volume * alpha0); - const float volume1 = (prev_volume * (1.0f - alpha1)) + (next_volume * alpha1); + { + auto next2_ps = _mm_mul_ps(next_ps, alpha_ps); + auto prev2_ps = _mm_sub_ps(prev_ps, _mm_mul_ps(prev_ps, alpha_ps)); + auto volume_ps = _mm_add_ps(next2_ps, prev2_ps); auto sample_ps = _mm_cvtepi32_ps(_mm_load_si128(reinterpret_cast<__m128i*>(&item.audio_data[n*4]))); auto res_sample_ps = _mm_load_ps(&intermediate[n*4]); - sample_ps = _mm_mul_ps(sample_ps, _mm_setr_ps(volume1, volume1, volume0, volume0)); + sample_ps = _mm_mul_ps(sample_ps, volume_ps); res_sample_ps = _mm_add_ps(sample_ps, res_sample_ps); + alpha_ps = _mm_add_ps(alpha_ps, delta2_ps); + _mm_store_ps(&intermediate[n*4], res_sample_ps); } } - ); + , ap); } auto result = audio_buffer(format_desc_.audio_samples_per_frame+128, 0); - - auto intermediate_128 = reinterpret_cast<__m128i*>(intermediate.data()); - auto result_128 = reinterpret_cast<__m128i*>(result.data()); - for(size_t n = 0; n < format_desc_.audio_samples_per_frame/32; ++n) - { - auto xmm0 = _mm_load_ps(reinterpret_cast(intermediate_128++)); - auto xmm1 = _mm_load_ps(reinterpret_cast(intermediate_128++)); - auto xmm2 = _mm_load_ps(reinterpret_cast(intermediate_128++)); - auto xmm3 = _mm_load_ps(reinterpret_cast(intermediate_128++)); - auto xmm4 = _mm_load_ps(reinterpret_cast(intermediate_128++)); - auto xmm5 = _mm_load_ps(reinterpret_cast(intermediate_128++)); - auto xmm6 = _mm_load_ps(reinterpret_cast(intermediate_128++)); - auto xmm7 = _mm_load_ps(reinterpret_cast(intermediate_128++)); + + tbb::parallel_for + ( + tbb::blocked_range(0, format_desc_.audio_samples_per_frame/32), + [&](const tbb::blocked_range& r) + { + auto intermediate_128 = reinterpret_cast<__m128i*>(intermediate.data()+r.begin()*32); + auto result_128 = reinterpret_cast<__m128i*>(result.data()+r.begin()*32); + + for(size_t n = r.begin(); n < r.end(); ++n) + { + auto xmm0 = _mm_load_ps(reinterpret_cast(intermediate_128++)); + auto xmm1 = _mm_load_ps(reinterpret_cast(intermediate_128++)); + auto xmm2 = _mm_load_ps(reinterpret_cast(intermediate_128++)); + auto xmm3 = _mm_load_ps(reinterpret_cast(intermediate_128++)); + auto xmm4 = _mm_load_ps(reinterpret_cast(intermediate_128++)); + auto xmm5 = _mm_load_ps(reinterpret_cast(intermediate_128++)); + auto xmm6 = _mm_load_ps(reinterpret_cast(intermediate_128++)); + auto xmm7 = _mm_load_ps(reinterpret_cast(intermediate_128++)); - _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm0)); - _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm1)); - _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm2)); - _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm3)); - _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm4)); - _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm5)); - _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm6)); - _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm7)); - } + _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm0)); + _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm1)); + _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm2)); + _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm3)); + _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm4)); + _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm5)); + _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm6)); + _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm7)); + } + } + , ap); items.clear(); prev_frame_transforms_ = std::move(next_frame_transforms); -- 2.39.2