{ \r
// NOTE: auto data should be larger than format_desc_.audio_samples_per_frame to allow sse to read/write beyond size.\r
\r
- auto intermediate = std::vector<float, tbb::cache_aligned_allocator<float>>(format_desc_.audio_samples_per_frame+128, 0.0f);\r
- auto result = audio_buffer(format_desc_.audio_samples_per_frame+128); \r
+ auto intermediate = std::vector<float, tbb::cache_aligned_allocator<float>>(format_desc_.audio_samples_per_frame+128, 0.0f);\r
+ auto result = audio_buffer(format_desc_.audio_samples_per_frame+128); \r
+ auto result_128 = reinterpret_cast<__m128i*>(result.data());\r
\r
std::map<const void*, core::frame_transform> next_frame_transforms;\r
- \r
+ \r
BOOST_FOREACH(auto& item, items_)\r
{ \r
const auto next = item.transform;\r
auto alpha_ps = _mm_set_ps1(alpha*2.0f);\r
auto volume_ps = _mm_setr_ps(prev_volume, prev_volume, prev_volume+alpha, prev_volume+alpha);\r
\r
- for(size_t n = 0; n < format_desc_.audio_samples_per_frame/4; ++n)\r
- { \r
- auto sample_ps = _mm_cvtepi32_ps(_mm_load_si128(reinterpret_cast<__m128i*>(&item.audio_data[n*4])));\r
- auto res_sample_ps = _mm_load_ps(&intermediate[n*4]); \r
- sample_ps = _mm_mul_ps(sample_ps, volume_ps); \r
- res_sample_ps = _mm_add_ps(sample_ps, res_sample_ps); \r
+ if(&item != &items_.back())\r
+ {\r
+ for(size_t n = 0; n < format_desc_.audio_samples_per_frame/4; ++n)\r
+ { \r
+ auto sample_ps = _mm_cvtepi32_ps(_mm_load_si128(reinterpret_cast<__m128i*>(&item.audio_data[n*4])));\r
+ auto res_sample_ps = _mm_load_ps(&intermediate[n*4]); \r
+ sample_ps = _mm_mul_ps(sample_ps, volume_ps); \r
+ res_sample_ps = _mm_add_ps(sample_ps, res_sample_ps); \r
\r
- volume_ps = _mm_add_ps(volume_ps, alpha_ps);\r
+ volume_ps = _mm_add_ps(volume_ps, alpha_ps);\r
\r
- _mm_store_ps(&intermediate[n*4], res_sample_ps);\r
+ _mm_store_ps(&intermediate[n*4], res_sample_ps);\r
+ }\r
}\r
- } \r
- \r
- auto intermediate_128 = reinterpret_cast<__m128i*>(intermediate.data());\r
- auto result_128 = reinterpret_cast<__m128i*>(result.data());\r
- \r
- for(size_t n = 0; n < format_desc_.audio_samples_per_frame/32; ++n)\r
- { \r
- auto xmm0 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
- auto xmm1 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
- auto xmm2 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
- auto xmm3 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
- auto xmm4 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
- auto xmm5 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
- auto xmm6 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
- auto xmm7 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
- \r
- _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm0));\r
- _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm1));\r
- _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm2));\r
- _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm3));\r
- _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm4));\r
- _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm5));\r
- _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm6));\r
- _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm7));\r
- }\r
+ else\r
+ {\r
+ for(size_t n = 0; n < format_desc_.audio_samples_per_frame/4; ++n)\r
+ { \r
+ auto sample_ps = _mm_cvtepi32_ps(_mm_load_si128(reinterpret_cast<__m128i*>(&item.audio_data[n*4])));\r
+ auto res_sample_ps = _mm_load_ps(&intermediate[n*4]); \r
+ sample_ps = _mm_mul_ps(sample_ps, volume_ps); \r
+ res_sample_ps = _mm_add_ps(sample_ps, res_sample_ps); \r
+ \r
+ _mm_stream_si128(result_128++, _mm_cvtps_epi32(res_sample_ps));\r
+ }\r
+ }\r
+ } \r
\r
items_.clear();\r
prev_frame_transforms_ = std::move(next_frame_transforms); \r