\r
#include <tbb/parallel_for.h>\r
\r
+#include <safeint.h>\r
+\r
#include <stack>\r
#include <deque>\r
\r
\r
+ // One queued audio contribution: filled in by visit() and consumed by mix().\r
struct audio_item\r
{\r
- const void* tag;\r
- frame_transform transform;\r
- std::vector<int16_t> audio_data;\r
+ // Set from frame.tag() in visit(); mix() uses it as the key when looking up and storing per-source transforms.\r
+ const void* tag;\r
+ // Copy of the top of the transform stack at the time the frame was visited; mix() reads its volume.\r
+ frame_transform transform;\r
+ // Samples taken from the visited frame via std::move(frame.audio_data()).\r
+ audio_buffer audio_data;\r
};\r
\r
struct audio_mixer::implementation\r
{\r
- std::stack<core::frame_transform> transform_stack_;\r
-\r
- std::map<const void*, core::frame_transform> prev_frame_transforms_;\r
- std::map<const void*, core::frame_transform> next_frame_transforms_;\r
-\r
- const core::video_format_desc format_desc_;\r
-\r
- std::vector<audio_item> items;\r
+ std::stack<core::frame_transform> transform_stack_;\r
+ std::map<const void*, core::frame_transform> prev_frame_transforms_;\r
+ const core::video_format_desc format_desc_;\r
+ std::vector<audio_item> items;\r
\r
public:\r
implementation(const core::video_format_desc& format_desc)\r
transform_stack_.push(transform_stack_.top()*frame.get_frame_transform());\r
}\r
\r
- void visit(const core::write_frame& frame)\r
+ void visit(core::write_frame& frame)\r
{\r
// We only care about the last field.\r
if(format_desc_.field_mode == field_mode::upper && transform_stack_.top().field_mode == field_mode::upper)\r
audio_item item;\r
item.tag = frame.tag();\r
item.transform = transform_stack_.top();\r
- item.audio_data = std::vector<int16_t>(frame.audio_data().begin(), frame.audio_data().end());\r
+ item.audio_data = std::move(frame.audio_data());\r
\r
items.push_back(item); \r
}\r
transform_stack_.pop();\r
}\r
\r
- std::vector<int16_t> mix()\r
- {\r
- auto result = std::vector<int16_t>(format_desc_.audio_samples_per_frame);\r
+ // Mixes every queued item into one buffer and returns it.\r
+ //\r
+ // Each item is scaled by a volume that ramps from the volume stored for its tag in\r
+ // prev_frame_transforms_ to its current volume, and is accumulated into a float\r
+ // intermediate buffer. The sum is then converted to integers into the result.\r
+ // Items whose previous and current volume are both below 0.001, or whose size differs\r
+ // from format_desc_.audio_samples_per_frame, are skipped.\r
+ //\r
+ // Side effects: clears items and replaces prev_frame_transforms_ with the transforms seen in this call.\r
+ // Returns: an audio_buffer resized to format_desc_.audio_samples_per_frame elements.\r
+ audio_buffer mix()\r
+ { \r
+ // NOTE: Both the intermediate and the result buffer are allocated 128 elements larger than\r
+ // format_desc_.audio_samples_per_frame so the SSE loops may read/write past the logical size.\r
+\r
+ auto intermediate = std::vector<float, tbb::cache_aligned_allocator<float>>(format_desc_.audio_samples_per_frame+128, 0.0f);\r
+\r
+ std::map<const void*, core::frame_transform> next_frame_transforms;\r
+ \r
+ tbb::affinity_partitioner ap;\r
\r
BOOST_FOREACH(auto& item, items)\r
- { \r
+ { \r
const auto next = item.transform;\r
auto prev = next;\r
\r
if(it != prev_frame_transforms_.end())\r
prev = it->second;\r
\r
- next_frame_transforms_[item.tag] = next; // Store all active tags, inactive tags will be removed at the end.\r
+ next_frame_transforms[item.tag] = next; // Store all active tags; inactive tags are dropped when this map replaces prev_frame_transforms_ at the end.\r
\r
if(next.volume < 0.001 && prev.volume < 0.001)\r
continue;\r
- \r
- static const int BASE = 1<<15;\r
-\r
- const auto next_volume = static_cast<int>(next.volume*BASE);\r
- const auto prev_volume = static_cast<int>(prev.volume*BASE);\r
- \r
- const int n_samples = result.size();\r
- \r
- const auto in_size = static_cast<size_t>(item.audio_data.size());\r
- CASPAR_VERIFY(in_size == 0 || in_size == result.size());\r
-\r
- if(in_size > result.size())\r
+ \r
+ // Only items that carry exactly one frame's worth of samples are mixed.\r
+ if(static_cast<size_t>(item.audio_data.size()) != format_desc_.audio_samples_per_frame)\r
continue;\r
\r
+ CASPAR_ASSERT(format_desc_.audio_channels == 2);\r
+ CASPAR_ASSERT(format_desc_.audio_samples_per_frame % 4 == 0);\r
+ \r
+ const float prev_volume = static_cast<float>(prev.volume);\r
+ const float next_volume = static_cast<float>(next.volume);\r
+ // Ramp step per group of audio_channels samples.\r
+ const float delta = 1.0f/static_cast<float>(format_desc_.audio_samples_per_frame/format_desc_.audio_channels);\r
+ \r
tbb::parallel_for\r
(\r
- tbb::blocked_range<size_t>(0, item.audio_data.size()),\r
+ tbb::blocked_range<size_t>(0, format_desc_.audio_samples_per_frame/4),\r
[&](const tbb::blocked_range<size_t>& r)\r
- {\r
+ { \r
+ // The body may be invoked once per sub-range, so the ramp must start at the position of\r
+ // r.begin(); seeding it at zero for every chunk would restart the fade inside the frame.\r
+ // Each iteration below advances the ramp by delta*2, hence the factor of two.\r
+ const float ramp_start = static_cast<float>(r.begin())*delta*2.0f;\r
+ // NOTE(review): lanes 0-1 start one step ahead of lanes 2-3; kept as in the original - confirm this lane order is intended.\r
+ auto alpha_ps = _mm_add_ps(_mm_setr_ps(delta, delta, 0.0f, 0.0f), _mm_set_ps1(ramp_start));\r
+ auto delta2_ps = _mm_set_ps1(delta*2.0f);\r
+ auto prev_ps = _mm_set_ps1(prev_volume);\r
+ auto next_ps = _mm_set_ps1(next_volume); \r
+\r
for(size_t n = r.begin(); n < r.end(); ++n)\r
- {\r
- const int sample_volume = (prev_volume - (prev_volume * n)/n_samples) + (next_volume * n)/n_samples;\r
- const int sample = (static_cast<int>(item.audio_data[n])*sample_volume)/BASE;\r
- result[n] = static_cast<int16_t>((static_cast<int>(result[n]) + sample) & 0xFFFF);\r
+ { \r
+ // volume = next*alpha + prev*(1 - alpha)\r
+ auto next2_ps = _mm_mul_ps(next_ps, alpha_ps);\r
+ auto prev2_ps = _mm_sub_ps(prev_ps, _mm_mul_ps(prev_ps, alpha_ps));\r
+ auto volume_ps = _mm_add_ps(next2_ps, prev2_ps);\r
+\r
+ // Load four integer samples, scale them and accumulate into the float mix.\r
+ auto sample_ps = _mm_cvtepi32_ps(_mm_load_si128(reinterpret_cast<__m128i*>(&item.audio_data[n*4])));\r
+ auto res_sample_ps = _mm_load_ps(&intermediate[n*4]); \r
+ sample_ps = _mm_mul_ps(sample_ps, volume_ps); \r
+ res_sample_ps = _mm_add_ps(sample_ps, res_sample_ps); \r
+\r
+ alpha_ps = _mm_add_ps(alpha_ps, delta2_ps);\r
+\r
+ _mm_store_ps(&intermediate[n*4], res_sample_ps);\r
}\r
}\r
- );\r
+ , ap);\r
}\r
+ \r
+ auto result = audio_buffer(format_desc_.audio_samples_per_frame+128, 0); \r
+ \r
+ // Convert the float mix to integers, 32 samples (eight SSE registers) per iteration.\r
+ // The block count is rounded up so a frame size that is not a multiple of 32 still has its\r
+ // tail converted; the overshoot is at most 31 elements, inside the 128 elements of padding.\r
+ tbb::parallel_for\r
+ (\r
+ tbb::blocked_range<size_t>(0, (format_desc_.audio_samples_per_frame+31)/32),\r
+ [&](const tbb::blocked_range<size_t>& r)\r
+ { \r
+ auto intermediate_128 = reinterpret_cast<__m128i*>(intermediate.data()+r.begin()*32);\r
+ auto result_128 = reinterpret_cast<__m128i*>(result.data()+r.begin()*32);\r
+ \r
+ for(size_t n = r.begin(); n < r.end(); ++n)\r
+ { \r
+ auto xmm0 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
+ auto xmm1 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
+ auto xmm2 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
+ auto xmm3 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
+ auto xmm4 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
+ auto xmm5 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
+ auto xmm6 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
+ auto xmm7 = _mm_load_ps(reinterpret_cast<float*>(intermediate_128++));\r
+ \r
+ _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm0));\r
+ _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm1));\r
+ _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm2));\r
+ _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm3));\r
+ _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm4));\r
+ _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm5));\r
+ _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm6));\r
+ _mm_stream_si128(result_128++, _mm_cvtps_epi32(xmm7));\r
+ }\r
+ }\r
+ , ap);\r
\r
items.clear();\r
- prev_frame_transforms_ = std::move(next_frame_transforms_); \r
+ prev_frame_transforms_ = std::move(next_frame_transforms); \r
\r
+ // Drop the SSE padding before handing the buffer to the caller.\r
+ result.resize(format_desc_.audio_samples_per_frame);\r
return std::move(result);\r
}\r
};\r
+ // The public audio_mixer member functions below forward directly to the implementation object held in impl_.\r
void audio_mixer::begin(core::basic_frame& frame){impl_->begin(frame);}\r
void audio_mixer::visit(core::write_frame& frame){impl_->visit(frame);}\r
void audio_mixer::end(){impl_->end();}\r
-std::vector<int16_t> audio_mixer::mix(){return impl_->mix();}\r
+audio_buffer audio_mixer::mix(){return impl_->mix();}\r
audio_mixer& audio_mixer::operator=(audio_mixer&& other)\r
{\r
impl_ = std::move(other.impl_);\r