X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=decklink_capture.cpp;h=0f48e3e77f48ffe18c74175675060b52ea02f50b;hb=817fffe1ef3bd87f2387395f49487cf5255d8daf;hp=6dba068fca51108fcbfc3a55050efcc1223880ee;hpb=7c26af1581077cb63dc0bd76942f1f4262312f7f;p=nageru diff --git a/decklink_capture.cpp b/decklink_capture.cpp index 6dba068..0f48e3e 100644 --- a/decklink_capture.cpp +++ b/decklink_capture.cpp @@ -21,6 +21,7 @@ #include "bmusb/bmusb.h" #include "decklink_util.h" #include "flags.h" +#include "memcpy_interleaved.h" #include "v210_converter.h" #define FRAME_SIZE (8 << 20) // 8 MB. @@ -32,114 +33,6 @@ using namespace bmusb; namespace { -// TODO: Support stride. -void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) -{ - assert(n % 2 == 0); - uint8_t *dptr1 = dest1; - uint8_t *dptr2 = dest2; - - for (size_t i = 0; i < n; i += 2) { - *dptr1++ = *src++; - *dptr2++ = *src++; - } -} - -#ifdef __SSE2__ - -// Returns the number of bytes consumed. -size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) -{ - const uint8_t *limit = src + n; - size_t consumed = 0; - - // Align end to 32 bytes. - limit = (const uint8_t *)(intptr_t(limit) & ~31); - - if (src >= limit) { - return 0; - } - - // Process [0,31] bytes, such that start gets aligned to 32 bytes. - const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31); - if (aligned_src != src) { - size_t n2 = aligned_src - src; - memcpy_interleaved(dest1, dest2, src, n2); - dest1 += n2 / 2; - dest2 += n2 / 2; - if (n2 % 2) { - swap(dest1, dest2); - } - src = aligned_src; - consumed += n2; - } - - // Make the length a multiple of 64. - if (((limit - src) % 64) != 0) { - limit -= 32; - } - assert(((limit - src) % 64) == 0); - -#if __AVX2__ - const __m256i * __restrict in = (const __m256i *)src; - __m256i * __restrict out1 = (__m256i *)dest1; - __m256i * __restrict out2 = (__m256i *)dest2; - - __m256i shuffle_cw = _mm256_set_epi8( - 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, - 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); - while (in < (const __m256i *)limit) { - // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128). - __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh - __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp - - data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh - data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop - - data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh - data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop - - __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000); - __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001); - - _mm256_storeu_si256(out1, lo); - _mm256_storeu_si256(out2, hi); - - in += 2; - ++out1; - ++out2; - consumed += 64; - } -#else - const __m128i * __restrict in = (const __m128i *)src; - __m128i * __restrict out1 = (__m128i *)dest1; - __m128i * __restrict out2 = (__m128i *)dest2; - - __m128i mask_lower_byte = _mm_set1_epi16(0x00ff); - while (in < (const __m128i *)limit) { - __m128i data1 = _mm_load_si128(in); - __m128i data2 = _mm_load_si128(in + 1); - __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte); - __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte); - __m128i data1_hi = _mm_srli_epi16(data1, 8); - __m128i data2_hi = _mm_srli_epi16(data2, 8); - __m128i lo = _mm_packus_epi16(data1_lo, data2_lo); - _mm_storeu_si128(out1, lo); - __m128i hi = _mm_packus_epi16(data1_hi, data2_hi); - _mm_storeu_si128(out2, hi); - - in += 2; - ++out1; - ++out2; - consumed += 32; - } -#endif - - return consumed; -} - -#endif // __SSE2__ - BMDPixelFormat pixel_format_to_bmd(PixelFormat pixel_format) { switch (pixel_format) { @@ -368,24 +261,13 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived( if (current_video_frame.interleaved) { uint8_t *data = current_video_frame.data; uint8_t *data2 = current_video_frame.data2; -#ifdef __SSE2__ - size_t consumed = memcpy_interleaved_fastpath(data, data2, frame_bytes, num_bytes); - frame_bytes += consumed; - data += consumed / 2; - data2 += consumed / 2; - if (num_bytes % 2) { - swap(data, data2); - } - current_video_frame.len += consumed; - num_bytes -= consumed; -#endif - - if (num_bytes > 0) { - memcpy_interleaved(data, data2, frame_bytes, num_bytes); - } + memcpy_interleaved(data, data2, frame_bytes, num_bytes); } else { memcpy(current_video_frame.data, frame_bytes, num_bytes); } + if (current_video_frame.data_copy != nullptr) { + memcpy(current_video_frame.data_copy, frame_bytes, num_bytes); + } current_video_frame.len += num_bytes; video_format.width = width; @@ -468,14 +350,11 @@ void DeckLinkCapture::stop_dequeue_thread() fprintf(stderr, "StopStreams failed with error 0x%x\n", result); exit(1); } - if (input->DisableVideoInput() != S_OK) { - fprintf(stderr, "Failed to disable video input for card %d\n", card_index); - exit(1); - } - if (input->DisableAudioInput() != S_OK) { - fprintf(stderr, "Failed to disable audio input for card %d\n", card_index); - exit(1); - } + + // We could call DisableVideoInput() and DisableAudioInput() here, + // but they seem to be taking a really long time, and we only do this + // during shutdown anyway, so StopStreams() will suffice. + running = false; }