From: Steinar H. Gunderson <sgunderson@bigfoot.com>
Date: Sun, 3 Jun 2018 10:51:36 +0000 (+0200)
Subject: Move memcpy_interleaved into its own file.
X-Git-Tag: 1.7.4~6
X-Git-Url: https://git.sesse.net/?p=nageru;a=commitdiff_plain;h=8657914cd9b4f4a8c9da98282cecc3f60f3d09ab

Move memcpy_interleaved into its own file.
---

diff --git a/Makefile b/Makefile
index 663100b..a44ee78 100644
--- a/Makefile
+++ b/Makefile
@@ -62,7 +62,7 @@ OBJS += chroma_subsampler.o v210_converter.o mixer.o basic_stats.o metrics.o pbo
 OBJS += quicksync_encoder.o x264_encoder.o x264_dynamic.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o ffmpeg_raii.o ffmpeg_util.o json.pb.o
 
 # DeckLink
-OBJS += decklink_capture.o decklink_util.o decklink_output.o decklink/DeckLinkAPIDispatch.o
+OBJS += decklink_capture.o decklink_util.o decklink_output.o memcpy_interleaved.o decklink/DeckLinkAPIDispatch.o
 
 KAERU_OBJS = kaeru.o x264_encoder.o mux.o basic_stats.o metrics.o flags.o audio_encoder.o x264_speed_control.o print_latency.o x264_dynamic.o ffmpeg_raii.o ref_counted_frame.o ffmpeg_capture.o ffmpeg_util.o httpd.o json.pb.o metacube2.o
 
diff --git a/decklink_capture.cpp b/decklink_capture.cpp
index 6507959..881e181 100644
--- a/decklink_capture.cpp
+++ b/decklink_capture.cpp
@@ -21,6 +21,7 @@
 #include "bmusb/bmusb.h"
 #include "decklink_util.h"
 #include "flags.h"
+#include "memcpy_interleaved.h"
 #include "v210_converter.h"
 
 #define FRAME_SIZE (8 << 20)  // 8 MB.
@@ -32,114 +33,6 @@ using namespace bmusb;
 
 namespace {
 
-// TODO: Support stride.
-void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
-{
-	assert(n % 2 == 0);
-	uint8_t *dptr1 = dest1;
-	uint8_t *dptr2 = dest2;
-
-	for (size_t i = 0; i < n; i += 2) {
-		*dptr1++ = *src++;
-		*dptr2++ = *src++;
-	}
-}
-
-#ifdef __SSE2__
-
-// Returns the number of bytes consumed.
-size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
-{
-	const uint8_t *limit = src + n;
-	size_t consumed = 0;
-
-	// Align end to 32 bytes.
-	limit = (const uint8_t *)(intptr_t(limit) & ~31);
-
-	if (src >= limit) {
-		return 0;
-	}
-
-	// Process [0,31] bytes, such that start gets aligned to 32 bytes.
-	const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
-	if (aligned_src != src) {
-		size_t n2 = aligned_src - src;
-		memcpy_interleaved(dest1, dest2, src, n2);
-		dest1 += n2 / 2;
-		dest2 += n2 / 2;
-		if (n2 % 2) {
-			swap(dest1, dest2);
-		}
-		src = aligned_src;
-		consumed += n2;
-	}
-
-	// Make the length a multiple of 64.
-	if (((limit - src) % 64) != 0) {
-		limit -= 32;
-	}
-	assert(((limit - src) % 64) == 0);
-
-#if __AVX2__
-	const __m256i * __restrict in = (const __m256i *)src;
-	__m256i * __restrict out1 = (__m256i *)dest1;
-	__m256i * __restrict out2 = (__m256i *)dest2;
-
-	__m256i shuffle_cw = _mm256_set_epi8(
-		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
-		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
-	while (in < (const __m256i *)limit) {
-		// Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
-		__m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
-		__m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
-
-		data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
-		data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
-	
-		data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
-		data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
-
-		__m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
-		__m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
-
-		_mm256_storeu_si256(out1, lo);
-		_mm256_storeu_si256(out2, hi);
-
-		in += 2;
-		++out1;
-		++out2;
-		consumed += 64;
-	}
-#else
-	const __m128i * __restrict in = (const __m128i *)src;
-	__m128i * __restrict out1 = (__m128i *)dest1;
-	__m128i * __restrict out2 = (__m128i *)dest2;
-
-	__m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
-	while (in < (const __m128i *)limit) {
-		__m128i data1 = _mm_load_si128(in);
-		__m128i data2 = _mm_load_si128(in + 1);
-		__m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
-		__m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
-		__m128i data1_hi = _mm_srli_epi16(data1, 8);
-		__m128i data2_hi = _mm_srli_epi16(data2, 8);
-		__m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
-		_mm_storeu_si128(out1, lo);
-		__m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
-		_mm_storeu_si128(out2, hi);
-
-		in += 2;
-		++out1;
-		++out2;
-		consumed += 32;
-	}
-#endif
-
-	return consumed;
-}
-
-#endif  // __SSE2__
-
 BMDPixelFormat pixel_format_to_bmd(PixelFormat pixel_format)
 {
 	switch (pixel_format) {
@@ -368,21 +261,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
 			if (current_video_frame.interleaved) {
 				uint8_t *data = current_video_frame.data;
 				uint8_t *data2 = current_video_frame.data2;
-#ifdef __SSE2__
-				size_t consumed = memcpy_interleaved_fastpath(data, data2, frame_bytes, num_bytes);
-				frame_bytes += consumed;
-				data += consumed / 2;
-				data2 += consumed / 2;
-				if (num_bytes % 2) {
-					swap(data, data2);
-				}
-				current_video_frame.len += consumed;
-				num_bytes -= consumed;
-#endif
-
-				if (num_bytes > 0) {
-					memcpy_interleaved(data, data2, frame_bytes, num_bytes);
-				}
+				memcpy_interleaved(data, data2, frame_bytes, num_bytes);
 			} else {
 				memcpy(current_video_frame.data, frame_bytes, num_bytes);
 			}
diff --git a/memcpy_interleaved.cpp b/memcpy_interleaved.cpp
new file mode 100644
index 0000000..9a41cdd
--- /dev/null
+++ b/memcpy_interleaved.cpp
@@ -0,0 +1,136 @@
+#include <cstdint>
+#include <algorithm>
+#include <assert.h>
+#if __SSE2__
+#include <immintrin.h>
+#endif
+
+using namespace std;
+
+// TODO: Support stride.
+void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+	assert(n % 2 == 0);
+	uint8_t *dptr1 = dest1;
+	uint8_t *dptr2 = dest2;
+
+	for (size_t i = 0; i < n; i += 2) {
+		*dptr1++ = *src++;
+		*dptr2++ = *src++;
+	}
+}
+
+#ifdef __SSE2__
+
+// Returns the number of bytes consumed from src (may be 0); any remaining tail is left for the caller.
+size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+	const uint8_t *limit = src + n;
+	size_t consumed = 0;
+
+	// Align end to 32 bytes.
+	limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+	if (src >= limit) {
+		return 0;
+	}
+
+	// Process [0,31] bytes, such that start gets aligned to 32 bytes.
+	const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
+	if (aligned_src != src) {
+		size_t n2 = aligned_src - src;
+		memcpy_interleaved_slow(dest1, dest2, src, n2);
+		dest1 += n2 / 2;
+		dest2 += n2 / 2;
+		if (n2 % 2) {
+			swap(dest1, dest2);
+		}
+		src = aligned_src;
+		consumed += n2;
+	}
+
+	// Make the length a multiple of 64 by dropping the last 32 bytes if needed (the AVX2 loop reads 64 bytes per iteration).
+	if (((limit - src) % 64) != 0) {
+		limit -= 32;
+	}
+	assert(((limit - src) % 64) == 0);
+
+#if __AVX2__
+	const __m256i * __restrict in = (const __m256i *)src;
+	__m256i * __restrict out1 = (__m256i *)dest1;
+	__m256i * __restrict out2 = (__m256i *)dest2;
+
+	__m256i shuffle_cw = _mm256_set_epi8(
+		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+	while (in < (const __m256i *)limit) {
+		// Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+		__m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
+		__m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
+
+		data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
+		data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
+	
+		data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
+		data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
+
+		__m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+		__m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+		_mm256_storeu_si256(out1, lo);
+		_mm256_storeu_si256(out2, hi);
+
+		in += 2;
+		++out1;
+		++out2;
+		consumed += 64;
+	}
+#else
+	const __m128i * __restrict in = (const __m128i *)src;
+	__m128i * __restrict out1 = (__m128i *)dest1;
+	__m128i * __restrict out2 = (__m128i *)dest2;
+
+	__m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+	while (in < (const __m128i *)limit) {
+		__m128i data1 = _mm_load_si128(in);
+		__m128i data2 = _mm_load_si128(in + 1);
+		__m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+		__m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+		__m128i data1_hi = _mm_srli_epi16(data1, 8);
+		__m128i data2_hi = _mm_srli_epi16(data2, 8);
+		__m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+		_mm_storeu_si128(out1, lo);
+		__m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+		_mm_storeu_si128(out2, hi);
+
+		in += 2;
+		++out1;
+		++out2;
+		consumed += 32;
+	}
+#endif
+
+	return consumed;
+}
+
+#endif  // defined(__SSE2__)
+
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+#ifdef __SSE2__
+	size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n);
+	src += consumed;
+	dest1 += consumed / 2;
+	dest2 += consumed / 2;
+	if (consumed % 2) {
+		swap(dest1, dest2);
+	}
+	n -= consumed;
+
+	if (n > 0) {
+		memcpy_interleaved_slow(dest1, dest2, src, n);
+	}
+#else
+	memcpy_interleaved_slow(dest1, dest2, src, n);
+#endif
+}
diff --git a/memcpy_interleaved.h b/memcpy_interleaved.h
new file mode 100644
index 0000000..a7f8994
--- /dev/null
+++ b/memcpy_interleaved.h
@@ -0,0 +1,11 @@
+#ifndef _MEMCPY_INTERLEAVED_H
+#define _MEMCPY_INTERLEAVED_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Deinterleaves n bytes from src: even-offset bytes go to dest1, odd-offset bytes go to dest2 (n must be even).
+// TODO: Support stride.
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n);
+
+#endif  // !defined(_MEMCPY_INTERLEAVED_H)