From 9dd6b94f84be634306e1266e2f65479fef76a10f Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Wed, 7 Nov 2018 19:21:09 +0100 Subject: [PATCH] Add another non-interleaved data copy (intended for VA-API MJPEG uploads). --- bmusb.cpp | 31 +++++++++++++++++++++++++------ bmusb/bmusb.h | 1 + fake_capture.cpp | 34 +++++++++++++++++++++------------- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/bmusb.cpp b/bmusb.cpp index cb2858e..abab109 100644 --- a/bmusb.cpp +++ b/bmusb.cpp @@ -536,6 +536,9 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n } //dump_frame(); } else { + if (current_frame->data_copy != nullptr) { + memcpy(current_frame->data_copy + current_frame->len, start, bytes); + } if (current_frame->interleaved) { uint8_t *data = current_frame->data + current_frame->len / 2; uint8_t *data2 = current_frame->data2 + current_frame->len / 2; @@ -685,6 +688,7 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, { const __m256i needle = _mm256_set1_epi8(sync_char); + size_t bytes_copied; const __restrict __m256i *in = (const __m256i *)aligned_start; if (current_frame->interleaved) { __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2); @@ -725,9 +729,10 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, ++out1; ++out2; } - current_frame->len += (uint8_t *)in - aligned_start; + bytes_copied = (uint8_t *)in - aligned_start; } else { - __m256i *out = (__m256i *)(current_frame->data + current_frame->len); + uint8_t *old_end = current_frame->data + current_frame->len; + __m256i *out = (__m256i *)old_end; while (in < (const __m256i *)limit) { __m256i data = _mm256_load_si256(in); _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used. @@ -739,8 +744,14 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, ++in; ++out; } - current_frame->len = (uint8_t *)out - current_frame->data; + bytes_copied = (uint8_t *)out - old_end; + } + if (current_frame->data_copy != nullptr) { + // TODO: It would be somewhat more cache-efficient to write this in the + // same loop as above. However, it might not be worth the extra complexity. + memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied); } + current_frame->len += bytes_copied; //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes); return (const uint8_t *)in; @@ -752,6 +763,7 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const __m128i needle = _mm_set1_epi8(sync_char); const __m128i *in = (const __m128i *)aligned_start; + size_t bytes_copied; if (current_frame->interleaved) { __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2); __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2); @@ -782,9 +794,10 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, ++out1; ++out2; } - current_frame->len += (uint8_t *)in - aligned_start; + bytes_copied = (uint8_t *)in - aligned_start; } else { - __m128i *out = (__m128i *)(current_frame->data + current_frame->len); + uint8_t *old_end = current_frame->data + current_frame->len; + __m128i *out = (__m128i *)old_end; while (in < (const __m128i *)limit) { __m128i data = _mm_load_si128(in); _mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used. @@ -796,8 +809,14 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, ++in; ++out; } - current_frame->len = (uint8_t *)out - current_frame->data; + bytes_copied = (uint8_t *)out - old_end; + } + if (current_frame->data_copy != nullptr) { + // TODO: It would be somewhat more cache-efficient to write this in the + // same loop as above. However, it might not be worth the extra complexity. + memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied); } + current_frame->len += bytes_copied; //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes); return (const uint8_t *)in; diff --git a/bmusb/bmusb.h b/bmusb/bmusb.h index b1861ef..a484496 100644 --- a/bmusb/bmusb.h +++ b/bmusb/bmusb.h @@ -31,6 +31,7 @@ class FrameAllocator { struct Frame { uint8_t *data = nullptr; uint8_t *data2 = nullptr; // Only if interleaved == true. + uint8_t *data_copy = nullptr; // Will get a non-interleaved copy if not nullptr. size_t len = 0; // Number of bytes we actually have. size_t size = 0; // Number of bytes we have room for. size_t overflow = 0; diff --git a/fake_capture.cpp b/fake_capture.cpp index c98e74c..d8f5144 100644 --- a/fake_capture.cpp +++ b/fake_capture.cpp @@ -218,6 +218,23 @@ bool timespec_less_than(const timespec &a, const timespec &b) return make_pair(a.tv_sec, a.tv_nsec) < make_pair(b.tv_sec, b.tv_nsec); } +void fill_color_noninterleaved(uint8_t *dst, uint8_t y, uint8_t cb, uint8_t cr, const VideoFormat &video_format, bool ten_bit) +{ + if (ten_bit) { + // Just use the 8-bit-values shifted left by 2. + // It's not 100% correct, but it's close enough. + uint32_t pix[4]; + pix[0] = (cb << 2) | (y << 12) | (cr << 22); + pix[1] = (y << 2) | (cb << 12) | ( y << 22); + pix[2] = (cr << 2) | (y << 12) | (cb << 22); + pix[3] = (y << 2) | (cr << 12) | ( y << 22); + memset16(dst, pix, video_format.stride * video_format.height / sizeof(pix)); + } else { + uint8_t ycbcr[] = { y, cb, y, cr }; + memset4(dst, ycbcr, video_format.width * video_format.height / 2); + } +} + } // namespace void FakeCapture::producer_thread_func() @@ -285,19 +302,10 @@ void FakeCapture::producer_thread_func() memset2(video_frame.data, cbcr, width * height / 2); memset(video_frame.data2, y, width * height); } else { - if (current_pixel_format == PixelFormat_10BitYCbCr) { - // Just use the 8-bit-values shifted left by 2. - // It's not 100% correct, but it's close enough. - uint32_t pix[4]; - pix[0] = (cb << 2) | (y << 12) | (cr << 22); - pix[1] = (y << 2) | (cb << 12) | ( y << 22); - pix[2] = (cr << 2) | (y << 12) | (cb << 22); - pix[3] = (y << 2) | (cr << 12) | ( y << 22); - memset16(video_frame.data, pix, video_format.stride * height / sizeof(pix)); - } else { - uint8_t ycbcr[] = { y, cb, y, cr }; - memset4(video_frame.data, ycbcr, width * height / 2); - } + fill_color_noninterleaved(video_frame.data, y, cb, cr, video_format, current_pixel_format); + } + if (video_frame.data_copy != nullptr) { + fill_color_noninterleaved(video_frame.data_copy, y, cb, cr, video_format, current_pixel_format); } video_frame.len = video_format.stride * height; video_frame.received_timestamp = timestamp; -- 2.39.2