Add another non-interleaved data copy (intended for VA-API MJPEG uploads).

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Wed, 7 Nov 2018 18:21:09 +0000 (19:21 +0100)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Wed, 7 Nov 2018 22:11:24 +0000 (23:11 +0100)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Wed, 7 Nov 2018 18:21:09 +0000 (19:21 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Wed, 7 Nov 2018 22:11:24 +0000 (23:11 +0100)
diff --git a/bmusb.cpp b/bmusb.cpp

index cb2858e4f275e19a85594b5b903883e54e5a614a..abab109dc8ceb4e7fe77c07493017a898d53a8ee 100644 (file)
--- a/bmusb.cpp
+++ b/bmusb.cpp
@@ -536,6 +536,9 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n
                 }
                 //dump_frame();
         } else {
+               if (current_frame->data_copy != nullptr) {
+                       memcpy(current_frame->data_copy + current_frame->len, start, bytes);
+               }
                 if (current_frame->interleaved) {
                         uint8_t *data = current_frame->data + current_frame->len / 2;
                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
@@ -685,6 +688,7 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
  {
         const __m256i needle = _mm256_set1_epi8(sync_char);
  
+       size_t bytes_copied;
         const __restrict __m256i *in = (const __m256i *)aligned_start;
         if (current_frame->interleaved) {
                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
@@ -725,9 +729,10 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
                         ++out1;
                         ++out2;
                 }
-               current_frame->len += (uint8_t *)in - aligned_start;
+               bytes_copied = (uint8_t *)in - aligned_start;
         } else {
-               __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+               uint8_t *old_end = current_frame->data + current_frame->len;
+               __m256i *out = (__m256i *)old_end;
                 while (in < (const __m256i *)limit) {
                         __m256i data = _mm256_load_si256(in);
                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
@@ -739,8 +744,14 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
                         ++in;
                         ++out;
                 }
-               current_frame->len = (uint8_t *)out - current_frame->data;
+               bytes_copied = (uint8_t *)out - old_end;
+       }
+       if (current_frame->data_copy != nullptr) {
+               // TODO: It would be somewhat more cache-efficient to write this in the
+               // same loop as above. However, it might not be worth the extra complexity.
+               memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
         }
+       current_frame->len += bytes_copied;
  
         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
         return (const uint8_t *)in;
@@ -752,6 +763,7 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
         const __m128i needle = _mm_set1_epi8(sync_char);
  
         const __m128i *in = (const __m128i *)aligned_start;
+       size_t bytes_copied;
         if (current_frame->interleaved) {
                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
@@ -782,9 +794,10 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
                         ++out1;
                         ++out2;
                 }
-               current_frame->len += (uint8_t *)in - aligned_start;
+               bytes_copied = (uint8_t *)in - aligned_start;
         } else {
-               __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+               uint8_t *old_end = current_frame->data + current_frame->len;
+               __m128i *out = (__m128i *)old_end;
                 while (in < (const __m128i *)limit) {
                         __m128i data = _mm_load_si128(in);
                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
@@ -796,8 +809,14 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
                         ++in;
                         ++out;
                 }
-               current_frame->len = (uint8_t *)out - current_frame->data;
+               bytes_copied = (uint8_t *)out - old_end;
+       }
+       if (current_frame->data_copy != nullptr) {
+               // TODO: It would be somewhat more cache-efficient to write this in the
+               // same loop as above. However, it might not be worth the extra complexity.
+               memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
         }
+       current_frame->len += bytes_copied;
  
         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
         return (const uint8_t *)in;
diff --git a/bmusb/bmusb.h b/bmusb/bmusb.h

index b1861efb6ddf045ae7d6a49a31d2c4667e314a25..a484496bd6be5cd0aa83512258c0e87f75a79978 100644 (file)
--- a/bmusb/bmusb.h
+++ b/bmusb/bmusb.h
@@ -31,6 +31,7 @@ class FrameAllocator {
         struct Frame {
                 uint8_t *data = nullptr;
                 uint8_t *data2 = nullptr;  // Only if interleaved == true.
+               uint8_t *data_copy = nullptr;  // Will get a non-interleaved copy if not nullptr.
                 size_t len = 0;  // Number of bytes we actually have.
                 size_t size = 0;  // Number of bytes we have room for.
                 size_t overflow = 0;
diff --git a/fake_capture.cpp b/fake_capture.cpp

index c98e74cf3351f5063cad260001b236acc9870034..d8f5144adf727008277dcd537f2dbeb6ace98de8 100644 (file)
--- a/fake_capture.cpp
+++ b/fake_capture.cpp
@@ -218,6 +218,23 @@ bool timespec_less_than(const timespec &a, const timespec &b)
         return make_pair(a.tv_sec, a.tv_nsec) < make_pair(b.tv_sec, b.tv_nsec);
  }
  
+void fill_color_noninterleaved(uint8_t *dst, uint8_t y, uint8_t cb, uint8_t cr, const VideoFormat &video_format, bool ten_bit)
+{
+       if (ten_bit) {
+               // Just use the 8-bit-values shifted left by 2.
+               // It's not 100% correct, but it's close enough.
+               uint32_t pix[4];
+               pix[0] = (cb << 2) | (y  << 12) | (cr << 22);
+               pix[1] = (y  << 2) | (cb << 12) | ( y << 22);
+               pix[2] = (cr << 2) | (y  << 12) | (cb << 22);
+               pix[3] = (y  << 2) | (cr << 12) | ( y << 22);
+               memset16(dst, pix, video_format.stride * video_format.height / sizeof(pix));
+       } else {
+               uint8_t ycbcr[] = { y, cb, y, cr };
+               memset4(dst, ycbcr, video_format.width * video_format.height / 2);
+       }
+}
+
  }  // namespace
  
  void FakeCapture::producer_thread_func()
@@ -285,19 +302,10 @@ void FakeCapture::producer_thread_func()
                                 memset2(video_frame.data, cbcr, width * height / 2);
                                 memset(video_frame.data2, y, width * height);
                         } else {
-                               if (current_pixel_format == PixelFormat_10BitYCbCr) {
-                                       // Just use the 8-bit-values shifted left by 2.
-                                       // It's not 100% correct, but it's close enough.
-                                       uint32_t pix[4];
-                                       pix[0] = (cb << 2) | (y  << 12) | (cr << 22);
-                                       pix[1] = (y  << 2) | (cb << 12) | ( y << 22);
-                                       pix[2] = (cr << 2) | (y  << 12) | (cb << 22);
-                                       pix[3] = (y  << 2) | (cr << 12) | ( y << 22);
-                                       memset16(video_frame.data, pix, video_format.stride * height / sizeof(pix));
-                               } else {
-                                       uint8_t ycbcr[] = { y, cb, y, cr };
-                                       memset4(video_frame.data, ycbcr, width * height / 2);
-                               }
+                               fill_color_noninterleaved(video_frame.data, y, cb, cr, video_format, current_pixel_format);
+                       }
+                       if (video_frame.data_copy != nullptr) {
+                               fill_color_noninterleaved(video_frame.data_copy, y, cb, cr, video_format, current_pixel_format);
                         }
                         video_frame.len = video_format.stride * height;
                         video_frame.received_timestamp = timestamp;
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Wed, 7 Nov 2018 18:21:09 +0000 (19:21 +0100)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Wed, 7 Nov 2018 22:11:24 +0000 (23:11 +0100)
bmusb.cpp		patch \| blob \| history
bmusb/bmusb.h		patch \| blob \| history
fake_capture.cpp		patch \| blob \| history