}
//dump_frame();
} else {
+ if (current_frame->data_copy != nullptr) {
+ memcpy(current_frame->data_copy + current_frame->len, start, bytes);
+ }
if (current_frame->interleaved) {
uint8_t *data = current_frame->data + current_frame->len / 2;
uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
{
const __m256i needle = _mm256_set1_epi8(sync_char);
+ size_t bytes_copied;
const __restrict __m256i *in = (const __m256i *)aligned_start;
if (current_frame->interleaved) {
__restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
++out1;
++out2;
}
- current_frame->len += (uint8_t *)in - aligned_start;
+ bytes_copied = (uint8_t *)in - aligned_start;
} else {
- __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+ uint8_t *old_end = current_frame->data + current_frame->len;
+ __m256i *out = (__m256i *)old_end;
while (in < (const __m256i *)limit) {
__m256i data = _mm256_load_si256(in);
_mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used.
++in;
++out;
}
- current_frame->len = (uint8_t *)out - current_frame->data;
+ bytes_copied = (uint8_t *)out - old_end;
+ }
+ if (current_frame->data_copy != nullptr) {
+ // TODO: It would be somewhat more cache-efficient to write this in the
+ // same loop as above. However, it might not be worth the extra complexity.
+ memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
}
+ current_frame->len += bytes_copied;
//printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
return (const uint8_t *)in;
const __m128i needle = _mm_set1_epi8(sync_char);
const __m128i *in = (const __m128i *)aligned_start;
+ size_t bytes_copied;
if (current_frame->interleaved) {
__m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
__m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
++out1;
++out2;
}
- current_frame->len += (uint8_t *)in - aligned_start;
+ bytes_copied = (uint8_t *)in - aligned_start;
} else {
- __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+ uint8_t *old_end = current_frame->data + current_frame->len;
+ __m128i *out = (__m128i *)old_end;
while (in < (const __m128i *)limit) {
__m128i data = _mm_load_si128(in);
_mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used.
++in;
++out;
}
- current_frame->len = (uint8_t *)out - current_frame->data;
+ bytes_copied = (uint8_t *)out - old_end;
+ }
+ if (current_frame->data_copy != nullptr) {
+ // TODO: It would be somewhat more cache-efficient to write this in the
+ // same loop as above. However, it might not be worth the extra complexity.
+ memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
}
+ current_frame->len += bytes_copied;
//printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
return (const uint8_t *)in;
return make_pair(a.tv_sec, a.tv_nsec) < make_pair(b.tv_sec, b.tv_nsec);
}
+void fill_color_noninterleaved(uint8_t *dst, uint8_t y, uint8_t cb, uint8_t cr, const VideoFormat &video_format, bool ten_bit)
+{
+ if (ten_bit) {
+ // Just use the 8-bit-values shifted left by 2.
+ // It's not 100% correct, but it's close enough.
+ uint32_t pix[4];
+ pix[0] = (cb << 2) | (y << 12) | (cr << 22);
+ pix[1] = (y << 2) | (cb << 12) | ( y << 22);
+ pix[2] = (cr << 2) | (y << 12) | (cb << 22);
+ pix[3] = (y << 2) | (cr << 12) | ( y << 22);
+ memset16(dst, pix, video_format.stride * video_format.height / sizeof(pix));
+ } else {
+ uint8_t ycbcr[] = { y, cb, y, cr };
+ memset4(dst, ycbcr, video_format.width * video_format.height / 2);
+ }
+}
+
} // namespace
void FakeCapture::producer_thread_func()
memset2(video_frame.data, cbcr, width * height / 2);
memset(video_frame.data2, y, width * height);
} else {
- if (current_pixel_format == PixelFormat_10BitYCbCr) {
- // Just use the 8-bit-values shifted left by 2.
- // It's not 100% correct, but it's close enough.
- uint32_t pix[4];
- pix[0] = (cb << 2) | (y << 12) | (cr << 22);
- pix[1] = (y << 2) | (cb << 12) | ( y << 22);
- pix[2] = (cr << 2) | (y << 12) | (cb << 22);
- pix[3] = (y << 2) | (cr << 12) | ( y << 22);
- memset16(video_frame.data, pix, video_format.stride * height / sizeof(pix));
- } else {
- uint8_t ycbcr[] = { y, cb, y, cr };
- memset4(video_frame.data, ycbcr, width * height / 2);
- }
+ fill_color_noninterleaved(video_frame.data, y, cb, cr, video_format, current_pixel_format);
+ }
+ if (video_frame.data_copy != nullptr) {
+ fill_color_noninterleaved(video_frame.data_copy, y, cb, cr, video_format, current_pixel_format);
}
video_frame.len = video_format.stride * height;
video_frame.received_timestamp = timestamp;