+ if (current_frame->interleaved) {
+ uint8_t *data = current_frame->data + current_frame->len / 2;
+ uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
+ if (current_frame->len % 2 == 1) {
+ ++data;
+ swap(data, data2);
+ }
+ if (bytes % 2 == 1) {
+ *data++ = *start++;
+ swap(data, data2);
+ ++current_frame->len;
+ --bytes;
+ }
+ memcpy_interleaved(data, data2, start, bytes);
+ current_frame->len += bytes;
+ } else {
+ memcpy(current_frame->data + current_frame->len, start, bytes);
+ current_frame->len += bytes;
+ }
+ }
+}
+
+#if 0
+void avx2_dump(const char *name, __m256i n)
+{
+ printf("%-10s:", name);
+ printf(" %02x", _mm256_extract_epi8(n, 0));
+ printf(" %02x", _mm256_extract_epi8(n, 1));
+ printf(" %02x", _mm256_extract_epi8(n, 2));
+ printf(" %02x", _mm256_extract_epi8(n, 3));
+ printf(" %02x", _mm256_extract_epi8(n, 4));
+ printf(" %02x", _mm256_extract_epi8(n, 5));
+ printf(" %02x", _mm256_extract_epi8(n, 6));
+ printf(" %02x", _mm256_extract_epi8(n, 7));
+ printf(" ");
+ printf(" %02x", _mm256_extract_epi8(n, 8));
+ printf(" %02x", _mm256_extract_epi8(n, 9));
+ printf(" %02x", _mm256_extract_epi8(n, 10));
+ printf(" %02x", _mm256_extract_epi8(n, 11));
+ printf(" %02x", _mm256_extract_epi8(n, 12));
+ printf(" %02x", _mm256_extract_epi8(n, 13));
+ printf(" %02x", _mm256_extract_epi8(n, 14));
+ printf(" %02x", _mm256_extract_epi8(n, 15));
+ printf(" ");
+ printf(" %02x", _mm256_extract_epi8(n, 16));
+ printf(" %02x", _mm256_extract_epi8(n, 17));
+ printf(" %02x", _mm256_extract_epi8(n, 18));
+ printf(" %02x", _mm256_extract_epi8(n, 19));
+ printf(" %02x", _mm256_extract_epi8(n, 20));
+ printf(" %02x", _mm256_extract_epi8(n, 21));
+ printf(" %02x", _mm256_extract_epi8(n, 22));
+ printf(" %02x", _mm256_extract_epi8(n, 23));
+ printf(" ");
+ printf(" %02x", _mm256_extract_epi8(n, 24));
+ printf(" %02x", _mm256_extract_epi8(n, 25));
+ printf(" %02x", _mm256_extract_epi8(n, 26));
+ printf(" %02x", _mm256_extract_epi8(n, 27));
+ printf(" %02x", _mm256_extract_epi8(n, 28));
+ printf(" %02x", _mm256_extract_epi8(n, 29));
+ printf(" %02x", _mm256_extract_epi8(n, 30));
+ printf(" %02x", _mm256_extract_epi8(n, 31));
+ printf("\n");
+}
+#endif
+
+#ifndef HAS_MULTIVERSIONING
+
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+ // No fast path possible unless we have multiversioning.
+ return start;
+}
+
+#else // defined(HAS_MULTIVERSIONING)
+
+const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
+
+// Does a memcpy and memchr in one to reduce processing time.
+// Note that the benefit is somewhat limited if your L3 cache is small,
+// as you'll (unfortunately) spend most of the time loading the data
+// from main memory.
+//
+// Complicated cases are left to the slow path; it basically stops copying
+// up until the first instance of "sync_char" (usually a bit before, actually).
+// This is fine, since 0x00 bytes shouldn't really show up in normal picture
+// data, and what we really need this for is the 00 00 ff ff marker in video data.
+__attribute__((target("default")))
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+ // No fast path possible unless we have SSE 4.1 or higher.
+ return start;
+}
+
+__attribute__((target("sse4.1", "avx2")))
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+ if (current_frame->data == nullptr ||
+ current_frame->len > current_frame->size ||
+ start == limit) {
+ return start;
+ }
+ size_t orig_bytes = limit - start;
+ if (orig_bytes < 128) {
+ // Don't bother.
+ return start;
+ }
+
+ // Don't read more bytes than we can write.
+ limit = min(limit, start + (current_frame->size - current_frame->len));
+
+ // Align end to 32 bytes.
+ limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+ if (start >= limit) {
+ return start;
+ }
+
+ // Process [0,31] bytes, such that start gets aligned to 32 bytes.
+ const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
+ if (aligned_start != start) {
+ const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
+ if (sync_start == nullptr) {
+ add_to_frame(current_frame, "", start, aligned_start);
+ } else {
+ add_to_frame(current_frame, "", start, sync_start);
+ return sync_start;
+ }
+ }
+
+ // Make the length a multiple of 64.
+ if (current_frame->interleaved) {
+ if (((limit - aligned_start) % 64) != 0) {
+ limit -= 32;
+ }
+ assert(((limit - aligned_start) % 64) == 0);
+ }
+
+ return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
+}
+
+__attribute__((target("avx2")))
+const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
+{
+ const __m256i needle = _mm256_set1_epi8(sync_char);
+
+ const __restrict __m256i *in = (const __m256i *)aligned_start;
+ if (current_frame->interleaved) {
+ __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
+ __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
+ if (current_frame->len % 2 == 1) {
+ swap(out1, out2);
+ }
+
+ __m256i shuffle_cw = _mm256_set_epi8(
+ 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+ 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+ while (in < (const __m256i *)limit) {
+ // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+ __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh
+ __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp
+
+ __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
+ __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+ __m256i found = _mm256_or_si256(found1, found2);
+
+ data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh
+ data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop
+
+ data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh
+ data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop
+
+ __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+ __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+ _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used.
+ _mm256_storeu_si256(out2, hi);
+
+ if (!_mm256_testz_si256(found, found)) {
+ break;
+ }
+
+ in += 2;
+ ++out1;
+ ++out2;
+ }
+ current_frame->len += (uint8_t *)in - aligned_start;
+ } else {
+ __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+ while (in < (const __m256i *)limit) {
+ __m256i data = _mm256_load_si256(in);
+ _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used.
+ __m256i found = _mm256_cmpeq_epi8(data, needle);
+ if (!_mm256_testz_si256(found, found)) {
+ break;
+ }
+
+ ++in;
+ ++out;
+ }
+ current_frame->len = (uint8_t *)out - current_frame->data;
+ }
+
+ //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
+ return (const uint8_t *)in;
+}
+
+__attribute__((target("sse4.1")))
+const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
+{
+ const __m128i needle = _mm_set1_epi8(sync_char);
+
+ const __m128i *in = (const __m128i *)aligned_start;
+ if (current_frame->interleaved) {
+ __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
+ __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
+ if (current_frame->len % 2 == 1) {
+ swap(out1, out2);
+ }
+
+ __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+ while (in < (const __m128i *)limit) {
+ __m128i data1 = _mm_load_si128(in);
+ __m128i data2 = _mm_load_si128(in + 1);
+ __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+ __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+ __m128i data1_hi = _mm_srli_epi16(data1, 8);
+ __m128i data2_hi = _mm_srli_epi16(data2, 8);
+ __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+ _mm_storeu_si128(out1, lo); // Store as early as possible, even if the data isn't used.
+ __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+ _mm_storeu_si128(out2, hi);
+ __m128i found1 = _mm_cmpeq_epi8(data1, needle);
+ __m128i found2 = _mm_cmpeq_epi8(data2, needle);
+ if (!_mm_testz_si128(found1, found1) ||
+ !_mm_testz_si128(found2, found2)) {
+ break;
+ }
+
+ in += 2;
+ ++out1;
+ ++out2;
+ }
+ current_frame->len += (uint8_t *)in - aligned_start;
+ } else {
+ __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+ while (in < (const __m128i *)limit) {
+ __m128i data = _mm_load_si128(in);
+ _mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used.
+ __m128i found = _mm_cmpeq_epi8(data, needle);
+ if (!_mm_testz_si128(found, found)) {
+ break;
+ }
+
+ ++in;
+ ++out;
+ }
+ current_frame->len = (uint8_t *)out - current_frame->data;