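Computing the byte-swap loop bounds from stride instead of width is only
for cleanness, since stride == width * 2 for all 8-bit modes that we know
of. It also fixes an overrun for non-SSE2: the old loop ran width * height
iterations while advancing 4 bytes per iteration, so it touched
4 * width * height bytes, twice stride * height when stride == width * 2.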
uint8_t *origptr = video_frame.data + video_offset + video_format.extra_lines_top * video_format.stride;
#if __SSE2__
__m128i *ptr = (__m128i *)origptr;
uint8_t *origptr = video_frame.data + video_offset + video_format.extra_lines_top * video_format.stride;
#if __SSE2__
__m128i *ptr = (__m128i *)origptr;
- for (unsigned i = 0; i < video_format.width * video_format.height / 8; ++i) {
+ for (unsigned i = 0; i < video_format.stride * video_format.height / 16; ++i) {
__m128i val = _mm_loadu_si128(ptr);
val = _mm_slli_epi16(val, 8) | _mm_srli_epi16(val, 8);
_mm_storeu_si128(ptr, val);
__m128i val = _mm_loadu_si128(ptr);
val = _mm_slli_epi16(val, 8) | _mm_srli_epi16(val, 8);
_mm_storeu_si128(ptr, val);
}
#else
uint8_t *ptr = origptr;
}
#else
uint8_t *ptr = origptr;
- for (unsigned i = 0; i < video_format.width * video_format.height; ++i) {
+ for (unsigned i = 0; i < video_format.stride * video_format.height / 4; ++i) {
swap(ptr[0], ptr[1]);
swap(ptr[2], ptr[3]);
ptr += 4;
swap(ptr[0], ptr[1]);
swap(ptr[2], ptr[3]);
ptr += 4;