#if defined (MODULE_NAME_IS_i420_rgb)
# include "i420_rgb_c.h"
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
+# if defined(HAVE_MMX_INTRINSICS)
+# include <mmintrin.h>
+# endif
# include "i420_rgb_mmx.h"
#endif
/* 15bpp 5/5/5 */
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
+# if defined (HAVE_MMX_INTRINSICS)
+ __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
+ INTRINSICS_INIT_16
+ INTRINSICS_YUV_MUL
+ INTRINSICS_YUV_ADD
+ INTRINSICS_UNPACK_15
+# else
__asm__( MMX_INIT_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
__asm__( ".align 8"
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_15
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
+# endif
p_y += 8;
p_u += 4;
/* 16bpp 5/6/5 */
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
+# if defined (HAVE_MMX_INTRINSICS)
+ __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
+ INTRINSICS_INIT_16
+ INTRINSICS_YUV_MUL
+ INTRINSICS_YUV_ADD
+ INTRINSICS_UNPACK_16
+# else
__asm__( MMX_INIT_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
__asm__( ".align 8"
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
+# endif
p_y += 8;
p_u += 4;
* at least we have all the pixels */
if( i_rewind )
{
+#if defined (MODULE_NAME_IS_i420_rgb_mmx)
+# if defined (HAVE_MMX_INTRINSICS)
+ __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
+# endif
+#endif
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
+
+# if defined (HAVE_MMX_INTRINSICS)
+ INTRINSICS_INIT_16
+# else
__asm__( MMX_INIT_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
+# endif
if( p_vout->output.i_rmask == 0x7c00 )
{
/* 15bpp 5/5/5 */
+# if defined (HAVE_MMX_INTRINSICS)
+ INTRINSICS_YUV_MUL
+ INTRINSICS_YUV_ADD
+ INTRINSICS_UNPACK_15
+# else
__asm__( ".align 8"
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_15
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
+# endif
}
else
{
+# if defined (HAVE_MMX_INTRINSICS)
+ INTRINSICS_YUV_MUL
+ INTRINSICS_YUV_ADD
+ INTRINSICS_UNPACK_16
+# else
/* 16bpp 5/6/5 */
__asm__( ".align 8"
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
+# endif
}
p_y += 8;
CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
+# if defined (HAVE_MMX_INTRINSICS)
+ __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
+ INTRINSICS_INIT_32
+ INTRINSICS_YUV_MUL
+ INTRINSICS_YUV_ADD
+ INTRINSICS_UNPACK_32
+# else
__asm__( MMX_INIT_32
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
__asm__( ".align 8"
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_32
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
+# endif
p_y += 8;
p_u += 4;
* at least we have all the pixels */
if( i_rewind )
{
+#if defined (MODULE_NAME_IS_i420_rgb_mmx)
+# if defined (HAVE_MMX_INTRINSICS)
+ __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
+# endif
+#endif
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
+# if defined (HAVE_MMX_INTRINSICS)
+ INTRINSICS_INIT_32
+ INTRINSICS_YUV_MUL
+ INTRINSICS_YUV_ADD
+ INTRINSICS_UNPACK_32
+# else
__asm__( MMX_INIT_32
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
__asm__( ".align 8"
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_32
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
+# endif
p_y += 8;
p_u += 4;
#movl $0, (%3) # cache preload for image \n\
"
+#define INTRINSICS_INIT_16 \
+    mm0 = _mm_cvtsi32_si64(*(int32_t *)p_u);  /* Load 4 Cb   00 00 00 00 u3 u2 u1 u0 */ \
+    mm1 = _mm_cvtsi32_si64(*(int32_t *)p_v);  /* Load 4 Cr   00 00 00 00 v3 v2 v1 v0 */ \
+    mm4 = _mm_setzero_si64();                 /* zero mm4 */ \
+    mm6 = (__m64)*(uint64_t *)p_y;            /* Load 8 Y    Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+    /* *(uint16_t *)p_buffer = 0; */          /* cache preload, disabled as in the asm */
+
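+/* Illustrative scalar reference, not part of the patch: i420 stores one
+ * Cb/Cr pair per 2x2 block of Y (shared by a horizontal pixel pair within
+ * a line), so each 8-pixel step loads 8 Y bytes but only 4 Cb and 4 Cr
+ * bytes, which is why the INIT macros mix movq- and movd-sized loads.
+ * Helper name and the <stdint.h> types are ours. */
+static inline void i420_sample_ref(const uint8_t *p_y, const uint8_t *p_u,
+                                   const uint8_t *p_v, int i_x,
+                                   uint8_t *pi_y, uint8_t *pi_u, uint8_t *pi_v)
+{
+    *pi_y = p_y[i_x];         /* one luma sample per pixel */
+    *pi_u = p_u[i_x / 2];     /* chroma shared by a horizontal pixel pair */
+    *pi_v = p_v[i_x / 2];
+}
+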
#define MMX_INIT_16_GRAY " \n\
movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
#movl $0, (%3) # cache preload for image \n\
movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
+#define INTRINSICS_INIT_32 \
+    mm0 = _mm_cvtsi32_si64(*(int32_t *)p_u);  /* Load 4 Cb   00 00 00 00 u3 u2 u1 u0 */ \
+    *(uint16_t *)p_buffer = 0;                /* cache preload for image */ \
+    mm1 = _mm_cvtsi32_si64(*(int32_t *)p_v);  /* Load 4 Cr   00 00 00 00 v3 v2 v1 v0 */ \
+    mm4 = _mm_setzero_si64();                 /* zero mm4 */ \
+    mm6 = (__m64)*(uint64_t *)p_y;            /* Load 8 Y    Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+
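+/* Reference note, ours: p_buffer is a uint16_t * in the 15/16bpp paths and
+ * a uint32_t * in the 32bpp path, so the store offsets in the UNPACK macros
+ * below count in elements of those sizes; each store writes one 8-byte
+ * qword, i.e. four pixels at 15/16bpp or two at 32bpp. Minimal sketch: */
+static inline uint64_t *pix_qword32_ref(uint32_t *p_buffer, int n)
+{
+    /* qword n of an 8-pixel batch at 32bpp: pixels 2n and 2n+1 */
+    return (uint64_t *)(p_buffer + 2 * n);
+}
+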
/*
* Do the multiply part of the conversion for even and odd pixels,
* register usage:
pmulhw mmx_Y_coeff, %%mm7 # Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
"
+#define INTRINSICS_YUV_MUL \
+    mm0 = _mm_unpacklo_pi8(mm0, mm4);               /* scatter 4 Cb  00 u3 00 u2 00 u1 00 u0 */ \
+    mm1 = _mm_unpacklo_pi8(mm1, mm4);               /* scatter 4 Cr  00 v3 00 v2 00 v1 00 v0 */ \
+    mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w);       /* Cb -= 128 */ \
+    mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w);       /* Cr -= 128 */ \
+    mm0 = _mm_slli_pi16(mm0, 3);                    /* Promote precision */ \
+    mm1 = _mm_slli_pi16(mm1, 3);                    /* Promote precision */ \
+    mm2 = mm0;                                      /* Copy 4 Cb */ \
+    mm3 = mm1;                                      /* Copy 4 Cr */ \
+    mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green);  /* Mul Cb with green coeff */ \
+    mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green);  /* Mul Cr with green coeff */ \
+    mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue);   /* Mul Cb -> Cblue */ \
+    mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red);    /* Mul Cr -> Cred */ \
+    mm2 = _mm_adds_pi16(mm2, mm3);                  /* Cb green + Cr green -> Cgreen */ \
+    \
+    mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w);        /* Y -= 16 */ \
+    mm7 = mm6;                                      /* Copy 8 Y */ \
+    mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw);      /* get Y even  00 Y6 00 Y4 00 Y2 00 Y0 */ \
+    mm7 = _mm_srli_pi16(mm7, 8);                    /* get Y odd   00 Y7 00 Y5 00 Y3 00 Y1 */ \
+    mm6 = _mm_slli_pi16(mm6, 3);                    /* Promote precision */ \
+    mm7 = _mm_slli_pi16(mm7, 3);                    /* Promote precision */ \
+    mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff);  /* Mul 4 Y even */ \
+    mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);  /* Mul 4 Y odd */
+
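+/* Scalar model of one lane of the multiplies above, ours and illustrative:
+ * after the <<3 precision promotion, pmulhw (_mm_mulhi_pi16) keeps the high
+ * 16 bits of the 16x16 product, so each lane computes approximately
+ * value * coeff / 8192, with coeff in S2.13 fixed point (the mmx_* words
+ * are assumed to encode BT.601-style gains, e.g. ~1.164 for mmx_Y_coeff). */
+static inline int fixmul_ref(int value, int coeff_s2_13)
+{
+    return ((value << 3) * coeff_s2_13) >> 16;  /* == value * coeff / 8192 */
+}
+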
/*
* Do the addition part of the conversion for even and odd pixels,
* register usage:
punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
"
+#define INTRINSICS_YUV_ADD \
+    mm3 = mm0;                          /* Copy Cblue */ \
+    mm4 = mm1;                          /* Copy Cred */ \
+    mm5 = mm2;                          /* Copy Cgreen */ \
+    mm0 = _mm_adds_pi16(mm0, mm6);      /* Y even + Cblue   00 B6 00 B4 00 B2 00 B0 */ \
+    mm3 = _mm_adds_pi16(mm3, mm7);      /* Y odd  + Cblue   00 B7 00 B5 00 B3 00 B1 */ \
+    mm1 = _mm_adds_pi16(mm1, mm6);      /* Y even + Cred */ \
+    mm4 = _mm_adds_pi16(mm4, mm7);      /* Y odd  + Cred */ \
+    mm2 = _mm_adds_pi16(mm2, mm6);      /* Y even + Cgreen */ \
+    mm5 = _mm_adds_pi16(mm5, mm7);      /* Y odd  + Cgreen */ \
+    \
+    mm0 = _mm_packs_pu16(mm0, mm0);     /* Limit RGB even to 0..255 */ \
+    mm1 = _mm_packs_pu16(mm1, mm1); \
+    mm2 = _mm_packs_pu16(mm2, mm2); \
+    \
+    mm3 = _mm_packs_pu16(mm3, mm3);     /* Limit RGB odd to 0..255 */ \
+    mm4 = _mm_packs_pu16(mm4, mm4); \
+    mm5 = _mm_packs_pu16(mm5, mm5); \
+    \
+    mm0 = _mm_unpacklo_pi8(mm0, mm3);   /* Interleave: B7 B6 B5 B4 B3 B2 B1 B0 */ \
+    mm1 = _mm_unpacklo_pi8(mm1, mm4);   /*             R7 R6 R5 R4 R3 R2 R1 R0 */ \
+    mm2 = _mm_unpacklo_pi8(mm2, mm5);   /*             G7 G6 G5 G4 G3 G2 G1 G0 */
+
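+/* Scalar model of the add/pack step, ours and illustrative: each channel
+ * is luma plus a chroma term, then packuswb (_mm_packs_pu16) saturates the
+ * signed 16-bit result into 0..255: */
+static inline uint8_t addsat_ref(int i_luma, int i_chroma)
+{
+    int v = i_luma + i_chroma;
+    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
+}
+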
/*
* Grayscale case, only use Y
*/
movq %%mm5, 8(%3) # store pixel 4-7 \n\
"
+#define INTRINSICS_UNPACK_15 \
+    mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8);    /* keep b7..b3 */ \
+    mm0 = _mm_srli_pi16(mm0, 3);                    /* B -> bits 4..0 */ \
+    mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8);    /* keep g7..g3 */ \
+    mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8);    /* keep r7..r3 */ \
+    mm1 = _mm_srli_pi16(mm1, 1);                    /* R -> bits 6..2 of high byte */ \
+    mm4 = _mm_setzero_si64();                       /* zero mm4 */ \
+    mm5 = mm0;                                      /* Copy B7-B0 */ \
+    mm7 = mm2;                                      /* Copy G7-G0 */ \
+    \
+    mm2 = _mm_unpacklo_pi8(mm2, mm4);               /* pixels 0-3: G to words */ \
+    mm0 = _mm_unpacklo_pi8(mm0, mm1);               /* pixels 0-3: R | B */ \
+    mm2 = _mm_slli_pi16(mm2, 2);                    /* G -> bits 9..5 */ \
+    mm0 = _mm_or_si64(mm0, mm2);                    /* assemble 4 RGB15 pixels */ \
+    mm6 = (__m64)*(uint64_t *)(p_y + 8);            /* Load 8 next Y */ \
+    *(uint64_t *)p_buffer = (uint64_t)mm0;          /* store pixels 0-3 */ \
+    \
+    mm7 = _mm_unpackhi_pi8(mm7, mm4);               /* pixels 4-7: G to words */ \
+    mm5 = _mm_unpackhi_pi8(mm5, mm1);               /* pixels 4-7: R | B */ \
+    mm7 = _mm_slli_pi16(mm7, 2);                    /* G -> bits 9..5 */ \
+    mm0 = _mm_cvtsi32_si64(*(int32_t *)(p_u + 4));  /* Load 4 next Cb */ \
+    mm5 = _mm_or_si64(mm5, mm7);                    /* assemble 4 RGB15 pixels */ \
+    mm1 = _mm_cvtsi32_si64(*(int32_t *)(p_v + 4));  /* Load 4 next Cr */ \
+    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;    /* store pixels 4-7 */
+
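+/* Scalar equivalent of one 5/5/5 pack above, ours and illustrative: keep
+ * the top 5 bits of each channel, R in bits 14-10, G in 9-5, B in 4-0: */
+static inline uint16_t pack15_ref(uint8_t r, uint8_t g, uint8_t b)
+{
+    return (uint16_t)(((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3));
+}
+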
/*
* convert RGB plane to RGB 16 bits,
* mm0 -> B, mm1 -> R, mm2 -> G,
movq %%mm5, 8(%3) # store pixel 4-7 \n\
"
+#define INTRINSICS_UNPACK_16 \
+    mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8);    /* keep b7..b3 */ \
+    mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc);    /* keep g7..g2 */ \
+    mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8);    /* keep r7..r3 */ \
+    mm0 = _mm_srli_pi16(mm0, 3);                    /* B -> bits 4..0 */ \
+    mm4 = _mm_setzero_si64();                       /* zero mm4 */ \
+    mm5 = mm0;                                      /* Copy B7-B0 */ \
+    mm7 = mm2;                                      /* Copy G7-G0 */ \
+    \
+    mm2 = _mm_unpacklo_pi8(mm2, mm4);               /* pixels 0-3: G to words */ \
+    mm0 = _mm_unpacklo_pi8(mm0, mm1);               /* pixels 0-3: R | B (R -> bits 15..11) */ \
+    mm2 = _mm_slli_pi16(mm2, 3);                    /* G -> bits 10..5 */ \
+    mm0 = _mm_or_si64(mm0, mm2);                    /* assemble 4 RGB16 pixels */ \
+    mm6 = (__m64)*(uint64_t *)(p_y + 8);            /* Load 8 next Y */ \
+    *(uint64_t *)p_buffer = (uint64_t)mm0;          /* store pixels 0-3 */ \
+    \
+    mm7 = _mm_unpackhi_pi8(mm7, mm4);               /* pixels 4-7: G to words */ \
+    mm5 = _mm_unpackhi_pi8(mm5, mm1);               /* pixels 4-7: R | B */ \
+    mm7 = _mm_slli_pi16(mm7, 3);                    /* G -> bits 10..5 */ \
+    mm0 = _mm_cvtsi32_si64(*(int32_t *)(p_u + 4));  /* Load 4 next Cb */ \
+    mm5 = _mm_or_si64(mm5, mm7);                    /* assemble 4 RGB16 pixels */ \
+    mm1 = _mm_cvtsi32_si64(*(int32_t *)(p_v + 4));  /* Load 4 next Cr */ \
+    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;    /* store pixels 4-7 */
+
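+/* Scalar equivalent of one 5/6/5 pack above, ours and illustrative: R in
+ * bits 15-11, 6 bits of G in 10-5, B in 4-0: */
+static inline uint16_t pack16_ref(uint8_t r, uint8_t g, uint8_t b)
+{
+    return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
+}
+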
/*
* convert RGB plane to RGB packed format,
* mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
#movq 8(%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
+#define INTRINSICS_UNPACK_32 \
+    mm3 = _mm_setzero_si64();                       /* zero mm3 */ \
+    mm6 = mm0;                                      /* Copy B */ \
+    mm7 = mm1;                                      /* Copy R */ \
+    mm4 = mm0;                                      /* Copy B */ \
+    mm5 = mm1;                                      /* Copy R */ \
+    mm6 = _mm_unpacklo_pi8(mm6, mm2);               /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
+    mm7 = _mm_unpacklo_pi8(mm7, mm3);               /* 00 R3 00 R2 00 R1 00 R0 */ \
+    mm6 = _mm_unpacklo_pi16(mm6, mm7);              /* 00 R1 G1 B1 00 R0 G0 B0 */ \
+    *(uint64_t *)p_buffer = (uint64_t)mm6;          /* store pixels 0-1 */ \
+    mm6 = mm0;                                      /* Copy B */ \
+    mm6 = _mm_unpacklo_pi8(mm6, mm2);               /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
+    mm6 = _mm_unpackhi_pi16(mm6, mm7);              /* 00 R3 G3 B3 00 R2 G2 B2 */ \
+    *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;    /* store pixels 2-3 */ \
+    mm4 = _mm_unpackhi_pi8(mm4, mm2);               /* G7 B7 G6 B6 G5 B5 G4 B4 */ \
+    mm5 = _mm_unpackhi_pi8(mm5, mm3);               /* 00 R7 00 R6 00 R5 00 R4 */ \
+    mm4 = _mm_unpacklo_pi16(mm4, mm5);              /* 00 R5 G5 B5 00 R4 G4 B4 */ \
+    *(uint64_t *)(p_buffer + 4) = (uint64_t)mm4;    /* store pixels 4-5 */ \
+    mm4 = mm0;                                      /* Copy B */ \
+    mm4 = _mm_unpackhi_pi8(mm4, mm2);               /* G7 B7 G6 B6 G5 B5 G4 B4 */ \
+    mm4 = _mm_unpackhi_pi16(mm4, mm5);              /* 00 R7 G7 B7 00 R6 G6 B6 */ \
+    *(uint64_t *)(p_buffer + 6) = (uint64_t)mm4;    /* store pixels 6-7 */
+
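+/* Scalar equivalent of one 32bpp pixel above, ours and illustrative: the
+ * interleaves store bytes B,G,R,0 in memory, i.e. 0x00RRGGBB read as a
+ * little-endian uint32_t: */
+static inline uint32_t pack32_ref(uint8_t r, uint8_t g, uint8_t b)
+{
+    return ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
+}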