#if defined (MODULE_NAME_IS_i420_rgb)
# include "i420_rgb_c.h"
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
-# if defined(HAVE_MMX_INTRINSICS)
-# include <mmintrin.h>
-# endif
# include "i420_rgb_mmx.h"
#elif defined (MODULE_NAME_IS_i420_rgb_sse2)
-# if defined(HAVE_SSE2_INTRINSICS)
-# include <emmintrin.h>
-# endif
# include "i420_rgb_mmx.h"
#endif
}
}
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
+#else // ! defined (MODULE_NAME_IS_i420_rgb)
void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
picture_t *p_dest )
for ( i_x = p_vout->render.i_width/16; i_x--; )
{
-#if defined (CAN_COMPILE_SSE2)
- __asm__( ".p2align 3"
- SSE2_INIT_16_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_15_ALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
- SSE2_INTRINSICS_INIT_16_ALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_15_ALIGNED
-#endif
+ SSE2_CALL (
+ SSE2_INIT_16_ALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_15_ALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
- __asm__( ".p2align 3"
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_15_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_16_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_15_UNALIGNED
- }
-#endif
+ SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_15_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
for ( i_x = p_vout->render.i_width/16; i_x--; )
{
-#if defined (CAN_COMPILE_SSE2)
- __asm__( ".p2align 3"
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_15_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
- SSE2_INTRINSICS_INIT_16_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_15_UNALIGNED
-#endif
+ SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_15_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
- __asm__( ".p2align 3"
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_15_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_16_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_15_UNALIGNED
- }
-#endif
+ SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_15_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
}
/* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
- __asm__ __volatile__ ( "sfence" ::: "memory" );
-#else
- _mm_sfence();
-#endif
+ SSE2_END;
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
-#if defined (CAN_COMPILE_MMX)
- __asm__( ".p2align 3"
- MMX_INIT_16
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_15
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- uint64_t tmp64;
- MMX_INTRINSICS_INIT_16
- MMX_INTRINSICS_YUV_MUL
- MMX_INTRINSICS_YUV_ADD
- MMX_INTRINSICS_UNPACK_15
-#endif
-
+ MMX_CALL (
+ MMX_INIT_16
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_15
+ );
p_y += 8;
p_u += 4;
p_v += 4;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_MMX)
- __asm__( ".p2align 3"
- MMX_INIT_16
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_15
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
- {
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- uint64_t tmp64;
-
- MMX_INTRINSICS_INIT_16
- MMX_INTRINSICS_YUV_MUL
- MMX_INTRINSICS_YUV_ADD
- MMX_INTRINSICS_UNPACK_15
- }
-#endif
+ MMX_CALL (
+ MMX_INIT_16
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_15
+ );
p_y += 8;
p_u += 4;
p_v += 4;
}
}
/* re-enable FPU registers */
-#if defined (CAN_COMPILE_MMX)
- __asm__ __volatile__ ( "emms" );
-#else
- _mm_empty();
-#endif
+ MMX_END;
#endif
}
for ( i_x = p_vout->render.i_width/16; i_x--; )
{
-#if defined (CAN_COMPILE_SSE2)
- __asm__( ".p2align 3"
- SSE2_INIT_16_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_16_ALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
- SSE2_INTRINSICS_INIT_16_ALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_16_ALIGNED
-#endif
+ SSE2_CALL (
+ SSE2_INIT_16_ALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_16_ALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
- __asm__( ".p2align 3"
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_16_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_16_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_16_UNALIGNED
- }
-#endif
+ SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_16_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
for ( i_x = p_vout->render.i_width/16; i_x--; )
{
-#if defined (CAN_COMPILE_SSE2)
- __asm__( ".p2align 3"
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_16_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
- SSE2_INTRINSICS_INIT_16_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_16_UNALIGNED
-#endif
+            SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_16_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
- __asm__( ".p2align 3"
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_16_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_16_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_16_UNALIGNED
- }
-#endif
+            SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_16_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
}
/* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
- __asm__ __volatile__ ( "sfence" ::: "memory" );
-#else
- _mm_sfence();
-#endif
+ SSE2_END;
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
-#if defined (CAN_COMPILE_MMX)
- __asm__( ".p2align 3"
- MMX_INIT_16
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_16
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- uint64_t tmp64;
- MMX_INTRINSICS_INIT_16
- MMX_INTRINSICS_YUV_MUL
- MMX_INTRINSICS_YUV_ADD
- MMX_INTRINSICS_UNPACK_16
-#endif
-
+ MMX_CALL (
+ MMX_INIT_16
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_16
+ );
p_y += 8;
p_u += 4;
p_v += 4;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_MMX)
- __asm__( ".p2align 3"
- MMX_INIT_16
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_16
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
- {
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- uint64_t tmp64;
-
- MMX_INTRINSICS_INIT_16
- MMX_INTRINSICS_YUV_MUL
- MMX_INTRINSICS_YUV_ADD
- MMX_INTRINSICS_UNPACK_16
- }
-#endif
+ MMX_CALL (
+ MMX_INIT_16
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_16
+ );
p_y += 8;
p_u += 4;
p_v += 4;
}
}
/* re-enable FPU registers */
-#if defined (CAN_COMPILE_MMX)
- __asm__ __volatile__ ( "emms" );
-#else
- _mm_empty();
-#endif
+ MMX_END;
#endif
}
for ( i_x = p_vout->render.i_width / 16; i_x--; )
{
-#if defined (CAN_COMPILE_SSE2)
- /* use inline SSE2 assembly */
- __asm__( ".p2align 3"
- SSE2_INIT_32_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ARGB_ALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- /* otherwise use SSE2 C intrinsics wrappers */
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_32_ALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED
-#endif
+ SSE2_CALL (
+ SSE2_INIT_32_ALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ARGB_ALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
- /* use inline SSE2 assembly */
- __asm__( ".p2align 3"
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ARGB_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- /* otherwise use SSE2 intrinsics wrappers */
- {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_32_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
- }
-#endif
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ARGB_UNALIGNED
+ );
p_y += 16;
p_u += 4;
p_v += 4;
for ( i_x = p_vout->render.i_width / 16; i_x--; )
{
-#if defined (CAN_COMPILE_SSE2)
- /* use inline SSE2 assembly */
- __asm__( ".p2align 3"
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ARGB_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- /* otherwise use SSE2 C intrinsics wrappers */
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_32_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
-#endif
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ARGB_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
- /* use inline SSE2 assembly */
- __asm__( ".p2align 3"
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ARGB_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- /* otherwise use SSE2 intrinsics wrappers */
- {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_32_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
- }
-#endif
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ARGB_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
}
/* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
- __asm__ __volatile__ ( "sfence" ::: "memory" );
-#else
- _mm_sfence();
-#endif
+ SSE2_END;
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
-#if defined (CAN_COMPILE_MMX)
- /* use inline MMX assembly */
- __asm__( MMX_INIT_32
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-
- __asm__( ".p2align 3"
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_ARGB
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
- /* otherwise use MMX C intrinsics wrappers */
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- uint64_t tmp64;
-
- MMX_INTRINSICS_INIT_32
- MMX_INTRINSICS_YUV_MUL
- MMX_INTRINSICS_YUV_ADD
- MMX_INTRINSICS_UNPACK_32_ARGB
-#endif
+ MMX_CALL (
+ MMX_INIT_32
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_32_ARGB
+ );
p_y += 8;
p_u += 4;
p_v += 4;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_MMX)
- /* use inline MMX assembly */
- __asm__( ".p2align 3"
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_ARGB
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
- /* otherwise use MMX intrinsics wrappers */
- {
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- uint64_t tmp64;
-
- MMX_INTRINSICS_INIT_32
- MMX_INTRINSICS_YUV_MUL
- MMX_INTRINSICS_YUV_ADD
- MMX_INTRINSICS_UNPACK_32_ARGB
- }
-#endif
+ MMX_CALL (
+ MMX_INIT_32
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_32_ARGB
+ );
p_y += 8;
p_u += 4;
p_v += 4;
p_v += i_source_margin_c;
}
}
+
/* re-enable FPU registers */
-#if defined (CAN_COMPILE_MMX)
- __asm__ __volatile__ ( "emms" );
-#else
- _mm_empty();
-#endif
+ MMX_END;
#endif
}
for ( i_x = p_vout->render.i_width / 16; i_x--; )
{
-#if defined (CAN_COMPILE_SSE2)
- /* use inline SSE2 assembly */
- __asm__( ".p2align 3"
- SSE2_INIT_32_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_BGRA_ALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- /* otherwise use SSE2 C intrinsics wrappers */
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_32_ALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED
-#endif
+ SSE2_CALL (
+ SSE2_INIT_32_ALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_BGRA_ALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
- /* use inline SSE2 assembly */
- __asm__( ".p2align 3"
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_BGRA_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- /* otherwise use SSE2 intrinsics wrappers */
- {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_32_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
- }
-#endif
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_BGRA_UNALIGNED
+ );
p_y += 16;
p_u += 4;
p_v += 4;
for ( i_x = p_vout->render.i_width / 16; i_x--; )
{
-#if defined (CAN_COMPILE_SSE2)
- /* use inline SSE2 assembly */
- __asm__( ".p2align 3"
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_BGRA_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- /* otherwise use SSE2 C intrinsics wrappers */
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_32_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
-#endif
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_BGRA_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
- /* use inline SSE2 assembly */
- __asm__( ".p2align 3"
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_BGRA_UNALIGNED
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
- /* otherwise use SSE2 intrinsics wrappers */
- {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
- SSE2_INTRINSICS_INIT_32_UNALIGNED
- SSE2_INTRINSICS_YUV_MUL
- SSE2_INTRINSICS_YUV_ADD
- SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
- }
-#endif
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_BGRA_UNALIGNED
+ );
p_y += 16;
p_u += 8;
p_v += 8;
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
-#if defined (CAN_COMPILE_MMX)
- /* use inline MMX assembly */
- __asm__( MMX_INIT_32
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-
- __asm__( ".p2align 3"
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_ARGB
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
- /* otherwise use MMX C intrinsics wrappers */
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- uint64_t tmp64;
-
- MMX_INTRINSICS_INIT_32
- MMX_INTRINSICS_YUV_MUL
- MMX_INTRINSICS_YUV_ADD
- MMX_INTRINSICS_UNPACK_32_BGRA
-#endif
+ MMX_CALL (
+ MMX_INIT_32
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_32_BGRA
+ );
p_y += 8;
p_u += 4;
p_v += 4;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_MMX)
- /* use inline MMX assembly */
- __asm__( ".p2align 3"
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_BGRA
- : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
- /* otherwise use MMX intrinsics wrappers */
+ MMX_CALL (
+ MMX_INIT_32
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_32_BGRA
+ );
+ p_y += 8;
+ p_u += 4;
+ p_v += 4;
+ p_buffer += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 4 );
+
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ }
+
+ /* re-enable FPU registers */
+ MMX_END;
+
+#endif
+}
+
+void E_(I420_A8B8G8R8)( vout_thread_t *p_vout, picture_t *p_src,
+ picture_t *p_dest )
+{
+ /* We got this one from the old arguments */
+ uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+ uint8_t *p_y = p_src->Y_PIXELS;
+ uint8_t *p_u = p_src->U_PIXELS;
+ uint8_t *p_v = p_src->V_PIXELS;
+
+ vlc_bool_t b_hscale; /* horizontal scaling type */
+ unsigned int i_vscale; /* vertical scaling type */
+ unsigned int i_x, i_y; /* horizontal and vertical indexes */
+
+ int i_right_margin;
+ int i_rewind;
+ int i_scale_count; /* scale modulo counter */
+ int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
+ uint32_t * p_pic_start; /* beginning of the current line for copy */
+ /* Conversion buffer pointer */
+ uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
+ uint32_t * p_buffer;
+
+ /* Offset array pointer */
+ int * p_offset_start = p_vout->chroma.p_sys->p_offset;
+ int * p_offset;
+
+ const int i_source_margin = p_src->p[0].i_pitch
+ - p_src->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_src->p[1].i_pitch
+ - p_src->p[1].i_visible_pitch;
+
+ i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+ /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+ * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+ * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
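+     *
+     * Worked instance (figures are only illustrative): rendering a 720x576
+     * source with r1 = 4:3 onto a 1024x768 output with r2 = 4:3, growing x1
+     * to x1' = 1024 gives y1' = 1024 * 768/1024 * (4/3)/(4/3) = 768 */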
+ SetOffset( p_vout->render.i_width, p_vout->render.i_height,
+ p_vout->output.i_width, p_vout->output.i_height,
+ &b_hscale, &i_vscale, p_offset_start );
+
+ /*
+ * Perform conversion
+ */
+ i_scale_count = ( i_vscale == 1 ) ?
+ p_vout->output.i_height : p_vout->render.i_height;
+
+#if defined (MODULE_NAME_IS_i420_rgb_sse2)
+
+ if( p_vout->render.i_width & 15 )
+ {
+ i_rewind = 16 - ( p_vout->render.i_width & 15 );
+ }
+ else
+ {
+ i_rewind = 0;
+ }
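+    /* e.g. (illustrative width only): a 360-pixel row gives 360 & 15 == 8,
+     * so i_rewind = 8 and the final 16-pixel block is rewound by 8 pixels,
+     * re-converting 8 pixels rather than reading past the end of the row */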
+
+ /*
+    ** SSE2 128-bit fetch/store instructions are faster
+    ** when memory accesses are 16-byte aligned
+ */
+
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+ p_dest->p->i_pitch|
+ ((int)p_y)|
+ ((int)p_buffer))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+
+ for ( i_x = p_vout->render.i_width / 16; i_x--; )
{
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- uint64_t tmp64;
+ SSE2_CALL (
+ SSE2_INIT_32_ALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ABGR_ALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ p_buffer += 16;
+ }
- MMX_INTRINSICS_INIT_32
- MMX_INTRINSICS_YUV_MUL
- MMX_INTRINSICS_YUV_ADD
- MMX_INTRINSICS_UNPACK_32_BGRA
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ABGR_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 4;
+ p_v += 4;
}
-#endif
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 4 );
+
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+ for ( i_x = p_vout->render.i_width / 16; i_x--; )
+ {
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ABGR_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ p_buffer += 16;
+ }
+
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ABGR_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 4 );
+
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ }
+ }
+
+#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
+
+ if( p_vout->render.i_width & 7 )
+ {
+ i_rewind = 8 - ( p_vout->render.i_width & 7 );
+ }
+ else
+ {
+ i_rewind = 0;
+ }
+
+ for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+ for ( i_x = p_vout->render.i_width / 8; i_x--; )
+ {
+ MMX_CALL (
+ MMX_INIT_32
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_32_ABGR
+ );
+ p_y += 8;
+ p_u += 4;
+ p_v += 4;
+ p_buffer += 8;
+ }
+
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+ MMX_CALL (
+ MMX_INIT_32
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_32_ABGR
+ );
p_y += 8;
p_u += 4;
p_v += 4;
p_v += i_source_margin_c;
}
}
+
/* re-enable FPU registers */
-#if defined (CAN_COMPILE_MMX)
- __asm__ __volatile__ ( "emms" );
-#else
- _mm_empty();
-#endif
+ MMX_END;
#endif
}
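
Note on the SSE2_CALL/MMX_CALL wrappers introduced above: their definitions are
not part of these hunks (they are expected to come from i420_rgb_mmx.h, included
at the top), so the following is only a sketch of what each variant presumably
expands to, reconstructed from the inline-assembly and intrinsics blocks that
this patch removes; the real macros may differ in detail.

    /* Hypothetical sketch -- not the actual contents of i420_rgb_mmx.h. */
    #if defined (CAN_COMPILE_SSE2)
    /* inline-assembly flavour: same operands and clobbers as the removed asm */
    #   define SSE2_CALL(SSE2_INSTRUCTIONS)                     \
            __asm__ (                                           \
                ".p2align 3"                                    \
                SSE2_INSTRUCTIONS                               \
                : : "r" (p_y), "r" (p_u), "r" (p_v),            \
                    "r" (p_buffer) : "eax" )
    #   define SSE2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
    #elif defined (HAVE_SSE2_INTRINSICS)
    /* intrinsics flavour: declares the xmm temporaries the blocks expect */
    #   define SSE2_CALL(SSE2_INSTRUCTIONS)                     \
            do {                                                \
                __m128i xmm0, xmm1, xmm2, xmm3,                 \
                        xmm4, xmm5, xmm6, xmm7;                 \
                SSE2_INSTRUCTIONS                               \
            } while(0)
    #   define SSE2_END  _mm_sfence()
    #endif

MMX_CALL/MMX_END would follow the same pattern, with __m64 mm0..mm7 (plus the
uint64_t tmp64 scratch variable) in the intrinsics case and "emms"/_mm_empty()
as the epilogue.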