Reduces code size, since movaps/movups are one byte shorter than movdqa/movdqu.
Also merge MMX and SSE versions of memcpy_aligned into a single macro.
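For illustration only (not part of the patch): the SSE2 integer forms carry a 66h operand-size prefix (or F3h for movdqu) that the SSE floating-point forms lack, so each 16-byte load or store saves one byte. Rough encodings in 64-bit mode, assuming a simple [rax] operand with no REX prefix:

    movaps xmm0, [rax]    ; 0F 28 00       -> 3 bytes (SSE)
    movdqa xmm0, [rax]    ; 66 0F 6F 00    -> 4 bytes (SSE2)

Both forms move the same 16 aligned bytes, so the substitution is purely a size optimization; it also means several routines below need only SSE rather than SSE2 and are renamed (_sse2 -> _sse) to match.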
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
{"MMX2", X264_CPU_MMX|X264_CPU_MMX2},
{"MMXEXT", X264_CPU_MMX|X264_CPU_MMX2},
-// {"SSE", X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE}, // there are no sse1 functions in x264
+ {"SSE", X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE},
#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2
{"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
{"SSE2", SSE2},
INTRA_MBCMP_8x8( sad,, _c )
INTRA_MBCMP_8x8(sa8d,, _c )
#if HIGH_BIT_DEPTH && HAVE_MMX
+#define x264_predict_8x8_v_sse2 x264_predict_8x8_v_sse
INTRA_MBCMP_8x8( sad, _mmx2, _c )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
#if HAVE_MMX
#if HIGH_BIT_DEPTH
+#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse
+#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse
+#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse
INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 )
INIT_MMX mmx
MC_COPY 8
MC_COPY 16
-INIT_XMM sse2
+INIT_XMM sse
MC_COPY 8
MC_COPY 16
-INIT_XMM aligned, sse2
+INIT_XMM aligned, sse
MC_COPY 16
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal memcpy_aligned_mmx, 3,3
- test r2d, 16
- jz .copy32start
- movq mm0, [r1 + r2 - 16]
- movq mm1, [r1 + r2 - 8]
- movq [r0 + r2 - 16], mm0
- movq [r0 + r2 - 8], mm1
- sub r2d, 16
-.copy32start
- test r2d, r2d
- jz .ret
-.copy32:
- movq mm0, [r1 + r2 - 32]
- movq mm1, [r1 + r2 - 24]
- movq mm2, [r1 + r2 - 16]
- movq mm3, [r1 + r2 - 8]
- movq [r0 + r2 - 32], mm0
- movq [r0 + r2 - 24], mm1
- movq [r0 + r2 - 16], mm2
- movq [r0 + r2 - 8], mm3
- sub r2d, 32
- jg .copy32
-.ret
- RET
-
-;-----------------------------------------------------------------------------
-; void *memcpy_aligned( void *dst, const void *src, size_t n );
-;-----------------------------------------------------------------------------
-cglobal memcpy_aligned_sse2, 3,3
+%macro MEMCPY 0
+cglobal memcpy_aligned, 3,3
+%if mmsize == 16
test r2d, 16
- jz .copy32
- movdqa xmm0, [r1 + r2 - 16]
- movdqa [r0 + r2 - 16], xmm0
+ jz .copy2
+ mova m0, [r1+r2-16]
+ mova [r0+r2-16], m0
sub r2d, 16
-.copy32:
- test r2d, 32
- jz .copy64start
- movdqa xmm0, [r1 + r2 - 32]
- movdqa [r0 + r2 - 32], xmm0
- movdqa xmm1, [r1 + r2 - 16]
- movdqa [r0 + r2 - 16], xmm1
- sub r2d, 32
-.copy64start
+.copy2:
+%endif
+ test r2d, 2*mmsize
+ jz .copy4start
+ mova m0, [r1+r2-1*mmsize]
+ mova m1, [r1+r2-2*mmsize]
+ mova [r0+r2-1*mmsize], m0
+ mova [r0+r2-2*mmsize], m1
+ sub r2d, 2*mmsize
+.copy4start:
test r2d, r2d
jz .ret
-.copy64:
- movdqa xmm0, [r1 + r2 - 64]
- movdqa [r0 + r2 - 64], xmm0
- movdqa xmm1, [r1 + r2 - 48]
- movdqa [r0 + r2 - 48], xmm1
- movdqa xmm2, [r1 + r2 - 32]
- movdqa [r0 + r2 - 32], xmm2
- movdqa xmm3, [r1 + r2 - 16]
- movdqa [r0 + r2 - 16], xmm3
- sub r2d, 64
- jg .copy64
+.copy4:
+ mova m0, [r1+r2-1*mmsize]
+ mova m1, [r1+r2-2*mmsize]
+ mova m2, [r1+r2-3*mmsize]
+ mova m3, [r1+r2-4*mmsize]
+ mova [r0+r2-1*mmsize], m0
+ mova [r0+r2-2*mmsize], m1
+ mova [r0+r2-3*mmsize], m2
+ mova [r0+r2-4*mmsize], m3
+ sub r2d, 4*mmsize
+ jg .copy4
.ret:
REP_RET
+%endmacro
+
+INIT_MMX mmx
+MEMCPY
+INIT_XMM sse
+MEMCPY
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
cglobal memzero_aligned, 2,2
add r0, r1
neg r1
+%if mmsize == 8
pxor m0, m0
+%else
+ xorps m0, m0
+%endif
.loop:
%assign i 0
%rep 8
INIT_MMX mmx
MEMZERO
-INIT_XMM sse2
+INIT_XMM sse
MEMZERO
#undef MC_OFFSET
#undef MC_WEIGHT
-void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w16_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w16_sse2( pixel *, intptr_t, pixel *, intptr_t, int );
-void x264_mc_copy_w16_aligned_sse2( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
-void *x264_memcpy_aligned_mmx ( void *dst, const void *src, size_t n );
-void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
-void x264_memzero_aligned_mmx ( void *dst, size_t n );
-void x264_memzero_aligned_sse2( void *dst, size_t n );
+void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
+void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
+void x264_memzero_aligned_mmx( void *dst, size_t n );
+void x264_memzero_aligned_sse( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride );
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
#if HIGH_BIT_DEPTH
-MC_COPY_WTAB(sse2,mmx,sse2,sse2)
+MC_COPY_WTAB(sse,mmx,sse,sse)
#else
-MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+MC_COPY_WTAB(sse,mmx,mmx,sse)
#endif
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
}
MC_LUMA(mmx2,mmx2,mmx)
-MC_LUMA(sse2,sse2,sse2)
+MC_LUMA(sse2,sse2,sse)
#if !HIGH_BIT_DEPTH
#if ARCH_X86
MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
#endif
-MC_LUMA(cache64_sse2,cache64_sse2,sse2)
-MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
+MC_LUMA(cache64_sse2,cache64_sse2,sse)
+MC_LUMA(cache64_ssse3,cache64_ssse3,sse)
#endif // !HIGH_BIT_DEPTH
#define GET_REF(name)\
pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2;
+ if( cpu&X264_CPU_SSE )
+ {
+ pf->memcpy_aligned = x264_memcpy_aligned_sse;
+ pf->memzero_aligned = x264_memzero_aligned_sse;
+ }
+
#if HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
pf->hpel_filter = x264_hpel_filter_sse2;
}
- pf->memcpy_aligned = x264_memcpy_aligned_sse2;
- pf->memzero_aligned = x264_memzero_aligned_sse2;
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_sse2;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_sse2;
- pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
pf->weight = x264_mc_weight_wtab_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
if( !(cpu&X264_CPU_SSE2) )
return;
- pf->memcpy_aligned = x264_memcpy_aligned_sse2;
- pf->memzero_aligned = x264_memzero_aligned_sse2;
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
}
- pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
%endmacro
%if HIGH_BIT_DEPTH
-INIT_XMM sse2
+INIT_XMM sse
PREDICT_8x8_V
%else
INIT_MMX mmx2
%endmacro
%if HIGH_BIT_DEPTH
-INIT_XMM sse2
+INIT_XMM sse
PREDICT_8x8C_V
%else
INIT_MMX mmx
%endmacro
%if HIGH_BIT_DEPTH
-INIT_XMM sse2
+INIT_XMM sse
PREDICT_8x16C_V
%else
INIT_MMX mmx
mova m3, [r0 - FDEC_STRIDEB+24]
STORE16x16 m0, m1, m2, m3
RET
-INIT_XMM
-cglobal predict_16x16_v_sse2, 2,2
+INIT_XMM sse
+cglobal predict_16x16_v, 1,2
mova m0, [r0 - FDEC_STRIDEB+ 0]
mova m1, [r0 - FDEC_STRIDEB+16]
STORE16x16_SSE2 m0, m1
movq m1, [r0 - FDEC_STRIDE + 8]
STORE16x16 m0, m1
RET
-INIT_XMM
-cglobal predict_16x16_v_sse2, 1,1
- movdqa xmm0, [r0 - FDEC_STRIDE]
- STORE16x16_SSE2 xmm0
+INIT_XMM sse
+cglobal predict_16x16_v, 1,1
+ mova m0, [r0 - FDEC_STRIDE]
+ STORE16x16_SSE2 m0
RET
%endif
pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx2;
pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2;
#if HIGH_BIT_DEPTH
+ if( !(cpu&X264_CPU_SSE) )
+ return;
+ pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2;
pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2;
pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
- pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2;
pf[I_PRED_16x16_H] = x264_predict_16x16_h_sse2;
#if HAVE_X86_INLINE_ASM
pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
#if !ARCH_X86_64
pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmx2;
#endif
+ if( !(cpu&X264_CPU_SSE) )
+ return;
+ pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2;
- pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2;
return;
pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2;
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2;
+ if( !(cpu&X264_CPU_SSE) )
+ return;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse;
if( !(cpu&X264_CPU_SSE2) )
return;
- pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse2;
pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_sse2;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_sse2;
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_sse2;
return;
pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
+ if( !(cpu&X264_CPU_SSE) )
+ return;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse;
if( !(cpu&X264_CPU_SSE2) )
return;
- pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2;
pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2;
if( !(cpu&X264_CPU_MMX2) )
return;
#if HIGH_BIT_DEPTH
+ if( !(cpu&X264_CPU_SSE) )
+ return;
+ pf[I_PRED_8x8_V] = x264_predict_8x8_v_sse;
if( !(cpu&X264_CPU_SSE2) )
return;
- pf[I_PRED_8x8_V] = x264_predict_8x8_v_sse2;
pf[I_PRED_8x8_H] = x264_predict_8x8_h_sse2;
pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_sse2;
pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_sse2;
void x264_predict_8x8_init_mmx ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );
void x264_predict_16x16_v_mmx2( pixel *src );
-void x264_predict_16x16_v_sse2( pixel *src );
+void x264_predict_16x16_v_sse ( pixel *src );
void x264_predict_16x16_h_mmx2( pixel *src );
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
void x264_predict_8x16c_v_mmx( uint8_t *src );
-void x264_predict_8x16c_v_sse2( uint16_t *src );
+void x264_predict_8x16c_v_sse( uint16_t *src );
void x264_predict_8x16c_h_mmx2( pixel *src );
void x264_predict_8x16c_h_sse2( pixel *src );
void x264_predict_8x16c_h_ssse3( uint8_t *src );
void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
void x264_predict_8x8c_v_mmx( pixel *src );
-void x264_predict_8x8c_v_sse2( uint16_t *src );
+void x264_predict_8x8c_v_sse( uint16_t *src );
void x264_predict_8x8c_h_mmx2( pixel *src );
void x264_predict_8x8c_h_sse2( pixel *src );
void x264_predict_8x8c_h_ssse3( uint8_t *src );
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[36] );
+void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] );
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
+ movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
%endrep
%endmacro
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
+ movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
%endrep
%if stack_size_padded == 0
add %1, (xmm_regs_used-6)*16+16
/* print sse2slow only if there's also a sse2fast version of the same func */
b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
+ b->cpu&X264_CPU_SSE ? "sse" :
b->cpu&X264_CPU_MMX ? "mmx" :
b->cpu&X264_CPU_ALTIVEC ? "altivec" :
b->cpu&X264_CPU_NEON ? "neon" :
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
}
+ if( x264_cpu_detect() & X264_CPU_SSE )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
cpu1 &= ~X264_CPU_CACHELINE_64;