}
if( cpu&X264_CPU_XOP )
{
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->vsad = x264_pixel_vsad_xop;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pshufhw m0, m0, q3120
pshufhw m1, m1, q3120
%endif
+%if cpuflag(xop)
+ pmadcswd m2, m0, m0, m2 ; XOP: m2 += pairwise sums of squared words of m0 (one op for the fallback's pmaddwd + paddd)
+ pmadcswd m3, m1, m1, m3 ; likewise m3 += squared words of m1; m0/m1 are not overwritten on this path
+%else
pmaddwd m0, m0
pmaddwd m1, m1
paddd m2, m0
paddd m3, m1
+%endif
add r6, 2*mmsize
jl .loopx
%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
por m0, m1
psrlw m2, m0, 8
pand m0, m5
+%if cpuflag(xop)
+ pmadcswd m4, m2, m2, m4 ; XOP: m4 += pairwise sums of squared words of m2 (one op for the fallback's pmaddwd + paddd)
+ pmadcswd m3, m0, m0, m3 ; likewise m3 += squared words of m0
+%else
pmaddwd m2, m2
pmaddwd m0, m0
- paddd m3, m0
paddd m4, m2
+ paddd m3, m0 ; accumulates after m4, the same m4-then-m3 order as the XOP branch
+%endif
add r6, mmsize
jl .loopx
%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
SSD_NV12
INIT_XMM avx
SSD_NV12
+INIT_XMM xop ; select the xop cpu flag so the cpuflag(xop) branches in the macro body are taken
+SSD_NV12 ; expand the macro again for XOP; presumably emits x264_pixel_ssd_nv12_core_xop - TODO confirm against the macro definition
INIT_YMM avx2
SSD_NV12
void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
+void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1,
+ pixel *pixuv2, intptr_t stride2, int width,
+ int height, uint64_t *ssd_u, uint64_t *ssd_v ); /* XOP variant; parameter list identical to the _avx and _avx2 declarations */
void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );