From Google Code-In.
INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
-INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 )
#else
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
INIT4( hadamard_ac, _ssse3 );
}
pixf->vsad = x264_pixel_vsad_ssse3;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
{
INIT4( hadamard_ac, _avx );
}
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_4x4_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_4x4_sse2 ( pixel *, pixel *, int * ); /* args: fenc, fdec, res[3] */
+void x264_intra_sad_x3_4x4_ssse3 ( pixel *, pixel *, int * ); /* args: fenc, fdec, res[3] */
+void x264_intra_sad_x3_4x4_avx ( pixel *, pixel *, int * ); /* args: fenc, fdec, res[3] */
void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
SECTION .text
cextern pw_1
+cextern pw_4
cextern pw_8
;=============================================================================
SAD_X 4, 8, 4
;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3]);
+; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+%macro INTRA_SAD_X3_4x4 0 ; emits intra_sad_x3_4x4: SAD of a 4x4 fenc block against V, H and DC predictions built from fdec neighbours
+cglobal intra_sad_x3_4x4, 3,3,7 ; 3 args (r0=fenc, r1=fdec, r2=res per the prototype comment), 3 GPRs, 7 vector regs (m0-m6)
+ movq m0, [r1-1*FDEC_STRIDEB] ; m0 low half = 4 words from the fdec row above the block (A B C D)
+ movq m1, [r0+0*FENC_STRIDEB] ; m1 low half = fenc row 0
+ movq m2, [r0+2*FENC_STRIDEB] ; m2 low half = fenc row 2
+ pshuflw m6, m0, q1032 ; swap the two word pairs of the top row: C D A B
+ paddw m6, m0 ; pairwise sums: A+C, B+D, A+C, B+D
+ pshuflw m5, m6, q2301 ; swap adjacent words so each lane meets its missing partner
+ paddw m6, m5 ; every low word = A+B+C+D
+ punpcklqdq m6, m6 ;A+B+C+D 8 times
+ punpcklqdq m0, m0 ; top row duplicated into both halves, to be compared against two fenc rows at once
+ movhps m1, [r0+1*FENC_STRIDEB] ; m1 = fenc rows 0 (low) and 1 (high)
+ movhps m2, [r0+3*FENC_STRIDEB] ; m2 = fenc rows 2 (low) and 3 (high)
+ psubw m3, m1, m0 ; rows 0-1 minus top row
+ psubw m0, m2 ; top row minus rows 2-3 (sign is discarded by the abs below)
+ ABSW m3, m3, m5 ; |diff| for rows 0-1; m5 is passed as the third operand, presumably scratch
+ ABSW m0, m0, m5 ; |diff| for rows 2-3
+ paddw m0, m3 ; per-lane sum of the absolute differences of all four rows
+ HADDW m0, m5 ; horizontal add of the word lanes (helper macro defined elsewhere; m5 as temp)
+ movd [r2], m0 ;V prediction cost
+ movd m3, [r1+0*FDEC_STRIDEB-4] ; 4 bytes ending just before fdec row 0; word 1 is the left neighbour (E), assuming 2-byte pixels
+ movhps m3, [r1+1*FDEC_STRIDEB-8] ; 8 bytes ending just before fdec row 1; top word is the left neighbour (F)
+ movd m4, [r1+2*FDEC_STRIDEB-4] ; same for row 2: word 1 is the left neighbour (G)
+ movhps m4, [r1+3*FDEC_STRIDEB-8] ; same for row 3: top word is the left neighbour (H)
+ pshufhw m3, m3, q3333 ; broadcast word 3 of the high half (F) across the high half
+ pshufhw m4, m4, q3333 ; broadcast word 3 of the high half (H) across the high half
+ pshuflw m3, m3, q1111 ; FF FF EE EE
+ pshuflw m4, m4, q1111 ; HH HH GG GG
+ paddw m5, m3, m4 ; low half = E+G, high half = F+H
+ pshufd m0, m5, q1032 ; swap the 64-bit halves so F+H lines up with E+G
+ paddw m5, m6 ; add the top-row sum A+B+C+D
+ paddw m5, m0 ; every word = A+B+C+D+E+F+G+H
+ paddw m5, [pw_4] ; rounding term before the shift (pw_4 is external; presumably packed words of 4)
+ psrlw m5, 3 ; divide by 8: DC predictor in every word
+ psubw m6, m5, m2 ; DC minus rows 2-3
+ psubw m5, m1 ; DC minus rows 0-1
+ psubw m1, m3 ; rows 0-1 minus their left neighbours (E for row 0, F for row 1)
+ psubw m2, m4 ; rows 2-3 minus their left neighbours (G for row 2, H for row 3)
+ ABSW m5, m5, m0 ; |DC diff| rows 0-1
+ ABSW m6, m6, m0 ; |DC diff| rows 2-3
+ ABSW m1, m1, m0 ; |H diff| rows 0-1
+ ABSW m2, m2, m0 ; |H diff| rows 2-3
+ paddw m5, m6 ; per-lane DC cost
+ paddw m1, m2 ; per-lane H cost
+ HADDW m5, m0 ; reduce DC cost lanes to one value
+ HADDW m1, m2 ; reduce H cost lanes to one value (m2 is free to clobber here)
+ movd [r2+8], m5 ;DC prediction cost
+ movd [r2+4], m1 ;H prediction cost
+ RET
+%endmacro
+
+INIT_XMM sse2 ; instantiate the macro once per instruction-set target: sse2
+INTRA_SAD_X3_4x4
+INIT_XMM ssse3 ; ssse3 variant of the same body
+INTRA_SAD_X3_4x4
+INIT_XMM avx ; avx variant of the same body
+INTRA_SAD_X3_4x4
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
;-----------------------------------------------------------------------------
;m0 = DC