dctf->add4x4_idct = x264_add4x4_idct_sse2;
dctf->dct4x4dc = x264_dct4x4dc_sse2;
dctf->idct4x4dc = x264_idct4x4dc_sse2;
+ dctf->dct2x4dc = x264_dct2x4dc_sse2;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->dct4x4dc = x264_dct4x4dc_avx;
dctf->idct4x4dc = x264_idct4x4dc_avx;
+ dctf->dct2x4dc = x264_dct2x4dc_avx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
if( cpu&X264_CPU_MMX2 )
{
dctf->dct4x4dc = x264_dct4x4dc_mmx2;
+ dctf->dct2x4dc = x264_dct2x4dc_mmx2;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
}
RET
%endif ; HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
+;-----------------------------------------------------------------------------
+%if WIN64
+ DECLARE_REG_TMP 6 ; t0 = r6. Avoid some REX prefixes to reduce code size
+%else
+ DECLARE_REG_TMP 2 ; t0 = r2, the third GPR requested by "cglobal dct2x4dc, 2,3" (r0/r1 hold the two arguments)
+%endif ; t0 is the scratch GPR that DCT2x4DC zeroes and INSERT_COEFF stores back to memory
+
+%macro INSERT_COEFF 3 ; dst, src, imm: load the coefficient at memory operand src into lane imm of vector register dst, then overwrite src with t0. Order-sensitive: DCT2x4DC issues lanes as 0, 2, 1, 3 per destination.
+ %if %3
+ %if HIGH_BIT_DEPTH
+ %if cpuflag(sse4)
+ pinsrd %1, %2, %3 ; SSE4: insert the dword straight into lane imm
+ %elif %3 == 2
+ movd m2, %2 ; no pinsrd: park the lane-2 value in m2 (clobbered) until the lane-3 call merges it
+ %elif %3 == 1
+ punpckldq %1, %2 ; interleave low dwords: dst = { lane0, coefficient, ... }
+ %else
+ punpckldq m2, %2 ; lane 3: m2 = { parked lane2, coefficient, ... }
+ punpcklqdq %1, m2 ; join low qwords: dst = { lane0, lane1, lane2, lane3 }
+ %endif
+ %else ; 16-bit path (HIGH_BIT_DEPTH == 0)
+ %if %3 == 2
+ punpckldq %1, %2 ; puts the coefficient in word 2; must precede the lane-3 pinsrw, which fills word 3
+ %else
+ pinsrw %1, %2, %3 ; lanes 1 and 3: insert the word directly
+ %endif
+ %endif
+ %else ; imm == 0: first lane, starts dst from scratch
+ movd %1, %2
+ %endif
+ %if HIGH_BIT_DEPTH
+ mov %2, t0d ; store t0 over the coefficient just read (DCT2x4DC zeroes t0 with "xor t0d, t0d" first)
+ %else
+ mov %2, t0w ; same, 16-bit store
+ %endif
+%endmacro
+
+%macro DCT2x4DC 2 ; %1 = element-size suffix (d or w), %2 = SBUTTERFLY interleave type (dq or wd)
+cglobal dct2x4dc, 2,3 ; r0 = dct[8] (output), r1 = dct4x4[8][16] (input; the first coefficient of each block is read and then overwritten with t0)
+ xor t0d, t0d ; t0 = 0: the value INSERT_COEFF stores over every coefficient it loads
+ INSERT_COEFF m0, [r1+0*16*SIZEOF_DCTCOEF], 0 ; block 0 -> m0 lane 0 (block stride = 16 coefficients)
+ INSERT_COEFF m0, [r1+1*16*SIZEOF_DCTCOEF], 2 ; block 1 -> m0 lane 2
+ add r1, 4*16*SIZEOF_DCTCOEF ; rebase r1 onto block 4; presumably done to keep the remaining displacements small - TODO confirm
+ INSERT_COEFF m0, [r1-2*16*SIZEOF_DCTCOEF], 1 ; block 2 -> m0 lane 1
+ INSERT_COEFF m0, [r1-1*16*SIZEOF_DCTCOEF], 3 ; block 3 -> m0 lane 3
+ INSERT_COEFF m1, [r1+0*16*SIZEOF_DCTCOEF], 0 ; block 4 -> m1 lane 0
+ INSERT_COEFF m1, [r1+1*16*SIZEOF_DCTCOEF], 2 ; block 5 -> m1 lane 2
+ INSERT_COEFF m1, [r1+2*16*SIZEOF_DCTCOEF], 1 ; block 6 -> m1 lane 1
+ INSERT_COEFF m1, [r1+3*16*SIZEOF_DCTCOEF], 3 ; block 7 -> m1 lane 3
+ SUMSUB_BA %1, 1, 0, 2 ; three SUMSUB_BA stages with SBUTTERFLY interleaves between them, m2 as scratch (both macros are defined outside this chunk)
+ SBUTTERFLY %2, 1, 0, 2
+ SUMSUB_BA %1, 0, 1, 2
+ SBUTTERFLY %2, 0, 1, 2
+ SUMSUB_BA %1, 1, 0, 2
+ pshuf%1 m0, m0, q1032 ; pshufd / pshufw on m0; q1032 presumably swaps its two halves - confirm against the q#### definitions
+ mova [r0], m1 ; first mmsize bytes of dct[] come from m1
+ mova [r0+mmsize], m0 ; next mmsize bytes come from the shuffled m0
+ RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+DCT2x4DC d, dq ; dword elements in XMM registers; INSERT_COEFF takes its movd/punpck path unless cpuflag(sse4) holds
+INIT_XMM avx
+DCT2x4DC d, dq ; same parameters; presumably cpuflag(sse4) is satisfied under avx, selecting pinsrd - confirm in x86inc
+%else ; HIGH_BIT_DEPTH == 0
+INIT_MMX mmx2
+DCT2x4DC w, wd ; word elements in MMX registers
+%endif
+
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
void x264_idct4x4dc_sse2 ( int32_t d[16] );
void x264_idct4x4dc_avx ( int32_t d[16] );
+void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] ); /* assembled only when HIGH_BIT_DEPTH is 0 */
+void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] ); /* assembled only when HIGH_BIT_DEPTH is set */
+void x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] ); /* assembled only when HIGH_BIT_DEPTH is set */
+
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 );