Faster chroma encoding: terminate early when cheap heuristics indicate that a chroma block will be DC-only.
This works because the vast majority of inter chroma blocks have no coefficients at all, and those that do are almost always DC-only.
Add two new helper DSP functions for this, sub8x8_dct_dc and var2_8x8, with MMX/SSE2/SSSE3 versions of each.
Early termination is disabled at very low QPs, where almost every block has nonzero coefficients and the check would rarely pay off.
Performance increase is ~1-2% without trellis, and up to 5-6% with trellis=2.
The increase is greater at lower bitrates.
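
In rough C, the heuristic added to x264_mb_encode_8x8_chroma below amounts to the following (a paraphrase for illustration, not extra code; fenc_u/fdec_u/fenc_v/fdec_v are hypothetical stand-ins for the encoder's chroma plane pointers):

    int ssd[2];
    /* var2_8x8 returns the AC energy of the 8x8 chroma residual
     * (SSD minus the squared sum >> 6) and writes the raw SSD to *ssd. */
    int score = var2_8x8( fenc_u, FENC_STRIDE, fdec_u, FDEC_STRIDE, &ssd[0] )
              + var2_8x8( fenc_v, FENC_STRIDE, fdec_v, FDEC_STRIDE, &ssd[1] );
    int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
    if( score < thresh*4 )
    {
        /* Too little AC energy to be worth coding: skip the 4x4
         * transforms entirely, and code a DC-only block for each plane
         * whose SSD alone still exceeds the threshold. */
    }
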
sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
+static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
+{
+ int16_t d[4][4];
+ int sum = 0;
+
+ pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+
+ sum += d[0][0] + d[0][1] + d[0][2] + d[0][3];
+ sum += d[1][0] + d[1][1] + d[1][2] + d[1][3];
+ sum += d[2][0] + d[2][1] + d[2][2] + d[2][3];
+ sum += d[3][0] + d[3][1] + d[3][2] + d[3][3];
+
+ return sum;
+}
+
+static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
+{
+ dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
+ dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
+ dct[1][0] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
+ dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
+}
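Since the first basis vector of x264's 4x4 forward transform is all-ones, the DC coefficient of a 4x4 block is simply the sum of its 16 residuals, which is all sub4x4_dct_dc computes. A quick consistency check against the full transform (a sketch using the plain-C functions above, not part of the patch; pix1/pix2 are an 8x8 fenc/fdec block pair with the usual strides):

    int16_t dct[4][4][4], dcs[2][2];
    sub8x8_dct( dct, pix1, pix2 );
    sub8x8_dct_dc( dcs, pix1, pix2 );
    /* block order in sub8x8_dct is TL, TR, BL, BR */
    assert( dcs[0][0] == dct[0][0][0] && dcs[0][1] == dct[1][0][0] );
    assert( dcs[1][0] == dct[2][0][0] && dcs[1][1] == dct[3][0][0] );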
static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
{
dctf->add4x4_idct = add4x4_idct;
dctf->sub8x8_dct = sub8x8_dct;
+ dctf->sub8x8_dct_dc = sub8x8_dct_dc;
dctf->add8x8_idct = add8x8_idct;
dctf->add8x8_idct_dc = add8x8_idct_dc;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
#ifndef ARCH_X86_64
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
{
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[4][4] );
void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
+ void (*sub8x8_dct_dc)( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][4][4] );
void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] );
PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
+/****************************************************************************
+ * pixel_var2_wxh: variance of the residual (pix1 - pix2), i.e. the AC
+ * energy left once the mean (DC) is removed; the raw SSD is also
+ * returned via *ssd.
+ ****************************************************************************/
+static int pixel_var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd )
+{
+ uint32_t var = 0, sum = 0, sqr = 0;
+ int x, y;
+ for( y = 0; y < 8; y++ )
+ {
+ for( x = 0; x < 8; x++ )
+ {
+ int diff = pix1[x] - pix2[x];
+ sum += diff;
+ sqr += diff * diff;
+ }
+ pix1 += i_stride1;
+ pix2 += i_stride2;
+ }
+ sum = abs(sum);
+ var = sqr - (sum * sum >> 6);
+ *ssd = sqr;
+ return var;
+}
+
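A concrete check of the identity var2 = sqr - (sum*sum >> 6): a perfectly flat residual has all of its energy in the DC term, so var2 should report zero AC energy (a sketch, not part of the patch):

    uint8_t a[8*8], b[8*8];
    int ssd, ac;
    memset( a, 13, sizeof(a) );
    memset( b, 10, sizeof(b) );   /* residual is a constant +3 */
    ac = pixel_var2_8x8( a, 8, b, 8, &ssd );
    /* ssd == 64*3*3 == 576, sum == 192, ac == 576 - 192*192/64 == 0:
     * a flat residual is pure DC, exactly the case the early
     * termination wants to catch. */
    assert( ac == 0 && ssd == 576 );
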
#define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
int t0 = s0 + s1;\
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
+ pixf->var2_8x8 = pixel_var2_8x8;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
+ pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
if( cpu&X264_CPU_CACHELINE_32 )
{
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
+ pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
+ pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+ int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * );
int (*var[4])( uint8_t *pix, int stride );
uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pb_1: times 16 db 1
+pw_1: times 8 dw 1
SECTION .text
IDCT_DC_STORE 0, xmm2, xmm3
ret
+;-----------------------------------------------------------------------------
+; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+
+%macro DCTDC_2ROW_MMX 3
+ movq %1, [r1+FENC_STRIDE*(0+%3)]
+ movq m1, [r1+FENC_STRIDE*(1+%3)]
+ movq m2, [r2+FDEC_STRIDE*(0+%3)]
+ movq m3, [r2+FDEC_STRIDE*(1+%3)]
+ movq %2, %1
+ punpckldq %1, m1 ; left 4x2 half of the two fenc rows
+ punpckhdq %2, m1 ; right 4x2 half
+ movq m1, m2
+ punpckldq m2, m3 ; left 4x2 half of the two fdec rows
+ punpckhdq m1, m3 ; right 4x2 half
+ psadbw %1, m7 ; psadbw against zero = sum of the 8 bytes
+ psadbw %2, m7
+ psadbw m2, m7
+ psadbw m1, m7
+ psubw %1, m2 ; partial DC terms: sum(fenc) - sum(fdec)
+ psubw %2, m1
+%endmacro
+
+INIT_MMX
+cglobal x264_sub8x8_dct_dc_mmxext, 3,3
+ pxor m7, m7
+ call .loop ; first pass: run .loop on the top 8x4 half, then return here
+ add r1, FENC_STRIDE*4
+ add r2, FDEC_STRIDE*4
+ add r0, 4
+.loop: ; second pass falls through; its ret returns to the caller
+ DCTDC_2ROW_MMX m0, m4, 0
+ DCTDC_2ROW_MMX m5, m6, 2
+ paddw m0, m5 ; combine rows 0-1 and 2-3: left 4x4 DC in m0
+ paddw m4, m6 ; right 4x4 DC in m4
+ punpcklwd m0, m4
+ movd [r0], m0 ; store two int16 DC coefficients
+ ret
+
+INIT_XMM
+%macro DCTDC_2ROW_SSE2 3
+ movq m0, [r1+FENC_STRIDE*(0+%1)]
+ movq m1, [r1+FENC_STRIDE*(1+%1)]
+ movq m2, [r2+FDEC_STRIDE*(0+%1)]
+ movq m3, [r2+FDEC_STRIDE*(1+%1)]
+ punpckldq m0, m1 ; [row0.left, row1.left, row0.right, row1.right]
+ punpckldq m2, m3
+ psadbw m0, m7 ; two sums: left 4x2 in the low lane, right 4x2 in the high
+ psadbw m2, m7
+%if %2
+ paddw %3, m0 ; accumulate fenc sums
+ paddw m6, m2 ; accumulate fdec sums
+%else
+ SWAP %3, m0
+ SWAP m6, m2
+%endif
+%endmacro
+
+cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
+ pxor m7, m7
+ DCTDC_2ROW_SSE2 0, 0, m4
+ DCTDC_2ROW_SSE2 2, 1, m4
+ add r1, FENC_STRIDE*4
+ add r2, FDEC_STRIDE*4
+ psubq m4, m6 ; top DCs: sum(fenc) - sum(fdec) per 64-bit lane
+ DCTDC_2ROW_SSE2 0, 0, m5
+ DCTDC_2ROW_SSE2 2, 1, m5
+ psubq m5, m6 ; bottom DCs
+ packssdw m4, m5
+ packssdw m4, m4 ; squeeze the four DCs into four contiguous int16s
+ movq [r0], m4
+ RET
+
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
void x264_sub4x4_dct_ssse3 ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_ssse3 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-
+void x264_sub8x8_dct_dc_mmxext( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2 ( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
jg .loop
VAR_END 6
+%macro VAR2_END 0
+ HADDW m5, m7 ; horizontal add: total diff sum
+ movd r1d, m5
+ imul r1d, r1d ; sum * sum
+ HADDD m6, m1 ; horizontal add: total squared sum
+ shr r1d, 6
+ movd eax, m6
+ mov [r4], eax ; *ssd = sqr
+ sub eax, r1d ; sqr - (sum * sum >> shift)
+ RET
+%endmacro
+
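A scalar model of that reduction, matching the tail of the C reference (var2_end is a hypothetical helper for illustration, not part of the patch):

    static int var2_end( int sum, int sqr, int *ssd )
    {
        *ssd = sqr;                     /* mov [r4], eax            */
        return sqr - (sum * sum >> 6);  /* shr r1d, 6; sub eax, r1d */
    }
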
+;-----------------------------------------------------------------------------
+; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
+;-----------------------------------------------------------------------------
+%ifndef ARCH_X86_64
+INIT_MMX
+cglobal x264_pixel_var2_8x8_mmxext, 5,6
+ VAR_START 0
+ mov r5d, 8
+.loop:
+ movq m0, [r0]
+ movq m1, m0
+ movq m2, [r2]
+ movq m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ psubw m0, m2
+ psubw m1, m3
+ paddw m5, m0
+ paddw m5, m1
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ paddd m6, m0
+ paddd m6, m1
+ add r0, r1
+ add r2, r3
+ dec r5d
+ jg .loop
+ VAR2_END
+%endif
+
+INIT_XMM
+cglobal x264_pixel_var2_8x8_sse2, 5,6,8
+ VAR_START 1
+ mov r5d, 4
+.loop:
+ movq m1, [r0]
+ movhps m1, [r0+r1]
+ movq m3, [r2]
+ movhps m3, [r2+r3]
+ DEINTB 0, 1, 2, 3, 7
+ psubw m0, m2
+ psubw m1, m3
+ paddw m5, m0
+ paddw m5, m1
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ paddd m6, m0
+ paddd m6, m1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ dec r5d
+ jg .loop
+ VAR2_END
+
+cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
+ pxor m5, m5 ; sum of diffs
+ pxor m6, m6 ; sum of squared diffs
+ mova m7, [hsub_mul GLOBAL] ; interleaved (+1,-1) multipliers for pmaddubsw
+ mov r5d, 2 ; 2 iterations of 4 rows each
+.loop:
+ movq m0, [r0]
+ movq m2, [r2]
+ movq m1, [r0+r1]
+ movq m3, [r2+r3]
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ punpcklbw m0, m2 ; interleave fenc/fdec bytes pairwise
+ punpcklbw m1, m3
+ movq m2, [r0]
+ movq m3, [r2]
+ punpcklbw m2, m3
+ movq m3, [r0+r1]
+ movq m4, [r2+r3]
+ punpcklbw m3, m4
+ pmaddubsw m0, m7 ; fenc*1 + fdec*(-1): 8 signed word diffs per register
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ pmaddubsw m3, m7
+ paddw m5, m0
+ paddw m5, m1
+ paddw m5, m2
+ paddw m5, m3
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m6, m0
+ paddd m6, m1
+ paddd m6, m2
+ paddd m6, m3
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ dec r5d
+ jg .loop
+ VAR2_END
;=============================================================================
; SATD
void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
+int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x8_sse2( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
dct4x4[3][0][0] = 0;
}
+/* 2x2 Hadamard of the four chroma DC values; "dconly" because it works on
+ * a standalone 2x2 array instead of extracting (and zeroing) the DCs from
+ * four full 4x4 blocks the way dct2x2dc does. */
+static inline void dct2x2dc_dconly( int16_t d[2][2] )
+{
+ int d0 = d[0][0] + d[0][1];
+ int d1 = d[1][0] + d[1][1];
+ int d2 = d[0][0] - d[0][1];
+ int d3 = d[1][0] - d[1][1];
+ d[0][0] = d0 + d1;
+ d[1][0] = d2 + d3;
+ d[0][1] = d0 - d1;
+ d[1][1] = d2 - d3;
+}
+
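Since the 2x2 Hadamard matrix H = [[1,1],[1,-1]] satisfies H*H = 2*I, applying dct2x2dc_dconly twice returns the input scaled by 4, which gives a cheap sanity check (a sketch, not part of the patch):

    int16_t d[2][2]   = { { 7, -3 }, { 2, 5 } };
    int16_t ref[2][2] = { { 28, -12 }, { 8, 20 } };
    dct2x2dc_dconly( d );   /* the 2x2 Hadamard is self-inverse up to x4 */
    dct2x2dc_dconly( d );
    assert( !memcmp( d, ref, sizeof(d) ) );
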
static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
{
int i, ch, nz, nz_dc;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
+ DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
h->mb.i_cbp_chroma = 0;
+ /* Early termination: check variance of chroma residual before encoding.
+ * Don't bother trying early termination at low QPs.
+ * Values are experimentally derived. */
+ if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) )
+ {
+ int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
+ int ssd[2];
+ int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
+ score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
+ if( score < thresh*4 )
+ {
+ h->mb.cache.non_zero_count[x264_scan8[16]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[17]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[18]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[19]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[20]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[21]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[22]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[23]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[25]] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[26]] = 0;
+ for( ch = 0; ch < 2; ch++ )
+ {
+ if( ssd[ch] > thresh )
+ {
+ h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
+ dct2x2dc_dconly( dct2x2 );
+ if( h->mb.b_trellis )
+ nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
+ else
+ nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
+ if( nz_dc )
+ {
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+ idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
+ h->mb.i_cbp_chroma = 1;
+ }
+ }
+ }
+ return;
+ }
+ }
+
for( ch = 0; ch < 2; ch++ )
{
uint8_t *p_src = h->mb.pic.p_fenc[1+ch];
int i_decimate_score = 0;
int nz_ac = 0;
- DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
if( h->mb.b_lossless )
TEST_PIXEL_VAR( PIXEL_8x8 );
report( "pixel var :" );
+ ok = 1; used_asm = 0;
+ if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 )
+ {
+ int res_c, res_asm, ssd_c, ssd_asm;
+ set_func_name( "var2_8x8" );
+ used_asm = 1;
+ res_c = call_c( pixel_c.var2_8x8, buf1, 16, buf2, 16, &ssd_c );
+ res_asm = call_a( pixel_asm.var2_8x8, buf1, 16, buf2, 16, &ssd_asm );
+ if( res_c != res_asm || ssd_c != ssd_asm )
+ {
+ ok = 0;
+ fprintf( stderr, "var2_8x8: %d != %d or %d != %d [FAILED]\n", res_c, res_asm, ssd_c, ssd_asm );
+ }
+ }
+
+ report( "pixel var2 :" );
+
for( i=0, ok=1, used_asm=0; i<4; i++ )
if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
{
DECLARE_ALIGNED_16( int16_t dct2[16][4][4] );
DECLARE_ALIGNED_16( int16_t dct4[16][4][4] );
DECLARE_ALIGNED_16( int16_t dct8[4][8][8] );
+ DECLARE_ALIGNED_8( int16_t dctdc[2][2][2] );
x264_t h_buf;
x264_t *h = &h_buf;
ok = 1; used_asm = 0;
TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
+ TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4*2 );
TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
report( "sub_dct4 :" );