X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=encoder%2Fmacroblock.c;h=7d03e41b21c6008950ea831b3ab6ae8e1de5eda5;hb=ec3d09554addbcecb8cf82f3ff33ac737a6f996b;hp=0eafd586c5accb9506ec0dcdbd178285653d4f7f;hpb=27b73b3b86524ec9b0bdf8310a55081898b408c0;p=x264 diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 0eafd586..7d03e41b 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -64,7 +64,7 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max ) { int i_run; - if( abs( dct[idx--] ) > 1 ) + if( (unsigned)(dct[idx--] + 1) > 2 ) return 9; i_run = 0; @@ -144,6 +144,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale ) int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE; h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od ); dct4x4[0][block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0]; + h->dct.luma4x4[i][0] = 0; } h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] ); return; @@ -154,6 +155,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale ) { /* copy dc coeff */ dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0]; + dct4x4[1+i][0][0] = 0; /* quant/scan/dequant */ if( h->mb.b_trellis ) @@ -205,6 +207,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE; h->zigzagf.sub_4x4( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od ); h->dct.chroma_dc[ch][i] = h->dct.luma4x4[16+i+ch*4][0]; + h->dct.luma4x4[16+i+ch*4][0] = 0; } continue; } @@ -215,6 +218,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) { /* copy dc coeff */ dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0]; + dct4x4[i][0][0] = 0; /* no trellis; it doesn't seem to help chroma noticeably */ h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qscale], h->quant4_bias[CQM_4IC+b_inter][i_qscale] ); @@ -257,7 +261,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) h->mb.i_cbp_chroma = 0; for( i = 0; i < 8; i++ ) { - int nz = array_non_zero_count( h->dct.luma4x4[16+i]+1, 15 ); + int nz = array_non_zero( h->dct.luma4x4[16+i] ); h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz; h->mb.i_cbp_chroma |= nz; } @@ -269,15 +273,9 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) static void x264_macroblock_encode_skip( x264_t *h ) { - int i; h->mb.i_cbp_luma = 0x00; h->mb.i_cbp_chroma = 0x00; - - for( i = 0; i < 16+8; i++ ) - { - h->mb.cache.non_zero_count[x264_scan8[i]] = 0; - } - + memset( h->mb.cache.non_zero_count, 0, X264_SCAN8_SIZE ); /* store cbp */ h->mb.cbp[h->mb.i_mb_xy] = 0; } @@ -293,19 +291,21 @@ void x264_macroblock_encode_pskip( x264_t *h ) const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1], h->mb.mv_min[1], h->mb.mv_max[1] ); - /* Motion compensation XXX probably unneeded */ - h->mc.mc_luma( h->mb.pic.p_fdec[0], FDEC_STRIDE, - h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0], - mvx, mvy, 16, 16 ); + /* don't do pskip motion compensation if it was already done in macroblock_analyse */ + if( !h->mb.b_skip_mc ) + { + h->mc.mc_luma( h->mb.pic.p_fdec[0], FDEC_STRIDE, + h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0], + mvx, mvy, 16, 16 ); - /* Chroma MC */ - h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], - mvx, mvy, 8, 8 ); + h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE, + h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], + mvx, mvy, 8, 8 ); - h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], - mvx, mvy, 8, 8 ); + h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE, + h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], + mvx, mvy, 8, 8 ); + } x264_macroblock_encode_skip( h ); } @@ -319,7 +319,8 @@ void x264_macroblock_encode( x264_t *h ) int i_qp = h->mb.i_qp; int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate; int b_force_no_skip = 0; - int i; + int i,j,idx; + uint8_t nnz8x8[4] = {1,1,1,1}; if( h->sh.b_mbaff && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride @@ -346,8 +347,9 @@ void x264_macroblock_encode( x264_t *h ) } if( h->mb.i_type == B_SKIP ) { - /* XXX motion compensation is probably unneeded */ - x264_mb_mc( h ); + /* don't do bskip motion compensation if it was already done in macroblock_analyse */ + if( !h->mb.b_skip_mc ) + x264_mb_mc( h ); x264_macroblock_encode_skip( h ); return; } @@ -383,6 +385,8 @@ void x264_macroblock_encode( x264_t *h ) h->predict_8x8[i_mode]( p_dst, edge ); x264_mb_encode_i8x8( h, i, i_qp ); } + for( i = 0; i < 4; i++ ) + nnz8x8[i] = array_non_zero( h->dct.luma8x8[i] ); } else if( h->mb.i_type == I_4x4 ) { @@ -410,11 +414,12 @@ void x264_macroblock_encode( x264_t *h ) } else /* Inter MB */ { - int i8x8, i4x4, idx; + int i8x8, i4x4; int i_decimate_mb = 0; - /* Motion compensation */ - x264_mb_mc( h ); + /* Don't repeat motion compensation if it was already done in non-RD transform analysis */ + if( !h->mb.b_skip_mc ) + x264_mb_mc( h ); if( h->mb.b_lossless ) { @@ -430,7 +435,6 @@ void x264_macroblock_encode( x264_t *h ) else if( h->mb.b_transform_8x8 ) { DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] ); - int nnz8x8[4] = {1,1,1,1}; b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] ); @@ -450,18 +454,14 @@ void x264_macroblock_encode( x264_t *h ) int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 ); i_decimate_mb += i_decimate_8x8; if( i_decimate_8x8 < 4 ) - { - memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) ); - memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) ); nnz8x8[idx] = 0; - } } else nnz8x8[idx] = array_non_zero( dct8x8[idx] ); } if( i_decimate_mb < 6 && b_decimate ) - memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) ); + *(uint32_t*)nnz8x8 = 0; else { for( idx = 0; idx < 4; idx++ ) @@ -475,7 +475,6 @@ void x264_macroblock_encode( x264_t *h ) else { DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] ); - int nnz8x8[4] = {1,1,1,1}; h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] ); for( i8x8 = 0; i8x8 < 4; i8x8++ ) @@ -496,23 +495,19 @@ void x264_macroblock_encode( x264_t *h ) h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] ); - - if( b_decimate ) + + if( b_decimate && i_decimate_8x8 <= 6 ) i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 ); } /* decimate this 8x8 block */ i_decimate_mb += i_decimate_8x8; if( i_decimate_8x8 < 4 && b_decimate ) - { - memset( &dct4x4[i8x8*4], 0, 4 * sizeof( *dct4x4 ) ); - memset( &h->dct.luma4x4[i8x8*4], 0, 4 * sizeof( *h->dct.luma4x4 ) ); nnz8x8[i8x8] = 0; - } } if( i_decimate_mb < 6 && b_decimate ) - memset( h->dct.luma4x4, 0, 16 * sizeof( *h->dct.luma4x4 ) ); + *(uint32_t*)nnz8x8 = 0; else { for( i8x8 = 0; i8x8 < 4; i8x8++ ) @@ -543,34 +538,38 @@ void x264_macroblock_encode( x264_t *h ) { for( i = 0; i < 16; i++ ) { - const int nz = array_non_zero_count( h->dct.luma4x4[i]+1, 15 ); + int nz = array_non_zero( h->dct.luma4x4[i] ); h->mb.cache.non_zero_count[x264_scan8[i]] = nz; - if( nz > 0 ) - h->mb.i_cbp_luma = 0x0f; - } - } - else if( h->mb.b_transform_8x8 ) - { - /* coded_block_flag is enough for CABAC. - * the full non_zero_count is done only in CAVLC. */ - for( i = 0; i < 4; i++ ) - { - const int nz = array_non_zero( h->dct.luma8x8[i] ); - int j; - for( j = 0; j < 4; j++ ) - h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz; - if( nz > 0 ) - h->mb.i_cbp_luma |= 1 << i; + h->mb.i_cbp_luma |= nz; } + h->mb.i_cbp_luma *= 0xf; } else { - for( i = 0; i < 16; i++ ) + for( i = 0; i < 4; i++) { - const int nz = array_non_zero_count( h->dct.luma4x4[i], 16 ); - h->mb.cache.non_zero_count[x264_scan8[i]] = nz; - if( nz > 0 ) - h->mb.i_cbp_luma |= 1 << (i/4); + if(!nnz8x8[i]) + { + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+i*4]] = 0; + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+i*4]] = 0; + } + else if( h->mb.b_transform_8x8 ) + { + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+4*i]] = nnz8x8[i] * 0x0101; + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+4*i]] = nnz8x8[i] * 0x0101; + h->mb.i_cbp_luma |= nnz8x8[i] << i; + } + else + { + int nz, cbp = 0; + for( j = 0; j < 4; j++ ) + { + nz = array_non_zero( h->dct.luma4x4[j+4*i] ); + h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz; + cbp |= nz; + } + h->mb.i_cbp_luma |= cbp << i; + } } } @@ -590,17 +589,15 @@ void x264_macroblock_encode( x264_t *h ) if( !b_force_no_skip ) { if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 && - h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma == 0x00 && - h->mb.cache.mv[0][x264_scan8[0]][0] == h->mb.cache.pskip_mv[0] && - h->mb.cache.mv[0][x264_scan8[0]][1] == h->mb.cache.pskip_mv[1] && - h->mb.cache.ref[0][x264_scan8[0]] == 0 ) + !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) && + *(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv + && h->mb.cache.ref[0][x264_scan8[0]] == 0 ) { h->mb.i_type = P_SKIP; } /* Check for B_SKIP */ - if( h->mb.i_type == B_DIRECT && - h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 ) + if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) ) { h->mb.i_type = B_SKIP; } @@ -614,7 +611,7 @@ void x264_macroblock_encode( x264_t *h ) *****************************************************************************/ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) { - DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] ); + DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); DECLARE_ALIGNED_16( int16_t dct2x2[2][2] ); DECLARE_ALIGNED_16( int16_t dctscan[16] ); @@ -637,27 +634,23 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) mvp[0], mvp[1], 16, 16 ); } - /* get luma diff */ - h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], - h->mb.pic.p_fdec[0] ); - for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ ) { + int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8; + int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8; + /* get luma diff */ + h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[0] + fenc_offset, + h->mb.pic.p_fdec[0] + fdec_offset ); /* encode one 4x4 block */ for( i4x4 = 0; i4x4 < 4; i4x4++ ) { - const int idx = i8x8 * 4 + i4x4; - - h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - h->zigzagf.scan_4x4( dctscan, dct4x4[idx] ); - + h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); + if( !array_non_zero(dct4x4[i4x4]) ) + continue; + h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); i_decimate_mb += x264_mb_decimate_score( dctscan, 16 ); - if( i_decimate_mb >= 6 ) - { - /* not as P_SKIP */ return 0; - } } } @@ -685,26 +678,23 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) dct2x2[1][1] = dct4x4[3][0][0]; h->dctf.dct2x2dc( dct2x2 ); h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ); - if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1] ) - { - /* can't be */ + if( array_non_zero(dct2x2) ) return 0; - } /* calculate dct coeffs */ for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ ) { h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); + if( !array_non_zero(dct4x4[i4x4]) ) + continue; h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); - i_decimate_mb += x264_mb_decimate_score( dctscan+1, 15 ); if( i_decimate_mb >= 7 ) - { return 0; - } } } + h->mb.b_skip_mc = 1; return 1; } @@ -805,10 +795,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) int i4; DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); - h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); + for( i4 = 0; i4 < 4; i4++ ) + h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); for( i4 = 0; i4 < 4; i4++ ) h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] ); @@ -848,9 +836,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) } } - if( nnz8x8 ) - h->mb.i_cbp_luma |= (1 << i8); - else - h->mb.i_cbp_luma &= ~(1 << i8); + h->mb.i_cbp_luma &= ~(1 << i8); + h->mb.i_cbp_luma |= nnz8x8 << i8; h->mb.i_cbp_chroma = 0x02; }