Modify quantization to also calculate array_non_zero.
PPC assembly changes by gpoirier.
The new quant asm includes some small tweaks to quant itself, plus SSE4 versions that use ptest for the array_non_zero check.
Use this new feature of quant to merge nnz/cbp calculation directly into encoding, avoiding many unnecessary calls to dequant/zigzag/decimate/etc.
Also add a new i16x16 DC-only iDCT, with asm.
Since intra encoding now calculates nnz directly, skip_intra now backs up nnz/cbp as well.
Output should be equivalent except when using p4x4+RDO, due to a subtlety involving stale nnz values left over from previous RDO calls.
Performance increase in macroblock_encode: ~18% with dct-decimate, ~30% without, at CRF 25.
Overall performance increase: 0-6%, depending on encoding settings.
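
To illustrate the new contract (a minimal C sketch mirroring this patch's
QUANT_ONE arithmetic; the function name is made up for illustration), each
quant function now accumulates a nonzero flag while quantizing in place and
returns it:

    #include <stdint.h>

    /* Quantize 16 coefficients in place; return 1 if any survive.
     * Same arithmetic as the QUANT_ONE macro in the C reference code. */
    static int quant_4x4_sketch( int16_t dct[16], uint16_t mf[16],
                                 uint16_t bias[16] )
    {
        int i, nz = 0;
        for( i = 0; i < 16; i++ )
        {
            if( dct[i] > 0 )
                dct[i] =  (bias[i] + dct[i]) * mf[i] >> 16;
            else
                dct[i] = -((bias[i] - dct[i]) * mf[i] >> 16);
            nz |= dct[i];
        }
        return !!nz;
    }

The caller stores the return value straight into the nnz cache and ORs it
into the CBP, skipping zigzag/dequant/iDCT entirely when it is zero; this is
what lets macroblock_encode drop most of its array_non_zero calls.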
DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
+ uint32_t i4x4_nnz_buf[4];
+ uint32_t i8x8_nnz_buf[4];
+ int i4x4_cbp;
+ int i8x8_cbp;
/* Psy trellis DCT data */
DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
}
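+/* DC-only variant of add16x16_idct: each of the 16 4x4 sub-blocks just adds
+ * its rounded DC term, (dc+32)>>6, to the predicted pixels. */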
+static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
+{
+ int i;
+ for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
+ {
+ add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
+ add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
+ add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
+ add4x4_idct_dc( &p_dst[12], dct[i][3] );
+ }
+}
+
/****************************************************************************
* x264_dct_init:
dctf->sub16x16_dct = sub16x16_dct;
dctf->add16x16_idct = add16x16_idct;
+ dctf->add16x16_idct_dc = add16x16_idct_dc;
dctf->sub8x8_dct8 = sub8x8_dct8;
dctf->add8x8_idct8 = add8x8_idct8;
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
dctf->add4x4_idct = x264_add4x4_idct_mmx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
}
if( cpu&X264_CPU_SSSE3 )
+ {
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+ }
#endif //HAVE_MMX
#ifdef ARCH_PPC
void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
+ void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[4][4] );
void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] );
mfvB = vec_ld((idx1), mf); \
biasvA = vec_ld((idx0), bias); \
biasvB = vec_ld((idx1), bias); \
-mskA = vec_cmplt(temp1v, zerov); \
-mskB = vec_cmplt(temp2v, zerov); \
-coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); \
-coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \
+mskA = vec_cmplt(temp1v, zero_s16v); \
+mskB = vec_cmplt(temp2v, zero_s16v); \
+coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \
+coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v); \
coefvA = vec_adds(coefvA, biasvA); \
coefvB = vec_adds(coefvB, biasvB); \
multEvenvA = vec_mule(coefvA, mfvA); \
temp1v = vec_adds(temp1v, vec_and(mskA, one)); \
vec_st(temp1v, (idx0), (int16_t*)dct); \
temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
+nz = vec_or(nz, vec_or(temp1v, temp2v)); \
vec_st(temp2v, (idx1), (int16_t*)dct);
-void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
{
+ LOAD_ZERO;
vector bool short mskA;
vec_u32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
vec_u16_t mfvA;
vec_u16_t biasvA;
- vec_s16_t zerov, one;
+ vec_s16_t one = vec_splat_s16(1);
+ vec_s16_t nz = zero_s16v;
vector bool short mskB;
vec_u16_t coefvB;
qbits_u.s[0]=16;
i_qbitsv = vec_splat(qbits_u.v, 0);
- zerov = vec_splat_s16(0);
- one = vec_splat_s16(1);
-
QUANT_16_U( 0, 16 );
+ return vec_any_ne(nz, zero_s16v);
}
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U_DC( idx0, idx1 ) \
temp1v = vec_ld((idx0), *dct); \
temp2v = vec_ld((idx1), *dct); \
-mskA = vec_cmplt(temp1v, zerov); \
-mskB = vec_cmplt(temp2v, zerov); \
-coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
-coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \
+mskA = vec_cmplt(temp1v, zero_s16v); \
+mskB = vec_cmplt(temp2v, zero_s16v); \
+coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
+coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
coefvA = vec_add(coefvA, biasv); \
coefvB = vec_add(coefvB, biasv); \
multEvenvA = vec_mule(coefvA, mfv); \
temp1v = vec_add(temp1v, vec_and(mskA, one)); \
vec_st(temp1v, (idx0), (int16_t*)dct); \
temp2v = vec_add(temp2v, vec_and(mskB, one)); \
+nz = vec_or(nz, vec_or(temp1v, temp2v)); \
vec_st(temp2v, (idx1), (int16_t*)dct);
-void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
+int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
{
+ LOAD_ZERO;
vector bool short mskA;
vec_u32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
- vec_s16_t zerov, one;
+ vec_s16_t one = vec_splat_s16(1);
+ vec_s16_t nz = zero_s16v;
vector bool short mskB;
vec_u16_t coefvB;
bias_u.s[0]=bias;
biasv = vec_splat(bias_u.v, 0);
- zerov = vec_splat_s16(0);
- one = vec_splat_s16(1);
-
QUANT_16_U_DC( 0, 16 );
+ return vec_any_ne(nz, zero_s16v);
}
// DC quant of a whole 2x2 block
#define QUANT_4_U_DC( idx0 ) \
const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \
temp1v = vec_ld((idx0), *dct); \
-mskA = vec_cmplt(temp1v, zerov); \
-coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
+mskA = vec_cmplt(temp1v, zero_s16v); \
+coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
coefvA = vec_add(coefvA, biasv); \
multEvenvA = vec_mule(coefvA, mfv); \
multOddvA = vec_mulo(coefvA, mfv); \
temp2v = vec_xor(temp2v, mskA); \
temp2v = vec_add(temp2v, vec_and(mskA, one)); \
temp1v = vec_sel(temp1v, temp2v, sel); \
+nz = vec_or(nz, temp1v); \
vec_st(temp1v, (idx0), (int16_t*)dct);
-void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
+int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
{
+ LOAD_ZERO;
vector bool short mskA;
vec_u32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
- vec_s16_t zerov, one;
+ vec_s16_t one = vec_splat_s16(1);
+ vec_s16_t nz = zero_s16v;
vec_s16_t temp1v, temp2v;
bias_u.s[0]=bias;
biasv = vec_splat(bias_u.v, 0);
- zerov = vec_splat_s16(0);
- one = vec_splat_s16(1);
-
+ static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0);
QUANT_4_U_DC(0);
+ return vec_any_ne(vec_and(nz, mask2), zero_s16v);
}
-void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
{
+ LOAD_ZERO;
vector bool short mskA;
vec_u32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
vec_u16_t mfvA;
vec_u16_t biasvA;
- vec_s16_t zerov, one;
-
+ vec_s16_t one = vec_splat_s16(1);
+ vec_s16_t nz = zero_s16v;
+
vector bool short mskB;
vec_u16_t coefvB;
vec_u32_t multEvenvB, multOddvB;
vec_u16_t mfvB;
vec_u16_t biasvB;
-
+
vec_s16_t temp1v, temp2v;
vec_u32_u qbits_u;
qbits_u.s[0]=16;
i_qbitsv = vec_splat(qbits_u.v, 0);
-
- zerov = vec_splat_s16(0);
- one = vec_splat_s16(1);
int i;
for ( i=0; i<4; i++ ) {
QUANT_16_U( i*2*16, i*2*16+16 );
}
+ return vec_any_ne(nz, zero_s16v);
}
#define DEQUANT_SHL() \
#ifndef X264_PPC_QUANT_H
#define X264_PPC_QUANT_H
-void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
-void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
+int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
+int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
(coef) = (f + (coef)) * (mf) >> 16; \
else \
(coef) = - ((f - (coef)) * (mf) >> 16); \
+ nz |= (coef); \
}
-static void quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+static int quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
{
- int i;
+ int i, nz = 0;
for( i = 0; i < 64; i++ )
QUANT_ONE( dct[0][i], mf[i], bias[i] );
+ return !!nz;
}
-static void quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+static int quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
{
- int i;
+ int i, nz = 0;
for( i = 0; i < 16; i++ )
QUANT_ONE( dct[0][i], mf[i], bias[i] );
+ return !!nz;
}
-static void quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
+static int quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
{
- int i;
+ int i, nz = 0;
for( i = 0; i < 16; i++ )
QUANT_ONE( dct[0][i], mf, bias );
+ return !!nz;
}
-static void quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
+static int quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
{
+ int nz = 0;
QUANT_ONE( dct[0][0], mf, bias );
QUANT_ONE( dct[0][1], mf, bias );
QUANT_ONE( dct[0][2], mf, bias );
QUANT_ONE( dct[0][3], mf, bias );
+ return !!nz;
}
#define DEQUANT_SHL( x ) \
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
}
+
+ if( cpu&X264_CPU_SSE4 )
+ {
+ pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
+ pf->quant_4x4 = x264_quant_4x4_sse4;
+ pf->quant_8x8 = x264_quant_8x8_sse4;
+ }
#endif // HAVE_MMX
#ifdef ARCH_PPC
typedef struct
{
- void (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
- void (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
- void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
- void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
+ int (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+ int (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+ int (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
+ int (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
+pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
SECTION .text
movhps [r0+FDEC_STRIDE* 3], xmm5
ret
+cglobal x264_add16x16_idct_dc_mmx, 2,3
+ mov r2, 4
+.loop:
+ movq mm0, [r1]
+ pxor mm1, mm1
+ paddw mm0, [pw_32 GLOBAL]
+ psraw mm0, 6
+ psubw mm1, mm0
+ packuswb mm0, mm0
+ packuswb mm1, mm1
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm2, mm0, 0xFA
+ pshufw mm3, mm1, 0xFA
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ ADD_DC mm0, mm1, r0
+ ADD_DC mm2, mm3, r0+8
+ add r1, 8
+ add r0, FDEC_STRIDE*4
+ dec r2
+ jg .loop
+ ret
+
+%macro IDCT_DC_STORE 3
+ movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
+ movdqa xmm5, [r0+%1+FDEC_STRIDE*1]
+ movdqa xmm6, [r0+%1+FDEC_STRIDE*2]
+ movdqa xmm7, [r0+%1+FDEC_STRIDE*3]
+ paddusb xmm4, %2
+ paddusb xmm5, %2
+ paddusb xmm6, %2
+ paddusb xmm7, %2
+ psubusb xmm4, %3
+ psubusb xmm5, %3
+ psubusb xmm6, %3
+ psubusb xmm7, %3
+ movdqa [r0+%1+FDEC_STRIDE*0], xmm4
+ movdqa [r0+%1+FDEC_STRIDE*1], xmm5
+ movdqa [r0+%1+FDEC_STRIDE*2], xmm6
+ movdqa [r0+%1+FDEC_STRIDE*3], xmm7
+%endmacro
+
+cglobal x264_add16x16_idct_dc_sse2, 2,2
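+ ; the call runs .loop once as a subroutine: its ret lands on the add below,
+ ; which falls through into .loop again, and the second ret exits for real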
+ call .loop
+ add r0, FDEC_STRIDE*4
+.loop:
+ add r0, FDEC_STRIDE*4
+ movq xmm0, [r1+0]
+ movq xmm2, [r1+8]
+ add r1, 16
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm2, xmm2
+ pxor xmm1, xmm1
+ pxor xmm3, xmm3
+ paddw xmm0, [pw_32 GLOBAL]
+ paddw xmm2, [pw_32 GLOBAL]
+ psraw xmm0, 6
+ psraw xmm2, 6
+ psubw xmm1, xmm0
+ psubw xmm3, xmm2
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm0, xmm0
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm1, xmm1
+ punpckhbw xmm3, xmm3
+ IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
+ IDCT_DC_STORE 0, xmm2, xmm3
+ ret
+
+cglobal x264_add16x16_idct_dc_ssse3, 2,2
+ call .loop
+ add r0, FDEC_STRIDE*4
+.loop:
+ add r0, FDEC_STRIDE*4
+ movdqa xmm0, [r1]
+ add r1, 16
+ pxor xmm1, xmm1
+ paddw xmm0, [pw_32 GLOBAL]
+ psraw xmm0, 6
+ psubw xmm1, xmm0
+ movdqa xmm5, [ pb_idctdc_unpack GLOBAL]
+ movdqa xmm6, [pb_idctdc_unpack2 GLOBAL]
+ packuswb xmm0, xmm0
+ packuswb xmm1, xmm1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ pshufb xmm0, xmm5
+ pshufb xmm2, xmm6
+ pshufb xmm1, xmm5
+ pshufb xmm3, xmm6
+ IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
+ IDCT_DC_STORE 0, xmm2, xmm3
+ ret
+
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[4][4] );
void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add16x16_idct_dc_sse2( uint8_t *p_dst, int16_t dct[4][4] );
void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] );
+void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[4][4] );
void x264_dct4x4dc_mmx ( int16_t d[4][4] );
void x264_idct4x4dc_mmx ( int16_t d[4][4] );
pb_1: times 16 db 1
pw_1: times 8 dw 1
pd_1: times 4 dd 1
+pb_01: times 8 db 0, 1
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
SECTION .text
-%macro QUANT_DC_START 0
+%macro QUANT_DC_START_MMX 0
movd m6, r1m ; mf
movd m7, r2m ; bias
%ifidn m0, mm0
%endif
%endmacro
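+; Broadcast the 16-bit mf/bias values to all words via pshufb with pb_01.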
+%macro QUANT_DC_START_SSSE3 0
+ movdqa m5, [pb_01 GLOBAL]
+ movd m6, r1m ; mf
+ movd m7, r2m ; bias
+ pshufb m6, m5
+ pshufb m7, m5
+%endmacro
+
%macro PABSW_MMX 2
pxor %1, %1
pcmpgtw %1, %2
psignw %1, %2
%endmacro
-%macro QUANT_ONE 3
+%macro QUANT_ONE 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
pmulhuw m0, %2 ; divide
PSIGNW m0, m1 ; restore sign
mova %1, m0 ; store
+%if %4
+ por m5, m0
+%else
+ SWAP m5, m0
+%endif
+%endmacro
+
+%macro QUANT_TWO 7
+ mova m1, %1
+ mova m3, %2
+ PABSW m0, m1
+ PABSW m2, m3
+ paddusw m0, %5
+ paddusw m2, %6
+ pmulhuw m0, %3
+ pmulhuw m2, %4
+ PSIGNW m0, m1
+ PSIGNW m2, m3
+ mova %1, m0
+ mova %2, m2
+%if %7
+ por m5, m0
+ por m5, m2
+%else
+ SWAP m5, m0
+ por m5, m2
+%endif
+%endmacro
+
+%macro QUANT_END_MMX 0
+ xor eax, eax
+%ifndef ARCH_X86_64
+%if mmsize==8
+ packsswb m5, m5
+ movd ecx, m5
+ test ecx, ecx
+%else
+ pxor m4, m4
+ pcmpeqb m5, m4
+ pmovmskb ecx, m5
+ cmp ecx, (1<<mmsize)-1
+%endif
+%else
+%if mmsize==16
+ packsswb m5, m5
+%endif
+ movq rcx, m5
+ test rcx, rcx
+%endif
+ setne al
+%endmacro
+
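+; ptest sets ZF iff (m5 AND m5) == 0, so setne gives the nonzero flag directly.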
+%macro QUANT_END_SSE4 0
+ xor eax, eax
+ ptest m5, m5
+ setne al
%endmacro
;-----------------------------------------------------------------------------
%macro QUANT_DC 2
cglobal %1, 1,1
QUANT_DC_START
+%if %2==1
+ QUANT_ONE [r0], m6, m7, 0
+%else
%assign x 0
-%rep %2
- QUANT_ONE [r0+x], m6, m7
-%assign x x+mmsize
+%rep %2/2
+ QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
+%assign x x+mmsize*2
%endrep
+%endif
+ QUANT_END
RET
%endmacro
;-----------------------------------------------------------------------------
-; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+; int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal %1, 3,3
%assign x 0
-%rep %2
- QUANT_ONE [r0+x], [r1+x], [r2+x]
-%assign x x+mmsize
+%rep %2/2
+ QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
+%assign x x+mmsize*2
%endrep
+ QUANT_END
RET
%endmacro
INIT_MMX
+%define QUANT_END QUANT_END_MMX
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
+%define QUANT_DC_START QUANT_DC_START_MMX
QUANT_DC x264_quant_2x2_dc_mmxext, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
QUANT_DC x264_quant_4x4_dc_mmxext, 4
INIT_MMX
QUANT_DC x264_quant_2x2_dc_ssse3, 1
+%define QUANT_END QUANT_END_SSE4
+; Not faster on Conroe, so only used in SSE4 versions.
+%define QUANT_DC_START QUANT_DC_START_SSSE3
+INIT_XMM
+QUANT_DC x264_quant_4x4_dc_sse4, 2
+QUANT_AC x264_quant_4x4_sse4, 2
+QUANT_AC x264_quant_8x8_sse4, 8
#ifndef X264_I386_QUANT_H
#define X264_I386_QUANT_H
-void x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias );
-void x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias );
-void x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-void x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-void x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias );
-void x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-void x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-void x264_quant_2x2_dc_ssse3( int16_t dct[2][2], int mf, int bias );
-void x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
-void x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-void x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias );
+int x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias );
+int x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias );
+int x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_2x2_dc_ssse3( int16_t dct[2][2], int mf, int bias );
+int x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
+int x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse4( int16_t dct[4][4], int mf, int bias );
+int x264_quant_4x4_sse4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse4( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_4x4dc_mmxext( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
+ h->mb.i_cbp_luma = 0;
b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0];
// FIXME some bias like in i4x4?
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
+ h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
+ h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
+ h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
+ h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+ h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
}
{
int i_cost;
int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
+ h->mb.i_cbp_luma = 0;
b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0];
if( a->i_mbrd )
i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
+ h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
+ h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
+ h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
+ h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+ h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
}
x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
+ /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
+ * for future blocks are those left over from previous RDO calls. */
for( i = 0; i < 4; i++ )
{
int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_mode )
{
const int i_pred = x264_mb_predict_intra4x4_mode( h, 4*i8 );
- const int nnz = array_non_zero(h->dct.luma8x8[i8]);
i_mode = x264_mb_pred_mode4x4_fix( i_mode );
x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
- if( nnz )
- {
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101;
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
+ if( h->mb.i_cbp_luma & (1 << i8) )
block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] );
- }
- else
- {
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0;
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
- }
}
static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode )
const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 );
i_mode = x264_mb_pred_mode4x4_fix( i_mode );
x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
- h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
}
static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
{
h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
- h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
return h->out.bs.i_bits_encoded;
}
dct4x4[3][0][0] = 0;
}
-static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
if( h->mb.b_trellis )
- x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx );
+ return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx );
else
- h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
+ return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
}
-static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
{
int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
if( h->mb.b_trellis )
- x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
+ return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
else
- h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
+ return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
}
+/* All encoding functions must output the correct CBP and NNZ values.
+ * The entropy coding functions will check CBP first, then NNZ, before
+ * actually reading the DCT coefficients. NNZ still must be correct even
+ * if CBP is zero because of the use of NNZ values for context selection.
+ * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
+ * the exact count is only needed in CAVLC, where it is calculated by the
+ * residual coding and stored as necessary. */
+
+/* This means that decimation can be done merely by adjusting the CBP and NNZ
+ * rather than memsetting the coefficients. */
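+/* Illustrative example: decimating an 8x8 luma block now amounts to
+ *     h->mb.i_cbp_luma &= ~(1<<idx);
+ *     STORE_8x8_NNZ( idx, 0 );
+ * with no memset of the coefficient array. */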
+
void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
{
+ int nz;
uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
if( h->mb.b_lossless )
{
h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
+ nz = array_non_zero( h->dct.luma4x4[idx] );
+ h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
+ h->mb.i_cbp_luma |= nz<<(idx>>2);
return;
}
h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
- x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
-
- if( array_non_zero( dct4x4 ) )
+ nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
+ h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
+ if( nz )
{
+ h->mb.i_cbp_luma |= 1<<(idx>>2);
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
-
- /* output samples to fdec */
h->dctf.add4x4_idct( p_dst, dct4x4 );
}
- else
- memset( h->dct.luma4x4[idx], 0, sizeof(h->dct.luma4x4[idx]));
+}
+
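+/* Set the cached NNZ of all four 4x4 blocks in 8x8 block "idx" to nz (0 or 1). */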
+#define STORE_8x8_NNZ(idx,nz)\
+{\
+ *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\
+ *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\
}
void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
{
int x = 8 * (idx&1);
int y = 8 * (idx>>1);
+ int nz;
uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
if( h->mb.b_lossless )
{
h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
+ nz = array_non_zero( h->dct.luma8x8[idx] );
+ STORE_8x8_NNZ(idx,nz);
+ h->mb.i_cbp_luma |= nz<<idx;
return;
}
h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
- x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
-
+ nz = x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
- h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
- h->dctf.add8x8_idct8( p_dst, dct8x8 );
+ if( nz )
+ {
+ h->mb.i_cbp_luma |= 1<<idx;
+ h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
+ h->dctf.add8x8_idct8( p_dst, dct8x8 );
+ STORE_8x8_NNZ(idx,1);
+ }
+ else
+ STORE_8x8_NNZ(idx,0);
}
static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );
- int i;
+ int i, nz;
if( h->mb.b_lossless )
{
h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
dct_dc4x4[0][block_idx_yx_1d[i]] = h->dct.luma4x4[i][0];
h->dct.luma4x4[i][0] = 0;
+ nz = array_non_zero( h->dct.luma4x4[i] );
+ h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+ h->mb.i_cbp_luma |= nz;
}
+ h->mb.i_cbp_luma *= 0xf;
+ h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( dct_dc4x4 );
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
return;
}
h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
+
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
dct4x4[i][0][0] = 0;
/* quant/scan/dequant */
- x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
-
- h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
- h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
+ nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
+ h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+ if( nz )
+ {
+ h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
+ h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
+ h->mb.i_cbp_luma = 0xf;
+ }
}
h->dctf.dct4x4dc( dct_dc4x4 );
if( h->mb.b_trellis )
- x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1);
+ nz = x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1);
else
- h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
- h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
+ nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
- /* output samples to fdec */
- h->dctf.idct4x4dc( dct_dc4x4 );
- h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */
-
- /* calculate dct coeffs */
- for( i = 0; i < 16; i++ )
+ h->mb.cache.non_zero_count[x264_scan8[24]] = nz;
+ if( nz )
{
- /* copy dc coeff */
- dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
+ h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
+
+ /* output samples to fdec */
+ h->dctf.idct4x4dc( dct_dc4x4 );
+ h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */
+ if( h->mb.i_cbp_luma )
+ for( i = 0; i < 16; i++ )
+ dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
}
+
/* put pixels to fdec */
- h->dctf.add16x16_idct( p_dst, dct4x4 );
+ if( h->mb.i_cbp_luma )
+ h->dctf.add16x16_idct( p_dst, dct4x4 );
+ else if( nz )
+ h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
- int i, ch, nz;
+ int i, ch, nz, nz_dc;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
h->mb.i_cbp_chroma = 0;
uint8_t *p_src = h->mb.pic.p_fenc[1+ch];
uint8_t *p_dst = h->mb.pic.p_fdec[1+ch];
int i_decimate_score = 0;
+ int nz_ac = 0;
DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
for( i = 0; i < 4; i++ )
{
if( h->mb.b_trellis )
- x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 );
+ nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 );
else
- h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
- h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
-
- if( b_decimate )
- i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
+ nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
+ h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
+ if( nz )
+ {
+ nz_ac = 1;
+ h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
+ h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ if( b_decimate )
+ i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
+ }
}
if( h->mb.b_trellis )
- x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
+ nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
else
- h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
+ nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
- if( b_decimate && i_decimate_score < 7 )
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = nz_dc;
+
+ if( (b_decimate && i_decimate_score < 7) || !nz_ac )
{
/* Decimate the block */
h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
- if( !array_non_zero( dct2x2 ) ) /* Whole block is empty */
- {
- h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
+ if( !nz_dc ) /* Whole block is empty */
continue;
- }
/* DC-only */
- h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
}
else
{
- for( i = 0; i < 4; i++ )
+ h->mb.i_cbp_chroma = 1;
+ if( nz_dc )
{
- nz = array_non_zero( h->dct.luma4x4[16+ch*4+i] );
- h->mb.cache.non_zero_count[x264_scan8[16+ch*4+i]] = nz;
- h->mb.i_cbp_chroma |= nz;
- if( nz )
- h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+ idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
}
- /* Don't optimize for the AC-only case--it's very rare */
- h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( dct2x2 );
- zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
- idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
h->dctf.add8x8_idct( p_dst, dct4x4 );
}
}
int i_qp = h->mb.i_qp;
int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
int b_force_no_skip = 0;
- int i,j,idx;
- uint8_t nnz8x8[4] = {1,1,1,1};
+ int i,idx,nz;
+ h->mb.i_cbp_luma = 0;
+ h->mb.cache.non_zero_count[x264_scan8[24]] = 0;
if( h->sh.b_mbaff
&& h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0];
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1];
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2];
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3];
+ h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
/* In RD mode, restore the now-overwritten DCT data. */
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
x264_mb_encode_i8x8( h, i, i_qp );
}
- for( i = 0; i < 4; i++ )
- nnz8x8[i] = array_non_zero( h->dct.luma8x8[i] );
}
else if( h->mb.i_type == I_4x4 )
{
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0];
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1];
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2];
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3];
+ h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
/* In RD mode, restore the now-overwritten DCT data. */
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
- nnz8x8[i8x8] = array_non_zero( h->dct.luma8x8[i8x8] );
+ nz = array_non_zero( h->dct.luma8x8[i8x8] );
+ STORE_8x8_NNZ(i8x8,nz);
+ h->mb.i_cbp_luma |= nz << i8x8;
}
else
for( i4x4 = 0; i4x4 < 16; i4x4++ )
h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
+ nz = array_non_zero( h->dct.luma4x4[i4x4] );
+ h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
+ h->mb.i_cbp_luma |= nz << (i4x4>>2);
}
}
else if( h->mb.b_transform_8x8 )
{
if( h->mb.b_noise_reduction )
h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
- x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
+ nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
- h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
-
- if( b_decimate )
+ if( nz )
{
- int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
- i_decimate_mb += i_decimate_8x8;
- if( i_decimate_8x8 < 4 )
- nnz8x8[idx] = 0;
+ h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
+ if( b_decimate )
+ {
+ int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
+ i_decimate_mb += i_decimate_8x8;
+ if( i_decimate_8x8 >= 4 )
+ h->mb.i_cbp_luma |= 1<<idx;
+ }
+ else
+ h->mb.i_cbp_luma |= 1<<idx;
}
- else
- nnz8x8[idx] = array_non_zero( dct8x8[idx] );
}
if( i_decimate_mb < 6 && b_decimate )
- *(uint32_t*)nnz8x8 = 0;
+ {
+ h->mb.i_cbp_luma = 0;
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+ }
else
{
for( idx = 0; idx < 4; idx++ )
- if( nnz8x8[idx] )
+ {
+ if( h->mb.i_cbp_luma&(1<<idx) )
{
h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
+ STORE_8x8_NNZ(idx,1);
}
+ else
+ STORE_8x8_NNZ(idx,0);
+ }
}
}
else
for( i8x8 = 0; i8x8 < 4; i8x8++ )
{
- int i_decimate_8x8;
+ int i_decimate_8x8 = 0;
+ int cbp = 0;
/* encode one 4x4 block */
- i_decimate_8x8 = 0;
for( i4x4 = 0; i4x4 < 4; i4x4++ )
{
idx = i8x8 * 4 + i4x4;
if( h->mb.b_noise_reduction )
h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
- x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
+ nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
+ h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
- h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
-
- if( b_decimate && i_decimate_8x8 < 6 )
- i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
+ if( nz )
+ {
+ h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
+ h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
+ if( b_decimate && i_decimate_8x8 < 6 )
+ i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
+ cbp = 1;
+ }
}
/* decimate this 8x8 block */
i_decimate_mb += i_decimate_8x8;
- if( i_decimate_8x8 < 4 && b_decimate )
- nnz8x8[i8x8] = 0;
+ if( b_decimate )
+ {
+ if( i_decimate_8x8 < 4 )
+ STORE_8x8_NNZ(i8x8,0)
+ else
+ h->mb.i_cbp_luma |= 1<<i8x8;
+ }
+ else if( cbp )
+ {
+ h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
+ h->mb.i_cbp_luma |= 1<<i8x8;
+ }
}
- if( i_decimate_mb < 6 && b_decimate )
- *(uint32_t*)nnz8x8 = 0;
- else
+ if( b_decimate )
{
- for( i8x8 = 0; i8x8 < 4; i8x8++ )
- if( nnz8x8[i8x8] )
- {
- for( i = 0; i < 4; i++ )
- h->quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp );
- h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
- }
+ if( i_decimate_mb < 6 )
+ {
+ h->mb.i_cbp_luma = 0;
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
+ *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+ }
+ else
+ {
+ for( i8x8 = 0; i8x8 < 4; i8x8++ )
+ if( h->mb.i_cbp_luma&(1<<i8x8) )
+ h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
+ }
}
}
}
/* encode the 8x8 blocks */
x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
- /* coded block pattern and non_zero_count */
- h->mb.i_cbp_luma = 0x00;
- if( h->mb.i_type == I_16x16 )
- {
- for( i = 0; i < 16; i++ )
- {
- int nz = array_non_zero( h->dct.luma4x4[i] );
- h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
- h->mb.i_cbp_luma |= nz;
- }
- h->mb.i_cbp_luma *= 0xf;
- h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( h->dct.luma16x16_dc );
- }
- else
- {
- for( i = 0; i < 4; i++)
- {
- if(!nnz8x8[i])
- {
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+i*4]] = 0;
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+i*4]] = 0;
- }
- else if( h->mb.b_transform_8x8 )
- {
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+4*i]] = nnz8x8[i] * 0x0101;
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+4*i]] = nnz8x8[i] * 0x0101;
- h->mb.i_cbp_luma |= nnz8x8[i] << i;
- }
- else
- {
- int nz, cbp = 0;
- for( j = 0; j < 4; j++ )
- {
- nz = array_non_zero( h->dct.luma4x4[j+4*i] );
- h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
- cbp |= nz;
- }
- h->mb.i_cbp_luma |= cbp << i;
- }
- }
- h->mb.cache.non_zero_count[x264_scan8[24]] = 0;
- }
-
if( h->param.b_cabac )
{
i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
/* encode one 4x4 block */
for( i4x4 = 0; i4x4 < 4; i4x4++ )
{
- h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
- if( !array_non_zero(dct4x4[i4x4]) )
+ if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ) )
continue;
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
i_decimate_mb += h->quantf.decimate_score16( dctscan );
/* calculate dct DC */
dct2x2dc( dct2x2, dct4x4 );
- h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 );
- if( array_non_zero(dct2x2) )
+ if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
return 0;
/* calculate dct coeffs */
for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
{
- h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
- if( !array_non_zero(dct4x4[i4x4]) )
+ if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
continue;
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
i_decimate_mb += h->quantf.decimate_score15( dctscan );
uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
int nnz8x8 = 0;
- int ch;
+ int ch, nz;
x264_mb_mc_8x8( h, i8 );
{
h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101 * nnz8x8;
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101 * nnz8x8;
+ STORE_8x8_NNZ(i8,nnz8x8);
}
else
{
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec );
h->dct.luma4x4[16+i8+ch*4][0] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] );
}
- h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero( h->dct.luma4x4[16+i8] );
- h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero( h->dct.luma4x4[20+i8] );
}
else
{
{
DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
- x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
- h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
-
- if( b_decimate && !h->mb.b_trellis )
- nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );
- else
- nnz8x8 = array_non_zero( dct8x8 );
-
+ nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
if( nnz8x8 )
{
- h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
- h->dctf.add8x8_idct8( p_fdec, dct8x8 );
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
+ h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
+
+ if( b_decimate && !h->mb.b_trellis )
+ nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );
+
+ if( nnz8x8 )
+ {
+ h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
+ h->dctf.add8x8_idct8( p_fdec, dct8x8 );
+ STORE_8x8_NNZ(i8,1);
+ }
+ else
+ STORE_8x8_NNZ(i8,0);
}
else
- {
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
- }
+ STORE_8x8_NNZ(i8,0);
}
else
{
int i4;
+ int i_decimate_8x8 = 0;
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
for( i4 = 0; i4 < 4; i4++ )
- x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
-
- for( i4 = 0; i4 < 4; i4++ )
- h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
-
- if( b_decimate )
{
- int i_decimate_8x8 = 0;
- for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
- i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
- nnz8x8 = 4 <= i_decimate_8x8;
+ nz = x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
+ h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz;
+ if( nz )
+ {
+ h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
+ h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
+ if( b_decimate )
+ i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
+ nnz8x8 = 1;
+ }
}
- else
- nnz8x8 = array_non_zero( dct4x4 );
+
+ if( b_decimate && i_decimate_8x8 < 4 )
+ nnz8x8 = 0;
if( nnz8x8 )
- {
- for( i4 = 0; i4 < 4; i4++ )
- {
- if( array_non_zero( dct4x4[i4] ) )
- {
- h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
- h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = 1;
- }
- else
- h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = 0;
- }
h->dctf.add8x8_idct( p_fdec, dct4x4 );
- }
else
- {
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
- }
+ STORE_8x8_NNZ(i8,0);
}
i_qp = h->mb.i_chroma_qp;
dct4x4[0][0] = 0;
if( h->mb.b_trellis )
- x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 );
+ nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 );
else
- h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
+ nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
- if( array_non_zero( dct4x4 ) )
+ h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
+ if( nz )
{
h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
h->dctf.add4x4_idct( p_fdec, dct4x4 );
- h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 1;
}
- else
- h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 0;
}
}
h->mb.i_cbp_luma &= ~(1 << i8);
const int i_ref = h->mb.cache.ref[0][x264_scan8[i4]];
const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][0], h->mb.mv_min[0], h->mb.mv_max[0] );
const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+ int nz;
h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 );
{
DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
- x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
- if( array_non_zero( dct4x4 ) )
+ nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
+ h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
+ if( nz )
{
h->zigzagf.scan_4x4( h->dct.luma4x4[i4], dct4x4 );
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp );
h->dctf.add4x4_idct( p_fdec, dct4x4 );
- h->mb.cache.non_zero_count[x264_scan8[i4]] = 1;
}
- else
- h->mb.cache.non_zero_count[x264_scan8[i4]] = 0;
}
}
void x264_cabac_mb_skip( x264_t *h, int b_skip );
-void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
int i_qp, int i_ctxBlockCat, int b_intra );
-void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
+int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
int i_qp, int i_ctxBlockCat, int b_intra, int idx );
-void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
int i_qp, int b_intra, int idx );
void x264_noise_reduction_update( x264_t *h );
if( i_pixel > PIXEL_8x8 )
return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel );
+ h->mb.i_cbp_luma = 0;
+
x264_macroblock_encode_p8x8( h, i8 );
if( i_pixel == PIXEL_16x8 )
x264_macroblock_encode_p8x8( h, i8+1 );
static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
{
uint64_t i_ssd, i_bits;
+ h->mb.i_cbp_luma = 0;
+ h->mb.b_transform_8x8 = 1;
x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
// comparable to the input. so unquant is the direct inverse of quant,
// and uses the dct scaling factors, not the idct ones.
-static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct,
+static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
const uint16_t *quant_mf, const int *unquant_mf,
const int *coef_weight, const uint8_t *zigzag,
int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
const int b_interlaced = h->mb.b_interlaced;
const int f = 1 << 15; // no deadzone
int i_last_nnz;
- int i, j;
+ int i, j, nz;
// (# of coefs) * (# of ctx) * (# of levels tried) = 1024
// we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
if( i < b_ac )
{
memset( dct, 0, i_coefs * sizeof(*dct) );
- return;
+ return 0;
}
i_last_nnz = i;
bnode = &nodes_cur[j];
j = bnode->level_idx;
+ nz = 0;
for( i = b_ac; i < i_coefs; i++ )
{
dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
+ nz |= level_tree[j].abs_level;
j = level_tree[j].next;
}
+ return !!nz;
}
const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
-void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
int i_qp, int i_ctxBlockCat, int b_intra )
{
- quant_trellis_cabac( h, (int16_t*)dct,
+ return quant_trellis_cabac( h, (int16_t*)dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
}
-void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
+int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
- quant_trellis_cabac( h, (int16_t*)dct,
+ return quant_trellis_cabac( h, (int16_t*)dct,
h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
x264_dct4_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan4[h->mb.b_interlaced],
i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx );
}
-void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
int i_qp, int b_intra, int idx )
{
- quant_trellis_cabac( h, (int16_t*)dct,
+ return quant_trellis_cabac( h, (int16_t*)dct,
h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
x264_dct8_weight2_zigzag[h->mb.b_interlaced],
x264_zigzag_scan8[h->mb.b_interlaced],
TEST_IDCT( add8x8_idct, dct4 );
TEST_IDCT( add8x8_idct_dc, dct4 );
TEST_IDCT( add16x16_idct, dct4 );
+ TEST_IDCT( add16x16_idct_dc, dct4 );
report( "add_idct4 :" );
ok = 1; used_asm = 0;
DECLARE_ALIGNED_16( uint8_t cqm_buf[64] );
int ret = 0, ok, used_asm;
int oks[2] = {1,1}, used_asms[2] = {0,0};
- int i, i_cqm, qp;
+ int i, j, i_cqm, qp;
x264_t h_buf;
x264_t *h = &h_buf;
memset( h, 0, sizeof(*h) );
for( x = 0; x < 8; x++ ) \
{ \
unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
- dct1[y*8+x] = dct2[y*8+x] = (rand()%(2*scale+1))-scale; \
+ dct1[y*8+x] = dct2[y*8+x] = j ? (rand()%(2*scale+1))-scale : 0; \
} \
}
for( x = 0; x < 4; x++ ) \
{ \
unsigned int scale = 255*scale1d[y]*scale1d[x]; \
- dct1[y*4+x] = dct2[y*4+x] = (rand()%(2*scale+1))-scale; \
+ dct1[y*4+x] = dct2[y*4+x] = j ? (rand()%(2*scale+1))-scale : 0; \
} \
}
used_asms[0] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
- for( i = 0; i < 16; i++ ) \
- dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
- call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
- call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
- if( memcmp( dct1, dct2, 16*2 ) ) \
+ for( j = 0; j < 2; j++ ) \
{ \
- oks[0] = 0; \
- fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
- break; \
+ int result_c, result_a; \
+ for( i = 0; i < 16; i++ ) \
+ dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
+ result_c = call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+ result_a = call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+ if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
+ { \
+ oks[0] = 0; \
+ fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
+ break; \
+ } \
+ call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+ call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
} \
- call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
- call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
} \
}
used_asms[0] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
- INIT_QUANT##w() \
- call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
- call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
- if( memcmp( dct1, dct2, w*w*2 ) ) \
+ for( j = 0; j < 2; j++ ) \
{ \
- oks[0] = 0; \
- fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
- break; \
+ int result_c, result_a; \
+ INIT_QUANT##w() \
+ result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \
+ { \
+ oks[0] = 0; \
+ fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
+ break; \
+ } \
+ call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
} \
- call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
- call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
} \
}
{ \
set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
used_asms[1] = 1; \
+ j = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
INIT_QUANT##w() \