}
}
+/* (ref: JVT-B118)
+ * x264_decimate_score*: given quantized dct coeffs, returns a score used to decide whether these
+ * coeffs can all be set to 0 (a low score means the block is set to null)
+ * Used in inter macroblock (luma and chroma)
+ * luma: for a 8x8 block: if score < 4 -> null
+ * for the complete mb: if score < 6 -> null
+ * chroma: for the complete mb: if score < 7 -> null
+ */
+
+const uint8_t x264_decimate_table4[16] = { /* score added per nonzero coefficient of a 15/16-coefficient block, indexed by its zero-run length; not static because the asm imports it with cextern */
+ 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
+const uint8_t x264_decimate_table8[64] = { /* same lookup for 64-coefficient blocks; every run of 24 or more scores 0 */
+ 3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
+ 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
+/* Score the i_max coefficients starting at dct (i_max == 64 selects
+ * x264_decimate_table8, any other value selects x264_decimate_table4).
+ * The block is walked from its last coefficient down to its first:
+ *  - zeros at the end of the block are skipped and add nothing;
+ *  - a coefficient other than -1, 0 or 1 ends the walk with a return value of 9;
+ *  - every other nonzero coefficient adds ds_table[run], where run is the
+ *    number of consecutive zeros found directly below it.
+ * Returns the accumulated score, or 9 as described above.
+ * NOTE(review): the trailing-zero skip reads int16_t storage through a
+ * uint32_t pointer; presumably the project's build settings tolerate this
+ * type punning -- confirm. */
+static int ALWAYS_INLINE x264_decimate_score_internal( int16_t *dct, int i_max )
+{
+ const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4;
+ int i_score = 0;
+ int idx = i_max - 1;
+
+ /* Yes, dct[idx-1] is guaranteed to be 32-bit aligned. idx>=0 instead of 1 works correctly for the same reason */
+ while( idx >= 0 && *(uint32_t*)&dct[idx-1] == 0 ) /* tests two coefficients per step; with i_max == 15 the last step (idx == 0) also reads dct[-1] */
+ idx -= 2;
+ if( idx >= 0 && dct[idx] == 0 ) /* the pair was not entirely zero: drop its upper coefficient if that one is zero */
+ idx--;
+ while( idx >= 0 )
+ {
+ int i_run;
+
+ if( (unsigned)(dct[idx--] + 1) > 2 ) /* true for every value outside -1..1 */
+ return 9;
+
+ i_run = 0;
+ while( idx >= 0 && dct[idx] == 0 ) /* count the zeros directly below the coefficient just checked */
+ {
+ idx--;
+ i_run++;
+ }
+ i_score += ds_table[i_run];
+ }
+
+ return i_score;
+}
+/* Fixed-size entry points around x264_decimate_score_internal:
+ *  score15 scores dct[1]..dct[15] (dct[0] is skipped),
+ *  score16 scores dct[0]..dct[15],
+ *  score64 scores dct[0]..dct[63] using the 64-entry table. */
+static int x264_decimate_score15( int16_t *dct )
+{
+ return x264_decimate_score_internal( dct+1, 15 );
+}
+static int x264_decimate_score16( int16_t *dct )
+{
+ return x264_decimate_score_internal( dct, 16 );
+}
+static int x264_decimate_score64( int16_t *dct )
+{
+ return x264_decimate_score_internal( dct, 64 );
+}
+
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
pf->dequant_8x8 = dequant_8x8;
pf->denoise_dct = x264_denoise_dct;
+ pf->decimate_score15 = x264_decimate_score15;
+ pf->decimate_score16 = x264_decimate_score16;
+ pf->decimate_score64 = x264_decimate_score64;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext;
#ifdef ARCH_X86
pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
+ pf->decimate_score15 = x264_decimate_score15_mmxext;
+ pf->decimate_score16 = x264_decimate_score16_mmxext;
+ pf->decimate_score64 = x264_decimate_score64_mmxext;
#endif
}
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
pf->denoise_dct = x264_denoise_dct_sse2;
+ pf->decimate_score15 = x264_decimate_score15_sse2;
+ pf->decimate_score16 = x264_decimate_score16_sse2;
+ pf->decimate_score64 = x264_decimate_score64_sse2;
}
if( cpu&X264_CPU_SSSE3 )
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
pf->denoise_dct = x264_denoise_dct_ssse3;
+ pf->decimate_score15 = x264_decimate_score15_ssse3;
+ pf->decimate_score16 = x264_decimate_score16_ssse3;
+ pf->decimate_score64 = x264_decimate_score64_ssse3;
}
#endif // HAVE_MMX
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+
+ int (*decimate_score15)( int16_t *dct );
+ int (*decimate_score16)( int16_t *dct );
+ int (*decimate_score64)( int16_t *dct );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION_RODATA
+pb_1: times 16 db 1
pw_1: times 8 dw 1
pd_1: times 4 dd 1
DQM8 32, 28, 51, 30, 40, 38
DQM8 36, 32, 58, 34, 46, 43
+decimate_mask_table4: ; byte lookup used by DECIMATE4x4, indexed by an 8-bit mask of nonzero coefficients (bit 0 = lowest coefficient)
+ db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4 ; e.g. entry 5 (bits 0 and 2) = 3+2, the x264_decimate_table4 scores for runs 0 and 1
+ db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
+ db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
+ db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
+ db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
+ db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
+ db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
+ db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
+ db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
+
SECTION .text
%macro QUANT_DC_START 0
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
DENOISE_DCT ssse3
+
+
+
+;-----------------------------------------------------------------------------
+; int x264_decimate_score( int16_t *dct )
+;-----------------------------------------------------------------------------
+; DECIMATE_MASK_SSE2 zeromask, bigmask, src, limit, cpu, unused: for the 16 words at src, bit i of zeromask (%1) is set when coefficient i is 0 and bit i of bigmask (%2) when its byte-saturated absolute value is greater than byte i of limit (%4); %6 is not referenced; uses xmm0-xmm4
+%macro DECIMATE_MASK_SSE2 6
+%ifidn %5, ssse3
+ pabsw xmm0, [%3+ 0] ; ssse3: absolute values taken directly from memory
+ pabsw xmm1, [%3+16]
+%else
+ movdqa xmm0, [%3+ 0]
+ movdqa xmm1, [%3+16] ; ABS2_MMX below is defined outside this chunk; presumably it replaces xmm0/xmm1 by their absolute values using xmm3/xmm4 as scratch
+ ABS2_MMX xmm0, xmm1, xmm3, xmm4
+%endif
+ packsswb xmm0, xmm1 ; 16 words -> 16 signed-saturated bytes, coefficient order preserved
+ pxor xmm2, xmm2
+ pcmpeqb xmm2, xmm0 ; 0xff in every byte whose coefficient is zero
+ pcmpgtb xmm0, %4 ; 0xff in every byte greater than the matching byte of %4
+ pmovmskb %1, xmm2 ; collapse to one bit per byte
+ pmovmskb %2, xmm0
+%endmacro
+; DECIMATE_MASK_MMX zeromask, bigmask, src, limit, cpu, tmp: for the 16 words at src, bit i of zeromask (%1) is set when coefficient i is 0 and bit i of bigmask (%2) when its byte-saturated absolute value is greater than the matching byte of limit (%4); tmp (%6) is a scratch general-purpose register, %5 is not referenced; uses mm0-mm5
+%macro DECIMATE_MASK_MMX 6
+ movq mm0, [%3+ 0] ; coefficients 0-3
+ movq mm1, [%3+ 8] ; coefficients 4-7
+ movq mm2, [%3+16] ; coefficients 8-11
+ movq mm3, [%3+24] ; coefficients 12-15; ABS2_MMX below is defined outside this chunk, presumably absolute values with mm4/mm5 as scratch
+ ABS2_MMX mm0, mm1, mm4, mm5
+ ABS2_MMX mm2, mm3, mm4, mm5
+ packsswb mm0, mm1 ; bytes for coefficients 0-7
+ packsswb mm2, mm3 ; bytes for coefficients 8-15
+ pxor mm4, mm4
+ pxor mm5, mm5
+ pcmpeqb mm4, mm0 ; 0xff where the coefficient is zero
+ pcmpeqb mm5, mm2
+ pcmpgtb mm0, %4 ; 0xff where the byte is greater than the matching byte of %4
+ pcmpgtb mm2, %4
+ pmovmskb %6, mm4 ; zero bits for coefficients 0-7
+ pmovmskb %1, mm5 ; zero bits for coefficients 8-15
+ shl %1, 8
+ or %1, %6 ; %1 = 16-bit zero mask
+ pmovmskb %6, mm0
+ pmovmskb %2, mm2
+ shl %2, 8
+ or %2, %6 ; %2 = 16-bit greater-than mask
+%endmacro
+
+cextern x264_decimate_table4
+cextern x264_decimate_table8
+; DECIMATE4x4 n, cpu: emits x264_decimate_score<n>_<cpu> (n = 15 or 16); eax ends up holding the score, or 9 as soon as DECIMATE_MASK reports a coefficient above the pb_1 limit
+%macro DECIMATE4x4 2
+
+;A LUT is faster than bsf on AMD processors, and no slower on Intel
+;This is not true for score64.
+cglobal x264_decimate_score%1_%2, 1,3
+%ifdef PIC
+ lea r10, [x264_decimate_table4 GLOBAL] ; PIC builds reach both tables through r10/r11
+ lea r11, [decimate_mask_table4 GLOBAL]
+ %define table r10
+ %define mask_table r11
+%else
+ %define table x264_decimate_table4
+ %define mask_table decimate_mask_table4
+%endif
+ DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
+ xor edx, 0xffff ; zero mask -> mask of nonzero coefficients
+ je .ret ; nothing nonzero: eax (the greater-than mask) is 0 too and is returned as the score
+ test eax, eax
+ jne .ret9
+%if %1==15
+ shr edx, 1 ; score15: drop coefficient 0 from the nonzero mask (the greater-than test above still covered it)
+%endif
+ movzx ecx, dl
+ movzx eax, byte [mask_table + rcx] ; score of the low 8 mask bits
+ cmp edx, ecx
+ je .ret ; no mask bits above the low byte
+ bsr ecx, ecx ; highest set bit of the low byte. NOTE(review): ecx can be 0 here (low byte empty); bsr leaves its destination undefined for a zero source -- confirm this path is benign
+ shr edx, 1
+ shr edx, cl ; discard every bit up to and including that one
+ bsf ecx, edx ; zero-run in front of the next nonzero coefficient
+ shr edx, 1
+ shr edx, cl ; discard that coefficient as well
+ add al, byte [table + rcx] ; score for that run
+ add al, byte [mask_table + rdx] ; score of the remaining mask bits
+.ret:
+ REP_RET
+.ret9:
+ mov eax, 9
+ RET
+
+%endmacro
+
+%ifndef ARCH_X86_64
+%define DECIMATE_MASK DECIMATE_MASK_MMX
+DECIMATE4x4 15, mmxext
+DECIMATE4x4 16, mmxext
+%endif
+%define DECIMATE_MASK DECIMATE_MASK_SSE2
+DECIMATE4x4 15, sse2
+DECIMATE4x4 15, ssse3
+DECIMATE4x4 16, sse2
+DECIMATE4x4 16, ssse3
+; DECIMATE8x8 cpu: emits x264_decimate_score64_<cpu>; the 64 coefficients are turned into a nonzero bit mask 16 at a time, then one x264_decimate_table8 entry is added per set bit, indexed by the zero-run in front of it; 9 is returned if any greater-than mask is nonzero
+%macro DECIMATE8x8 1
+
+%ifdef ARCH_X86_64
+cglobal x264_decimate_score64_%1, 1,4
+%ifdef PIC
+ lea r10, [x264_decimate_table8 GLOBAL]
+ %define table r10
+%else
+ %define table x264_decimate_table8
+%endif
+ mova m7, [pb_1 GLOBAL] ; comparison limit handed to every DECIMATE_MASK below
+ DECIMATE_MASK r1d, eax, r0, m7, %1, null
+ test eax, eax ; coefficients 0-15: leave early on any greater-than bit
+ jne .ret9
+ DECIMATE_MASK r2d, eax, r0+32, m7, %1, null
+ shl r2d, 16 ; coefficients 16-31 go to mask bits 16-31
+ or r1d, r2d
+ DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null
+ shl r2, 32 ; coefficients 32-47 go to mask bits 32-47
+ or eax, r3d ; keep collecting greater-than bits in eax
+ or r1, r2
+ DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
+ shl r2, 48 ; coefficients 48-63 go to mask bits 48-63
+ or r1, r2
+ not r1 ; zero mask -> nonzero mask
+ test r1, r1
+ je .ret ; no nonzero coefficient at all
+ or eax, r3d
+ jne .ret9 ; past this point eax is 0 and doubles as the score accumulator
+.loop:
+ bsf rcx, r1 ; zero-run in front of the next nonzero coefficient
+ shr r1, cl
+ movzx ecx, byte [table + rcx]
+ add eax, ecx
+ shr r1, 1 ; drop the coefficient itself; the flags of this shift end the loop
+ jne .loop
+.ret:
+ REP_RET
+.ret9:
+ mov eax, 9
+ RET
+
+%else ; ARCH
+%ifidn %1, mmxext
+cglobal x264_decimate_score64_%1, 1,6
+%else
+cglobal x264_decimate_score64_%1, 1,5
+%endif
+ mova m7, [pb_1 GLOBAL] ; comparison limit handed to every DECIMATE_MASK below
+ DECIMATE_MASK r3, r2, r0, m7, %1, r5
+ test r2, r2 ; coefficients 0-15: leave early on any greater-than bit
+ jne .ret9
+ DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
+ shl r4, 16
+ or r3, r4 ; r3 = zero mask of coefficients 0-31
+ DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
+ or r2, r1 ; r2 collects the greater-than bits
+ DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
+ shl r1, 16 ; from here r0 holds the last greater-than mask instead of the source pointer
+ or r4, r1 ; r4 = zero mask of coefficients 32-63
+ not r3 ; zero masks -> nonzero masks
+ not r4
+ mov r1, r3
+ or r1, r4
+ je .ret ; no nonzero coefficient at all
+ or r0, r2
+ jne .ret9 ;r2 is zero at this point, so we don't need to zero it (r0 is zero as well and accumulates the score below)
+.loop:
+ bsf ecx, r3
+ test r3, r3
+ je .largerun ; low half of the mask is empty
+ shrd r3, r4, cl ; shift the r4:r3 pair right by the run length
+ shr r4, cl
+ movzx ecx, byte [x264_decimate_table8 + ecx]
+ add r0, ecx ; r0 presumably aliases eax here (register mapping comes from x86inc, not shown)
+ shrd r3, r4, 1 ; drop the coefficient itself
+ shr r4, 1
+ mov r2, r3
+ or r2, r4
+ jne .loop
+.ret:
+ REP_RET
+.ret9:
+ mov eax, 9
+ RET
+.largerun:
+ mov r3, r4 ; continue with the upper half only
+ xor r4, r4
+ bsf ecx, r3
+ shr r3, cl ; nothing is added for this coefficient: its run is at least 32 and x264_decimate_table8 is 0 from index 24 on
+ shr r3, 1
+ jne .loop
+ REP_RET
+%endif ; ARCH
+
+%endmacro
+
+%ifndef ARCH_X86_64
+INIT_MMX
+%define DECIMATE_MASK DECIMATE_MASK_MMX
+DECIMATE8x8 mmxext
+%endif
+INIT_XMM
+%define DECIMATE_MASK DECIMATE_MASK_SSE2
+DECIMATE8x8 sse2
+DECIMATE8x8 ssse3
+
void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+int x264_decimate_score15_mmxext( int16_t *dct ); /* asm versions of decimate_score15/16/64, emitted by the DECIMATE4x4 and DECIMATE8x8 macros */
+int x264_decimate_score15_sse2 ( int16_t *dct );
+int x264_decimate_score15_ssse3 ( int16_t *dct );
+int x264_decimate_score16_mmxext( int16_t *dct ); /* the mmxext variants are only assembled when ARCH_X86_64 is not defined */
+int x264_decimate_score16_sse2 ( int16_t *dct );
+int x264_decimate_score16_ssse3 ( int16_t *dct );
+int x264_decimate_score64_mmxext( int16_t *dct );
+int x264_decimate_score64_sse2 ( int16_t *dct );
+int x264_decimate_score64_ssse3 ( int16_t *dct );
#endif
}
#undef ZIG
-/* (ref: JVT-B118)
- * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs
- * to 0 (low score means set it to null)
- * Used in inter macroblock (luma and chroma)
- * luma: for a 8x8 block: if score < 4 -> null
- * for the complete mb: if score < 6 -> null
- * chroma: for the complete mb: if score < 7 -> null
- */
-static int x264_mb_decimate_score( int16_t *dct, int i_max )
-{
- static const int i_ds_table4[16] = {
- 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
- static const int i_ds_table8[64] = {
- 3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
- 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
-
- const int *ds_table = (i_max == 64) ? i_ds_table8 : i_ds_table4;
- int i_score = 0;
- int idx = i_max - 1;
-
- while( idx >= 0 && dct[idx] == 0 )
- idx--;
-
- while( idx >= 0 )
- {
- int i_run;
-
- if( (unsigned)(dct[idx--] + 1) > 2 )
- return 9;
-
- i_run = 0;
- while( idx >= 0 && dct[idx] == 0 )
- {
- idx--;
- i_run++;
- }
- i_score += ds_table[i_run];
- }
-
- return i_score;
-}
-
static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
if( b_decimate )
- i_decimate_score += x264_mb_decimate_score( h->dct.luma4x4[16+i+ch*4]+1, 15 );
+ i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
}
h->dctf.dct2x2dc( dct2x2 );
if( b_decimate )
{
- int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
+ int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
i_decimate_mb += i_decimate_8x8;
if( i_decimate_8x8 < 4 )
nnz8x8[idx] = 0;
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
if( b_decimate && i_decimate_8x8 <= 6 )
- i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 );
+ i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
}
/* decimate this 8x8 block */
if( !array_non_zero(dct4x4[i4x4]) )
continue;
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
- i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
+ i_decimate_mb += h->quantf.decimate_score16( dctscan );
if( i_decimate_mb >= 6 )
return 0;
}
/* calculate dct coeffs */
for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
{
+ dct4x4[i4x4][0][0] = 0;
h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
if( !array_non_zero(dct4x4[i4x4]) )
continue;
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
- i_decimate_mb += x264_mb_decimate_score( dctscan+1, 15 );
+ i_decimate_mb += h->quantf.decimate_score15( dctscan );
if( i_decimate_mb >= 7 )
return 0;
}
h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
if( b_decimate && !h->mb.b_trellis )
- nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
+ nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );
else
nnz8x8 = array_non_zero( dct8x8 );
{
int i_decimate_8x8 = 0;
for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
- i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[i8*4+i4], 16 );
+ i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
nnz8x8 = 4 <= i_decimate_8x8;
}
else
}
report( "denoise dct :" );
+#define TEST_DECIMATE( qname, decname, block, w, ac ) /* compares qf_c.decname with qf_a.decname on random w*w blocks; qname and block are not referenced in the body */ \
+ if( qf_a.decname != qf_ref.decname ) /* only runs when the tested function differs from the reference one */ \
+ { \
+ set_func_name( #decname ); \
+ used_asm = 1; \
+ for( i = 0; i < 100; i++ ) \
+ { \
+ int result_c, result_a, idx; \
+ for( idx = 0; idx < w*w; idx++ ) \
+ dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); /* values 0..4, mostly 0 or 1, never negative */ \
+ if( ac ) /* ac: clear coefficient 0 before scoring */ \
+ dct1[0] = 0; \
+ memcpy( dct2, dct1, w*w*2 ); /* presumably 2 bytes per coefficient (int16_t); buffer types are declared outside this chunk */ \
+ result_c = call_c1( qf_c.decname, (void*)dct2 ); \
+ result_a = call_a1( qf_a.decname, (void*)dct2 ); \
+ if( result_c != result_a ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #decname ": [FAILED]\n" ); \
+ break; \
+ } \
+ call_c2( qf_c.decname, (void*)dct2 ); /* call_c2/call_a2 are defined elsewhere; presumably the timing half of the harness -- confirm */ \
+ call_a2( qf_a.decname, (void*)dct2 ); \
+ } \
+ }
+
+ TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0 );
+ TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0 );
+ TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1 ); /* ac = 1: coefficient 0 is cleared for the 15-coefficient variant */
+ report( "decimate_score :" );
+
return ret;
}