From: Fiona Glaser
Date: Fri, 5 Apr 2013 01:00:23 +0000 (-0700)
Subject: x86: SSSE3 LUT-based faster coeff_level_run
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=67d6f602018d0fc1cb05cd6240e4fe1c2646169f;p=x264

x86: SSSE3 LUT-based faster coeff_level_run

~2x faster coeff_level_run.
Faster CAVLC encoding: {1%,2%,7%} overall with {superfast,medium,slower}.
Uses the same pshufb LUT abuse trick as in the previous ads_mvs patch.
---
diff --git a/common/bitstream.h b/common/bitstream.h
index a0ace070..629cf607 100644
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -55,9 +55,9 @@ typedef struct bs_s
 
 typedef struct
 {
-    int last;
-    int mask;
-    dctcoef level[16];
+    int32_t last;
+    int32_t mask;
+    ALIGNED_16( dctcoef level[18] );
 } x264_run_level_t;
 
 extern const vlc_t x264_coeff0_token[6];
diff --git a/common/quant.c b/common/quant.c
index 7dfd3bd8..3f70310f 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -633,6 +633,17 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score16 = x264_decimate_score16_ssse3;
         pf->decimate_score64 = x264_decimate_score64_ssse3;
         INIT_TRELLIS( ssse3 );
+        pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
+        pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
+        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt;
+            pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt;
+            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
+            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
+        }
     }
 
     if( cpu&X264_CPU_SSE4 )
@@ -681,6 +692,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         }
         pf->decimate_score64 = x264_decimate_score64_avx2;
         pf->denoise_dct = x264_denoise_dct_avx2;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
+            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
+        }
     }
 #endif // HAVE_MMX
 
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index b5637647..1389be1f 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -49,6 +49,7 @@ const pw_m2,       times 8 dw -2
 const pw_4,        times 8 dw 4
 const pw_8,        times 8 dw 8
 const pw_64,       times 8 dw 64
+const pw_256,      times 8 dw 256
 const pw_32_0,     times 4 dw 32,
                    times 4 dw 0
 const pw_8000,     times 8 dw 0x8000
@@ -63,4 +64,12 @@ const pd_1024,     times 4 dd 1024
 const pd_ffff,     times 4 dd 0xffff
 const pw_ff00,     times 8 dw 0xff00
 
+const popcnt_table
+%assign x 0
+%rep 256
+; population count
+db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
+%assign x x+1
+%endrep
+
 const sw_64,       dd 64
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index b0362597..4ee52fd6 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -129,14 +129,6 @@ pd_f0: times 4 dd 0xffff0000
 
 pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
 
-ads_mvs_count:
-%assign x 0
-%rep 256
-; population count
-db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
-%assign x x+1
-%endrep
-
 ads_mvs_shuffle:
 %macro ADS_MVS_SHUFFLE 8
     %assign y x
@@ -171,6 +163,7 @@ cextern pw_pmpmpmpm
 cextern pw_pmmpzzzz
 cextern pd_1
 cextern hsub_mul
+cextern popcnt_table
 
 ;=============================================================================
 ; SSD
@@ -5189,19 +5182,24 @@ ads_mvs_ssse3:
     add     r5, r6
     xor     r0d, r0d ; nmv
     mov   [r5], r0d
-    lea     r1, [ads_mvs_count]
+%ifdef PIC
+    lea     r1, [$$]
+    %define GLOBAL +r1-$$
+%else
+    %define GLOBAL
+%endif
 .loop:
     movh    m0, [r6]
     pcmpeqb m0, m5
     pmovmskb r2d, m0
-    xor     r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
-    movzx   r3d, byte [r1+r2] ; popcnt
+    xor    r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
+    movzx  r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
     add     r2d, r2d
     ; shuffle counters based on mv mask
-    pshufb  m2, m4, [r1+r2*8+(ads_mvs_shuffle-ads_mvs_count)]
+    pshufb  m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
     movu    [r4+r0*2], m2
     add     r0d, r3d
-    paddw   m4, m3  ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
+    paddw   m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
     add     r6, 8
     cmp     r6, r5
     jl .loop
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index ccae210b..0f7fe610 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -74,14 +74,38 @@ chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
 chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
 
+%if HIGH_BIT_DEPTH==0
+dct_coef_shuffle:
+%macro DCT_COEF_SHUFFLE 8
+    %assign y x
+    %rep 8
+        %rep 7
+            %rotate (~(y>>7))&1
+            %assign y y<<((~(y>>7))&1)
+        %endrep
+        db %1*2
+        %rotate 1
+        %assign y y<<1
+    %endrep
+%endmacro
+%assign x 0
+%rep 256
+    DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
+%assign x x+1
+%endrep
+%endif
+
 SECTION .text
 
 cextern pb_1
 cextern pw_1
+cextern pw_2
+cextern pw_256
 cextern pd_1
 cextern pb_01
 cextern pd_1024
 cextern deinterleave_shufd
+cextern popcnt_table
 
 %macro QUANT_DC_START 2
     movd       xm%1, r1m     ; mf
@@ -1567,6 +1591,13 @@ cglobal coeff_last64, 1,3
 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
 ;-----------------------------------------------------------------------------
 
+struc levelrun
+    .last: resd 1
+    .mask: resd 1
+    align 16, resb 1
+    .level: resw 16
+endstruc
+
 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
 %if WIN64
     DECLARE_REG_TMP 3,1,2,0,4,5,6
@@ -1581,6 +1612,7 @@ cglobal coeff_level_run%1,0,7
     movifnidn t0, r0mp
     movifnidn t1, r1mp
     pxor    m2, m2
+    xor     t3d, t3d
     LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
 %if %1==15
     shr     t5d, 1
@@ -1590,7 +1622,7 @@ cglobal coeff_level_run%1,0,7
     and     t5d, 0xf
 %endif
     xor     t5d, (1<<%1)-1
-    mov     [t1+4], t5d
+    mov     [t1+levelrun.mask], t5d
     shl     t5d, 32-%1
     mov     t4d, %1-1
     LZCOUNT t3d, t5d, 0x1f
@@ -1598,7 +1630,7 @@ cglobal coeff_level_run%1,0,7
     add     t5d, t5d
     sub     t4d, t3d
     shl     t5d, t3b
-    mov     [t1], t4d
+    mov     [t1+levelrun.last], t4d
 .loop:
     LZCOUNT t3d, t5d, 0x1f
 %if HIGH_BIT_DEPTH
@@ -1609,9 +1641,9 @@ cglobal coeff_level_run%1,0,7
     inc     t3d
     shl     t5d, t3b
 %if HIGH_BIT_DEPTH
-    mov     [t1+t6*4+ 8], t2d
+    mov     [t1+t6*4+levelrun.level], t2d
 %else
-    mov     [t1+t6*2+ 8], t2w
+    mov     [t1+t6*2+levelrun.level], t2w
 %endif
     inc     t6d
     sub     t4d, t3d
@@ -1641,3 +1673,133 @@ COEFF_LEVELRUN 16
 INIT_MMX mmx2, lzcnt
 COEFF_LEVELRUN 4
 COEFF_LEVELRUN 8
+
+; Similar to the one above, but saves the DCT
+; coefficients in m0/m1 so we don't have to load
+; them later.
+%macro LAST_MASK_LUT 3
+    pxor       xm5, xm5
+%if %1 <= 8
+    mova        m0, [%3]
+    packsswb    m2, m0, m0
+%else
+    mova       xm0, [%3+ 0]
+    mova       xm1, [%3+16]
+    packsswb   xm2, xm0, xm1
+%if mmsize==32
+    vinserti128 m0, m0, xm1, 1
+%endif
+%endif
+    pcmpeqb    xm2, xm5
+    pmovmskb    %2, xm2
+%endmacro
+
+%macro COEFF_LEVELRUN_LUT 1
+cglobal coeff_level_run%1,2,4+(%1/9)
+%ifdef PIC
+    lea         r5, [$$]
+    %define GLOBAL +r5-$$
+%else
+    %define GLOBAL
+%endif
+    LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
+%if %1==15
+    shr        eax, 1
+%elif %1==8
+    and        eax, 0xff
+%elif %1==4
+    and        eax, 0xf
+%endif
+    xor        eax, (1<<%1)-1
+    mov        [r1+levelrun.mask], eax
+%if %1==15
+    add        eax, eax
+%endif
+%if %1 > 8
+%if ARCH_X86_64
+    mov        r4d, eax
+    shr        r4d, 8
+%else
+    movzx      r4d, ah ; first 8 bits
+%endif
+%endif
+    movzx      r2d, al ; second 8 bits
+    shl        eax, 32-%1-(%1&1)
+    LZCOUNT    eax, eax, 0x1f
+    mov        r3d, %1-1
+    sub        r3d, eax
+    mov        [r1+levelrun.last], r3d
+; Here we abuse pshufb, combined with a lookup table, to do a gather
+; operation based on a bitmask. For example:
+;
+; dct 15-8 (input):  0  0  4  0  0 -2  1  0
+; dct 7-0  (input):  0  0 -1  0  0  0  0 15
+; bitmask 1:         0  0  1  0  0  1  1  0
+; bitmask 2:         0  0  1  0  0  0  0  1
+; gather 15-8:       4 -2  1 __ __ __ __ __
+; gather 7-0:       -1 15 __ __ __ __ __ __
+; levels (output):   4 -2  1 -1 15 __ __ __ __ __ __ __ __ __ __ __
+;
+; The overlapping, dependent stores almost surely cause a mess of
+; forwarding issues, but it's still enormously faster.
+%if %1 > 8
+    movzx      eax, byte [popcnt_table+r4 GLOBAL]
+    movzx      r3d, byte [popcnt_table+r2 GLOBAL]
+%if mmsize==16
+    movh        m3, [dct_coef_shuffle+r4*8 GLOBAL]
+    movh        m2, [dct_coef_shuffle+r2*8 GLOBAL]
+    mova        m4, [pw_256]
+; Storing 8 bytes of shuffle constant and converting it (unpack + or)
+; is neutral to slightly faster in local speed measurements, but it
+; cuts the table size in half, which is surely a big cache win.
+    punpcklbw   m3, m3
+    punpcklbw   m2, m2
+    por         m3, m4
+    por         m2, m4
+    pshufb      m1, m3
+    pshufb      m0, m2
+    mova       [r1+levelrun.level], m1
+; This obnoxious unaligned store messes with store forwarding and
+; stalls the CPU to no end, but merging the two registers before
+; storing requires a variable 128-bit shift. Emulating this does
+; work, but requires a lot of ops and the gain is tiny and
+; inconsistent, so we'll err on the side of fewer instructions.
+    movu       [r1+rax*2+levelrun.level], m0
+%else ; mmsize==32
+    movq       xm2, [dct_coef_shuffle+r4*8 GLOBAL]
+    vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
+    punpcklbw   m2, m2
+    por         m2, [pw_256]
+    pshufb      m0, m2
+    vextracti128 [r1+levelrun.level], m0, 1
+    movu       [r1+rax*2+levelrun.level], xm0
+%endif
+    add        eax, r3d
+%else
+    movzx      eax, byte [popcnt_table+r2 GLOBAL]
+    movh        m1, [dct_coef_shuffle+r2*8 GLOBAL]
+    punpcklbw   m1, m1
+    por         m1, [pw_256]
+    pshufb      m0, m1
+    mova       [r1+levelrun.level], m0
+%endif
+    RET
+%endmacro
+
+%if HIGH_BIT_DEPTH==0
+INIT_MMX ssse3
+COEFF_LEVELRUN_LUT 4
+INIT_XMM ssse3
+COEFF_LEVELRUN_LUT 8
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+INIT_MMX ssse3, lzcnt
+COEFF_LEVELRUN_LUT 4
+INIT_XMM ssse3, lzcnt
+COEFF_LEVELRUN_LUT 8
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+INIT_YMM avx2, lzcnt
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+%endif
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 559c6f49..5541db03 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -110,15 +110,25 @@ int x264_coeff_last64_avx2_lzcnt( dctcoef *dct );
 int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
 int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac );
 int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index f0e816d7..008f73de 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -700,7 +700,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     extern %1
 %endmacro
 
-%macro const 2+
+%macro const 1-2+
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     %ifidn __OUTPUT_FORMAT__,elf
         global %1:data hidden
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index e41b5e14..daf0614c 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -128,13 +128,13 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
     unsigned int i_sign;
 
     /* level and run and total */
-    /* set these to 2 to allow branchless i_trailing calculation */
-    runlevel.level[1] = 2;
-    runlevel.level[2] = 2;
     i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
     x264_prefetch( &x264_run_before[runlevel.mask] );
     i_total_zero = runlevel.last + 1 - i_total;
 
+    /* branchless i_trailing calculation */
+    runlevel.level[i_total+0] = 2;
+    runlevel.level[i_total+1] = 2;
     i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
                | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
                | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 7a2f6d4d..1173126a 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2593,7 +2593,14 @@ static int check_all_flags( void )
         cpu1 &= ~X264_CPU_BMI1;
     }
     if( x264_cpu_detect() & X264_CPU_AVX2 )
+    {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+        if( x264_cpu_detect() & X264_CPU_LZCNT )
+        {
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" );
+            cpu1 &= ~X264_CPU_LZCNT;
+        }
+    }
     if( x264_cpu_detect() & X264_CPU_BMI2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
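
For readers following the asm, the contract that the new popcnt_table/pshufb routine fills in can be modelled in scalar C roughly as follows (8-bit-depth run16 case, block known to contain at least one nonzero coefficient, GCC/Clang-style __builtin_clz assumed). This is an illustrative sketch only: coeff_level_run16_ref, run_level_ref_t and popcnt8 are stand-in names rather than x264 API; only the last/mask/level fields mirror the patched x264_run_level_t.

#include <assert.h>
#include <stdint.h>

typedef struct
{
    int32_t last;       /* index of the last nonzero coefficient */
    int32_t mask;       /* bit i set iff dct[i] != 0 */
    int16_t level[18];  /* nonzero levels, last coefficient first */
} run_level_ref_t;      /* models the patched x264_run_level_t */

/* Per-byte popcount, the same values the popcnt_table constant holds. */
static int popcnt8( unsigned v )
{
    int n = 0;
    for( int i = 0; i < 8; i++ )
        n += (v >> i) & 1;
    return n;
}

/* Scalar model of coeff_level_run16: returns the number of nonzero
 * coefficients and gathers them from the last one downwards, which is
 * what the pshufb gather + overlapping stores do without a loop. */
static int coeff_level_run16_ref( const int16_t *dct, run_level_ref_t *rl )
{
    unsigned mask = 0;
    for( int i = 0; i < 16; i++ )
        mask |= (unsigned)(dct[i] != 0) << i;           /* LAST_MASK_LUT + xor */

    rl->mask = mask;
    rl->last = 31 - __builtin_clz( mask );              /* LZCOUNT; mask != 0 here */

    /* One popcnt_table lookup per 8-coefficient half in the asm. */
    int total = popcnt8( mask >> 8 ) + popcnt8( mask & 0xff );

    int n = 0;
    for( int i = rl->last; i >= 0; i-- )                /* reverse scan order */
        if( mask & (1u << i) )
            rl->level[n++] = dct[i];                    /* dct_coef_shuffle gather */
    return total;                                       /* == n */
}

int main( void )
{
    /* The example block from the comments in quant-a.asm above. */
    int16_t dct[16] = {0};
    dct[13] = 4; dct[10] = -2; dct[9] = 1; dct[5] = -1; dct[0] = 15;

    run_level_ref_t rl;
    int total = coeff_level_run16_ref( dct, &rl );

    assert( total == 5 && rl.last == 13 );
    const int16_t expect[5] = { 4, -2, 1, -1, 15 };
    for( int i = 0; i < 5; i++ )
        assert( rl.level[i] == expect[i] );
    return 0;
}

In the SIMD version, the two per-byte popcounts come straight from popcnt_table and the gather loop is replaced by one pshufb per 8-coefficient half, indexed through the 256-entry, 8-bytes-per-entry dct_coef_shuffle table; the two halves are then combined with the overlapping stores described in the asm comments.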
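
The cavlc.c hunk follows from the new routine storing a full vector of levels: sentinels preloaded into level[1]/level[2] before the call would simply be overwritten, so the two sentinel 2s are now written to level[i_total] and level[i_total+1] afterwards, which is also why level[] grows to 18 entries (and gets 16-byte alignment) in bitstream.h. The i_trailing expression itself is unchanged; it relies on the sign bit of (x+1)|(1-x) being set exactly when abs(x) > 1. A minimal standalone check of that identity (plain C, not taken from x264; it assumes 32-bit int and arithmetic right shift, as the existing x264 code already does):

#include <assert.h>
#include <stdlib.h>

/* Sign bit of (x+1)|(1-x) is set iff x < -1 or x > 1. */
static int abs_gt1( int x )
{
    return (((x+1) | (1-x)) >> 31) & 1;
}

int main( void )
{
    for( int x = -1000; x <= 1000; x++ )
        assert( abs_gt1( x ) == (abs( x ) > 1) );
    return 0;
}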