~2x faster coeff_level_run.
Faster CAVLC encoding: roughly {1%,2%,7%} overall speedup with {superfast,medium,slower}.
Uses the same pshufb LUT abuse trick as in the previous ads_mvs patch.
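For reference, here is roughly the operation being vectorized, in the shape of x264's scalar fallback (a sketch assuming 8-bit depth, i.e. 16-bit dctcoef; the function name and exact loop structure are illustrative, not the actual C source):

    #include <stdint.h>
    /* Scan dct[] from the highest index down: record the index of the
     * last nonzero coefficient, a bitmask of nonzero positions, and the
     * nonzero values compacted into level[], highest index first.
     * Callers guarantee at least one nonzero coefficient. */
    static int coeff_level_run_ref( const int16_t *dct, x264_run_level_t *runlevel, int n )
    {
        int total = 0;
        runlevel->mask = 0;
        for( int i = n-1; i >= 0; i-- )
            if( dct[i] )
            {
                if( !total )
                    runlevel->last = i;
                runlevel->mask |= 1 << i;
                runlevel->level[total++] = dct[i];
            }
        return total;
    }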
typedef struct
{
- int last;
- int mask;
- dctcoef level[16];
+ int32_t last;
+ int32_t mask;
+ ALIGNED_16( dctcoef level[18] );
} x264_run_level_t;
extern const vlc_t x264_coeff0_token[6];
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
+ pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
+ pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt;
+ pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
+ }
}
if( cpu&X264_CPU_SSE4 )
}
pf->decimate_score64 = x264_decimate_score64_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
+ }
}
#endif // HAVE_MMX
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_64, times 8 dw 64
+const pw_256, times 8 dw 256
const pw_32_0, times 4 dw 32,
times 4 dw 0
const pw_8000, times 8 dw 0x8000
const pd_ffff, times 4 dd 0xffff
const pw_ff00, times 8 dw 0xff00
+const popcnt_table
+%assign x 0
+%rep 256
+; population count
+db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
+%assign x x+1
+%endrep
+
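The table moved here from pixel-a.asm (where it was ads_mvs_count) so quant-a.asm can share it. Its contents are simply the population count of each byte value, equivalent to this C sketch (names illustrative):

    #include <stdint.h>
    /* 256-entry byte popcount table, as emitted by the %rep above. */
    static uint8_t popcnt_table[256];
    static void init_popcnt_table( void )
    {
        for( int x = 0; x < 256; x++ )
            popcnt_table[x] = ((x>>0)&1) + ((x>>1)&1) + ((x>>2)&1) + ((x>>3)&1)
                            + ((x>>4)&1) + ((x>>5)&1) + ((x>>6)&1) + ((x>>7)&1);
    }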
const sw_64, dd 64
pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
-ads_mvs_count:
-%assign x 0
-%rep 256
-; population count
-db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
-%assign x x+1
-%endrep
-
ads_mvs_shuffle:
%macro ADS_MVS_SHUFFLE 8
%assign y x
cextern pw_pmmpzzzz
cextern pd_1
cextern hsub_mul
+cextern popcnt_table
;=============================================================================
; SSD
add r5, r6
xor r0d, r0d ; nmv
mov [r5], r0d
- lea r1, [ads_mvs_count]
+%ifdef PIC
+ lea r1, [$$]
+ %define GLOBAL +r1-$$
+%else
+ %define GLOBAL
+%endif
.loop:
movh m0, [r6]
pcmpeqb m0, m5
pmovmskb r2d, m0
- xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
- movzx r3d, byte [r1+r2] ; popcnt
+ xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
+ movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
add r2d, r2d
; shuffle counters based on mv mask
- pshufb m2, m4, [r1+r2*8+(ads_mvs_shuffle-ads_mvs_count)]
+ pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
movu [r4+r0*2], m2
add r0d, r3d
- paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
+ paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
add r6, 8
cmp r6, r5
jl .loop
chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
+%if HIGH_BIT_DEPTH==0
+dct_coef_shuffle:
+%macro DCT_COEF_SHUFFLE 8
+ %assign y x
+ %rep 8
+ %rep 7
+ %rotate (~(y>>7))&1
+ %assign y y<<((~(y>>7))&1)
+ %endrep
+ db %1*2
+ %rotate 1
+ %assign y y<<1
+ %endrep
+%endmacro
+%assign x 0
+%rep 256
+ DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
+%assign x x+1
+%endrep
+%endif
+
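Each 8-byte entry of dct_coef_shuffle holds pshufb source indices that compact the coefficients selected by an 8-bit nonzero mask to the front of the register, highest index first. A C sketch of an equivalent generator (illustrative; the macro above fills the leftover lanes with rotated leftovers rather than zeros, but those lanes are don't-cares):

    #include <stdint.h>
    /* For each mask, the byte offsets (index*2) of the set 16-bit
     * coefficients, in descending index order; remaining lanes are
     * don't-cares. */
    static uint8_t dct_coef_shuffle[256][8];
    static void init_dct_coef_shuffle( void )
    {
        for( int mask = 0; mask < 256; mask++ )
        {
            int n = 0;
            for( int i = 7; i >= 0; i-- )
                if( mask & (1 << i) )
                    dct_coef_shuffle[mask][n++] = i*2;
            while( n < 8 )
                dct_coef_shuffle[mask][n++] = 0; /* don't-care lane */
        }
    }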
SECTION .text
cextern pb_1
cextern pw_1
+cextern pw_2
+cextern pw_256
cextern pd_1
cextern pb_01
cextern pd_1024
cextern deinterleave_shufd
+cextern popcnt_table
%macro QUANT_DC_START 2
movd xm%1, r1m ; mf
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
+struc levelrun
+ .last: resd 1
+ .mask: resd 1
+ align 16, resb 1
+ .level: resw 16
+endstruc
+
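The struc mirrors x264_run_level_t on the C side; a compile-time layout check of what the asm assumes (hypothetical, not part of the patch, assuming the struct definition above):

    #include <stddef.h>
    /* .last at 0, .mask at 4, .level 16-byte aligned at 16, matching
     * the levelrun struc above. */
    _Static_assert( offsetof(x264_run_level_t, last)  ==  0, "last offset"  );
    _Static_assert( offsetof(x264_run_level_t, mask)  ==  4, "mask offset"  );
    _Static_assert( offsetof(x264_run_level_t, level) == 16, "level offset" );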
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%if WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6
movifnidn t0, r0mp
movifnidn t1, r1mp
pxor m2, m2
+ xor t3d, t3d
LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
%if %1==15
shr t5d, 1
and t5d, 0xf
%endif
xor t5d, (1<<%1)-1
- mov [t1+4], t5d
+ mov [t1+levelrun.mask], t5d
shl t5d, 32-%1
mov t4d, %1-1
LZCOUNT t3d, t5d, 0x1f
add t5d, t5d
sub t4d, t3d
shl t5d, t3b
- mov [t1], t4d
+ mov [t1+levelrun.last], t4d
.loop:
LZCOUNT t3d, t5d, 0x1f
%if HIGH_BIT_DEPTH
inc t3d
shl t5d, t3b
%if HIGH_BIT_DEPTH
- mov [t1+t6*4+ 8], t2d
+ mov [t1+t6*4+levelrun.level], t2d
%else
- mov [t1+t6*2+ 8], t2w
+ mov [t1+t6*2+levelrun.level], t2w
%endif
inc t6d
sub t4d, t3d
INIT_MMX mmx2, lzcnt
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
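For contrast with the LUT approach added below, the bitscan variant above locates each nonzero coefficient with a leading-zero count. A scalar model (illustrative; assumes the nonzero mask already computed by LAST_MASK, and at least one set bit):

    #include <stdint.h>
    /* Walk the nonzero-coefficient mask from the most significant bit
     * down, emitting levels in descending index order, one LZCOUNT per
     * nonzero coefficient. */
    static int levels_from_mask( const int16_t *dct, uint32_t mask, int16_t *level )
    {
        int total = 0;
        while( mask )
        {
            int i = 31 - __builtin_clz( mask ); /* the LZCOUNT step */
            level[total++] = dct[i];
            mask &= ~(1u << i);
        }
        return total;
    }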
+
+; Similar to the one above, but saves the DCT
+; coefficients in m0/m1 so we don't have to load
+; them later.
+%macro LAST_MASK_LUT 3
+ pxor xm5, xm5
+%if %1 <= 8
+ mova m0, [%3]
+ packsswb m2, m0, m0
+%else
+ mova xm0, [%3+ 0]
+ mova xm1, [%3+16]
+ packsswb xm2, xm0, xm1
+%if mmsize==32
+ vinserti128 m0, m0, xm1, 1
+%endif
+%endif
+ pcmpeqb xm2, xm5
+ pmovmskb %2, xm2
+%endmacro
+
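In scalar terms, LAST_MASK_LUT builds a bitmask of zero coefficients, which the xor in COEFF_LEVELRUN_LUT below inverts into a nonzero mask; a sketch:

    #include <stdint.h>
    /* Bit i set iff dct[i] != 0. The SIMD version gets the zero mask
     * from packsswb + pcmpeqb + pmovmskb, then inverts it with xor. */
    static uint32_t nonzero_mask( const int16_t *dct, int n )
    {
        uint32_t zero_mask = 0;
        for( int i = 0; i < n; i++ )
            zero_mask |= (uint32_t)(dct[i] == 0) << i;
        return zero_mask ^ ((1u << n) - 1);
    }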
+%macro COEFF_LEVELRUN_LUT 1
+cglobal coeff_level_run%1,2,4+(%1/9)
+%ifdef PIC
+ lea r5, [$$]
+ %define GLOBAL +r5-$$
+%else
+ %define GLOBAL
+%endif
+ LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
+%if %1==15
+ shr eax, 1
+%elif %1==8
+ and eax, 0xff
+%elif %1==4
+ and eax, 0xf
+%endif
+ xor eax, (1<<%1)-1
+ mov [r1+levelrun.mask], eax
+%if %1==15
+ add eax, eax
+%endif
+%if %1 > 8
+%if ARCH_X86_64
+ mov r4d, eax
+ shr r4d, 8
+%else
+ movzx r4d, ah ; bits 15-8 of the mask
+%endif
+%endif
+ movzx r2d, al ; bits 7-0 of the mask
+ shl eax, 32-%1-(%1&1)
+ LZCOUNT eax, eax, 0x1f
+ mov r3d, %1-1
+ sub r3d, eax
+ mov [r1+levelrun.last], r3d
+; Here we abuse pshufb, combined with a lookup table, to do a gather
+; operation based on a bitmask. For example:
+;
+; dct 15-8 (input): 0 0 4 0 0 -2 1 0
+; dct 7-0 (input): 0 0 -1 0 0 0 0 15
+; bitmask 1: 0 0 1 0 0 1 1 0
+; bitmask 2: 0 0 1 0 0 0 0 1
+; gather 15-8: 4 -2 1 __ __ __ __ __
+; gather 7-0: -1 15 __ __ __ __ __ __
+; levels (output): 4 -2 1 -1 15 __ __ __ __ __ __ __ __ __ __ __
+;
+; The overlapping, dependent stores almost surely cause a mess of
+; forwarding issues, but it's still enormously faster.
+%if %1 > 8
+ movzx eax, byte [popcnt_table+r4 GLOBAL]
+ movzx r3d, byte [popcnt_table+r2 GLOBAL]
+%if mmsize==16
+ movh m3, [dct_coef_shuffle+r4*8 GLOBAL]
+ movh m2, [dct_coef_shuffle+r2*8 GLOBAL]
+ mova m4, [pw_256]
+; Storing 8 bytes of shuffle constant and converting it (unpack + or)
+; is neutral to slightly faster in local speed measurements, but it
+; cuts the table size in half, which is surely a big cache win.
+ punpcklbw m3, m3
+ punpcklbw m2, m2
+ por m3, m4
+ por m2, m4
+ pshufb m1, m3
+ pshufb m0, m2
+ mova [r1+levelrun.level], m1
+; This obnoxious unaligned store messes with store forwarding and
+; stalls the CPU to no end, but merging the two registers before
+; storing requires a variable 128-bit shift. Emulating this does
+; work, but requires a lot of ops and the gain is tiny and
+; inconsistent, so we'll err on the side of fewer instructions.
+ movu [r1+rax*2+levelrun.level], m0
+%else ; mmsize==32
+ movq xm2, [dct_coef_shuffle+r4*8 GLOBAL]
+ vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
+ punpcklbw m2, m2
+ por m2, [pw_256]
+ pshufb m0, m2
+ vextracti128 [r1+levelrun.level], m0, 1
+ movu [r1+rax*2+levelrun.level], xm0
+%endif
+ add eax, r3d
+%else
+ movzx eax, byte [popcnt_table+r2 GLOBAL]
+ movh m1, [dct_coef_shuffle+r2*8 GLOBAL]
+ punpcklbw m1, m1
+ por m1, [pw_256]
+ pshufb m0, m1
+ mova [r1+levelrun.level], m0
+%endif
+ RET
+%endmacro
+
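Putting the pieces together, a scalar model of the 16-coefficient LUT path (illustrative only, reusing the helper sketches above; assumes 8-bit depth and at least one nonzero coefficient):

    #include <stdint.h>
    /* Model of COEFF_LEVELRUN_LUT 16: one popcount and one table-driven
     * gather per 8-coefficient half, with the second gather written at
     * the running total, mirroring the overlapping vector stores. Lanes
     * past the returned total hold don't-care values, as in the asm. */
    static int coeff_level_run16_model( const int16_t *dct, x264_run_level_t *runlevel )
    {
        uint32_t mask = nonzero_mask( dct, 16 );
        runlevel->mask = mask;
        runlevel->last = 31 - __builtin_clz( mask );
        int hi = mask >> 8, lo = mask & 0xff;
        for( int j = 0; j < 8; j++ ) /* gather coefficients 15-8 */
            runlevel->level[j] = dct[8 + dct_coef_shuffle[hi][j]/2];
        int n = popcnt_table[hi];
        for( int j = 0; j < 8; j++ ) /* gather coefficients 7-0, overlapping store */
            runlevel->level[n+j] = dct[dct_coef_shuffle[lo][j]/2];
        return n + popcnt_table[lo];
    }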
+%if HIGH_BIT_DEPTH==0
+INIT_MMX ssse3
+COEFF_LEVELRUN_LUT 4
+INIT_XMM ssse3
+COEFF_LEVELRUN_LUT 8
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+INIT_MMX ssse3, lzcnt
+COEFF_LEVELRUN_LUT 4
+INIT_XMM ssse3, lzcnt
+COEFF_LEVELRUN_LUT 8
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+INIT_XMM avx2, lzcnt
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+%endif
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac );
int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
extern %1
%endmacro
-%macro const 2+
+%macro const 1-2+
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
%ifidn __OUTPUT_FORMAT__,elf
global %1:data hidden
unsigned int i_sign;
/* level and run and total */
- /* set these to 2 to allow branchless i_trailing calculation */
- runlevel.level[1] = 2;
- runlevel.level[2] = 2;
i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
x264_prefetch( &x264_run_before[runlevel.mask] );
i_total_zero = runlevel.last + 1 - i_total;
+ /* branchless i_trailing calculation */
+ runlevel.level[i_total+0] = 2;
+ runlevel.level[i_total+1] = 2;
i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
| ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
| ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
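The sentinel writes moved from before the call to after it because the vector implementations store whole registers into level[], which would clobber any preset values; writing 2 at level[i_total] and level[i_total+1] (indices can reach 17, hence the level[18] array above) keeps the level[1] and level[2] reads defined when i_total < 3 without reintroducing branches. The abs()>1 test itself relies on (x+1)|(1-x) having its sign bit set exactly when |x| > 1; a quick hypothetical check:

    #include <assert.h>
    #include <stdint.h>
    /* Branchless abs(x)>1 flag, as used for i_trailing above. Relies on
     * arithmetic right shift of negative values (true on x264's targets). */
    static int32_t abs_gt1( int32_t x )
    {
        return (((x+1) | (1-x)) >> 31) & 1;
    }
    static void test_abs_gt1( void )
    {
        for( int32_t x = -64; x <= 64; x++ )
            assert( abs_gt1( x ) == (x < -1 || x > 1) );
    }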
cpu1 &= ~X264_CPU_BMI1;
}
if( x264_cpu_detect() & X264_CPU_AVX2 )
+ {
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+ if( x264_cpu_detect() & X264_CPU_LZCNT )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" );
+ cpu1 &= ~X264_CPU_LZCNT;
+ }
+ }
if( x264_cpu_detect() & X264_CPU_BMI2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );