From: Fiona Glaser
Date: Fri, 5 Apr 2013 01:00:23 +0000 (-0700)
Subject: x86: SSSE3 LUT-based faster coeff_level_run
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=67d6f602018d0fc1cb05cd6240e4fe1c2646169f;p=x264

x86: SSSE3 LUT-based faster coeff_level_run

~2x faster coeff_level_run.
Faster CAVLC encoding: {1%,2%,7%} overall with {superfast,medium,slower}.
Uses the same pshufb LUT abuse trick as in the previous ads_mvs patch.
---
diff --git a/common/bitstream.h b/common/bitstream.h
index a0ace070..629cf607 100644
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -55,9 +55,9 @@ typedef struct bs_s
 
 typedef struct
 {
-    int last;
-    int mask;
-    dctcoef level[16];
+    int32_t last;
+    int32_t mask;
+    ALIGNED_16( dctcoef level[18] );
 } x264_run_level_t;
 
 extern const vlc_t x264_coeff0_token[6];
diff --git a/common/quant.c b/common/quant.c
index 7dfd3bd8..3f70310f 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -633,6 +633,17 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score16 = x264_decimate_score16_ssse3;
         pf->decimate_score64 = x264_decimate_score64_ssse3;
         INIT_TRELLIS( ssse3 );
+        pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
+        pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
+        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt;
+            pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt;
+            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
+            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
+        }
     }
 
     if( cpu&X264_CPU_SSE4 )
@@ -681,6 +692,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         }
         pf->decimate_score64 = x264_decimate_score64_avx2;
         pf->denoise_dct = x264_denoise_dct_avx2;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
+            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
+        }
     }
 #endif // HAVE_MMX
 
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index b5637647..1389be1f 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -49,6 +49,7 @@ const pw_m2,       times 8 dw -2
 const pw_4,        times 8 dw 4
 const pw_8,        times 8 dw 8
 const pw_64,       times 8 dw 64
+const pw_256,      times 8 dw 256
 const pw_32_0,     times 4 dw 32,
                    times 4 dw 0
 const pw_8000,     times 8 dw 0x8000
@@ -63,4 +64,12 @@ const pd_1024,     times 4 dd 1024
 const pd_ffff,     times 4 dd 0xffff
 const pw_ff00,     times 8 dw 0xff00
 
+const popcnt_table
+%assign x 0
+%rep 256
+; population count
+db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
+%assign x x+1
+%endrep
+
 const sw_64,       dd 64
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index b0362597..4ee52fd6 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -129,14 +129,6 @@ pd_f0: times 4 dd 0xffff0000
 
 pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
 
-ads_mvs_count:
-%assign x 0
-%rep 256
-; population count
-db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
-%assign x x+1
-%endrep
-
 ads_mvs_shuffle:
 %macro ADS_MVS_SHUFFLE 8
     %assign y x
@@ -171,6 +163,7 @@ cextern pw_pmpmpmpm
 cextern pw_pmmpzzzz
 cextern pd_1
 cextern hsub_mul
+cextern popcnt_table
 
 ;=============================================================================
 ; SSD
@@ -5189,19 +5182,24 @@ ads_mvs_ssse3:
     add     r5, r6
     xor     r0d, r0d ; nmv
     mov   [r5], r0d
-    lea     r1, [ads_mvs_count]
+%ifdef PIC
+    lea     r1, [$$]
+    %define GLOBAL +r1-$$
+%else
+    %define GLOBAL
+%endif
 .loop:
     movh    m0, [r6]
     pcmpeqb m0, m5
     pmovmskb r2d, m0
-    xor     r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
-    movzx   r3d, byte [r1+r2] ; popcnt
+    xor    r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
+    movzx  r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
     add     r2d, r2d
     ; shuffle counters based on mv mask
-    pshufb  m2, m4, [r1+r2*8+(ads_mvs_shuffle-ads_mvs_count)]
+    pshufb  m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
     movu    [r4+r0*2], m2
     add     r0d, r3d
-    paddw   m4, m3  ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
+    paddw   m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
     add     r6, 8
     cmp     r6, r5
     jl .loop
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index ccae210b..0f7fe610 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -74,14 +74,38 @@ chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
 chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
 
+%if HIGH_BIT_DEPTH==0
+dct_coef_shuffle:
+%macro DCT_COEF_SHUFFLE 8
+    %assign y x
+    %rep 8
+        %rep 7
+            %rotate (~(y>>7))&1
+            %assign y y<<((~(y>>7))&1)
+        %endrep
+        db %1*2
+        %rotate 1
+        %assign y y<<1
+    %endrep
+%endmacro
+%assign x 0
+%rep 256
+    DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
+%assign x x+1
+%endrep
+%endif
+
 SECTION .text
 
 cextern pb_1
 cextern pw_1
+cextern pw_2
+cextern pw_256
 cextern pd_1
 cextern pb_01
 cextern pd_1024
 cextern deinterleave_shufd
+cextern popcnt_table
 
 %macro QUANT_DC_START 2
     movd       xm%1, r1m     ; mf
@@ -1567,6 +1591,13 @@ cglobal coeff_last64, 1,3
 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
 ;-----------------------------------------------------------------------------
 
+struc levelrun
+    .last: resd 1
+    .mask: resd 1
+    align 16, resb 1
+    .level: resw 16
+endstruc
+
 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
 %if WIN64
     DECLARE_REG_TMP 3,1,2,0,4,5,6
@@ -1581,6 +1612,7 @@ cglobal coeff_level_run%1,0,7
     movifnidn t0, r0mp
     movifnidn t1, r1mp
     pxor    m2, m2
+    xor     t3d, t3d
     LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
 %if %1==15
     shr     t5d, 1
@@ -1590,7 +1622,7 @@ cglobal coeff_level_run%1,0,7
     and     t5d, 0xf
 %endif
     xor     t5d, (1<<%1)-1
-    mov     [t1+4], t5d
+    mov     [t1+levelrun.mask], t5d
     shl     t5d, 32-%1
     mov     t4d, %1-1
     LZCOUNT t3d, t5d, 0x1f
@@ -1598,7 +1630,7 @@ cglobal coeff_level_run%1,0,7
     add     t5d, t5d
     sub     t4d, t3d
     shl     t5d, t3b
-    mov     [t1], t4d
+    mov     [t1+levelrun.last], t4d
 .loop:
     LZCOUNT t3d, t5d, 0x1f
 %if HIGH_BIT_DEPTH
@@ -1609,9 +1641,9 @@ cglobal coeff_level_run%1,0,7
     inc     t3d
     shl     t5d, t3b
 %if HIGH_BIT_DEPTH
-    mov     [t1+t6*4+ 8], t2d
+    mov     [t1+t6*4+levelrun.level], t2d
 %else
-    mov     [t1+t6*2+ 8], t2w
+    mov     [t1+t6*2+levelrun.level], t2w
 %endif
     inc     t6d
     sub     t4d, t3d
@@ -1641,3 +1673,133 @@ COEFF_LEVELRUN 16
 INIT_MMX mmx2, lzcnt
 COEFF_LEVELRUN 4
 COEFF_LEVELRUN 8
+
+; Similar to the one above, but saves the DCT
+; coefficients in m0/m1 so we don't have to load
+; them later.
+%macro LAST_MASK_LUT 3
+    pxor       xm5, xm5
+%if %1 <= 8
+    mova        m0, [%3]
+    packsswb    m2, m0, m0
+%else
+    mova       xm0, [%3+ 0]
+    mova       xm1, [%3+16]
+    packsswb   xm2, xm0, xm1
+%if mmsize==32
+    vinserti128 m0, m0, xm1, 1
+%endif
+%endif
+    pcmpeqb    xm2, xm5
+    pmovmskb    %2, xm2
+%endmacro
+
+%macro COEFF_LEVELRUN_LUT 1
+cglobal coeff_level_run%1,2,4+(%1/9)
+%ifdef PIC
+    lea         r5, [$$]
+    %define GLOBAL +r5-$$
+%else
+    %define GLOBAL
+%endif
+    LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
+%if %1==15
+    shr        eax, 1
+%elif %1==8
+    and        eax, 0xff
+%elif %1==4
+    and        eax, 0xf
+%endif
+    xor        eax, (1<<%1)-1
+    mov        [r1+levelrun.mask], eax
+%if %1==15
+    add        eax, eax
+%endif
+%if %1 > 8
+%if ARCH_X86_64
+    mov        r4d, eax
+    shr        r4d, 8
+%else
+    movzx      r4d, ah ; first 8 bits
+%endif
+%endif
+    movzx      r2d, al ; second 8 bits
+    shl        eax, 32-%1-(%1&1)
+    LZCOUNT    eax, eax, 0x1f
+    mov        r3d, %1-1
+    sub        r3d, eax
+    mov        [r1+levelrun.last], r3d
+; Here we abuse pshufb, combined with a lookup table, to do a gather
+; operation based on a bitmask. For example:
+;
+; dct 15-8 (input):  0  0  4  0  0 -2  1  0
+; dct 7-0  (input):  0  0 -1  0  0  0  0 15
+; bitmask 1:         0  0  1  0  0  1  1  0
+; bitmask 2:         0  0  1  0  0  0  0  1
+; gather 15-8:       4 -2  1 __ __ __ __ __
+; gather 7-0:       -1 15 __ __ __ __ __ __
+; levels (output):   4 -2  1 -1 15 __ __ __ __ __ __ __ __ __ __ __
+;
+; The overlapping, dependent stores almost surely cause a mess of
+; forwarding issues, but it's still enormously faster.
+%if %1 > 8
+    movzx      eax, byte [popcnt_table+r4 GLOBAL]
+    movzx      r3d, byte [popcnt_table+r2 GLOBAL]
+%if mmsize==16
+    movh        m3, [dct_coef_shuffle+r4*8 GLOBAL]
+    movh        m2, [dct_coef_shuffle+r2*8 GLOBAL]
+    mova        m4, [pw_256]
+; Storing 8 bytes of shuffle constant and converting it (unpack + or)
+; is neutral to slightly faster in local speed measurements, but it
+; cuts the table size in half, which is surely a big cache win.
+    punpcklbw   m3, m3
+    punpcklbw   m2, m2
+    por         m3, m4
+    por         m2, m4
+    pshufb      m1, m3
+    pshufb      m0, m2
+    mova       [r1+levelrun.level], m1
+; This obnoxious unaligned store messes with store forwarding and
+; stalls the CPU to no end, but merging the two registers before
+; storing requires a variable 128-bit shift. Emulating this does
+; work, but requires a lot of ops and the gain is tiny and
+; inconsistent, so we'll err on the side of fewer instructions.
+    movu       [r1+rax*2+levelrun.level], m0
+%else ; mmsize==32
+    movq       xm2, [dct_coef_shuffle+r4*8 GLOBAL]
+    vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
+    punpcklbw   m2, m2
+    por         m2, [pw_256]
+    pshufb      m0, m2
+    vextracti128 [r1+levelrun.level], m0, 1
+    movu       [r1+rax*2+levelrun.level], xm0
+%endif
+    add        eax, r3d
+%else
+    movzx      eax, byte [popcnt_table+r2 GLOBAL]
+    movh        m1, [dct_coef_shuffle+r2*8 GLOBAL]
+    punpcklbw   m1, m1
+    por         m1, [pw_256]
+    pshufb      m0, m1
+    mova       [r1+levelrun.level], m0
+%endif
+    RET
+%endmacro
+
+%if HIGH_BIT_DEPTH==0
+INIT_MMX ssse3
+COEFF_LEVELRUN_LUT 4
+INIT_XMM ssse3
+COEFF_LEVELRUN_LUT 8
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+INIT_MMX ssse3, lzcnt
+COEFF_LEVELRUN_LUT 4
+INIT_XMM ssse3, lzcnt
+COEFF_LEVELRUN_LUT 8
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+INIT_YMM avx2, lzcnt
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+%endif
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 559c6f49..5541db03 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -110,15 +110,25 @@ int x264_coeff_last64_avx2_lzcnt( dctcoef *dct );
 int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
 int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac );
 int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index f0e816d7..008f73de 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -700,7 +700,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     extern %1
 %endmacro
 
-%macro const 2+
+%macro const 1-2+
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     %ifidn __OUTPUT_FORMAT__,elf
         global %1:data hidden
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index e41b5e14..daf0614c 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -128,13 +128,13 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
     unsigned int i_sign;
 
     /* level and run and total */
-    /* set these to 2 to allow branchless i_trailing calculation */
-    runlevel.level[1] = 2;
-    runlevel.level[2] = 2;
     i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
     x264_prefetch( &x264_run_before[runlevel.mask] );
     i_total_zero = runlevel.last + 1 - i_total;
 
+    /* branchless i_trailing calculation */
+    runlevel.level[i_total+0] = 2;
+    runlevel.level[i_total+1] = 2;
     i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
                | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
                | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 7a2f6d4d..1173126a 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2593,7 +2593,14 @@ static int check_all_flags( void )
         cpu1 &= ~X264_CPU_BMI1;
     }
     if( x264_cpu_detect() & X264_CPU_AVX2 )
+    {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+        if( x264_cpu_detect() & X264_CPU_LZCNT )
+        {
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" );
+            cpu1 &= ~X264_CPU_LZCNT;
+        }
+    }
     if( x264_cpu_detect() & X264_CPU_BMI2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
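
For readers following the asm, the contract that the new popcnt_table/pshufb routine fills in can be modelled in scalar C roughly as follows (8-bit-depth run16 case, block known to contain at least one nonzero coefficient, GCC/Clang-style __builtin_clz assumed). This is an illustrative sketch only: coeff_level_run16_ref, run_level_ref_t and popcnt8 are stand-in names rather than x264 API; only the last/mask/level fields mirror the patched x264_run_level_t.

#include <assert.h>
#include <stdint.h>

typedef struct
{
    int32_t last;       /* index of the last nonzero coefficient */
    int32_t mask;       /* bit i set iff dct[i] != 0 */
    int16_t level[18];  /* nonzero levels, last coefficient first */
} run_level_ref_t;      /* models the patched x264_run_level_t */

/* Per-byte popcount, the same values the popcnt_table constant holds. */
static int popcnt8( unsigned v )
{
    int n = 0;
    for( int i = 0; i < 8; i++ )
        n += (v >> i) & 1;
    return n;
}

/* Scalar model of coeff_level_run16: returns the number of nonzero
 * coefficients and gathers them from the last one downwards, which is
 * what the pshufb gather + overlapping stores do without a loop. */
static int coeff_level_run16_ref( const int16_t *dct, run_level_ref_t *rl )
{
    unsigned mask = 0;
    for( int i = 0; i < 16; i++ )
        mask |= (unsigned)(dct[i] != 0) << i;           /* LAST_MASK_LUT + xor */

    rl->mask = mask;
    rl->last = 31 - __builtin_clz( mask );              /* LZCOUNT; mask != 0 here */

    /* One popcnt_table lookup per 8-coefficient half in the asm. */
    int total = popcnt8( mask >> 8 ) + popcnt8( mask & 0xff );

    int n = 0;
    for( int i = rl->last; i >= 0; i-- )                /* reverse scan order */
        if( mask & (1u << i) )
            rl->level[n++] = dct[i];                    /* dct_coef_shuffle gather */
    return total;                                       /* == n */
}

int main( void )
{
    /* The example block from the comments in quant-a.asm above. */
    int16_t dct[16] = {0};
    dct[13] = 4; dct[10] = -2; dct[9] = 1; dct[5] = -1; dct[0] = 15;

    run_level_ref_t rl;
    int total = coeff_level_run16_ref( dct, &rl );

    assert( total == 5 && rl.last == 13 );
    const int16_t expect[5] = { 4, -2, 1, -1, 15 };
    for( int i = 0; i < 5; i++ )
        assert( rl.level[i] == expect[i] );
    return 0;
}

In the SIMD version, the two per-byte popcounts come straight from popcnt_table and the gather loop is replaced by one pshufb per 8-coefficient half, indexed through the 256-entry, 8-bytes-per-entry dct_coef_shuffle table; the two halves are then combined with the overlapping stores described in the asm comments.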
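
The cavlc.c hunk follows from the new routine storing a full vector of levels: sentinels preloaded into level[1]/level[2] before the call would simply be overwritten, so the two sentinel 2s are now written to level[i_total] and level[i_total+1] afterwards, which is also why level[] grows to 18 entries (and gets 16-byte alignment) in bitstream.h. The i_trailing expression itself is unchanged; it relies on the sign bit of (x+1)|(1-x) being set exactly when abs(x) > 1. A minimal standalone check of that identity (plain C, not taken from x264; it assumes 32-bit int and arithmetic right shift, as the existing x264 code already does):

#include <assert.h>
#include <stdlib.h>

/* Sign bit of (x+1)|(1-x) is set iff x < -1 or x > 1. */
static int abs_gt1( int x )
{
    return (((x+1) | (1-x)) >> 31) & 1;
}

int main( void )
{
    for( int x = -1000; x <= 1000; x++ )
        assert( abs_gt1( x ) == (abs( x ) > 1) );
    return 0;
}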