~2x faster coeff_level_run.
Faster CAVLC encoding: roughly {1%,2%,7%} overall speedup with {superfast,medium,slower}.
Uses the same pshufb LUT abuse trick as in the previous ads_mvs patch.
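For reference, here is roughly the operation being vectorized, in the shape of x264's scalar fallback (a sketch assuming 8-bit depth, i.e. 16-bit dctcoef; the function name and exact loop structure are illustrative, not the actual C source):

    #include <stdint.h>
    /* Scan dct[] from the highest index down: record the index of the
     * last nonzero coefficient, a bitmask of nonzero positions, and the
     * nonzero values compacted into level[], highest index first.
     * Callers guarantee at least one nonzero coefficient. */
    static int coeff_level_run_ref( const int16_t *dct, x264_run_level_t *runlevel, int n )
    {
        int total = 0;
        runlevel->mask = 0;
        for( int i = n-1; i >= 0; i-- )
            if( dct[i] )
            {
                if( !total )
                    runlevel->last = i;
                runlevel->mask |= 1 << i;
                runlevel->level[total++] = dct[i];
            }
        return total;
    }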
typedef struct
{
- int last;
- int mask;
- dctcoef level[16];
+ int32_t last;
+ int32_t mask;
+ ALIGNED_16( dctcoef level[18] );
} x264_run_level_t;
extern const vlc_t x264_coeff0_token[6];
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
+ pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
+ pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt;
+ pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
+ }
}
if( cpu&X264_CPU_SSE4 )
}
pf->decimate_score64 = x264_decimate_score64_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
+ }
}
#endif // HAVE_MMX
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_64, times 8 dw 64
+const pw_256, times 8 dw 256
const pw_32_0, times 4 dw 32,
times 4 dw 0
const pw_8000, times 8 dw 0x8000
const pd_ffff, times 4 dd 0xffff
const pw_ff00, times 8 dw 0xff00
+const popcnt_table
+%assign x 0
+%rep 256
+; population count
+db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
+%assign x x+1
+%endrep
+
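The table moved here from pixel-a.asm (where it was ads_mvs_count) so quant-a.asm can share it. Its contents are simply the population count of each byte value, equivalent to this C sketch (names illustrative):

    #include <stdint.h>
    /* 256-entry byte popcount table, as emitted by the %rep above. */
    static uint8_t popcnt_table[256];
    static void init_popcnt_table( void )
    {
        for( int x = 0; x < 256; x++ )
            popcnt_table[x] = ((x>>0)&1) + ((x>>1)&1) + ((x>>2)&1) + ((x>>3)&1)
                            + ((x>>4)&1) + ((x>>5)&1) + ((x>>6)&1) + ((x>>7)&1);
    }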
const sw_64, dd 64
pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
-ads_mvs_count:
-%assign x 0
-%rep 256
-; population count
-db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
-%assign x x+1
-%endrep
-
ads_mvs_shuffle:
%macro ADS_MVS_SHUFFLE 8
%assign y x
cextern pw_pmmpzzzz
cextern pd_1
cextern hsub_mul
+cextern popcnt_table
;=============================================================================
; SSD
add r5, r6
xor r0d, r0d ; nmv
mov [r5], r0d
- lea r1, [ads_mvs_count]
+%ifdef PIC
+ lea r1, [$$]
+ %define GLOBAL +r1-$$
+%else
+ %define GLOBAL
+%endif
.loop:
movh m0, [r6]
pcmpeqb m0, m5
pmovmskb r2d, m0
- xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
- movzx r3d, byte [r1+r2] ; popcnt
+ xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
+ movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
add r2d, r2d
; shuffle counters based on mv mask
- pshufb m2, m4, [r1+r2*8+(ads_mvs_shuffle-ads_mvs_count)]
+ pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
movu [r4+r0*2], m2
add r0d, r3d
- paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
+ paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
add r6, 8
cmp r6, r5
jl .loop
chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
+%if HIGH_BIT_DEPTH==0
+dct_coef_shuffle:
+%macro DCT_COEF_SHUFFLE 8
+ %assign y x
+ %rep 8
+ %rep 7
+ %rotate (~(y>>7))&1
+ %assign y y<<((~(y>>7))&1)
+ %endrep
+ db %1*2
+ %rotate 1
+ %assign y y<<1
+ %endrep
+%endmacro
+%assign x 0
+%rep 256
+ DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
+%assign x x+1
+%endrep
+%endif
+
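Each 8-byte entry of dct_coef_shuffle holds pshufb source indices that compact the coefficients selected by an 8-bit nonzero mask to the front of the register, highest index first. A C sketch of an equivalent generator (illustrative; the macro above fills the leftover lanes with rotated leftovers rather than zeros, but those lanes are don't-cares):

    #include <stdint.h>
    /* For each mask, the byte offsets (index*2) of the set 16-bit
     * coefficients, in descending index order; remaining lanes are
     * don't-cares. */
    static uint8_t dct_coef_shuffle[256][8];
    static void init_dct_coef_shuffle( void )
    {
        for( int mask = 0; mask < 256; mask++ )
        {
            int n = 0;
            for( int i = 7; i >= 0; i-- )
                if( mask & (1 << i) )
                    dct_coef_shuffle[mask][n++] = i*2;
            while( n < 8 )
                dct_coef_shuffle[mask][n++] = 0; /* don't-care lane */
        }
    }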
SECTION .text
cextern pb_1
cextern pw_1
+cextern pw_2
+cextern pw_256
cextern pd_1
cextern pb_01
cextern pd_1024
cextern deinterleave_shufd
+cextern popcnt_table
%macro QUANT_DC_START 2
movd xm%1, r1m ; mf
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
+struc levelrun
+ .last: resd 1
+ .mask: resd 1
+ align 16, resb 1
+ .level: resw 16
+endstruc
+
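The struc mirrors x264_run_level_t on the C side; a compile-time layout check of what the asm assumes (hypothetical, not part of the patch, assuming the struct definition above):

    #include <stddef.h>
    /* .last at 0, .mask at 4, .level 16-byte aligned at 16, matching
     * the levelrun struc above. */
    _Static_assert( offsetof(x264_run_level_t, last)  ==  0, "last offset"  );
    _Static_assert( offsetof(x264_run_level_t, mask)  ==  4, "mask offset"  );
    _Static_assert( offsetof(x264_run_level_t, level) == 16, "level offset" );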
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%if WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6
movifnidn t0, r0mp
movifnidn t1, r1mp
pxor m2, m2
+ xor t3d, t3d
LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
%if %1==15
shr t5d, 1
and t5d, 0xf
%endif
xor t5d, (1<<%1)-1
- mov [t1+4], t5d
+ mov [t1+levelrun.mask], t5d
shl t5d, 32-%1
mov t4d, %1-1
LZCOUNT t3d, t5d, 0x1f
add t5d, t5d
sub t4d, t3d
shl t5d, t3b
- mov [t1], t4d
+ mov [t1+levelrun.last], t4d
.loop:
LZCOUNT t3d, t5d, 0x1f
%if HIGH_BIT_DEPTH
inc t3d
shl t5d, t3b
%if HIGH_BIT_DEPTH
- mov [t1+t6*4+ 8], t2d
+ mov [t1+t6*4+levelrun.level], t2d
%else
- mov [t1+t6*2+ 8], t2w
+ mov [t1+t6*2+levelrun.level], t2w
%endif
inc t6d
sub t4d, t3d
INIT_MMX mmx2, lzcnt
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
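For contrast with the LUT approach added below, the bitscan variant above locates each nonzero coefficient with a leading-zero count. A scalar model (illustrative; assumes the nonzero mask already computed by LAST_MASK, and at least one set bit):

    #include <stdint.h>
    /* Walk the nonzero-coefficient mask from the most significant bit
     * down, emitting levels in descending index order, one LZCOUNT per
     * nonzero coefficient. */
    static int levels_from_mask( const int16_t *dct, uint32_t mask, int16_t *level )
    {
        int total = 0;
        while( mask )
        {
            int i = 31 - __builtin_clz( mask ); /* the LZCOUNT step */
            level[total++] = dct[i];
            mask &= ~(1u << i);
        }
        return total;
    }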
+
+; Similar to the one above, but saves the DCT
+; coefficients in m0/m1 so we don't have to load
+; them later.
+%macro LAST_MASK_LUT 3
+ pxor xm5, xm5
+%if %1 <= 8
+ mova m0, [%3]
+ packsswb m2, m0, m0
+%else
+ mova xm0, [%3+ 0]
+ mova xm1, [%3+16]
+ packsswb xm2, xm0, xm1
+%if mmsize==32
+ vinserti128 m0, m0, xm1, 1
+%endif
+%endif
+ pcmpeqb xm2, xm5
+ pmovmskb %2, xm2
+%endmacro
+
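In scalar terms, LAST_MASK_LUT builds a bitmask of zero coefficients, which the xor in COEFF_LEVELRUN_LUT below inverts into a nonzero mask; a sketch:

    #include <stdint.h>
    /* Bit i set iff dct[i] != 0. The SIMD version gets the zero mask
     * from packsswb + pcmpeqb + pmovmskb, then inverts it with xor. */
    static uint32_t nonzero_mask( const int16_t *dct, int n )
    {
        uint32_t zero_mask = 0;
        for( int i = 0; i < n; i++ )
            zero_mask |= (uint32_t)(dct[i] == 0) << i;
        return zero_mask ^ ((1u << n) - 1);
    }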
+%macro COEFF_LEVELRUN_LUT 1
+cglobal coeff_level_run%1,2,4+(%1/9)
+%ifdef PIC
+ lea r5, [$$]
+ %define GLOBAL +r5-$$
+%else
+ %define GLOBAL
+%endif
+ LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
+%if %1==15
+ shr eax, 1
+%elif %1==8
+ and eax, 0xff
+%elif %1==4
+ and eax, 0xf
+%endif
+ xor eax, (1<<%1)-1
+ mov [r1+levelrun.mask], eax
+%if %1==15
+ add eax, eax
+%endif
+%if %1 > 8
+%if ARCH_X86_64
+ mov r4d, eax
+ shr r4d, 8
+%else
+ movzx r4d, ah ; bits 15-8 of the mask
+%endif
+%endif
+ movzx r2d, al ; bits 7-0 of the mask
+ shl eax, 32-%1-(%1&1)
+ LZCOUNT eax, eax, 0x1f
+ mov r3d, %1-1
+ sub r3d, eax
+ mov [r1+levelrun.last], r3d
+; Here we abuse pshufb, combined with a lookup table, to do a gather
+; operation based on a bitmask. For example:
+;
+; dct 15-8 (input): 0 0 4 0 0 -2 1 0
+; dct 7-0 (input): 0 0 -1 0 0 0 0 15
+; bitmask 1: 0 0 1 0 0 1 1 0
+; bitmask 2: 0 0 1 0 0 0 0 1
+; gather 15-8: 4 -2 1 __ __ __ __ __
+; gather 7-0: -1 15 __ __ __ __ __ __
+; levels (output): 4 -2 1 -1 15 __ __ __ __ __ __ __ __ __ __ __
+;
+; The overlapping, dependent stores almost surely cause a mess of
+; forwarding issues, but it's still enormously faster.
+%if %1 > 8
+ movzx eax, byte [popcnt_table+r4 GLOBAL]
+ movzx r3d, byte [popcnt_table+r2 GLOBAL]
+%if mmsize==16
+ movh m3, [dct_coef_shuffle+r4*8 GLOBAL]
+ movh m2, [dct_coef_shuffle+r2*8 GLOBAL]
+ mova m4, [pw_256]
+; Storing 8 bytes of shuffle constant and converting it (unpack + or)
+; is neutral to slightly faster in local speed measurements, but it
+; cuts the table size in half, which is surely a big cache win.
+ punpcklbw m3, m3
+ punpcklbw m2, m2
+ por m3, m4
+ por m2, m4
+ pshufb m1, m3
+ pshufb m0, m2
+ mova [r1+levelrun.level], m1
+; This obnoxious unaligned store messes with store forwarding and
+; stalls the CPU to no end, but merging the two registers before
+; storing requires a variable 128-bit shift. Emulating this does
+; work, but requires a lot of ops and the gain is tiny and
+; inconsistent, so we'll err on the side of fewer instructions.
+ movu [r1+rax*2+levelrun.level], m0
+%else ; mmsize==32
+ movq xm2, [dct_coef_shuffle+r4*8 GLOBAL]
+ vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
+ punpcklbw m2, m2
+ por m2, [pw_256]
+ pshufb m0, m2
+ vextracti128 [r1+levelrun.level], m0, 1
+ movu [r1+rax*2+levelrun.level], xm0
+%endif
+ add eax, r3d
+%else
+ movzx eax, byte [popcnt_table+r2 GLOBAL]
+ movh m1, [dct_coef_shuffle+r2*8 GLOBAL]
+ punpcklbw m1, m1
+ por m1, [pw_256]
+ pshufb m0, m1
+ mova [r1+levelrun.level], m0
+%endif
+ RET
+%endmacro
+
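Putting the pieces together, a scalar model of the 16-coefficient LUT path (illustrative only, reusing the helper sketches above; assumes 8-bit depth and at least one nonzero coefficient):

    #include <stdint.h>
    /* Model of COEFF_LEVELRUN_LUT 16: one popcount and one table-driven
     * gather per 8-coefficient half, with the second gather written at
     * the running total, mirroring the overlapping vector stores. Lanes
     * past the returned total hold don't-care values, as in the asm. */
    static int coeff_level_run16_model( const int16_t *dct, x264_run_level_t *runlevel )
    {
        uint32_t mask = nonzero_mask( dct, 16 );
        runlevel->mask = mask;
        runlevel->last = 31 - __builtin_clz( mask );
        int hi = mask >> 8, lo = mask & 0xff;
        for( int j = 0; j < 8; j++ ) /* gather coefficients 15-8 */
            runlevel->level[j] = dct[8 + dct_coef_shuffle[hi][j]/2];
        int n = popcnt_table[hi];
        for( int j = 0; j < 8; j++ ) /* gather coefficients 7-0, overlapping store */
            runlevel->level[n+j] = dct[dct_coef_shuffle[lo][j]/2];
        return n + popcnt_table[lo];
    }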
+%if HIGH_BIT_DEPTH==0
+INIT_MMX ssse3
+COEFF_LEVELRUN_LUT 4
+INIT_XMM ssse3
+COEFF_LEVELRUN_LUT 8
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+INIT_MMX ssse3, lzcnt
+COEFF_LEVELRUN_LUT 4
+INIT_XMM ssse3, lzcnt
+COEFF_LEVELRUN_LUT 8
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+INIT_XMM avx2, lzcnt
+COEFF_LEVELRUN_LUT 15
+COEFF_LEVELRUN_LUT 16
+%endif
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac );
int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
extern %1
%endmacro
-%macro const 2+
+%macro const 1-2+
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
%ifidn __OUTPUT_FORMAT__,elf
global %1:data hidden
unsigned int i_sign;
/* level and run and total */
- /* set these to 2 to allow branchless i_trailing calculation */
- runlevel.level[1] = 2;
- runlevel.level[2] = 2;
i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
x264_prefetch( &x264_run_before[runlevel.mask] );
i_total_zero = runlevel.last + 1 - i_total;
+ /* branchless i_trailing calculation */
+ runlevel.level[i_total+0] = 2;
+ runlevel.level[i_total+1] = 2;
i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
| ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
| ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
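The sentinel writes moved from before the call to after it because the vector implementations store whole registers into level[], which would clobber any preset values; writing 2 at level[i_total] and level[i_total+1] (indices can reach 17, hence the level[18] array above) keeps the level[1] and level[2] reads defined when i_total < 3 without reintroducing branches. The abs()>1 test itself relies on (x+1)|(1-x) having its sign bit set exactly when |x| > 1; a quick hypothetical check:

    #include <assert.h>
    #include <stdint.h>
    /* Branchless abs(x)>1 flag, as used for i_trailing above. Relies on
     * arithmetic right shift of negative values (true on x264's targets). */
    static int32_t abs_gt1( int32_t x )
    {
        return (((x+1) | (1-x)) >> 31) & 1;
    }
    static void test_abs_gt1( void )
    {
        for( int32_t x = -64; x <= 64; x++ )
            assert( abs_gt1( x ) == (x < -1 || x > 1) );
    }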
cpu1 &= ~X264_CPU_BMI1;
}
if( x264_cpu_detect() & X264_CPU_AVX2 )
+ {
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+ if( x264_cpu_detect() & X264_CPU_LZCNT )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" );
+ cpu1 &= ~X264_CPU_LZCNT;
+ }
+ }
if( x264_cpu_detect() & X264_CPU_BMI2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );