; Load the DC quant multiplier (mf) and rounding bias from the stack args and
; broadcast them across the vector register (SPLATD for 32-bit lanes in
; high bit depth, pshufb byte-splat on SSE4-class CPUs).
%macro QUANT_DC_START 0
    movd       m6, r1m                 ; m6 = mf (quant multiplier)
    movd       m7, r2m                 ; m7 = bias (rounding offset)
%if HIGH_BIT_DEPTH
    SPLATD     m6, m6
    SPLATD     m7, m7
%elif cpuflag(sse4) ; ssse3, but not faster on conroe
    mova       m5, [pb_01]
    pshufb     m6, m5
    pshufb     m7, m5
%else
; NOTE(review): the %else branch body is elided in this fragment — restore from upstream before assembling.
%endmacro
; Set al (return value) to nonzero iff any coefficient survived quantization.
; The accumulated nonzero mask lives in m5 (SSE4 ptest path) or ecx otherwise.
; setne is hoisted below the %if so it is emitted exactly once for all paths.
%macro QUANT_END 0
    xor        eax, eax                ; clear return reg before setne al
%if cpuflag(sse4)
    ptest      m5, m5                  ; ZF=1 iff m5 == 0
%else ; !sse4
%if ARCH_X86_64
%if mmsize == 16
    packsswb   m5, m5
%endif
    test       ecx, ecx
%endif
; NOTE(review): context lines elided in this fragment between the hunks below.
%endif
%endif ; cpuflag
    setne      al                      ; al = 1 if any nonzero coefficient
%endmacro
%if HIGH_BIT_DEPTH
; Quantize one group of DC coefficients at [%1] using multiplier %2 and
; bias %3; accumulate the nonzero mask into m5 (ACCUM por, slot %4).
; The sign is stripped for the multiply/shift and restored with PSIGND.
%macro QUANT_ONE_DC 4
%if cpuflag(sse4)
    mova       m0, [%1]
    paddd      m1, %3                  ; |coef| + bias
    pmulld     m1, %2                  ; * mf
    psrad      m1, 16
%else ; !sse4
    mova       m0, [%1]
    ABSD       m1, m0
    psllq      m2, 32
    paddd      m1, m2
    psrld      m1, 16
%endif ; cpuflag
    PSIGND     m1, m0                  ; restore original sign
    mova       [%1], m1
    ACCUM      por, 5, 1, %4           ; fold into nonzero accumulator m5
%endmacro
; Quantize two vectors' worth of DC coefficients at [%1] / [%1+mmsize];
; nonzero results are OR-accumulated into m5 via ACCUM (slot %4).
%macro QUANT_TWO_DC 4
%if cpuflag(sse4)
    mova       m0, [%1       ]
    mova       m1, [%1+mmsize]
    ABSD       m2, m0
    ABSD       m3, m1
    psrad      m3, 16
    PSIGND     m2, m0
    PSIGND     m3, m1
    mova       [%1       ], m2
    mova       [%1+mmsize], m3
    ACCUM      por, 5, 2, %4
    por        m5, m3
%else ; !sse4
; NOTE(review): the !sse4 branch and %endmacro are elided in this fragment.
; Quantize two vectors of AC coefficients at [%1] / [%1+mmsize] using the
; per-coefficient multiplier table %2 and bias table %3; nonzero results are
; OR-accumulated into m5 (ACCUM slot %4).
%macro QUANT_TWO_AC 4
%if cpuflag(sse4)
    mova       m0, [%1       ]
    mova       m1, [%1+mmsize]
    ABSD       m2, m0
    ABSD       m3, m1
    paddd      m2, [%3       ]         ; |coef| + bias
    paddd      m3, [%3+mmsize]
    pmulld     m2, [%2       ]         ; * mf
    pmulld     m3, [%2+mmsize]
    psrad      m2, 16
    psrad      m3, 16
    PSIGND     m2, m0                  ; restore signs
    PSIGND     m3, m1
    mova       [%1       ], m2
    mova       [%1+mmsize], m3
    ACCUM      por, 5, 2, %4
    por        m5, m3
%else ; !sse4
; NOTE(review): the !sse4 branch and %endmacro are elided in this fragment.
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
%macro QUANT_ONE 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
; NOTE(review): macro body and %endmacro elided in this fragment.
INIT_MMX mmx2
QUANT_DC quant_2x2_dc, 1
%if ARCH_X86_64 == 0 ; not needed because sse2 is faster
QUANT_DC quant_4x4_dc, 4
INIT_MMX mmx
QUANT_AC quant_4x4, 4
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 i_qbits
    mova       m0, %2
%if HIGH_BIT_DEPTH
    pmaddwd    m0, %1
    pslld      m0, m2                  ; << i_qbits
%else
; NOTE(review): low-bit-depth branch elided in this fragment.
;;; m3 f
;;; m4 0
    mova       m0, %1
%if HIGH_BIT_DEPTH
    pmadcswd   m0, m0, %2, m3          ; XOP multiply-add with rounding term
    psrad      m0, m2                  ; >> i_qbits
%else
    punpckhwd  m1, m0, m4
%endrep
%endmacro
; Temporary-register assignment differs per ABI: t0-t2 map onto the arg regs.
%if WIN64
DECLARE_REG_TMP 6,3,2
%elif ARCH_X86_64
DECLARE_REG_TMP 4,3,2
%else
DECLARE_REG_TMP 2,0,1
; NOTE(review): context lines elided in this fragment (qp/6 computation setup).
    sub        t2d, t1d
    sub        t2d, t1d                ; i_mf = i_qp % 6
    shl        t2d, %1
%if ARCH_X86_64
    add        r1, t2                  ; dequant_mf[i_mf]
%else
    add        r1, r1mp                ; dequant_mf[i_mf]
    psrld      m3, 1
    DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx)
cglobal dequant_%1x%1_flat16, 0,3
    movifnidn  t2d, r2m
%if %1 == 8
    DEQUANT16_FLAT [r1+32], 32, 96
%endif
    RET
%endif ; !HIGH_BIT_DEPTH && !AVX
%endmacro ; DEQUANT
; Instantiate dequant functions per bit depth / arch / SIMD flavor.
%if HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
%else
%if ARCH_X86_64 == 0
INIT_MMX mmx
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
; NOTE(review): lines elided in this fragment (DEQUANT_DC macro header).
    psrld      m4, 1
    movd       m2, [r1]
%assign x 0
%if HIGH_BIT_DEPTH
    pshufd     m2, m2, 0               ; broadcast dequant scale to all lanes
%rep SIZEOF_PIXEL*32/mmsize
    mova       m0, [r0+x]
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT_DC d, pmaddwd
INIT_XMM xop
DEQUANT_DC d, pmaddwd
%else
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEQUANT_DC w, pmullw
%endif
%endif
; t4 is eax for return value.
%if ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
%else
DECLARE_REG_TMP 4,1,2,3,0,5
; NOTE(review): macro header elided in this fragment.
%if cpuflag(sse4)
%assign %%regs %%regs-1
%endif
%if ARCH_X86_64 == 0
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
%endif
cglobal optimize_chroma_2x2_dc, 0,%%regs,7
    REP_RET
%endmacro
%if HIGH_BIT_DEPTH == 0
INIT_XMM sse2
OPTIMIZE_CHROMA_2x2_DC
INIT_XMM ssse3
OPTIMIZE_CHROMA_2x2_DC
%endif ; !HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,8
    pxor       m6, m6
    movsxdifnidn r3, r3d               ; sign-extend size: used as a 64-bit index
.loop:
    mova       m2, [r0+r3*4-2*mmsize]  ; int32_t coefficients, scale 4
    mova       m3, [r0+r3*4-1*mmsize]
; NOTE(review): loop body elided in this fragment.
    REP_RET
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx
DENOISE_DCT
%endif
; NOTE(review): 8-bit-depth variant below (int16_t coefficients, scale 2).
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,7
    pxor       m6, m6
    movsxdifnidn r3, r3d               ; sign-extend size: used as a 64-bit index
.loop:
    mova       m2, [r0+r3*2-2*mmsize]
    mova       m3, [r0+r3*2-1*mmsize]
    REP_RET
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx
DENOISE_DCT
%endif
; Build a bitmask of nonzero coefficients into %2 from the block at %3.
%macro DECIMATE_MASK 5
%if mmsize==16
%if HIGH_BIT_DEPTH
    movdqa     xmm0, [%3+ 0]
    movdqa     xmm1, [%3+32]
    packssdw   xmm0, [%3+16]
    pmovmskb   %2, xmm0
%else ; mmsize==8
%if HIGH_BIT_DEPTH
    movq       mm0, [%3+ 0]
    movq       mm1, [%3+16]
    movq       mm2, [%3+32]
; NOTE(review): remainder of DECIMATE_MASK elided in this fragment.
%macro DECIMATE4x4 1
;A LUT is faster than bsf on older AMD processors.
;This is not true for score64.
cglobal decimate_score%1, 1,3
%ifdef PIC
    add        al, byte [mask_table + rdx]
%else
.loop:
    tzcnt      ecx, edx                ; tzcnt: defined for 0 input, faster than bsf on modern CPUs
    shr        edx, cl
    add        al, byte [table + rcx]
    shr        edx, 1
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE4x4 15
DECIMATE4x4 16
; Compute the decimate score for an 8x8 block of coefficients.
; 64-bit path scans the whole 64-bit run mask with tzcnt; the 32-bit path
; walks it in two halves with shrd.
%macro DECIMATE8x8 0
%if ARCH_X86_64
cglobal decimate_score64, 1,5
%ifdef PIC
    lea        r4, [decimate_table8]
; NOTE(review): setup lines elided in this fragment.
    add        eax, r3d
    jne        .ret9
.loop:
    tzcnt      rcx, r1                 ; index of next nonzero coefficient
    shr        r1, cl
    add        al, byte [table + rcx]
    shr        r1, 1
; NOTE(review): 32-bit variant below; intervening lines elided.
    add        r0, r2
    jne        .ret9 ;r0 is zero at this point, so we don't need to zero it
.loop:
    tzcnt      ecx, r3
    test       r3, r3
    je         .largerun
    shrd       r3, r4, cl              ; shift the run across the low/high halves
.largerun:
    mov        r3, r4
    xor        r4, r4
    tzcnt      ecx, r3
    shr        r3, cl
    shr        r3, 1
    jne        .loop
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE8x8
%endif
%endif
%endmacro
%if HIGH_BIT_DEPTH
; Produce a mask of nonzero coefficients for coeff_last (width %1).
%macro LAST_MASK 3-4
%if %1 == 4
    movq       mm0, [%3]
; NOTE(review): macro body elided in this fragment.
    RET
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
COEFF_LAST8
%endif
%endmacro
%macro COEFF_LAST48 0
%if ARCH_X86_64
cglobal coeff_last4, 1,1
    BSR        rax, [r0], 0x3f         ; 0x3f = result if input is zero
    shr        eax, 4
; NOTE(review): lines elided in this fragment.
    BSR        eax, r1d, 0x1f
    RET
%if ARCH_X86_64 == 0
cglobal coeff_last64, 1, 5-mmsize/16
    pxor       m2, m2
    LAST_MASK  16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
    shl        r0d, 16
    or         r1d, r2d
    or         r3d, r0d
    shl        r3, 32                  ; combine the two 32-bit masks
    or         r1, r3
    not        r1                      ; BSR on inverted mask finds last nonzero
    BSR        rax, r1, 0x3f
    RET
%endif
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
COEFF_LAST
%endif
;-----------------------------------------------------------------------------
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%if WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6
%elif ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6
%else
DECLARE_REG_TMP 6,3,2,1,4,5,0
; NOTE(review): macro header elided in this fragment (COEFF_LEVELRUN %1).
    pxor       m2, m2
    LAST_MASK  %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
%if %1==15
    shr        t5d, 1
%elif %1==8
    and        t5d, 0xff
%elif %1==4
    and        t5d, 0xf
%endif
    xor        t5d, (1<<%1)-1          ; invert: bits set where coeff == 0
    mov        [t1+4], t5d
    shl        t5d, 32-%1
    mov        t4d, %1-1
    LZCOUNT    t3d, t5d, 0x1f
    mov        [t1], t4d
.loop:
    LZCOUNT    t3d, t5d, 0x1f          ; t3 = run of zeros before next level
%if HIGH_BIT_DEPTH
    mov        t2d, [t0+t4*4]          ; int32_t coefficient
%else
    mov        t2w, [t0+t4*2]          ; int16_t coefficient
%endif
    inc        t3d
    shl        t5d, t3b
%if HIGH_BIT_DEPTH
    mov        [t1+t6*4+ 8], t2d       ; store level after the shift (t3b consumed)
%else
    mov        [t1+t6*2+ 8], t2w
%endif
    inc        t6d
    sub        t4d, t3d
    jge        .loop
%endmacro
INIT_MMX mmx2
%if ARCH_X86_64 == 0
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
%endif
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
INIT_XMM sse2
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_XMM sse2, lzcnt
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15