Misaligned SSE functions are only used on AMD Phenom CPUs and the benefit is minuscule. They also require modifying the MXCSR control register, and by removing those functions we can get rid of that complexity altogether. This also prevents a crash if the misaligned exception mask bit is cleared for some reason.

VEX-encoded instructions also support unaligned memory operands. I tried adding AVX implementations of all the removed functions, but there were no performance improvements on Ivy Bridge. pixel_sad_x3 and pixel_sad_x4 did see significant code size reductions, though, so I kept those and added some minor cosmetic fixes and tweaks.
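For reference, the removed x264_cpu_mask_misalign_sse() helper (deleted from cpu-a.asm below) only set the misaligned exception mask bit (MM, bit 17) of MXCSR, which on CPUs reporting misaligned SSE support (the ecx&0x00000080 CPUID check removed from cpu.c) suppresses the fault normally raised by misaligned memory operands to legacy SSE instructions. A minimal sketch of that behaviour in C intrinsics, with an illustrative function name that is not part of x264:

    #include <xmmintrin.h>

    /* Illustrative sketch only: roughly what the removed helper did.
     * Sets the misaligned exception mask (MM, bit 17) of MXCSR so that
     * misaligned memory operands to regular (non-movu) SSE instructions
     * no longer fault on CPUs with misaligned SSE mode. MXCSR is
     * per-thread state, so this had to be called in every thread. */
    static void mask_misalign_sse( void )
    {
        _mm_setcsr( _mm_getcsr() | (1u << 17) );
    }

With VEX encoding none of this is needed: AVX instructions accept unaligned memory operands directly (aside from the explicitly aligned moves), which is what the new cpuflag(avx) branches in the SAD_X macros rely on when folding the unaligned loads into psadbw.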
#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
- {"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"BMI1", X264_CPU_BMI1},
{"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
}
}
- if( ecx&0x00000080 ) /* Misalign SSE */
- {
- cpu |= X264_CPU_SSE_MISALIGN;
- x264_cpu_mask_misalign_sse();
- }
-
if( cpu & X264_CPU_AVX )
{
if( ecx&0x00000800 ) /* XOP */
#define x264_emms()
#endif
#define x264_sfence x264_cpu_sfence
-void x264_cpu_mask_misalign_sse( void );
void x264_safe_intel_cpu_indicator_init( void );
/* kludge:
pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
}
}
-
- if( cpu&X264_CPU_SSE_MISALIGN )
- {
- INIT2( sad_x3, _sse2_misalign );
- INIT2( sad_x4, _sse2_misalign );
- }
}
if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
if( cpu&X264_CPU_AVX )
{
    INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs don't benefit from an aligned version */
+ INIT2( sad_x3, _avx );
+ INIT2( sad_x4, _avx );
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
sfence
ret
-;-----------------------------------------------------------------------------
-; void cpu_mask_misalign_sse( void )
-;-----------------------------------------------------------------------------
-cglobal cpu_mask_misalign_sse
- sub rsp, 4
- stmxcsr [rsp]
- or dword [rsp], 1<<17
- ldmxcsr [rsp]
- add rsp, 4
- ret
-
cextern intel_cpu_indicator_init
;-----------------------------------------------------------------------------
jg .height_loop
RET
+INIT_XMM
cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
- movdqu xmm0, [r2]
- movdqu xmm2, [r2+r3]
- movdqu xmm1, [r2+r4]
- movdqu xmm3, [r2+r6]
+ movu m0, [r2]
+ movu m2, [r2+r3]
+ movu m1, [r2+r4]
+ movu m3, [r2+r6]
lea r2, [r2+r3*2]
- pavgb xmm0, xmm1
- pavgb xmm2, xmm3
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm2
+ pavgb m0, m1
+ pavgb m2, m3
+ mova [r0], m0
+ mova [r0+r1], m2
lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
+ sub r5d, 2
+ jg .height_loop
RET
-%macro AVG2_W20 1
-cglobal pixel_avg2_w20_%1, 6,7
+cglobal pixel_avg2_w20_sse2, 6,7
sub r2, r4
lea r6, [r2+r3]
.height_loop:
- movdqu xmm0, [r4]
- movdqu xmm2, [r4+r3]
-%ifidn %1, sse2_misalign
- movd mm4, [r4+16]
- movd mm5, [r4+r3+16]
- pavgb xmm0, [r4+r2]
- pavgb xmm2, [r4+r6]
-%else
- movdqu xmm1, [r4+r2]
- movdqu xmm3, [r4+r6]
- movd mm4, [r4+16]
- movd mm5, [r4+r3+16]
- pavgb xmm0, xmm1
- pavgb xmm2, xmm3
-%endif
- pavgb mm4, [r4+r2+16]
- pavgb mm5, [r4+r6+16]
+ movu m0, [r4]
+ movu m2, [r4+r3]
+ movu m1, [r4+r2]
+ movu m3, [r4+r6]
+ movd mm4, [r4+16]
+ movd mm5, [r4+r3+16]
+ pavgb m0, m1
+ pavgb m2, m3
+ pavgb mm4, [r4+r2+16]
+ pavgb mm5, [r4+r6+16]
lea r4, [r4+r3*2]
- movdqa [r0], xmm0
- movd [r0+16], mm4
- movdqa [r0+r1], xmm2
- movd [r0+r1+16], mm5
+ mova [r0], m0
+ mova [r0+r1], m2
+ movd [r0+16], mm4
+ movd [r0+r1+16], mm5
lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
+ sub r5d, 2
+ jg .height_loop
RET
-%endmacro
-
-AVG2_W20 sse2
-AVG2_W20 sse2_misalign
INIT_YMM avx2
cglobal pixel_avg2_w20, 6,7
%endmacro
%else ; !HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 3
-%if mmsize == 8 || cpuflag(misalign)
+%if mmsize == 8
punpcklwd %1, %3
%else
movh %2, %3
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
MC_CHROMA
-INIT_XMM sse2, misalign
-MC_CHROMA
INIT_XMM sse2
MC_CHROMA
INIT_XMM ssse3
%define pw_rnd [pw_32]
%endif
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
-%if cpuflag(misalign) || mmsize==32
+%if mmsize==32
.loop:
movu m4, [src-4]
movu m5, [src-2]
HPEL_V 0
INIT_XMM sse2
HPEL_V 8
-INIT_XMM sse2, misalign
-HPEL_C
%if ARCH_X86_64 == 0
INIT_XMM sse2
HPEL_C
int dx, int dy, int i_width, int i_height );
MC_CHROMA(mmx2)
MC_CHROMA(sse2)
-MC_CHROMA(sse2_misalign)
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
MC_CHROMA(avx)
PIXEL_AVG_WALL(cache64_mmx2)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
-PIXEL_AVG_WALL(sse2_misalign)
PIXEL_AVG_WALL(cache64_ssse3)
PIXEL_AVG_WALL(avx2)
PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
#endif
PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
-PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
GET_REF(cache32_mmx2)
GET_REF(cache64_mmx2)
#endif
-GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
GET_REF(cache64_ssse3_atom)
HPEL(16, avx, avx, avx, avx)
HPEL(32, avx2, avx2, avx2, avx2)
#endif
-HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
- if( cpu&X264_CPU_SSE_MISALIGN )
- pf->hpel_filter = x264_hpel_filter_sse2_misalign;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_sse2;
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
}
- if( cpu&X264_CPU_SSE_MISALIGN )
- {
- pf->get_ref = get_ref_sse2_misalign;
- if( !(cpu&X264_CPU_STACK_MOD4) )
- pf->mc_chroma = x264_mc_chroma_sse2_misalign;
- }
}
}
DECL_X1( sad, mmx2 )
DECL_X1( sad, sse2 )
-DECL_X4( sad, sse2_misalign )
DECL_X1( sad, sse3 )
DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
DECL_X4( sad, ssse3 )
+DECL_X4( sad, avx )
DECL_X4( sad, avx2 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
;=============================================================================
%macro SAD_X3_START_1x16P_SSE2 0
-%if cpuflag(misalign)
- mova xmm2, [r0]
- movu xmm0, [r1]
- movu xmm1, [r2]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm2
- psadbw xmm2, [r3]
+ mova m2, [r0]
+%if cpuflag(avx)
+ psadbw m0, m2, [r1]
+ psadbw m1, m2, [r2]
+ psadbw m2, [r3]
%else
- mova xmm3, [r0]
- movu xmm0, [r1]
- movu xmm1, [r2]
- movu xmm2, [r3]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
+ movu m0, [r1]
+ movu m1, [r2]
+ movu m3, [r3]
+ psadbw m0, m2
+ psadbw m1, m2
+ psadbw m2, m3
%endif
%endmacro
%macro SAD_X3_1x16P_SSE2 2
-%if cpuflag(misalign)
- mova xmm3, [r0+%1]
- movu xmm4, [r1+%2]
- movu xmm5, [r2+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm3, [r3+%2]
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm3
+ mova m3, [r0+%1]
+%if cpuflag(avx)
+ psadbw m4, m3, [r1+%2]
+ psadbw m5, m3, [r2+%2]
+ psadbw m3, [r3+%2]
%else
- mova xmm3, [r0+%1]
- movu xmm4, [r1+%2]
- movu xmm5, [r2+%2]
- movu xmm6, [r3+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm6, xmm3
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
+ movu m4, [r1+%2]
+ movu m5, [r2+%2]
+ movu m6, [r3+%2]
+ psadbw m4, m3
+ psadbw m5, m3
+ psadbw m3, m6
%endif
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m3
%endmacro
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6
+%else
+ DECLARE_REG_TMP 5
+%endif
+
%macro SAD_X3_4x16P_SSE2 2
%if %1==0
-%if UNIX64
- mov r6, r5
-%endif
- lea r5, [r4*3]
+ lea t0, [r4*3]
SAD_X3_START_1x16P_SSE2
%else
SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
%endif
SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
- SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r5
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
%if %1 != %2-1
%if (%1&1) != 0
add r0, 8*FENC_STRIDE
%endmacro
%macro SAD_X3_START_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm0, [r1]
- movq xmm1, [r2]
- movq xmm2, [r3]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm0, [r1+r4]
- movhps xmm1, [r2+r4]
- movhps xmm2, [r3+r4]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
+ movq m3, [r0]
+ movq m0, [r1]
+ movq m1, [r2]
+ movq m2, [r3]
+ movhps m3, [r0+FENC_STRIDE]
+ movhps m0, [r1+r4]
+ movhps m1, [r2+r4]
+ movhps m2, [r3+r4]
+ psadbw m0, m3
+ psadbw m1, m3
+ psadbw m2, m3
%endmacro
%macro SAD_X3_2x8P_SSE2 4
- movq xmm7, [r0+%1]
- movq xmm3, [r1+%2]
- movq xmm4, [r2+%2]
- movq xmm5, [r3+%2]
- movhps xmm7, [r0+%3]
- movhps xmm3, [r1+%4]
- movhps xmm4, [r2+%4]
- movhps xmm5, [r3+%4]
- psadbw xmm3, xmm7
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm3
- paddw xmm1, xmm4
- paddw xmm2, xmm5
+ movq m6, [r0+%1]
+ movq m3, [r1+%2]
+ movq m4, [r2+%2]
+ movq m5, [r3+%2]
+ movhps m6, [r0+%3]
+ movhps m3, [r1+%4]
+ movhps m4, [r2+%4]
+ movhps m5, [r3+%4]
+ psadbw m3, m6
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
%endmacro
%macro SAD_X4_START_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm0, [r1]
- movq xmm1, [r2]
- movq xmm2, [r3]
- movq xmm3, [r4]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm0, [r1+r5]
- movhps xmm1, [r2+r5]
- movhps xmm2, [r3+r5]
- movhps xmm3, [r4+r5]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
- psadbw xmm3, xmm7
+ movq m4, [r0]
+ movq m0, [r1]
+ movq m1, [r2]
+ movq m2, [r3]
+ movq m3, [r4]
+ movhps m4, [r0+FENC_STRIDE]
+ movhps m0, [r1+r5]
+ movhps m1, [r2+r5]
+ movhps m2, [r3+r5]
+ movhps m3, [r4+r5]
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m4
+ psadbw m3, m4
%endmacro
%macro SAD_X4_2x8P_SSE2 4
- movq xmm7, [r0+%1]
- movq xmm4, [r1+%2]
- movq xmm5, [r2+%2]
-%if ARCH_X86_64
- movq xmm6, [r3+%2]
- movq xmm8, [r4+%2]
- movhps xmm7, [r0+%3]
- movhps xmm4, [r1+%4]
- movhps xmm5, [r2+%4]
- movhps xmm6, [r3+%4]
- movhps xmm8, [r4+%4]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm8, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm8
-%else
- movhps xmm7, [r0+%3]
- movhps xmm4, [r1+%4]
- movhps xmm5, [r2+%4]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- movq xmm6, [r3+%2]
- movq xmm4, [r4+%2]
- movhps xmm6, [r3+%4]
- movhps xmm4, [r4+%4]
- psadbw xmm6, xmm7
- psadbw xmm4, xmm7
- paddw xmm2, xmm6
- paddw xmm3, xmm4
-%endif
+ movq m6, [r0+%1]
+ movq m4, [r1+%2]
+ movq m5, [r2+%2]
+ movhps m6, [r0+%3]
+ movhps m4, [r1+%4]
+ movhps m5, [r2+%4]
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m0, m4
+ paddw m1, m5
+ movq m4, [r3+%2]
+ movq m5, [r4+%2]
+ movhps m4, [r3+%4]
+ movhps m5, [r4+%4]
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m2, m4
+ paddw m3, m5
%endmacro
%macro SAD_X4_START_1x16P_SSE2 0
-%if cpuflag(misalign)
- mova xmm3, [r0]
- movu xmm0, [r1]
- movu xmm1, [r2]
- movu xmm2, [r3]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
- psadbw xmm3, [r4]
+ mova m3, [r0]
+%if cpuflag(avx)
+ psadbw m0, m3, [r1]
+ psadbw m1, m3, [r2]
+ psadbw m2, m3, [r3]
+ psadbw m3, [r4]
%else
- mova xmm7, [r0]
- movu xmm0, [r1]
- movu xmm1, [r2]
- movu xmm2, [r3]
- movu xmm3, [r4]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
- psadbw xmm3, xmm7
+ movu m0, [r1]
+ movu m1, [r2]
+ movu m2, [r3]
+ movu m4, [r4]
+ psadbw m0, m3
+ psadbw m1, m3
+ psadbw m2, m3
+ psadbw m3, m4
%endif
%endmacro
%macro SAD_X4_1x16P_SSE2 2
-%if cpuflag(misalign)
- mova xmm7, [r0+%1]
- movu xmm4, [r1+%2]
- movu xmm5, [r2+%2]
- movu xmm6, [r3+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm7, [r4+%2]
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm7
+ mova m6, [r0+%1]
+%if cpuflag(avx)
+ psadbw m4, m6, [r1+%2]
+ psadbw m5, m6, [r2+%2]
%else
- mova xmm7, [r0+%1]
- movu xmm4, [r1+%2]
- movu xmm5, [r2+%2]
- movu xmm6, [r3+%2]
-%if ARCH_X86_64
- movu xmm8, [r4+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm8, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm8
-%else
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm4
- psadbw xmm6, xmm7
- movu xmm4, [r4+%2]
- paddw xmm1, xmm5
- psadbw xmm4, xmm7
- paddw xmm2, xmm6
- paddw xmm3, xmm4
+ movu m4, [r1+%2]
+ movu m5, [r2+%2]
+ psadbw m4, m6
+ psadbw m5, m6
%endif
+ paddw m0, m4
+ paddw m1, m5
+%if cpuflag(avx)
+ psadbw m4, m6, [r3+%2]
+ psadbw m5, m6, [r4+%2]
+%else
+ movu m4, [r3+%2]
+ movu m5, [r4+%2]
+ psadbw m4, m6
+ psadbw m5, m6
%endif
+ paddw m2, m4
+ paddw m3, m5
%endmacro
%macro SAD_X4_4x16P_SSE2 2
%macro SAD_X3_4x8P_SSE2 2
%if %1==0
-%if UNIX64
- mov r6, r5
-%endif
- lea r5, [r4*3]
+ lea t0, [r4*3]
SAD_X3_START_2x8P_SSE2
%else
SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
%endif
- SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), r5
+ SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
%if %1 != %2-1
%if (%1&1) != 0
add r0, 8*FENC_STRIDE
%endmacro
%macro SAD_X3_END_SSE2 0
- movhlps xmm4, xmm0
- movhlps xmm5, xmm1
- movhlps xmm6, xmm2
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
-%if UNIX64
- movd [r6+0], xmm0
- movd [r6+4], xmm1
- movd [r6+8], xmm2
-%else
- mov r0, r5mp
- movd [r0+0], xmm0
- movd [r0+4], xmm1
- movd [r0+8], xmm2
-%endif
+ movhlps m3, m0
+ movhlps m4, m1
+ movhlps m5, m2
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
+ movifnidn r5, r5mp
+ movd [r5+0], m0
+ movd [r5+4], m1
+ movd [r5+8], m2
RET
%endmacro
%macro SAD_X4_END_SSE2 0
- mov r0, r6mp
- psllq xmm1, 32
- psllq xmm3, 32
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movhlps xmm1, xmm0
- movhlps xmm3, xmm2
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movq [r0+0], xmm0
- movq [r0+8], xmm2
+ mov r0, r6mp
+ psllq m1, 32
+ psllq m3, 32
+ paddw m0, m1
+ paddw m2, m3
+ movhlps m1, m0
+ movhlps m3, m2
+ paddw m0, m1
+ paddw m2, m3
+ movq [r0+0], m0
+ movq [r0+8], m2
RET
%endmacro
%macro SAD_X4_START_2x8P_SSSE3 0
- movddup xmm4, [r0]
- movq xmm0, [r1]
- movq xmm1, [r3]
- movhps xmm0, [r2]
- movhps xmm1, [r4]
- movddup xmm5, [r0+FENC_STRIDE]
- movq xmm2, [r1+r5]
- movq xmm3, [r3+r5]
- movhps xmm2, [r2+r5]
- movhps xmm3, [r4+r5]
- psadbw xmm0, xmm4
- psadbw xmm1, xmm4
- psadbw xmm2, xmm5
- psadbw xmm3, xmm5
- paddw xmm0, xmm2
- paddw xmm1, xmm3
+ movddup m4, [r0]
+ movq m0, [r1]
+ movq m1, [r3]
+ movhps m0, [r2]
+ movhps m1, [r4]
+ movddup m5, [r0+FENC_STRIDE]
+ movq m2, [r1+r5]
+ movq m3, [r3+r5]
+ movhps m2, [r2+r5]
+ movhps m3, [r4+r5]
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m5
+ psadbw m3, m5
+ paddw m0, m2
+ paddw m1, m3
%endmacro
%macro SAD_X4_2x8P_SSSE3 4
- movddup xmm6, [r0+%1]
- movq xmm2, [r1+%2]
- movq xmm3, [r3+%2]
- movhps xmm2, [r2+%2]
- movhps xmm3, [r4+%2]
- movddup xmm7, [r0+%3]
- movq xmm4, [r1+%4]
- movq xmm5, [r3+%4]
- movhps xmm4, [r2+%4]
- movhps xmm5, [r4+%4]
- psadbw xmm2, xmm6
- psadbw xmm3, xmm6
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm2
- paddw xmm1, xmm3
- paddw xmm0, xmm4
- paddw xmm1, xmm5
+ movddup m6, [r0+%1]
+ movq m2, [r1+%2]
+ movq m3, [r3+%2]
+ movhps m2, [r2+%2]
+ movhps m3, [r4+%2]
+ movddup m7, [r0+%3]
+ movq m4, [r1+%4]
+ movq m5, [r3+%4]
+ movhps m4, [r2+%4]
+ movhps m5, [r4+%4]
+ psadbw m2, m6
+ psadbw m3, m6
+ psadbw m4, m7
+ psadbw m5, m7
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
%endmacro
%macro SAD_X4_4x8P_SSSE3 2
%endmacro
%macro SAD_X4_END_SSSE3 0
- mov r0, r6mp
- packssdw xmm0, xmm1
- movdqa [r0], xmm0
+ mov r0, r6mp
+ packssdw m0, m1
+ mova [r0], m0
RET
%endmacro
%macro SAD_X3_4x16P_AVX2 2
%if %1==0
-%if UNIX64
- mov r6, r5
-%endif
- lea r5, [r4*3]
+ lea t0, [r4*3]
SAD_X3_START_2x16P_AVX2
%else
SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
%endif
- SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, r5
+ SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
%if %1 != %2-1
%if (%1&1) != 0
add r0, 8*FENC_STRIDE
paddw xm0, xm4
paddw xm1, xm5
paddw xm2, xm6
-%if UNIX64
- movd [r6+0], xm0
- movd [r6+4], xm1
- movd [r6+8], xm2
-%else
- mov r0, r5mp
- movd [r0+0], xm0
- movd [r0+4], xm1
- movd [r0+8], xm2
-%endif
+ movifnidn r5, r5mp
+ movd [r5+0], xm0
+ movd [r5+4], xm1
+ movd [r5+8], xm2
RET
%endmacro
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
-%macro SAD_X_SSE2 3
-cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,9
+%macro SAD_X_SSE2 4
+cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
%assign x 0
%rep %3/4
SAD_X%1_4x%2P_SSE2 x, %3/4
%endmacro
INIT_XMM sse2
-SAD_X_SSE2 3, 16, 16
-SAD_X_SSE2 3, 16, 8
-SAD_X_SSE2 3, 8, 16
-SAD_X_SSE2 3, 8, 8
-SAD_X_SSE2 3, 8, 4
-SAD_X_SSE2 4, 16, 16
-SAD_X_SSE2 4, 16, 8
-SAD_X_SSE2 4, 8, 16
-SAD_X_SSE2 4, 8, 8
-SAD_X_SSE2 4, 8, 4
-
-INIT_XMM sse2, misalign
-SAD_X_SSE2 3, 16, 16
-SAD_X_SSE2 3, 16, 8
-SAD_X_SSE2 4, 16, 16
-SAD_X_SSE2 4, 16, 8
+SAD_X_SSE2 3, 16, 16, 7
+SAD_X_SSE2 3, 16, 8, 7
+SAD_X_SSE2 3, 8, 16, 7
+SAD_X_SSE2 3, 8, 8, 7
+SAD_X_SSE2 3, 8, 4, 7
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
+SAD_X_SSE2 4, 8, 16, 7
+SAD_X_SSE2 4, 8, 8, 7
+SAD_X_SSE2 4, 8, 4, 7
INIT_XMM sse3
-SAD_X_SSE2 3, 16, 16
-SAD_X_SSE2 3, 16, 8
-SAD_X_SSE2 4, 16, 16
-SAD_X_SSE2 4, 16, 8
+SAD_X_SSE2 3, 16, 16, 7
+SAD_X_SSE2 3, 16, 8, 7
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
%macro SAD_X_SSSE3 3
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
SAD_X_SSSE3 4, 8, 8
SAD_X_SSSE3 4, 8, 4
+INIT_XMM avx
+SAD_X_SSE2 3, 16, 16, 6
+SAD_X_SSE2 3, 16, 8, 6
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
+
%macro SAD_X_AVX2 4
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
%assign x 0
%assign cpuflags_cache64 (1<<17)
%assign cpuflags_slowctz (1<<18)
%assign cpuflags_lzcnt (1<<19)
-%assign cpuflags_misalign (1<<20)
-%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<22)
-%assign cpuflags_bmi1 (1<<23)|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
+%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<21)
+%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
{
if( h->param.i_sync_lookahead )
x264_lower_thread_priority( 10 );
-
-#if HAVE_MMX
- /* Misalign mask has to be set separately for each thread. */
- if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- x264_cpu_mask_misalign_sse();
-#endif
-}
-
-static void x264_lookahead_thread_init( x264_t *h )
-{
-#if HAVE_MMX
- /* Misalign mask has to be set separately for each thread. */
- if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- x264_cpu_mask_misalign_sse();
-#endif
}
#endif
x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
goto fail;
if( h->param.i_lookahead_threads > 1 &&
- x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, (void*)x264_lookahead_thread_init, h ) )
+ x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, NULL, NULL ) )
goto fail;
#if HAVE_OPENCL
thread_current =
thread_oldest = h;
}
-#if HAVE_MMX
- if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- x264_cpu_mask_misalign_sse();
-#endif
h->i_cpb_delay_pir_offset = h->i_cpb_delay_pir_offset_next;
/* no data out */
static void *x264_lookahead_thread( x264_t *h )
{
- int shift;
-#if HAVE_MMX
- if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- x264_cpu_mask_misalign_sse();
-#endif
while( !h->lookahead->b_exit_thread )
{
x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
x264_pthread_mutex_lock( &h->lookahead->next.mutex );
- shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
+ int shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length + h->param.b_vfr_input )
b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" :
- b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_BMI2 ? "_bmi2" :
b->cpu&X264_CPU_BMI1 ? "_bmi1" :
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
}
- if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
- cpu1 &= ~X264_CPU_SSE_MISALIGN;
- }
if( x264_cpu_detect() & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
#include "x264_config.h"
-#define X264_BUILD 135
+#define X264_BUILD 136
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
#define X264_CPU_SSSE3 0x0000040
#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */
#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */
-#define X264_CPU_SSE_MISALIGN 0x0000200 /* Phenom support for misaligned SSE instruction arguments */
-#define X264_CPU_LZCNT 0x0000400 /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_AVX 0x0000800 /* AVX support: requires OS support even if YMM registers aren't used. */
-#define X264_CPU_XOP 0x0001000 /* AMD XOP */
-#define X264_CPU_FMA4 0x0002000 /* AMD FMA4 */
-#define X264_CPU_AVX2 0x0004000 /* AVX2 */
-#define X264_CPU_FMA3 0x0008000 /* Intel FMA3 */
-#define X264_CPU_BMI1 0x0010000 /* BMI1 */
-#define X264_CPU_BMI2 0x0020000 /* BMI2 */
+#define X264_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
+#define X264_CPU_XOP 0x0000800 /* AMD XOP */
+#define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */
+#define X264_CPU_AVX2 0x0002000 /* AVX2 */
+#define X264_CPU_FMA3 0x0004000 /* Intel FMA3 */
+#define X264_CPU_BMI1 0x0008000 /* BMI1 */
+#define X264_CPU_BMI2 0x0010000 /* BMI2 */
/* x86 modifiers */
-#define X264_CPU_CACHELINE_32 0x0040000 /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64 0x0080000 /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_SSE2_IS_SLOW 0x0100000 /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST 0x0200000 /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SLOW_SHUFFLE 0x0400000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
-#define X264_CPU_STACK_MOD4 0x0800000 /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SLOW_CTZ 0x1000000 /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM 0x2000000 /* The Atom is terrible: slow SSE unaligned loads, slow
+#define X264_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X264_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow
* SIMD multiplies, slow SIMD variable shifts, slow pshufb,
* cacheline split penalties -- gather everything here that
* isn't shared by other CPUs to avoid making half a dozen
* new SLOW flags. */
-#define X264_CPU_SLOW_PSHUFB 0x4000000 /* such as on the Intel Atom */
-#define X264_CPU_SLOW_PALIGNR 0x8000000 /* such as on the AMD Bobcat */
+#define X264_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */
+#define X264_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */
/* PowerPC */
#define X264_CPU_ALTIVEC 0x0000001