x86: detect Bobcat, improve Atom optimizations, reorganize flags
author    Fiona Glaser <fiona@x264.com>
          Sat, 2 Feb 2013 20:37:08 +0000 (12:37 -0800)
committer Fiona Glaser <fiona@x264.com>
          Tue, 26 Feb 2013 07:22:53 +0000 (23:22 -0800)
The Bobcat has a 64-bit SIMD unit reminiscent of the Athlon 64; detect this
and apply the appropriate flags.
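
As a rough illustration (not part of the patch), the family number the new
detection keys on comes from CPUID leaf 0x80000001: base family in EAX bits
11:8 plus extended family in bits 27:20, which yields 0x14 on Bobcat. A
minimal C sketch; the helper name is hypothetical:

    #include <stdint.h>

    /* Display family from the EAX value returned by CPUID leaf 0x80000001,
     * i.e. the ((eax>>8)&0xf) + ((eax>>20)&0xff) expression used in
     * common/cpu.c below. Bobcat reports 0x14 (base 0xF + extended 0x5). */
    static int amd_family( uint32_t eax )
    {
        return ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
    }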

It also has an extremely slow palignr instruction; create a flag for this to
avoid massive penalties on palignr-heavy functions.

Improve Atom function selection and document exactly what the SLOW_ATOM flag
covers.

Add Atom-optimized SATD/SA8D/hadamard_ac functions: simply combine the ssse3
optimizations with the sse2 algorithm to avoid pmaddubsw, which, like other
SIMD multiplies, is slow on Atom.
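
To make the pmaddubsw point concrete, here is a hedged scalar model (not part
of the patch) of the two butterfly styles the SATD code can use: the ssse3
"horizontal" path derives sums and differences of adjacent values from a
multiply-accumulate against +1/-1 constants (pmaddubsw), while the sse2-style
"vertical" path chosen for Atom gets the same values with plain adds and
subtracts, keeping the slow SIMD multiplier out of the inner loop.

    /* Scalar sketch only; the real code operates on packed SIMD registers. */
    static inline void butterfly_madd( int a, int b, int *sum, int *diff )
    {
        *sum  = a*1 + b*1;     /* pmaddubsw against {+1,+1} */
        *diff = a*1 + b*(-1);  /* pmaddubsw against {+1,-1} */
    }

    static inline void butterfly_addsub( int a, int b, int *sum, int *diff )
    {
        *sum  = a + b;         /* paddw */
        *diff = a - b;         /* psubw */
    }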

Drop TBM detection; it'll probably never be useful for x264.

Invert FastShuffle to SlowShuffle; the slow case only ever applied to one CPU
(Conroe).

Detect CMOV, to fail more gracefully when run on a chip with MMX2 but no CMOV.
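
For reference, the CMOV capability bit is CPUID leaf 1, EDX bit 15 (the same
edx&0x00008000 test added to common/cpu.c below). A hedged sketch using
GCC/Clang's <cpuid.h> rather than x264's own cpuid wrapper:

    #include <cpuid.h>

    /* Returns 1 if the CPU reports CMOV support (CPUID.01H:EDX bit 15). */
    static int has_cmov( void )
    {
        unsigned eax, ebx, ecx, edx;
        if( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
            return 0;
        return !!(edx & (1 << 15));
    }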

12 files changed:
common/common.c
common/cpu.c
common/dct.c
common/frame.c
common/pixel.c
common/x86/mc-c.c
common/x86/pixel-a.asm
common/x86/pixel.h
common/x86/predict-c.c
encoder/encoder.c
tools/checkasm.c
x264.h

diff --git a/common/common.c b/common/common.c
index de6e9ff42fdf9274a991c3b5e9aa67315725bd45..ffd31421447eb32f4bc3553445e244d02653943d 100644
--- a/common/common.c
+++ b/common/common.c
@@ -622,10 +622,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
                     b_error = 1;
             }
             free( buf );
-            if( p->cpu & X264_CPU_SSSE3 )
+            if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) )
                 p->cpu |= X264_CPU_SSE2_IS_FAST;
-            if( p->cpu & X264_CPU_SSE4 )
-                p->cpu |= X264_CPU_SHUFFLE_IS_FAST;
         }
     }
     OPT("threads")
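
The hunk above keeps an explicitly requested SSE2Slow from being promoted to
SSE2Fast when SSSE3 is also present. As a hedged usage sketch (not part of the
patch) of how an application manipulates the same mask through x264_param_t:

    #include "x264.h"

    /* Illustrative only: force the "slow SSE2" code paths, e.g. to mimic an
     * Athlon 64 or Bobcat, before opening an encoder. */
    static void force_slow_sse2( x264_param_t *param )
    {
        param->cpu |= X264_CPU_SSE2_IS_SLOW;
        param->cpu &= ~X264_CPU_SSE2_IS_FAST;  /* the two modifiers are exclusive */
    }
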
diff --git a/common/cpu.c b/common/cpu.c
index c5e34a9fd649f1d2b5712de2af1a36e1df1b4d13..bb707db3f7c35ccfee93b8c7fa381d090fed71d8 100644
--- a/common/cpu.c
+++ b/common/cpu.c
 
 const x264_cpu_name_t x264_cpu_names[] =
 {
-    {"Altivec",     X264_CPU_ALTIVEC},
-//  {"MMX",         X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
-    {"MMX2",        X264_CPU_MMX|X264_CPU_MMX2},
-    {"MMXEXT",      X264_CPU_MMX|X264_CPU_MMX2},
-    {"SSE",         X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE},
-#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2
+#if HAVE_MMX
+//  {"MMX",         X264_CPU_MMX},  // we don't support asm on mmx1 cpus anymore
+//  {"CMOV",        X264_CPU_CMOV}, // we require this unconditionally, so don't print it
+#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
+    {"MMX2",        MMX2},
+    {"MMXEXT",      MMX2},
+    {"SSE",         MMX2|X264_CPU_SSE},
+#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
     {"SSE2Slow",    SSE2|X264_CPU_SSE2_IS_SLOW},
     {"SSE2",        SSE2},
     {"SSE2Fast",    SSE2|X264_CPU_SSE2_IS_FAST},
     {"SSE3",        SSE2|X264_CPU_SSE3},
     {"SSSE3",       SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
-    {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST},
     {"SSE4.1",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4.2",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
@@ -70,19 +71,26 @@ const x264_cpu_name_t x264_cpu_names[] =
     {"FMA3",        AVX|X264_CPU_FMA3},
 #undef AVX
 #undef SSE2
+#undef MMX2
     {"Cache32",         X264_CPU_CACHELINE_32},
     {"Cache64",         X264_CPU_CACHELINE_64},
     {"SSEMisalign",     X264_CPU_SSE_MISALIGN},
     {"LZCNT",           X264_CPU_LZCNT},
     {"BMI1",            X264_CPU_BMI1},
     {"BMI2",            X264_CPU_BMI1|X264_CPU_BMI2},
-    {"TBM",             X264_CPU_TBM},
-    {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
-    {"ARMv6",           X264_CPU_ARMV6},
-    {"NEON",            X264_CPU_NEON},
-    {"Fast_NEON_MRC",   X264_CPU_FAST_NEON_MRC},
     {"SlowCTZ",         X264_CPU_SLOW_CTZ},
     {"SlowAtom",        X264_CPU_SLOW_ATOM},
+    {"SlowPshufb",      X264_CPU_SLOW_PSHUFB},
+    {"SlowPalignr",     X264_CPU_SLOW_PALIGNR},
+    {"SlowShuffle",     X264_CPU_SLOW_SHUFFLE},
+    {"UnalignedStack",  X264_CPU_STACK_MOD4},
+#elif ARCH_PPC
+    {"Altivec",         X264_CPU_ALTIVEC},
+#elif ARCH_ARM
+    {"ARMv6",           X264_CPU_ARMV6},
+    {"NEON",            X264_CPU_NEON},
+    {"FastNeonMRC",     X264_CPU_FAST_NEON_MRC},
+#endif
     {"", 0},
 };
 
@@ -131,9 +139,13 @@ uint32_t x264_cpu_detect( void )
     if( edx&0x00800000 )
         cpu |= X264_CPU_MMX;
     else
-        return 0;
+        return cpu;
     if( edx&0x02000000 )
         cpu |= X264_CPU_MMX2|X264_CPU_SSE;
+    if( edx&0x00008000 )
+        cpu |= X264_CPU_CMOV;
+    else
+        return cpu;
     if( edx&0x04000000 )
         cpu |= X264_CPU_SSE2;
     if( ecx&0x00000001 )
@@ -170,46 +182,50 @@ uint32_t x264_cpu_detect( void )
 
     if( cpu & X264_CPU_SSSE3 )
         cpu |= X264_CPU_SSE2_IS_FAST;
-    if( cpu & X264_CPU_SSE4 )
-        cpu |= X264_CPU_SHUFFLE_IS_FAST;
 
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
 
-    if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
+    if( max_extended_cap >= 0x80000001 )
     {
-        cpu |= X264_CPU_SLOW_CTZ;
         x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
-        if( edx&0x00400000 )
-            cpu |= X264_CPU_MMX2;
-        if( cpu & X264_CPU_SSE2 )
+
+        if( ecx&0x00000020 )
+            cpu |= X264_CPU_LZCNT;             /* Supported by Intel chips starting with Haswell */
+        if( ecx&0x00000040 ) /* SSE4a, AMD only */
         {
-            if( ecx&0x00000040 ) /* SSE4a */
+            int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+            cpu |= X264_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
+            if( family == 0x14 )
             {
-                cpu |= X264_CPU_SSE2_IS_FAST;
-                cpu |= X264_CPU_LZCNT;
-                cpu |= X264_CPU_SHUFFLE_IS_FAST;
-                cpu &= ~X264_CPU_SLOW_CTZ;
+                cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
+                cpu |= X264_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
+                cpu |= X264_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
             }
-            else
-                cpu |= X264_CPU_SSE2_IS_SLOW;
+        }
 
-            if( ecx&0x00000080 ) /* Misalign SSE */
-            {
-                cpu |= X264_CPU_SSE_MISALIGN;
-                x264_cpu_mask_misalign_sse();
-            }
+        if( ecx&0x00000080 ) /* Misalign SSE */
+        {
+            cpu |= X264_CPU_SSE_MISALIGN;
+            x264_cpu_mask_misalign_sse();
+        }
 
-            if( cpu & X264_CPU_AVX )
-            {
-                if( ecx&0x00000800 ) /* XOP */
-                    cpu |= X264_CPU_XOP;
-                if( ecx&0x00010000 ) /* FMA4 */
-                    cpu |= X264_CPU_FMA4;
-            }
+        if( cpu & X264_CPU_AVX )
+        {
+            if( ecx&0x00000800 ) /* XOP */
+                cpu |= X264_CPU_XOP;
+            if( ecx&0x00010000 ) /* FMA4 */
+                cpu |= X264_CPU_FMA4;
+        }
 
-            if( ecx&0x00200000 )
-                cpu |= X264_CPU_TBM;
+        if( !strcmp((char*)vendor, "AuthenticAMD") )
+        {
+            if( edx&0x00400000 )
+                cpu |= X264_CPU_MMX2;
+            if( !(cpu&X264_CPU_LZCNT) )
+                cpu |= X264_CPU_SLOW_CTZ;
+            if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
+                cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
         }
     }
 
@@ -233,11 +249,12 @@ uint32_t x264_cpu_detect( void )
             {
                 cpu |= X264_CPU_SLOW_ATOM;
                 cpu |= X264_CPU_SLOW_CTZ;
+                cpu |= X264_CPU_SLOW_PSHUFB;
             }
-            /* Some Penryns and Nehalems are pointlessly crippled (SSE4 disabled), so
-             * detect them here. */
-            else if( model >= 23 )
-                cpu |= X264_CPU_SHUFFLE_IS_FAST;
+            /* Conroe has a slow shuffle unit. Check the model number to make sure not
+             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
+            else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
+                cpu |= X264_CPU_SLOW_SHUFFLE;
         }
     }
 
diff --git a/common/dct.c b/common/dct.c
index 8c17c0044239fe5d6cf68aa4abe328721ffcbff1..479e03d0e07728892164a67bdc8213b8f4578218 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -640,23 +640,32 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 
-        dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
-        dctf->add8x8_idct   = x264_add8x8_idct_sse2;
-        dctf->add16x16_idct = x264_add16x16_idct_sse2;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
+        if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
+        {
+            dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
+            dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
+            dctf->add8x8_idct   = x264_add8x8_idct_sse2;
+            dctf->add16x16_idct = x264_add16x16_idct_sse2;
+            dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
+        }
     }
 
-    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
+    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
     {
-        dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
-        dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
-        dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
-        dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
         dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
-        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+        if( !(cpu&X264_CPU_SLOW_ATOM) )
+        {
+            dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
+            dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
+            dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
+            dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
+            dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
+            if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+            {
+                dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
+                dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+            }
+        }
     }
 
     if( cpu&X264_CPU_SSE4 )
@@ -951,7 +960,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
         pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
         pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
-        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
             pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
     }
     if( cpu&X264_CPU_AVX )
@@ -962,8 +971,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
         pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
         pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
 #endif
-        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
-            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
+        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
     }
     if( cpu&X264_CPU_XOP )
     {
@@ -1005,7 +1013,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
         pf_interlaced->interleave_8x8_cavlc =
         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
     }
-    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
     {
         pf_interlaced->interleave_8x8_cavlc =
         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
diff --git a/common/frame.c b/common/frame.c
index 4e73ddef7ada110db1050375a121d246eaa83778..f91d45f60bc83535b1c89c0ca9d9d16fb31a66d8 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -73,7 +73,11 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
     int i_stride, i_width, i_lines, luma_plane_count;
     int i_padv = PADV << PARAM_INTERLACED;
     int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
-    int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
+#if ARCH_PPC
+    int disalign = 1<<9;
+#else
+    int disalign = 1<<10;
+#endif
 
     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
 
diff --git a/common/pixel.c b/common/pixel.c
index 31289c2e61442423b37041756b37123935cdcdf1..5027ab852e42f6d99d02053ff1e360cd1a3ef06a 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -500,6 +500,7 @@ SATD_X_DECL7( _mmx2 )
 #if !HIGH_BIT_DEPTH
 SATD_X_DECL6( _sse2 )
 SATD_X_DECL7( _ssse3 )
+SATD_X_DECL6( _ssse3_atom )
 SATD_X_DECL7( _sse4 )
 SATD_X_DECL7( _avx )
 SATD_X_DECL7( _xop )
@@ -1024,14 +1025,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT4( sad_x3, _cache32_mmx2 );
             INIT4( sad_x4, _cache32_mmx2 );
         }
-        else if( cpu&X264_CPU_CACHELINE_64 )
+        else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
         {
             INIT5( sad, _cache64_mmx2 );
             INIT4( sad_x3, _cache64_mmx2 );
             INIT4( sad_x4, _cache64_mmx2 );
         }
 #else
-        if( cpu&X264_CPU_CACHELINE_64 )
+        if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
         {
             pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2;
             pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmx2;
@@ -1146,7 +1147,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #endif
         }
         INIT_ADS( _ssse3 );
-        if( !(cpu&X264_CPU_SLOW_ATOM) )
+        if( cpu&X264_CPU_SLOW_ATOM )
+        {
+            pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom;
+            pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3_atom;
+            INIT6( satd, _ssse3_atom );
+            pixf->satd[PIXEL_4x16]  = x264_pixel_satd_4x16_ssse3_atom;
+            INIT6( satd_x3, _ssse3_atom );
+            INIT6( satd_x4, _ssse3_atom );
+            INIT4( hadamard_ac, _ssse3_atom );
+#if ARCH_X86_64
+            pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom;
+#endif
+        }
+        else
         {
             INIT8( ssd, _ssse3 );
             pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
@@ -1154,25 +1168,26 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT8( satd, _ssse3 );
             INIT7( satd_x3, _ssse3 );
             INIT7( satd_x4, _ssse3 );
+#if ARCH_X86_64
+            pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
+#endif
         }
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
-        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
+        if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+            pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
         pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
         pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
         pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
         pixf->asd8 = x264_pixel_asd8_ssse3;
-#if ARCH_X86_64
-        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
-#endif
         if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_ssse3 );
             INIT2( sad_x3, _cache64_ssse3 );
             INIT2( sad_x4, _cache64_ssse3 );
         }
-        if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
+        if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
         {
             INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
         }
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index b0a31e1537d831b2fabf45da433d7e84ae56ac8b..f959b2794562dcd5db09a4fbd3f5288e53a05fcd 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -214,6 +214,7 @@ PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
 PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign)
 PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
 PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
+PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
 #endif // HIGH_BIT_DEPTH
 
 #define MC_COPY_WTAB(instr, name1, name2, name3)\
@@ -365,6 +366,7 @@ MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
 #endif
 MC_LUMA(cache64_sse2,cache64_sse2,sse)
 MC_LUMA(cache64_ssse3,cache64_ssse3,sse)
+MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse)
 #endif // !HIGH_BIT_DEPTH
 
 #define GET_REF(name)\
@@ -408,6 +410,7 @@ GET_REF(cache64_mmx2)
 GET_REF(sse2_misalign)
 GET_REF(cache64_sse2)
 GET_REF(cache64_ssse3)
+GET_REF(cache64_ssse3_atom)
 #endif // !HIGH_BIT_DEPTH
 
 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
@@ -606,7 +609,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
 
-    if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
+    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
         pf->integral_init4v = x264_integral_init4v_ssse3;
 
     if( !(cpu&X264_CPU_AVX) )
@@ -649,48 +652,48 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
 
-    if( cpu&X264_CPU_SSE2_IS_SLOW )
-        return;
-
-    pf->weight = x264_mc_weight_wtab_sse2;
-    if( !(cpu&X264_CPU_SLOW_ATOM) )
-    {
-        pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
-        pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
-    }
-
-    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
-    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
-    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
-    pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
-    pf->avg[PIXEL_8x8]  = x264_pixel_avg_8x8_sse2;
-    pf->avg[PIXEL_8x4]  = x264_pixel_avg_8x4_sse2;
-    pf->hpel_filter = x264_hpel_filter_sse2;
-    if( cpu&X264_CPU_SSE_MISALIGN )
-        pf->hpel_filter = x264_hpel_filter_sse2_misalign;
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
-    if( !(cpu&X264_CPU_STACK_MOD4) )
-        pf->mc_chroma = x264_mc_chroma_sse2;
-
-    if( cpu&X264_CPU_SSE2_IS_FAST )
+    if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
     {
-        pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
-        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
-        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
-        pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
-        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
-        pf->mc_luma = mc_luma_sse2;
-        pf->get_ref = get_ref_sse2;
-        if( cpu&X264_CPU_CACHELINE_64 )
+        pf->weight = x264_mc_weight_wtab_sse2;
+        if( !(cpu&X264_CPU_SLOW_ATOM) )
         {
-            pf->mc_luma = mc_luma_cache64_sse2;
-            pf->get_ref = get_ref_cache64_sse2;
+            pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+            pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
         }
+
+        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
+        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
+        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
+        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
+        pf->avg[PIXEL_8x8]  = x264_pixel_avg_8x8_sse2;
+        pf->avg[PIXEL_8x4]  = x264_pixel_avg_8x4_sse2;
+        pf->hpel_filter = x264_hpel_filter_sse2;
         if( cpu&X264_CPU_SSE_MISALIGN )
+            pf->hpel_filter = x264_hpel_filter_sse2_misalign;
+        pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
+        if( !(cpu&X264_CPU_STACK_MOD4) )
+            pf->mc_chroma = x264_mc_chroma_sse2;
+
+        if( cpu&X264_CPU_SSE2_IS_FAST )
         {
-            pf->get_ref = get_ref_sse2_misalign;
-            if( !(cpu&X264_CPU_STACK_MOD4) )
-                pf->mc_chroma = x264_mc_chroma_sse2_misalign;
+            pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
+            pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
+            pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
+            pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
+            pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
+            pf->mc_luma = mc_luma_sse2;
+            pf->get_ref = get_ref_sse2;
+            if( cpu&X264_CPU_CACHELINE_64 )
+            {
+                pf->mc_luma = mc_luma_cache64_sse2;
+                pf->get_ref = get_ref_cache64_sse2;
+            }
+            if( cpu&X264_CPU_SSE_MISALIGN )
+            {
+                pf->get_ref = get_ref_sse2_misalign;
+                if( !(cpu&X264_CPU_STACK_MOD4) )
+                    pf->mc_chroma = x264_mc_chroma_sse2_misalign;
+            }
         }
     }
 
@@ -707,12 +710,21 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_ssse3;
     pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_ssse3;
 
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
-    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
-    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
+    if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+    {
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
+        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
+        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
+    }
 
-    pf->hpel_filter = x264_hpel_filter_ssse3;
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
+    if( !(cpu&X264_CPU_SLOW_PALIGNR) )
+    {
+#if ARCH_X86_64
+        if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? */
+#endif
+            pf->hpel_filter = x264_hpel_filter_ssse3;
+        pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
+    }
     if( !(cpu&X264_CPU_STACK_MOD4) )
         pf->mc_chroma = x264_mc_chroma_ssse3;
 
@@ -722,13 +734,17 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
             pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
         pf->mc_luma = mc_luma_cache64_ssse3;
         pf->get_ref = get_ref_cache64_ssse3;
-
-        /* ssse3 weight is slower on Nehalem, so only assign here. */
-        pf->weight_cache = x264_weight_cache_ssse3;
-        pf->weight = x264_mc_weight_wtab_ssse3;
+        if( cpu&X264_CPU_SLOW_ATOM )
+        {
+            pf->mc_luma = mc_luma_cache64_ssse3_atom;
+            pf->get_ref = get_ref_cache64_ssse3_atom;
+        }
     }
 
-    if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
+    pf->weight_cache = x264_weight_cache_ssse3;
+    pf->weight = x264_mc_weight_wtab_ssse3;
+
+    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
         pf->integral_init4v = x264_integral_init4v_ssse3;
 
     if( !(cpu&X264_CPU_SSE4) )
@@ -744,9 +760,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->integral_init8h = x264_integral_init8h_avx;
     pf->hpel_filter = x264_hpel_filter_avx;
 
-    /* ssse3 weight seems to be faster again on Sandy Bridge and Bulldozer. */
-    pf->weight_cache = x264_weight_cache_ssse3;
-    pf->weight = x264_mc_weight_wtab_ssse3;
     if( !(cpu&X264_CPU_STACK_MOD4) )
         pf->mc_chroma = x264_mc_chroma_avx;
 
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 276309026a1f961c5b25b18d6681a786ef2a1809..09678b08d945e30e20f95b2b8ef230982a23bdd9 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -961,7 +961,7 @@ VAR2_8x8_SSSE3 16, 7
 %if cpuflag(sse4)
     ; just use shufps on anything post conroe
     shufps %1, %2, 0
-%elif cpuflag(ssse3)
+%elif cpuflag(ssse3) && notcpuflag(atom)
     ; join 2x 32 bit and duplicate them
     ; emulating shufps is faster on conroe
     punpcklqdq %1, %2
@@ -1079,6 +1079,7 @@ VAR2_8x8_SSSE3 16, 7
     SWAP %%n, 4
 %endmacro
 
+; in: %1 = horizontal if 0, vertical if 1
 %macro SATD_8x4_SSE 8-9
 %if %1
     HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
@@ -1253,7 +1254,7 @@ cglobal pixel_satd_4x4, 4,6
     FIX_STRIDES r1, r3
 %if HIGH_BIT_DEPTH && %3
     pxor    %2, %2
-%elif cpuflag(ssse3)
+%elif cpuflag(ssse3) && notcpuflag(atom)
     mova    %2, [hmul_8p]
 %endif
     lea     r4, [3*r1]
@@ -1307,7 +1308,7 @@ cglobal pixel_satd_4x4, 4,6
 %endif
 %endmacro
 
-%macro SATD_4x8_SSE 2
+%macro SATD_4x8_SSE 3
 %if HIGH_BIT_DEPTH
     movh    m0, [r0+0*r1]
     movh    m4, [r2+0*r3]
@@ -1348,7 +1349,7 @@ cglobal pixel_satd_4x4, 4,6
     JDUP m5, m3
     movd m3, [r0+2*r1]
     JDUP m1, m3
-%if cpuflag(ssse3) && %1==1
+%if %1==0 && %2==1
     mova m3, [hmul_4p]
     DIFFOP 0, 4, 1, 5, 3
 %else
@@ -1366,21 +1367,23 @@ cglobal pixel_satd_4x4, 4,6
     JDUP m5, m4
     movd m4, [r0+r1]
     JDUP m3, m4
-%if cpuflag(ssse3) && %1==1
+%if %1==0 && %2==1
     mova m4, [hmul_4p]
     DIFFOP 2, 6, 3, 5, 4
 %else
     DIFFOP 2, 6, 3, 5, 7
 %endif
 %endif ; HIGH_BIT_DEPTH
-    SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 7, %2
+    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
 %endmacro
 
 ;-----------------------------------------------------------------------------
 ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SATDS_SSE2 0
-%if cpuflag(ssse3)
+%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
+
+%if vertical==0 || HIGH_BIT_DEPTH
 cglobal pixel_satd_4x4, 4, 6, 6
     SATD_START_MMX
     mova m4, [hmul_4p]
@@ -1399,33 +1402,33 @@ cglobal pixel_satd_4x4, 4, 6, 6
 
 cglobal pixel_satd_4x8, 4, 6, 8
     SATD_START_MMX
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
-    mova   m7, [hmul_4p]
+%if vertical==0
+    mova m7, [hmul_4p]
 %endif
-    SATD_4x8_SSE 0, swap
-    HADDW  m7, m1
-    movd  eax, m7
+    SATD_4x8_SSE vertical, 0, swap
+    HADDW m7, m1
+    movd eax, m7
     RET
 
 cglobal pixel_satd_4x16, 4, 6, 8
     SATD_START_MMX
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+%if vertical==0
     mova m7, [hmul_4p]
 %endif
-    SATD_4x8_SSE 0, swap
+    SATD_4x8_SSE vertical, 0, swap
     lea r0, [r0+r1*2*SIZEOF_PIXEL]
     lea r2, [r2+r3*2*SIZEOF_PIXEL]
-    SATD_4x8_SSE 1, add
+    SATD_4x8_SSE vertical, 1, add
     HADDW m7, m1
     movd eax, m7
     RET
 
 cglobal pixel_satd_8x8_internal
     LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
-    SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6
+    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
 %%pixel_satd_8x4_internal:
     LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
-    SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6
+    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
     ret
 
 %if HIGH_BIT_DEPTH == 0 && UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
@@ -1433,20 +1436,21 @@ cglobal pixel_satd_16x4_internal
     LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
     lea  r2, [r2+4*r3]
     lea  r0, [r0+4*r1]
+    ; always use horizontal mode here
     SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
     SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
     ret
 
 cglobal pixel_satd_16x8, 4,6,12
     SATD_START_SSE2 m10, m7
-%if notcpuflag(ssse3)
+%if vertical
     mova m7, [pw_00ff]
 %endif
     jmp %%pixel_satd_16x8_internal
 
 cglobal pixel_satd_16x16, 4,6,12
     SATD_START_SSE2 m10, m7
-%if notcpuflag(ssse3)
+%if vertical
     mova m7, [pw_00ff]
 %endif
     call pixel_satd_16x4_internal
@@ -1510,11 +1514,8 @@ cglobal pixel_satd_8x4, 4,6,8
 %endmacro
 
 %macro SA8D 0
-%if HIGH_BIT_DEPTH
-    %define vertical 1
-%else ; sse2 doesn't seem to like the horizontal way of doing things
-    %define vertical (cpuflags == cpuflags_sse2)
-%endif
+; sse2 doesn't seem to like the horizontal way of doing things
+%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
 
 %if ARCH_X86_64
 ;-----------------------------------------------------------------------------
@@ -1724,42 +1725,43 @@ cglobal pixel_sa8d_16x16, 4,7
 ; SA8D_SATD
 ;=============================================================================
 
-; %1-%4: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
+; %1: vertical/horizontal mode
+; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
 ; m10: satd result
 ; m6, m11-15: tmp regs
-%macro SA8D_SATD_8x4 4
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
-    LOAD_SUMSUB_8x4P_SSSE3 %1, %2, %3, %4, 6, 11, 7, r0, r2, 1
-    HADAMARD4_V %1, %2, %3, %4, 6
-
-    pabsw    m12, m%1 ; doing the abs first is a slight advantage
-    pabsw    m14, m%3
-    pabsw    m13, m%2
-    pabsw    m15, m%4
-    HADAMARD 1, max, 12, 14, 6, 11
-    paddw    m10, m12
-    HADAMARD 1, max, 13, 15, 6, 11
-    paddw    m10, m13
-%else
-    LOAD_DIFF_8x4P %1, %2, %3, %4, 6, 11, 7, r0, r2, 1
-    HADAMARD   0, sumsub, %1, %2, 6
-    HADAMARD   0, sumsub, %3, %4, 6
-    SBUTTERFLY        wd, %1, %2, 6
-    SBUTTERFLY        wd, %3, %4, 6
-    HADAMARD2_2D  %1, %3, %2, %4, 6, dq
-
-    mova   m12, m%1
-    mova   m13, m%2
-    mova   m14, m%3
-    mova   m15, m%4
-    HADAMARD 0, sumsub, %1, %2, 6
-    HADAMARD 0, sumsub, %3, %4, 6
+%macro SA8D_SATD_8x4 5
+%if %1
+    LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
+    HADAMARD   0, sumsub, %2, %3, 6
+    HADAMARD   0, sumsub, %4, %5, 6
+    SBUTTERFLY        wd, %2, %3, 6
+    SBUTTERFLY        wd, %4, %5, 6
+    HADAMARD2_2D  %2, %4, %3, %5, 6, dq
+
+    mova   m12, m%2
+    mova   m13, m%3
+    mova   m14, m%4
+    mova   m15, m%5
+    HADAMARD 0, sumsub, %2, %3, 6
+    HADAMARD 0, sumsub, %4, %5, 6
     SBUTTERFLY     qdq, 12, 13, 6
     HADAMARD   0, amax, 12, 13, 6
     SBUTTERFLY     qdq, 14, 15, 6
     paddw m10, m12
     HADAMARD   0, amax, 14, 15, 6
     paddw m10, m14
+%else
+    LOAD_SUMSUB_8x4P_SSSE3 %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
+    HADAMARD4_V %2, %3, %4, %5, 6
+
+    pabsw    m12, m%2 ; doing the abs first is a slight advantage
+    pabsw    m14, m%4
+    pabsw    m13, m%3
+    pabsw    m15, m%5
+    HADAMARD 1, max, 12, 14, 6, 11
+    paddw    m10, m12
+    HADAMARD 1, max, 13, 15, 6, 11
+    paddw    m10, m13
 %endif
 %endmacro ; SA8D_SATD_8x4
 
@@ -1786,12 +1788,15 @@ cglobal pixel_sa8d_16x16, 4,7
 %endmacro
 
 %macro SA8D_SATD 0
+%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
 cglobal pixel_sa8d_satd_8x8_internal
-    SA8D_SATD_8x4 0, 1, 2, 3
-    SA8D_SATD_8x4 4, 5, 8, 9
+    SA8D_SATD_8x4 vertical, 0, 1, 2, 3
+    SA8D_SATD_8x4 vertical, 4, 5, 8, 9
 
-    ; complete sa8d
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+%if vertical ; sse2-style
+    HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
+    HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
+%else        ; complete sa8d
     SUMSUB_BADC w, 0, 4, 1, 5, 12
     HADAMARD 2, sumsub, 0, 4, 12, 11
     HADAMARD 2, sumsub, 1, 5, 12, 11
@@ -1802,9 +1807,6 @@ cglobal pixel_sa8d_satd_8x8_internal
     HADAMARD 1, amax, 1, 5, 12, 4
     HADAMARD 1, amax, 2, 8, 12, 4
     HADAMARD 1, amax, 3, 9, 12, 4
-%else ; sse2
-    HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
-    HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
 %endif
 
     ; create sa8d sub results
@@ -1822,7 +1824,7 @@ cglobal pixel_sa8d_satd_16x16, 4,8,16,SIZEOF_PIXEL*mmsize
     %define temp0 [rsp+0*mmsize]
     %define temp1 [rsp+1*mmsize]
     FIX_STRIDES r1, r3
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+%if vertical==0
     mova     m7, [hmul_8p]
 %endif
     lea      r4, [3*r1]
@@ -2720,7 +2722,7 @@ ALIGN 16
     psubw      m1, m9
     psubw      m2, m10
     psubw      m3, m11
-    SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 13, 14, 0, swap
+    SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
     pmaddwd    m0, [pw_1]
 %if cpuflag(sse4)
     pshufd     m1, m0, q0032
@@ -2828,7 +2830,7 @@ ALIGN 16
     psubw      m2, [fenc_buf+0x20]
 .satd_8x4b:
     psubw      m3, [fenc_buf+0x30]
-    SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 0, swap
+    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
     pmaddwd    m0, [pw_1]
 %if cpuflag(sse4)
     pshufd     m1, m0, q0032
@@ -3773,7 +3775,7 @@ cglobal hadamard_ac_8x8
 %endif
 %if HIGH_BIT_DEPTH
     %define vertical 1
-%elif cpuflag(ssse3)
+%elif cpuflag(ssse3) && notcpuflag(atom)
     %define vertical 0
     ;LOAD_INC loads sumsubs
     mova      m7, [hmul_8p]
@@ -3980,6 +3982,16 @@ INTRA_X3_MMX
 INIT_XMM sse2
 HADAMARD_AC_SSE2
 
+%if HIGH_BIT_DEPTH == 0
+INIT_XMM ssse3,atom
+SATDS_SSE2
+SA8D
+HADAMARD_AC_SSE2
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+%endif
+
 %define DIFFOP DIFF_SUMSUB_SSSE3
 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
 %if HIGH_BIT_DEPTH == 0
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 48910d9272d43a640da45d3d4c8b49c46344e875..e06005f98c236d3cf33085a7c69d43cea34030d9 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -66,12 +66,14 @@ DECL_X1( ssd, xop )
 DECL_X1( satd, mmx2 )
 DECL_X1( satd, sse2 )
 DECL_X1( satd, ssse3 )
+DECL_X1( satd, ssse3_atom )
 DECL_X1( satd, sse4 )
 DECL_X1( satd, avx )
 DECL_X1( satd, xop )
 DECL_X1( sa8d, mmx2 )
 DECL_X1( sa8d, sse2 )
 DECL_X1( sa8d, ssse3 )
+DECL_X1( sa8d, ssse3_atom )
 DECL_X1( sa8d, sse4 )
 DECL_X1( sa8d, avx )
 DECL_X1( sa8d, xop )
@@ -91,6 +93,7 @@ DECL_PIXELS( uint64_t, var, xop,  ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, mmx2,  ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, sse2,  ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, sse4,  ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, avx,   ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, xop,   ( pixel *pix, intptr_t i_stride ))
@@ -162,11 +165,12 @@ int  x264_pixel_vsad_xop  ( pixel *src, intptr_t stride, int height );
 int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
 int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
 int x264_pixel_asd8_xop  ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
-uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_avx  ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_xop  ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_sse2      ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_ssse3     ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_sse4      ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_avx       ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_xop       ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
 
 #define DECL_ADS( size, suffix ) \
 int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 395b8237f50b541c581f267460d010da536e6be3..712a06136ea8c09e45c9800a469085779b93b9fa 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -358,7 +358,8 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
     pf[I_PRED_16x16_P]       = x264_predict_16x16_p_sse2;
     if( !(cpu&X264_CPU_SSSE3) )
         return;
-    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_ssse3;
+    if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_ssse3;
 #if HAVE_X86_INLINE_ASM
     pf[I_PRED_16x16_P]       = x264_predict_16x16_p_ssse3;
 #endif
@@ -530,8 +531,11 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
     pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_sse2;
     if( !(cpu&X264_CPU_SSSE3) )
         return;
-    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_ssse3;
-    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_ssse3;
+    if( !(cpu&X264_CPU_SLOW_PALIGNR) )
+    {
+        pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_ssse3;
+        pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_ssse3;
+    }
     pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_ssse3;
     *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
     if( !(cpu&X264_CPU_AVX) )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 4f3d555b2f3ae7e8c72910afd1ea2d1cefac809d..7e5ebcaef6a7417c17474dcd09b0aa5f0305ef5c 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -417,17 +417,33 @@ static void x264_lookahead_thread_init( x264_t *h )
 static int x264_validate_parameters( x264_t *h, int b_open )
 {
 #if HAVE_MMX
-#ifdef __SSE__
-    if( b_open && !(x264_cpu_detect() & X264_CPU_SSE) )
+    if( b_open )
     {
-        x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n");
+        int cpuflags = x264_cpu_detect();
+        int fail = 0;
+#ifdef __SSE__
+        if( !(cpuflags & X264_CPU_SSE) )
+        {
+            x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm\n");
+            fail = 1;
+        }
 #else
-    if( b_open && !(x264_cpu_detect() & X264_CPU_MMX2) )
-    {
-        x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+        if( !(cpuflags & X264_CPU_MMX2) )
+        {
+            x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm\n");
+            fail = 1;
+        }
 #endif
-        x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
-        return -1;
+        if( !fail && !(cpuflags & X264_CPU_CMOV) )
+        {
+            x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n");
+            fail = 1;
+        }
+        if( fail )
+        {
+            x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n");
+            return -1;
+        }
     }
 #endif
 
@@ -1262,6 +1278,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
     p = buf + sprintf( buf, "using cpu capabilities:" );
     for( int i = 0; x264_cpu_names[i].flags; i++ )
     {
+        if( !strcmp(x264_cpu_names[i].name, "SSE")
+            && h->param.cpu & (X264_CPU_SSE2) )
+            continue;
         if( !strcmp(x264_cpu_names[i].name, "SSE2")
             && h->param.cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
             continue;
@@ -1303,7 +1322,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
     {
         x264_log( h, X264_LOG_ERROR, "CLZ test failed: x264 has been miscompiled!\n" );
 #if ARCH_X86 || ARCH_X86_64
-        x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a-targeted build on a CPU that\n" );
+        x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a/LZCNT-targeted build on a CPU that\n" );
         x264_log( h, X264_LOG_ERROR, "doesn't support it?\n" );
 #endif
         goto fail;
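
The "using cpu capabilities:" loop above reports an entry of x264_cpu_names[]
whenever all of its bits are present in the detected mask. A hedged sketch of
that lookup, assuming the x264_cpu_name_t declaration exported in x264.h:

    #include <stdio.h>
    #include "x264.h"

    /* Print every capability name whose flag bits are all set in 'cpu'. */
    static void print_cpu_names( unsigned int cpu )
    {
        for( int i = 0; x264_cpu_names[i].flags; i++ )
            if( (cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags )
                printf( " %s", x264_cpu_names[i].name );
        printf( "\n" );
    }
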
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 8cfbc5d412d3b6d2723fba018f5d77b5a063b293..ad22dec316a0ef2ee9a2fe1dda4a2fc5cc39c051 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -164,6 +164,7 @@ static void print_bench(void)
             if( k < j )
                 continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+#if HAVE_MMX
                     b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" :
                     b->cpu&X264_CPU_AVX2 ? "avx2" :
                     b->cpu&X264_CPU_FMA3 ? "fma3" :
@@ -178,20 +179,28 @@ static void print_bench(void)
                     b->cpu&X264_CPU_SSE2 ? "sse2" :
                     b->cpu&X264_CPU_SSE ? "sse" :
                     b->cpu&X264_CPU_MMX ? "mmx" :
+#elif ARCH_PPC
                     b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+#elif ARCH_ARM
                     b->cpu&X264_CPU_NEON ? "neon" :
-                    b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
+                    b->cpu&X264_CPU_ARMV6 ? "armv6" :
+#endif
+                    "c",
+#if HAVE_MMX
                     b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
+                    b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" :
                     b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
-                    b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
+                    b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" :
                     b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
                     b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
                     b->cpu&X264_CPU_BMI2 ? "_bmi2" :
-                    b->cpu&X264_CPU_TBM ? "_tbm" :
                     b->cpu&X264_CPU_BMI1 ? "_bmi1" :
-                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
                     b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
-                    b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
+                    b->cpu&X264_CPU_SLOW_ATOM ? "_atom" :
+#elif ARCH_ARM
+                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
+#endif
+                    "",
                     ((int64_t)10*b->cycles/b->den - nop_time)/4 );
         }
 }
@@ -2440,12 +2449,10 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
         cpu1 &= ~X264_CPU_CACHELINE_64;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
-        cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
+        cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
         cpu1 &= ~X264_CPU_SLOW_CTZ;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
-        cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
     if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
     {
@@ -2467,15 +2474,17 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
         cpu1 &= ~X264_CPU_CACHELINE_64;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
-        cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
+        cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
         cpu1 &= ~X264_CPU_SLOW_CTZ;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
+        cpu1 &= ~X264_CPU_CACHELINE_64;
         cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
     if( x264_cpu_detect() & X264_CPU_SSE4 )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
     if( x264_cpu_detect() & X264_CPU_AVX )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
     if( x264_cpu_detect() & X264_CPU_XOP )
@@ -2488,11 +2497,6 @@ static int check_all_flags( void )
     if( x264_cpu_detect() & X264_CPU_BMI1 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
-        if( x264_cpu_detect() & X264_CPU_TBM )
-        {
-            ret |= add_flags( &cpu0, &cpu1, X264_CPU_TBM, "TBM" );
-            cpu1 &= ~X264_CPU_TBM;
-        }
         if( x264_cpu_detect() & X264_CPU_BMI2 )
         {
             ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
diff --git a/x264.h b/x264.h
index a93db090b7715a4bd55653aa0c2585aeb44ba754..3a233906ee88da4bc164b1a031fa093aca786537 100644
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
 
 #include "x264_config.h"
 
-#define X264_BUILD 129
+#define X264_BUILD 130
 
 /* Application developers planning to link against a shared library version of
  * libx264 from a Microsoft Visual Studio or similar development environment
@@ -109,43 +109,53 @@ typedef struct
 /****************************************************************************
  * Encoder parameters
  ****************************************************************************/
-/* CPU flags
- */
-#define X264_CPU_CACHELINE_32    0x0000001  /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64    0x0000002  /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_ALTIVEC         0x0000004
-#define X264_CPU_MMX             0x0000008
-#define X264_CPU_MMX2            0x0000010  /* MMX2 aka MMXEXT aka ISSE */
+/* CPU flags */
+
+/* x86 */
+#define X264_CPU_CMOV            0x0000001
+#define X264_CPU_MMX             0x0000002
+#define X264_CPU_MMX2            0x0000004  /* MMX2 aka MMXEXT aka ISSE */
 #define X264_CPU_MMXEXT          X264_CPU_MMX2
-#define X264_CPU_SSE             0x0000020
-#define X264_CPU_SSE2            0x0000040
-#define X264_CPU_SSE2_IS_SLOW    0x0000080  /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST    0x0000100  /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SSE3            0x0000200
-#define X264_CPU_SSSE3           0x0000400
-#define X264_CPU_SHUFFLE_IS_FAST 0x0000800  /* Penryn, Nehalem, and Phenom have fast shuffle units */
-#define X264_CPU_STACK_MOD4      0x0001000  /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SSE4            0x0002000  /* SSE4.1 */
-#define X264_CPU_SSE42           0x0004000  /* SSE4.2 */
-#define X264_CPU_SSE_MISALIGN    0x0008000  /* Phenom support for misaligned SSE instruction arguments */
-#define X264_CPU_LZCNT           0x0010000  /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_ARMV6           0x0020000
-#define X264_CPU_NEON            0x0040000  /* ARM NEON */
-#define X264_CPU_FAST_NEON_MRC   0x0080000  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
-#define X264_CPU_SLOW_CTZ        0x0100000  /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM       0x0200000  /* The Atom just sucks */
-#define X264_CPU_AVX             0x0400000  /* AVX support: requires OS support even if YMM registers
-                                             * aren't used. */
-#define X264_CPU_XOP             0x0800000  /* AMD XOP */
-#define X264_CPU_FMA4            0x1000000  /* AMD FMA4 */
-#define X264_CPU_AVX2            0x2000000  /* AVX2 */
-#define X264_CPU_FMA3            0x4000000  /* Intel FMA3 */
-#define X264_CPU_BMI1            0x8000000  /* BMI1 */
-#define X264_CPU_BMI2           0x10000000  /* BMI2 */
-#define X264_CPU_TBM            0x20000000  /* AMD TBM */
-
-/* Analyse flags
- */
+#define X264_CPU_SSE             0x0000008
+#define X264_CPU_SSE2            0x0000010
+#define X264_CPU_SSE3            0x0000020
+#define X264_CPU_SSSE3           0x0000040
+#define X264_CPU_SSE4            0x0000080  /* SSE4.1 */
+#define X264_CPU_SSE42           0x0000100  /* SSE4.2 */
+#define X264_CPU_SSE_MISALIGN    0x0000200  /* Phenom support for misaligned SSE instruction arguments */
+#define X264_CPU_LZCNT           0x0000400  /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_AVX             0x0000800  /* AVX support: requires OS support even if YMM registers aren't used. */
+#define X264_CPU_XOP             0x0001000  /* AMD XOP */
+#define X264_CPU_FMA4            0x0002000  /* AMD FMA4 */
+#define X264_CPU_AVX2            0x0004000  /* AVX2 */
+#define X264_CPU_FMA3            0x0008000  /* Intel FMA3 */
+#define X264_CPU_BMI1            0x0010000  /* BMI1 */
+#define X264_CPU_BMI2            0x0020000  /* BMI2 */
+/* x86 modifiers */
+#define X264_CPU_CACHELINE_32    0x0040000  /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64    0x0080000  /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_SSE2_IS_SLOW    0x0100000  /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST    0x0200000  /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SLOW_SHUFFLE    0x0400000  /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X264_CPU_STACK_MOD4      0x0800000  /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SLOW_CTZ        0x1000000  /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM       0x2000000  /* The Atom is terrible: slow SSE unaligned loads, slow
+                                             * SIMD multiplies, slow SIMD variable shifts, slow pshufb,
+                                             * cacheline split penalties -- gather everything here that
+                                             * isn't shared by other CPUs to avoid making half a dozen
+                                             * new SLOW flags. */
+#define X264_CPU_SLOW_PSHUFB     0x4000000  /* such as on the Intel Atom */
+#define X264_CPU_SLOW_PALIGNR    0x8000000  /* such as on the AMD Bobcat */
+
+/* PowerPC */
+#define X264_CPU_ALTIVEC         0x0000001
+
+/* ARM */
+#define X264_CPU_ARMV6           0x0000001
+#define X264_CPU_NEON            0x0000002  /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+
+/* Analyse flags */
 #define X264_ANALYSE_I4x4       0x0001  /* Analyse i4x4 */
 #define X264_ANALYSE_I8x8       0x0002  /* Analyse i8x8 (requires 8x8 transform) */
 #define X264_ANALYSE_PSUB16x16  0x0010  /* Analyse p16x8, p8x16 and p8x8 */
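
A closing note on the renumbering above: the x86, PowerPC and ARM flag
namespaces now overlap (X264_CPU_CMOV, X264_CPU_ALTIVEC and X264_CPU_ARMV6 are
all 0x1), so a flag is only meaningful on the architecture it was defined for.
A hedged sketch of an architecture-guarded test, reusing the HAVE_MMX build
macro the patch itself uses; that macro is internal to x264's config, so this
is illustrative only:

    #include "x264.h"

    /* Test a flag only under the architecture it belongs to. */
    static int is_slow_atom( unsigned int cpu )
    {
    #if HAVE_MMX   /* x86 / x86-64 builds */
        return !!(cpu & X264_CPU_SLOW_ATOM);
    #else
        return 0;  /* on PPC/ARM this bit value means something else, or nothing */
    #endif
    }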