git.sesse.net Git - x264/commitdiff
many changes to which asm functions are enabled on which cpus.
authorLoren Merritt <pengvado@akuvian.org>
Sat, 7 Jun 2008 05:31:22 +0000 (23:31 -0600)
committerLoren Merritt <pengvado@akuvian.org>
Sun, 8 Jun 2008 05:02:04 +0000 (23:02 -0600)
with Phenom, 3dnow is no longer equivalent to "sse2 is slow", so make a new flag for that.
some sse2 functions are useful only on Core2 and Phenom, so make a "sse2 is fast" flag for that.
some ssse3 instructions didn't become useful until Penryn, so yet another flag.
disable sse2 completely on Pentium M and Core1, because it's uniformly slower than mmx.
enable some sse2 functions on Athlon64 that always were faster and we just didn't notice.
remove mc_luma_sse3, because the only cpu that has lddqu (namely Pentium 4D) doesn't have "sse2 is fast".
don't print mmx1, sse1, or 3dnow in the detected cpuflags, since we don't really have any such functions. likewise don't print sse3 unless it's used (Pentium 4D).

12 files changed:
common/cpu.c
common/cpu.h
common/dct.c
common/frame.c
common/pixel.c
common/x86/dct-a.asm
common/x86/mc-a.asm
common/x86/mc-c.c
common/x86/predict-c.c
encoder/encoder.c
tools/checkasm.c
x264.h

index 3ebe970f530d3e7ec0fc886f6d0c29d0b924e97f..ed72c6496c24a5248525d2a09cb05c40bc5035d6 100644 (file)
 #endif
 
 #include "common.h"
+#include "cpu.h"
 
-const struct {
-    const char name[8];
-    int flags;
-} x264_cpu_names[] = {
-    {"MMX",     X264_CPU_MMX},
+const x264_cpu_name_t x264_cpu_names[] = {
+    {"Altivec", X264_CPU_ALTIVEC},
+//  {"MMX",     X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
     {"MMX2",    X264_CPU_MMX|X264_CPU_MMXEXT},
     {"MMXEXT",  X264_CPU_MMX|X264_CPU_MMXEXT},
-    {"SSE",     X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE},
-    {"SSE1",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE},
+//  {"SSE",     X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264
+    {"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW},
     {"SSE2",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
+    {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
     {"SSE3",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
     {"SSSE3",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
+    {"PHADD",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
     {"SSE4",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
-    {"3DNow",   X264_CPU_3DNOW},
-    {"Altivec", X264_CPU_ALTIVEC},
-    {"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
-    {"Cache64", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64},
+    {"Cache32", X264_CPU_CACHELINE_32},
+    {"Cache64", X264_CPU_CACHELINE_64},
     {"", 0},
 };
 
@@ -92,57 +91,87 @@ uint32_t x264_cpu_detect( void )
     if( ecx&0x00080000 )
         cpu |= X264_CPU_SSE4;
 
+    if( cpu & X264_CPU_SSSE3 )
+        cpu |= X264_CPU_SSE2_IS_FAST;
+    if( cpu & X264_CPU_SSE4 )
+        cpu |= X264_CPU_PHADD_IS_FAST;
+
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
 
     if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
     {
         x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
-        if( edx&0x80000000 )
-            cpu |= X264_CPU_3DNOW;
         if( edx&0x00400000 )
             cpu |= X264_CPU_MMXEXT;
+        if( cpu & X264_CPU_SSE2 )
+        {
+            if( ecx&0x00000040 ) /* SSE4a */
+                cpu |= X264_CPU_SSE2_IS_FAST;
+            else
+                cpu |= X264_CPU_SSE2_IS_SLOW;
+        }
     }
 
-    if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
-        cpu |= X264_CPU_CACHELINE_SPLIT;
-    /* cacheline size is specified in 3 places, any of which may be missing */
-    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
-    cache = (ebx&0xff00)>>5; // cflush size
-    if( !cache && max_extended_cap >= 0x80000006 )
+    if( !strcmp((char*)vendor, "GenuineIntel") )
     {
-        x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
-        cache = ecx&0xff; // cacheline size
+        int family, model, stepping;
+        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
+        family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+        model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
+        stepping = eax&0xf;
+        /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
+         * theoretically support sse2, but it's significantly slower than mmx for
+         * almost all of x264's functions, so let's just pretend they don't. */
+        if( family==6 && (model==9 || model==13 || model==14) )
+        {
+            cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
+            assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
+        }
     }
-    if( !cache )
+
+    if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
     {
-        // Cache and TLB Information
-        static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
-        static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
-        uint32_t buf[4];
-        int max, i=0, j;
-        do {
-            x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
-            max = buf[0]&0xff;
-            buf[0] &= ~0xff;
-            for(j=0; j<4; j++)
-                if( !(buf[j]>>31) )
-                    while( buf[j] )
-                    {
-                        if( strchr( cache32_ids, buf[j]&0xff ) )
-                            cache = 32;
-                        if( strchr( cache64_ids, buf[j]&0xff ) )
-                            cache = 64;
-                        buf[j] >>= 8;
-                    }
-        } while( ++i < max );
+        /* cacheline size is specified in 3 places, any of which may be missing */
+        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
+        cache = (ebx&0xff00)>>5; // cflush size
+        if( !cache && max_extended_cap >= 0x80000006 )
+        {
+            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
+            cache = ecx&0xff; // cacheline size
+        }
+        if( !cache )
+        {
+            // Cache and TLB Information
+            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
+            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
+            uint32_t buf[4];
+            int max, i=0, j;
+            do {
+                x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
+                max = buf[0]&0xff;
+                buf[0] &= ~0xff;
+                for(j=0; j<4; j++)
+                    if( !(buf[j]>>31) )
+                        while( buf[j] )
+                        {
+                            if( strchr( cache32_ids, buf[j]&0xff ) )
+                                cache = 32;
+                            if( strchr( cache64_ids, buf[j]&0xff ) )
+                                cache = 64;
+                            buf[j] >>= 8;
+                        }
+            } while( ++i < max );
+        }
+
+        if( cache == 32 )
+            cpu |= X264_CPU_CACHELINE_32;
+        else if( cache == 64 )
+            cpu |= X264_CPU_CACHELINE_64;
+        else
+            fprintf( stderr, "x264 [warning]: unable to determine cacheline size\n" );
     }
 
-    if( cache == 32 )
-        cpu |= X264_CPU_CACHELINE_32;
-    if( cache == 64 )
-        cpu |= X264_CPU_CACHELINE_64;
-
     return cpu;
 }
 
index 6a669735f3e72ef23172fc275b327e01b0f758e6..1871e3a2af2dcce257972725b67846c388662582 100644 (file)
@@ -42,9 +42,10 @@ void x264_stack_align( void (*func)(x264_t*), x264_t *arg );
 #define x264_stack_align(func,arg) func(arg)
 #endif
 
-extern const struct {
-    const char name[8];
+typedef struct {
+    const char name[12];
     int flags;
-} x264_cpu_names[];
+} x264_cpu_name_t;
+extern const x264_cpu_name_t x264_cpu_names[];
 
 #endif
index 669e24f34ad63409a898103bdfd4c9b15fb98777..1815fc3db281b13c6b0e708b6fda3ce6b9c2a4eb 100644 (file)
@@ -394,20 +394,18 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     if( cpu&X264_CPU_MMX )
     {
         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
-        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
-
         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
-        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
-        dctf->add16x16_idct = x264_add16x16_idct_mmx;
-
         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 
 #ifndef ARCH_X86_64
+        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
+        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
+        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
+        dctf->add16x16_idct = x264_add16x16_idct_mmx;
+
         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
-
         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 #endif
@@ -419,9 +417,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
-    }
-    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
-    {
+
         dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
index 70bcf8a0abf5e7ddec27fa68b665495e06dcbecb..a1a36b55c83fc5e47ff28e424b6eb01bd5f824ad 100644 (file)
@@ -44,11 +44,10 @@ x264_frame_t *x264_frame_new( x264_t *h )
     if( h->param.b_interlaced )
         i_lines = ( i_lines + 31 ) & -32;
 
-    if( h->param.cpu&X264_CPU_CACHELINE_SPLIT )
-    {
-        int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64;
-        i_stride = (i_stride + align-1) & -align;
-    }
+    if( h->param.cpu&X264_CPU_CACHELINE_64 )
+        i_stride = (i_stride + 63) & ~63;
+    else if( h->param.cpu&X264_CPU_CACHELINE_32 )
+        i_stride = (i_stride + 31) & ~31;
 
     frame->i_plane = 3;
     for( i = 0; i < 3; i++ )
index 0d00b6e52c40131f9c2d39c631a1c6285774e1e2..11d74a0e4613078d5ab226dac03d8077bdcfe42a 100644 (file)
@@ -557,23 +557,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
         pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_mmxext;
 
-        if( cpu&X264_CPU_CACHELINE_SPLIT )
+        if( cpu&X264_CPU_CACHELINE_32 )
         {
-            if( cpu&X264_CPU_CACHELINE_32 )
-            {
-                INIT5( sad, _cache32_mmxext );
-                INIT4( sad_x3, _cache32_mmxext );
-                INIT4( sad_x4, _cache32_mmxext );
-            }
-            else
-            {
-                INIT5( sad, _cache64_mmxext );
-                INIT4( sad_x3, _cache64_mmxext );
-                INIT4( sad_x4, _cache64_mmxext );
-            }
+            INIT5( sad, _cache32_mmxext );
+            INIT4( sad_x3, _cache32_mmxext );
+            INIT4( sad_x4, _cache32_mmxext );
+        }
+        else if( cpu&X264_CPU_CACHELINE_64 )
+        {
+            INIT5( sad, _cache64_mmxext );
+            INIT4( sad_x3, _cache64_mmxext );
+            INIT4( sad_x4, _cache64_mmxext );
         }
 #else
-        if( cpu&X264_CPU_CACHELINE_SPLIT )
+        if( cpu&X264_CPU_CACHELINE_64 )
         {
             pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
             pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmxext;
@@ -589,19 +586,15 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmxext;
     }
 
-    // disable on AMD processors since it is slower
-    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
+    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
     {
         INIT2( sad, _sse2 );
         INIT2( sad_x3, _sse2 );
         INIT2( sad_x4, _sse2 );
-        INIT5( satd, _sse2 );
-        INIT5( satd_x3, _sse2 );
-        INIT5( satd_x4, _sse2 );
         INIT_ADS( _sse2 );
 
 #ifdef ARCH_X86
-        if( cpu&X264_CPU_CACHELINE_SPLIT )
+        if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_sse2 );
             INIT2( sad_x3, _cache64_sse2 );
@@ -609,10 +602,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         }
 #endif
     }
-    // these are faster on both Intel and AMD
     if( cpu&X264_CPU_SSE2 )
     {
         INIT5( ssd, _sse2 );
+        INIT5( satd, _sse2 );
+        INIT5( satd_x3, _sse2 );
+        INIT5( satd_x4, _sse2 );
         pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
         pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
@@ -622,7 +617,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #endif
     }
 
-    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) )
+    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
     {
         INIT2( sad, _sse3 );
         INIT2( sad_x3, _sse3 );
@@ -643,20 +638,18 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #ifdef ARCH_X86_64
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
 #endif
-        if( cpu&X264_CPU_CACHELINE_SPLIT )
+        if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_ssse3 );
             INIT2( sad_x3, _cache64_ssse3 );
             INIT2( sad_x4, _cache64_ssse3 );
         }
-    }
-
-    if( cpu&X264_CPU_SSE4 )
-    {
-        // enabled on Penryn, but slower on Conroe
-        INIT5( satd, _ssse3_phadd );
-        INIT5( satd_x3, _ssse3_phadd );
-        INIT5( satd_x4, _ssse3_phadd );
+        if( cpu&X264_CPU_PHADD_IS_FAST )
+        {
+            INIT5( satd, _ssse3_phadd );
+            INIT5( satd_x3, _ssse3_phadd );
+            INIT5( satd_x4, _ssse3_phadd );
+        }
     }
 #endif //HAVE_MMX
 
index 77baddaabd01dea9d3c25b13c1bf960f9b4d9a98..525f94a342ad62b6a87c9da1d3e048955ed1a083 100644 (file)
@@ -283,16 +283,12 @@ cglobal %1, 2,2,1
     jmp  %2
 %endmacro
 
+%ifndef ARCH_X86_64
 SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx  %+ .skip_prologue, 32, 4, 0, 0
 ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
-
 SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx  %+ .skip_prologue, 32, 8, 4, 4
 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
 
-SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2  %+ .skip_prologue, 64, 8, 0, 4
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
-
-%ifndef ARCH_X86_64
 cextern x264_sub8x8_dct8_mmx.skip_prologue
 cextern x264_add8x8_idct8_mmx.skip_prologue
 SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx  %+ .skip_prologue, 128, 8, 0, 0
@@ -301,6 +297,9 @@ ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 1
 %define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
 %endif
 
+SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2  %+ .skip_prologue, 64, 8, 0, 4
+ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
+
 cextern x264_sub8x8_dct8_sse2
 cextern x264_add8x8_idct8_sse2
 SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0, 0
index eadb6b5a903db071e3fdbd62775f2dcc681da2a7..3dabe9f67b331edb092547ac4036208efa1d363d 100644 (file)
@@ -230,8 +230,7 @@ cglobal x264_pixel_avg2_w20_mmxext, 6,7
     jg     .height_loop
     REP_RET
 
-%macro PIXEL_AVG_SSE 1
-cglobal x264_pixel_avg2_w16_%1, 6,7
+cglobal x264_pixel_avg2_w16_sse2, 6,7
     sub    r4, r2
     lea    r6, [r4+r3]
 .height_loop:
@@ -249,7 +248,7 @@ cglobal x264_pixel_avg2_w16_%1, 6,7
     jg     .height_loop
     REP_RET
 
-cglobal x264_pixel_avg2_w20_%1, 6,7
+cglobal x264_pixel_avg2_w20_sse2, 6,7
     sub    r4, r2
     lea    r6, [r4+r3]
 .height_loop:
@@ -272,12 +271,6 @@ cglobal x264_pixel_avg2_w20_%1, 6,7
     sub    r5d, 2
     jg     .height_loop
     REP_RET
-%endmacro
-
-PIXEL_AVG_SSE sse2
-%define movdqu lddqu
-PIXEL_AVG_SSE sse3
-%undef movdqu
 
 ; Cacheline split code for processors with high latencies for loads
 ; split over cache lines.  See sad-a.asm for a more detailed explanation.
index dcb89db0a5cf98068beafaac8a390090f60ef7fe..4a6194ae248acd45ea7d04d5930b58675f9239dc 100644 (file)
@@ -69,7 +69,6 @@ PIXEL_AVG_WALL(cache32_mmxext)
 PIXEL_AVG_WALL(cache64_mmxext)
 PIXEL_AVG_WALL(cache64_sse2)
 PIXEL_AVG_WALL(sse2)
-PIXEL_AVG_WALL(sse3)
 
 #define AVG_WEIGHT(W,H) \
 void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
@@ -104,7 +103,6 @@ PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_m
 PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
 PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
 PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
-PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
 
 #define MC_COPY_WTAB(instr, name1, name2, name3)\
 static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -118,7 +116,6 @@ static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, i
 
 MC_COPY_WTAB(mmx,mmx,mmx,mmx)
 MC_COPY_WTAB(sse2,mmx,mmx,sse2)
-MC_COPY_WTAB(sse3,mmx,mmx,sse3)
 
 static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
@@ -153,7 +150,6 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
 #endif
 MC_LUMA(sse2,sse2,sse2)
 MC_LUMA(cache64_sse2,cache64_sse2,sse2)
-MC_LUMA(cache64_sse3,cache64_sse3,sse3)
 
 #define GET_REF(name)\
 uint8_t *get_ref_##name( uint8_t *dst,   int *i_dst_stride,\
@@ -186,7 +182,6 @@ GET_REF(cache64_mmxext)
 #endif
 GET_REF(sse2)
 GET_REF(cache64_sse2)
-GET_REF(cache64_sse3)
 
 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
 void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -270,7 +265,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->mc_luma = mc_luma_cache32_mmxext;
         pf->get_ref = get_ref_cache32_mmxext;
     }
-    else if( cpu&X264_CPU_CACHELINE_SPLIT )
+    else if( cpu&X264_CPU_CACHELINE_64 )
     {
         pf->mc_luma = mc_luma_cache64_mmxext;
         pf->get_ref = get_ref_cache64_mmxext;
@@ -284,26 +279,22 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->memzero_aligned = x264_memzero_aligned_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
 
-    // disable on AMD processors since it is slower
-    if( cpu&X264_CPU_3DNOW )
+    if( cpu&X264_CPU_SSE2_IS_SLOW )
         return;
 
-    pf->mc_luma = mc_luma_sse2;
-    pf->get_ref = get_ref_sse2;
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
     pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2;
 
-    if( cpu&X264_CPU_CACHELINE_SPLIT )
+    if( cpu&X264_CPU_SSE2_IS_FAST )
     {
-        pf->mc_luma = mc_luma_cache64_sse2;
-        pf->get_ref = get_ref_cache64_sse2;
-        /* lddqu doesn't work on Core2 */
-        if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
+        pf->mc_luma = mc_luma_sse2;
+        pf->get_ref = get_ref_sse2;
+        if( cpu&X264_CPU_CACHELINE_64 )
         {
-            pf->mc_luma = mc_luma_cache64_sse3;
-            pf->get_ref = get_ref_cache64_sse3;
+            pf->mc_luma = mc_luma_cache64_sse2;
+            pf->get_ref = get_ref_cache64_sse2;
         }
     }
 
index 18a115cbafe9d0de218197d3a58ac6167f7953c6..ce671c3a6a139988fde9d8cae5ef4f2a3a215557 100644 (file)
@@ -505,11 +505,13 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
     pf[I_PRED_16x16_DC]      = predict_16x16_dc_mmxext;
     pf[I_PRED_16x16_DC_TOP]  = predict_16x16_dc_top_mmxext;
     pf[I_PRED_16x16_P]       = predict_16x16_p_mmxext;
-    if( !(cpu&X264_CPU_SSE2) || (cpu&X264_CPU_3DNOW) )
+    if( !(cpu&X264_CPU_SSE2) )
         return;
     pf[I_PRED_16x16_DC]     = predict_16x16_dc_sse2;
-    pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
     pf[I_PRED_16x16_V]      = predict_16x16_v_sse2;
+    if( cpu&X264_CPU_SSE2_IS_SLOW )
+        return;
+    pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
     pf[I_PRED_16x16_P]      = predict_16x16_p_sse2;
 }
 
index 636daa8f194c096c96fdc18e80801579e1a4d1bf..cffaeeb6aa9f0d2aedb6e82c1f9908f5b200cce6 100644 (file)
@@ -660,9 +660,17 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
 
     p = buf + sprintf( buf, "using cpu capabilities:" );
     for( i=0; x264_cpu_names[i].flags; i++ )
+    {
+        if( !strcmp(x264_cpu_names[i].name, "SSE2")
+            && param->cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
+            continue;
+        if( !strcmp(x264_cpu_names[i].name, "SSE3")
+            && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
+            continue;
         if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
             && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
             p += sprintf( p, " %s", x264_cpu_names[i].name );
+    }
     if( !param->cpu )
         p += sprintf( p, " none!" );
     x264_log( h, X264_LOG_INFO, "%s\n", buf );
index 115e2217d9086688e45841a19c8f3f76f86e8ee8..c2c16618a971446395539e0e41f3d81dc4a7287a 100644 (file)
@@ -120,9 +120,11 @@ static void print_bench(void)
             for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
             if( k<j ) continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
-                    b->cpu&X264_CPU_SSE4 ? "sse4" :
+                    b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
                     b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                     b->cpu&X264_CPU_SSE3 ? "sse3" :
+                    /* print sse2slow only if there's also a sse2fast version of the same func */
+                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
                     b->cpu&X264_CPU_SSE2 ? "sse2" :
                     b->cpu&X264_CPU_MMX ? "mmx" : "c",
                     b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
@@ -1112,6 +1114,8 @@ int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
 {
     *cpu_ref = *cpu_new;
     *cpu_new |= flags;
+    if( *cpu_new & X264_CPU_SSE2_IS_FAST )
+        *cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
     if( !quiet )
         fprintf( stderr, "x264: %s\n", name );
     return check_all_funcs( *cpu_ref, *cpu_new );
@@ -1124,29 +1128,28 @@ int check_all_flags( void )
 #ifdef HAVE_MMX
     if( x264_cpu_detect() & X264_CPU_MMXEXT )
     {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMXEXT" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "MMXEXT Cache64" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMX" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
         cpu1 &= ~X264_CPU_CACHELINE_64;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32, "MMXEXT Cache32" );
+#ifdef ARCH_X86
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
+        cpu1 &= ~X264_CPU_CACHELINE_32;
+#endif
     }
     if( x264_cpu_detect() & X264_CPU_SSE2 )
     {
-        cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32);
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2, "SSE2" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSE2 Cache64" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
     }
     if( x264_cpu_detect() & X264_CPU_SSE3 )
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3, "SSE3" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
     if( x264_cpu_detect() & X264_CPU_SSSE3 )
     {
-        cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
+        cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
-    }
-    if( x264_cpu_detect() & X264_CPU_SSSE3 )
-    {
-        cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
     }
 #elif ARCH_PPC
     if( x264_cpu_detect() & X264_CPU_ALTIVEC )
diff --git a/x264.h b/x264.h
index 7b3904951c68593c708e0d50a02f5fd9471e16d3..0e257a1939df573e6a45d9287e18d36e46a4b86d 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 59
+#define X264_BUILD 60
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -46,19 +46,19 @@ typedef struct x264_t x264_t;
  ****************************************************************************/
 /* CPU flags
  */
-#define X264_CPU_MMX        0x000001    /* mmx */
-#define X264_CPU_MMXEXT     0x000002    /* mmx-ext*/
-#define X264_CPU_SSE        0x000004    /* sse */
-#define X264_CPU_SSE2       0x000008    /* sse 2 */
-#define X264_CPU_3DNOW      0x000010    /* 3dnow! */
-#define X264_CPU_3DNOWEXT   0x000020    /* 3dnow! ext */
-#define X264_CPU_ALTIVEC    0x000040    /* altivec */
-#define X264_CPU_SSE3       0x000080    /* sse 3 */
-#define X264_CPU_SSSE3      0x000100    /* ssse 3 */
-#define X264_CPU_CACHELINE_SPLIT 0x200  /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_32 0x0400    /* size of a cacheline in bytes */
-#define X264_CPU_CACHELINE_64 0x0800
-#define X264_CPU_SSE4       0x001000    /* sse 4.1 */
+#define X264_CPU_CACHELINE_32   0x000001  /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64   0x000002  /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_ALTIVEC        0x000004
+#define X264_CPU_MMX            0x000008
+#define X264_CPU_MMXEXT         0x000010  /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_SSE            0x000020
+#define X264_CPU_SSE2           0x000040
+#define X264_CPU_SSE2_IS_SLOW   0x000080  /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST   0x000100  /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SSE3           0x000200
+#define X264_CPU_SSSE3          0x000400
+#define X264_CPU_PHADD_IS_FAST  0x000800  /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
+#define X264_CPU_SSE4           0x001000  /* SSE4.1 */
 
 /* Analyse flags
  */