From: Loren Merritt
Date: Sat, 7 Jun 2008 05:31:22 +0000 (-0600)
Subject: many changes to which asm functions are enabled on which cpus.
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=c0c0e1f48de74acec0b681bfa842d3c8cddb4a32;p=x264

many changes to which asm functions are enabled on which cpus.

with Phenom, 3dnow is no longer equivalent to "sse2 is slow", so make a new flag for that.
some sse2 functions are useful only on Core2 and Phenom, so make a "sse2 is fast" flag for that.
some ssse3 instructions didn't become useful until Penryn, so yet another flag.
disable sse2 completely on Pentium M and Core1, because it's uniformly slower than mmx.
enable some sse2 functions on Athlon64 that always were faster and we just didn't notice.
remove mc_luma_sse3, because the only cpu that has lddqu (namely Pentium 4D) doesn't have "sse2 is fast".
don't print mmx1, sse1, nor 3dnow in the detected cpuflags, since we don't really have any such functions.
likewise don't print sse3 unless it's used (Pentium 4D).
---

diff --git a/common/cpu.c b/common/cpu.c
index 3ebe970f..ed72c649 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -34,24 +34,23 @@
 #endif
 
 #include "common.h"
+#include "cpu.h"
 
-const struct {
-    const char name[8];
-    int flags;
-} x264_cpu_names[] = {
-    {"MMX",     X264_CPU_MMX},
+const x264_cpu_name_t x264_cpu_names[] = {
+    {"Altivec", X264_CPU_ALTIVEC},
+//  {"MMX",     X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
     {"MMX2",    X264_CPU_MMX|X264_CPU_MMXEXT},
     {"MMXEXT",  X264_CPU_MMX|X264_CPU_MMXEXT},
-    {"SSE",     X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE},
-    {"SSE1",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE},
+//  {"SSE",     X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264
+    {"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW},
     {"SSE2",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
+    {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
     {"SSE3",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
     {"SSSE3",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
+    {"PHADD",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
     {"SSE4",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
-    {"3DNow",   X264_CPU_3DNOW},
-    {"Altivec", X264_CPU_ALTIVEC},
-    {"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
-    {"Cache64", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64},
+    {"Cache32", X264_CPU_CACHELINE_32},
+    {"Cache64", X264_CPU_CACHELINE_64},
     {"", 0},
 };
 
@@ -92,57 +91,87 @@ uint32_t x264_cpu_detect( void )
     if( ecx&0x00080000 )
         cpu |= X264_CPU_SSE4;
 
+    if( cpu & X264_CPU_SSSE3 )
+        cpu |= X264_CPU_SSE2_IS_FAST;
+    if( cpu & X264_CPU_SSE4 )
+        cpu |= X264_CPU_PHADD_IS_FAST;
+
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
 
     if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
     {
         x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
-        if( edx&0x80000000 )
-            cpu |= X264_CPU_3DNOW;
         if( edx&0x00400000 )
             cpu |= X264_CPU_MMXEXT;
+        if( cpu & X264_CPU_SSE2 )
+        {
+            if( ecx&0x00000040 ) /* SSE4a */
+                cpu |= X264_CPU_SSE2_IS_FAST;
+            else
+                cpu |= X264_CPU_SSE2_IS_SLOW;
+        }
     }
 
-    if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
-        cpu |= X264_CPU_CACHELINE_SPLIT;
-    /* cacheline size is specified in 3 places, any of which may be missing */
-    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
-    cache = (ebx&0xff00)>>5; // cflush size
-    if( !cache && max_extended_cap >= 0x80000006 )
+    if( !strcmp((char*)vendor, "GenuineIntel") )
     {
-        x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
-        cache = ecx&0xff; // cacheline size
+        int family, model, stepping;
+        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
+        family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+        model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
+        stepping = eax&0xf;
+        /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
+         * theoretically support sse2, but it's significantly slower than mmx for
+         * almost all of x264's functions, so let's just pretend they don't. */
+        if( family==6 && (model==9 || model==13 || model==14) )
+        {
+            cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
+            assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
+        }
     }
-    if( !cache )
+
+    if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
     {
-        // Cache and TLB Information
-        static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
-        static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
-        uint32_t buf[4];
-        int max, i=0, j;
-        do {
-            x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
-            max = buf[0]&0xff;
-            buf[0] &= ~0xff;
-            for(j=0; j<4; j++)
-                if( !(buf[j]>>31) )
-                    while( buf[j] )
-                    {
-                        if( strchr( cache32_ids, buf[j]&0xff ) )
-                            cache = 32;
-                        if( strchr( cache64_ids, buf[j]&0xff ) )
-                            cache = 64;
-                        buf[j] >>= 8;
-                    }
-        } while( ++i < max );
+        /* cacheline size is specified in 3 places, any of which may be missing */
+        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
+        cache = (ebx&0xff00)>>5; // cflush size
+        if( !cache && max_extended_cap >= 0x80000006 )
+        {
+            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
+            cache = ecx&0xff; // cacheline size
+        }
+        if( !cache )
+        {
+            // Cache and TLB Information
+            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
+            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
+            uint32_t buf[4];
+            int max, i=0, j;
+            do {
+                x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
+                max = buf[0]&0xff;
+                buf[0] &= ~0xff;
+                for(j=0; j<4; j++)
+                    if( !(buf[j]>>31) )
+                        while( buf[j] )
+                        {
+                            if( strchr( cache32_ids, buf[j]&0xff ) )
+                                cache = 32;
+                            if( strchr( cache64_ids, buf[j]&0xff ) )
+                                cache = 64;
+                            buf[j] >>= 8;
+                        }
+            } while( ++i < max );
+        }
+
+        if( cache == 32 )
+            cpu |= X264_CPU_CACHELINE_32;
+        else if( cache == 64 )
+            cpu |= X264_CPU_CACHELINE_64;
+        else
+            fprintf( stderr, "x264 [warning]: unable to determine cacheline size\n" );
     }
-    if( cache == 32 )
-        cpu |= X264_CPU_CACHELINE_32;
-    if( cache == 64 )
-        cpu |= X264_CPU_CACHELINE_64;
-
     return cpu;
 }
diff --git a/common/cpu.h b/common/cpu.h
index 6a669735..1871e3a2 100644
--- a/common/cpu.h
+++ b/common/cpu.h
@@ -42,9 +42,10 @@ void x264_stack_align( void (*func)(x264_t*), x264_t *arg );
 #define x264_stack_align(func,arg) func(arg)
 #endif
 
-extern const struct {
-    const char name[8];
+typedef struct {
+    const char name[12];
     int flags;
-} x264_cpu_names[];
+} x264_cpu_name_t;
+extern const x264_cpu_name_t x264_cpu_names[];
 
 #endif
diff --git a/common/dct.c b/common/dct.c
index 669e24f3..1815fc3d 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -394,20 +394,18 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     if( cpu&X264_CPU_MMX )
     {
         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
-        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
-
         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
-        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
-        dctf->add16x16_idct = x264_add16x16_idct_mmx;
-
         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 
 #ifndef ARCH_X86_64
+        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
+        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
+        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
+        dctf->add16x16_idct = x264_add16x16_idct_mmx;
+
         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
-
         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 #endif
@@ -419,9 +417,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
-    }
-    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
-    {
+
         dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
diff --git a/common/frame.c b/common/frame.c
index 70bcf8a0..a1a36b55 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -44,11 +44,10 @@ x264_frame_t *x264_frame_new( x264_t *h )
     if( h->param.b_interlaced )
         i_lines = ( i_lines + 31 ) & -32;
 
-    if( h->param.cpu&X264_CPU_CACHELINE_SPLIT )
-    {
-        int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64;
-        i_stride = (i_stride + align-1) & -align;
-    }
+    if( h->param.cpu&X264_CPU_CACHELINE_64 )
+        i_stride = (i_stride + 63) & ~63;
+    else if( h->param.cpu&X264_CPU_CACHELINE_32 )
+        i_stride = (i_stride + 31) & ~31;
 
     frame->i_plane = 3;
     for( i = 0; i < 3; i++ )
diff --git a/common/pixel.c b/common/pixel.c
index 0d00b6e5..11d74a0e 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -557,23 +557,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
         pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
 
-        if( cpu&X264_CPU_CACHELINE_SPLIT )
+        if( cpu&X264_CPU_CACHELINE_32 )
         {
-            if( cpu&X264_CPU_CACHELINE_32 )
-            {
-                INIT5( sad, _cache32_mmxext );
-                INIT4( sad_x3, _cache32_mmxext );
-                INIT4( sad_x4, _cache32_mmxext );
-            }
-            else
-            {
-                INIT5( sad, _cache64_mmxext );
-                INIT4( sad_x3, _cache64_mmxext );
-                INIT4( sad_x4, _cache64_mmxext );
-            }
+            INIT5( sad, _cache32_mmxext );
+            INIT4( sad_x3, _cache32_mmxext );
+            INIT4( sad_x4, _cache32_mmxext );
+        }
+        else if( cpu&X264_CPU_CACHELINE_64 )
+        {
+            INIT5( sad, _cache64_mmxext );
+            INIT4( sad_x3, _cache64_mmxext );
+            INIT4( sad_x4, _cache64_mmxext );
         }
 #else
-        if( cpu&X264_CPU_CACHELINE_SPLIT )
+        if( cpu&X264_CPU_CACHELINE_64 )
         {
             pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
             pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmxext;
@@ -589,19 +586,15 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmxext;
     }
 
-    // disable on AMD processors since it is slower
-    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
+    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
     {
         INIT2( sad, _sse2 );
         INIT2( sad_x3, _sse2 );
         INIT2( sad_x4, _sse2 );
-        INIT5( satd, _sse2 );
-        INIT5( satd_x3, _sse2 );
-        INIT5( satd_x4, _sse2 );
         INIT_ADS( _sse2 );
 
 #ifdef ARCH_X86
-        if( cpu&X264_CPU_CACHELINE_SPLIT )
+        if( cpu&X264_CPU_CACHELINE_64 )
        {
             INIT2( sad, _cache64_sse2 );
             INIT2( sad_x3, _cache64_sse2 );
@@ -609,10 +602,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         }
 #endif
     }
-    // these are faster on both Intel and AMD
     if( cpu&X264_CPU_SSE2 )
     {
         INIT5( ssd, _sse2 );
+        INIT5( satd, _sse2 );
+        INIT5( satd_x3, _sse2 );
+        INIT5( satd_x4, _sse2 );
         pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
         pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
@@ -622,7 +617,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #endif
     }
 
-    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) )
+    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
     {
         INIT2( sad, _sse3 );
         INIT2( sad_x3, _sse3 );
@@ -643,20 +638,18 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #ifdef ARCH_X86_64
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
 #endif
-        if( cpu&X264_CPU_CACHELINE_SPLIT )
+        if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_ssse3 );
             INIT2( sad_x3, _cache64_ssse3 );
             INIT2( sad_x4, _cache64_ssse3 );
         }
-    }
-
-    if( cpu&X264_CPU_SSE4 )
-    {
-        // enabled on Penryn, but slower on Conroe
-        INIT5( satd, _ssse3_phadd );
-        INIT5( satd_x3, _ssse3_phadd );
-        INIT5( satd_x4, _ssse3_phadd );
+        if( cpu&X264_CPU_PHADD_IS_FAST )
+        {
+            INIT5( satd, _ssse3_phadd );
+            INIT5( satd_x3, _ssse3_phadd );
+            INIT5( satd_x4, _ssse3_phadd );
+        }
     }
 #endif //HAVE_MMX
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 77baddaa..525f94a3 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -283,16 +283,12 @@ cglobal %1, 2,2,1
     jmp %2
 %endmacro
 
+%ifndef ARCH_X86_64
 SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
 ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
-
 SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
 
-SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
-
-%ifndef ARCH_X86_64
 cextern x264_sub8x8_dct8_mmx.skip_prologue
 cextern x264_add8x8_idct8_mmx.skip_prologue
 SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
@@ -301,6 +297,9 @@ ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 1
 %define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
 %endif
 
+SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
+ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
+
 cextern x264_sub8x8_dct8_sse2
 cextern x264_add8x8_idct8_sse2
 SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2, 128, 8, 0, 0
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index eadb6b5a..3dabe9f6 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -230,8 +230,7 @@ cglobal x264_pixel_avg2_w20_mmxext, 6,7
     jg     .height_loop
     REP_RET
 
-%macro PIXEL_AVG_SSE 1
-cglobal x264_pixel_avg2_w16_%1, 6,7
+cglobal x264_pixel_avg2_w16_sse2, 6,7
     sub    r4, r2
     lea    r6, [r4+r3]
 .height_loop:
@@ -249,7 +248,7 @@ cglobal x264_pixel_avg2_w16_%1, 6,7
     jg     .height_loop
     REP_RET
 
-cglobal x264_pixel_avg2_w20_%1, 6,7
+cglobal x264_pixel_avg2_w20_sse2, 6,7
     sub    r4, r2
     lea    r6, [r4+r3]
 .height_loop:
@@ -272,12 +271,6 @@ cglobal x264_pixel_avg2_w20_%1, 6,7
     sub    r5d, 2
     jg     .height_loop
     REP_RET
-%endmacro
-
-PIXEL_AVG_SSE sse2
-%define movdqu lddqu
-PIXEL_AVG_SSE sse3
-%undef movdqu
 
 ; Cacheline split code for processors with high latencies for loads
 ; split over cache lines. See sad-a.asm for a more detailed explanation.
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index dcb89db0..4a6194ae 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -69,7 +69,6 @@ PIXEL_AVG_WALL(cache32_mmxext)
 PIXEL_AVG_WALL(cache64_mmxext)
 PIXEL_AVG_WALL(cache64_sse2)
 PIXEL_AVG_WALL(sse2)
-PIXEL_AVG_WALL(sse3)
 
 #define AVG_WEIGHT(W,H) \
 void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
@@ -104,7 +103,6 @@ PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_m
 PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
 PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
 PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
-PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
 
 #define MC_COPY_WTAB(instr, name1, name2, name3)\
 static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -118,7 +116,6 @@ static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, i
 
 MC_COPY_WTAB(mmx,mmx,mmx,mmx)
 MC_COPY_WTAB(sse2,mmx,mmx,sse2)
-MC_COPY_WTAB(sse3,mmx,mmx,sse3)
 
 static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
@@ -153,7 +150,6 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
 #endif
 MC_LUMA(sse2,sse2,sse2)
 MC_LUMA(cache64_sse2,cache64_sse2,sse2)
-MC_LUMA(cache64_sse3,cache64_sse3,sse3)
 
 #define GET_REF(name)\
 uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
@@ -186,7 +182,6 @@ GET_REF(cache64_mmxext)
 #endif
 GET_REF(sse2)
 GET_REF(cache64_sse2)
-GET_REF(cache64_sse3)
 
 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
 void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -270,7 +265,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->mc_luma = mc_luma_cache32_mmxext;
         pf->get_ref = get_ref_cache32_mmxext;
     }
-    else if( cpu&X264_CPU_CACHELINE_SPLIT )
+    else if( cpu&X264_CPU_CACHELINE_64 )
     {
         pf->mc_luma = mc_luma_cache64_mmxext;
         pf->get_ref = get_ref_cache64_mmxext;
@@ -284,26 +279,22 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->memzero_aligned = x264_memzero_aligned_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
 
-    // disable on AMD processors since it is slower
-    if( cpu&X264_CPU_3DNOW )
+    if( cpu&X264_CPU_SSE2_IS_SLOW )
         return;
 
-    pf->mc_luma = mc_luma_sse2;
-    pf->get_ref = get_ref_sse2;
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
     pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2;
 
-    if( cpu&X264_CPU_CACHELINE_SPLIT )
+    if( cpu&X264_CPU_SSE2_IS_FAST )
     {
-        pf->mc_luma = mc_luma_cache64_sse2;
-        pf->get_ref = get_ref_cache64_sse2;
-        /* lddqu doesn't work on Core2 */
-        if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
+        pf->mc_luma = mc_luma_sse2;
+        pf->get_ref = get_ref_sse2;
+        if( cpu&X264_CPU_CACHELINE_64 )
         {
-            pf->mc_luma = mc_luma_cache64_sse3;
-            pf->get_ref = get_ref_cache64_sse3;
+            pf->mc_luma = mc_luma_cache64_sse2;
+            pf->get_ref = get_ref_cache64_sse2;
         }
     }
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 18a115cb..ce671c3a 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -505,11 +505,13 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
     pf[I_PRED_16x16_DC]      = predict_16x16_dc_mmxext;
     pf[I_PRED_16x16_DC_TOP]  = predict_16x16_dc_top_mmxext;
     pf[I_PRED_16x16_P]       = predict_16x16_p_mmxext;
-    if( !(cpu&X264_CPU_SSE2) || (cpu&X264_CPU_3DNOW) )
+    if( !(cpu&X264_CPU_SSE2) )
         return;
     pf[I_PRED_16x16_DC]      = predict_16x16_dc_sse2;
-    pf[I_PRED_16x16_DC_TOP]  = predict_16x16_dc_top_sse2;
     pf[I_PRED_16x16_V]       = predict_16x16_v_sse2;
+    if( cpu&X264_CPU_SSE2_IS_SLOW )
+        return;
+    pf[I_PRED_16x16_DC_TOP]  = predict_16x16_dc_top_sse2;
     pf[I_PRED_16x16_P]       = predict_16x16_p_sse2;
 }
 
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 636daa8f..cffaeeb6 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -660,9 +660,17 @@ x264_t *x264_encoder_open ( x264_param_t *param )
 
     p = buf + sprintf( buf, "using cpu capabilities:" );
     for( i=0; x264_cpu_names[i].flags; i++ )
+    {
+        if( !strcmp(x264_cpu_names[i].name, "SSE2")
+            && param->cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
+            continue;
+        if( !strcmp(x264_cpu_names[i].name, "SSE3")
+            && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
+            continue;
         if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
             && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
             p += sprintf( p, " %s", x264_cpu_names[i].name );
+    }
     if( !param->cpu )
         p += sprintf( p, " none!" );
     x264_log( h, X264_LOG_INFO, "%s\n", buf );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 115e2217..c2c16618 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -120,9 +120,11 @@ static void print_bench(void)
             for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
             if( k<j )
                 continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
                     b->cpu&X264_CPU_SSE4 ? "sse4" :
+                    b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
                     b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                     b->cpu&X264_CPU_SSE3 ? "sse3" :
+                    /* print sse2slow only if there's also a sse2fast version of the same func */
+                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && !(benchs[i].vers[j+1].cpu&X264_CPU_SSE2_IS_SLOW) ? "sse2slow" :
                     b->cpu&X264_CPU_SSE2 ? "sse2" :
                     b->cpu&X264_CPU_MMX ? "mmx" : "c",
                     b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
"_c32" : @@ -1112,6 +1114,8 @@ int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name ) { *cpu_ref = *cpu_new; *cpu_new |= flags; + if( *cpu_new & X264_CPU_SSE2_IS_FAST ) + *cpu_new &= ~X264_CPU_SSE2_IS_SLOW; if( !quiet ) fprintf( stderr, "x264: %s\n", name ); return check_all_funcs( *cpu_ref, *cpu_new ); @@ -1124,29 +1128,28 @@ int check_all_flags( void ) #ifdef HAVE_MMX if( x264_cpu_detect() & X264_CPU_MMXEXT ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMXEXT" ); - ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "MMXEXT Cache64" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMX" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" ); cpu1 &= ~X264_CPU_CACHELINE_64; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32, "MMXEXT Cache32" ); +#ifdef ARCH_X86 + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" ); + cpu1 &= ~X264_CPU_CACHELINE_32; +#endif } if( x264_cpu_detect() & X264_CPU_SSE2 ) { - cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32); - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2, "SSE2" ); - ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSE2 Cache64" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" ); } if( x264_cpu_detect() & X264_CPU_SSE3 ) - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3, "SSE3" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" ); if( x264_cpu_detect() & X264_CPU_SSSE3 ) { - cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64); + cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" ); - ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); - } - if( x264_cpu_detect() & X264_CPU_SSSE3 ) - { - cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64); - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" ); } #elif ARCH_PPC if( x264_cpu_detect() & X264_CPU_ALTIVEC ) diff --git a/x264.h b/x264.h index 7b390495..0e257a19 100644 --- a/x264.h +++ b/x264.h @@ -35,7 +35,7 @@ #include -#define X264_BUILD 59 +#define X264_BUILD 60 /* x264_t: * opaque handler for encoder */ @@ -46,19 +46,19 @@ typedef struct x264_t x264_t; ****************************************************************************/ /* CPU flags */ -#define X264_CPU_MMX 0x000001 /* mmx */ -#define X264_CPU_MMXEXT 0x000002 /* mmx-ext*/ -#define X264_CPU_SSE 0x000004 /* sse */ -#define X264_CPU_SSE2 0x000008 /* sse 2 */ -#define X264_CPU_3DNOW 0x000010 /* 3dnow! */ -#define X264_CPU_3DNOWEXT 0x000020 /* 3dnow! 
-#define X264_CPU_ALTIVEC        0x000040    /* altivec */
-#define X264_CPU_SSE3           0x000080    /* sse 3 */
-#define X264_CPU_SSSE3          0x000100    /* ssse 3 */
-#define X264_CPU_CACHELINE_SPLIT 0x200      /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_32   0x0400      /* size of a cacheline in bytes */
-#define X264_CPU_CACHELINE_64   0x0800
-#define X264_CPU_SSE4           0x001000    /* sse 4.1 */
+#define X264_CPU_CACHELINE_32   0x000001    /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64   0x000002    /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_ALTIVEC        0x000004
+#define X264_CPU_MMX            0x000008
+#define X264_CPU_MMXEXT         0x000010    /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_SSE            0x000020
+#define X264_CPU_SSE2           0x000040
+#define X264_CPU_SSE2_IS_SLOW   0x000080    /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST   0x000100    /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SSE3           0x000200
+#define X264_CPU_SSSE3          0x000400
+#define X264_CPU_PHADD_IS_FAST  0x000800    /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
+#define X264_CPU_SSE4           0x001000    /* SSE4.1 */
 
 /* Analyse flags
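
As an aside on the flag semantics introduced by this commit: SSE2 support is now split into three tiers (plain SSE2, SSE2_IS_SLOW, SSE2_IS_FAST) plus a separate PHADD_IS_FAST bit, and the init functions in pixel.c and mc-c.c key off those tiers rather than off the old 3dnow/vendor heuristic. The following standalone sketch is not part of the patch; describe() and the sample flag words are hypothetical, and only the flag values are copied from the new x264.h. It illustrates which groups of asm functions each tier would enable:

    #include <stdio.h>

    /* flag values copied from the new x264.h above */
    #define X264_CPU_SSE2           0x000040
    #define X264_CPU_SSE2_IS_SLOW   0x000080
    #define X264_CPU_SSE2_IS_FAST   0x000100
    #define X264_CPU_SSSE3          0x000400
    #define X264_CPU_PHADD_IS_FAST  0x000800

    static void describe( unsigned int cpu )
    {
        /* bulk of the sse2 functions: skipped when "sse2 is slow" (pre-SSE4a Athlon64) */
        if( (cpu & X264_CPU_SSE2) && !(cpu & X264_CPU_SSE2_IS_SLOW) )
            printf( "  enable baseline sse2 functions\n" );
        /* a few functions that only win on Core2/Phenom, e.g. mc_luma_sse2/get_ref_sse2 */
        if( cpu & X264_CPU_SSE2_IS_FAST )
            printf( "  enable sse2-is-fast-only functions\n" );
        /* phadd-based satd: gated on Penryn, not merely on the ssse3/sse4 cpuid bits */
        if( (cpu & X264_CPU_SSSE3) && (cpu & X264_CPU_PHADD_IS_FAST) )
            printf( "  enable _ssse3_phadd satd\n" );
    }

    int main( void )
    {
        printf( "Athlon64 (sse2 slow):\n" );
        describe( X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW );
        printf( "Penryn (sse2 fast, fast phadd):\n" );
        describe( X264_CPU_SSE2 | X264_CPU_SSE2_IS_FAST | X264_CPU_SSSE3 | X264_CPU_PHADD_IS_FAST );
        return 0;
    }

The Pentium-M/Core1 special case in cpu.c sits upstream of this kind of dispatch: it clears the SSE2/SSE3 bits during detection, so init code of this shape never sees them on those cpus.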