}
#if HAVE_MMX
-uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
+uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif
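
For reference, every variant above implements the same H.264 emulation-prevention rule: whenever two consecutive zero bytes have been written and the next payload byte is 0x03 or less, a 0x03 byte is inserted so the payload can never imitate a start code. A scalar sketch of that rule, in the spirit of the C fallback these functions replace (illustrative, not the exact upstream source):

#include <stdint.h>

/* Copy src..end to dst, inserting an emulation-prevention 0x03 after
 * any pair of zero output bytes that precedes a byte <= 0x03. */
static uint8_t *nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
{
    if( src < end ) *dst++ = *src++;
    if( src < end ) *dst++ = *src++;
    while( src < end )
    {
        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
            *dst++ = 0x03;
        *dst++ = *src++;
    }
    return dst;
}
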
{
pf->nal_escape = x264_nal_escape_c;
#if HAVE_MMX
- if( cpu&X264_CPU_MMXEXT )
- pf->nal_escape = x264_nal_escape_mmxext;
+ if( cpu&X264_CPU_MMX2 )
+ pf->nal_escape = x264_nal_escape_mmx2;
if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
pf->nal_escape = x264_nal_escape_sse2;
if( cpu&X264_CPU_AVX )
const x264_cpu_name_t x264_cpu_names[] =
{
- {"Altivec", X264_CPU_ALTIVEC},
-// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
- {"MMX2", X264_CPU_MMX|X264_CPU_MMXEXT},
- {"MMXEXT", X264_CPU_MMX|X264_CPU_MMXEXT},
-// {"SSE", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264
- {"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW},
- {"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
- {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
- {"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
- {"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
- {"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
- {"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
- {"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
- {"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
- {"AVX", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
- {"Cache32", X264_CPU_CACHELINE_32},
- {"Cache64", X264_CPU_CACHELINE_64},
- {"SSEMisalign", X264_CPU_SSE_MISALIGN},
- {"LZCNT", X264_CPU_LZCNT},
+ {"Altivec", X264_CPU_ALTIVEC},
+// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
+ {"MMX2", X264_CPU_MMX|X264_CPU_MMX2},
+ {"MMXEXT", X264_CPU_MMX|X264_CPU_MMX2},
+// {"SSE", X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE}, // there are no sse1 functions in x264
+#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2
+ {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
+ {"SSE2", SSE2},
+ {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
+ {"SSE3", SSE2|X264_CPU_SSE3},
+ {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
+ {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST},
+ {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+ {"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+ {"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
+ {"AVX", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
+#undef SSE2
+ {"Cache32", X264_CPU_CACHELINE_32},
+ {"Cache64", X264_CPU_CACHELINE_64},
+ {"SSEMisalign", X264_CPU_SSE_MISALIGN},
+ {"LZCNT", X264_CPU_LZCNT},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
- {"ARMv6", X264_CPU_ARMV6},
- {"NEON", X264_CPU_NEON},
- {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
- {"SlowCTZ", X264_CPU_SLOW_CTZ},
- {"SlowAtom", X264_CPU_SLOW_ATOM},
+ {"ARMv6", X264_CPU_ARMV6},
+ {"NEON", X264_CPU_NEON},
+ {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
+ {"SlowCTZ", X264_CPU_SLOW_CTZ},
+ {"SlowAtom", X264_CPU_SLOW_ATOM},
{"", 0},
};
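
The SSE2 define above is only there to keep the table readable: each entry maps a user-visible name to the full set of flags that capability implies, so parsing "SSE3" enables MMX through SSE3 in one step (and "MMX2"/"MMXEXT" remain as aliases for compatibility). A hypothetical consumer of the table, assuming its entries expose name/flags fields and end at the {"",0} sentinel shown above:

#include <strings.h>  /* strcasecmp */

/* Return the OR'd flag set for a named capability, or 0 if unknown. */
static int cpu_flags_from_name( const char *name )
{
    for( int i = 0; x264_cpu_names[i].flags; i++ )
        if( !strcasecmp( x264_cpu_names[i].name, name ) )
            return x264_cpu_names[i].flags;
    return 0;
}
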
else
return 0;
if( edx&0x02000000 )
- cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
+ cpu |= X264_CPU_MMX2|X264_CPU_SSE;
if( edx&0x04000000 )
cpu |= X264_CPU_SSE2;
if( ecx&0x00000001 )
cpu |= X264_CPU_SLOW_CTZ;
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
if( edx&0x00400000 )
- cpu |= X264_CPU_MMXEXT;
+ cpu |= X264_CPU_MMX2;
if( cpu & X264_CPU_SSE2 )
{
if( ecx&0x00000040 ) /* SSE4a */
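
The hex masks are standard CPUID feature bits: leaf 1 EDX bit 25 (0x02000000) is SSE, which also implies the MMX2 integer extensions, bit 26 (0x04000000) is SSE2, and extended leaf 0x80000001 EDX bit 22 (0x00400000) is AMD's MMX extensions. The same checks written with explicit bit positions, reusing the cpuid wrapper from above:

uint32_t eax, ebx, ecx, edx, cpu = 0;
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
if( edx & (1<<25) )  cpu |= X264_CPU_MMX2|X264_CPU_SSE; /* SSE implies MMX2's integer ops */
if( edx & (1<<26) )  cpu |= X264_CPU_SSE2;
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
if( edx & (1<<22) )  cpu |= X264_CPU_MMX2;              /* AMD MMX extensions */
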
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
- dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
#if !ARCH_X86_64
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
- if( cpu&X264_CPU_MMXEXT )
+ if( cpu&X264_CPU_MMX2 )
{
- pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
- pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
- pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
+ pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
}
if( cpu&X264_CPU_SSE2_IS_FAST )
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
- int mvy_limit, int bframe );
-void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
- int mvy_limit, int bframe );
-void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
- int mvy_limit, int bframe );
-void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
- int mvy_limit, int bframe );
+void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
+void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
+void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
+void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
#if ARCH_X86
-void x264_deblock_h_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_intra_mmxext( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_mmxext( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mmxext( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
#if HIGH_BIT_DEPTH
-void x264_deblock_v_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_luma_intra_mmxext( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
#else
// FIXME this wrapper has a significant cpu cost
-static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void x264_deblock_v_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
- x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 );
- x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
+ x264_deblock_v8_luma_mmx2( pix, stride, alpha, beta, tc0 );
+ x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 );
}
-static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta )
{
- x264_deblock_v8_luma_intra_mmxext( pix, stride, alpha, beta );
- x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
+ x264_deblock_v8_luma_intra_mmx2( pix, stride, alpha, beta );
+ x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta );
}
#endif // HIGH_BIT_DEPTH
#endif
pf->deblock_strength = deblock_strength_c;
#if HAVE_MMX
- if( cpu&X264_CPU_MMXEXT )
+ if( cpu&X264_CPU_MMX2 )
{
#if ARCH_X86
- pf->deblock_luma[1] = x264_deblock_v_luma_mmxext;
- pf->deblock_luma[0] = x264_deblock_h_luma_mmxext;
- pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
- pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
- pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext;
- pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext;
- pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
- pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext;
+ pf->deblock_luma[1] = x264_deblock_v_luma_mmx2;
+ pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
+ pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
+ pf->deblock_chroma[0] = x264_deblock_h_chroma_mmx2;
+ pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
+ pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
+ pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
+ pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmx2;
#endif
- pf->deblock_strength = x264_deblock_strength_mmxext;
+ pf->deblock_strength = x264_deblock_strength_mmx2;
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
__intel_cpu_indicator = 0x200;
else if( cpu&X264_CPU_SSE )
__intel_cpu_indicator = 0x80;
- else if( cpu&X264_CPU_MMXEXT )
+ else if( cpu&X264_CPU_MMX2 )
__intel_cpu_indicator = 8;
else
__intel_cpu_indicator = 1;
SATD_X_DECL7()
#if HAVE_MMX
-SATD_X_DECL7( _mmxext )
+SATD_X_DECL7( _mmx2 )
#if !HIGH_BIT_DEPTH
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
INTRA_MBCMP_8x8( sad, )
INTRA_MBCMP_8x8(sa8d, )
#if HIGH_BIT_DEPTH && HAVE_MMX
-INTRA_MBCMP_8x8( sad, _mmxext)
+INTRA_MBCMP_8x8( sad, _mmx2 )
INTRA_MBCMP_8x8( sad, _sse2 )
INTRA_MBCMP_8x8( sad, _ssse3 )
INTRA_MBCMP_8x8(sa8d, _sse2 )
INTRA_MBCMP(satd, 16, v, h, dc, , )
#if HIGH_BIT_DEPTH && HAVE_MMX
-INTRA_MBCMP( sad, 4, v, h, dc, , _mmxext)
-INTRA_MBCMP(satd, 4, v, h, dc, , _mmxext)
-INTRA_MBCMP( sad, 8, dc, h, v, c, _mmxext)
-INTRA_MBCMP(satd, 8, dc, h, v, c, _mmxext)
-INTRA_MBCMP( sad, 16, v, h, dc, , _mmxext)
-INTRA_MBCMP(satd, 16, v, h, dc, , _mmxext)
-INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
-INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
+INTRA_MBCMP( sad, 4, v, h, dc, , _mmx2 )
+INTRA_MBCMP(satd, 4, v, h, dc, , _mmx2 )
+INTRA_MBCMP( sad, 8, dc, h, v, c, _mmx2 )
+INTRA_MBCMP(satd, 8, dc, h, v, c, _mmx2 )
+INTRA_MBCMP( sad, 16, v, h, dc, , _mmx2 )
+INTRA_MBCMP(satd, 16, v, h, dc, , _mmx2 )
+INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
+INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _ssse3 )
INTRA_MBCMP( sad, 16, v, h, dc, , _ssse3 )
#endif
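
Each INTRA_MBCMP instantiation stamps out a helper that runs three intra predictions into the reconstruction buffer and scores each against the source block with the named metric (sad or satd). Roughly what one expansion looks like; the predict and pixel call names and the stride macros are taken from x264's surrounding context purely for illustration:

/* Approximate expansion of INTRA_MBCMP(satd, 16, v, h, dc, , _mmx2): */
static void intra_satd_x3_16x16( pixel *fenc, pixel *fdec, int res[3] )
{
    predict_16x16_v ( fdec );
    res[0] = pixel_satd_16x16( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
    predict_16x16_h ( fdec );
    res[1] = pixel_satd_16x16( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
    predict_16x16_dc( fdec );
    res[2] = pixel_satd_16x16( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
}
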
#if HIGH_BIT_DEPTH
#if HAVE_MMX
- if( cpu&X264_CPU_MMXEXT )
+ if( cpu&X264_CPU_MMX2 )
{
- INIT7( sad, _mmxext );
- INIT7( sad_x3, _mmxext );
- INIT7( sad_x4, _mmxext );
- INIT7( satd, _mmxext );
- INIT7( satd_x3, _mmxext );
- INIT7( satd_x4, _mmxext );
- INIT4( hadamard_ac, _mmxext );
- INIT7( ssd, _mmxext );
- INIT_ADS( _mmxext );
-
- pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext;
- pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
- pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
- pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
-
- pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext;
- pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmxext;
- pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
- pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
- pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
- pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
+ INIT7( sad, _mmx2 );
+ INIT7( sad_x3, _mmx2 );
+ INIT7( sad_x4, _mmx2 );
+ INIT7( satd, _mmx2 );
+ INIT7( satd_x3, _mmx2 );
+ INIT7( satd_x4, _mmx2 );
+ INIT4( hadamard_ac, _mmx2 );
+ INIT7( ssd, _mmx2 );
+ INIT_ADS( _mmx2 );
+
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
+ pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2;
+
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
+ pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2;
+ pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2;
+ pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
}
if( cpu&X264_CPU_SSE2 )
{
INIT7( ssd, _mmx );
}
- if( cpu&X264_CPU_MMXEXT )
+ if( cpu&X264_CPU_MMX2 )
{
- INIT7( sad, _mmxext );
- INIT7_NAME( sad_aligned, sad, _mmxext );
- INIT7( sad_x3, _mmxext );
- INIT7( sad_x4, _mmxext );
- INIT7( satd, _mmxext );
- INIT7( satd_x3, _mmxext );
- INIT7( satd_x4, _mmxext );
- INIT4( hadamard_ac, _mmxext );
- INIT_ADS( _mmxext );
- pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
- pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
- pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext;
+ INIT7( sad, _mmx2 );
+ INIT7_NAME( sad_aligned, sad, _mmx2 );
+ INIT7( sad_x3, _mmx2 );
+ INIT7( sad_x4, _mmx2 );
+ INIT7( satd, _mmx2 );
+ INIT7( satd_x3, _mmx2 );
+ INIT7( satd_x4, _mmx2 );
+ INIT4( hadamard_ac, _mmx2 );
+ INIT_ADS( _mmx2 );
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
#if ARCH_X86
- pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
- pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
- pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
- pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
- pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
- pixf->vsad = x264_pixel_vsad_mmxext;
+ pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
+ pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
+ pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
+ pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2;
+ pixf->vsad = x264_pixel_vsad_mmx2;
if( cpu&X264_CPU_CACHELINE_32 )
{
- INIT5( sad, _cache32_mmxext );
- INIT4( sad_x3, _cache32_mmxext );
- INIT4( sad_x4, _cache32_mmxext );
+ INIT5( sad, _cache32_mmx2 );
+ INIT4( sad_x3, _cache32_mmx2 );
+ INIT4( sad_x4, _cache32_mmx2 );
}
else if( cpu&X264_CPU_CACHELINE_64 )
{
- INIT5( sad, _cache64_mmxext );
- INIT4( sad_x3, _cache64_mmxext );
- INIT4( sad_x4, _cache64_mmxext );
+ INIT5( sad, _cache64_mmx2 );
+ INIT4( sad_x3, _cache64_mmx2 );
+ INIT4( sad_x4, _cache64_mmx2 );
}
#else
if( cpu&X264_CPU_CACHELINE_64 )
{
- pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
- pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmxext;
- pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_cache64_mmxext;
- pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmxext;
- pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_cache64_mmxext;
- pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmxext;
- pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_cache64_mmxext;
+ pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2;
+ pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2;
+ pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_cache64_mmx2;
+ pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmx2;
+ pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_cache64_mmx2;
+ pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmx2;
+ pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_cache64_mmx2;
}
#endif
- pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
- pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
- pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
- pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmxext;
- pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
- pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext;
+ pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2;
+ pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2;
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
+ pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
}
if( cpu&X264_CPU_SSE2 )
#if HIGH_BIT_DEPTH
#if HAVE_MMX
- if( cpu&X264_CPU_MMXEXT )
+ if( cpu&X264_CPU_MMX2 )
{
#if ARCH_X86
pf->denoise_dct = x264_denoise_dct_mmx;
- pf->decimate_score15 = x264_decimate_score15_mmxext;
- pf->decimate_score16 = x264_decimate_score16_mmxext;
+ pf->decimate_score15 = x264_decimate_score15_mmx2;
+ pf->decimate_score16 = x264_decimate_score16_mmx2;
if( cpu&X264_CPU_SLOW_CTZ )
{
- pf->decimate_score15 = x264_decimate_score15_mmxext_slowctz;
- pf->decimate_score16 = x264_decimate_score16_mmxext_slowctz;
+ pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
+ pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
}
- pf->decimate_score64 = x264_decimate_score64_mmxext;
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
- pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
- pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
- pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
- pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmxext;
- pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext;
+ pf->decimate_score64 = x264_decimate_score64_mmx2;
+ pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2;
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
+ pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
+ pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
+ pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
- pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2;
if( cpu&X264_CPU_LZCNT )
- pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt;
}
if( cpu&X264_CPU_SSE2 )
{
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
if( cpu&X264_CPU_LZCNT )
{
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt;
+ pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
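
coeff_last returns the index of the last nonzero coefficient in a scan-ordered block; the _lzcnt variants exist because BSR is slow on some CPUs (and has undefined output for a zero input), while LZCNT is cheap and fully defined. The scalar behavior all of these accelerate is roughly:

/* Sketch of the scalar reference: index of the last nonzero
 * coefficient, or -1 if the block is empty. */
static int coeff_last( dctcoef *l, int i_count )
{
    int i = i_count - 1;
    while( i >= 0 && !l[i] )
        i--;
    return i;
}
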
pf->quant_4x4 = x264_quant_4x4_mmx;
pf->quant_8x8 = x264_quant_8x8_mmx;
pf->dequant_4x4 = x264_dequant_4x4_mmx;
- pf->dequant_4x4_dc = x264_dequant_4x4dc_mmxext;
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
pf->dequant_8x8 = x264_dequant_8x8_mmx;
if( h->param.i_cqm_preset == X264_CQM_FLAT )
{
#endif
}
- if( cpu&X264_CPU_MMXEXT )
+ if( cpu&X264_CPU_MMX2 )
{
- pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext;
+ pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
#if ARCH_X86
- pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
- pf->decimate_score15 = x264_decimate_score15_mmxext;
- pf->decimate_score16 = x264_decimate_score16_mmxext;
+ pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
+ pf->decimate_score15 = x264_decimate_score15_mmx2;
+ pf->decimate_score16 = x264_decimate_score16_mmx2;
if( cpu&X264_CPU_SLOW_CTZ )
{
- pf->decimate_score15 = x264_decimate_score15_mmxext_slowctz;
- pf->decimate_score16 = x264_decimate_score16_mmxext_slowctz;
+ pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
+ pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
}
- pf->decimate_score64 = x264_decimate_score64_mmxext;
- pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
- pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
- pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
- pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmxext;
- pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext;
+ pf->decimate_score64 = x264_decimate_score64_mmx2;
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
+ pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
+ pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
+ pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
- pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
+ pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2;
if( cpu&X264_CPU_LZCNT )
{
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt;
- pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
+ pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt;
}
}
%endmacro
INIT_MMX
-NAL_ESCAPE mmxext
+NAL_ESCAPE mmx2
INIT_XMM
NAL_ESCAPE sse2
INIT_AVX
%endmacro
INIT_MMX
-cglobal sub8x8_dct_dc_mmxext, 3,3
+cglobal sub8x8_dct_dc_mmx2, 3,3
DCTDC_2ROW_MMX m0, m4, 0
DCTDC_2ROW_MMX m5, m6, 2
paddw m0, m5
%ifdef HIGH_BIT_DEPTH
INIT_XMM
-SCAN_8x8_FRAME sse2 , 4 , dq, qdq, dq, d
+SCAN_8x8_FRAME sse2, 4 , dq, qdq, dq, d
INIT_AVX
-SCAN_8x8_FRAME avx , 4 , dq, qdq, dq, d
+SCAN_8x8_FRAME avx , 4 , dq, qdq, dq, d
%else
INIT_MMX
-SCAN_8x8_FRAME mmxext, 16, q , dq , wd, w
+SCAN_8x8_FRAME mmx2, 16, q , dq , wd, w
%endif
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-cglobal zigzag_scan_4x4_field_mmxext, 2,3
+cglobal zigzag_scan_4x4_field_mmx2, 2,3
pshufw mm0, [r1+4], 0xd2
movq mm1, [r1+16]
movq mm2, [r1+24]
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_XMM
-SCAN_8x8 sse4 , d, dq, qdq, dq, 4
+SCAN_8x8 sse4, d, dq, qdq, dq, 4
INIT_AVX
-SCAN_8x8 avx , d, dq, qdq, dq, 4
+SCAN_8x8 avx , d, dq, qdq, dq, 4
%else
INIT_MMX
-SCAN_8x8 mmxext, w, wd, dq , q , 16
+SCAN_8x8 mmx2, w, wd, dq , q , 16
%endif
;-----------------------------------------------------------------------------
#ifndef X264_I386_DCT_H
#define X264_I386_DCT_H
-void x264_sub4x4_dct_mmx ( dctcoef dct [16], pixel *pix1, pixel *pix2 );
-void x264_sub8x8_dct_mmx ( dctcoef dct[ 4][16], pixel *pix1, pixel *pix2 );
-void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
-void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_ssse3 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_mmxext( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_mmx ( dctcoef dct [16], pixel *pix1, pixel *pix2 );
+void x264_sub8x8_dct_mmx ( dctcoef dct[ 4][16], pixel *pix1, pixel *pix2 );
+void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
+void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
void x264_add8x8_idct8_avx ( uint8_t *dst, int16_t dct [64] );
void x264_add16x16_idct8_avx ( uint8_t *dst, int16_t dct[4][64] );
-void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
-void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
+void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
+void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
+void x264_zigzag_scan_4x4_field_mmx2 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
+void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
+void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
-void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_avx( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif
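
A note on the mixed prototypes above: functions declared with pixel/dctcoef have both 8-bit and high-bit-depth builds, while the ones pinned to uint8_t/int16_t exist only for 8-bit. The bit-depth typedefs involved are, assuming the usual x264 definitions from common.h:

/* Bit-depth-dependent types (shown for context): */
#if HIGH_BIT_DEPTH
typedef uint16_t pixel;    /* 9/10-bit samples stored in 16 bits */
typedef int32_t  dctcoef;  /* wider coefficients to avoid overflow */
#else
typedef uint8_t  pixel;
typedef int16_t  dctcoef;
#endif
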
%ifndef ARCH_X86_64
INIT_MMX
-DEBLOCK_LUMA mmxext
-DEBLOCK_LUMA_INTRA mmxext
+DEBLOCK_LUMA mmx2
+DEBLOCK_LUMA_INTRA mmx2
INIT_XMM
DEBLOCK_LUMA sse2
DEBLOCK_LUMA_INTRA sse2
%endmacro ; DEBLOCK_LUMA
INIT_MMX
-DEBLOCK_LUMA mmxext, v8, 8
+DEBLOCK_LUMA mmx2, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16
INIT_AVX
DEBLOCK_LUMA_INTRA avx , v
%ifndef ARCH_X86_64
INIT_MMX
-DEBLOCK_LUMA_INTRA mmxext, v8
+DEBLOCK_LUMA_INTRA mmx2, v8
%endif
%endif ; !HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
INIT_MMX
-DEBLOCK_CHROMA mmxext
+DEBLOCK_CHROMA mmx2
%endif
INIT_XMM
DEBLOCK_CHROMA sse2
DEBLOCK_CHROMA avx
%ifndef ARCH_X86_64
INIT_MMX
-DEBLOCK_CHROMA mmxext
+DEBLOCK_CHROMA mmx2
%endif
DEBLOCK_CHROMA_INTRA avx
%ifndef ARCH_X86_64
INIT_MMX
-DEBLOCK_CHROMA_INTRA mmxext
+DEBLOCK_CHROMA_INTRA mmx2
%endif
%endif ; !HIGH_BIT_DEPTH
%endmacro
INIT_MMX
-cglobal deblock_strength_mmxext, 6,6
+cglobal deblock_strength_mmx2, 6,6
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
INIT_MMX
-AVG_WEIGHT mmxext, 4
-AVG_WEIGHT mmxext, 8
-AVG_WEIGHT mmxext, 16
+AVG_WEIGHT mmx2, 4
+AVG_WEIGHT mmx2, 8
+AVG_WEIGHT mmx2, 16
%ifdef HIGH_BIT_DEPTH
INIT_XMM
AVG_WEIGHT sse2, 4, 8
%endmacro
INIT_MMX
-WEIGHTER 4, mmxext
-WEIGHTER 8, mmxext
-WEIGHTER 12, mmxext
-WEIGHTER 16, mmxext
-WEIGHTER 20, mmxext
+WEIGHTER 4, mmx2
+WEIGHTER 8, mmx2
+WEIGHTER 12, mmx2
+WEIGHTER 16, mmx2
+WEIGHTER 20, mmx2
INIT_XMM
WEIGHTER 8, sse2
WEIGHTER 16, sse2
OFFSET %1, %2, sub
%endmacro
INIT_MMX
-OFFSETPN 4, mmxext
-OFFSETPN 8, mmxext
-OFFSETPN 12, mmxext
-OFFSETPN 16, mmxext
-OFFSETPN 20, mmxext
+OFFSETPN 4, mmx2
+OFFSETPN 8, mmx2
+OFFSETPN 12, mmx2
+OFFSETPN 16, mmx2
+OFFSETPN 20, mmx2
INIT_XMM
OFFSETPN 12, sse2
OFFSETPN 16, sse2
test dword r4m, 15
jz pixel_avg_w%1_sse2
%endif
- jmp pixel_avg_w%1_mmxext
+ jmp pixel_avg_w%1_mmx2
%endmacro
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX
-AVG_FUNC 4, movq, movq, mmxext
-AVGH 4, 8, mmxext
-AVGH 4, 4, mmxext
-AVGH 4, 2, mmxext
+AVG_FUNC 4, movq, movq, mmx2
+AVGH 4, 8, mmx2
+AVGH 4, 4, mmx2
+AVGH 4, 2, mmx2
-AVG_FUNC 8, movq, movq, mmxext
-AVGH 8, 16, mmxext
-AVGH 8, 8, mmxext
-AVGH 8, 4, mmxext
+AVG_FUNC 8, movq, movq, mmx2
+AVGH 8, 16, mmx2
+AVGH 8, 8, mmx2
+AVGH 8, 4, mmx2
-AVG_FUNC 16, movq, movq, mmxext
-AVGH 16, 16, mmxext
-AVGH 16, 8, mmxext
+AVG_FUNC 16, movq, movq, mmx2
+AVGH 16, 16, mmx2
+AVGH 16, 8, mmx2
INIT_XMM
AVG_FUNC 4, movq, movq, sse2
%else ;!HIGH_BIT_DEPTH
INIT_MMX
-AVG_FUNC 4, movd, movd, mmxext
-AVGH 4, 8, mmxext
-AVGH 4, 4, mmxext
-AVGH 4, 2, mmxext
+AVG_FUNC 4, movd, movd, mmx2
+AVGH 4, 8, mmx2
+AVGH 4, 4, mmx2
+AVGH 4, 2, mmx2
-AVG_FUNC 8, movq, movq, mmxext
-AVGH 8, 16, mmxext
-AVGH 8, 8, mmxext
-AVGH 8, 4, mmxext
+AVG_FUNC 8, movq, movq, mmx2
+AVGH 8, 16, mmx2
+AVGH 8, 8, mmx2
+AVGH 8, 4, mmx2
-AVG_FUNC 16, movq, movq, mmxext
-AVGH 16, 16, mmxext
-AVGH 16, 8, mmxext
+AVG_FUNC 16, movq, movq, mmx2
+AVGH 16, 16, mmx2
+AVGH 16, 8, mmx2
INIT_XMM
AVG_FUNC 16, movdqu, movdqa, sse2
%endmacro
INIT_MMX
-AVG2_W_ONE 4, mmxext
-AVG2_W_TWO 8, movu, mova, mmxext
+AVG2_W_ONE 4, mmx2
+AVG2_W_TWO 8, movu, mova, mmx2
INIT_XMM
AVG2_W_ONE 8, sse2
AVG2_W_TWO 10, movd, movd, sse2
AVG2_W_TWO 16, movu, mova, sse2
INIT_MMX
-cglobal pixel_avg2_w10_mmxext, 6,7
+cglobal pixel_avg2_w10_mmx2, 6,7
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
jg .height_loop
REP_RET
-cglobal pixel_avg2_w16_mmxext, 6,7
+cglobal pixel_avg2_w16_mmx2, 6,7
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
jg .height_loop
REP_RET
-cglobal pixel_avg2_w18_mmxext, 6,7
+cglobal pixel_avg2_w18_mmx2, 6,7
sub r4, r2
.height_loop:
movu m0, [r2+ 0]
; uint8_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
-cglobal pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmx2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
AVG2_W8 8, movq
%macro AVG2_W16 2
-cglobal pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmx2, 6,7
sub r2, r4
lea r6, [r2+r3]
.height_loop:
AVG2_W16 12, movd
AVG2_W16 16, movq
-cglobal pixel_avg2_w20_mmxext, 6,7
+cglobal pixel_avg2_w20_mmx2, 6,7
sub r2, r4
lea r6, [r2+r3]
.height_loop:
%endmacro
%macro AVG_CACHELINE_FUNC 2
-pixel_avg2_w%1_cache_mmxext:
+pixel_avg2_w%1_cache_mmx2:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
%if %1>8
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
%if %1 == 12
;w12 isn't needed because w16 is just as fast if there's no cacheline split
-%define cachesplit pixel_avg2_w16_cache_mmxext
+%define cachesplit pixel_avg2_w16_cache_mmx2
%else
-%define cachesplit pixel_avg2_w%1_cache_mmxext
+%define cachesplit pixel_avg2_w%1_cache_mmx2
%endif
cglobal pixel_avg2_w%1_cache%2_%3
mov eax, r2m
%endif
%endmacro
-AVG_CACHELINE_CHECK 8, 64, mmxext
-AVG_CACHELINE_CHECK 12, 64, mmxext
+AVG_CACHELINE_CHECK 8, 64, mmx2
+AVG_CACHELINE_CHECK 12, 64, mmx2
%ifndef ARCH_X86_64
-AVG_CACHELINE_CHECK 16, 64, mmxext
-AVG_CACHELINE_CHECK 20, 64, mmxext
-AVG_CACHELINE_CHECK 8, 32, mmxext
-AVG_CACHELINE_CHECK 12, 32, mmxext
-AVG_CACHELINE_CHECK 16, 32, mmxext
-AVG_CACHELINE_CHECK 20, 32, mmxext
+AVG_CACHELINE_CHECK 16, 64, mmx2
+AVG_CACHELINE_CHECK 20, 64, mmx2
+AVG_CACHELINE_CHECK 8, 32, mmx2
+AVG_CACHELINE_CHECK 12, 32, mmx2
+AVG_CACHELINE_CHECK 16, 32, mmx2
+AVG_CACHELINE_CHECK 20, 32, mmx2
%endif
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
-cglobal prefetch_fenc_mmxext, 5,5
+cglobal prefetch_fenc_mmx2, 5,5
and r4d, 3
mov eax, r4d
imul r4d, r1d
RET
%else
-cglobal prefetch_fenc_mmxext, 0,3
+cglobal prefetch_fenc_mmx2, 0,3
mov r2, r4m
mov r1, r1m
mov r0, r0m
;-----------------------------------------------------------------------------
; void prefetch_ref( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
-cglobal prefetch_ref_mmxext, 3,3
+cglobal prefetch_ref_mmx2, 3,3
dec r2d
and r2d, r1d
lea r0, [r0+r2*8+64]
%if mmsize==8
.skip_prologue:
%else
- jl mc_chroma_mmxext %+ .skip_prologue
+ jl mc_chroma_mmx2 %+ .skip_prologue
WIN64_SPILL_XMM 9
%endif
movd m5, t2d
%ifdef HIGH_BIT_DEPTH
INIT_MMX
-MC_CHROMA mmxext
+MC_CHROMA mmx2
INIT_XMM
MC_CHROMA sse2
INIT_AVX
%else ; !HIGH_BIT_DEPTH
INIT_MMX
%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM
-MC_CHROMA mmxext
+MC_CHROMA mmx2
INIT_XMM
MC_CHROMA sse2_misalign
%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD
%endmacro ; HPEL_FILTER
INIT_MMX
-HPEL_FILTER mmxext
+HPEL_FILTER mmx2
INIT_XMM
HPEL_FILTER sse2
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_c_mmxext, 3,3
+cglobal hpel_filter_c_mmx2, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_h_mmxext, 3,3
+cglobal hpel_filter_h_mmx2, 3,3
add r0, r2
add r1, r2
neg r2
%define PALIGNR PALIGNR_MMX
INIT_MMX
-HPEL_V mmxext, 0, 1
+HPEL_V mmx2, 0, 1
INIT_XMM
-HPEL_V sse2, 8, 1
+HPEL_V sse2, 8, 1
HPEL_C sse2_misalign
%ifndef ARCH_X86_64
HPEL_C sse2
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
-HPEL_V ssse3, 0, 0
+HPEL_V ssse3, 0, 0
INIT_AVX
HPEL_C avx
-HPEL_V avx, 0, 0
+HPEL_V avx, 0, 0
%endif
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
INIT_MMX
-cglobal plane_copy_core_mmxext, 6,7
+cglobal plane_copy_core_mmx2, 6,7
FIX_STRIDES r1d, r3d, r4d
movsxdifnidn r1, r1d
movsxdifnidn r3, r3d
%ifdef HIGH_BIT_DEPTH
INIT_MMX
-PLANE_INTERLEAVE mmxext
+PLANE_INTERLEAVE mmx2
PLANE_DEINTERLEAVE mmx
INIT_XMM
PLANE_INTERLEAVE sse2
PLANE_DEINTERLEAVE avx
%else
INIT_MMX
-PLANE_INTERLEAVE mmxext
+PLANE_INTERLEAVE mmx2
PLANE_DEINTERLEAVE mmx
INIT_XMM
PLANE_INTERLEAVE sse2
psrld m7, 16
.vloop:
mov r6d, r7m
-%ifnidn %1,mmxext
+%ifnidn %1,mmx2
mova m0, [r0]
mova m1, [r0+r5]
pavgw m0, m1
sub r2, mmsize
sub r3, mmsize
sub r4, mmsize
-%ifidn %1,mmxext
+%ifidn %1,mmx2
FILT8xU r1, r2, 0
FILT8xU r3, r4, r5
%else
psrlw m7, 8
.vloop:
mov r6d, r7m
-%ifnidn %1, mmxext
+%ifnidn %1, mmx2
mova m0, [r0]
mova m1, [r0+r5]
pavgb m0, m1
mova [r2], m4
mova [r3], m3
mova [r4], m5
-%elifidn %1, mmxext
+%elifidn %1, mmx2
FILT8x2U r1, r2, 0
FILT8x2U r3, r4, r5
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
-FRAME_INIT_LOWRES mmxext
+FRAME_INIT_LOWRES mmx2
%ifndef ARCH_X86_64
-FRAME_INIT_LOWRES cache32_mmxext
+FRAME_INIT_LOWRES cache32_mmx2
%endif
INIT_XMM
FRAME_INIT_LOWRES sse2
#include "mc.h"
#define DECL_SUF( func, args )\
- void func##_mmxext args;\
+ void func##_mmx2 args;\
void func##_sse2 args;\
void func##_ssse3 args;
void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
MC_WEIGHT(w,type)
-MC_WEIGHT_OFFSET( 4, mmxext )
-MC_WEIGHT_OFFSET( 8, mmxext )
-MC_WEIGHT_OFFSET( 12, mmxext )
-MC_WEIGHT_OFFSET( 16, mmxext )
-MC_WEIGHT_OFFSET( 20, mmxext )
+MC_WEIGHT_OFFSET( 4, mmx2 )
+MC_WEIGHT_OFFSET( 8, mmx2 )
+MC_WEIGHT_OFFSET( 12, mmx2 )
+MC_WEIGHT_OFFSET( 16, mmx2 )
+MC_WEIGHT_OFFSET( 20, mmx2 )
MC_WEIGHT_OFFSET( 12, sse2 )
MC_WEIGHT_OFFSET( 16, sse2 )
MC_WEIGHT_OFFSET( 20, sse2 )
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
-void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
-void x264_prefetch_ref_mmxext( uint8_t *, int, int );
-void x264_plane_copy_core_mmxext( pixel *, int, pixel *, int, int w, int h);
+void x264_prefetch_fenc_mmx2( uint8_t *, int, uint8_t *, int, int );
+void x264_prefetch_ref_mmx2( uint8_t *, int, int );
+void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h);
void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
-void x264_plane_copy_interleave_core_mmxext( pixel *dst, int i_dst,
+void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst,
pixel *srcu, int i_srcu,
pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst,
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu,
uint16_t *dstv, int i_dstv,
uint16_t *src, int i_src, int w, int h );
-void x264_store_interleave_8x8x2_mmxext( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
+void x264_store_interleave_8x8x2_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
void x264_store_interleave_8x8x2_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
void x264_load_deinterleave_8x8x2_fenc_mmx( pixel *dst, pixel *src, int i_src );
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
pixel *src, int i_src,\
int dx, int dy, int i_width, int i_height );
-MC_CHROMA(mmxext)
+MC_CHROMA(mmx2)
MC_CHROMA(sse2)
MC_CHROMA(sse2_misalign)
MC_CHROMA(ssse3)
#define LOWRES(cpu)\
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
int src_stride, int dst_stride, int width, int height );
-LOWRES(mmxext)
-LOWRES(cache32_mmxext)
+LOWRES(mmx2)
+LOWRES(cache32_mmx2)
LOWRES(sse2)
LOWRES(ssse3)
#define PIXEL_AVG_WALL(cpu)\
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);
-PIXEL_AVG_WALL(mmxext)
-PIXEL_AVG_WALL(cache32_mmxext)
-PIXEL_AVG_WALL(cache64_mmxext)
+PIXEL_AVG_WALL(mmx2)
+PIXEL_AVG_WALL(cache32_mmx2)
+PIXEL_AVG_WALL(cache64_mmx2)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse2_misalign)
#if HIGH_BIT_DEPTH
/* we can replace w12/w20 with w10/w18 as only 9/17 pixels in fact are important */
-#define x264_pixel_avg2_w12_mmxext x264_pixel_avg2_w10_mmxext
-#define x264_pixel_avg2_w20_mmxext x264_pixel_avg2_w18_mmxext
+#define x264_pixel_avg2_w12_mmx2 x264_pixel_avg2_w10_mmx2
+#define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2
#define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2
#else
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w16_sse2
#endif // HIGH_BIT_DEPTH
-PIXEL_AVG_WTAB(mmxext, mmxext, mmxext, mmxext, mmxext, mmxext)
+PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
-PIXEL_AVG_WTAB(sse2, mmxext, sse2, sse2, sse2, sse2)
+PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2)
#else // !HIGH_BIT_DEPTH
#if ARCH_X86
-PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext)
-PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
+PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2)
+PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
#endif
-PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
-PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
-PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
-PIXEL_AVG_WTAB(cache64_ssse3, mmxext, cache64_mmxext, cache64_ssse3, cache64_ssse3, cache64_sse2)
+PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
+PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign)
+PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
+PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
#endif // HIGH_BIT_DEPTH
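
Each PIXEL_AVG_WTAB builds a width-indexed table of function pointers, which is what lets one dispatcher mix instruction sets per width: the plain sse2 table keeps MMX2 for widths 4 and 8, where 16-byte registers buy nothing. A sketch of the shape such a table takes, assuming the macro's arguments fill the w4..w20 slots in order (the typedef is illustrative):

/* Entry i handles width 4*i; narrow widths may point at MMX2 code even
 * inside an "sse2" table, per PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2). */
typedef void (*pixel_avg2_fn)( uint8_t *dst, int i_dst, uint8_t *src1,
                               int i_src1, uint8_t *src2, int height );
static const pixel_avg2_fn pixel_avg_wtab_sse2[6] = {
    NULL,
    x264_pixel_avg2_w4_mmx2,
    x264_pixel_avg2_w8_mmx2,
    x264_pixel_avg2_w12_sse2,
    x264_pixel_avg2_w16_sse2,
    x264_pixel_avg2_w20_sse2,
};
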
#define MC_COPY_WTAB(instr, name1, name2, name3)\
};
#if HIGH_BIT_DEPTH
-MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
-MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
-MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
-MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,12)
-MC_WEIGHT_WTAB(offsetadd,sse2,mmxext,sse2,16)
-MC_WEIGHT_WTAB(offsetsub,sse2,mmxext,sse2,16)
-
-static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w )
+MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
+MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
+MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
+MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,12)
+MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,sse2,16)
+MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,sse2,16)
+
+static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
if( w->i_scale == 1<<w->i_denom )
{
}
}
#else
-MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
-MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
-MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
-MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,16)
-MC_WEIGHT_WTAB(offsetadd,sse2,mmxext,mmxext,16)
-MC_WEIGHT_WTAB(offsetsub,sse2,mmxext,mmxext,16)
+MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
+MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
+MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
+MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,16)
+MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
+MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
-static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w )
+static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
int i;
int16_t den1;
x264_mc_copy_wtab_##instr2[i_width>>2](dst, i_dst_stride, src1, i_src_stride, i_height );\
}
-MC_LUMA(mmxext,mmxext,mmx)
+MC_LUMA(mmx2,mmx2,mmx)
MC_LUMA(sse2,sse2,sse2)
#if !HIGH_BIT_DEPTH
#if ARCH_X86
-MC_LUMA(cache32_mmxext,cache32_mmxext,mmx)
-MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
+MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
+MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
#endif
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
}\
}
-GET_REF(mmxext)
+GET_REF(mmx2)
GET_REF(sse2)
#if !HIGH_BIT_DEPTH
#if ARCH_X86
-GET_REF(cache32_mmxext)
-GET_REF(cache64_mmxext)
+GET_REF(cache32_mmx2)
+GET_REF(cache64_mmx2)
#endif
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
x264_sfence();\
}
-HPEL(8, mmxext, mmxext, mmxext, mmxext)
+HPEL(8, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
HPEL(16, sse2, sse2, sse2, sse2)
#else // !HIGH_BIT_DEPTH
-HPEL(16, sse2_amd, mmxext, mmxext, sse2)
+HPEL(16, sse2_amd, mmx2, mmx2, sse2)
#if ARCH_X86_64
void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
-static void x264_plane_copy_mmxext( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h )
+static void x264_plane_copy_mmx2( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h )
{
int c_w = 16/sizeof(pixel) - 1;
if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold.
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );
} else if( !(w&c_w) ) {
- x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, w, h );
+ x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, w, h );
} else if( i_src > 0 ) {
// have to use plain memcpy on the last line (in memory order) to avoid overreading src
- x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 );
+ x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 );
memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) );
} else {
memcpy( dst, src, w*sizeof(pixel) );
- x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 );
+ x264_plane_copy_core_mmx2( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 );
}
}
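
The rounding here is worth spelling out: the asm core copies in 16-byte units, so the width handed to it is rounded up with (w+c_w)&~c_w, which can overread the source by up to 15 bytes per row. That is harmless on every row except the last one in memory order, hence the memcpy fallback for that row. A worked instance of the arithmetic, assuming 8-bit pixels (c_w = 15):

int w = 100, c_w = 15;
int rounded = (w + c_w) & ~c_w;   /* = 112, the next multiple of 16 */
/* the core copies h-1 rows at width 112 (reading 12 bytes past w on
 * each), and memcpy handles the final row at exactly w bytes */
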
}\
}
-PLANE_INTERLEAVE(mmxext)
+PLANE_INTERLEAVE(mmx2)
PLANE_INTERLEAVE(sse2)
#if HIGH_BIT_DEPTH
PLANE_INTERLEAVE(avx)
pf->integral_init4v = x264_integral_init4v_mmx;
pf->integral_init8v = x264_integral_init8v_mmx;
- if( !(cpu&X264_CPU_MMXEXT) )
+ if( !(cpu&X264_CPU_MMX2) )
return;
- pf->plane_copy = x264_plane_copy_mmxext;
- pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext;
- pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext;
-
- pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
- pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
- pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmxext;
- pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_mmxext;
- pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_mmxext;
- pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmxext;
- pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
- pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
-
- pf->mc_luma = mc_luma_mmxext;
- pf->get_ref = get_ref_mmxext;
- pf->mc_chroma = x264_mc_chroma_mmxext;
- pf->hpel_filter = x264_hpel_filter_mmxext;
- pf->weight = x264_mc_weight_wtab_mmxext;
- pf->weight_cache = x264_weight_cache_mmxext;
- pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
- pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
-
- pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
+ pf->plane_copy = x264_plane_copy_mmx2;
+ pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
+ pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2;
+
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmx2;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmx2;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_mmx2;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_mmx2;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmx2;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmx2;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmx2;
+
+ pf->mc_luma = mc_luma_mmx2;
+ pf->get_ref = get_ref_mmx2;
+ pf->mc_chroma = x264_mc_chroma_mmx2;
+ pf->hpel_filter = x264_hpel_filter_mmx2;
+ pf->weight = x264_mc_weight_wtab_mmx2;
+ pf->weight_cache = x264_weight_cache_mmx2;
+ pf->offsetadd = x264_mc_offsetadd_wtab_mmx2;
+ pf->offsetsub = x264_mc_offsetsub_wtab_mmx2;
+
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2;
#if HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
- pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
#endif
if( !(cpu&X264_CPU_SSE2) )
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
#else // !HIGH_BIT_DEPTH
- pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
- pf->prefetch_ref = x264_prefetch_ref_mmxext;
+ pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
+ pf->prefetch_ref = x264_prefetch_ref_mmx2;
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
if( cpu&X264_CPU_CACHELINE_32 )
{
- pf->mc_luma = mc_luma_cache32_mmxext;
- pf->get_ref = get_ref_cache32_mmxext;
- pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
+ pf->mc_luma = mc_luma_cache32_mmx2;
+ pf->get_ref = get_ref_cache32_mmx2;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
}
else if( cpu&X264_CPU_CACHELINE_64 )
{
- pf->mc_luma = mc_luma_cache64_mmxext;
- pf->get_ref = get_ref_cache64_mmxext;
- pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
+ pf->mc_luma = mc_luma_cache64_mmx2;
+ pf->get_ref = get_ref_cache64_mmx2;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
}
#endif
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal pixel_sa8d_8x8_internal_mmxext
+cglobal pixel_sa8d_8x8_internal_mmx2
push r0
push r2
sub esp, 0x74
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x3_8x8_core_mmxext
+cglobal intra_sa8d_x3_8x8_core_mmx2
mov eax, [esp+4]
mov ecx, [esp+8]
sub esp, 0x70
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-cglobal pixel_ssim_4x4x2_core_mmxext
+cglobal pixel_ssim_4x4x2_core_mmx2
push ebx
push edi
mov ebx, [esp+16]
%endmacro
%macro SSD_16_MMX 2
-cglobal pixel_ssd_%1x%2_mmxext, 4,5
+cglobal pixel_ssd_%1x%2_mmx2, 4,5
mov r4, %1*%2/mmsize/2
pxor m0, m0
.loop
%endmacro
INIT_MMX
-SSD_ONE 4, 4, mmxext
-SSD_ONE 4, 8, mmxext
-SSD_ONE 8, 4, mmxext
-SSD_ONE 8, 8, mmxext
-SSD_ONE 8, 16, mmxext
+SSD_ONE 4, 4, mmx2
+SSD_ONE 4, 8, mmx2
+SSD_ONE 8, 4, mmx2
+SSD_ONE 8, 8, mmx2
+SSD_ONE 8, 16, mmx2
SSD_16_MMX 16, 8
SSD_16_MMX 16, 16
INIT_XMM
paddq m4, m3
paddq m4, m1
%else ; unfortunately paddq is sse2
- ; emulate 48 bit precision for mmxext instead
+ ; emulate 48 bit precision for mmx2 instead
mova m0, m2
mova m1, m3
punpcklwd m2, m6
%if mmsize==16
movq [r3], m4
movhps [r4], m4
-%else ; fixup for mmxext
+%else ; fixup for mmx2
SBUTTERFLY dq, 4, 5, 0
mova m0, m4
psrld m4, 16
%endif ; !HIGH_BIT_DEPTH
INIT_MMX
-SSD_NV12 mmxext
+SSD_NV12 mmx2
INIT_XMM
SSD_NV12 sse2
INIT_AVX
; int pixel_var_wxh( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal pixel_var_16x16_mmxext, 2,3
+cglobal pixel_var_16x16_mmx2, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
-cglobal pixel_var_8x8_mmxext, 2,3
+cglobal pixel_var_8x8_mmx2, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 4
; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal pixel_var2_8x8_mmxext, 5,6
+cglobal pixel_var2_8x8_mmx2, 5,6
FIX_STRIDES r1, r3
VAR_START 0
mov r5d, 8
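
pixel_var accumulates both the sum and the sum of squares of the block in one pass, so the caller can finish variance as ssd - (sum*sum >> shift). A scalar sketch; the packing of the two accumulators into one 64-bit return is an assumption about the convention, mirrored from x264's C reference as I read it:

#include <stdint.h>

/* Low 32 bits = sum of pixels, high 32 bits = sum of squares (assumed packing). */
static uint64_t pixel_var_8x8( const uint8_t *pix, int i_stride )
{
    uint32_t sum = 0, sqr = 0;
    for( int y = 0; y < 8; y++, pix += i_stride )
        for( int x = 0; x < 8; x++ )
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
    return sum + ((uint64_t)sqr << 32);
}
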
; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal pixel_satd_16x4_internal_mmxext
+cglobal pixel_satd_16x4_internal_mmx2
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 0
paddw m0, m2
paddw m0, m1
ret
-cglobal pixel_satd_8x8_internal_mmxext
+cglobal pixel_satd_8x8_internal_mmx2
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 1
paddw m0, m2
paddw m0, m1
-pixel_satd_8x4_internal_mmxext:
+pixel_satd_8x4_internal_mmx2:
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 0
paddw m0, m2
%ifdef HIGH_BIT_DEPTH
%macro SATD_MxN_MMX 3
-cglobal pixel_satd_%1x%2_mmxext, 4,7
+cglobal pixel_satd_%1x%2_mmx2, 4,7
SATD_START_MMX
pxor m0, m0
- call pixel_satd_%1x%3_internal_mmxext
+ call pixel_satd_%1x%3_internal_mmx2
HADDUW m0, m1
movd r6d, m0
%rep %2/%3-1
pxor m0, m0
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- call pixel_satd_%1x%3_internal_mmxext
+ call pixel_satd_%1x%3_internal_mmx2
movd m2, r4
HADDUW m0, m1
movd r4, m0
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
-cglobal pixel_satd_16x16_mmxext, 4,6
+cglobal pixel_satd_16x16_mmx2, 4,6
SATD_START_MMX
pxor m0, m0
%rep 3
- call pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmx2
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endrep
- call pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmx2
HADDUW m0, m1
movd eax, m0
RET
-cglobal pixel_satd_16x8_mmxext, 4,6
+cglobal pixel_satd_16x8_mmx2, 4,6
SATD_START_MMX
pxor m0, m0
- call pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmx2
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- call pixel_satd_16x4_internal_mmxext
+ call pixel_satd_16x4_internal_mmx2
SATD_END_MMX
-cglobal pixel_satd_8x16_mmxext, 4,6
+cglobal pixel_satd_8x16_mmx2, 4,6
SATD_START_MMX
pxor m0, m0
- call pixel_satd_8x8_internal_mmxext
+ call pixel_satd_8x8_internal_mmx2
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- call pixel_satd_8x8_internal_mmxext
+ call pixel_satd_8x8_internal_mmx2
SATD_END_MMX
%endif ; !HIGH_BIT_DEPTH
-cglobal pixel_satd_8x8_mmxext, 4,6
+cglobal pixel_satd_8x8_mmx2, 4,6
SATD_START_MMX
pxor m0, m0
- call pixel_satd_8x8_internal_mmxext
+ call pixel_satd_8x8_internal_mmx2
SATD_END_MMX
-cglobal pixel_satd_8x4_mmxext, 4,6
+cglobal pixel_satd_8x4_mmx2, 4,6
SATD_START_MMX
pxor m0, m0
- call pixel_satd_8x4_internal_mmxext
+ call pixel_satd_8x4_internal_mmx2
SATD_END_MMX
-cglobal pixel_satd_4x8_mmxext, 4,6
+cglobal pixel_satd_4x8_mmx2, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
SATD_4x4_MMX m1, 0, 0
paddw m0, m1
SATD_END_MMX
-cglobal pixel_satd_4x4_mmxext, 4,6
+cglobal pixel_satd_4x4_mmx2, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
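
All of the SATD entry points above are built from the same 4x4 kernel: take pixel differences, apply a 4x4 Hadamard transform along rows and then columns, and sum absolute values, halving the total to keep the result on a scale comparable to SAD. A scalar sketch in the spirit of the C reference (not the exact upstream source):

#include <stdlib.h>  /* abs */

/* SATD of one 4x4 block: sum of absolute Hadamard-transformed differences. */
static int satd_4x4( const uint8_t *pix1, int i_pix1,
                     const uint8_t *pix2, int i_pix2 )
{
    int tmp[4][4], sum = 0;
    for( int y = 0; y < 4; y++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        int a0 = pix1[0] - pix2[0], a1 = pix1[1] - pix2[1];
        int a2 = pix1[2] - pix2[2], a3 = pix1[3] - pix2[3];
        int b0 = a0+a1, b1 = a0-a1, b2 = a2+a3, b3 = a2-a3;
        tmp[y][0] = b0+b2; tmp[y][1] = b1+b3;   /* horizontal pass */
        tmp[y][2] = b0-b2; tmp[y][3] = b1-b3;
    }
    for( int x = 0; x < 4; x++ )                /* vertical pass */
    {
        int b0 = tmp[0][x]+tmp[1][x], b1 = tmp[0][x]-tmp[1][x];
        int b2 = tmp[2][x]+tmp[3][x], b3 = tmp[2][x]-tmp[3][x];
        sum += abs(b0+b2) + abs(b1+b3) + abs(b0-b2) + abs(b1-b3);
    }
    return sum >> 1;
}
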
RET
%else ; ARCH_X86_32
-%ifnidn %1, mmxext
+%ifnidn %1, mmx2
cglobal pixel_sa8d_8x8_internal_%1
%define spill0 [esp+4]
%define spill1 [esp+20]
paddw m0, m3
SAVE_MM_PERMUTATION pixel_sa8d_8x8_internal_%1
ret
-%endif ; ifndef mmxext
+%endif ; ifndef mmx2
cglobal pixel_sa8d_8x8_%1, 4,7
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
call pixel_sa8d_8x8_internal_%1
-%ifidn %1, mmxext
+%ifidn %1, mmx2
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal_%1
-%ifidn %1, mmxext
+%ifidn %1, mmx2
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
-cglobal hadamard_ac_4x4_mmxext
+cglobal hadamard_ac_4x4_mmx2
%ifdef HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+r1]
paddw m0, m1
paddw m2, m3
paddw m0, m2
- SAVE_MM_PERMUTATION hadamard_ac_4x4_mmxext
+ SAVE_MM_PERMUTATION hadamard_ac_4x4_mmx2
ret
-cglobal hadamard_ac_2x2max_mmxext
+cglobal hadamard_ac_2x2max_mmx2
mova m0, [r3+0x00]
mova m1, [r3+0x20]
mova m2, [r3+0x40]
paddw m7, m0
paddw m7, m1
%endif ; HIGH_BIT_DEPTH
- SAVE_MM_PERMUTATION hadamard_ac_2x2max_mmxext
+ SAVE_MM_PERMUTATION hadamard_ac_2x2max_mmx2
ret
%macro AC_PREP 2
%endif ; HIGH_BIT_DEPTH
%endmacro
-cglobal hadamard_ac_8x8_mmxext
+cglobal hadamard_ac_8x8_mmx2
mova m6, [mask_ac4]
%ifdef HIGH_BIT_DEPTH
mova m7, [pw_1]
%else
pxor m7, m7
%endif ; HIGH_BIT_DEPTH
- call hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmx2
add r0, 4*SIZEOF_PIXEL
add r3, 32
mova m5, m0
AC_PREP m5, m7
- call hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmx2
lea r0, [r0+4*r1]
add r3, 64
AC_PADD m5, m0, m7
- call hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmx2
sub r0, 4*SIZEOF_PIXEL
sub r3, 32
AC_PADD m5, m0, m7
- call hadamard_ac_4x4_mmxext
+ call hadamard_ac_4x4_mmx2
AC_PADD m5, m0, m7
sub r3, 40
mova [rsp+gprsize+8], m5 ; save satd
pxor m6, m6
%endif
%rep 3
- call hadamard_ac_2x2max_mmxext
+ call hadamard_ac_2x2max_mmx2
%endrep
mova m0, [r3+0x00]
mova m1, [r3+0x20]
%endif ; HIGH_BIT_DEPTH
mova [rsp+gprsize], m6 ; save sa8d
SWAP 0, 6
- SAVE_MM_PERMUTATION hadamard_ac_8x8_mmxext
+ SAVE_MM_PERMUTATION hadamard_ac_8x8_mmx2
ret
-%macro HADAMARD_AC_WXH_SUM_MMXEXT 2
+%macro HADAMARD_AC_WXH_SUM_MMX 2
mova m1, [rsp+1*mmsize]
%ifdef HIGH_BIT_DEPTH
%if %1*%2 >= 128
%endmacro
%macro HADAMARD_AC_WXH_MMX 2
-cglobal pixel_hadamard_ac_%1x%2_mmxext, 2,4
+cglobal pixel_hadamard_ac_%1x%2_mmx2, 2,4
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
FIX_STRIDES r1
sub rsp, 16+128+pad
lea r2, [r1*3]
lea r3, [rsp+16]
- call hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmx2
%if %2==16
%define ysub r2
lea r0, [r0+r1*4]
sub rsp, 16
- call hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmx2
%endif
%if %1==16
neg ysub
sub rsp, 16
lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
neg ysub
- call hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmx2
%if %2==16
lea r0, [r0+r1*4]
sub rsp, 16
- call hadamard_ac_8x8_mmxext
+ call hadamard_ac_8x8_mmx2
%endif
%endif
- HADAMARD_AC_WXH_SUM_MMXEXT %1, %2
+ HADAMARD_AC_WXH_SUM_MMX %1, %2
movd edx, m0
movd eax, m1
shr edx, 1
; instantiate satds
%ifndef ARCH_X86_64
-cextern pixel_sa8d_8x8_internal_mmxext
-SA8D mmxext
+cextern pixel_sa8d_8x8_internal_mmx2
+SA8D mmx2
%endif
%define TRANS TRANS_SSE2
INTRA_SA8D_SSE2 sse2
%ifndef HIGH_BIT_DEPTH
INIT_MMX
-INTRA_SATDS_MMX mmxext
+INTRA_SATDS_MMX mmx2
%endif
INIT_XMM
HADAMARD_AC_SSE2 sse2
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
-cglobal pixel_ads4_mmxext, 6,7
+cglobal pixel_ads4_mmx2, 6,7
movq mm6, [r0]
movq mm4, [r0+8]
pshufw mm7, mm6, 0
movd [r6], mm1
ADS_END 1
-cglobal pixel_ads2_mmxext, 6,7
+cglobal pixel_ads2_mmx2, 6,7
movq mm6, [r0]
pshufw mm5, r6m, 0
pshufw mm7, mm6, 0
movd [r6], mm4
ADS_END 1
-cglobal pixel_ads1_mmxext, 6,7
+cglobal pixel_ads1_mmx2, 6,7
pshufw mm7, [r0], 0
pshufw mm6, r6m, 0
ADS_START
DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, int, int * ) )\
DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int * ) )
-DECL_X1( sad, mmxext )
+DECL_X1( sad, mmx2 )
DECL_X1( sad, sse2 )
DECL_X4( sad, sse2_misalign )
DECL_X1( sad, sse3 )
DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
DECL_X1( sad, ssse3_aligned )
-DECL_X4( sad, mmxext )
+DECL_X4( sad, mmx2 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
DECL_X4( sad, ssse3 )
DECL_X1( ssd, mmx )
-DECL_X1( ssd, mmxext )
+DECL_X1( ssd, mmx2 )
DECL_X1( ssd, sse2slow )
DECL_X1( ssd, sse2 )
DECL_X1( ssd, ssse3 )
DECL_X1( ssd, avx )
-DECL_X1( satd, mmxext )
+DECL_X1( satd, mmx2 )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
-DECL_X1( sa8d, mmxext )
+DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
-DECL_X1( sad, cache32_mmxext );
-DECL_X1( sad, cache64_mmxext );
+DECL_X1( sad, cache32_mmx2 );
+DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
DECL_X1( sad, cache64_ssse3 );
-DECL_X4( sad, cache32_mmxext );
-DECL_X4( sad, cache64_mmxext );
+DECL_X4( sad, cache32_mmx2 );
+DECL_X4( sad, cache64_mmx2 );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( uint64_t, var, mmxext, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
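The DECL_X1/DECL_X4/DECL_PIXELS macros above stamp out one prototype per block size, so each suffix rename here touches a whole family of symbols at once. Roughly, and showing only three of the sizes, DECL_X1( sad, mmx2 ) expands along these lines:

    /* Illustrative expansion only; the real macro emits every supported
     * block size from 16x16 down to 4x4. */
    int x264_pixel_sad_16x16_mmx2( pixel *, int, pixel *, int );
    int x264_pixel_sad_16x8_mmx2 ( pixel *, int, pixel *, int );
    int x264_pixel_sad_8x8_mmx2  ( pixel *, int, pixel *, int );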
-void x264_intra_satd_x3_4x4_mmxext ( pixel *, pixel *, int * );
-void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sad_x3_4x4_mmxext ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sad_x3_4x4_avx ( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_mmxext ( pixel *, pixel *, int * );
-void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sad_x3_8x8c_mmxext ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_8x8c_sse2 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_8x8c_ssse3 ( pixel *, pixel *, int * );
-void x264_intra_satd_x3_16x16_mmxext( pixel *, pixel *, int * );
-void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sad_x3_16x16_mmxext ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * );
-void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
-void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * );
-void x264_intra_sad_x3_8x8_mmxext ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
-void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_avx ( uint8_t *, int16_t [2][8], int * );
+void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_4x4_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_4x4_avx ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8c_sse2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8c_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_satd_x3_16x16_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
+void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
+void x264_intra_sa8d_x3_8x8_core_mmx2 ( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_avx ( uint8_t *, int16_t [2][8], int * );
-void x264_pixel_ssd_nv12_core_mmxext( pixel *pixuv1, int stride1,
- pixel *pixuv2, int stride2, int width,
- int height, uint64_t *ssd_u, uint64_t *ssd_v );
+void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1,
+ pixel *pixuv2, int stride2, int width,
+ int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, int stride1,
pixel *pixuv2, int stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_avx( pixel *pixuv1, int stride1,
+void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, int stride1,
pixel *pixuv2, int stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
- const uint8_t *pix2, int stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, int stride1,
+ const uint8_t *pix2, int stride2, int sums[2][4] );
void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, int stride1,
const pixel *pix2, int stride2, int sums[2][4] );
-void x264_pixel_ssim_4x4x2_core_avx( const pixel *pix1, int stride1,
+void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, int stride1,
const pixel *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
-int x264_pixel_var2_8x8_mmxext( pixel *, int, pixel *, int, int * );
+int x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_vsad_mmxext( pixel *src, int stride, int height );
+int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
-DECL_ADS( 4, mmxext )
-DECL_ADS( 2, mmxext )
-DECL_ADS( 1, mmxext )
+DECL_ADS( 4, mmx2 )
+DECL_ADS( 2, mmx2 )
+DECL_ADS( 1, mmx2 )
DECL_ADS( 4, sse2 )
DECL_ADS( 2, sse2 )
DECL_ADS( 1, sse2 )
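The ads ("absolute differences of sums") routines prune motion-vector candidates cheaply before any full SAD is computed: a candidate survives when its DC-level difference plus its MV cost stays under the threshold. A scalar sketch of the size-1 variant, mirroring the prototype above but hedged as an illustration rather than the shipped C fallback:

    #include <stdint.h>
    #include <stdlib.h>

    /* Writes the indices of surviving candidates into 'mvs' and returns
     * how many survived.  'delta' is unused in the size-1 case. */
    static int ads1_sketch( int enc_dc[1], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs,
                            int width, int thresh )
    {
        int nmv = 0;
        (void)delta;
        for( int i = 0; i < width; i++ )
            if( abs( enc_dc[0] - sums[i] ) + cost_mvx[i] < thresh )
                mvs[nmv++] = i;
        return nmv;
    }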
PREDICT_4x4_DDL avx , dq, 2, w
INIT_MMX
%define PALIGNR PALIGNR_MMX
-cglobal predict_4x4_ddl_mmxext, 1,2
+cglobal predict_4x4_ddl_mmx2, 1,2
mova m1, [r0-2*FDEC_STRIDE+4]
mova m2, [r0-2*FDEC_STRIDE+0]
mova m3, [r0-2*FDEC_STRIDE+2]
RET
%else
INIT_MMX
-PREDICT_4x4_DDL mmxext, q , 8, b
+PREDICT_4x4_DDL mmx2, q, 8, b
%endif
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX
%define PALIGNR PALIGNR_MMX
-cglobal predict_4x4_ddr_mmxext, 1,1
+cglobal predict_4x4_ddr_mmx2, 1,1
movq m3, [r0+3*FDEC_STRIDEB-8]
psrlq m3, 48
PALIGNR m3, [r0+2*FDEC_STRIDEB-8], 6, m6
movd [r0+3*FDEC_STRIDEB+4], m1
RET
-cglobal predict_4x4_hd_mmxext, 1,1
+cglobal predict_4x4_hd_mmx2, 1,1
mova m0, [r0+1*FDEC_STRIDEB-8]
punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
mova m1, [r0+3*FDEC_STRIDEB-8]
INIT_XMM
%define PALIGNR PALIGNR_MMX
-PREDICT_4x4 sse2 , wd, dq, dq, w, qdq, 2
+PREDICT_4x4 sse2 , wd, dq, dq, w, qdq, 2
%define PALIGNR PALIGNR_SSSE3
-PREDICT_4x4 ssse3 , wd, dq, dq, w, qdq, 2
+PREDICT_4x4 ssse3, wd, dq, dq, w, qdq, 2
INIT_AVX
-PREDICT_4x4 avx , wd, dq, dq, w, qdq, 2
+PREDICT_4x4 avx , wd, dq, dq, w, qdq, 2
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
-PREDICT_4x4 mmxext, bw, wd, q , b, dq , 8
+PREDICT_4x4 mmx2 , bw, wd, q , b, dq , 8
%define PALIGNR PALIGNR_SSSE3
-PREDICT_4x4 ssse3 , bw, wd, q , b, dq , 8
+PREDICT_4x4 ssse3, bw, wd, q , b, dq , 8
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX
-cglobal predict_4x4_hu_mmxext, 1,1
+cglobal predict_4x4_hu_mmx2, 1,1
movq m0, [r0+0*FDEC_STRIDEB-4*2]
punpckhwd m0, [r0+1*FDEC_STRIDEB-4*2]
movq m1, [r0+2*FDEC_STRIDEB-4*2]
%else ; !HIGH_BIT_DEPTH
INIT_MMX
-cglobal predict_4x4_hu_mmxext, 1,1
+cglobal predict_4x4_hu_mmx2, 1,1
movq mm0, [r0+0*FDEC_STRIDE-8]
punpckhbw mm0, [r0+1*FDEC_STRIDE-8]
movq mm1, [r0+2*FDEC_STRIDE-8]
INIT_MMX
%define PALIGNR PALIGNR_MMX
-cglobal predict_4x4_vl_mmxext, 1,4
+cglobal predict_4x4_vl_mmx2, 1,4
mova m1, [r0-FDEC_STRIDEB+0]
mova m2, [r0-FDEC_STRIDEB+8]
mova m3, m2
RET
%else
INIT_MMX
-PREDICT_4x4_V1 mmxext, q , 8, b
+PREDICT_4x4_V1 mmx2, q, 8, b
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX
-cglobal predict_4x4_dc_mmxext, 1,1
+cglobal predict_4x4_dc_mmx2, 1,1
mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
%else
INIT_MMX
-cglobal predict_4x4_dc_mmxext, 1,4
+cglobal predict_4x4_dc_mmx2, 1,4
pxor mm7, mm7
movd mm0, [r0-FDEC_STRIDE]
psadbw mm0, mm7
%ifdef HIGH_BIT_DEPTH
INIT_XMM
%define PALIGNR PALIGNR_MMX
-PREDICT_FILTER sse2 , w, d, q, dq, 2
+PREDICT_FILTER sse2 , w, d, q, dq, 2
%define PALIGNR PALIGNR_SSSE3
-PREDICT_FILTER ssse3 , w, d, q, dq, 2
+PREDICT_FILTER ssse3, w, d, q, dq, 2
INIT_AVX
-PREDICT_FILTER avx , w, d, q, dq, 2
+PREDICT_FILTER avx , w, d, q, dq, 2
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
-PREDICT_FILTER mmxext, b, w, d, q , 8
+PREDICT_FILTER mmx2 , b, w, d, q , 8
%define PALIGNR PALIGNR_SSSE3
-PREDICT_FILTER ssse3 , b, w, d, q , 8
+PREDICT_FILTER ssse3, b, w, d, q , 8
%endif
;-----------------------------------------------------------------------------
PREDICT_8x8_V sse2
%else
INIT_MMX
-PREDICT_8x8_V mmxext
+PREDICT_8x8_V mmx2
%endif
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_XMM
-PREDICT_8x8_H sse2 , wd, D
+PREDICT_8x8_H sse2, wd, D
%else
INIT_MMX
-PREDICT_8x8_H mmxext, bw, W
+PREDICT_8x8_H mmx2, bw, W
%endif
;-----------------------------------------------------------------------------
%else
INIT_MMX
-cglobal predict_8x8_dc_mmxext, 2,2
+cglobal predict_8x8_dc_mmx2, 2,2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1+7]
RET
%endmacro
INIT_MMX
-PREDICT_8x8_DC predict_8x8_dc_top_mmxext, 16
-PREDICT_8x8_DC predict_8x8_dc_left_mmxext, 7
+PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
+PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
%endif ; HIGH_BIT_DEPTH
; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
%ifdef HIGH_BIT_DEPTH
INIT_XMM
-PREDICT_8x8 sse2 , w, dq, 2
+PREDICT_8x8 sse2, w, dq, 2
INIT_AVX
-PREDICT_8x8 avx , w, dq, 2
+PREDICT_8x8 avx , w, dq, 2
%elifndef ARCH_X86_64
INIT_MMX
-PREDICT_8x8 mmxext, b, q , 8
+PREDICT_8x8 mmx2, b, q , 8
%endif
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_XMM
%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_HU sse2 , w, dq, d, wd, 2
+PREDICT_8x8_HU sse2 , w, dq, d, wd, 2
%define PALIGNR PALIGNR_SSSE3
-PREDICT_8x8_HU ssse3 , w, dq, d, wd, 2
+PREDICT_8x8_HU ssse3, w, dq, d, wd, 2
INIT_AVX
-PREDICT_8x8_HU avx , w, dq, d, wd, 2
+PREDICT_8x8_HU avx , w, dq, d, wd, 2
%elifndef ARCH_X86_64
INIT_MMX
%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_HU mmxext, b, q , w, bw, 8
+PREDICT_8x8_HU mmx2 , b, q , w, bw, 8
%endif
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_XMM
%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_VR sse2 , w, dq, 2
+PREDICT_8x8_VR sse2 , w, dq, 2
%define PALIGNR PALIGNR_SSSE3
-PREDICT_8x8_VR ssse3 , w, dq, 2
+PREDICT_8x8_VR ssse3, w, dq, 2
INIT_AVX
-PREDICT_8x8_VR avx , w, dq, 2
+PREDICT_8x8_VR avx , w, dq, 2
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_VR mmxext, b, q , 8
+PREDICT_8x8_VR mmx2 , b, q , 8
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
INIT_MMX
-cglobal predict_8x8c_p_core_mmxext, 1,2
+cglobal predict_8x8c_p_core_mmx2, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
pmullw mm2, [pw_3210]
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
-cglobal predict_16x16_p_core_mmxext, 1,2
+cglobal predict_16x16_p_core_mmx2, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
%ifdef HIGH_BIT_DEPTH
INIT_XMM
%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_HD sse2 , w, dq, wd, 2
+PREDICT_8x8_HD sse2 , w, dq, wd, 2
%define PALIGNR PALIGNR_SSSE3
-PREDICT_8x8_HD ssse3 , w, dq, wd, 2
+PREDICT_8x8_HD ssse3, w, dq, wd, 2
INIT_AVX
-PREDICT_8x8_HD avx , w, dq, wd, 2
+PREDICT_8x8_HD avx , w, dq, wd, 2
%else
INIT_MMX
%define PALIGNR PALIGNR_MMX
-PREDICT_8x8_HD mmxext, b, q , bw, 8
+PREDICT_8x8_HD mmx2 , b, q , bw, 8
;-----------------------------------------------------------------------------
; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
INIT_MMX
%define SPLATB SPLATB_MMX
-PREDICT_8x8C_H mmxext
+PREDICT_8x8C_H mmx2
%define SPLATB SPLATB_SSSE3
PREDICT_8x8C_H ssse3
%endmacro
INIT_MMX
-PREDICT_8x8C_DC mmxext
+PREDICT_8x8C_DC mmx2
%ifdef HIGH_BIT_DEPTH
PREDICT_8x8C_DC sse2
%endif
%else
-cglobal predict_8x8c_dc_top_mmxext, 1,1
+cglobal predict_8x8c_dc_top_mmx2, 1,1
movq mm0, [r0 - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
INIT_MMX
%define SPLATB SPLATB_MMX
-PREDICT_16x16_H mmxext
+PREDICT_16x16_H mmx2
INIT_XMM
%ifdef HIGH_BIT_DEPTH
PREDICT_16x16_H sse2
%endmacro
INIT_MMX
-cglobal predict_16x16_dc_core_mmxext, 1,2
+cglobal predict_16x16_dc_core_mmx2, 1,2
%ifdef ARCH_X86_64
movd m6, r1d
PRED16x16_DC m6, 5
REP_RET
INIT_MMX
-cglobal predict_16x16_dc_top_mmxext, 1,2
+cglobal predict_16x16_dc_top_mmx2, 1,2
PRED16x16_DC [pw_8], 4
REP_RET
INIT_MMX
%ifdef HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core_mmxext, 1,2
+cglobal predict_16x16_dc_left_core_mmx2, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16x16 m0, m0, m0, m0
REP_RET
%else
-cglobal predict_16x16_dc_left_core_mmxext, 1,1
+cglobal predict_16x16_dc_left_core_mmx2, 1,1
movd m0, r1m
pshufw m0, m0, 0
packuswb m0, m0
void x264_predict_16x16_v_mmx( pixel *src );
void x264_predict_16x16_v_sse2( pixel *src );
- void x264_predict_16x16_h_mmxext( pixel *src );
+ void x264_predict_16x16_h_mmx2( pixel *src );
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
- void x264_predict_16x16_dc_core_mmxext( pixel *src, int i_dc_left );
+ void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
- void x264_predict_16x16_dc_left_core_mmxext( pixel *src, int i_dc_left );
+ void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
- void x264_predict_16x16_dc_top_mmxext( pixel *src );
+ void x264_predict_16x16_dc_top_mmx2( pixel *src );
void x264_predict_16x16_dc_top_sse2( pixel *src );
void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
- void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+ void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
- void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
- void x264_predict_8x8c_dc_mmxext( pixel *src );
+ void x264_predict_8x8c_dc_mmx2( pixel *src );
void x264_predict_8x8c_dc_sse2( uint16_t *src );
- void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
+ void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
void x264_predict_8x8c_v_mmx( pixel *src );
void x264_predict_8x8c_v_sse2( uint16_t *src );
- void x264_predict_8x8c_h_mmxext( uint8_t *src );
+ void x264_predict_8x8c_h_mmx2( uint8_t *src );
void x264_predict_8x8c_h_sse2( pixel *src );
void x264_predict_8x8c_h_ssse3( uint8_t *src );
- void x264_predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[33] );
- void x264_predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[33] );
- void x264_predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
- void x264_predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
- void x264_predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[33] );
- void x264_predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[33] );
- void x264_predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[33] );
- void x264_predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[33] );
- void x264_predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[33] );
- void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_vr_ssse3( uint16_t *src, uint16_t edge[33] );
void x264_predict_8x8_vr_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[33] );
void x264_predict_8x8_hd_avx( pixel *src, pixel edge[33] );
- void x264_predict_8x8_filter_mmxext( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+ void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[33], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[33], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[33], int i_neighbor, int i_filters );
- void x264_predict_4x4_ddl_mmxext( pixel *src );
+ void x264_predict_4x4_ddl_mmx2( pixel *src );
void x264_predict_4x4_ddl_sse2( uint16_t *src );
void x264_predict_4x4_ddl_avx( uint16_t *src );
- void x264_predict_4x4_ddr_mmxext( pixel *src );
- void x264_predict_4x4_vl_mmxext( pixel *src );
+ void x264_predict_4x4_ddr_mmx2( pixel *src );
+ void x264_predict_4x4_vl_mmx2( pixel *src );
void x264_predict_4x4_vl_sse2( uint16_t *src );
void x264_predict_4x4_vl_avx( uint16_t *src );
- void x264_predict_4x4_vr_mmxext( uint8_t *src );
+ void x264_predict_4x4_vr_mmx2( uint8_t *src );
void x264_predict_4x4_vr_sse2( uint16_t *src );
void x264_predict_4x4_vr_ssse3( pixel *src );
void x264_predict_4x4_vr_avx( uint16_t *src );
- void x264_predict_4x4_hd_mmxext( pixel *src );
+ void x264_predict_4x4_hd_mmx2( pixel *src );
void x264_predict_4x4_hd_sse2( uint16_t *src );
void x264_predict_4x4_hd_ssse3( pixel *src );
void x264_predict_4x4_hd_avx( uint16_t *src );
- void x264_predict_4x4_dc_mmxext( pixel *src );
+ void x264_predict_4x4_dc_mmx2( pixel *src );
void x264_predict_4x4_ddr_sse2( uint16_t *src );
void x264_predict_4x4_ddr_ssse3( pixel *src );
void x264_predict_4x4_ddr_avx( uint16_t *src );
- void x264_predict_4x4_hu_mmxext( pixel *src );
+ void x264_predict_4x4_hu_mmx2( pixel *src );
#define PREDICT_16x16_DC(name)\
static void x264_predict_16x16_dc_##name( pixel *src )\
x264_predict_16x16_dc_core_##name( src, dc );\
}
-PREDICT_16x16_DC( mmxext )
+PREDICT_16x16_DC( mmx2 )
PREDICT_16x16_DC( sse2 )
#define PREDICT_16x16_DC_LEFT(name)\
x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
}
-PREDICT_16x16_DC_LEFT( mmxext )
+PREDICT_16x16_DC_LEFT( mmx2 )
PREDICT_16x16_DC_LEFT( sse2 )
#define PREDICT_P_SUM(j,i)\
x264_predict_16x16_p_core_##name( src, i00, b, c );\
}
#ifndef ARCH_X86_64
-PREDICT_16x16_P( mmxext )
+PREDICT_16x16_P( mmx2 )
#endif
PREDICT_16x16_P( sse2 )
PREDICT_16x16_P( avx )
x264_predict_8x8c_p_core_##name( src, i00, b, c );\
}
#ifndef ARCH_X86_64
-PREDICT_8x8_P( mmxext )
+PREDICT_8x8_P( mmx2 )
#endif
PREDICT_8x8_P( sse2 )
INTRA_SA8D_X3(ssse3)
INTRA_SA8D_X3(avx)
#else
-INTRA_SA8D_X3(mmxext)
+INTRA_SA8D_X3(mmx2)
#endif
#endif // !HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_MMX) )
return;
pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx;
- if( cpu&X264_CPU_MMXEXT )
+ if( cpu&X264_CPU_MMX2 )
{
- pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmxext;
- pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmxext;
- pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmxext;
- pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmxext;
+ pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmx2;
+ pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmx2;
+ pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmx2;
+ pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2;
}
#if HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_SSE2) )
#endif
#else
#if !ARCH_X86_64
- pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmxext;
+ pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmx2;
#endif
if( !(cpu&X264_CPU_SSE2) )
return;
return;
#if HIGH_BIT_DEPTH
pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx;
- if( !(cpu&X264_CPU_MMXEXT) )
+ if( !(cpu&X264_CPU_MMX2) )
return;
- pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmxext;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse2;
pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
#endif
pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx;
- if( !(cpu&X264_CPU_MMXEXT) )
+ if( !(cpu&X264_CPU_MMX2) )
return;
- pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_mmxext;
- pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmxext;
+ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_mmx2;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2;
#if !ARCH_X86_64
- pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_mmxext;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_mmx2;
#endif
- pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmxext;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2;
void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
{
- if( !(cpu&X264_CPU_MMXEXT) )
+ if( !(cpu&X264_CPU_MMX2) )
return;
#if HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_SSE2) )
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
*predict_8x8_filter = x264_predict_8x8_filter_avx;
#else
- pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmxext;
- pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmxext;
- pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_mmxext;
- pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmxext;
- pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmxext;
- pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmxext;
- *predict_8x8_filter = x264_predict_8x8_filter_mmxext;
+ pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmx2;
+ pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmx2;
+ pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_mmx2;
+ pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2;
+ pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmx2;
+ *predict_8x8_filter = x264_predict_8x8_filter_mmx2;
#if ARCH_X86
- pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmxext;
- pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_mmxext;
- pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_mmxext;
- pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_mmxext;
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmx2;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_mmx2;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_mmx2;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_mmx2;
#endif
if( !(cpu&X264_CPU_SSE2) )
return;
void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
{
- if( !(cpu&X264_CPU_MMXEXT) )
+ if( !(cpu&X264_CPU_MMX2) )
return;
- pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_mmxext;
- pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
- pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmxext;
- pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_mmxext;
- pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_mmxext;
- pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_mmxext;
+ pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_mmx2;
+ pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmx2;
+ pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmx2;
+ pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_mmx2;
+ pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_mmx2;
+ pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_mmx2;
#if HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_SSE2) )
return;
#endif
pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_avx;
#else
- pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmxext;
+ pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmx2;
#endif // HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_SSSE3) )
return;
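All of these init functions follow the same shape: overwrite function-table slots as each CPU flag is confirmed, bailing out early once a required level is missing. A hedged usage sketch; the pixel/x264_predict_t typedefs and the enum value are assumptions stated in the comments, while the init signature matches the one above:

    #include <stdint.h>

    typedef uint8_t pixel;                        /* 8-bit build assumed */
    typedef void (*x264_predict_t)( pixel *src ); /* per predict.h */
    enum { I_PRED_4x4_DC = 2 };                   /* illustrative value only */

    uint32_t x264_cpu_detect( void );             /* from cpu.h */
    void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] );

    static void predict_dc_example( pixel *fdec )
    {
        x264_predict_t pf[12] = {0};
        x264_predict_4x4_init_mmx( x264_cpu_detect(), pf );
        if( pf[I_PRED_4x4_DC] )   /* the real encoder installs C fallbacks first */
            pf[I_PRED_4x4_DC]( fdec );
    }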
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
%define QUANT_DC_START QUANT_DC_START_MMX
-QUANT_DC quant_2x2_dc_mmxext, 1
+QUANT_DC quant_2x2_dc_mmx2, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
-QUANT_DC quant_4x4_dc_mmxext, 4
+QUANT_DC quant_4x4_dc_mmx2, 4
QUANT_AC quant_4x4_mmx, 4
QUANT_AC quant_8x8_mmx, 16
%endif
%ifdef HIGH_BIT_DEPTH
INIT_XMM
-DEQUANT_DC sse2 , d
-DEQUANT_DC sse4 , d
+DEQUANT_DC sse2, d
+DEQUANT_DC sse4, d
INIT_AVX
-DEQUANT_DC avx , d
+DEQUANT_DC avx, d
%else
INIT_MMX
-DEQUANT_DC mmxext, w
+DEQUANT_DC mmx2, w
INIT_XMM
-DEQUANT_DC sse2 , w
+DEQUANT_DC sse2, w
INIT_AVX
-DEQUANT_DC avx , w
+DEQUANT_DC avx, w
%endif
; t4 is eax for return value.
%ifndef ARCH_X86_64
INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE4x4 15, mmxext, 0, 0
-DECIMATE4x4 16, mmxext, 0, 0
-DECIMATE4x4 15, mmxext_slowctz, 1, 0
-DECIMATE4x4 16, mmxext_slowctz, 1, 0
+DECIMATE4x4 15, mmx2, 0, 0
+DECIMATE4x4 16, mmx2, 0, 0
+DECIMATE4x4 15, mmx2_slowctz, 1, 0
+DECIMATE4x4 16, mmx2_slowctz, 1, 0
%endif
INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
RET
%else ; ARCH
-%ifidn %1, mmxext
+%ifidn %1, mmx2
cglobal decimate_score64_%1, 1,6
%else
cglobal decimate_score64_%1, 1,5
%ifndef ARCH_X86_64
INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE8x8 mmxext, 0
+DECIMATE8x8 mmx2, 0
%endif
INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
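decimate_score implements the usual single-coefficient-elimination heuristic: scan back from the last nonzero coefficient; any |level| > 1 makes the block worth keeping outright, while trailing +/-1 levels add a small weight keyed to the zero-run in front of them. A hedged scalar sketch using the conventional JM-style run table (the shipped tables and thresholds may differ in detail):

    #include <stdint.h>
    #include <stdlib.h>

    /* Run weights for runs 0..5, zero beyond; valid for i_max <= 16. */
    static const uint8_t run_weight[16] =
        { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    static int decimate_score_sketch( const int16_t *dct, int i_max )
    {
        int idx = i_max - 1, score = 0;
        while( idx >= 0 && !dct[idx] )
            idx--;                      /* find the last nonzero */
        while( idx >= 0 )
        {
            int run = 0;
            if( abs( dct[idx--] ) > 1 )
                return 9;               /* too large: never decimate */
            while( idx >= 0 && !dct[idx] )
                run++, idx--;
            score += run_weight[run];
        }
        return score;
    }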
%endmacro
%define LAST LAST_X86
-COEFF_LAST4 mmxext
+COEFF_LAST4 mmx2
%define LAST LAST_SSE4A
-COEFF_LAST4 mmxext_lzcnt
+COEFF_LAST4 mmx2_lzcnt
%else ; !HIGH_BIT_DEPTH
%macro LAST_MASK4_MMX 2-3
%endmacro
%define LAST LAST_X86
-COEFF_LAST4 mmxext
+COEFF_LAST4 mmx2
%define LAST LAST_SSE4A
-COEFF_LAST4 mmxext_lzcnt
+COEFF_LAST4 mmx2_lzcnt
%endif ; HIGH_BIT_DEPTH
%macro COEFF_LAST 1
%ifndef ARCH_X86_64
INIT_MMX
%define LAST_MASK LAST_MASK_MMX
-COEFF_LAST mmxext
+COEFF_LAST mmx2
%endif
INIT_XMM
%define LAST_MASK LAST_MASK_SSE2
%define LZCOUNT LZCOUNT_X86
%ifndef ARCH_X86_64
%define LAST_MASK LAST_MASK_MMX
-COEFF_LEVELRUN mmxext, 15
-COEFF_LEVELRUN mmxext, 16
+COEFF_LEVELRUN mmx2, 15
+COEFF_LEVELRUN mmx2, 16
%endif
%define LAST_MASK LAST_MASK4_MMX
-COEFF_LEVELRUN mmxext, 4
+COEFF_LEVELRUN mmx2, 4
INIT_XMM
%define LAST_MASK LAST_MASK_SSE2
COEFF_LEVELRUN sse2, 15
COEFF_LEVELRUN sse2_lzcnt, 16
INIT_MMX
%define LAST_MASK LAST_MASK4_MMX
-COEFF_LEVELRUN mmxext_lzcnt, 4
+COEFF_LEVELRUN mmx2_lzcnt, 4
#ifndef X264_I386_QUANT_H
#define X264_I386_QUANT_H
-int x264_quant_2x2_dc_mmxext( dctcoef dct[4], int mf, int bias );
-int x264_quant_4x4_dc_mmxext( dctcoef dct[16], int mf, int bias );
+int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
+int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_sse2( dctcoef dct[4], int mf, int bias );
int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
-int x264_decimate_score15_mmxext( dctcoef *dct );
-int x264_decimate_score15_sse2 ( dctcoef *dct );
-int x264_decimate_score15_ssse3 ( dctcoef *dct );
-int x264_decimate_score16_mmxext( dctcoef *dct );
-int x264_decimate_score16_sse2 ( dctcoef *dct );
-int x264_decimate_score16_ssse3 ( dctcoef *dct );
-int x264_decimate_score15_mmxext_slowctz( dctcoef *dct );
-int x264_decimate_score15_sse2_slowctz ( dctcoef *dct );
-int x264_decimate_score15_ssse3_slowctz ( dctcoef *dct );
-int x264_decimate_score16_mmxext_slowctz( dctcoef *dct );
-int x264_decimate_score16_sse2_slowctz ( dctcoef *dct );
-int x264_decimate_score16_ssse3_slowctz ( dctcoef *dct );
-int x264_decimate_score64_mmxext( dctcoef *dct );
-int x264_decimate_score64_sse2 ( dctcoef *dct );
-int x264_decimate_score64_ssse3 ( dctcoef *dct );
-int x264_coeff_last4_mmxext( dctcoef *dct );
-int x264_coeff_last15_mmxext( dctcoef *dct );
-int x264_coeff_last16_mmxext( dctcoef *dct );
-int x264_coeff_last64_mmxext( dctcoef *dct );
+int x264_decimate_score15_mmx2( dctcoef *dct );
+int x264_decimate_score15_sse2( dctcoef *dct );
+int x264_decimate_score15_ssse3( dctcoef *dct );
+int x264_decimate_score16_mmx2( dctcoef *dct );
+int x264_decimate_score16_sse2( dctcoef *dct );
+int x264_decimate_score16_ssse3( dctcoef *dct );
+int x264_decimate_score15_mmx2_slowctz( dctcoef *dct );
+int x264_decimate_score15_sse2_slowctz( dctcoef *dct );
+int x264_decimate_score15_ssse3_slowctz( dctcoef *dct );
+int x264_decimate_score16_mmx2_slowctz( dctcoef *dct );
+int x264_decimate_score16_sse2_slowctz( dctcoef *dct );
+int x264_decimate_score16_ssse3_slowctz( dctcoef *dct );
+int x264_decimate_score64_mmx2( dctcoef *dct );
+int x264_decimate_score64_sse2( dctcoef *dct );
+int x264_decimate_score64_ssse3( dctcoef *dct );
+int x264_coeff_last4_mmx2( dctcoef *dct );
+int x264_coeff_last15_mmx2( dctcoef *dct );
+int x264_coeff_last16_mmx2( dctcoef *dct );
+int x264_coeff_last64_mmx2( dctcoef *dct );
int x264_coeff_last15_sse2( dctcoef *dct );
int x264_coeff_last16_sse2( dctcoef *dct );
int x264_coeff_last64_sse2( dctcoef *dct );
-int x264_coeff_last4_mmxext_lzcnt( dctcoef *dct );
+int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct );
int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
-int x264_coeff_level_run16_mmxext( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_mmxext( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run4_mmxext( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run4_mmxext_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
#endif
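Behind these prototypes, coeff_last returns the index of the last nonzero coefficient in a block (what CAVLC/CABAC need before coding trailing information), and coeff_level_run additionally extracts the (level, run) pairs. The scalar idea behind coeff_last is just a backwards scan; a sketch:

    #include <stdint.h>

    /* Index of the last nonzero entry, or -1 for an all-zero block.
     * dctcoef is int16_t at 8-bit depth. */
    static int coeff_last_sketch( const int16_t *l, int i_count )
    {
        int i_last = i_count - 1;
        while( i_last >= 0 && !l[i_last] )
            i_last--;
        return i_last;
    }

The _lzcnt variants compute the same result using the LZCNT instruction, and are selected only on CPUs that report X264_CPU_LZCNT.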
; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SAD 2
-cglobal pixel_sad_%1x%2_mmxext, 4,4
+cglobal pixel_sad_%1x%2_mmx2, 4,4
pxor mm0, mm0
%rep %2/2
SAD_INC_2x%1P
%ifndef ARCH_X86_64
INIT_MMX
-cglobal pixel_vsad_mmxext, 3,3
+cglobal pixel_vsad_mmx2, 3,3
mova m0, [r0]
mova m1, [r0+8]
mova m2, [r0+r1]
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
-cglobal intra_sad_x3_4x4_mmxext, 3,3
+cglobal intra_sad_x3_4x4_mmx2, 3,3
pxor mm7, mm7
movd mm0, [r1-FDEC_STRIDE]
movd mm1, [r0+FENC_STRIDE*0]
%endmacro
INIT_MMX
-cglobal intra_sad_x3_8x8_mmxext, 3,3
+cglobal intra_sad_x3_8x8_mmx2, 3,3
movq m7, [r1+7]
pxor m0, m0
movq m6, [r1+16] ;V prediction
%endmacro
INIT_MMX
-INTRA_SAD_8x8C mmxext
+INTRA_SAD_8x8C mmx2
INTRA_SAD_8x8C ssse3
INIT_MMX
%define SPLATB SPLATB_MMX
-INTRA_SAD16 mmxext
+INTRA_SAD16 mmx2
INIT_XMM
INTRA_SAD16 sse2, 8
%define SPLATB SPLATB_SSSE3
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
-cglobal pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
+cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
%ifdef WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
mov eax, r2m
and eax, 0x17|%1|(%4>>1)
cmp eax, 0x10|%1|(%4>>1)
- jle pixel_sad_%1x%2_mmxext
+ jle pixel_sad_%1x%2_mmx2
and eax, 7
shl eax, 3
movd mm6, [sw_64]
%endmacro
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal pixel_sad_16x%1_cache%2_mmxext
+cglobal pixel_sad_16x%1_cache%2_mmx2
SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
movq mm1, [r2]
%endmacro
%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal pixel_sad_8x%1_cache%2_mmxext
+cglobal pixel_sad_8x%1_cache%2_mmx2
SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
movq mm1, [r2+8]
SAD8_CACHELINE_FUNC_MMX2 16, 64
%ifndef ARCH_X86_64
-SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2
%endif ; !ARCH_X86_64
-SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2
%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC sse2, 8
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
-SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmxext, sse2
+SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2
SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
INIT_MMX
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
-SAD_MMX 16, 16, 1, mmxext
-SAD_MMX 16, 8, 1, mmxext
-SAD_MMX 8, 16, 2, mmxext
-SAD_MMX 8, 8, 2, mmxext
-SAD_MMX 8, 4, 2, mmxext
-SAD_MMX 4, 8, 2, mmxext
-SAD_MMX 4, 4, 2, mmxext
+SAD_MMX 16, 16, 1, mmx2
+SAD_MMX 16, 8, 1, mmx2
+SAD_MMX 8, 16, 2, mmx2
+SAD_MMX 8, 8, 2, mmx2
+SAD_MMX 8, 4, 2, mmx2
+SAD_MMX 4, 8, 2, mmx2
+SAD_MMX 4, 4, 2, mmx2
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
SAD_MMX 4, 8, 2, ssse3
%define XMM_REGS 0
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
-SAD_X 3, 16, 16, mmxext
-SAD_X 3, 16, 8, mmxext
-SAD_X 3, 8, 16, mmxext
-SAD_X 3, 8, 8, mmxext
-SAD_X 3, 8, 4, mmxext
-SAD_X 3, 4, 8, mmxext
-SAD_X 3, 4, 4, mmxext
-SAD_X 4, 16, 16, mmxext
-SAD_X 4, 16, 8, mmxext
-SAD_X 4, 8, 16, mmxext
-SAD_X 4, 8, 8, mmxext
-SAD_X 4, 8, 4, mmxext
-SAD_X 4, 4, 8, mmxext
-SAD_X 4, 4, 4, mmxext
+SAD_X 3, 16, 16, mmx2
+SAD_X 3, 16, 8, mmx2
+SAD_X 3, 8, 16, mmx2
+SAD_X 3, 8, 8, mmx2
+SAD_X 3, 8, 4, mmx2
+SAD_X 3, 4, 8, mmx2
+SAD_X 3, 4, 4, mmx2
+SAD_X 4, 16, 16, mmx2
+SAD_X 4, 16, 8, mmx2
+SAD_X 4, 8, 16, mmx2
+SAD_X 4, 8, 8, mmx2
+SAD_X 4, 8, 4, mmx2
+SAD_X 4, 4, 8, mmx2
+SAD_X 4, 4, 4, mmx2
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
SAD_X 3, 4, 8, ssse3
#if HAVE_X86_INLINE_ASM && HAVE_MMX
-#define x264_median_mv x264_median_mv_mmxext
-static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
+#define x264_median_mv x264_median_mv_mmx2
+static ALWAYS_INLINE void x264_median_mv_mmx2( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
{
asm(
"movd %1, %%mm0 \n"
);
}
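The routine above is a branchless per-component median of three candidate motion vectors. A scalar equivalent with hypothetical helper names, as a sketch rather than the shipped fallback:

    #include <stdint.h>

    static int median3( int a, int b, int c )
    {
        int mn = a < b ? a : b;
        int mx = a < b ? b : a;
        if( c < mn ) return mn;   /* c below both: median is the smaller */
        if( c > mx ) return mx;   /* c above both: median is the larger */
        return c;                 /* otherwise c itself is the median */
    }

    static void median_mv_sketch( int16_t *dst, const int16_t *a,
                                  const int16_t *b, const int16_t *c )
    {
        dst[0] = median3( a[0], b[0], c[0] );   /* x component */
        dst[1] = median3( a[1], b[1], c[1] );   /* y component */
    }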
-#define x264_predictor_difference x264_predictor_difference_mmxext
-static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
+#define x264_predictor_difference x264_predictor_difference_mmx2
+static ALWAYS_INLINE int x264_predictor_difference_mmx2( int16_t (*mvc)[2], intptr_t i_mvc )
{
int sum;
static const uint64_t pw_1 = 0x0001000100010001ULL;
return sum;
}
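predictor_difference feeds the motion-estimation early-out heuristics: it measures how much the candidate predictors disagree by summing component-wise absolute differences between consecutive vectors. A scalar sketch under the same signature:

    #include <stdint.h>
    #include <stdlib.h>

    static int predictor_difference_sketch( int16_t (*mvc)[2], intptr_t i_mvc )
    {
        int sum = 0;
        for( intptr_t i = 0; i + 1 < i_mvc; i++ )
            sum += abs( mvc[i][0] - mvc[i+1][0] )
                 + abs( mvc[i][1] - mvc[i+1][1] );
        return sum;
    }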
-#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
-static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
+#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmx2
+static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t *mvdtop)
{
static const uint64_t pb_2 = 0x0202020202020202ULL;
static const uint64_t pb_32 = 0x2020202020202020ULL;
return amvd;
}
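cabac_mvd_sum buckets the summed left+top MV differences into the three CABAC context classes per component; the pb_2 and pb_32 constants above are the per-byte comparison thresholds. A plausible scalar reading, hedged as a sketch:

    #include <stdint.h>

    static uint16_t cabac_mvd_sum_sketch( const uint8_t *mvdleft,
                                          const uint8_t *mvdtop )
    {
        int x = mvdleft[0] + mvdtop[0];   /* summed |mvd|, x component */
        int y = mvdleft[1] + mvdtop[1];   /* summed |mvd|, y component */
        x = (x > 2) + (x > 32);           /* context bucket: 0, 1 or 2 */
        y = (y > 2) + (y > 32);
        return x + (y << 8);              /* x in the low byte, y in the high */
    }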
-#define x264_predictor_roundclip x264_predictor_roundclip_mmxext
-static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+#define x264_predictor_roundclip x264_predictor_roundclip_mmx2
+static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
{
uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min );
uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
{
x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n");
#else
- if( b_open && !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+ if( b_open && !(x264_cpu_detect() & X264_CPU_MMX2) )
{
x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
#endif
int ret = 0;
int cpu0 = 0, cpu1 = 0;
#if HAVE_MMX
- if( x264_cpu_detect() & X264_CPU_MMXEXT )
+ if( x264_cpu_detect() & X264_CPU_MMX2 )
{
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMX" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMX2, "MMX" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
cpu1 &= ~X264_CPU_CACHELINE_64;
#if ARCH_X86
#define X264_CPU_CACHELINE_64 0x000002 /* 32/64 is the size of a cacheline in bytes */
#define X264_CPU_ALTIVEC 0x000004
#define X264_CPU_MMX 0x000008
-#define X264_CPU_MMXEXT 0x000010 /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_MMX2 0x000010 /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_MMXEXT X264_CPU_MMX2
#define X264_CPU_SSE 0x000020
#define X264_CPU_SSE2 0x000040
#define X264_CPU_SSE2_IS_SLOW 0x000080 /* avoid most SSE2 functions on Athlon64 */
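The X264_CPU_MMXEXT alias keeps out-of-tree callers compiling across the rename; both spellings test the same bit. A minimal check, assuming it is compiled against this header:

    #include <assert.h>
    #include <stdint.h>

    uint32_t x264_cpu_detect( void );   /* from cpu.h */

    void check_cpu_alias( void )
    {
        uint32_t cpu = x264_cpu_detect();
        /* Legacy spelling and new spelling are interchangeable. */
        assert( !!(cpu & X264_CPU_MMXEXT) == !!(cpu & X264_CPU_MMX2) );
    }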