ifneq ($(AS),)
X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
- cpu-32.asm dct-32.asm
+ cpu-a.asm dct-32.asm
X86SRC = $(X86SRC0:%=common/x86/%)
ifeq ($(ARCH),X86)
{"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
+ {"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"", 0},
};
+
#ifdef HAVE_MMX
extern int x264_cpu_cpuid_test( void );
+extern void x264_cpu_mask_misalign_sse( void );
extern uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
uint32_t x264_cpu_detect( void )
if( cpu & X264_CPU_SSE2 )
{
if( ecx&0x00000040 ) /* SSE4a */
+ {
cpu |= X264_CPU_SSE2_IS_FAST;
+ cpu |= X264_CPU_SSE_MISALIGN;
+ x264_cpu_mask_misalign_sse();
+ }
else
cpu |= X264_CPU_SSE2_IS_SLOW;
}
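The check above keys off the SSE4a bit (CPUID leaf 0x80000001, ECX bit 6): the CPUs that report SSE4a (Phenom, family 10h) are also the ones with misaligned-SSE support. For reference, a minimal sketch of probing AMD's dedicated MisAlignSse capability bit (ECX bit 7 of the same leaf) directly, reusing the x264_cpu_cpuid() helper declared above; the function name is illustrative, not part of the patch:

#include <stdint.h>

/* Hypothetical helper: query the MisAlignSse bit (CPUID 0x80000001, ECX bit 7)
 * instead of inferring the capability from SSE4a (bit 6) as the code above does. */
static int cpu_has_misalign_sse( void )
{
    uint32_t eax, ebx, ecx, edx;
    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
    if( eax < 0x80000001 )      /* extended leaf not available */
        return 0;
    x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
    return !!( ecx & 0x00000080 );
}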
INIT2( sad_x4, _cache64_sse2 );
}
#endif
+ if( cpu&X264_CPU_SSE_MISALIGN )
+ {
+ INIT2( sad_x3, _sse2_misalign );
+ INIT2( sad_x4, _sse2_misalign );
+ }
}
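For context: INIT2 wires up only the 16x16 and 16x8 block sizes, which is exactly the set of *_sse2_misalign SAD kernels instantiated in sad-a.asm further down. A paraphrased sketch of what the pixel.c macro expands to (names here are illustrative, not the exact macro text):

/* Paraphrase of pixel.c's INIT2: route the two largest block sizes to the
 * given cpu-suffixed kernels. */
#define INIT2_SKETCH( name, cpu ) \
    pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu; \
    pixf->name[PIXEL_16x8]  = x264_pixel_##name##_16x8##cpu;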
if( cpu&X264_CPU_SSE2 )
{
+++ /dev/null
-;*****************************************************************************
-;* cpu-64.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2003-2008 x264 project
-;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
-;* Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
-;*****************************************************************************
-
-%include "x86inc.asm"
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
-;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid
- push rbx
- mov r10, r3
- mov r11, r2
- mov r9, r1
- mov eax, r0d
- cpuid
- mov [r9], eax
- mov [r11], ebx
- mov [r10], ecx
- mov [r8], edx
- pop rbx
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_emms( void )
-;-----------------------------------------------------------------------------
-cglobal x264_emms
- emms
- ret
-
SECTION .text
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+;-----------------------------------------------------------------------------
+cglobal x264_cpu_cpuid
+ push rbx
+ mov r10, r3
+ mov r11, r2
+ mov r9, r1
+ mov eax, r0d
+ cpuid
+ mov [r9], eax
+ mov [r11], ebx
+ mov [r10], ecx
+ mov [r8], edx
+ pop rbx
+ ret
+
+%else
+
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid_test( void )
; return 0 if unsupported
mov [esi], edx
RET
-;-----------------------------------------------------------------------------
-; void x264_emms( void )
-;-----------------------------------------------------------------------------
-cglobal x264_emms
- emms
- ret
-
;-----------------------------------------------------------------------------
; void x264_stack_align( void (*func)(void*), void *arg );
;-----------------------------------------------------------------------------
call ecx
leave
ret
+%endif
+
+;-----------------------------------------------------------------------------
+; void x264_emms( void )
+;-----------------------------------------------------------------------------
+cglobal x264_emms
+ emms
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_cpu_mask_misalign_sse(void)
+;-----------------------------------------------------------------------------
+cglobal x264_cpu_mask_misalign_sse
+ sub rsp, 4
+ stmxcsr [rsp]
+ or dword [rsp], 1<<17
+ ldmxcsr [rsp]
+ add rsp, 4
+ ret
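A C-level sketch of what x264_cpu_mask_misalign_sse does, using the MXCSR intrinsics from <xmmintrin.h> (the C function name is illustrative): bit 17 of MXCSR is AMD's MM (misaligned exception mask) flag, and it must be set before any of the misalign kernels below may use unaligned memory operands.

#include <xmmintrin.h>

/* Illustrative C equivalent of x264_cpu_mask_misalign_sse: set MXCSR bit 17
 * (AMD's MM flag) so that SSE instructions tolerate misaligned memory operands. */
static void mask_misalign_sse( void )
{
    _mm_setcsr( _mm_getcsr() | (1u << 17) );
}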
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w20_sse2, 6,7
+%macro AVG2_W20 1
+cglobal x264_pixel_avg2_w20_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
movdqu xmm0, [r2]
movdqu xmm2, [r2+r3]
- movdqu xmm1, [r2+r4]
- movdqu xmm3, [r2+r6]
movd mm4, [r2+16]
movd mm5, [r2+r3+16]
+%ifidn %1, sse2_misalign
+ pavgb xmm0, [r2+r4]
+ pavgb xmm2, [r2+r6]
+%else
+ movdqu xmm1, [r2+r4]
+ movdqu xmm3, [r2+r6]
pavgb xmm0, xmm1
pavgb xmm2, xmm3
+%endif
pavgb mm4, [r2+r4+16]
pavgb mm5, [r2+r6+16]
movdqa [r0], xmm0
sub r5d, 2
jg .height_loop
REP_RET
+%endmacro
+
+AVG2_W20 sse2
+AVG2_W20 sse2_misalign
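The only change in the misalign variant is that the two movdqu loads are folded into pavgb's memory operands, dropping two instructions per loop iteration. As a reference for what either variant computes, a plain-C sketch of the w20 kernel; the prototype is inferred from the register usage in the asm (r0=dst, r1=dst stride, r2=src1, r3=src stride, r4=src2, r5=height), so treat it as an assumption:

#include <stdint.h>

/* Reference behaviour of x264_pixel_avg2_w20_*: each output row is the rounded
 * byte-wise average (pavgb semantics: (a+b+1)>>1) of two 20-pixel source rows
 * sharing one stride. */
static void pixel_avg2_w20_ref( uint8_t *dst, int i_dst,
                                uint8_t *src1, int i_src,
                                uint8_t *src2, int height )
{
    for( int y = 0; y < height; y++ )
    {
        for( int x = 0; x < 20; x++ )
            dst[x] = (uint8_t)( ( src1[x] + src2[x] + 1 ) >> 1 );
        dst  += i_dst;
        src1 += i_src;
        src2 += i_src;
    }
}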
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
%define tpw_32 [pw_32 GLOBAL]
%endif
.loop:
+%ifidn %1,sse2_misalign
+ movu m0, [src-4]
+ movu m1, [src-2]
+ mova m2, [src]
+ paddw m0, [src+6]
+ paddw m1, [src+4]
+ paddw m2, [src+2]
+%else
mova m6, [src-16]
mova m2, [src]
mova m3, [src+16]
paddw m2, m3
paddw m1, m4
paddw m0, m5
+%endif
FILT_H m0, m1, m2
paddw m0, tpw_32
psraw m0, 6
jl .loop
REP_RET
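In the misalign path above, the unaligned loads and paddw memory operands build the three symmetric tap pairs of the 6-tap half-pel filter straight from memory (byte offsets -4/+6, -2/+4, 0/+2 are element offsets -2/+3, -1/+2, 0/+1 on the 16-bit intermediate data), so no aligned-load-plus-PALIGNR shuffling is needed. A scalar sketch of the per-element arithmetic, assuming FILT_H reduces to roughly (a - 5b + 20c)/16 (the macro is defined elsewhere in mc-a2.asm with cascaded arithmetic shifts, so its exact rounding differs slightly):

#include <stdint.h>

/* Scalar sketch of one output element of the half-pel "c" plane; t[] are the
 * 16-bit intermediate samples produced by the vertical filter pass. */
static int16_t hpel_filter_c_scalar( const int16_t *t )
{
    int a = t[-2] + t[3];            /* m0: outer tap pair   */
    int b = t[-1] + t[2];            /* m1: middle tap pair  */
    int c = t[ 0] + t[1];            /* m2: inner tap pair   */
    int v = ( a - 5*b + 20*c ) / 16; /* FILT_H (approximate) */
    return (int16_t)( ( v + 32 ) >> 6 ); /* paddw tpw_32; psraw 6 */
}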
+%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
jl .loop
REP_RET
-
+%endif
%define PALIGNR PALIGNR_MMX
-HPEL_V sse2
+%ifndef ARCH_X86_64
HPEL_C sse2
+%endif
+HPEL_V sse2
+HPEL_C sse2_misalign
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
+PIXEL_AVG_WALL(sse2_misalign)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
#define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3
+#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w16_sse2
PIXEL_AVG_WTAB(mmxext, mmxext, mmxext, mmxext, mmxext, mmxext)
#ifdef ARCH_X86
PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext)
PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
#endif
-PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
+PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
+PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
#define MC_COPY_WTAB(instr, name1, name2, name3)\
GET_REF(cache64_mmxext)
#endif
GET_REF(sse2)
+GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, sse2, ssse3, ssse3)
#endif
+HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
+ if( cpu&X264_CPU_SSE_MISALIGN )
+ pf->hpel_filter = x264_hpel_filter_sse2_misalign;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
pf->mc_chroma = x264_mc_chroma_sse2;
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
}
+ if( cpu&X264_CPU_SSE_MISALIGN )
+ pf->get_ref = get_ref_sse2_misalign;
}
if( !(cpu&X264_CPU_SSSE3) )
DECL_X1( sad, mmxext )
DECL_X1( sad, sse2 )
+DECL_X4( sad, sse2_misalign )
DECL_X1( sad, sse3 )
DECL_X1( sad, sse2_aligned )
DECL_X4( sad, mmxext )
RET
%endmacro
+%macro SAD_X3_START_1x16P_SSE2_MISALIGN 0
+ movdqa xmm2, [r0]
+ movdqu xmm0, [r1]
+ movdqu xmm1, [r2]
+ psadbw xmm0, xmm2
+ psadbw xmm1, xmm2
+ psadbw xmm2, [r3]
+%endmacro
+
+%macro SAD_X3_1x16P_SSE2_MISALIGN 2
+ movdqa xmm3, [r0+%1]
+ movdqu xmm4, [r1+%2]
+ movdqu xmm5, [r2+%2]
+ psadbw xmm4, xmm3
+ psadbw xmm5, xmm3
+ psadbw xmm3, [r3+%2]
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm3
+%endmacro
+
+%macro SAD_X4_START_1x16P_SSE2_MISALIGN 0
+ movdqa xmm3, [r0]
+ movdqu xmm0, [r1]
+ movdqu xmm1, [r2]
+ movdqu xmm2, [r3]
+ psadbw xmm0, xmm3
+ psadbw xmm1, xmm3
+ psadbw xmm2, xmm3
+ psadbw xmm3, [r4]
+%endmacro
+
+%macro SAD_X4_1x16P_SSE2_MISALIGN 2
+ movdqa xmm7, [r0+%1]
+ movdqu xmm4, [r1+%2]
+ movdqu xmm5, [r2+%2]
+ movdqu xmm6, [r3+%2]
+ psadbw xmm4, xmm7
+ psadbw xmm5, xmm7
+ psadbw xmm6, xmm7
+ psadbw xmm7, [r4+%2]
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm6
+ paddw xmm3, xmm7
+%endmacro
+
+%macro SAD_X3_2x16P_SSE2_MISALIGN 1
+%if %1
+ SAD_X3_START_1x16P_SSE2_MISALIGN
+%else
+ SAD_X3_1x16P_SSE2_MISALIGN 0, 0
+%endif
+ SAD_X3_1x16P_SSE2_MISALIGN FENC_STRIDE, r4
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r4]
+ lea r2, [r2+2*r4]
+ lea r3, [r3+2*r4]
+%endmacro
+
+%macro SAD_X4_2x16P_SSE2_MISALIGN 1
+%if %1
+ SAD_X4_START_1x16P_SSE2_MISALIGN
+%else
+ SAD_X4_1x16P_SSE2_MISALIGN 0, 0
+%endif
+ SAD_X4_1x16P_SSE2_MISALIGN FENC_STRIDE, r5
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r5]
+ lea r2, [r2+2*r5]
+ lea r3, [r3+2*r5]
+ lea r4, [r4+2*r5]
+%endmacro
+
;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, int i_stride, int scores[3] )
SAD_X%1_END_SSE2
%endmacro
+%macro SAD_X_SSE2_MISALIGN 4
+cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1
+ SAD_X%1_2x%2P_SSE2_MISALIGN 1
+%rep %3/2-1
+ SAD_X%1_2x%2P_SSE2_MISALIGN 0
+%endrep
+ SAD_X%1_END_SSE2
+%endmacro
+
SAD_X_SSE2 3, 16, 16, sse2
SAD_X_SSE2 3, 16, 8, sse2
SAD_X_SSE2 3, 8, 16, sse2
SAD_X_SSE2 4, 8, 8, sse2
SAD_X_SSE2 4, 8, 4, sse2
+SAD_X_SSE2_MISALIGN 3, 16, 16, sse2
+SAD_X_SSE2_MISALIGN 3, 16, 8, sse2
+SAD_X_SSE2_MISALIGN 4, 16, 16, sse2
+SAD_X_SSE2_MISALIGN 4, 16, 8, sse2
+
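For reference, the x3/x4 entry points implemented by these macros compute several SADs against one encode block per call; a plain-C sketch of the 16x16 x3 variant, following the prototype in the comment above (FENC_STRIDE is x264's fixed encode-block stride of 16):

#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 16

/* Reference behaviour of x264_pixel_sad_x3_16x16: SAD of the encode block
 * against three candidate predictions that share a single stride. */
static void sad_x3_16x16_ref( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
                              uint8_t *pix2, int i_stride, int scores[3] )
{
    scores[0] = scores[1] = scores[2] = 0;
    for( int y = 0; y < 16; y++ )
        for( int x = 0; x < 16; x++ )
        {
            int f = fenc[y*FENC_STRIDE + x];
            scores[0] += abs( f - pix0[y*i_stride + x] );
            scores[1] += abs( f - pix1[y*i_stride + x] );
            scores[2] += abs( f - pix2[y*i_stride + x] );
        }
}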
%define movdqu lddqu
SAD_X_SSE2 3, 16, 16, sse3
SAD_X_SSE2 3, 16, 8, sse3
; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
-; case it's really slow. The exact numbers may differ, but all Intel cpus
-; have a large penalty for cacheline splits.
+; case it's really slow. The exact numbers may differ, but all Intel cpus prior
+; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly half way between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
- b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : "",
+ b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
+ b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
}
+ if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
+ {
+ cpu1 &= ~X264_CPU_CACHELINE_64;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
+ cpu1 &= ~X264_CPU_SSE_MISALIGN;
+ }
if( x264_cpu_detect() & X264_CPU_SSE3 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
if( x264_cpu_detect() & X264_CPU_SSSE3 )
#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
+#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
/* Analyse flags
*/