Misaligned SSE functions are only used on AMD Phenom CPUs and the benefit is minuscule. They also require modifying the MXCSR control register, and by removing those functions we can get rid of that complexity altogether. This also prevents a crash if the misaligned exception mask bit is cleared for some reason.

VEX-encoded instructions also support unaligned memory operands. I tried adding AVX implementations of all the removed functions, but there were no performance improvements on Ivy Bridge. pixel_sad_x3 and pixel_sad_x4 did see significant code size reductions, though, so I kept those and added some minor cosmetic fixes and tweaks.
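For reference, the removed x264_cpu_mask_misalign_sse() helper (deleted from cpu-a.asm below) only set the misaligned exception mask bit (MM, bit 17) of MXCSR, which on CPUs reporting misaligned SSE support (the ecx&0x00000080 CPUID check removed from cpu.c) suppresses the fault normally raised by misaligned memory operands to legacy SSE instructions. A minimal sketch of that behaviour in C intrinsics, with an illustrative function name that is not part of x264:

    #include <xmmintrin.h>

    /* Illustrative sketch only: roughly what the removed helper did.
     * Sets the misaligned exception mask (MM, bit 17) of MXCSR so that
     * misaligned memory operands to regular (non-movu) SSE instructions
     * no longer fault on CPUs with misaligned SSE mode. MXCSR is
     * per-thread state, so this had to be called in every thread. */
    static void mask_misalign_sse( void )
    {
        _mm_setcsr( _mm_getcsr() | (1u << 17) );
    }

With VEX encoding none of this is needed: AVX instructions accept unaligned memory operands directly (aside from the explicitly aligned moves), which is what the new cpuflag(avx) branches in the SAD_X macros rely on when folding the unaligned loads into psadbw.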
#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
- {"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"BMI1", X264_CPU_BMI1},
{"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
}
}
- if( ecx&0x00000080 ) /* Misalign SSE */
- {
- cpu |= X264_CPU_SSE_MISALIGN;
- x264_cpu_mask_misalign_sse();
- }
-
if( cpu & X264_CPU_AVX )
{
if( ecx&0x00000800 ) /* XOP */
#define x264_emms()
#endif
#define x264_sfence x264_cpu_sfence
-void x264_cpu_mask_misalign_sse( void );
void x264_safe_intel_cpu_indicator_init( void );
/* kludge:
pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
}
}
-
- if( cpu&X264_CPU_SSE_MISALIGN )
- {
- INIT2( sad_x3, _sse2_misalign );
- INIT2( sad_x4, _sse2_misalign );
- }
}
if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
if( cpu&X264_CPU_AVX )
{
    INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs don't benefit from an aligned version */
+ INIT2( sad_x3, _avx );
+ INIT2( sad_x4, _avx );
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
sfence
ret
-;-----------------------------------------------------------------------------
-; void cpu_mask_misalign_sse( void )
-;-----------------------------------------------------------------------------
-cglobal cpu_mask_misalign_sse
- sub rsp, 4
- stmxcsr [rsp]
- or dword [rsp], 1<<17
- ldmxcsr [rsp]
- add rsp, 4
- ret
-
cextern intel_cpu_indicator_init
;-----------------------------------------------------------------------------
jg .height_loop
RET
+INIT_XMM
cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
- movdqu xmm0, [r2]
- movdqu xmm2, [r2+r3]
- movdqu xmm1, [r2+r4]
- movdqu xmm3, [r2+r6]
+ movu m0, [r2]
+ movu m2, [r2+r3]
+ movu m1, [r2+r4]
+ movu m3, [r2+r6]
lea r2, [r2+r3*2]
- pavgb xmm0, xmm1
- pavgb xmm2, xmm3
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm2
+ pavgb m0, m1
+ pavgb m2, m3
+ mova [r0], m0
+ mova [r0+r1], m2
lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
+ sub r5d, 2
+ jg .height_loop
RET
-%macro AVG2_W20 1
-cglobal pixel_avg2_w20_%1, 6,7
+cglobal pixel_avg2_w20_sse2, 6,7
sub r2, r4
lea r6, [r2+r3]
.height_loop:
- movdqu xmm0, [r4]
- movdqu xmm2, [r4+r3]
-%ifidn %1, sse2_misalign
- movd mm4, [r4+16]
- movd mm5, [r4+r3+16]
- pavgb xmm0, [r4+r2]
- pavgb xmm2, [r4+r6]
-%else
- movdqu xmm1, [r4+r2]
- movdqu xmm3, [r4+r6]
- movd mm4, [r4+16]
- movd mm5, [r4+r3+16]
- pavgb xmm0, xmm1
- pavgb xmm2, xmm3
-%endif
- pavgb mm4, [r4+r2+16]
- pavgb mm5, [r4+r6+16]
+ movu m0, [r4]
+ movu m2, [r4+r3]
+ movu m1, [r4+r2]
+ movu m3, [r4+r6]
+ movd mm4, [r4+16]
+ movd mm5, [r4+r3+16]
+ pavgb m0, m1
+ pavgb m2, m3
+ pavgb mm4, [r4+r2+16]
+ pavgb mm5, [r4+r6+16]
lea r4, [r4+r3*2]
- movdqa [r0], xmm0
- movd [r0+16], mm4
- movdqa [r0+r1], xmm2
- movd [r0+r1+16], mm5
+ mova [r0], m0
+ mova [r0+r1], m2
+ movd [r0+16], mm4
+ movd [r0+r1+16], mm5
lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
+ sub r5d, 2
+ jg .height_loop
RET
-%endmacro
-
-AVG2_W20 sse2
-AVG2_W20 sse2_misalign
INIT_YMM avx2
cglobal pixel_avg2_w20, 6,7
%endmacro
%else ; !HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 3
-%if mmsize == 8 || cpuflag(misalign)
+%if mmsize == 8
punpcklwd %1, %3
%else
movh %2, %3
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
MC_CHROMA
-INIT_XMM sse2, misalign
-MC_CHROMA
INIT_XMM sse2
MC_CHROMA
INIT_XMM ssse3
%define pw_rnd [pw_32]
%endif
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
-%if cpuflag(misalign) || mmsize==32
+%if mmsize==32
.loop:
movu m4, [src-4]
movu m5, [src-2]
HPEL_V 0
INIT_XMM sse2
HPEL_V 8
-INIT_XMM sse2, misalign
-HPEL_C
%if ARCH_X86_64 == 0
INIT_XMM sse2
HPEL_C
int dx, int dy, int i_width, int i_height );
MC_CHROMA(mmx2)
MC_CHROMA(sse2)
-MC_CHROMA(sse2_misalign)
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
MC_CHROMA(avx)
PIXEL_AVG_WALL(cache64_mmx2)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
-PIXEL_AVG_WALL(sse2_misalign)
PIXEL_AVG_WALL(cache64_ssse3)
PIXEL_AVG_WALL(avx2)
PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
#endif
PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
-PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
GET_REF(cache32_mmx2)
GET_REF(cache64_mmx2)
#endif
-GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
GET_REF(cache64_ssse3_atom)
HPEL(16, avx, avx, avx, avx)
HPEL(32, avx2, avx2, avx2, avx2)
#endif
-HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
- if( cpu&X264_CPU_SSE_MISALIGN )
- pf->hpel_filter = x264_hpel_filter_sse2_misalign;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_sse2;
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
}
- if( cpu&X264_CPU_SSE_MISALIGN )
- {
- pf->get_ref = get_ref_sse2_misalign;
- if( !(cpu&X264_CPU_STACK_MOD4) )
- pf->mc_chroma = x264_mc_chroma_sse2_misalign;
- }
}
}
DECL_X1( sad, mmx2 )
DECL_X1( sad, sse2 )
-DECL_X4( sad, sse2_misalign )
DECL_X1( sad, sse3 )
DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
DECL_X4( sad, ssse3 )
+DECL_X4( sad, avx )
DECL_X4( sad, avx2 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
;=============================================================================
%macro SAD_X3_START_1x16P_SSE2 0
-%if cpuflag(misalign)
- mova xmm2, [r0]
- movu xmm0, [r1]
- movu xmm1, [r2]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm2
- psadbw xmm2, [r3]
+ mova m2, [r0]
+%if cpuflag(avx)
+ psadbw m0, m2, [r1]
+ psadbw m1, m2, [r2]
+ psadbw m2, [r3]
%else
- mova xmm3, [r0]
- movu xmm0, [r1]
- movu xmm1, [r2]
- movu xmm2, [r3]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
+ movu m0, [r1]
+ movu m1, [r2]
+ movu m3, [r3]
+ psadbw m0, m2
+ psadbw m1, m2
+ psadbw m2, m3
%endif
%endmacro
%macro SAD_X3_1x16P_SSE2 2
-%if cpuflag(misalign)
- mova xmm3, [r0+%1]
- movu xmm4, [r1+%2]
- movu xmm5, [r2+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm3, [r3+%2]
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm3
+ mova m3, [r0+%1]
+%if cpuflag(avx)
+ psadbw m4, m3, [r1+%2]
+ psadbw m5, m3, [r2+%2]
+ psadbw m3, [r3+%2]
%else
- mova xmm3, [r0+%1]
- movu xmm4, [r1+%2]
- movu xmm5, [r2+%2]
- movu xmm6, [r3+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm6, xmm3
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
+ movu m4, [r1+%2]
+ movu m5, [r2+%2]
+ movu m6, [r3+%2]
+ psadbw m4, m3
+ psadbw m5, m3
+ psadbw m3, m6
%endif
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m3
%endmacro
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6
+%else
+ DECLARE_REG_TMP 5
+%endif
+
%macro SAD_X3_4x16P_SSE2 2
%if %1==0
-%if UNIX64
- mov r6, r5
-%endif
- lea r5, [r4*3]
+ lea t0, [r4*3]
SAD_X3_START_1x16P_SSE2
%else
SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
%endif
SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
- SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r5
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
%if %1 != %2-1
%if (%1&1) != 0
add r0, 8*FENC_STRIDE
%endmacro
%macro SAD_X3_START_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm0, [r1]
- movq xmm1, [r2]
- movq xmm2, [r3]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm0, [r1+r4]
- movhps xmm1, [r2+r4]
- movhps xmm2, [r3+r4]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
+ movq m3, [r0]
+ movq m0, [r1]
+ movq m1, [r2]
+ movq m2, [r3]
+ movhps m3, [r0+FENC_STRIDE]
+ movhps m0, [r1+r4]
+ movhps m1, [r2+r4]
+ movhps m2, [r3+r4]
+ psadbw m0, m3
+ psadbw m1, m3
+ psadbw m2, m3
%endmacro
%macro SAD_X3_2x8P_SSE2 4
- movq xmm7, [r0+%1]
- movq xmm3, [r1+%2]
- movq xmm4, [r2+%2]
- movq xmm5, [r3+%2]
- movhps xmm7, [r0+%3]
- movhps xmm3, [r1+%4]
- movhps xmm4, [r2+%4]
- movhps xmm5, [r3+%4]
- psadbw xmm3, xmm7
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm3
- paddw xmm1, xmm4
- paddw xmm2, xmm5
+ movq m6, [r0+%1]
+ movq m3, [r1+%2]
+ movq m4, [r2+%2]
+ movq m5, [r3+%2]
+ movhps m6, [r0+%3]
+ movhps m3, [r1+%4]
+ movhps m4, [r2+%4]
+ movhps m5, [r3+%4]
+ psadbw m3, m6
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
%endmacro
%macro SAD_X4_START_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm0, [r1]
- movq xmm1, [r2]
- movq xmm2, [r3]
- movq xmm3, [r4]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm0, [r1+r5]
- movhps xmm1, [r2+r5]
- movhps xmm2, [r3+r5]
- movhps xmm3, [r4+r5]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
- psadbw xmm3, xmm7
+ movq m4, [r0]
+ movq m0, [r1]
+ movq m1, [r2]
+ movq m2, [r3]
+ movq m3, [r4]
+ movhps m4, [r0+FENC_STRIDE]
+ movhps m0, [r1+r5]
+ movhps m1, [r2+r5]
+ movhps m2, [r3+r5]
+ movhps m3, [r4+r5]
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m4
+ psadbw m3, m4
%endmacro
%macro SAD_X4_2x8P_SSE2 4
- movq xmm7, [r0+%1]
- movq xmm4, [r1+%2]
- movq xmm5, [r2+%2]
-%if ARCH_X86_64
- movq xmm6, [r3+%2]
- movq xmm8, [r4+%2]
- movhps xmm7, [r0+%3]
- movhps xmm4, [r1+%4]
- movhps xmm5, [r2+%4]
- movhps xmm6, [r3+%4]
- movhps xmm8, [r4+%4]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm8, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm8
-%else
- movhps xmm7, [r0+%3]
- movhps xmm4, [r1+%4]
- movhps xmm5, [r2+%4]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- movq xmm6, [r3+%2]
- movq xmm4, [r4+%2]
- movhps xmm6, [r3+%4]
- movhps xmm4, [r4+%4]
- psadbw xmm6, xmm7
- psadbw xmm4, xmm7
- paddw xmm2, xmm6
- paddw xmm3, xmm4
-%endif
+ movq m6, [r0+%1]
+ movq m4, [r1+%2]
+ movq m5, [r2+%2]
+ movhps m6, [r0+%3]
+ movhps m4, [r1+%4]
+ movhps m5, [r2+%4]
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m0, m4
+ paddw m1, m5
+ movq m4, [r3+%2]
+ movq m5, [r4+%2]
+ movhps m4, [r3+%4]
+ movhps m5, [r4+%4]
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m2, m4
+ paddw m3, m5
%endmacro
%macro SAD_X4_START_1x16P_SSE2 0
-%if cpuflag(misalign)
- mova xmm3, [r0]
- movu xmm0, [r1]
- movu xmm1, [r2]
- movu xmm2, [r3]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
- psadbw xmm3, [r4]
+ mova m3, [r0]
+%if cpuflag(avx)
+ psadbw m0, m3, [r1]
+ psadbw m1, m3, [r2]
+ psadbw m2, m3, [r3]
+ psadbw m3, [r4]
%else
- mova xmm7, [r0]
- movu xmm0, [r1]
- movu xmm1, [r2]
- movu xmm2, [r3]
- movu xmm3, [r4]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
- psadbw xmm3, xmm7
+ movu m0, [r1]
+ movu m1, [r2]
+ movu m2, [r3]
+ movu m4, [r4]
+ psadbw m0, m3
+ psadbw m1, m3
+ psadbw m2, m3
+ psadbw m3, m4
%endif
%endmacro
%macro SAD_X4_1x16P_SSE2 2
-%if cpuflag(misalign)
- mova xmm7, [r0+%1]
- movu xmm4, [r1+%2]
- movu xmm5, [r2+%2]
- movu xmm6, [r3+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm7, [r4+%2]
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm7
+ mova m6, [r0+%1]
+%if cpuflag(avx)
+ psadbw m4, m6, [r1+%2]
+ psadbw m5, m6, [r2+%2]
%else
- mova xmm7, [r0+%1]
- movu xmm4, [r1+%2]
- movu xmm5, [r2+%2]
- movu xmm6, [r3+%2]
-%if ARCH_X86_64
- movu xmm8, [r4+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm8, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm8
-%else
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm4
- psadbw xmm6, xmm7
- movu xmm4, [r4+%2]
- paddw xmm1, xmm5
- psadbw xmm4, xmm7
- paddw xmm2, xmm6
- paddw xmm3, xmm4
+ movu m4, [r1+%2]
+ movu m5, [r2+%2]
+ psadbw m4, m6
+ psadbw m5, m6
%endif
+ paddw m0, m4
+ paddw m1, m5
+%if cpuflag(avx)
+ psadbw m4, m6, [r3+%2]
+ psadbw m5, m6, [r4+%2]
+%else
+ movu m4, [r3+%2]
+ movu m5, [r4+%2]
+ psadbw m4, m6
+ psadbw m5, m6
%endif
+ paddw m2, m4
+ paddw m3, m5
%endmacro
%macro SAD_X4_4x16P_SSE2 2
%macro SAD_X3_4x8P_SSE2 2
%if %1==0
-%if UNIX64
- mov r6, r5
-%endif
- lea r5, [r4*3]
+ lea t0, [r4*3]
SAD_X3_START_2x8P_SSE2
%else
SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
%endif
- SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), r5
+ SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
%if %1 != %2-1
%if (%1&1) != 0
add r0, 8*FENC_STRIDE
%endmacro
%macro SAD_X3_END_SSE2 0
- movhlps xmm4, xmm0
- movhlps xmm5, xmm1
- movhlps xmm6, xmm2
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
-%if UNIX64
- movd [r6+0], xmm0
- movd [r6+4], xmm1
- movd [r6+8], xmm2
-%else
- mov r0, r5mp
- movd [r0+0], xmm0
- movd [r0+4], xmm1
- movd [r0+8], xmm2
-%endif
+ movhlps m3, m0
+ movhlps m4, m1
+ movhlps m5, m2
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
+ movifnidn r5, r5mp
+ movd [r5+0], m0
+ movd [r5+4], m1
+ movd [r5+8], m2
RET
%endmacro
%macro SAD_X4_END_SSE2 0
- mov r0, r6mp
- psllq xmm1, 32
- psllq xmm3, 32
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movhlps xmm1, xmm0
- movhlps xmm3, xmm2
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movq [r0+0], xmm0
- movq [r0+8], xmm2
+ mov r0, r6mp
+ psllq m1, 32
+ psllq m3, 32
+ paddw m0, m1
+ paddw m2, m3
+ movhlps m1, m0
+ movhlps m3, m2
+ paddw m0, m1
+ paddw m2, m3
+ movq [r0+0], m0
+ movq [r0+8], m2
RET
%endmacro
%macro SAD_X4_START_2x8P_SSSE3 0
- movddup xmm4, [r0]
- movq xmm0, [r1]
- movq xmm1, [r3]
- movhps xmm0, [r2]
- movhps xmm1, [r4]
- movddup xmm5, [r0+FENC_STRIDE]
- movq xmm2, [r1+r5]
- movq xmm3, [r3+r5]
- movhps xmm2, [r2+r5]
- movhps xmm3, [r4+r5]
- psadbw xmm0, xmm4
- psadbw xmm1, xmm4
- psadbw xmm2, xmm5
- psadbw xmm3, xmm5
- paddw xmm0, xmm2
- paddw xmm1, xmm3
+ movddup m4, [r0]
+ movq m0, [r1]
+ movq m1, [r3]
+ movhps m0, [r2]
+ movhps m1, [r4]
+ movddup m5, [r0+FENC_STRIDE]
+ movq m2, [r1+r5]
+ movq m3, [r3+r5]
+ movhps m2, [r2+r5]
+ movhps m3, [r4+r5]
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m5
+ psadbw m3, m5
+ paddw m0, m2
+ paddw m1, m3
%endmacro
%macro SAD_X4_2x8P_SSSE3 4
- movddup xmm6, [r0+%1]
- movq xmm2, [r1+%2]
- movq xmm3, [r3+%2]
- movhps xmm2, [r2+%2]
- movhps xmm3, [r4+%2]
- movddup xmm7, [r0+%3]
- movq xmm4, [r1+%4]
- movq xmm5, [r3+%4]
- movhps xmm4, [r2+%4]
- movhps xmm5, [r4+%4]
- psadbw xmm2, xmm6
- psadbw xmm3, xmm6
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm2
- paddw xmm1, xmm3
- paddw xmm0, xmm4
- paddw xmm1, xmm5
+ movddup m6, [r0+%1]
+ movq m2, [r1+%2]
+ movq m3, [r3+%2]
+ movhps m2, [r2+%2]
+ movhps m3, [r4+%2]
+ movddup m7, [r0+%3]
+ movq m4, [r1+%4]
+ movq m5, [r3+%4]
+ movhps m4, [r2+%4]
+ movhps m5, [r4+%4]
+ psadbw m2, m6
+ psadbw m3, m6
+ psadbw m4, m7
+ psadbw m5, m7
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
%endmacro
%macro SAD_X4_4x8P_SSSE3 2
%endmacro
%macro SAD_X4_END_SSSE3 0
- mov r0, r6mp
- packssdw xmm0, xmm1
- movdqa [r0], xmm0
+ mov r0, r6mp
+ packssdw m0, m1
+ mova [r0], m0
RET
%endmacro
%macro SAD_X3_4x16P_AVX2 2
%if %1==0
-%if UNIX64
- mov r6, r5
-%endif
- lea r5, [r4*3]
+ lea t0, [r4*3]
SAD_X3_START_2x16P_AVX2
%else
SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
%endif
- SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, r5
+ SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
%if %1 != %2-1
%if (%1&1) != 0
add r0, 8*FENC_STRIDE
paddw xm0, xm4
paddw xm1, xm5
paddw xm2, xm6
-%if UNIX64
- movd [r6+0], xm0
- movd [r6+4], xm1
- movd [r6+8], xm2
-%else
- mov r0, r5mp
- movd [r0+0], xm0
- movd [r0+4], xm1
- movd [r0+8], xm2
-%endif
+ movifnidn r5, r5mp
+ movd [r5+0], xm0
+ movd [r5+4], xm1
+ movd [r5+8], xm2
RET
%endmacro
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
-%macro SAD_X_SSE2 3
-cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,9
+%macro SAD_X_SSE2 4
+cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
%assign x 0
%rep %3/4
SAD_X%1_4x%2P_SSE2 x, %3/4
%endmacro
INIT_XMM sse2
-SAD_X_SSE2 3, 16, 16
-SAD_X_SSE2 3, 16, 8
-SAD_X_SSE2 3, 8, 16
-SAD_X_SSE2 3, 8, 8
-SAD_X_SSE2 3, 8, 4
-SAD_X_SSE2 4, 16, 16
-SAD_X_SSE2 4, 16, 8
-SAD_X_SSE2 4, 8, 16
-SAD_X_SSE2 4, 8, 8
-SAD_X_SSE2 4, 8, 4
-
-INIT_XMM sse2, misalign
-SAD_X_SSE2 3, 16, 16
-SAD_X_SSE2 3, 16, 8
-SAD_X_SSE2 4, 16, 16
-SAD_X_SSE2 4, 16, 8
+SAD_X_SSE2 3, 16, 16, 7
+SAD_X_SSE2 3, 16, 8, 7
+SAD_X_SSE2 3, 8, 16, 7
+SAD_X_SSE2 3, 8, 8, 7
+SAD_X_SSE2 3, 8, 4, 7
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
+SAD_X_SSE2 4, 8, 16, 7
+SAD_X_SSE2 4, 8, 8, 7
+SAD_X_SSE2 4, 8, 4, 7
INIT_XMM sse3
-SAD_X_SSE2 3, 16, 16
-SAD_X_SSE2 3, 16, 8
-SAD_X_SSE2 4, 16, 16
-SAD_X_SSE2 4, 16, 8
+SAD_X_SSE2 3, 16, 16, 7
+SAD_X_SSE2 3, 16, 8, 7
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
%macro SAD_X_SSSE3 3
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
SAD_X_SSSE3 4, 8, 8
SAD_X_SSSE3 4, 8, 4
+INIT_XMM avx
+SAD_X_SSE2 3, 16, 16, 6
+SAD_X_SSE2 3, 16, 8, 6
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
+
%macro SAD_X_AVX2 4
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
%assign x 0
%assign cpuflags_cache64 (1<<17)
%assign cpuflags_slowctz (1<<18)
%assign cpuflags_lzcnt (1<<19)
-%assign cpuflags_misalign (1<<20)
-%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<22)
-%assign cpuflags_bmi1 (1<<23)|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
+%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<21)
+%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
{
if( h->param.i_sync_lookahead )
x264_lower_thread_priority( 10 );
-
-#if HAVE_MMX
- /* Misalign mask has to be set separately for each thread. */
- if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- x264_cpu_mask_misalign_sse();
-#endif
-}
-
-static void x264_lookahead_thread_init( x264_t *h )
-{
-#if HAVE_MMX
- /* Misalign mask has to be set separately for each thread. */
- if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- x264_cpu_mask_misalign_sse();
-#endif
}
#endif
x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
goto fail;
if( h->param.i_lookahead_threads > 1 &&
- x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, (void*)x264_lookahead_thread_init, h ) )
+ x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, NULL, NULL ) )
goto fail;
#if HAVE_OPENCL
thread_current =
thread_oldest = h;
}
-#if HAVE_MMX
- if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- x264_cpu_mask_misalign_sse();
-#endif
h->i_cpb_delay_pir_offset = h->i_cpb_delay_pir_offset_next;
/* no data out */
static void *x264_lookahead_thread( x264_t *h )
{
- int shift;
-#if HAVE_MMX
- if( h->param.cpu&X264_CPU_SSE_MISALIGN )
- x264_cpu_mask_misalign_sse();
-#endif
while( !h->lookahead->b_exit_thread )
{
x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
x264_pthread_mutex_lock( &h->lookahead->next.mutex );
- shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
+ int shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length + h->param.b_vfr_input )
b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" :
- b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_BMI2 ? "_bmi2" :
b->cpu&X264_CPU_BMI1 ? "_bmi1" :
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
}
- if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
- cpu1 &= ~X264_CPU_SSE_MISALIGN;
- }
if( x264_cpu_detect() & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
#include "x264_config.h"
-#define X264_BUILD 135
+#define X264_BUILD 136
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
#define X264_CPU_SSSE3 0x0000040
#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */
#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */
-#define X264_CPU_SSE_MISALIGN 0x0000200 /* Phenom support for misaligned SSE instruction arguments */
-#define X264_CPU_LZCNT 0x0000400 /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_AVX 0x0000800 /* AVX support: requires OS support even if YMM registers aren't used. */
-#define X264_CPU_XOP 0x0001000 /* AMD XOP */
-#define X264_CPU_FMA4 0x0002000 /* AMD FMA4 */
-#define X264_CPU_AVX2 0x0004000 /* AVX2 */
-#define X264_CPU_FMA3 0x0008000 /* Intel FMA3 */
-#define X264_CPU_BMI1 0x0010000 /* BMI1 */
-#define X264_CPU_BMI2 0x0020000 /* BMI2 */
+#define X264_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
+#define X264_CPU_XOP 0x0000800 /* AMD XOP */
+#define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */
+#define X264_CPU_AVX2 0x0002000 /* AVX2 */
+#define X264_CPU_FMA3 0x0004000 /* Intel FMA3 */
+#define X264_CPU_BMI1 0x0008000 /* BMI1 */
+#define X264_CPU_BMI2 0x0010000 /* BMI2 */
/* x86 modifiers */
-#define X264_CPU_CACHELINE_32 0x0040000 /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64 0x0080000 /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_SSE2_IS_SLOW 0x0100000 /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST 0x0200000 /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SLOW_SHUFFLE 0x0400000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
-#define X264_CPU_STACK_MOD4 0x0800000 /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SLOW_CTZ 0x1000000 /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM 0x2000000 /* The Atom is terrible: slow SSE unaligned loads, slow
+#define X264_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X264_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow
* SIMD multiplies, slow SIMD variable shifts, slow pshufb,
* cacheline split penalties -- gather everything here that
* isn't shared by other CPUs to avoid making half a dozen
* new SLOW flags. */
-#define X264_CPU_SLOW_PSHUFB 0x4000000 /* such as on the Intel Atom */
-#define X264_CPU_SLOW_PALIGNR 0x8000000 /* such as on the AMD Bobcat */
+#define X264_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */
+#define X264_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */
/* PowerPC */
#define X264_CPU_ALTIVEC 0x0000001