jl .loop
REP_RET
-%macro INTEGRAL_INIT 1
-;-----------------------------------------------------------------------------
-; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
-;-----------------------------------------------------------------------------
-cglobal x264_integral_init4v_%1, 3,5
- shl r2, 1
- add r0, r2
- add r1, r2
- lea r3, [r0+r2*4]
- lea r4, [r0+r2*8]
- neg r2
-.loop:
- movu m0, [r0+r2+8]
- mova m2, [r0+r2]
- movu m1, [r4+r2+8]
- paddw m0, m2
- paddw m1, [r4+r2]
- mova m3, [r3+r2]
- psubw m1, m0
- psubw m3, m2
- mova [r0+r2], m1
- mova [r1+r2], m3
- add r2, mmsize
- jl .loop
- REP_RET
-
+%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
%endmacro
INIT_MMX
-INTEGRAL_INIT mmx
+INTEGRAL_INIT_8V mmx
INIT_XMM
-INTEGRAL_INIT sse2
+INTEGRAL_INIT_8V sse2
+
+;-----------------------------------------------------------------------------
+; void x264_integral_init4v_*( uint16_t *sum8, uint16_t *sum4, int stride )
+;-----------------------------------------------------------------------------
+INIT_MMX                               ; mmsize=8: 4 uint16 lanes per register
+cglobal x264_integral_init4v_mmx, 3,5  ; r0=sum8 (in/out), r1=sum4 (out), r2=stride in uint16s -- presumably rows of an integral buffer; confirm vs C reference
+ shl r2, 1                             ; r2 = stride in bytes (= one row)
+ lea r3, [r0+r2*4]                     ; r3 = &sum8[4*stride]  (4 rows down)
+ lea r4, [r0+r2*8]                     ; r4 = &sum8[8*stride]  (8 rows down)
+ mova m0, [r0+r2]                      ; preload sum8[x+4..] for 1st iteration; one reg past row end (relies on padding -- TODO confirm)
+ mova m4, [r4+r2]                      ; preload sum8[8*stride+x+4..]
+.loop:                                 ; r2 walks backwards: 2*stride-8, ..., 8, 0 (byte offsets); x = r2/2
+ sub r2, 8                             ; step back 4 uint16s
+ mova m1, m4                           ; m1 = sum8[8*stride+x+4..]  (previous iteration's load)
+ psubw m1, m0                          ; m1 -= sum8[x+4..]
+ mova m4, [r4+r2]                      ; m4 = sum8[8*stride+x..]
+ mova m0, [r0+r2]                      ; m0 = sum8[x..]
+ paddw m1, m4                          ; m1 += sum8[8*stride+x..]
+ mova m3, [r3+r2]                      ; m3 = sum8[4*stride+x..]
+ psubw m1, m0                          ; m1 = sum8[8s+x]+sum8[8s+x+4]-sum8[x]-sum8[x+4]
+ psubw m3, m0                          ; m3 = sum8[4s+x]-sum8[x]
+ mova [r0+r2], m1                      ; sum8[x] overwritten in place with 8-row result
+ mova [r1+r2], m3                      ; sum4[x] = 4-row result
+ jge .loop                             ; continue while r2 >= 0
+ REP_RET
+INIT_XMM                               ; mmsize=16: 8 uint16 lanes per register
+cglobal x264_integral_init4v_sse2, 3,5 ; r0=sum8 (in/out), r1=sum4 (out), r2=stride in uint16s
+ shl r2, 1                             ; r2 = stride in bytes (= one row)
+ add r0, r2                            ; bias pointers one row forward so the
+ add r1, r2                            ;   loop offset can count up from -row to 0
+ lea r3, [r0+r2*4]                     ; with the negative offset: &sum8[4*stride]
+ lea r4, [r0+r2*8]                     ; with the negative offset: &sum8[8*stride]
+ neg r2                                ; r2 = -row bytes; [rN+r2] = row start
+.loop:                                 ; x = (row + r2)/2 in uint16 units
+ mova m0, [r0+r2]                      ; m0 = sum8[x..x+7]
+ mova m1, [r4+r2]                      ; m1 = sum8[8*stride+x..]
+ mova m2, m0                           ; keep unshifted copies for the subtracts
+ mova m4, m1
+ shufpd m0, [r0+r2+16], 1              ; m0 = sum8[x+4..x+11]; reads past row end (padding assumed -- TODO confirm)
+ shufpd m1, [r4+r2+16], 1              ; m1 = sum8[8*stride+x+4..]
+ paddw m0, m2                          ; m0 = sum8[x]+sum8[x+4]
+ paddw m1, m4                          ; m1 = sum8[8s+x]+sum8[8s+x+4]
+ mova m3, [r3+r2]                      ; m3 = sum8[4*stride+x..]
+ psubw m1, m0                          ; m1 = sum8[8s+x]+sum8[8s+x+4]-sum8[x]-sum8[x+4]
+ psubw m3, m2                          ; m3 = sum8[4s+x]-sum8[x]
+ mova [r0+r2], m1                      ; sum8[x] overwritten in place with 8-row result
+ mova [r1+r2], m3                      ; sum4[x] = 4-row result
+ add r2, 16                            ; next 8 uint16s
+ jl .loop                              ; until offset reaches 0
+ REP_RET
+cglobal x264_integral_init4v_ssse3, 3,5 ; same contract as the sse2 version; palignr replaces the shufpd shuffle
+ shl r2, 1                             ; r2 = stride in bytes (= one row)
+ add r0, r2                            ; bias pointers one row forward so the
+ add r1, r2                            ;   loop offset can count up from -row to 0
+ lea r3, [r0+r2*4]                     ; with the negative offset: &sum8[4*stride]
+ lea r4, [r0+r2*8]                     ; with the negative offset: &sum8[8*stride]
+ neg r2                                ; r2 = -row bytes; [rN+r2] = row start
+.loop:                                 ; x = (row + r2)/2 in uint16 units
+ mova m2, [r0+r2]                      ; m2 = sum8[x..x+7]
+ mova m0, [r0+r2+16]                   ; next block; reads past row end (padding assumed -- TODO confirm)
+ mova m4, [r4+r2]                      ; m4 = sum8[8*stride+x..]
+ mova m1, [r4+r2+16]
+ palignr m0, m2, 8                     ; m0 = sum8[x+4..x+11]  (concat m0:m2 >> 8 bytes)
+ palignr m1, m4, 8                     ; m1 = sum8[8*stride+x+4..]
+ paddw m0, m2                          ; m0 = sum8[x]+sum8[x+4]
+ paddw m1, m4                          ; m1 = sum8[8s+x]+sum8[8s+x+4]
+ mova m3, [r3+r2]                      ; m3 = sum8[4*stride+x..]
+ psubw m1, m0                          ; m1 = sum8[8s+x]+sum8[8s+x+4]-sum8[x]-sum8[x+4]
+ psubw m3, m2                          ; m3 = sum8[4s+x]-sum8[x]
+ mova [r0+r2], m1                      ; sum8[x] overwritten in place with 8-row result
+ mova [r1+r2], m3                      ; sum4[x] = 4-row result
+ add r2, 16                            ; next 8 uint16s
+ jl .loop                              ; until offset reaches 0
+ REP_RET
%macro FILT8x4 7
mova %3, [r0+%7]
extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
+extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pf->integral_init4v = x264_integral_init4v_ssse3;
+
if( !(cpu&X264_CPU_SSE4) )
return;