#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
- ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] );
+ ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
;-----------------------------------------------------------------------------
; NOTE(review): this macro contains unresolved diff/merge markers ('-' and '+'
; line prefixes). It will not assemble as-is: the patch must be applied first
; (keep '+' lines, drop '-' lines, strip the markers). Comments below annotate
; intent only; no instruction bytes were changed.
;
; predict_16x16_h: horizontal intra prediction for a 16x16 block.
; Each row of the block is filled with a copy of the pixel immediately to
; its left (column -1 of fdec). r0 = dst/src pointer into the fdec buffer.
%macro PREDICT_16x16_H 0
cglobal predict_16x16_h, 1,2
; --- old implementation (the '-' lines): per-bit-depth 4-row unrolled loop,
;     r1 walks row offsets from 12*FDEC_STRIDEB down to 0 in steps of 4 rows.
- mov r1, 12*FDEC_STRIDEB
-%if HIGH_BIT_DEPTH
-.vloop:
-%assign Y 0
-%rep 4
; load the left-neighbour pixel (16-bit), broadcast it across the register
- movd m0, [r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL]
- SPLATW m0, m0, 1
; store the broadcast value across the full 16-pixel (32-byte) row
- mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0
- mova [r0+r1+Y*FDEC_STRIDEB+16], m0
%if mmsize==8
; MMX registers are 8 bytes, so two extra stores are needed to cover the row
- mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0
- mova [r0+r1+Y*FDEC_STRIDEB+24], m0
-%endif
-%assign Y Y+1
-%endrep
-
-%else ; !HIGH_BIT_DEPTH
%if cpuflag(ssse3)
; pb_3 is the shuffle-control constant consumed by SPLATB_LOAD's pshufb path
- mova m1, [pb_3]
-%endif
-.vloop:
-%assign Y 0
-%rep 4
; broadcast the byte at column -1 of row Y across the whole register
- SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1
- mova [r0+r1+FDEC_STRIDE*Y], m0
%if mmsize==8
- mova [r0+r1+FDEC_STRIDE*Y+8], m0
; --- new implementation (the '+' lines): delegate the per-row work to the
;     shared PRED_H_4ROWS helper and run it 4 times (4 rows each = 16 rows).
;     presumably PRED_H_4ROWS expects the pb_3 constant in m2 on the ssse3
;     (non-avx2) path — TODO confirm against its definition.
+%if cpuflag(ssse3) && notcpuflag(avx2)
+ mova m2, [pb_3]
%endif
-%assign Y Y+1
-%endrep
-%endif ; HIGH_BIT_DEPTH
- sub r1, 4*FDEC_STRIDEB
- jge .vloop
; r1d counts the remaining 4-row groups in the new loop shape
+ mov r1d, 4
+.loop:
+ PRED_H_4ROWS 16, 1
+ dec r1d
+ jg .loop
RET
%endmacro
INIT_MMX mmx2
PREDICT_16x16_H
-INIT_XMM sse2
%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+PREDICT_16x16_H
+INIT_YMM avx2
PREDICT_16x16_H
%else
;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
#if HAVE_X86_INLINE_ASM
pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
#endif
+ if( !(cpu&X264_CPU_AVX2) )
+ return;
+ pf[I_PRED_16x16_H] = x264_predict_16x16_h_avx2;
#else
#if !ARCH_X86_64
pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmx2;
void x264_predict_16x16_h_mmx2( pixel *src );
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
+void x264_predict_16x16_h_avx2( uint16_t *src );
void x264_predict_16x16_dc_mmx2( pixel *src );
void x264_predict_16x16_dc_sse2( pixel *src );
void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
int ret = 0, ok = 1, used_asm = 0;
ALIGNED_ARRAY_32( pixel, edge,[36] );
ALIGNED_ARRAY_32( pixel, edge2,[36] );
- ALIGNED_16( pixel fdec[FDEC_STRIDE*20] );
+ ALIGNED_ARRAY_32( pixel, fdec,[FDEC_STRIDE*20] );
struct
{
x264_predict_t predict_16x16[4+3];