    int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
    int i_mode = x264_size2pixel[height][width];
    intptr_t i_stride0 = 16, i_stride1 = 16;
-    ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
-    ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
+    ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
+    ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
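    /* ALIGNED_ARRAY_N aligns to the native SIMD alignment of the build
     * (32 bytes once AVX2 is enabled), so these temporaries are safe
     * targets for 256-bit aligned stores. */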
    pixel *src0, *src1;
    MC_LUMA_BI( 0 );
.height_loop:
    movu    m0, [r2]
    movu    m1, [r2+r3*2]
-%if mmsize == 8
+%if cpuflag(avx) || mmsize == 8
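; MMX has no alignment requirement on memory operands, and VEX-encoded (AVX)
; instructions lift the SSE2 one, so both of these paths can feed pavgw
; straight from memory; plain SSE2 must load the second row into registers.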
    pavgw   m0, [r2+r4]
    pavgw   m1, [r2+r6]
%else
AVG2_W_ONE 8
AVG2_W_TWO 10, movd, movd
AVG2_W_TWO 16, movu, mova
+INIT_YMM avx2
+AVG2_W_ONE 16
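; a 16-pixel high-bit-depth row is exactly 32 bytes, i.e. one ymm register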
INIT_MMX
cglobal pixel_avg2_w10_mmx2, 6,7
    jg .height_loop
    RET
-INIT_XMM
-cglobal pixel_avg2_w18_sse2, 6,7,6
+%macro PIXEL_AVG_W18 0
+cglobal pixel_avg2_w18, 6,7
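; an 18-pixel high-bit-depth row is 36 bytes: 32+4 with ymm, 16+16+4 with xmm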
    sub     r4, r2
.height_loop:
    movu    m0, [r2+ 0]
+    movd   xm2, [r2+32]
+%if mmsize == 32
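; AVX2: average the full 32-byte chunk straight from memory and handle the
; 4-byte tail in xmm registers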
+    pavgw   m0, [r2+r4+ 0]
+    movd   xm1, [r2+r4+32]
+    pavgw  xm2, xm1
+%else
    movu    m1, [r2+16]
-    movh    m2, [r2+32]
    movu    m3, [r2+r4+ 0]
    movu    m4, [r2+r4+16]
-    movh    m5, [r2+r4+32]
+    movd    m5, [r2+r4+32]
    pavgw   m0, m3
    pavgw   m1, m4
    pavgw   m2, m5
-    mova   [r0+ 0], m0
    mova   [r0+16], m1
-    movh   [r0+32], m2
+%endif
+    mova   [r0+ 0], m0
+    movd   [r0+32], xm2
    lea     r2, [r2+r3*2]
    lea     r0, [r0+r1*2]
    dec     r5d
    jg .height_loop
    RET
+%endmacro
+
+INIT_XMM sse2
+PIXEL_AVG_W18
+INIT_YMM avx2
+PIXEL_AVG_W18
+
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
    movu  m1, [r2+%4*mmsize]
    movu  m2, [r2+r3+%3*mmsize]
    movu  m3, [r2+r3+%4*mmsize]
-    movu  m4, [r2+r3*2+%3*mmsize]
-    movu  m5, [r2+r3*2+%4*mmsize]
-    movu  m6, [r2+%2+%3*mmsize]
-    movu  m7, [r2+%2+%4*mmsize]
    mova  [r0+%3*mmsize], m0
    mova  [r0+%4*mmsize], m1
    mova  [r0+r1+%3*mmsize], m2
    mova  [r0+r1+%4*mmsize], m3
-    mova  [r0+r1*2+%3*mmsize], m4
-    mova  [r0+r1*2+%4*mmsize], m5
-    mova  [r0+%1+%3*mmsize], m6
-    mova  [r0+%1+%4*mmsize], m7
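; reload the second pair of rows into m0-m3: four vector registers now
; suffice where the old code needed eight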
+    movu  m0, [r2+r3*2+%3*mmsize]
+    movu  m1, [r2+r3*2+%4*mmsize]
+    movu  m2, [r2+%2+%3*mmsize]
+    movu  m3, [r2+%2+%4*mmsize]
+    mova  [r0+r1*2+%3*mmsize], m0
+    mova  [r0+r1*2+%4*mmsize], m1
+    mova  [r0+%1+%3*mmsize], m2
+    mova  [r0+%1+%4*mmsize], m3
%endmacro
%macro COPY4 2
%macro MC_COPY 1
%assign %%w %1*SIZEOF_PIXEL/mmsize
%if %%w > 0
-cglobal mc_copy_w%1, 5,7,8*(%%w/2)
+cglobal mc_copy_w%1, 5,7
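; with only m0-m3 in use, the xmm register count previously passed to
; cglobal (which drives Win64 callee-save handling of xmm6 and up) is no
; longer needed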
    FIX_STRIDES r1, r3
    lea     r6, [r3*3]
    lea     r5, [r1*3]
MC_COPY 16
INIT_XMM aligned, sse
MC_COPY 16
-
-
+%if HIGH_BIT_DEPTH
+INIT_YMM avx
+MC_COPY 16
+INIT_YMM aligned, avx
+MC_COPY 16
+%endif
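; 256-bit copies only pay off at high bit depth, where a 16-pixel row is
; 32 bytes; an 8-bit row already fits in a single xmm register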
;=============================================================================
; prefetch
void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
+void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
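/* The AVX copies are built only for high bit depth, hence the explicit
 * uint16_t prototypes. */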
void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
#define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2
#define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2
+#define x264_pixel_avg2_w12_avx2 x264_pixel_avg2_w16_avx2
+#define x264_pixel_avg2_w20_avx2 x264_pixel_avg2_w18_avx2
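/* As with the existing mmx2/sse2 aliases, the w12 and w20 table slots map
 * onto the nearest AVX2 kernels. */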
#else
/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3
PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2)
+PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2)
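/* AVX2 table: the narrow widths stay on mmx2/sse2, where 256-bit vectors
 * cannot help. */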
#else // !HIGH_BIT_DEPTH
#if ARCH_X86
PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2)
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
#if HIGH_BIT_DEPTH
MC_COPY_WTAB(sse,mmx,sse,sse)
+MC_COPY_WTAB(avx,mmx,sse,avx)
#else
MC_COPY_WTAB(sse,mmx,mmx,sse)
#endif
MC_LUMA(mmx2,mmx2,mmx)
MC_LUMA(sse2,sse2,sse)
-#if !HIGH_BIT_DEPTH
+#if HIGH_BIT_DEPTH
+MC_LUMA(avx2,avx2,avx)
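/* High-bit-depth mc_luma now gets an AVX2 variant built on the new avg2
 * and copy kernels. */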
+#else
#if ARCH_X86
MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
GET_REF(mmx2)
GET_REF(sse2)
+GET_REF(avx2)
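/* get_ref_avx2 is now instantiated for both bit depths, replacing the
 * 8-bit-only instantiation removed below. */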
#if !HIGH_BIT_DEPTH
#if ARCH_X86
GET_REF(cache32_mmx2)
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
GET_REF(cache64_ssse3_atom)
-GET_REF(avx2)
#endif // !HIGH_BIT_DEPTH
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
    pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
    pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
+    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;
    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_avx;
    if( cpu&X264_CPU_XOP )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
+
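    /* high bit depth gains an AVX2 luma MC path */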
+    if( cpu&X264_CPU_AVX2 )
+        pf->mc_luma = mc_luma_avx2;
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
    pf->weight = x264_mc_weight_wtab_avx2;
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_avx2;
-    pf->get_ref = get_ref_avx2;
    pf->integral_init8v = x264_integral_init8v_avx2;
    pf->integral_init4v = x264_integral_init4v_avx2;
    pf->integral_init8h = x264_integral_init8h_avx2;
    if( !(cpu&X264_CPU_AVX2) )
        return;
+    pf->get_ref = get_ref_avx2;
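    /* get_ref_avx2 is now assigned only after the AVX2 capability check */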
    if( cpu&X264_CPU_FMA3 )
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3;
static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
                                                                     pixel **p_fref, int i8x8, int size, int chroma )
{
-    ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
+    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
    pixel *pix2 = pix1+8;
    int i_stride = h->mb.pic.i_stride[1];
    int chroma_h_shift = chroma <= CHROMA_422;
static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
{
-    ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] );
-    ALIGNED_ARRAY_16( pixel, bi, [2],[16*16] );
+    ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
+    ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
    int i_chroma_cost = 0;
    int chromapix = h->luma2chroma_pixel[i_pixel];
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
-    ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
-    ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
+    ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
+    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
    pixel *src0, *src1;
    intptr_t stride0 = 16, stride1 = 16;
    int i_ref, i_mvc;
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
-    ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] );
+    ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[3][2] );
    h->mb.i_partition = D_16x8;
    int omx, omy, pmx, pmy;
    pixel *p_fenc = m->p_fenc[0];
    pixel *p_fref_w = m->p_fref_w;
-    ALIGNED_ARRAY_16( pixel, pix,[16*16] );
+    ALIGNED_ARRAY_N( pixel, pix,[16*16] );
    ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
    ALIGNED_ARRAY_16( int, costs,[16] );
    const int i_pixel = m0->i_pixel;
    const int bw = x264_pixel_size[i_pixel].w;
    const int bh = x264_pixel_size[i_pixel].h;
-    ALIGNED_ARRAY_16( pixel, pixy_buf,[2],[9][16*16] );
-    ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] );
-    ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] );
    pixel *src[3][2][9];
    int chromapix = h->luma2chroma_pixel[i_pixel];
    int chroma_v_shift = CHROMA_V_SHIFT;