From: Henrik Gramner Date: Mon, 1 Dec 2014 21:05:42 +0000 (+0100) Subject: x86: SSSE3 and AVX2 implementations of plane_copy_swap X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=a83edfa053f60ad0c8a164f31e7492a680eef361;p=x264 x86: SSSE3 and AVX2 implementations of plane_copy_swap For NV21 input. --- diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index acafb2f4..7fa72fcf 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -40,6 +40,7 @@ hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH +copy_swap_shuf: times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 v210_mask: times 4 dq 0xc00ffc003ff003ff v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 @@ -50,6 +51,7 @@ v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800 deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 %else +copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14 deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1 db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1 @@ -922,14 +924,23 @@ HPEL %endmacro ;----------------------------------------------------------------------------- -; void plane_copy_core( pixel *dst, intptr_t i_dst, -; pixel *src, intptr_t i_src, int w, int h ) +; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of mmsize, and i_dst>w -%macro PLANE_COPY_CORE 0 +%macro PLANE_COPY_CORE 1 ; swap +%if %1 +cglobal plane_copy_swap_core, 6,7 + mova m4, [copy_swap_shuf] +%else cglobal plane_copy_core, 6,7 - FIX_STRIDES r1, r3, r4d -%if HIGH_BIT_DEPTH == 0 +%endif + FIX_STRIDES r1, r3 +%if %1 && HIGH_BIT_DEPTH + shl r4d, 2 +%elif %1 || HIGH_BIT_DEPTH + add r4d, r4d +%else movsxdifnidn r4, r4d %endif add r0, r4 @@ -937,23 +948,37 @@ cglobal plane_copy_core, 6,7 neg r4 .loopy: lea r6, [r4+4*mmsize] +%if %1 + test r6d, r6d + jg .skip +%endif .loopx: PREFETCHNT_ITER r2+r6, 4*mmsize movu m0, [r2+r6-4*mmsize] movu m1, [r2+r6-3*mmsize] movu m2, [r2+r6-2*mmsize] movu m3, [r2+r6-1*mmsize] +%if %1 + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 +%endif movnta [r0+r6-4*mmsize], m0 movnta [r0+r6-3*mmsize], m1 movnta [r0+r6-2*mmsize], m2 movnta [r0+r6-1*mmsize], m3 add r6, 4*mmsize jle .loopx +.skip: PREFETCHNT_ITER r2+r6, 4*mmsize sub r6, 4*mmsize jz .end .loop_end: movu m0, [r2+r6] +%if %1 + pshufb m0, m4 +%endif movnta [r0+r6], m0 add r6, mmsize jl .loop_end @@ -967,9 +992,13 @@ cglobal plane_copy_core, 6,7 %endmacro INIT_XMM sse -PLANE_COPY_CORE +PLANE_COPY_CORE 0 +INIT_XMM ssse3 +PLANE_COPY_CORE 1 INIT_YMM avx -PLANE_COPY_CORE +PLANE_COPY_CORE 0 +INIT_YMM avx2 +PLANE_COPY_CORE 1 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint %if HIGH_BIT_DEPTH diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index d893e063..d868706c 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -93,6 +93,9 @@ void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); @@ -519,6 +522,39 @@ static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intpt PLANE_COPY(16, sse) PLANE_COPY(32, avx) +#define PLANE_COPY_SWAP(align, cpu)\ +static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align>>1) / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else if( w > c_w )\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\ + for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\ + {\ + dst[x] = src[x+1];\ + dst[x+1] = src[x];\ + }\ + }\ + else\ + x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ +} + +PLANE_COPY_SWAP(16, ssse3) +PLANE_COPY_SWAP(32, avx2) + #define PLANE_INTERLEAVE(cpu) \ static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ pixel *srcu, intptr_t i_srcu,\ @@ -771,6 +807,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) return; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; + pf->plane_copy_swap = x264_plane_copy_swap_ssse3; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3; pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3; @@ -875,6 +912,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; + pf->plane_copy_swap = x264_plane_copy_swap_ssse3; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3; pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3; @@ -961,6 +999,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( !(cpu&X264_CPU_AVX2) ) return; + pf->plane_copy_swap = x264_plane_copy_swap_avx2; pf->get_ref = get_ref_avx2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2; }