X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fmc-a.asm;h=cd69c824a2e1ff165d4e100f18bc3c4e161436dd;hb=5265b927b0f2e043dd39cbbbf3909da0862d60e6;hp=0c610b5f69e99818e0ab2f74ca14fc4b862e4908;hpb=15595e6d94940064046c61e64ef9cea993f3e05c;p=x264 diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 0c610b5f..cd69c824 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2003-2010 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser @@ -34,7 +34,7 @@ SECTION_RODATA 32 -ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 ch_shuf_adj: times 8 db 0 times 8 db 2 times 8 db 4 @@ -49,22 +49,26 @@ cextern pw_4 cextern pw_8 cextern pw_32 cextern pw_64 +cextern pw_512 cextern pw_00ff cextern pw_pixel_max cextern sw_64 cextern pd_32 +cextern deinterleave_shufd ;============================================================================= ; implicit weighted biprediction ;============================================================================= ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 -%ifdef ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,10,11 +%if WIN64 + DECLARE_REG_TMP 0,1,2,3,4,5,4,5 %macro AVG_START 0-1 0 PROLOGUE 6,7,%1 -%ifdef WIN64 - movsxd r5, r5d -%endif + %endmacro +%elif UNIX64 + DECLARE_REG_TMP 0,1,2,3,4,5,7,8 + %macro AVG_START 0-1 0 + PROLOGUE 6,9,%1 %endmacro %else DECLARE_REG_TMP 1,2,3,4,5,6,1,2 @@ -80,15 +84,15 @@ cextern pd_32 %endif %macro AVG_END 0 - sub eax, 2 lea t4, [t4+t5*2*SIZEOF_PIXEL] lea t2, [t2+t3*2*SIZEOF_PIXEL] lea t0, [t0+t1*2*SIZEOF_PIXEL] + sub eax, 2 jg .height_loop - REP_RET + RET %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro BIWEIGHT_MMX 2 movh m0, %1 @@ -139,8 +143,7 @@ cextern pd_32 movh m1, %2 punpcklbw m0, m1 pmaddubsw m0, m3 - paddw m0, m4 - psraw m0, 6 + pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 @@ -149,13 +152,16 @@ cextern pd_32 sub t7d, t6d shl t7d, 8 add t6d, t7d - movd m3, t6d - mova m4, [pw_32] + mova m4, [pw_512] + movd xm3, t6d +%if cpuflag(avx2) + vpbroadcastw m3, xm3 +%else SPLATW m3, m3 ; weight_dst,src +%endif %endmacro -%ifdef HIGH_BIT_DEPTH - +%if HIGH_BIT_DEPTH %macro BIWEIGHT_ROW 4 BIWEIGHT [%2], [%3] %if %4==mmsize/4 @@ -188,21 +194,21 @@ cextern pd_32 %endif ;HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight ) +; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight ) ;----------------------------------------------------------------------------- -%macro AVG_WEIGHT 2-3 0 -cglobal pixel_avg_weight_w%2_%1 +%macro AVG_WEIGHT 1-2 0 +cglobal pixel_avg_weight_w%1 BIWEIGHT_START - AVG_START %3 -%ifdef HIGH_BIT_DEPTH + AVG_START %2 +%if HIGH_BIT_DEPTH mova m7, [pw_pixel_max] %endif .height_loop: -%if mmsize==16 && %2==mmsize/(2*SIZEOF_PIXEL) +%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL) BIWEIGHT [t2], [t4] SWAP 0, 6 BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH packssdw m6, m0 CLIPW m6, m5, m7 %else ;!HIGH_BIT_DEPTH @@ -212,9 +218,9 @@ cglobal pixel_avg_weight_w%2_%1 movhps [t0+SIZEOF_PIXEL*t1], m6 %else %assign x 0 -%rep 
(%2*SIZEOF_PIXEL+mmsize-1)/mmsize - BIWEIGHT_ROW t0+x, t2+x, t4+x, %2 - BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %2 +%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize + BIWEIGHT_ROW t0+x, t2+x, t4+x, %1 + BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1 %assign x x+mmsize %endrep %endif @@ -223,34 +229,54 @@ cglobal pixel_avg_weight_w%2_%1 %define BIWEIGHT BIWEIGHT_MMX %define BIWEIGHT_START BIWEIGHT_START_MMX -INIT_MMX -AVG_WEIGHT mmxext, 4 -AVG_WEIGHT mmxext, 8 -AVG_WEIGHT mmxext, 16 -%ifdef HIGH_BIT_DEPTH -INIT_XMM -AVG_WEIGHT sse2, 4, 8 -AVG_WEIGHT sse2, 8, 8 -AVG_WEIGHT sse2, 16, 8 +INIT_MMX mmx2 +AVG_WEIGHT 4 +AVG_WEIGHT 8 +AVG_WEIGHT 16 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +AVG_WEIGHT 4, 8 +AVG_WEIGHT 8, 8 +AVG_WEIGHT 16, 8 %else ;!HIGH_BIT_DEPTH -INIT_XMM -AVG_WEIGHT sse2, 8, 7 -AVG_WEIGHT sse2, 16, 7 +INIT_XMM sse2 +AVG_WEIGHT 8, 7 +AVG_WEIGHT 16, 7 %define BIWEIGHT BIWEIGHT_SSSE3 %define BIWEIGHT_START BIWEIGHT_START_SSSE3 -INIT_MMX -AVG_WEIGHT ssse3, 4 -INIT_XMM -AVG_WEIGHT ssse3, 8, 7 -AVG_WEIGHT ssse3, 16, 7 +INIT_MMX ssse3 +AVG_WEIGHT 4 +INIT_XMM ssse3 +AVG_WEIGHT 8, 7 +AVG_WEIGHT 16, 7 + +INIT_YMM avx2 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 m0, m0, [t2+t3], 1 + vinserti128 m1, m1, [t4+t5], 1 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], m0, 1 + AVG_END %endif ;HIGH_BIT_DEPTH ;============================================================================= ; P frame explicit weighted prediction ;============================================================================= -%ifdef HIGH_BIT_DEPTH -%macro WEIGHT_START 1 ; (width) +%if HIGH_BIT_DEPTH +; width +%macro WEIGHT_START 1 mova m0, [r4+ 0] ; 1<= mmsize - WEIGHT_ROW (%1+x), (%2+x), mmsize ; weight 1 mmsize - WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight 1 mmsize + WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 %assign x (x+mmsize) %else - WEIGHT_COL (%1+x),(%2+x),(%3-x) - %exitrep + %assign w %3-x +%if w == 20 + %assign w 16 +%endif + WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4 + %assign x (x+w) %endif %if x >= %3 %exitrep @@ -409,68 +476,68 @@ AVG_WEIGHT ssse3, 16, 7 %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h ) +;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h ) ;----------------------------------------------------------------------------- -%ifdef ARCH_X86_64 -%define NUMREGS 6 -%define LOAD_HEIGHT -%define HEIGHT_REG r5d -%define TMP_REG r6d -%else -%define NUMREGS 5 -%define TMP_REG r5d -%define LOAD_HEIGHT mov r4d, r5m -%define HEIGHT_REG r4d -%endif - -%assign XMMREGS 7 -%ifdef HIGH_BIT_DEPTH -%assign NUMREGS NUMREGS+1 -%assign XMMREGS 8 -%endif - -%macro WEIGHTER 2 - cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, XMMREGS*(mmsize/16) +%macro WEIGHTER 1 +cglobal mc_weight_w%1, 6,6,8 FIX_STRIDES r1, r3 WEIGHT_START %1 - LOAD_HEIGHT +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 + ; we can merge the shift step into the scale factor + ; if (m3<<7) doesn't overflow an int16_t + cmp byte [r4+1], 0 + jz .fast +%endif .loop: - WEIGHT_TWO_ROW r2, r0, %1 + WEIGHT_TWO_ROW r2, r0, %1, 0 lea r0, [r0+r1*2] lea r2, [r2+r3*2] - sub HEIGHT_REG, 2 + sub r5d, 2 jg 
.loop - REP_RET + RET +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 +.fast: + psllw m3, 7 +.fastloop: + WEIGHT_TWO_ROW r2, r0, %1, 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .fastloop + RET +%endif %endmacro -INIT_MMX -WEIGHTER 4, mmxext -WEIGHTER 8, mmxext -WEIGHTER 12, mmxext -WEIGHTER 16, mmxext -WEIGHTER 20, mmxext -INIT_XMM -WEIGHTER 8, sse2 -WEIGHTER 16, sse2 -WEIGHTER 20, sse2 -%ifdef HIGH_BIT_DEPTH -WEIGHTER 12, sse2 +INIT_MMX mmx2 +WEIGHTER 4 +WEIGHTER 8 +WEIGHTER 12 +WEIGHTER 16 +WEIGHTER 20 +INIT_XMM sse2 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +%if HIGH_BIT_DEPTH +WEIGHTER 12 %else -%define WEIGHT WEIGHT_SSSE3 -%define WEIGHT_START WEIGHT_START_SSSE3 -INIT_MMX -WEIGHTER 4, ssse3 -INIT_XMM -WEIGHTER 8, ssse3 -WEIGHTER 16, ssse3 -WEIGHTER 20, ssse3 +INIT_MMX ssse3 +WEIGHTER 4 +INIT_XMM ssse3 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +INIT_YMM avx2 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 %endif %macro OFFSET_OP 7 mov%6 m0, [%1] mov%6 m1, [%2] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH p%5usw m0, m2 p%5usw m1, m2 %ifidn %5,add @@ -492,7 +559,7 @@ WEIGHTER 20, ssse3 OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a %assign x (x+mmsize) %else -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h %else OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d @@ -506,48 +573,44 @@ WEIGHTER 20, ssse3 %endmacro ;----------------------------------------------------------------------------- -;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h ) +;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h ) ;----------------------------------------------------------------------------- -%macro OFFSET 3 - cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS +%macro OFFSET 2 +cglobal mc_offset%2_w%1, 6,6 FIX_STRIDES r1, r3 mova m2, [r4] -%ifdef HIGH_BIT_DEPTH -%ifidn %3,add +%if HIGH_BIT_DEPTH +%ifidn %2,add mova m3, [pw_pixel_max] %endif %endif - LOAD_HEIGHT .loop: - OFFSET_TWO_ROW r2, r0, %1, %3 + OFFSET_TWO_ROW r2, r0, %1, %2 lea r0, [r0+r1*2] lea r2, [r2+r3*2] - sub HEIGHT_REG, 2 + sub r5d, 2 jg .loop - REP_RET + RET %endmacro -%macro OFFSETPN 2 - OFFSET %1, %2, add - OFFSET %1, %2, sub +%macro OFFSETPN 1 + OFFSET %1, add + OFFSET %1, sub %endmacro -INIT_MMX -OFFSETPN 4, mmxext -OFFSETPN 8, mmxext -OFFSETPN 12, mmxext -OFFSETPN 16, mmxext -OFFSETPN 20, mmxext -INIT_XMM -OFFSETPN 12, sse2 -OFFSETPN 16, sse2 -OFFSETPN 20, sse2 -%ifdef HIGH_BIT_DEPTH -OFFSETPN 8, sse2 +INIT_MMX mmx2 +OFFSETPN 4 +OFFSETPN 8 +OFFSETPN 12 +OFFSETPN 16 +OFFSETPN 20 +INIT_XMM sse2 +OFFSETPN 12 +OFFSETPN 16 +OFFSETPN 20 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +OFFSETPN 8 %endif -%undef LOAD_HEIGHT -%undef HEIGHT_REG -%undef NUMREGS - ;============================================================================= @@ -555,36 +618,39 @@ OFFSETPN 8, sse2 ;============================================================================= ;----------------------------------------------------------------------------- -; void pixel_avg_4x4( pixel *dst, int dst_stride, -; pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight ); +; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int weight ); ;----------------------------------------------------------------------------- -%macro AVGH 3 -cglobal pixel_avg_%1x%2_%3 +%macro AVGH 2 +cglobal pixel_avg_%1x%2 mov eax, %2 cmp dword r6m, 32 - jne 
pixel_avg_weight_w%1_%3 + jne pixel_avg_weight_w%1 %+ SUFFIX +%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads + jmp pixel_avg_w%1_avx2 +%else %if mmsize == 16 && %1 == 16 test dword r4m, 15 jz pixel_avg_w%1_sse2 %endif - jmp pixel_avg_w%1_mmxext + jmp pixel_avg_w%1_mmx2 +%endif %endmacro ;----------------------------------------------------------------------------- -; void pixel_avg_w4( pixel *dst, int dst_stride, -; pixel *src1, int src1_stride, pixel *src2, int src2_stride, -; int height, int weight ); +; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int height, int weight ); ;----------------------------------------------------------------------------- -%macro AVG_FUNC 4 -cglobal pixel_avg_w%1_%4 +%macro AVG_FUNC 3 +cglobal pixel_avg_w%1 AVG_START .height_loop: %assign x 0 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize %2 m0, [t2+x] %2 m1, [t2+x+SIZEOF_PIXEL*t3] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pavgw m0, [t4+x] pavgw m1, [t4+x+SIZEOF_PIXEL*t5] %else ;!HIGH_BIT_DEPTH @@ -598,94 +664,103 @@ cglobal pixel_avg_w%1_%4 AVG_END %endmacro -%ifdef HIGH_BIT_DEPTH - -INIT_MMX -AVG_FUNC 4, movq, movq, mmxext -AVGH 4, 8, mmxext -AVGH 4, 4, mmxext -AVGH 4, 2, mmxext +%if HIGH_BIT_DEPTH -AVG_FUNC 8, movq, movq, mmxext -AVGH 8, 16, mmxext -AVGH 8, 8, mmxext -AVGH 8, 4, mmxext +INIT_MMX mmx2 +AVG_FUNC 4, movq, movq +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 -AVG_FUNC 16, movq, movq, mmxext -AVGH 16, 16, mmxext -AVGH 16, 8, mmxext +AVG_FUNC 8, movq, movq +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 -INIT_XMM +AVG_FUNC 16, movq, movq +AVGH 16, 16 +AVGH 16, 8 -AVG_FUNC 4, movq, movq, sse2 -AVGH 4, 8, sse2 -AVGH 4, 4, sse2 -AVGH 4, 2, sse2 +INIT_XMM sse2 +AVG_FUNC 4, movq, movq +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 -AVG_FUNC 8, movdqu, movdqa, sse2 -AVGH 8, 16, sse2 -AVGH 8, 8, sse2 -AVGH 8, 4, sse2 +AVG_FUNC 8, movdqu, movdqa +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 -AVG_FUNC 16, movdqu, movdqa, sse2 -AVGH 16, 16, sse2 -AVGH 16, 8, sse2 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 %else ;!HIGH_BIT_DEPTH -INIT_MMX -AVG_FUNC 4, movd, movd, mmxext -AVGH 4, 8, mmxext -AVGH 4, 4, mmxext -AVGH 4, 2, mmxext - -AVG_FUNC 8, movq, movq, mmxext -AVGH 8, 16, mmxext -AVGH 8, 8, mmxext -AVGH 8, 4, mmxext - -AVG_FUNC 16, movq, movq, mmxext -AVGH 16, 16, mmxext -AVGH 16, 8, mmxext - -INIT_XMM -AVG_FUNC 16, movdqu, movdqa, sse2 -AVGH 16, 16, sse2 -AVGH 16, 8, sse2 -AVGH 8, 16, sse2 -AVGH 8, 8, sse2 -AVGH 8, 4, sse2 -AVGH 16, 16, ssse3 -AVGH 16, 8, ssse3 -AVGH 8, 16, ssse3 -AVGH 8, 8, ssse3 -AVGH 8, 4, ssse3 -INIT_MMX -AVGH 4, 8, ssse3 -AVGH 4, 4, ssse3 -AVGH 4, 2, ssse3 +INIT_MMX mmx2 +AVG_FUNC 4, movd, movd +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movq, movq +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 16, movq, movq +AVGH 16, 16 +AVGH 16, 8 + +INIT_XMM sse2 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 +INIT_XMM ssse3 +AVGH 16, 16 +AVGH 16, 8 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 +INIT_MMX ssse3 +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 +INIT_XMM avx2 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 %endif ;HIGH_BIT_DEPTH + ;============================================================================= ; pixel avg2 ;============================================================================= -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void pixel_avg2_wN( 
uint16_t *dst, int dst_stride, -; uint16_t *src1, int src_stride, +; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride, +; uint16_t *src1, intptr_t src_stride, ; uint16_t *src2, int height ); ;----------------------------------------------------------------------------- -%macro AVG2_W_ONE 2 -cglobal pixel_avg2_w%1_%2, 6,7,4*(mmsize/16) +%macro AVG2_W_ONE 1 +cglobal pixel_avg2_w%1, 6,7,4 sub r4, r2 lea r6, [r4+r3*2] .height_loop: movu m0, [r2] movu m1, [r2+r3*2] -%if mmsize == 8 +%if cpuflag(avx) || mmsize == 8 pavgw m0, [r2+r4] pavgw m1, [r2+r6] %else @@ -696,15 +771,15 @@ cglobal pixel_avg2_w%1_%2, 6,7,4*(mmsize/16) %endif mova [r0], m0 mova [r0+r1*2], m1 - sub r5d, 2 lea r2, [r2+r3*4] lea r0, [r0+r1*4] + sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro -%macro AVG2_W_TWO 4 -cglobal pixel_avg2_w%1_%4, 6,7,8*(mmsize/16) +%macro AVG2_W_TWO 3 +cglobal pixel_avg2_w%1, 6,7,8 sub r4, r2 lea r6, [r4+r3*2] .height_loop: @@ -731,23 +806,25 @@ cglobal pixel_avg2_w%1_%4, 6,7,8*(mmsize/16) %3 [r0+mmsize], m1 mova [r0+r1*2], m2 %3 [r0+r1*2+mmsize], m3 - sub r5d, 2 lea r2, [r2+r3*4] lea r0, [r0+r1*4] + sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro -INIT_MMX -AVG2_W_ONE 4, mmxext -AVG2_W_TWO 8, movu, mova, mmxext -INIT_XMM -AVG2_W_ONE 8, sse2 -AVG2_W_TWO 10, movd, movd, sse2 -AVG2_W_TWO 16, movu, mova, sse2 +INIT_MMX mmx2 +AVG2_W_ONE 4 +AVG2_W_TWO 8, movu, mova +INIT_XMM sse2 +AVG2_W_ONE 8 +AVG2_W_TWO 10, movd, movd +AVG2_W_TWO 16, movu, mova +INIT_YMM avx2 +AVG2_W_ONE 16 INIT_MMX -cglobal pixel_avg2_w10_mmxext, 6,7 +cglobal pixel_avg2_w10_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3*2] .height_loop: @@ -769,13 +846,13 @@ cglobal pixel_avg2_w10_mmxext, 6,7 mova [r0+r1*2+ 0], m3 mova [r0+r1*2+ 8], m4 movh [r0+r1*2+16], m5 - sub r5d, 2 lea r2, [r2+r3*2*2] lea r0, [r0+r1*2*2] + sub r5d, 2 jg .height_loop - REP_RET + RET -cglobal pixel_avg2_w16_mmxext, 6,7 +cglobal pixel_avg2_w16_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3*2] .height_loop: @@ -803,13 +880,13 @@ cglobal pixel_avg2_w16_mmxext, 6,7 mova [r0+r1*2+ 8], m5 mova [r0+r1*2+16], m6 mova [r0+r1*2+24], m7 - sub r5d, 2 lea r2, [r2+r3*2*2] lea r0, [r0+r1*2*2] + sub r5d, 2 jg .height_loop - REP_RET + RET -cglobal pixel_avg2_w18_mmxext, 6,7 +cglobal pixel_avg2_w18_mmx2, 6,7 sub r4, r2 .height_loop: movu m0, [r2+ 0] @@ -827,43 +904,56 @@ cglobal pixel_avg2_w18_mmxext, 6,7 mova [r0+16], m2 mova [r0+24], m3 movh [r0+32], m4 - sub r5d, 1 lea r2, [r2+r3*2] lea r0, [r0+r1*2] + dec r5d jg .height_loop - REP_RET + RET -INIT_XMM -cglobal pixel_avg2_w18_sse2, 6,7,6 +%macro PIXEL_AVG_W18 0 +cglobal pixel_avg2_w18, 6,7 sub r4, r2 .height_loop: movu m0, [r2+ 0] + movd xm2, [r2+32] +%if mmsize == 32 + pavgw m0, [r2+r4+ 0] + movd xm1, [r2+r4+32] + pavgw xm2, xm1 +%else movu m1, [r2+16] - movh m2, [r2+32] movu m3, [r2+r4+ 0] movu m4, [r2+r4+16] - movh m5, [r2+r4+32] + movd m5, [r2+r4+32] pavgw m0, m3 pavgw m1, m4 pavgw m2, m5 - mova [r0+ 0], m0 mova [r0+16], m1 - movh [r0+32], m2 - sub r5d, 1 +%endif + mova [r0+ 0], m0 + movd [r0+32], xm2 lea r2, [r2+r3*2] lea r0, [r0+r1*2] + dec r5d jg .height_loop - REP_RET + RET +%endmacro + +INIT_XMM sse2 +PIXEL_AVG_W18 +INIT_YMM avx2 +PIXEL_AVG_W18 + %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void pixel_avg2_w4( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src_stride, +; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride, +; uint8_t *src1, intptr_t src_stride, ; uint8_t *src2, int height ); 
;----------------------------------------------------------------------------- %macro AVG2_W8 2 -cglobal pixel_avg2_w%1_mmxext, 6,7 +cglobal pixel_avg2_w%1_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: @@ -877,14 +967,15 @@ cglobal pixel_avg2_w%1_mmxext, 6,7 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro +INIT_MMX AVG2_W8 4, movd AVG2_W8 8, movq %macro AVG2_W16 2 -cglobal pixel_avg2_w%1_mmxext, 6,7 +cglobal pixel_avg2_w%1_mmx2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: @@ -904,13 +995,13 @@ cglobal pixel_avg2_w%1_mmxext, 6,7 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro AVG2_W16 12, movd AVG2_W16 16, movq -cglobal pixel_avg2_w20_mmxext, 6,7 +cglobal pixel_avg2_w20_mmx2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: @@ -936,61 +1027,67 @@ cglobal pixel_avg2_w20_mmxext, 6,7 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET +INIT_XMM cglobal pixel_avg2_w16_sse2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: - movdqu xmm0, [r2] - movdqu xmm2, [r2+r3] - movdqu xmm1, [r2+r4] - movdqu xmm3, [r2+r6] + movu m0, [r2] + movu m2, [r2+r3] + movu m1, [r2+r4] + movu m3, [r2+r6] lea r2, [r2+r3*2] - pavgb xmm0, xmm1 - pavgb xmm2, xmm3 - movdqa [r0], xmm0 - movdqa [r0+r1], xmm2 + pavgb m0, m1 + pavgb m2, m3 + mova [r0], m0 + mova [r0+r1], m2 lea r0, [r0+r1*2] - sub r5d, 2 - jg .height_loop - REP_RET + sub r5d, 2 + jg .height_loop + RET -%macro AVG2_W20 1 -cglobal pixel_avg2_w20_%1, 6,7 +cglobal pixel_avg2_w20_sse2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: - movdqu xmm0, [r4] - movdqu xmm2, [r4+r3] -%ifidn %1, sse2_misalign - movd mm4, [r4+16] - movd mm5, [r4+r3+16] - pavgb xmm0, [r4+r2] - pavgb xmm2, [r4+r6] -%else - movdqu xmm1, [r4+r2] - movdqu xmm3, [r4+r6] - movd mm4, [r4+16] - movd mm5, [r4+r3+16] - pavgb xmm0, xmm1 - pavgb xmm2, xmm3 -%endif - pavgb mm4, [r4+r2+16] - pavgb mm5, [r4+r6+16] + movu m0, [r4] + movu m2, [r4+r3] + movu m1, [r4+r2] + movu m3, [r4+r6] + movd mm4, [r4+16] + movd mm5, [r4+r3+16] + pavgb m0, m1 + pavgb m2, m3 + pavgb mm4, [r4+r2+16] + pavgb mm5, [r4+r6+16] lea r4, [r4+r3*2] - movdqa [r0], xmm0 - movd [r0+16], mm4 - movdqa [r0+r1], xmm2 - movd [r0+r1+16], mm5 + mova [r0], m0 + mova [r0+r1], m2 + movd [r0+16], mm4 + movd [r0+r1+16], mm5 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + +INIT_YMM avx2 +cglobal pixel_avg2_w20, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movu m0, [r4] + movu m1, [r4+r3] + pavgb m0, [r4+r2] + pavgb m1, [r4+r6] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m1 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET -%endmacro - -AVG2_W20 sse2 -AVG2_W20 sse2_misalign + RET ; Cacheline split code for processors with high latencies for loads ; split over cache lines. See sad-a.asm for a more detailed explanation. 
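;
; As a concrete sketch of the test these functions perform (illustrative C
; only; splits_cacheline is an invented name, and width/cacheline correspond
; to the %1/%2 arguments of the AVG_CACHELINE_CHECK macro below):
;
;   #include <stdint.h>
;   /* A width-byte unaligned load touches two cachelines iff its start
;    * offset within a line exceeds cacheline - width. */
;   static int splits_cacheline( intptr_t src, int width, int cacheline )
;   {
;       return (src & (cacheline - 1)) > cacheline - width;
;   }
;
; AVG_CACHELINE_CHECK evaluates this predicate with "and eax, %2-1" and
; "cmp eax, %2-%1-(%1 % 8)", in effect rounding the width up to a multiple
; of 8 to match the granularity of the vector loads.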
@@ -1036,7 +1133,7 @@ AVG2_W20 sse2_misalign %endmacro %macro AVG_CACHELINE_FUNC 2 -pixel_avg2_w%1_cache_mmxext: +pixel_avg2_w%1_cache_mmx2: AVG_CACHELINE_START AVG_CACHELINE_LOOP 0, movq %if %1>8 @@ -1049,20 +1146,20 @@ pixel_avg2_w%1_cache_mmxext: add r0, r1 dec r5d jg .height_loop - REP_RET + RET %endmacro %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set %if %1 == 12 ;w12 isn't needed because w16 is just as fast if there's no cacheline split -%define cachesplit pixel_avg2_w16_cache_mmxext +%define cachesplit pixel_avg2_w16_cache_mmx2 %else -%define cachesplit pixel_avg2_w%1_cache_mmxext +%define cachesplit pixel_avg2_w%1_cache_mmx2 %endif cglobal pixel_avg2_w%1_cache%2_%3 mov eax, r2m - and eax, 0x1f|(%2>>1) - cmp eax, (32-%1-(%1 % 8))|(%2>>1) + and eax, %2-1 + cmp eax, (%2-%1-(%1 % 8)) %if %1==12||%1==20 jbe pixel_avg2_w%1_%3 %else @@ -1070,7 +1167,7 @@ cglobal pixel_avg2_w%1_cache%2_%3 %endif %if 0 ; or %1==8 - but the extra branch seems too expensive ja cachesplit -%ifdef ARCH_X86_64 +%if ARCH_X86_64 test r4b, 1 %else test byte r4m, 1 @@ -1082,25 +1179,25 @@ cglobal pixel_avg2_w%1_cache%2_%3 jz pixel_avg2_w%1_%3 mov eax, r2m %endif -%ifidn %3, sse2 - AVG_CACHELINE_FUNC %1, %2 -%elif %1==8 && %2==64 +%if mmsize==16 || (%1==8 && %2==64) AVG_CACHELINE_FUNC %1, %2 %else jmp cachesplit %endif %endmacro -AVG_CACHELINE_CHECK 8, 64, mmxext -AVG_CACHELINE_CHECK 12, 64, mmxext -%ifndef ARCH_X86_64 -AVG_CACHELINE_CHECK 16, 64, mmxext -AVG_CACHELINE_CHECK 20, 64, mmxext -AVG_CACHELINE_CHECK 8, 32, mmxext -AVG_CACHELINE_CHECK 12, 32, mmxext -AVG_CACHELINE_CHECK 16, 32, mmxext -AVG_CACHELINE_CHECK 20, 32, mmxext +INIT_MMX +AVG_CACHELINE_CHECK 8, 64, mmx2 +AVG_CACHELINE_CHECK 12, 64, mmx2 +%if ARCH_X86_64 == 0 +AVG_CACHELINE_CHECK 16, 64, mmx2 +AVG_CACHELINE_CHECK 20, 64, mmx2 +AVG_CACHELINE_CHECK 8, 32, mmx2 +AVG_CACHELINE_CHECK 12, 32, mmx2 +AVG_CACHELINE_CHECK 16, 32, mmx2 +AVG_CACHELINE_CHECK 20, 32, mmx2 %endif +INIT_XMM AVG_CACHELINE_CHECK 16, 64, sse2 AVG_CACHELINE_CHECK 20, 64, sse2 @@ -1136,7 +1233,9 @@ avg_w16_align%1_%2_ssse3: jg avg_w16_align%1_%2_ssse3 ret %if %1==0 - times 13 db 0x90 ; make sure the first ones don't end up short + ; make sure the first ones don't end up short + ALIGN 16 + times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop %endif %endmacro @@ -1150,7 +1249,7 @@ cglobal pixel_avg2_w16_cache64_ssse3 and eax, 7 jz x264_pixel_avg2_w16_sse2 %endif - PROLOGUE 6, 7 + PROLOGUE 6, 8 lea r6, [r4+r2] and r4, ~0xf and r6, 0x1f @@ -1160,17 +1259,12 @@ cglobal pixel_avg2_w16_cache64_ssse3 shl r6, 4 ;jump = (offset + align*2)*48 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3) %ifdef PIC - lea r11, [avg_w16_addr] - add r6, r11 + lea r7, [avg_w16_addr] + add r6, r7 %else lea r6, [avg_w16_addr + r6] %endif -%ifdef UNIX64 - jmp r6 -%else - call r6 - RET -%endif + TAIL_CALL r6, 1 %assign j 0 %assign k 1 @@ -1186,41 +1280,45 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k ; pixel copy ;============================================================================= -%macro COPY4 4 - %2 m0, [r2] - %2 m1, [r2+r3] - %2 m2, [r2+r3*2] - %2 m3, [r2+%4] - %1 [r0], m0 - %1 [r0+r1], m1 - %1 [r0+r1*2], m2 - %1 [r0+%3], m3 +%macro COPY1 2 + movu m0, [r2] + movu m1, [r2+r3] + movu m2, [r2+r3*2] + movu m3, [r2+%2] + mova [r0], m0 + mova [r0+r1], m1 + mova [r0+r1*2], m2 + mova [r0+%1], m3 %endmacro -%ifdef HIGH_BIT_DEPTH -%macro COPY_ONE 6 - COPY4 %1, %2, %3, %4 +%macro COPY2 2-4 0, 1 + movu m0, [r2+%3*mmsize] + movu m1, [r2+%4*mmsize] + movu m2, [r2+r3+%3*mmsize] 
+ movu m3, [r2+r3+%4*mmsize] + mova [r0+%3*mmsize], m0 + mova [r0+%4*mmsize], m1 + mova [r0+r1+%3*mmsize], m2 + mova [r0+r1+%4*mmsize], m3 + movu m0, [r2+r3*2+%3*mmsize] + movu m1, [r2+r3*2+%4*mmsize] + movu m2, [r2+%2+%3*mmsize] + movu m3, [r2+%2+%4*mmsize] + mova [r0+r1*2+%3*mmsize], m0 + mova [r0+r1*2+%4*mmsize], m1 + mova [r0+%1+%3*mmsize], m2 + mova [r0+%1+%4*mmsize], m3 %endmacro -%macro COPY_TWO 6 - %2 m0, [r2+%5] - %2 m1, [r2+%6] - %2 m2, [r2+r3+%5] - %2 m3, [r2+r3+%6] - %2 m4, [r2+r3*2+%5] - %2 m5, [r2+r3*2+%6] - %2 m6, [r2+%4+%5] - %2 m7, [r2+%4+%6] - %1 [r0+%5], m0 - %1 [r0+%6], m1 - %1 [r0+r1+%5], m2 - %1 [r0+r1+%6], m3 - %1 [r0+r1*2+%5], m4 - %1 [r0+r1*2+%6], m5 - %1 [r0+%3+%5], m6 - %1 [r0+%3+%6], m7 +%macro COPY4 2 + COPY2 %1, %2, 0, 1 + COPY2 %1, %2, 2, 3 %endmacro +;----------------------------------------------------------------------------- +; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride, +; uint8_t *src, intptr_t i_src_stride, int i_height ) +;----------------------------------------------------------------------------- INIT_MMX cglobal mc_copy_w4_mmx, 4,6 FIX_STRIDES r1, r3 @@ -1228,140 +1326,68 @@ cglobal mc_copy_w4_mmx, 4,6 lea r5, [r3*3] lea r4, [r1*3] je .end - COPY4 mova, mova, r4, r5 +%if HIGH_BIT_DEPTH == 0 + %define mova movd + %define movu movd +%endif + COPY1 r4, r5 lea r2, [r2+r3*4] lea r0, [r0+r1*4] -.end - COPY4 movu, mova, r4, r5 +.end: + COPY1 r4, r5 RET -cglobal mc_copy_w16_mmx, 5,7 +%macro MC_COPY 1 +%assign %%w %1*SIZEOF_PIXEL/mmsize +%if %%w > 0 +cglobal mc_copy_w%1, 5,7 FIX_STRIDES r1, r3 lea r6, [r3*3] lea r5, [r1*3] .height_loop: - COPY_TWO mova, movu, r5, r6, mmsize*0, mmsize*1 - COPY_TWO mova, movu, r5, r6, mmsize*2, mmsize*3 - sub r4d, 4 + COPY %+ %%w r5, r6 lea r2, [r2+r3*4] lea r0, [r0+r1*4] - jg .height_loop - REP_RET - -%macro MC_COPY 5 -cglobal mc_copy_w%2_%4, 5,7,%5 - FIX_STRIDES r1, r3 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - COPY_%1 mova, %3, r5, r6, 0, mmsize sub r4d, 4 - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] jg .height_loop - REP_RET -%endmacro - -MC_COPY TWO, 8, movu, mmx, 0 -INIT_XMM -MC_COPY ONE, 8, movu, sse2, 0 -MC_COPY TWO, 16, movu, sse2, 8 -MC_COPY TWO, 16, mova, aligned_sse2, 8 -%endif ; HIGH_BIT_DEPTH - -%ifndef HIGH_BIT_DEPTH -INIT_MMX -;----------------------------------------------------------------------------- -; void mc_copy_w4( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) -;----------------------------------------------------------------------------- -cglobal mc_copy_w4_mmx, 4,6 - cmp dword r4m, 4 - lea r5, [r3*3] - lea r4, [r1*3] - je .end - COPY4 movd, movd, r4, r5 - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] -.end: - COPY4 movd, movd, r4, r5 RET - -cglobal mc_copy_w8_mmx, 5,7 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - COPY4 movq, movq, r5, r6 - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET - -cglobal mc_copy_w16_mmx, 5,7 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - movq mm0, [r2] - movq mm1, [r2+8] - movq mm2, [r2+r3] - movq mm3, [r2+r3+8] - movq mm4, [r2+r3*2] - movq mm5, [r2+r3*2+8] - movq mm6, [r2+r6] - movq mm7, [r2+r6+8] - movq [r0], mm0 - movq [r0+8], mm1 - movq [r0+r1], mm2 - movq [r0+r1+8], mm3 - movq [r0+r1*2], mm4 - movq [r0+r1*2+8], mm5 - movq [r0+r5], mm6 - movq [r0+r5+8], mm7 - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET - -INIT_XMM -%macro COPY_W16_SSE2 2 -cglobal %1, 5,7 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - COPY4 movdqa, %2, r5, r6 - lea r2, [r2+r3*4] - lea 
r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET +%endif %endmacro -COPY_W16_SSE2 mc_copy_w16_sse2, movdqu -; cacheline split with mmx has too much overhead; the speed benefit is near-zero. -; but with SSE3 the overhead is zero, so there's no reason not to include it. -COPY_W16_SSE2 mc_copy_w16_sse3, lddqu -COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa -%endif ; !HIGH_BIT_DEPTH - - +INIT_MMX mmx +MC_COPY 8 +MC_COPY 16 +INIT_XMM sse +MC_COPY 8 +MC_COPY 16 +INIT_XMM aligned, sse +MC_COPY 16 +%if HIGH_BIT_DEPTH +INIT_YMM avx +MC_COPY 16 +INIT_YMM aligned, avx +MC_COPY 16 +%endif ;============================================================================= ; prefetch ;============================================================================= -; FIXME assumes 64 byte cachelines +; assumes 64 byte cachelines +; FIXME doesn't cover all pixels in high depth and/or 4:4:4 ;----------------------------------------------------------------------------- -; void prefetch_fenc( uint8_t *pix_y, int stride_y, -; uint8_t *pix_uv, int stride_uv, int mb_x ) +; void prefetch_fenc( pixel *pix_y, intptr_t stride_y, +; pixel *pix_uv, intptr_t stride_uv, int mb_x ) ;----------------------------------------------------------------------------- -%ifdef ARCH_X86_64 -cglobal prefetch_fenc_mmxext, 5,5 + +%macro PREFETCH_FENC 1 +%if ARCH_X86_64 +cglobal prefetch_fenc_%1, 5,5 + FIX_STRIDES r1, r3 and r4d, 3 mov eax, r4d imul r4d, r1d - lea r0, [r0+r4*4+64] + lea r0, [r0+r4*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] @@ -1369,19 +1395,25 @@ cglobal prefetch_fenc_mmxext, 5,5 prefetcht0 [r0+r1] imul eax, r3d - lea r2, [r2+rax*2+64] + lea r2, [r2+rax*2+64*SIZEOF_PIXEL] + prefetcht0 [r2] + prefetcht0 [r2+r3] +%ifidn %1, 422 + lea r2, [r2+r3*2] prefetcht0 [r2] prefetcht0 [r2+r3] +%endif RET %else -cglobal prefetch_fenc_mmxext, 0,3 +cglobal prefetch_fenc_%1, 0,3 mov r2, r4m mov r1, r1m mov r0, r0m + FIX_STRIDES r1 and r2, 3 imul r2, r1 - lea r0, [r0+r2*4+64] + lea r0, [r0+r2*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] @@ -1391,21 +1423,34 @@ cglobal prefetch_fenc_mmxext, 0,3 mov r2, r4m mov r1, r3m mov r0, r2m + FIX_STRIDES r1 and r2, 3 imul r2, r1 - lea r0, [r0+r2*2+64] + lea r0, [r0+r2*2+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] +%ifidn %1, 422 + lea r0, [r0+r1*2] + prefetcht0 [r0] + prefetcht0 [r0+r1] +%endif ret %endif ; ARCH_X86_64 +%endmacro + +INIT_MMX mmx2 +PREFETCH_FENC 420 +PREFETCH_FENC 422 ;----------------------------------------------------------------------------- -; void prefetch_ref( uint8_t *pix, int stride, int parity ) +; void prefetch_ref( pixel *pix, intptr_t stride, int parity ) ;----------------------------------------------------------------------------- -cglobal prefetch_ref_mmxext, 3,3 +INIT_MMX mmx2 +cglobal prefetch_ref, 3,3 + FIX_STRIDES r1 dec r2d and r2d, r1d - lea r0, [r0+r2*8+64] + lea r0, [r0+r2*8+64*SIZEOF_PIXEL] lea r2, [r1*3] prefetcht0 [r0] prefetcht0 [r0+r1] @@ -1424,18 +1469,23 @@ cglobal prefetch_ref_mmxext, 3,3 ; chroma MC ;============================================================================= -%ifdef ARCH_X86_64 - DECLARE_REG_TMP 10,11,6 +%if ARCH_X86_64 + DECLARE_REG_TMP 6,7,8 %else DECLARE_REG_TMP 0,1,2 %endif -%macro MC_CHROMA_START 0 +%macro MC_CHROMA_START 1 +%if ARCH_X86_64 + PROLOGUE 0,9,%1 +%else + PROLOGUE 0,6,%1 +%endif movifnidn r3, r3mp movifnidn r4d, r4m movifnidn r5d, r5m - movifnidn t2d, r6m - mov t0d, t2d + movifnidn t0d, r6m + mov t2d, t0d mov t1d, r5d sar t0d, 3 sar t1d, 3 @@ -1446,49 
+1496,48 @@ cglobal prefetch_ref_mmxext, 3,3 add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro UNPACK_UNALIGNED 4 movu %1, [%4+0] movu %2, [%4+4] - mova %3, %1 + punpckhwd %3, %1, %2 punpcklwd %1, %2 - punpckhwd %3, %2 - mova %2, %1 %if mmsize == 8 + mova %2, %1 punpcklwd %1, %3 punpckhwd %2, %3 %else - shufps %1, %3, 10001000b - shufps %2, %3, 11011101b + shufps %2, %1, %3, q3131 + shufps %1, %3, q2020 %endif %endmacro %else ; !HIGH_BIT_DEPTH -%macro UNPACK_UNALIGNED_MEM 3 +%macro UNPACK_UNALIGNED 3 +%if mmsize == 8 punpcklwd %1, %3 -%endmacro - -%macro UNPACK_UNALIGNED_LOAD 3 +%else movh %2, %3 punpcklwd %1, %2 +%endif %endmacro %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride, -; uint8_t *src, int src_stride, +; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride, +; uint8_t *src, intptr_t src_stride, ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- -%macro MC_CHROMA 1 -cglobal mc_chroma_%1, 0,6 - MC_CHROMA_START +%macro MC_CHROMA 0 +cglobal mc_chroma + MC_CHROMA_START 0 FIX_STRIDES r4 and r5d, 7 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 jz .mc1dy %endif and t2d, 7 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 jz .mc1dx %endif shl r5d, 16 @@ -1501,7 +1550,7 @@ cglobal mc_chroma_%1, 0,6 %if mmsize==8 .skip_prologue: %else - jl mc_chroma_mmxext %+ .skip_prologue + jl mc_chroma_mmx2 %+ .skip_prologue WIN64_SPILL_XMM 9 %endif movd m5, t2d @@ -1512,21 +1561,21 @@ cglobal mc_chroma_%1, 0,6 pxor m6, m6 punpcklbw m5, m6 %if mmsize==8 - pshufw m7, m5, 0xee - pshufw m6, m5, 0x00 - pshufw m5, m5, 0x55 + pshufw m7, m5, q3232 + pshufw m6, m5, q0000 + pshufw m5, m5, q1111 jge .width4 %else -%ifdef WIN64 +%if WIN64 cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM %endif - pshufd m7, m5, 0x55 + pshufd m7, m5, q1111 punpcklwd m5, m5 - pshufd m6, m5, 0x00 - pshufd m5, m5, 0x55 + pshufd m6, m5, q0000 + pshufd m5, m5, q1111 jg .width8 %endif -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH add r2, r2 UNPACK_UNALIGNED m0, m1, m2, r3 %else @@ -1539,10 +1588,10 @@ cglobal mc_chroma_%1, 0,6 pmaddwd m0, m7 pmaddwd m1, m7 packssdw m0, m1 - SWAP m3, m0 + SWAP 3, 0 ALIGN 4 .loop2: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH UNPACK_UNALIGNED m0, m1, m2, r3+r4 pmullw m3, m6 %else ; !HIGH_BIT_DEPTH @@ -1562,7 +1611,7 @@ ALIGN 4 pmullw m0, m5 paddw m0, m2 psrlw m0, 6 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movh [r0], m0 %if mmsize == 8 psrlq m0, 32 @@ -1585,15 +1634,19 @@ ALIGN 4 add r1, r2 dec r5d jg .loop2 - REP_RET + RET %if mmsize==8 .width4: -%ifdef ARCH_X86_64 +%if ARCH_X86_64 mov t0, r0 mov t1, r1 mov t2, r3 +%if WIN64 + %define multy0 r4m +%else %define multy0 [rsp-8] +%endif mova multy0, m5 %else mov r3m, r3 @@ -1602,9 +1655,9 @@ ALIGN 4 %endif %else .width8: -%ifdef ARCH_X86_64 +%if ARCH_X86_64 %define multy0 m8 - SWAP m8, m5 + SWAP 8, 5 %else %define multy0 r0m mova multy0, m5 @@ -1612,7 +1665,7 @@ ALIGN 4 %endif FIX_STRIDES r2 .loopx: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH UNPACK_UNALIGNED m0, m2, m4, r3 UNPACK_UNALIGNED m1, m3, m5, r3+mmsize %else @@ -1620,12 +1673,10 @@ ALIGN 4 movu m1, [r3+mmsize/2] UNPACK_UNALIGNED m0, m2, [r3+2] UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] - mova m2, m0 - mova m3, m1 + psrlw m2, m0, 8 + psrlw m3, m1, 8 pand m0, [pw_00ff] pand m1, [pw_00ff] - psrlw m2, 8 - psrlw m3, 8 %endif pmaddwd m0, m7 pmaddwd m2, m7 @@ 
-1633,12 +1684,12 @@ ALIGN 4 pmaddwd m3, m7 packssdw m0, m2 packssdw m1, m3 - SWAP m4, m0 - SWAP m5, m1 + SWAP 4, 0 + SWAP 5, 1 add r3, r4 ALIGN 4 .loop4: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH UNPACK_UNALIGNED m0, m1, m2, r3 pmaddwd m0, m7 pmaddwd m1, m7 @@ -1652,12 +1703,10 @@ ALIGN 4 movu m1, [r3+mmsize/2] UNPACK_UNALIGNED m0, m2, [r3+2] UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] - mova m2, m0 - mova m3, m1 + psrlw m2, m0, 8 + psrlw m3, m1, 8 pand m0, [pw_00ff] pand m1, [pw_00ff] - psrlw m2, 8 - psrlw m3, 8 pmaddwd m0, m7 pmaddwd m2, m7 pmaddwd m1, m7 @@ -1668,9 +1717,8 @@ ALIGN 4 pmullw m4, m6 pmullw m5, m6 mova m2, [pw_32] - mova m3, m2 + paddw m3, m2, m5 paddw m2, m4 - paddw m3, m5 mova m4, m0 mova m5, m1 pmullw m0, multy0 @@ -1679,7 +1727,7 @@ ALIGN 4 paddw m1, m3 psrlw m0, 6 psrlw m1, 6 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movh [r0], m0 movh [r0+mmsize/2], m1 %if mmsize==8 @@ -1694,12 +1742,12 @@ ALIGN 4 %else ; !HIGH_BIT_DEPTH packuswb m0, m1 %if mmsize==8 - pshufw m1, m0, 0x8 - pshufw m0, m0, 0xd + pshufw m1, m0, q0020 + pshufw m0, m0, q0031 movd [r0], m1 movd [r1], m0 %else - pshufd m0, m0, 0xd8 + pshufd m0, m0, q3120 movq [r0], m0 movhps [r1], m0 %endif @@ -1710,13 +1758,13 @@ ALIGN 4 dec r5d jg .loop4 %if mmsize!=8 - REP_RET + RET %else sub dword r7m, 4 jg .width8 - REP_RET + RET .width8: -%ifdef ARCH_X86_64 +%if ARCH_X86_64 lea r3, [t2+8*SIZEOF_PIXEL] lea r0, [t0+4*SIZEOF_PIXEL] lea r1, [t1+4*SIZEOF_PIXEL] @@ -1732,13 +1780,12 @@ ALIGN 4 jmp .loopx %endif -%ifdef ARCH_X86_64 ; too many regs for x86_32 +%if ARCH_X86_64 ; too many regs for x86_32 RESET_MM_PERMUTATION -%ifdef WIN64 -%if xmm_regs_used > 6 - %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16 - %assign xmm_regs_used 6 -%endif +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + %assign xmm_regs_used 0 %endif .mc1dy: and t2d, 7 @@ -1749,10 +1796,8 @@ ALIGN 4 movd m5, r5d mov r6d, 2*SIZEOF_PIXEL .mc1d: -%ifdef HIGH_BIT_DEPTH -%if mmsize == 16 +%if HIGH_BIT_DEPTH && mmsize == 16 WIN64_SPILL_XMM 8 -%endif %endif mova m4, [pw_8] SPLATW m5, m5 @@ -1764,13 +1809,13 @@ ALIGN 4 movifnidn r5d, r8m cmp dword r7m, 4 jg .mc1d_w8 - mov r10, r2 - mov r11, r4 + mov r7, r2 + mov r8, r4 %if mmsize!=8 shr r5d, 1 %endif .loop1d_w4: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if mmsize == 8 movq m0, [r3+0] movq m2, [r3+8] @@ -1779,7 +1824,7 @@ ALIGN 4 %else movu m0, [r3] movu m1, [r3+r6] - add r3, r11 + add r3, r8 movu m2, [r3] movu m3, [r3+r6] %endif @@ -1795,16 +1840,14 @@ ALIGN 4 movq m0, [r3] movq m1, [r3+r6] %if mmsize!=8 - add r3, r11 + add r3, r8 movhps m0, [r3] movhps m1, [r3+r6] %endif - mova m2, m0 - mova m3, m1 + psrlw m2, m0, 8 + psrlw m3, m1, 8 pand m0, [pw_00ff] pand m1, [pw_00ff] - psrlw m2, 8 - psrlw m3, 8 %endif ; HIGH_BIT_DEPTH pmullw m0, m4 pmullw m1, m5 @@ -1816,24 +1859,24 @@ ALIGN 4 paddw m2, m3 psrlw m0, 3 psrlw m2, 3 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if mmsize == 8 - xchg r4, r11 - xchg r2, r10 + xchg r4, r8 + xchg r2, r7 %endif movq [r0], m0 movq [r1], m2 %if mmsize == 16 - add r0, r10 - add r1, r10 + add r0, r7 + add r1, r7 movhps [r0], m0 movhps [r1], m2 %endif %else ; !HIGH_BIT_DEPTH packuswb m0, m2 %if mmsize==8 - xchg r4, r11 - xchg r2, r10 + xchg r4, r8 + xchg r2, r7 movd [r0], m0 psrlq m0, 32 movd [r1], m0 @@ -1841,8 +1884,8 @@ ALIGN 4 movhlps m1, m0 movd [r0], m0 movd [r1], m1 - add r0, r10 - add r1, r10 + add r0, r7 + add r1, r7 psrldq m0, 4 psrldq m1, 4 movd [r0], m0 @@ -1854,12 +1897,12 @@ ALIGN 4 add r1, r2 dec r5d jg .loop1d_w4 
- REP_RET + RET .mc1d_w8: sub r2, 4*SIZEOF_PIXEL sub r4, 8*SIZEOF_PIXEL - mov r10, 4*SIZEOF_PIXEL - mov r11, 8*SIZEOF_PIXEL + mov r7, 4*SIZEOF_PIXEL + mov r8, 8*SIZEOF_PIXEL %if mmsize==8 shl r5d, 1 %endif @@ -1867,11 +1910,13 @@ ALIGN 4 %endif ; ARCH_X86_64 %endmacro ; MC_CHROMA - -%macro MC_CHROMA_SSSE3 0-1 -INIT_XMM -cglobal mc_chroma_ssse3%1, 0,6,9 - MC_CHROMA_START +%macro MC_CHROMA_SSSE3 0 +cglobal mc_chroma +%if cpuflag(avx2) + MC_CHROMA_START 9 +%else + MC_CHROMA_START 10 +%endif and r5d, 7 and t2d, 7 mov t0d, r5d @@ -1882,18 +1927,18 @@ cglobal mc_chroma_ssse3%1, 0,6,9 sub r5d, t2d imul t2d, t0d ; (x*255+8)*y imul r5d, t0d ; (x*255+8)*(8-y) - movd m6, t2d - movd m7, r5d -%ifidn %1, _cache64 + movd xm6, t2d + movd xm7, r5d +%if cpuflag(cache64) mov t0d, r3d and t0d, 7 %ifdef PIC lea t1, [ch_shuf_adj] - movddup m5, [t1 + t0*4] + movddup xm5, [t1 + t0*4] %else - movddup m5, [ch_shuf_adj + t0*4] + movddup xm5, [ch_shuf_adj + t0*4] %endif - paddb m5, [ch_shuf] + paddb xm5, [ch_shuf] and r3, ~7 %else mova m5, [ch_shuf] @@ -1902,33 +1947,98 @@ cglobal mc_chroma_ssse3%1, 0,6,9 movifnidn r1, r1mp movifnidn r2d, r2m movifnidn r5d, r8m +%if cpuflag(avx2) + vpbroadcastw m6, xm6 + vpbroadcastw m7, xm7 +%else SPLATW m6, m6 SPLATW m7, m7 +%endif +%if ARCH_X86_64 + %define shiftround m8 + mova m8, [pw_512] +%else + %define shiftround [pw_512] +%endif cmp dword r7m, 4 jg .width8 - movu m0, [r3] + +%if cpuflag(avx2) +.loop4: + movu xm0, [r3] + movu xm1, [r3+r4] + vinserti128 m0, m0, [r3+r4], 1 + vinserti128 m1, m1, [r3+r4*2], 1 + pshufb m0, m5 + pshufb m1, m5 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + paddw m0, m1 + pmulhrsw m0, shiftround + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [r0], xm0 + movd [r0+r2], xm1 + psrldq xm0, 4 + psrldq xm1, 4 + movd [r1], xm0 + movd [r1+r2], xm1 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loop4 + RET +.width8: + movu xm0, [r3] + vinserti128 m0, m0, [r3+8], 1 + pshufb m0, m5 +.loop8: + movu xm3, [r3+r4] + vinserti128 m3, m3, [r3+r4+8], 1 + pshufb m3, m5 + pmaddubsw m1, m0, m7 + pmaddubsw m2, m3, m6 + pmaddubsw m3, m3, m7 + + movu xm0, [r3+r4*2] + vinserti128 m0, m0, [r3+r4*2+8], 1 pshufb m0, m5 + pmaddubsw m4, m0, m6 + + paddw m1, m2 + paddw m3, m4 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround + packuswb m1, m3 + mova m2, [deinterleave_shufd] + vpermd m1, m2, m1 + vextracti128 xm2, m1, 1 + movq [r0], xm1 + movhps [r1], xm1 + movq [r0+r2], xm2 + movhps [r1+r2], xm2 +%else + movu m0, [r3] + pshufb m0, xm5 .loop4: movu m1, [r3+r4] pshufb m1, m5 movu m3, [r3+r4*2] pshufb m3, m5 - mova m2, m1 mova m4, m3 pmaddubsw m0, m7 + pmaddubsw m2, m1, m7 pmaddubsw m1, m6 - pmaddubsw m2, m7 pmaddubsw m3, m6 - paddw m0, [pw_32] - paddw m2, [pw_32] paddw m1, m0 paddw m3, m2 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 packuswb m1, m3 movhlps m3, m1 - movd [r0], m1 + movd [r0], xm1 movd [r0+r2], m3 psrldq m1, 4 psrldq m3, 4 @@ -1939,16 +2049,15 @@ cglobal mc_chroma_ssse3%1, 0,6,9 lea r1, [r1+r2*2] sub r5d, 2 jg .loop4 - REP_RET - + RET .width8: movu m0, [r3] pshufb m0, m5 movu m1, [r3+8] pshufb m1, m5 -%ifdef ARCH_X86_64 - SWAP m8, m6 - %define mult1 m8 +%if ARCH_X86_64 + SWAP 9, 6 + %define mult1 m9 %else mova r0m, m6 %define mult1 r0m @@ -1964,14 +2073,12 @@ cglobal mc_chroma_ssse3%1, 0,6,9 pmaddubsw m1, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m0, [pw_32] - paddw m1, [pw_32] paddw m0, m2 paddw m1, m3 - psrlw m0, 6 - psrlw m1, 6 + pmulhrsw m0, shiftround ; x + 32 >> 6 + pmulhrsw m1, 
shiftround packuswb m0, m1 - pshufd m0, m0, 0xd8 + pshufd m0, m0, q3120 movq [r0], m0 movhps [r1], m0 @@ -1985,37 +2092,41 @@ cglobal mc_chroma_ssse3%1, 0,6,9 pmaddubsw m6, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m4, [pw_32] - paddw m6, [pw_32] paddw m2, m4 paddw m3, m6 - psrlw m2, 6 - psrlw m3, 6 + pmulhrsw m2, shiftround + pmulhrsw m3, shiftround packuswb m2, m3 - pshufd m2, m2, 0xd8 + pshufd m2, m2, q3120 movq [r0+r2], m2 movhps [r1+r2], m2 +%endif lea r3, [r3+r4*2] lea r0, [r0+r2*2] lea r1, [r1+r2*2] sub r5d, 2 jg .loop8 - REP_RET + RET %endmacro -%ifdef HIGH_BIT_DEPTH -INIT_MMX -MC_CHROMA mmxext -INIT_XMM -MC_CHROMA sse2 +%if HIGH_BIT_DEPTH +INIT_MMX mmx2 +MC_CHROMA +INIT_XMM sse2 +MC_CHROMA +INIT_XMM avx +MC_CHROMA %else ; !HIGH_BIT_DEPTH -INIT_MMX -%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM -MC_CHROMA mmxext -INIT_XMM -MC_CHROMA sse2_misalign -%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD -MC_CHROMA sse2 +INIT_MMX mmx2 +MC_CHROMA +INIT_XMM sse2 +MC_CHROMA +INIT_XMM ssse3 +MC_CHROMA_SSSE3 +INIT_XMM ssse3, cache64 +MC_CHROMA_SSSE3 +INIT_XMM avx +MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64 +INIT_YMM avx2 MC_CHROMA_SSSE3 -MC_CHROMA_SSSE3 _cache64 %endif ; HIGH_BIT_DEPTH
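
;
; The mc_chroma kernels implement the eighth-pel bilinear chroma filter.
; For one plane the arithmetic is as follows, in scalar C (a sketch only:
; the name mc_chroma_plane_ref and its signature are invented here, and the
; real functions additionally deinterleave the two chroma planes, which is
; what ch_shuf and UNPACK_UNALIGNED handle):
;
;   #include <stdint.h>
;   /* dx = mvx&7, dy = mvy&7; src has already been advanced by
;    * (mvx>>3) + (mvy>>3)*src_stride, as done in MC_CHROMA_START. */
;   static void mc_chroma_plane_ref( uint8_t *dst, intptr_t dst_stride,
;                                    const uint8_t *src, intptr_t src_stride,
;                                    int dx, int dy, int width, int height )
;   {
;       int cA = (8-dx)*(8-dy), cB = dx*(8-dy);
;       int cC = (8-dx)*dy,     cD = dx*dy;      /* cA+cB+cC+cD == 64 */
;       for( int y = 0; y < height; y++, dst += dst_stride, src += src_stride )
;           for( int x = 0; x < width; x++ )
;               dst[x] = ( cA*src[x]            + cB*src[x+1]
;                        + cC*src[x+src_stride] + cD*src[x+src_stride+1]
;                        + 32 ) >> 6;
;   }
;
; When dx or dy is zero, two of the weights vanish and the filter collapses
; to the 1-D form (a*(8-d) + b*d + 4) >> 3, which is what the .mc1dx/.mc1dy
; paths compute.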
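;
; Several of the new SSSE3/AVX2 paths replace the old rounding pair
; "paddw m0, [pw_32] / psraw m0, 6" with a single pmulhrsw against pw_512.
; Per int16_t lane the instruction computes the following (mulhrsw is an
; invented name for the per-lane operation):
;
;   #include <stdint.h>
;   static int16_t mulhrsw( int16_t a, int16_t b )
;   {
;       return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
;   }
;   /* b = 512:  (a*512 + 16384) >> 15  ==  (512*(a + 32)) >> 15
;    *                                  ==  (a + 32) >> 6           */
;
; which is exactly the "+32, >>6" rounding the biweight kernels need after
; pmaddubsw has formed src1*w + src2*(64-w) on interleaved bytes. The
; WEIGHTER .fast path plays the same trick after pre-shifting the scale
; left by 7, so the >>15 also absorbs the weight shift; its
; "cmp byte [r4+1], 0" guard ensures the shifted scale cannot overflow
; an int16_t.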