X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fmc-a.asm;h=cd69c824a2e1ff165d4e100f18bc3c4e161436dd;hb=5265b927b0f2e043dd39cbbbf3909da0862d60e6;hp=4555526a78766bdef0b2b61c4dd8bbabed3b3307;hpb=1921c6824e37bdf5a8436a6cbe36b0d3a8c376b3;p=x264 diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 4555526a..cd69c824 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2003-2011 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser @@ -34,7 +34,7 @@ SECTION_RODATA 32 -ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 ch_shuf_adj: times 8 db 0 times 8 db 2 times 8 db 4 @@ -49,22 +49,26 @@ cextern pw_4 cextern pw_8 cextern pw_32 cextern pw_64 +cextern pw_512 cextern pw_00ff cextern pw_pixel_max cextern sw_64 cextern pd_32 +cextern deinterleave_shufd ;============================================================================= ; implicit weighted biprediction ;============================================================================= ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 -%ifdef ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,10,11 +%if WIN64 + DECLARE_REG_TMP 0,1,2,3,4,5,4,5 %macro AVG_START 0-1 0 PROLOGUE 6,7,%1 -%ifdef WIN64 - movsxd r5, r5d -%endif + %endmacro +%elif UNIX64 + DECLARE_REG_TMP 0,1,2,3,4,5,7,8 + %macro AVG_START 0-1 0 + PROLOGUE 6,9,%1 %endmacro %else DECLARE_REG_TMP 1,2,3,4,5,6,1,2 @@ -80,15 +84,15 @@ cextern pd_32 %endif %macro AVG_END 0 - sub eax, 2 lea t4, [t4+t5*2*SIZEOF_PIXEL] lea t2, [t2+t3*2*SIZEOF_PIXEL] lea t0, [t0+t1*2*SIZEOF_PIXEL] + sub eax, 2 jg .height_loop - REP_RET + RET %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro BIWEIGHT_MMX 2 movh m0, %1 @@ -139,8 +143,7 @@ cextern pd_32 movh m1, %2 punpcklbw m0, m1 pmaddubsw m0, m3 - paddw m0, m4 - psraw m0, 6 + pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 @@ -149,12 +152,16 @@ cextern pd_32 sub t7d, t6d shl t7d, 8 add t6d, t7d - movd m3, t6d - mova m4, [pw_32] + mova m4, [pw_512] + movd xm3, t6d +%if cpuflag(avx2) + vpbroadcastw m3, xm3 +%else SPLATW m3, m3 ; weight_dst,src +%endif %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro BIWEIGHT_ROW 4 BIWEIGHT [%2], [%3] %if %4==mmsize/4 @@ -187,13 +194,13 @@ cextern pd_32 %endif ;HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight ) +; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight ) ;----------------------------------------------------------------------------- %macro AVG_WEIGHT 1-2 0 cglobal pixel_avg_weight_w%1 BIWEIGHT_START AVG_START %2 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m7, [pw_pixel_max] %endif .height_loop: @@ -201,7 +208,7 @@ cglobal pixel_avg_weight_w%1 BIWEIGHT [t2], [t4] SWAP 0, 6 BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH packssdw m6, m0 CLIPW m6, m5, m7 %else ;!HIGH_BIT_DEPTH @@ -226,7 +233,7 @@ INIT_MMX mmx2 AVG_WEIGHT 4 AVG_WEIGHT 8 AVG_WEIGHT 16 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM sse2 AVG_WEIGHT 4, 8 AVG_WEIGHT 8, 8 @@ -242,14 +249,34 @@ AVG_WEIGHT 4 INIT_XMM ssse3 AVG_WEIGHT 8, 7 
AVG_WEIGHT 16, 7 + +INIT_YMM avx2 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 m0, m0, [t2+t3], 1 + vinserti128 m1, m1, [t4+t5], 1 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], m0, 1 + AVG_END %endif ;HIGH_BIT_DEPTH ;============================================================================= ; P frame explicit weighted prediction ;============================================================================= -%ifdef HIGH_BIT_DEPTH -%macro WEIGHT_START 1 ; (width) +%if HIGH_BIT_DEPTH +; width +%macro WEIGHT_START 1 mova m0, [r4+ 0] ; 1<= mmsize - WEIGHT_ROW (%1+x), (%2+x), mmsize ; weight 1 mmsize - WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight 1 mmsize + WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 %assign x (x+mmsize) %else - WEIGHT_COL (%1+x),(%2+x),(%3-x) - %exitrep + %assign w %3-x +%if w == 20 + %assign w 16 +%endif + WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4 + %assign x (x+w) %endif %if x >= %3 %exitrep @@ -408,39 +476,37 @@ AVG_WEIGHT 16, 7 %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h ) +;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h ) ;----------------------------------------------------------------------------- -%ifdef ARCH_X86_64 -%define NUMREGS 6 -%define LOAD_HEIGHT -%define HEIGHT_REG r5d -%define TMP_REG r6d -%else -%define NUMREGS 5 -%define TMP_REG r5d -%define LOAD_HEIGHT mov r4d, r5m -%define HEIGHT_REG r4d -%endif - -%assign XMMREGS 7 -%ifdef HIGH_BIT_DEPTH -%assign NUMREGS NUMREGS+1 -%assign XMMREGS 8 -%endif - %macro WEIGHTER 1 - cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS +cglobal mc_weight_w%1, 6,6,8 FIX_STRIDES r1, r3 WEIGHT_START %1 - LOAD_HEIGHT +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 + ; we can merge the shift step into the scale factor + ; if (m3<<7) doesn't overflow an int16_t + cmp byte [r4+1], 0 + jz .fast +%endif .loop: - WEIGHT_TWO_ROW r2, r0, %1 + WEIGHT_TWO_ROW r2, r0, %1, 0 lea r0, [r0+r1*2] lea r2, [r2+r3*2] - sub HEIGHT_REG, 2 + sub r5d, 2 jg .loop - REP_RET + RET +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 +.fast: + psllw m3, 7 +.fastloop: + WEIGHT_TWO_ROW r2, r0, %1, 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .fastloop + RET +%endif %endmacro INIT_MMX mmx2 @@ -453,24 +519,17 @@ INIT_XMM sse2 WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH WEIGHTER 12 -INIT_XMM avx -WEIGHTER 8 -WEIGHTER 12 -WEIGHTER 16 -WEIGHTER 20 %else -%define WEIGHT WEIGHT_SSSE3 -%define WEIGHT_START WEIGHT_START_SSSE3 INIT_MMX ssse3 WEIGHTER 4 INIT_XMM ssse3 WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 -INIT_XMM avx -WEIGHTER 8 +INIT_YMM avx2 +WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 %endif @@ -478,7 +537,7 @@ WEIGHTER 20 %macro OFFSET_OP 7 mov%6 m0, [%1] mov%6 m1, [%2] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH p%5usw m0, m2 p%5usw m1, m2 %ifidn %5,add @@ -500,7 +559,7 @@ WEIGHTER 20 OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a %assign x (x+mmsize) %else -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h %else OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d @@ -514,25 +573,24 @@ WEIGHTER 20 %endmacro 
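
Several hunks above replace a paddw-with-pw_32 plus psraw-by-6 pair with a single pmulhrsw against pw_512, and the new mc_weight fast path pre-shifts the scale factor left by 7 for the same reason (guarded by the "(m3<<7) doesn't overflow an int16_t" check). The identity being relied on is that pmulhrsw by 512 is exactly "add 32, shift right by 6" on every 16-bit lane value; a minimal C sketch (the helper name is illustrative, not part of x264):

#include <assert.h>
#include <stdint.h>

/* pmulhrsw computes (a*b*2 + 0x8000) >> 16 per signed 16-bit lane. */
static int16_t pmulhrsw_lane(int16_t a, int16_t b)
{
    return (int16_t)((a * b * 2 + 0x8000) >> 16);  /* arithmetic shift, as on x86 */
}

int main(void)
{
    /* With b = 512 (pw_512) this is (a*1024 + 0x8000) >> 16 == (a + 32) >> 6,
     * so the separate rounding add and shift can be dropped. */
    for (int a = -32768; a <= 32767; a++)
        assert(pmulhrsw_lane((int16_t)a, 512) == ((a + 32) >> 6));
    return 0;
}
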
;----------------------------------------------------------------------------- -;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h ) +;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h ) ;----------------------------------------------------------------------------- %macro OFFSET 2 - cglobal mc_offset%2_w%1, NUMREGS, NUMREGS +cglobal mc_offset%2_w%1, 6,6 FIX_STRIDES r1, r3 mova m2, [r4] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %ifidn %2,add mova m3, [pw_pixel_max] %endif %endif - LOAD_HEIGHT .loop: OFFSET_TWO_ROW r2, r0, %1, %2 lea r0, [r0+r1*2] lea r2, [r2+r3*2] - sub HEIGHT_REG, 2 + sub r5d, 2 jg .loop - REP_RET + RET %endmacro %macro OFFSETPN 1 @@ -549,20 +607,10 @@ INIT_XMM sse2 OFFSETPN 12 OFFSETPN 16 OFFSETPN 20 -INIT_XMM avx -OFFSETPN 12 -OFFSETPN 16 -OFFSETPN 20 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM sse2 OFFSETPN 8 -INIT_XMM avx -OFFSETPN 8 %endif -%undef LOAD_HEIGHT -%undef HEIGHT_REG -%undef NUMREGS - ;============================================================================= @@ -570,25 +618,28 @@ OFFSETPN 8 ;============================================================================= ;----------------------------------------------------------------------------- -; void pixel_avg_4x4( pixel *dst, int dst_stride, -; pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight ); +; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int weight ); ;----------------------------------------------------------------------------- %macro AVGH 2 cglobal pixel_avg_%1x%2 mov eax, %2 cmp dword r6m, 32 jne pixel_avg_weight_w%1 %+ SUFFIX +%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads + jmp pixel_avg_w%1_avx2 +%else %if mmsize == 16 && %1 == 16 test dword r4m, 15 jz pixel_avg_w%1_sse2 %endif jmp pixel_avg_w%1_mmx2 +%endif %endmacro ;----------------------------------------------------------------------------- -; void pixel_avg_w4( pixel *dst, int dst_stride, -; pixel *src1, int src1_stride, pixel *src2, int src2_stride, -; int height, int weight ); +; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int height, int weight ); ;----------------------------------------------------------------------------- %macro AVG_FUNC 3 @@ -599,7 +650,7 @@ cglobal pixel_avg_w%1 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize %2 m0, [t2+x] %2 m1, [t2+x+SIZEOF_PIXEL*t3] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pavgw m0, [t4+x] pavgw m1, [t4+x+SIZEOF_PIXEL*t5] %else ;!HIGH_BIT_DEPTH @@ -613,10 +664,11 @@ cglobal pixel_avg_w%1 AVG_END %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_MMX mmx2 AVG_FUNC 4, movq, movq +AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 @@ -632,6 +684,7 @@ AVGH 16, 8 INIT_XMM sse2 AVG_FUNC 4, movq, movq +AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 @@ -649,6 +702,7 @@ AVGH 16, 8 INIT_MMX mmx2 AVG_FUNC 4, movd, movd +AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 @@ -676,9 +730,14 @@ AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 INIT_MMX ssse3 +AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 +INIT_XMM avx2 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 %endif ;HIGH_BIT_DEPTH @@ -688,10 +747,10 @@ AVGH 4, 2 ; pixel avg2 ;============================================================================= -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH 
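
The AVGH stubs above take the plain pavg-based path only when the caller passes i_weight == 32, i.e. an even 50/50 blend; any other weight branches to pixel_avg_weight_w* instead. The pixel_avg2_w* functions below apply the same per-pixel operation to two reference rows. A small C reference of that rounded average (the function name and signature here are illustrative, not x264's C fallback):

#include <stdint.h>

static void pixel_avg_ref(uint8_t *dst, intptr_t dst_stride,
                          const uint8_t *src1, intptr_t src1_stride,
                          const uint8_t *src2, intptr_t src2_stride,
                          int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = (uint8_t)((src1[x] + src2[x] + 1) >> 1);  /* pavgb/pavgw per lane */
        dst  += dst_stride;
        src1 += src1_stride;
        src2 += src2_stride;
    }
}
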
;----------------------------------------------------------------------------- -; void pixel_avg2_wN( uint16_t *dst, int dst_stride, -; uint16_t *src1, int src_stride, +; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride, +; uint16_t *src1, intptr_t src_stride, ; uint16_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W_ONE 1 @@ -701,7 +760,7 @@ cglobal pixel_avg2_w%1, 6,7,4 .height_loop: movu m0, [r2] movu m1, [r2+r3*2] -%if mmsize == 8 +%if cpuflag(avx) || mmsize == 8 pavgw m0, [r2+r4] pavgw m1, [r2+r6] %else @@ -712,11 +771,11 @@ cglobal pixel_avg2_w%1, 6,7,4 %endif mova [r0], m0 mova [r0+r1*2], m1 - sub r5d, 2 lea r2, [r2+r3*4] lea r0, [r0+r1*4] + sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro %macro AVG2_W_TWO 3 @@ -747,11 +806,11 @@ cglobal pixel_avg2_w%1, 6,7,8 %3 [r0+mmsize], m1 mova [r0+r1*2], m2 %3 [r0+r1*2+mmsize], m3 - sub r5d, 2 lea r2, [r2+r3*4] lea r0, [r0+r1*4] + sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro INIT_MMX mmx2 @@ -761,6 +820,8 @@ INIT_XMM sse2 AVG2_W_ONE 8 AVG2_W_TWO 10, movd, movd AVG2_W_TWO 16, movu, mova +INIT_YMM avx2 +AVG2_W_ONE 16 INIT_MMX cglobal pixel_avg2_w10_mmx2, 6,7 @@ -785,11 +846,11 @@ cglobal pixel_avg2_w10_mmx2, 6,7 mova [r0+r1*2+ 0], m3 mova [r0+r1*2+ 8], m4 movh [r0+r1*2+16], m5 - sub r5d, 2 lea r2, [r2+r3*2*2] lea r0, [r0+r1*2*2] + sub r5d, 2 jg .height_loop - REP_RET + RET cglobal pixel_avg2_w16_mmx2, 6,7 sub r4, r2 @@ -819,11 +880,11 @@ cglobal pixel_avg2_w16_mmx2, 6,7 mova [r0+r1*2+ 8], m5 mova [r0+r1*2+16], m6 mova [r0+r1*2+24], m7 - sub r5d, 2 lea r2, [r2+r3*2*2] lea r0, [r0+r1*2*2] + sub r5d, 2 jg .height_loop - REP_RET + RET cglobal pixel_avg2_w18_mmx2, 6,7 sub r4, r2 @@ -843,39 +904,52 @@ cglobal pixel_avg2_w18_mmx2, 6,7 mova [r0+16], m2 mova [r0+24], m3 movh [r0+32], m4 - sub r5d, 1 lea r2, [r2+r3*2] lea r0, [r0+r1*2] + dec r5d jg .height_loop - REP_RET + RET -INIT_XMM -cglobal pixel_avg2_w18_sse2, 6,7,6 +%macro PIXEL_AVG_W18 0 +cglobal pixel_avg2_w18, 6,7 sub r4, r2 .height_loop: movu m0, [r2+ 0] + movd xm2, [r2+32] +%if mmsize == 32 + pavgw m0, [r2+r4+ 0] + movd xm1, [r2+r4+32] + pavgw xm2, xm1 +%else movu m1, [r2+16] - movh m2, [r2+32] movu m3, [r2+r4+ 0] movu m4, [r2+r4+16] - movh m5, [r2+r4+32] + movd m5, [r2+r4+32] pavgw m0, m3 pavgw m1, m4 pavgw m2, m5 - mova [r0+ 0], m0 mova [r0+16], m1 - movh [r0+32], m2 - sub r5d, 1 +%endif + mova [r0+ 0], m0 + movd [r0+32], xm2 lea r2, [r2+r3*2] lea r0, [r0+r1*2] + dec r5d jg .height_loop - REP_RET + RET +%endmacro + +INIT_XMM sse2 +PIXEL_AVG_W18 +INIT_YMM avx2 +PIXEL_AVG_W18 + %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void pixel_avg2_w4( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src_stride, +; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride, +; uint8_t *src1, intptr_t src_stride, ; uint8_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W8 2 @@ -893,7 +967,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro INIT_MMX @@ -921,7 +995,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro AVG2_W16 12, movd @@ -953,61 +1027,67 @@ cglobal pixel_avg2_w20_mmx2, 6,7 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET +INIT_XMM cglobal pixel_avg2_w16_sse2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: - movdqu 
xmm0, [r2] - movdqu xmm2, [r2+r3] - movdqu xmm1, [r2+r4] - movdqu xmm3, [r2+r6] + movu m0, [r2] + movu m2, [r2+r3] + movu m1, [r2+r4] + movu m3, [r2+r6] lea r2, [r2+r3*2] - pavgb xmm0, xmm1 - pavgb xmm2, xmm3 - movdqa [r0], xmm0 - movdqa [r0+r1], xmm2 + pavgb m0, m1 + pavgb m2, m3 + mova [r0], m0 + mova [r0+r1], m2 lea r0, [r0+r1*2] - sub r5d, 2 - jg .height_loop - REP_RET + sub r5d, 2 + jg .height_loop + RET -%macro AVG2_W20 1 -cglobal pixel_avg2_w20_%1, 6,7 +cglobal pixel_avg2_w20_sse2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: - movdqu xmm0, [r4] - movdqu xmm2, [r4+r3] -%ifidn %1, sse2_misalign - movd mm4, [r4+16] - movd mm5, [r4+r3+16] - pavgb xmm0, [r4+r2] - pavgb xmm2, [r4+r6] -%else - movdqu xmm1, [r4+r2] - movdqu xmm3, [r4+r6] - movd mm4, [r4+16] - movd mm5, [r4+r3+16] - pavgb xmm0, xmm1 - pavgb xmm2, xmm3 -%endif - pavgb mm4, [r4+r2+16] - pavgb mm5, [r4+r6+16] + movu m0, [r4] + movu m2, [r4+r3] + movu m1, [r4+r2] + movu m3, [r4+r6] + movd mm4, [r4+16] + movd mm5, [r4+r3+16] + pavgb m0, m1 + pavgb m2, m3 + pavgb mm4, [r4+r2+16] + pavgb mm5, [r4+r6+16] lea r4, [r4+r3*2] - movdqa [r0], xmm0 - movd [r0+16], mm4 - movdqa [r0+r1], xmm2 - movd [r0+r1+16], mm5 + mova [r0], m0 + mova [r0+r1], m2 + movd [r0+16], mm4 + movd [r0+r1+16], mm5 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + +INIT_YMM avx2 +cglobal pixel_avg2_w20, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movu m0, [r4] + movu m1, [r4+r3] + pavgb m0, [r4+r2] + pavgb m1, [r4+r6] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m1 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET -%endmacro - -AVG2_W20 sse2 -AVG2_W20 sse2_misalign + RET ; Cacheline split code for processors with high latencies for loads ; split over cache lines. See sad-a.asm for a more detailed explanation. 
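
As the comment says, these paths exist only for CPUs where a load that straddles a 64-byte cacheline is unusually expensive, so the *_cache* stubs pick a split-aware loop at run time. Conceptually the test they encode is just the following (an illustrative sketch; the real dispatch folds it into the and/cmp sequences of AVG_CACHELINE_CHECK and the w16 SSSE3 stub):

#include <stdbool.h>
#include <stdint.h>

#define CACHELINE 64  /* assumed line size, matching the code above */

static bool load_splits_cacheline(const uint8_t *p, int width)
{
    /* A width-byte unaligned load starting at p touches two cachelines
     * when its offset within the line leaves fewer than width bytes
     * before the boundary. */
    return ((uintptr_t)p & (CACHELINE - 1)) > (uintptr_t)(CACHELINE - width);
}
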
@@ -1066,7 +1146,7 @@ pixel_avg2_w%1_cache_mmx2: add r0, r1 dec r5d jg .height_loop - REP_RET + RET %endmacro %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set @@ -1087,7 +1167,7 @@ cglobal pixel_avg2_w%1_cache%2_%3 %endif %if 0 ; or %1==8 - but the extra branch seems too expensive ja cachesplit -%ifdef ARCH_X86_64 +%if ARCH_X86_64 test r4b, 1 %else test byte r4m, 1 @@ -1109,7 +1189,7 @@ cglobal pixel_avg2_w%1_cache%2_%3 INIT_MMX AVG_CACHELINE_CHECK 8, 64, mmx2 AVG_CACHELINE_CHECK 12, 64, mmx2 -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 AVG_CACHELINE_CHECK 16, 64, mmx2 AVG_CACHELINE_CHECK 20, 64, mmx2 AVG_CACHELINE_CHECK 8, 32, mmx2 @@ -1153,7 +1233,9 @@ avg_w16_align%1_%2_ssse3: jg avg_w16_align%1_%2_ssse3 ret %if %1==0 - times 13 db 0x90 ; make sure the first ones don't end up short + ; make sure the first ones don't end up short + ALIGN 16 + times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop %endif %endmacro @@ -1167,7 +1249,7 @@ cglobal pixel_avg2_w16_cache64_ssse3 and eax, 7 jz x264_pixel_avg2_w16_sse2 %endif - PROLOGUE 6, 7 + PROLOGUE 6, 8 lea r6, [r4+r2] and r4, ~0xf and r6, 0x1f @@ -1177,17 +1259,12 @@ cglobal pixel_avg2_w16_cache64_ssse3 shl r6, 4 ;jump = (offset + align*2)*48 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3) %ifdef PIC - lea r11, [avg_w16_addr] - add r6, r11 + lea r7, [avg_w16_addr] + add r6, r7 %else lea r6, [avg_w16_addr + r6] %endif -%ifdef UNIX64 - jmp r6 -%else - call r6 - RET -%endif + TAIL_CALL r6, 1 %assign j 0 %assign k 1 @@ -1219,18 +1296,18 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k movu m1, [r2+%4*mmsize] movu m2, [r2+r3+%3*mmsize] movu m3, [r2+r3+%4*mmsize] - movu m4, [r2+r3*2+%3*mmsize] - movu m5, [r2+r3*2+%4*mmsize] - movu m6, [r2+%2+%3*mmsize] - movu m7, [r2+%2+%4*mmsize] mova [r0+%3*mmsize], m0 mova [r0+%4*mmsize], m1 mova [r0+r1+%3*mmsize], m2 mova [r0+r1+%4*mmsize], m3 - mova [r0+r1*2+%3*mmsize], m4 - mova [r0+r1*2+%4*mmsize], m5 - mova [r0+%1+%3*mmsize], m6 - mova [r0+%1+%4*mmsize], m7 + movu m0, [r2+r3*2+%3*mmsize] + movu m1, [r2+r3*2+%4*mmsize] + movu m2, [r2+%2+%3*mmsize] + movu m3, [r2+%2+%4*mmsize] + mova [r0+r1*2+%3*mmsize], m0 + mova [r0+r1*2+%4*mmsize], m1 + mova [r0+%1+%3*mmsize], m2 + mova [r0+%1+%4*mmsize], m3 %endmacro %macro COPY4 2 @@ -1239,8 +1316,8 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k %endmacro ;----------------------------------------------------------------------------- -; void mc_copy_w4( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) +; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride, +; uint8_t *src, intptr_t i_src_stride, int i_height ) ;----------------------------------------------------------------------------- INIT_MMX cglobal mc_copy_w4_mmx, 4,6 @@ -1249,7 +1326,7 @@ cglobal mc_copy_w4_mmx, 4,6 lea r5, [r3*3] lea r4, [r1*3] je .end -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %define mova movd %define movu movd %endif @@ -1263,7 +1340,7 @@ cglobal mc_copy_w4_mmx, 4,6 %macro MC_COPY 1 %assign %%w %1*SIZEOF_PIXEL/mmsize %if %%w > 0 -cglobal mc_copy_w%1, 5,7,8*(%%w/2) +cglobal mc_copy_w%1, 5,7 FIX_STRIDES r1, r3 lea r6, [r3*3] lea r5, [r1*3] @@ -1273,37 +1350,44 @@ cglobal mc_copy_w%1, 5,7,8*(%%w/2) lea r0, [r0+r1*4] sub r4d, 4 jg .height_loop - REP_RET + RET %endif %endmacro INIT_MMX mmx MC_COPY 8 MC_COPY 16 -INIT_XMM sse2 +INIT_XMM sse MC_COPY 8 MC_COPY 16 -INIT_XMM aligned, sse2 +INIT_XMM aligned, sse MC_COPY 16 - - +%if HIGH_BIT_DEPTH +INIT_YMM avx +MC_COPY 16 +INIT_YMM aligned, avx +MC_COPY 16 +%endif 
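
The avg_w16_align jump-table bodies patched above avoid split loads altogether: each misaligned source row is fetched as two aligned movdqa and recombined with palignr before the pavgb. Roughly, for one row whose source sits 3 bytes past a 16-byte boundary, and loading the second source with an ordinary unaligned load for brevity (the real entries apply the same trick to both sources via the jump table), this is:

#include <tmmintrin.h>  /* SSSE3 intrinsics, including _mm_alignr_epi8 */
#include <stdint.h>

static void avg_row_align3(uint8_t *dst, const uint8_t *src_mis, const uint8_t *src_al)
{
    const __m128i *base = (const __m128i *)((uintptr_t)src_mis & ~(uintptr_t)15);
    __m128i lo  = _mm_load_si128(base);      /* aligned load covering bytes 0..15  */
    __m128i hi  = _mm_load_si128(base + 1);  /* aligned load covering bytes 16..31 */
    __m128i val = _mm_alignr_epi8(hi, lo, 3);                /* == movdqu from src_mis */
    __m128i avg = _mm_avg_epu8(val, _mm_loadu_si128((const __m128i *)src_al));
    _mm_storeu_si128((__m128i *)dst, avg);
}
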
;============================================================================= ; prefetch ;============================================================================= -; FIXME assumes 64 byte cachelines +; assumes 64 byte cachelines +; FIXME doesn't cover all pixels in high depth and/or 4:4:4 ;----------------------------------------------------------------------------- -; void prefetch_fenc( uint8_t *pix_y, int stride_y, -; uint8_t *pix_uv, int stride_uv, int mb_x ) +; void prefetch_fenc( pixel *pix_y, intptr_t stride_y, +; pixel *pix_uv, intptr_t stride_uv, int mb_x ) ;----------------------------------------------------------------------------- -INIT_MMX -%ifdef ARCH_X86_64 -cglobal prefetch_fenc_mmx2, 5,5 + +%macro PREFETCH_FENC 1 +%if ARCH_X86_64 +cglobal prefetch_fenc_%1, 5,5 + FIX_STRIDES r1, r3 and r4d, 3 mov eax, r4d imul r4d, r1d - lea r0, [r0+r4*4+64] + lea r0, [r0+r4*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] @@ -1311,19 +1395,25 @@ cglobal prefetch_fenc_mmx2, 5,5 prefetcht0 [r0+r1] imul eax, r3d - lea r2, [r2+rax*2+64] + lea r2, [r2+rax*2+64*SIZEOF_PIXEL] + prefetcht0 [r2] + prefetcht0 [r2+r3] +%ifidn %1, 422 + lea r2, [r2+r3*2] prefetcht0 [r2] prefetcht0 [r2+r3] +%endif RET %else -cglobal prefetch_fenc_mmx2, 0,3 +cglobal prefetch_fenc_%1, 0,3 mov r2, r4m mov r1, r1m mov r0, r0m + FIX_STRIDES r1 and r2, 3 imul r2, r1 - lea r0, [r0+r2*4+64] + lea r0, [r0+r2*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] @@ -1333,21 +1423,34 @@ cglobal prefetch_fenc_mmx2, 0,3 mov r2, r4m mov r1, r3m mov r0, r2m + FIX_STRIDES r1 and r2, 3 imul r2, r1 - lea r0, [r0+r2*2+64] + lea r0, [r0+r2*2+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] +%ifidn %1, 422 + lea r0, [r0+r1*2] + prefetcht0 [r0] + prefetcht0 [r0+r1] +%endif ret %endif ; ARCH_X86_64 +%endmacro + +INIT_MMX mmx2 +PREFETCH_FENC 420 +PREFETCH_FENC 422 ;----------------------------------------------------------------------------- -; void prefetch_ref( uint8_t *pix, int stride, int parity ) +; void prefetch_ref( pixel *pix, intptr_t stride, int parity ) ;----------------------------------------------------------------------------- -cglobal prefetch_ref_mmx2, 3,3 +INIT_MMX mmx2 +cglobal prefetch_ref, 3,3 + FIX_STRIDES r1 dec r2d and r2d, r1d - lea r0, [r0+r2*8+64] + lea r0, [r0+r2*8+64*SIZEOF_PIXEL] lea r2, [r1*3] prefetcht0 [r0] prefetcht0 [r0+r1] @@ -1366,18 +1469,23 @@ cglobal prefetch_ref_mmx2, 3,3 ; chroma MC ;============================================================================= -%ifdef ARCH_X86_64 - DECLARE_REG_TMP 10,11,6 +%if ARCH_X86_64 + DECLARE_REG_TMP 6,7,8 %else DECLARE_REG_TMP 0,1,2 %endif -%macro MC_CHROMA_START 0 +%macro MC_CHROMA_START 1 +%if ARCH_X86_64 + PROLOGUE 0,9,%1 +%else + PROLOGUE 0,6,%1 +%endif movifnidn r3, r3mp movifnidn r4d, r4m movifnidn r5d, r5m - movifnidn t2d, r6m - mov t0d, t2d + movifnidn t0d, r6m + mov t2d, t0d mov t1d, r5d sar t0d, 3 sar t1d, 3 @@ -1388,7 +1496,7 @@ cglobal prefetch_ref_mmx2, 3,3 add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro UNPACK_UNALIGNED 4 movu %1, [%4+0] movu %2, [%4+4] @@ -1399,13 +1507,13 @@ cglobal prefetch_ref_mmx2, 3,3 punpcklwd %1, %3 punpckhwd %2, %3 %else - shufps %2, %1, %3, 11011101b - shufps %1, %3, 10001000b + shufps %2, %1, %3, q3131 + shufps %1, %3, q2020 %endif %endmacro %else ; !HIGH_BIT_DEPTH %macro UNPACK_UNALIGNED 3 -%if mmsize == 8 || cpuflag(misalign) +%if mmsize == 8 punpcklwd %1, %3 %else movh %2, %3 @@ -1415,21 +1523,21 @@ 
cglobal prefetch_ref_mmx2, 3,3 %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride, -; uint8_t *src, int src_stride, +; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride, +; uint8_t *src, intptr_t src_stride, ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- %macro MC_CHROMA 0 -cglobal mc_chroma, 0,6 - MC_CHROMA_START +cglobal mc_chroma + MC_CHROMA_START 0 FIX_STRIDES r4 and r5d, 7 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 jz .mc1dy %endif and t2d, 7 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 jz .mc1dx %endif shl r5d, 16 @@ -1453,21 +1561,21 @@ cglobal mc_chroma, 0,6 pxor m6, m6 punpcklbw m5, m6 %if mmsize==8 - pshufw m7, m5, 0xee - pshufw m6, m5, 0x00 - pshufw m5, m5, 0x55 + pshufw m7, m5, q3232 + pshufw m6, m5, q0000 + pshufw m5, m5, q1111 jge .width4 %else -%ifdef WIN64 +%if WIN64 cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM %endif - pshufd m7, m5, 0x55 + pshufd m7, m5, q1111 punpcklwd m5, m5 - pshufd m6, m5, 0x00 - pshufd m5, m5, 0x55 + pshufd m6, m5, q0000 + pshufd m5, m5, q1111 jg .width8 %endif -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH add r2, r2 UNPACK_UNALIGNED m0, m1, m2, r3 %else @@ -1483,7 +1591,7 @@ cglobal mc_chroma, 0,6 SWAP 3, 0 ALIGN 4 .loop2: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH UNPACK_UNALIGNED m0, m1, m2, r3+r4 pmullw m3, m6 %else ; !HIGH_BIT_DEPTH @@ -1503,7 +1611,7 @@ ALIGN 4 pmullw m0, m5 paddw m0, m2 psrlw m0, 6 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movh [r0], m0 %if mmsize == 8 psrlq m0, 32 @@ -1526,15 +1634,19 @@ ALIGN 4 add r1, r2 dec r5d jg .loop2 - REP_RET + RET %if mmsize==8 .width4: -%ifdef ARCH_X86_64 +%if ARCH_X86_64 mov t0, r0 mov t1, r1 mov t2, r3 +%if WIN64 + %define multy0 r4m +%else %define multy0 [rsp-8] +%endif mova multy0, m5 %else mov r3m, r3 @@ -1543,7 +1655,7 @@ ALIGN 4 %endif %else .width8: -%ifdef ARCH_X86_64 +%if ARCH_X86_64 %define multy0 m8 SWAP 8, 5 %else @@ -1553,7 +1665,7 @@ ALIGN 4 %endif FIX_STRIDES r2 .loopx: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH UNPACK_UNALIGNED m0, m2, m4, r3 UNPACK_UNALIGNED m1, m3, m5, r3+mmsize %else @@ -1577,7 +1689,7 @@ ALIGN 4 add r3, r4 ALIGN 4 .loop4: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH UNPACK_UNALIGNED m0, m1, m2, r3 pmaddwd m0, m7 pmaddwd m1, m7 @@ -1615,7 +1727,7 @@ ALIGN 4 paddw m1, m3 psrlw m0, 6 psrlw m1, 6 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movh [r0], m0 movh [r0+mmsize/2], m1 %if mmsize==8 @@ -1630,12 +1742,12 @@ ALIGN 4 %else ; !HIGH_BIT_DEPTH packuswb m0, m1 %if mmsize==8 - pshufw m1, m0, 0x8 - pshufw m0, m0, 0xd + pshufw m1, m0, q0020 + pshufw m0, m0, q0031 movd [r0], m1 movd [r1], m0 %else - pshufd m0, m0, 0xd8 + pshufd m0, m0, q3120 movq [r0], m0 movhps [r1], m0 %endif @@ -1646,13 +1758,13 @@ ALIGN 4 dec r5d jg .loop4 %if mmsize!=8 - REP_RET + RET %else sub dword r7m, 4 jg .width8 - REP_RET + RET .width8: -%ifdef ARCH_X86_64 +%if ARCH_X86_64 lea r3, [t2+8*SIZEOF_PIXEL] lea r0, [t0+4*SIZEOF_PIXEL] lea r1, [t1+4*SIZEOF_PIXEL] @@ -1668,13 +1780,12 @@ ALIGN 4 jmp .loopx %endif -%ifdef ARCH_X86_64 ; too many regs for x86_32 +%if ARCH_X86_64 ; too many regs for x86_32 RESET_MM_PERMUTATION -%ifdef WIN64 -%if xmm_regs_used > 6 - %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16 - %assign xmm_regs_used 6 -%endif +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + %assign xmm_regs_used 0 %endif .mc1dy: and t2d, 7 @@ 
-1685,10 +1796,8 @@ ALIGN 4 movd m5, r5d mov r6d, 2*SIZEOF_PIXEL .mc1d: -%ifdef HIGH_BIT_DEPTH -%if mmsize == 16 +%if HIGH_BIT_DEPTH && mmsize == 16 WIN64_SPILL_XMM 8 -%endif %endif mova m4, [pw_8] SPLATW m5, m5 @@ -1700,13 +1809,13 @@ ALIGN 4 movifnidn r5d, r8m cmp dword r7m, 4 jg .mc1d_w8 - mov r10, r2 - mov r11, r4 + mov r7, r2 + mov r8, r4 %if mmsize!=8 shr r5d, 1 %endif .loop1d_w4: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if mmsize == 8 movq m0, [r3+0] movq m2, [r3+8] @@ -1715,7 +1824,7 @@ ALIGN 4 %else movu m0, [r3] movu m1, [r3+r6] - add r3, r11 + add r3, r8 movu m2, [r3] movu m3, [r3+r6] %endif @@ -1731,7 +1840,7 @@ ALIGN 4 movq m0, [r3] movq m1, [r3+r6] %if mmsize!=8 - add r3, r11 + add r3, r8 movhps m0, [r3] movhps m1, [r3+r6] %endif @@ -1750,24 +1859,24 @@ ALIGN 4 paddw m2, m3 psrlw m0, 3 psrlw m2, 3 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if mmsize == 8 - xchg r4, r11 - xchg r2, r10 + xchg r4, r8 + xchg r2, r7 %endif movq [r0], m0 movq [r1], m2 %if mmsize == 16 - add r0, r10 - add r1, r10 + add r0, r7 + add r1, r7 movhps [r0], m0 movhps [r1], m2 %endif %else ; !HIGH_BIT_DEPTH packuswb m0, m2 %if mmsize==8 - xchg r4, r11 - xchg r2, r10 + xchg r4, r8 + xchg r2, r7 movd [r0], m0 psrlq m0, 32 movd [r1], m0 @@ -1775,8 +1884,8 @@ ALIGN 4 movhlps m1, m0 movd [r0], m0 movd [r1], m1 - add r0, r10 - add r1, r10 + add r0, r7 + add r1, r7 psrldq m0, 4 psrldq m1, 4 movd [r0], m0 @@ -1788,12 +1897,12 @@ ALIGN 4 add r1, r2 dec r5d jg .loop1d_w4 - REP_RET + RET .mc1d_w8: sub r2, 4*SIZEOF_PIXEL sub r4, 8*SIZEOF_PIXEL - mov r10, 4*SIZEOF_PIXEL - mov r11, 8*SIZEOF_PIXEL + mov r7, 4*SIZEOF_PIXEL + mov r8, 8*SIZEOF_PIXEL %if mmsize==8 shl r5d, 1 %endif @@ -1801,10 +1910,13 @@ ALIGN 4 %endif ; ARCH_X86_64 %endmacro ; MC_CHROMA - %macro MC_CHROMA_SSSE3 0 -cglobal mc_chroma, 0,6,9 - MC_CHROMA_START +cglobal mc_chroma +%if cpuflag(avx2) + MC_CHROMA_START 9 +%else + MC_CHROMA_START 10 +%endif and r5d, 7 and t2d, 7 mov t0d, r5d @@ -1815,18 +1927,18 @@ cglobal mc_chroma, 0,6,9 sub r5d, t2d imul t2d, t0d ; (x*255+8)*y imul r5d, t0d ; (x*255+8)*(8-y) - movd m6, t2d - movd m7, r5d + movd xm6, t2d + movd xm7, r5d %if cpuflag(cache64) mov t0d, r3d and t0d, 7 %ifdef PIC lea t1, [ch_shuf_adj] - movddup m5, [t1 + t0*4] + movddup xm5, [t1 + t0*4] %else - movddup m5, [ch_shuf_adj + t0*4] + movddup xm5, [ch_shuf_adj + t0*4] %endif - paddb m5, [ch_shuf] + paddb xm5, [ch_shuf] and r3, ~7 %else mova m5, [ch_shuf] @@ -1835,12 +1947,80 @@ cglobal mc_chroma, 0,6,9 movifnidn r1, r1mp movifnidn r2d, r2m movifnidn r5d, r8m +%if cpuflag(avx2) + vpbroadcastw m6, xm6 + vpbroadcastw m7, xm7 +%else SPLATW m6, m6 SPLATW m7, m7 +%endif +%if ARCH_X86_64 + %define shiftround m8 + mova m8, [pw_512] +%else + %define shiftround [pw_512] +%endif cmp dword r7m, 4 jg .width8 - movu m0, [r3] + +%if cpuflag(avx2) +.loop4: + movu xm0, [r3] + movu xm1, [r3+r4] + vinserti128 m0, m0, [r3+r4], 1 + vinserti128 m1, m1, [r3+r4*2], 1 pshufb m0, m5 + pshufb m1, m5 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + paddw m0, m1 + pmulhrsw m0, shiftround + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [r0], xm0 + movd [r0+r2], xm1 + psrldq xm0, 4 + psrldq xm1, 4 + movd [r1], xm0 + movd [r1+r2], xm1 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loop4 + RET +.width8: + movu xm0, [r3] + vinserti128 m0, m0, [r3+8], 1 + pshufb m0, m5 +.loop8: + movu xm3, [r3+r4] + vinserti128 m3, m3, [r3+r4+8], 1 + pshufb m3, m5 + pmaddubsw m1, m0, m7 + pmaddubsw m2, m3, m6 + pmaddubsw m3, m3, m7 + + movu xm0, [r3+r4*2] + vinserti128 m0, m0, 
[r3+r4*2+8], 1 + pshufb m0, m5 + pmaddubsw m4, m0, m6 + + paddw m1, m2 + paddw m3, m4 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround + packuswb m1, m3 + mova m2, [deinterleave_shufd] + vpermd m1, m2, m1 + vextracti128 xm2, m1, 1 + movq [r0], xm1 + movhps [r1], xm1 + movq [r0+r2], xm2 + movhps [r1+r2], xm2 +%else + movu m0, [r3] + pshufb m0, xm5 .loop4: movu m1, [r3+r4] pshufb m1, m5 @@ -1851,16 +2031,14 @@ cglobal mc_chroma, 0,6,9 pmaddubsw m2, m1, m7 pmaddubsw m1, m6 pmaddubsw m3, m6 - paddw m0, [pw_32] - paddw m2, [pw_32] paddw m1, m0 paddw m3, m2 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 packuswb m1, m3 movhlps m3, m1 - movd [r0], m1 + movd [r0], xm1 movd [r0+r2], m3 psrldq m1, 4 psrldq m3, 4 @@ -1871,16 +2049,15 @@ cglobal mc_chroma, 0,6,9 lea r1, [r1+r2*2] sub r5d, 2 jg .loop4 - REP_RET - + RET .width8: movu m0, [r3] pshufb m0, m5 movu m1, [r3+8] pshufb m1, m5 -%ifdef ARCH_X86_64 - SWAP 8, 6 - %define mult1 m8 +%if ARCH_X86_64 + SWAP 9, 6 + %define mult1 m9 %else mova r0m, m6 %define mult1 r0m @@ -1896,14 +2073,12 @@ cglobal mc_chroma, 0,6,9 pmaddubsw m1, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m0, [pw_32] - paddw m1, [pw_32] paddw m0, m2 paddw m1, m3 - psrlw m0, 6 - psrlw m1, 6 + pmulhrsw m0, shiftround ; x + 32 >> 6 + pmulhrsw m1, shiftround packuswb m0, m1 - pshufd m0, m0, 0xd8 + pshufd m0, m0, q3120 movq [r0], m0 movhps [r1], m0 @@ -1917,25 +2092,24 @@ cglobal mc_chroma, 0,6,9 pmaddubsw m6, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m4, [pw_32] - paddw m6, [pw_32] paddw m2, m4 paddw m3, m6 - psrlw m2, 6 - psrlw m3, 6 + pmulhrsw m2, shiftround + pmulhrsw m3, shiftround packuswb m2, m3 - pshufd m2, m2, 0xd8 + pshufd m2, m2, q3120 movq [r0+r2], m2 movhps [r1+r2], m2 +%endif lea r3, [r3+r4*2] lea r0, [r0+r2*2] lea r1, [r1+r2*2] sub r5d, 2 jg .loop8 - REP_RET + RET %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_MMX mmx2 MC_CHROMA INIT_XMM sse2 @@ -1945,8 +2119,6 @@ MC_CHROMA %else ; !HIGH_BIT_DEPTH INIT_MMX mmx2 MC_CHROMA -INIT_XMM sse2, misalign -MC_CHROMA INIT_XMM sse2 MC_CHROMA INIT_XMM ssse3 @@ -1955,4 +2127,6 @@ INIT_XMM ssse3, cache64 MC_CHROMA_SSSE3 INIT_XMM avx MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64 +INIT_YMM avx2 +MC_CHROMA_SSSE3 %endif ; HIGH_BIT_DEPTH
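
For reference, every mc_chroma variant above implements the same 1/8-pel bilinear chroma filter; the "(x*255+8)" trick only packs (8-x) and x into adjacent bytes of one word (x*255+8 == (8-x) | x<<8), so multiplying that word by y and by 8-y yields the four bilinear weights pre-paired for pmaddubsw. A minimal C sketch, assuming the interleaved U/V (NV12-style) source layout the deinterleaving shuffles imply, with dx/dy already reduced modulo 8 and src already advanced to the integer position (the helper name is illustrative):

#include <stdint.h>

static void mc_chroma_ref(uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
                          const uint8_t *src, intptr_t src_stride,
                          int dx, int dy, int width, int height)
{
    int cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy;
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const uint8_t *s = src + 2*x;  /* U and V interleaved */
            dstu[x] = (uint8_t)((cA*s[0] + cB*s[2] + cC*s[src_stride]   + cD*s[src_stride+2] + 32) >> 6);
            dstv[x] = (uint8_t)((cA*s[1] + cB*s[3] + cC*s[src_stride+1] + cD*s[src_stride+3] + 32) >> 6);
        }
        dstu += dst_stride;
        dstv += dst_stride;
        src  += src_stride;
    }
}
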