X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fmc-a.asm;h=608efcd97b40d70c9223ab23719dcd95baf5de07;hb=64f4e24909924fceeea6e154d71b7dfbf586c7ea;hp=2d226e084076348841bc0674efd1d2318a4defa1;hpb=213a99d070ebd4f9aeffe7cb3ed9bd7fe755ec7f;p=x264 diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 2d226e08..608efcd9 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -1,13 +1,15 @@ ;***************************************************************************** ;* mc-a.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2003-2010 x264 project +;* Copyright (C) 2003-2016 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Laurent Aimar ;* Dylan Yudaken +;* Holger Lubitz ;* Min Chen +;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -28,36 +30,45 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" SECTION_RODATA 32 -ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 ch_shuf_adj: times 8 db 0 times 8 db 2 times 8 db 4 times 8 db 6 +sq_1: times 1 dq 1 SECTION .text +cextern pb_0 +cextern pw_1 cextern pw_4 cextern pw_8 cextern pw_32 cextern pw_64 +cextern pw_512 cextern pw_00ff +cextern pw_pixel_max cextern sw_64 +cextern pd_32 +cextern deinterleave_shufd ;============================================================================= ; implicit weighted biprediction ;============================================================================= ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 -%ifdef ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,10,11 +%if WIN64 + DECLARE_REG_TMP 0,1,2,3,4,5,4,5 %macro AVG_START 0-1 0 PROLOGUE 6,7,%1 -%ifdef WIN64 - movsxd r5, r5d -%endif - .height_loop: + %endmacro +%elif UNIX64 + DECLARE_REG_TMP 0,1,2,3,4,5,7,8 + %macro AVG_START 0-1 0 + PROLOGUE 6,9,%1 %endmacro %else DECLARE_REG_TMP 1,2,3,4,5,6,1,2 @@ -69,19 +80,42 @@ cextern sw_64 mov t3, r3m mov t4, r4m mov t5, r5m - .height_loop: %endmacro %endif -%macro SPLATW 2-3 0 -%if mmsize==16 - pshuflw %1, %2, %3*0x55 - punpcklqdq %1, %1 -%else - pshufw %1, %2, %3*0x55 -%endif +%macro AVG_END 0 + lea t4, [t4+t5*2*SIZEOF_PIXEL] + lea t2, [t2+t3*2*SIZEOF_PIXEL] + lea t0, [t0+t1*2*SIZEOF_PIXEL] + sub eax, 2 + jg .height_loop + RET %endmacro +%if HIGH_BIT_DEPTH + +%macro BIWEIGHT_MMX 2 + movh m0, %1 + movh m1, %2 + punpcklwd m0, m1 + pmaddwd m0, m3 + paddd m0, m4 + psrad m0, 6 +%endmacro + +%macro BIWEIGHT_START_MMX 0 + movzx t6d, word r6m + mov t7d, 64 + sub t7d, t6d + shl t7d, 16 + add t6d, t7d + movd m3, t6d + SPLATD m3, m3 + mova m4, [pd_32] + pxor m5, m5 +%endmacro + +%else ;!HIGH_BIT_DEPTH %macro BIWEIGHT_MMX 2 movh m0, %1 movh m1, %2 @@ -102,14 +136,14 @@ cextern sw_64 mova m4, [pw_32] ; rounding pxor m5, m5 %endmacro +%endif ;HIGH_BIT_DEPTH %macro BIWEIGHT_SSSE3 2 movh m0, %1 movh m1, %2 punpcklbw m0, m1 pmaddubsw m0, m3 - paddw m0, m4 - psraw m0, 6 + pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 @@ -118,11 +152,32 @@ cextern sw_64 sub t7d, t6d shl t7d, 8 add t6d, t7d - movd m3, t6d - mova m4, [pw_32] + mova m4, [pw_512] + movd xm3, t6d +%if cpuflag(avx2) + vpbroadcastw m3, xm3 +%else SPLATW m3, m3 ; weight_dst,src +%endif %endmacro +%if HIGH_BIT_DEPTH +%macro BIWEIGHT_ROW 4 + BIWEIGHT [%2], [%3] +%if %4==mmsize/4 + packssdw m0, m0 + CLIPW m0, m5, 
m7 + movh [%1], m0 +%else + SWAP 0, 6 + BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] + packssdw m6, m0 + CLIPW m6, m5, m7 + mova [%1], m6 +%endif +%endmacro + +%else ;!HIGH_BIT_DEPTH %macro BIWEIGHT_ROW 4 BIWEIGHT [%2], [%3] %if %4==mmsize/2 @@ -136,164 +191,281 @@ cextern sw_64 %endif %endmacro +%endif ;HIGH_BIT_DEPTH + ;----------------------------------------------------------------------------- -; int pixel_avg_weight_w16( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight ) +; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight ) ;----------------------------------------------------------------------------- -%macro AVG_WEIGHT 2-3 0 -cglobal pixel_avg_weight_w%2_%1 +%macro AVG_WEIGHT 1-2 0 +cglobal pixel_avg_weight_w%1 BIWEIGHT_START - AVG_START %3 -%if %2==8 && mmsize==16 + AVG_START %2 +%if HIGH_BIT_DEPTH + mova m7, [pw_pixel_max] +%endif +.height_loop: +%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL) BIWEIGHT [t2], [t4] SWAP 0, 6 - BIWEIGHT [t2+t3], [t4+t5] + BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5] +%if HIGH_BIT_DEPTH + packssdw m6, m0 + CLIPW m6, m5, m7 +%else ;!HIGH_BIT_DEPTH packuswb m6, m0 +%endif ;HIGH_BIT_DEPTH movlps [t0], m6 - movhps [t0+t1], m6 + movhps [t0+SIZEOF_PIXEL*t1], m6 %else %assign x 0 -%rep 1+%2/(mmsize*2) - BIWEIGHT_ROW t0+x, t2+x, t4+x, %2 - BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2 +%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize + BIWEIGHT_ROW t0+x, t2+x, t4+x, %1 + BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1 %assign x x+mmsize %endrep %endif - lea t0, [t0+t1*2] - lea t2, [t2+t3*2] - lea t4, [t4+t5*2] - sub eax, 2 - jg .height_loop - REP_RET + AVG_END %endmacro %define BIWEIGHT BIWEIGHT_MMX %define BIWEIGHT_START BIWEIGHT_START_MMX -INIT_MMX -AVG_WEIGHT mmxext, 4 -AVG_WEIGHT mmxext, 8 -AVG_WEIGHT mmxext, 16 -INIT_XMM -%define pixel_avg_weight_w4_sse2 pixel_avg_weight_w4_mmxext -AVG_WEIGHT sse2, 8, 7 -AVG_WEIGHT sse2, 16, 7 +INIT_MMX mmx2 +AVG_WEIGHT 4 +AVG_WEIGHT 8 +AVG_WEIGHT 16 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +AVG_WEIGHT 4, 8 +AVG_WEIGHT 8, 8 +AVG_WEIGHT 16, 8 +%else ;!HIGH_BIT_DEPTH +INIT_XMM sse2 +AVG_WEIGHT 8, 7 +AVG_WEIGHT 16, 7 %define BIWEIGHT BIWEIGHT_SSSE3 %define BIWEIGHT_START BIWEIGHT_START_SSSE3 -INIT_MMX -AVG_WEIGHT ssse3, 4 -INIT_XMM -AVG_WEIGHT ssse3, 8, 7 -AVG_WEIGHT ssse3, 16, 7 +INIT_MMX ssse3 +AVG_WEIGHT 4 +INIT_XMM ssse3 +AVG_WEIGHT 8, 7 +AVG_WEIGHT 16, 7 + +INIT_YMM avx2 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 m0, m0, [t2+t3], 1 + vinserti128 m1, m1, [t4+t5], 1 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], m0, 1 + AVG_END +%endif ;HIGH_BIT_DEPTH ;============================================================================= ; P frame explicit weighted prediction ;============================================================================= +%if HIGH_BIT_DEPTH +; width %macro WEIGHT_START 1 - mova m3, [r4] - mova m6, [r4+16] - movd m5, [r4+32] - pxor m2, m2 -%if (%1 == 20 || %1 == 12) && mmsize == 16 - movdq2q mm3, xmm3 - movdq2q mm4, xmm4 - movdq2q mm5, xmm5 - movdq2q mm6, xmm6 - pxor mm2, mm2 + mova m0, [r4+ 0] ; 1<= mmsize - WEIGHT_ROW (%1+x), (%2+x), mmsize ; weight 1 mmsize - WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight 1 mmsize + WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 %assign x (x+mmsize) %else - WEIGHT_COL 
(%1+x),(%2+x),(%3-x) - %exitrep + %assign w %3-x +%if w == 20 + %assign w 16 +%endif + WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4 + %assign x (x+w) %endif %if x >= %3 %exitrep @@ -301,57 +473,81 @@ AVG_WEIGHT ssse3, 16, 7 %endrep %endmacro +%endif ; HIGH_BIT_DEPTH + ;----------------------------------------------------------------------------- -;void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h ) +;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h ) ;----------------------------------------------------------------------------- -%ifdef ARCH_X86_64 -%define NUMREGS 6 -%define LOAD_HEIGHT -%define HEIGHT_REG r5d -%else -%define NUMREGS 5 -%define LOAD_HEIGHT mov r4d, r5m -%define HEIGHT_REG r4d -%endif - -%macro WEIGHTER 2 - cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, 7 +%macro WEIGHTER 1 +cglobal mc_weight_w%1, 6,6,8 + FIX_STRIDES r1, r3 WEIGHT_START %1 - LOAD_HEIGHT +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 + ; we can merge the shift step into the scale factor + ; if (m3<<7) doesn't overflow an int16_t + cmp byte [r4+1], 0 + jz .fast +%endif .loop: - WEIGHT_TWO_ROW r2, r0, %1 + WEIGHT_TWO_ROW r2, r0, %1, 0 lea r0, [r0+r1*2] lea r2, [r2+r3*2] - sub HEIGHT_REG, 2 + sub r5d, 2 jg .loop - REP_RET + RET +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 +.fast: + psllw m3, 7 +.fastloop: + WEIGHT_TWO_ROW r2, r0, %1, 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .fastloop + RET +%endif %endmacro -INIT_MMX -WEIGHTER 4, mmxext -WEIGHTER 8, mmxext -WEIGHTER 12, mmxext -WEIGHTER 16, mmxext -WEIGHTER 20, mmxext -INIT_XMM -WEIGHTER 8, sse2 -WEIGHTER 16, sse2 -WEIGHTER 20, sse2 -%define WEIGHT WEIGHT_SSSE3 -%define WEIGHT_START WEIGHT_START_SSSE3 -INIT_MMX -WEIGHTER 4, ssse3 -INIT_XMM -WEIGHTER 8, ssse3 -WEIGHTER 16, ssse3 -WEIGHTER 20, ssse3 +INIT_MMX mmx2 +WEIGHTER 4 +WEIGHTER 8 +WEIGHTER 12 +WEIGHTER 16 +WEIGHTER 20 +INIT_XMM sse2 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +%if HIGH_BIT_DEPTH +WEIGHTER 12 +%else +INIT_MMX ssse3 +WEIGHTER 4 +INIT_XMM ssse3 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +INIT_YMM avx2 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +%endif %macro OFFSET_OP 7 mov%6 m0, [%1] mov%6 m1, [%2] +%if HIGH_BIT_DEPTH + p%5usw m0, m2 + p%5usw m1, m2 +%ifidn %5,add + pminsw m0, m3 + pminsw m1, m3 +%endif +%else p%5usb m0, m2 p%5usb m1, m2 +%endif mov%7 [%3], m0 mov%7 [%4], m1 %endmacro @@ -359,53 +555,62 @@ WEIGHTER 20, ssse3 %macro OFFSET_TWO_ROW 4 %assign x 0 %rep %3 -%if (%3-x) >= mmsize +%if (%3*SIZEOF_PIXEL-x) >= mmsize OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a %assign x (x+mmsize) %else - OFFSET_OP (%1+x),(%1+x+r3), (%2+x), (%2+x+r1), %4, d, d +%if HIGH_BIT_DEPTH + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h +%else + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d +%endif %exitrep %endif -%if x >= %3 +%if x >= %3*SIZEOF_PIXEL %exitrep %endif %endrep %endmacro ;----------------------------------------------------------------------------- -;void mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, weight_t *w, int h ) +;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h ) ;----------------------------------------------------------------------------- -%macro OFFSET 3 - cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS +%macro OFFSET 2 +cglobal mc_offset%2_w%1, 6,6 + FIX_STRIDES r1, r3 mova m2, [r4] - LOAD_HEIGHT +%if HIGH_BIT_DEPTH +%ifidn %2,add + mova m3, 
[pw_pixel_max] +%endif +%endif .loop: - OFFSET_TWO_ROW r2, r0, %1, %3 + OFFSET_TWO_ROW r2, r0, %1, %2 lea r0, [r0+r1*2] lea r2, [r2+r3*2] - sub HEIGHT_REG, 2 + sub r5d, 2 jg .loop - REP_RET + RET %endmacro -%macro OFFSETPN 2 - OFFSET %1, %2, add - OFFSET %1, %2, sub +%macro OFFSETPN 1 + OFFSET %1, add + OFFSET %1, sub %endmacro -INIT_MMX -OFFSETPN 4, mmxext -OFFSETPN 8, mmxext -OFFSETPN 12, mmxext -OFFSETPN 16, mmxext -OFFSETPN 20, mmxext -INIT_XMM -OFFSETPN 12, sse2 -OFFSETPN 16, sse2 -OFFSETPN 20, sse2 -%undef LOAD_HEIGHT -%undef HEIGHT_REG -%undef NUMREGS - +INIT_MMX mmx2 +OFFSETPN 4 +OFFSETPN 8 +OFFSETPN 12 +OFFSETPN 16 +OFFSETPN 20 +INIT_XMM sse2 +OFFSETPN 12 +OFFSETPN 16 +OFFSETPN 20 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +OFFSETPN 8 +%endif ;============================================================================= @@ -413,108 +618,342 @@ OFFSETPN 20, sse2 ;============================================================================= ;----------------------------------------------------------------------------- -; void pixel_avg_4x4( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight ); +; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int weight ); ;----------------------------------------------------------------------------- -%macro AVGH 3 -cglobal pixel_avg_%1x%2_%3 +%macro AVGH 2 +cglobal pixel_avg_%1x%2 mov eax, %2 cmp dword r6m, 32 - jne pixel_avg_weight_w%1_%3 + jne pixel_avg_weight_w%1 %+ SUFFIX +%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads + jmp pixel_avg_w%1_avx2 +%else %if mmsize == 16 && %1 == 16 test dword r4m, 15 jz pixel_avg_w%1_sse2 %endif - jmp pixel_avg_w%1_mmxext + jmp pixel_avg_w%1_mmx2 +%endif %endmacro ;----------------------------------------------------------------------------- -; void pixel_avg_w4( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, -; int height, int weight ); +; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int height, int weight ); ;----------------------------------------------------------------------------- -%macro AVG_END 0 - sub eax, 2 - lea t4, [t4+t5*2] - lea t2, [t2+t3*2] - lea t0, [t0+t1*2] - jg .height_loop - REP_RET -%endmacro - %macro AVG_FUNC 3 -cglobal %1 +cglobal pixel_avg_w%1 AVG_START - %2 m0, [t2] - %2 m1, [t2+t3] - pavgb m0, [t4] - pavgb m1, [t4+t5] - %3 [t0], m0 - %3 [t0+t1], m1 +.height_loop: +%assign x 0 +%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize + %2 m0, [t2+x] + %2 m1, [t2+x+SIZEOF_PIXEL*t3] +%if HIGH_BIT_DEPTH + pavgw m0, [t4+x] + pavgw m1, [t4+x+SIZEOF_PIXEL*t5] +%else ;!HIGH_BIT_DEPTH + pavgb m0, [t4+x] + pavgb m1, [t4+x+SIZEOF_PIXEL*t5] +%endif + %3 [t0+x], m0 + %3 [t0+x+SIZEOF_PIXEL*t1], m1 +%assign x x+mmsize +%endrep AVG_END %endmacro -INIT_MMX -AVG_FUNC pixel_avg_w4_mmxext, movd, movd -AVGH 4, 8, mmxext -AVGH 4, 4, mmxext -AVGH 4, 2, mmxext +%if HIGH_BIT_DEPTH + +INIT_MMX mmx2 +AVG_FUNC 4, movq, movq +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movq, movq +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 16, movq, movq +AVGH 16, 16 +AVGH 16, 8 + +INIT_XMM sse2 +AVG_FUNC 4, movq, movq +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movdqu, movdqa +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 + +%else ;!HIGH_BIT_DEPTH + +INIT_MMX mmx2 +AVG_FUNC 4, movd, 
movd +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movq, movq +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 16, movq, movq +AVGH 16, 16 +AVGH 16, 8 + +INIT_XMM sse2 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 +INIT_XMM ssse3 +AVGH 16, 16 +AVGH 16, 8 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 +INIT_MMX ssse3 +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 +INIT_XMM avx2 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 + +%endif ;HIGH_BIT_DEPTH -AVG_FUNC pixel_avg_w8_mmxext, movq, movq -AVGH 8, 16, mmxext -AVGH 8, 8, mmxext -AVGH 8, 4, mmxext -cglobal pixel_avg_w16_mmxext - AVG_START - movq mm0, [t2 ] - movq mm1, [t2+8] - movq mm2, [t2+t3 ] - movq mm3, [t2+t3+8] - pavgb mm0, [t4 ] - pavgb mm1, [t4+8] - pavgb mm2, [t4+t5 ] - pavgb mm3, [t4+t5+8] - movq [t0 ], mm0 - movq [t0+8], mm1 - movq [t0+t1 ], mm2 - movq [t0+t1+8], mm3 - AVG_END -AVGH 16, 16, mmxext -AVGH 16, 8, mmxext +;============================================================================= +; pixel avg2 +;============================================================================= + +%if HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride, +; uint16_t *src1, intptr_t src_stride, +; uint16_t *src2, int height ); +;----------------------------------------------------------------------------- +%macro AVG2_W_ONE 1 +cglobal pixel_avg2_w%1, 6,7,4 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2] + movu m1, [r2+r3*2] +%if cpuflag(avx) || mmsize == 8 + pavgw m0, [r2+r4] + pavgw m1, [r2+r6] +%else + movu m2, [r2+r4] + movu m3, [r2+r6] + pavgw m0, m2 + pavgw m1, m3 +%endif + mova [r0], m0 + mova [r0+r1*2], m1 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] + sub r5d, 2 + jg .height_loop + RET +%endmacro + +%macro AVG2_W_TWO 3 +cglobal pixel_avg2_w%1, 6,7,8 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2] + %2 m1, [r2+mmsize] + movu m2, [r2+r3*2] + %2 m3, [r2+r3*2+mmsize] +%if mmsize == 8 + pavgw m0, [r2+r4] + pavgw m1, [r2+r4+mmsize] + pavgw m2, [r2+r6] + pavgw m3, [r2+r6+mmsize] +%else + movu m4, [r2+r4] + %2 m5, [r2+r4+mmsize] + movu m6, [r2+r6] + %2 m7, [r2+r6+mmsize] + pavgw m0, m4 + pavgw m1, m5 + pavgw m2, m6 + pavgw m3, m7 +%endif + mova [r0], m0 + %3 [r0+mmsize], m1 + mova [r0+r1*2], m2 + %3 [r0+r1*2+mmsize], m3 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] + sub r5d, 2 + jg .height_loop + RET +%endmacro + +INIT_MMX mmx2 +AVG2_W_ONE 4 +AVG2_W_TWO 8, movu, mova +INIT_XMM sse2 +AVG2_W_ONE 8 +AVG2_W_TWO 10, movd, movd +AVG2_W_TWO 16, movu, mova +INIT_YMM avx2 +AVG2_W_ONE 16 -INIT_XMM -AVG_FUNC pixel_avg_w16_sse2, movdqu, movdqa -AVGH 16, 16, sse2 -AVGH 16, 8, sse2 -AVGH 8, 16, sse2 -AVGH 8, 8, sse2 -AVGH 8, 4, sse2 -AVGH 16, 16, ssse3 -AVGH 16, 8, ssse3 -AVGH 8, 16, ssse3 -AVGH 8, 8, ssse3 -AVGH 8, 4, ssse3 INIT_MMX -AVGH 4, 8, ssse3 -AVGH 4, 4, ssse3 -AVGH 4, 2, ssse3 +cglobal pixel_avg2_w10_mmx2, 6,7 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2+ 0] + movu m1, [r2+ 8] + movh m2, [r2+16] + movu m3, [r2+r3*2+ 0] + movu m4, [r2+r3*2+ 8] + movh m5, [r2+r3*2+16] + pavgw m0, [r2+r4+ 0] + pavgw m1, [r2+r4+ 8] + pavgw m2, [r2+r4+16] + pavgw m3, [r2+r6+ 0] + pavgw m4, [r2+r6+ 8] + pavgw m5, [r2+r6+16] + mova [r0+ 0], m0 + mova [r0+ 8], m1 + movh [r0+16], m2 + mova [r0+r1*2+ 0], m3 + mova [r0+r1*2+ 8], m4 + movh [r0+r1*2+16], m5 + lea r2, [r2+r3*2*2] + lea r0, [r0+r1*2*2] + sub r5d, 2 + jg .height_loop + RET +cglobal pixel_avg2_w16_mmx2, 6,7 + sub r4, r2 + lea 
r6, [r4+r3*2] +.height_loop: + movu m0, [r2+ 0] + movu m1, [r2+ 8] + movu m2, [r2+16] + movu m3, [r2+24] + movu m4, [r2+r3*2+ 0] + movu m5, [r2+r3*2+ 8] + movu m6, [r2+r3*2+16] + movu m7, [r2+r3*2+24] + pavgw m0, [r2+r4+ 0] + pavgw m1, [r2+r4+ 8] + pavgw m2, [r2+r4+16] + pavgw m3, [r2+r4+24] + pavgw m4, [r2+r6+ 0] + pavgw m5, [r2+r6+ 8] + pavgw m6, [r2+r6+16] + pavgw m7, [r2+r6+24] + mova [r0+ 0], m0 + mova [r0+ 8], m1 + mova [r0+16], m2 + mova [r0+24], m3 + mova [r0+r1*2+ 0], m4 + mova [r0+r1*2+ 8], m5 + mova [r0+r1*2+16], m6 + mova [r0+r1*2+24], m7 + lea r2, [r2+r3*2*2] + lea r0, [r0+r1*2*2] + sub r5d, 2 + jg .height_loop + RET +cglobal pixel_avg2_w18_mmx2, 6,7 + sub r4, r2 +.height_loop: + movu m0, [r2+ 0] + movu m1, [r2+ 8] + movu m2, [r2+16] + movu m3, [r2+24] + movh m4, [r2+32] + pavgw m0, [r2+r4+ 0] + pavgw m1, [r2+r4+ 8] + pavgw m2, [r2+r4+16] + pavgw m3, [r2+r4+24] + pavgw m4, [r2+r4+32] + mova [r0+ 0], m0 + mova [r0+ 8], m1 + mova [r0+16], m2 + mova [r0+24], m3 + movh [r0+32], m4 + lea r2, [r2+r3*2] + lea r0, [r0+r1*2] + dec r5d + jg .height_loop + RET -;============================================================================= -; pixel avg2 -;============================================================================= +%macro PIXEL_AVG_W18 0 +cglobal pixel_avg2_w18, 6,7 + sub r4, r2 +.height_loop: + movu m0, [r2+ 0] + movd xm2, [r2+32] +%if mmsize == 32 + pavgw m0, [r2+r4+ 0] + movd xm1, [r2+r4+32] + pavgw xm2, xm1 +%else + movu m1, [r2+16] + movu m3, [r2+r4+ 0] + movu m4, [r2+r4+16] + movd m5, [r2+r4+32] + pavgw m0, m3 + pavgw m1, m4 + pavgw m2, m5 + mova [r0+16], m1 +%endif + mova [r0+ 0], m0 + movd [r0+32], xm2 + lea r2, [r2+r3*2] + lea r0, [r0+r1*2] + dec r5d + jg .height_loop + RET +%endmacro + +INIT_XMM sse2 +PIXEL_AVG_W18 +INIT_YMM avx2 +PIXEL_AVG_W18 + +%endif ; HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void pixel_avg2_w4( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src_stride, +; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride, +; uint8_t *src1, intptr_t src_stride, ; uint8_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W8 2 -cglobal pixel_avg2_w%1_mmxext, 6,7 +cglobal pixel_avg2_w%1_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: @@ -522,124 +961,133 @@ cglobal pixel_avg2_w%1_mmxext, 6,7 %2 mm1, [r2+r3] pavgb mm0, [r2+r4] pavgb mm1, [r2+r6] + lea r2, [r2+r3*2] %2 [r0], mm0 %2 [r0+r1], mm1 - sub r5d, 2 - lea r2, [r2+r3*2] lea r0, [r0+r1*2] + sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro +INIT_MMX AVG2_W8 4, movd AVG2_W8 8, movq %macro AVG2_W16 2 -cglobal pixel_avg2_w%1_mmxext, 6,7 - sub r4, r2 - lea r6, [r4+r3] +cglobal pixel_avg2_w%1_mmx2, 6,7 + sub r2, r4 + lea r6, [r2+r3] .height_loop: - movq mm0, [r2] - %2 mm1, [r2+8] - movq mm2, [r2+r3] - %2 mm3, [r2+r3+8] - pavgb mm0, [r2+r4] - pavgb mm1, [r2+r4+8] - pavgb mm2, [r2+r6] - pavgb mm3, [r2+r6+8] + movq mm0, [r4] + %2 mm1, [r4+8] + movq mm2, [r4+r3] + %2 mm3, [r4+r3+8] + pavgb mm0, [r4+r2] + pavgb mm1, [r4+r2+8] + pavgb mm2, [r4+r6] + pavgb mm3, [r4+r6+8] + lea r4, [r4+r3*2] movq [r0], mm0 %2 [r0+8], mm1 movq [r0+r1], mm2 %2 [r0+r1+8], mm3 - lea r2, [r2+r3*2] lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro AVG2_W16 12, movd AVG2_W16 16, movq -cglobal pixel_avg2_w20_mmxext, 6,7 - sub r4, r2 - lea r6, [r4+r3] +cglobal pixel_avg2_w20_mmx2, 6,7 + sub r2, r4 + lea r6, [r2+r3] .height_loop: - movq mm0, [r2] - movq mm1, [r2+8] - 
movd mm2, [r2+16] - movq mm3, [r2+r3] - movq mm4, [r2+r3+8] - movd mm5, [r2+r3+16] - pavgb mm0, [r2+r4] - pavgb mm1, [r2+r4+8] - pavgb mm2, [r2+r4+16] - pavgb mm3, [r2+r6] - pavgb mm4, [r2+r6+8] - pavgb mm5, [r2+r6+16] + movq mm0, [r4] + movq mm1, [r4+8] + movd mm2, [r4+16] + movq mm3, [r4+r3] + movq mm4, [r4+r3+8] + movd mm5, [r4+r3+16] + pavgb mm0, [r4+r2] + pavgb mm1, [r4+r2+8] + pavgb mm2, [r4+r2+16] + pavgb mm3, [r4+r6] + pavgb mm4, [r4+r6+8] + pavgb mm5, [r4+r6+16] + lea r4, [r4+r3*2] movq [r0], mm0 movq [r0+8], mm1 movd [r0+16], mm2 movq [r0+r1], mm3 movq [r0+r1+8], mm4 movd [r0+r1+16], mm5 - lea r2, [r2+r3*2] lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET +INIT_XMM cglobal pixel_avg2_w16_sse2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: - movdqu xmm0, [r2] - movdqu xmm2, [r2+r3] - movdqu xmm1, [r2+r4] - movdqu xmm3, [r2+r6] - pavgb xmm0, xmm1 - pavgb xmm2, xmm3 - movdqa [r0], xmm0 - movdqa [r0+r1], xmm2 + movu m0, [r2] + movu m2, [r2+r3] + movu m1, [r2+r4] + movu m3, [r2+r6] lea r2, [r2+r3*2] + pavgb m0, m1 + pavgb m2, m3 + mova [r0], m0 + mova [r0+r1], m2 lea r0, [r0+r1*2] - sub r5d, 2 - jg .height_loop - REP_RET + sub r5d, 2 + jg .height_loop + RET -%macro AVG2_W20 1 -cglobal pixel_avg2_w20_%1, 6,7 - sub r4, r2 - lea r6, [r4+r3] +cglobal pixel_avg2_w20_sse2, 6,7 + sub r2, r4 + lea r6, [r2+r3] .height_loop: - movdqu xmm0, [r2] - movdqu xmm2, [r2+r3] - movd mm4, [r2+16] - movd mm5, [r2+r3+16] -%ifidn %1, sse2_misalign - pavgb xmm0, [r2+r4] - pavgb xmm2, [r2+r6] -%else - movdqu xmm1, [r2+r4] - movdqu xmm3, [r2+r6] - pavgb xmm0, xmm1 - pavgb xmm2, xmm3 -%endif - pavgb mm4, [r2+r4+16] - pavgb mm5, [r2+r6+16] - movdqa [r0], xmm0 - movd [r0+16], mm4 - movdqa [r0+r1], xmm2 - movd [r0+r1+16], mm5 - lea r2, [r2+r3*2] + movu m0, [r4] + movu m2, [r4+r3] + movu m1, [r4+r2] + movu m3, [r4+r6] + movd mm4, [r4+16] + movd mm5, [r4+r3+16] + pavgb m0, m1 + pavgb m2, m3 + pavgb mm4, [r4+r2+16] + pavgb mm5, [r4+r6+16] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m2 + movd [r0+16], mm4 + movd [r0+r1+16], mm5 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + +INIT_YMM avx2 +cglobal pixel_avg2_w20, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movu m0, [r4] + movu m1, [r4+r3] + pavgb m0, [r4+r2] + pavgb m1, [r4+r6] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m1 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET -%endmacro - -AVG2_W20 sse2 -AVG2_W20 sse2_misalign + RET ; Cacheline split code for processors with high latencies for loads ; split over cache lines. See sad-a.asm for a more detailed explanation. 
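
The hunks below rework the cacheline-split averaging: when a source row
straddles a 64-byte cache line, each 8-byte group is rebuilt from two aligned
loads that are shifted and OR'd together (the psrlq/psllq/por sequence in
AVG_CACHELINE_LOOP, with the shift amounts prepared by INIT_SHIFT) before the
pavgb. A minimal scalar C sketch of that idea, assuming little-endian byte
order; it is an illustration only, and the names below are not taken from x264:

#include <stdint.h>
#include <string.h>

/* Model of one AVG_CACHELINE_LOOP step: rebuild 8 unaligned source bytes
 * from two aligned 8-byte loads, then byte-average against the other
 * reference row (pavgb rounds upward, i.e. (a+b+1)>>1). */
static uint64_t load8_aligned( const uint8_t *p )
{
    uint64_t v;
    memcpy( &v, p, 8 );          /* p is assumed 8-byte aligned */
    return v;
}

static uint64_t avg8_cacheline_split( const uint8_t *src, uint64_t other_row )
{
    unsigned misalign = (uintptr_t)src & 7;
    const uint8_t *base = src - misalign;
    uint64_t row = load8_aligned( base );
    if( misalign )               /* nonzero whenever this code path is taken */
        row = (row >> 8*misalign) | (load8_aligned( base+8 ) << (64 - 8*misalign));
    uint64_t avg = 0;
    for( int i = 0; i < 8; i++ )
    {
        unsigned a = (row       >> 8*i) & 0xff;
        unsigned b = (other_row >> 8*i) & 0xff;
        avg |= (uint64_t)((a + b + 1) >> 1) << 8*i;
    }
    return avg;
}
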
@@ -657,20 +1105,6 @@ AVG2_W20 sse2_misalign psubw %1, %2 %endmacro -%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set -cglobal pixel_avg2_w%1_cache%2_%3 - mov eax, r2m - and eax, 0x1f|(%2>>1) - cmp eax, (32-%1)|(%2>>1) - jle pixel_avg2_w%1_%3 -;w12 isn't needed because w16 is just as fast if there's no cacheline split -%if %1 == 12 - jmp pixel_avg2_w16_cache_mmxext -%else - jmp pixel_avg2_w%1_cache_mmxext -%endif -%endmacro - %macro AVG_CACHELINE_START 0 %assign stack_offset 0 INIT_SHIFT mm6, mm7 @@ -684,61 +1118,86 @@ cglobal pixel_avg2_w%1_cache%2_%3 %endmacro %macro AVG_CACHELINE_LOOP 2 - movq mm0, [r2+8+%1] movq mm1, [r2+%1] - movq mm2, [r2+r4+8+%1] + movq mm0, [r2+8+%1] movq mm3, [r2+r4+%1] - psllq mm0, mm6 + movq mm2, [r2+r4+8+%1] psrlq mm1, mm7 - psllq mm2, mm4 + psllq mm0, mm6 psrlq mm3, mm5 + psllq mm2, mm4 por mm0, mm1 por mm2, mm3 - pavgb mm0, mm2 - %2 [r0+%1], mm0 + pavgb mm2, mm0 + %2 [r0+%1], mm2 %endmacro -pixel_avg2_w8_cache_mmxext: - AVG_CACHELINE_START - AVG_CACHELINE_LOOP 0, movq - add r2, r3 - add r0, r1 - dec r5d - jg .height_loop - REP_RET - -pixel_avg2_w16_cache_mmxext: - AVG_CACHELINE_START - AVG_CACHELINE_LOOP 0, movq - AVG_CACHELINE_LOOP 8, movq - add r2, r3 - add r0, r1 - dec r5d - jg .height_loop - REP_RET - -pixel_avg2_w20_cache_mmxext: +%macro AVG_CACHELINE_FUNC 2 +pixel_avg2_w%1_cache_mmx2: AVG_CACHELINE_START AVG_CACHELINE_LOOP 0, movq +%if %1>8 AVG_CACHELINE_LOOP 8, movq +%if %1>16 AVG_CACHELINE_LOOP 16, movd +%endif +%endif add r2, r3 add r0, r1 dec r5d jg .height_loop - REP_RET + RET +%endmacro -%ifndef ARCH_X86_64 -AVG_CACHELINE_CHECK 8, 32, mmxext -AVG_CACHELINE_CHECK 12, 32, mmxext -AVG_CACHELINE_CHECK 16, 32, mmxext -AVG_CACHELINE_CHECK 20, 32, mmxext -AVG_CACHELINE_CHECK 16, 64, mmxext -AVG_CACHELINE_CHECK 20, 64, mmxext +%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set +%if %1 == 12 +;w12 isn't needed because w16 is just as fast if there's no cacheline split +%define cachesplit pixel_avg2_w16_cache_mmx2 +%else +%define cachesplit pixel_avg2_w%1_cache_mmx2 %endif +cglobal pixel_avg2_w%1_cache%2_%3 + mov eax, r2m + and eax, %2-1 + cmp eax, (%2-%1-(%1 % 8)) +%if %1==12||%1==20 + jbe pixel_avg2_w%1_%3 +%else + jb pixel_avg2_w%1_%3 +%endif +%if 0 ; or %1==8 - but the extra branch seems too expensive + ja cachesplit +%if ARCH_X86_64 + test r4b, 1 +%else + test byte r4m, 1 +%endif + jz pixel_avg2_w%1_%3 +%else + or eax, r4m + and eax, 7 + jz pixel_avg2_w%1_%3 + mov eax, r2m +%endif +%if mmsize==16 || (%1==8 && %2==64) + AVG_CACHELINE_FUNC %1, %2 +%else + jmp cachesplit +%endif +%endmacro -AVG_CACHELINE_CHECK 8, 64, mmxext -AVG_CACHELINE_CHECK 12, 64, mmxext +INIT_MMX +AVG_CACHELINE_CHECK 8, 64, mmx2 +AVG_CACHELINE_CHECK 12, 64, mmx2 +%if ARCH_X86_64 == 0 +AVG_CACHELINE_CHECK 16, 64, mmx2 +AVG_CACHELINE_CHECK 20, 64, mmx2 +AVG_CACHELINE_CHECK 8, 32, mmx2 +AVG_CACHELINE_CHECK 12, 32, mmx2 +AVG_CACHELINE_CHECK 16, 32, mmx2 +AVG_CACHELINE_CHECK 20, 32, mmx2 +%endif +INIT_XMM AVG_CACHELINE_CHECK 16, 64, sse2 AVG_CACHELINE_CHECK 20, 64, sse2 @@ -746,31 +1205,51 @@ AVG_CACHELINE_CHECK 20, 64, sse2 %macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment ALIGN 16 avg_w16_align%1_%2_ssse3: -%if %2&15==0 +%if %1==0 && %2==0 + movdqa xmm1, [r2] + pavgb xmm1, [r2+r4] + add r2, r3 +%elif %1==0 + movdqa xmm1, [r2+r4+16] + palignr xmm1, [r2+r4], %2 + pavgb xmm1, [r2] + add r2, r3 +%elif %2&15==0 movdqa xmm1, [r2+16] palignr xmm1, [r2], %1 pavgb xmm1, [r2+r4] + add r2, r3 %else movdqa xmm1, [r2+16] movdqa xmm2, [r2+r4+16] palignr xmm1, [r2], %1 - 
palignr xmm2, [r2+r4], %2 + palignr xmm2, [r2+r4], %2&15 + add r2, r3 pavgb xmm1, xmm2 %endif movdqa [r0], xmm1 - add r2, r3 add r0, r1 dec r5d jg avg_w16_align%1_%2_ssse3 - rep ret + ret +%if %1==0 + ; make sure the first ones don't end up short + ALIGN 16 + times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop +%endif %endmacro cglobal pixel_avg2_w16_cache64_ssse3 - mov eax, r2m - and eax, 0x3f - cmp eax, 0x30 - jle pixel_avg2_w16_sse2 - PROLOGUE 6,7 +%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized + mov eax, r2m + and eax, 0x3f + cmp eax, 0x30 + jb x264_pixel_avg2_w16_sse2 + or eax, r4m + and eax, 7 + jz x264_pixel_avg2_w16_sse2 +%endif + PROLOGUE 6, 8 lea r6, [r4+r2] and r4, ~0xf and r6, 0x1f @@ -780,133 +1259,135 @@ cglobal pixel_avg2_w16_cache64_ssse3 shl r6, 4 ;jump = (offset + align*2)*48 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3) %ifdef PIC - lea r11, [avg_w16_addr] - add r6, r11 + lea r7, [avg_w16_addr] + add r6, r7 %else lea r6, [avg_w16_addr + r6] %endif -%ifdef UNIX64 - jmp r6 -%else - call r6 - RET -%endif + TAIL_CALL r6, 1 -%assign j 1 -%assign k 2 -%rep 15 +%assign j 0 +%assign k 1 +%rep 16 AVG16_CACHELINE_LOOP_SSSE3 j, j AVG16_CACHELINE_LOOP_SSSE3 j, k %assign j j+1 %assign k k+1 %endrep +%endif ; !HIGH_BIT_DEPTH ;============================================================================= ; pixel copy ;============================================================================= -%macro COPY4 4 - %2 m0, [r2] - %2 m1, [r2+r3] - %2 m2, [r2+r3*2] - %2 m3, [r2+%4] - %1 [r0], m0 - %1 [r0+r1], m1 - %1 [r0+r1*2], m2 - %1 [r0+%3], m3 +%macro COPY1 2 + movu m0, [r2] + movu m1, [r2+r3] + movu m2, [r2+r3*2] + movu m3, [r2+%2] + mova [r0], m0 + mova [r0+r1], m1 + mova [r0+r1*2], m2 + mova [r0+%1], m3 +%endmacro + +%macro COPY2 2-4 0, 1 + movu m0, [r2+%3*mmsize] + movu m1, [r2+%4*mmsize] + movu m2, [r2+r3+%3*mmsize] + movu m3, [r2+r3+%4*mmsize] + mova [r0+%3*mmsize], m0 + mova [r0+%4*mmsize], m1 + mova [r0+r1+%3*mmsize], m2 + mova [r0+r1+%4*mmsize], m3 + movu m0, [r2+r3*2+%3*mmsize] + movu m1, [r2+r3*2+%4*mmsize] + movu m2, [r2+%2+%3*mmsize] + movu m3, [r2+%2+%4*mmsize] + mova [r0+r1*2+%3*mmsize], m0 + mova [r0+r1*2+%4*mmsize], m1 + mova [r0+%1+%3*mmsize], m2 + mova [r0+%1+%4*mmsize], m3 +%endmacro + +%macro COPY4 2 + COPY2 %1, %2, 0, 1 + COPY2 %1, %2, 2, 3 %endmacro -INIT_MMX ;----------------------------------------------------------------------------- -; void mc_copy_w4( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) +; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride, +; uint8_t *src, intptr_t i_src_stride, int i_height ) ;----------------------------------------------------------------------------- +INIT_MMX cglobal mc_copy_w4_mmx, 4,6 - cmp dword r4m, 4 + FIX_STRIDES r1, r3 + cmp dword r4m, 4 lea r5, [r3*3] lea r4, [r1*3] je .end - COPY4 movd, movd, r4, r5 +%if HIGH_BIT_DEPTH == 0 + %define mova movd + %define movu movd +%endif + COPY1 r4, r5 lea r2, [r2+r3*4] lea r0, [r0+r1*4] .end: - COPY4 movd, movd, r4, r5 + COPY1 r4, r5 RET -cglobal mc_copy_w8_mmx, 5,7 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - COPY4 movq, movq, r5, r6 - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET - -cglobal mc_copy_w16_mmx, 5,7 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - movq mm0, [r2] - movq mm1, [r2+8] - movq mm2, [r2+r3] - movq mm3, [r2+r3+8] - movq mm4, [r2+r3*2] - movq mm5, [r2+r3*2+8] - movq mm6, [r2+r6] - movq mm7, [r2+r6+8] - movq [r0], mm0 - movq 
[r0+8], mm1 - movq [r0+r1], mm2 - movq [r0+r1+8], mm3 - movq [r0+r1*2], mm4 - movq [r0+r1*2+8], mm5 - movq [r0+r5], mm6 - movq [r0+r5+8], mm7 - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET - -INIT_XMM -%macro COPY_W16_SSE2 2 -cglobal %1, 5,7 +%macro MC_COPY 1 +%assign %%w %1*SIZEOF_PIXEL/mmsize +%if %%w > 0 +cglobal mc_copy_w%1, 5,7 + FIX_STRIDES r1, r3 lea r6, [r3*3] lea r5, [r1*3] .height_loop: - COPY4 movdqa, %2, r5, r6 + COPY %+ %%w r5, r6 lea r2, [r2+r3*4] lea r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET + sub r4d, 4 + jg .height_loop + RET +%endif %endmacro -COPY_W16_SSE2 mc_copy_w16_sse2, movdqu -; cacheline split with mmx has too much overhead; the speed benefit is near-zero. -; but with SSE3 the overhead is zero, so there's no reason not to include it. -COPY_W16_SSE2 mc_copy_w16_sse3, lddqu -COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa - - +INIT_MMX mmx +MC_COPY 8 +MC_COPY 16 +INIT_XMM sse +MC_COPY 8 +MC_COPY 16 +INIT_XMM aligned, sse +MC_COPY 16 +%if HIGH_BIT_DEPTH +INIT_YMM avx +MC_COPY 16 +INIT_YMM aligned, avx +MC_COPY 16 +%endif ;============================================================================= ; prefetch ;============================================================================= -; FIXME assumes 64 byte cachelines +; assumes 64 byte cachelines +; FIXME doesn't cover all pixels in high depth and/or 4:4:4 ;----------------------------------------------------------------------------- -; void prefetch_fenc( uint8_t *pix_y, int stride_y, -; uint8_t *pix_uv, int stride_uv, int mb_x ) +; void prefetch_fenc( pixel *pix_y, intptr_t stride_y, +; pixel *pix_uv, intptr_t stride_uv, int mb_x ) ;----------------------------------------------------------------------------- -%ifdef ARCH_X86_64 -cglobal prefetch_fenc_mmxext, 5,5 + +%macro PREFETCH_FENC 1 +%if ARCH_X86_64 +cglobal prefetch_fenc_%1, 5,5 + FIX_STRIDES r1, r3 and r4d, 3 mov eax, r4d imul r4d, r1d - lea r0, [r0+r4*4+64] + lea r0, [r0+r4*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] @@ -914,19 +1395,25 @@ cglobal prefetch_fenc_mmxext, 5,5 prefetcht0 [r0+r1] imul eax, r3d - lea r2, [r2+rax*2+64] + lea r2, [r2+rax*2+64*SIZEOF_PIXEL] prefetcht0 [r2] prefetcht0 [r2+r3] +%ifidn %1, 422 + lea r2, [r2+r3*2] + prefetcht0 [r2] + prefetcht0 [r2+r3] +%endif RET %else -cglobal prefetch_fenc_mmxext, 0,3 +cglobal prefetch_fenc_%1, 0,3 mov r2, r4m mov r1, r1m mov r0, r0m + FIX_STRIDES r1 and r2, 3 imul r2, r1 - lea r0, [r0+r2*4+64] + lea r0, [r0+r2*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] @@ -936,21 +1423,34 @@ cglobal prefetch_fenc_mmxext, 0,3 mov r2, r4m mov r1, r3m mov r0, r2m + FIX_STRIDES r1 and r2, 3 imul r2, r1 - lea r0, [r0+r2*2+64] + lea r0, [r0+r2*2+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] +%ifidn %1, 422 + lea r0, [r0+r1*2] + prefetcht0 [r0] + prefetcht0 [r0+r1] +%endif ret %endif ; ARCH_X86_64 +%endmacro + +INIT_MMX mmx2 +PREFETCH_FENC 420 +PREFETCH_FENC 422 ;----------------------------------------------------------------------------- -; void prefetch_ref( uint8_t *pix, int stride, int parity ) +; void prefetch_ref( pixel *pix, intptr_t stride, int parity ) ;----------------------------------------------------------------------------- -cglobal prefetch_ref_mmxext, 3,3 +INIT_MMX mmx2 +cglobal prefetch_ref, 3,3 + FIX_STRIDES r1 dec r2d and r2d, r1d - lea r0, [r0+r2*8+64] + lea r0, [r0+r2*8+64*SIZEOF_PIXEL] lea r2, [r1*3] prefetcht0 [r0] prefetcht0 [r0+r1] @@ -969,51 +1469,75 @@ cglobal prefetch_ref_mmxext, 
3,3 ; chroma MC ;============================================================================= -%ifdef ARCH_X86_64 - DECLARE_REG_TMP 10,11,6 +%if ARCH_X86_64 + DECLARE_REG_TMP 6,7,8 %else DECLARE_REG_TMP 0,1,2 %endif -%macro MC_CHROMA_START 0 +%macro MC_CHROMA_START 1 +%if ARCH_X86_64 + PROLOGUE 0,9,%1 +%else + PROLOGUE 0,6,%1 +%endif movifnidn r3, r3mp movifnidn r4d, r4m movifnidn r5d, r5m - movifnidn t2d, r6m - mov t0d, t2d + movifnidn t0d, r6m + mov t2d, t0d mov t1d, r5d sar t0d, 3 sar t1d, 3 imul t0d, r4d lea t0d, [t0+t1*2] + FIX_STRIDES t0d movsxdifnidn t0, t0d add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride %endmacro -%macro UNPACK_UNALIGNED_MEM 3 +%if HIGH_BIT_DEPTH +%macro UNPACK_UNALIGNED 4 + movu %1, [%4+0] + movu %2, [%4+4] + punpckhwd %3, %1, %2 + punpcklwd %1, %2 +%if mmsize == 8 + mova %2, %1 punpcklwd %1, %3 + punpckhwd %2, %3 +%else + shufps %2, %1, %3, q3131 + shufps %1, %3, q2020 +%endif %endmacro - -%macro UNPACK_UNALIGNED_LOAD 3 +%else ; !HIGH_BIT_DEPTH +%macro UNPACK_UNALIGNED 3 +%if mmsize == 8 + punpcklwd %1, %3 +%else movh %2, %3 punpcklwd %1, %2 +%endif %endmacro +%endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride, -; uint8_t *src, int src_stride, +; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride, +; uint8_t *src, intptr_t src_stride, ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- -%macro MC_CHROMA 1 -cglobal mc_chroma_%1, 0,6 - MC_CHROMA_START +%macro MC_CHROMA 0 +cglobal mc_chroma + MC_CHROMA_START 0 + FIX_STRIDES r4 and r5d, 7 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 jz .mc1dy %endif and t2d, 7 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 jz .mc1dx %endif shl r5d, 16 @@ -1026,7 +1550,7 @@ cglobal mc_chroma_%1, 0,6 %if mmsize==8 .skip_prologue: %else - jl mc_chroma_mmxext %+ .skip_prologue + jl mc_chroma_mmx2 %+ .skip_prologue WIN64_SPILL_XMM 9 %endif movd m5, t2d @@ -1037,37 +1561,47 @@ cglobal mc_chroma_%1, 0,6 pxor m6, m6 punpcklbw m5, m6 %if mmsize==8 - pshufw m7, m5, 0xee - pshufw m6, m5, 0x00 - pshufw m5, m5, 0x55 + pshufw m7, m5, q3232 + pshufw m6, m5, q0000 + pshufw m5, m5, q1111 jge .width4 %else -%ifdef WIN64 +%if WIN64 cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM %endif - pshufd m7, m5, 0x55 + pshufd m7, m5, q1111 punpcklwd m5, m5 - pshufd m6, m5, 0x00 - pshufd m5, m5, 0x55 + pshufd m6, m5, q0000 + pshufd m5, m5, q1111 jg .width8 %endif +%if HIGH_BIT_DEPTH + add r2, r2 + UNPACK_UNALIGNED m0, m1, m2, r3 +%else movu m0, [r3] UNPACK_UNALIGNED m0, m1, [r3+2] mova m1, m0 pand m0, [pw_00ff] psrlw m1, 8 +%endif ; HIGH_BIT_DEPTH pmaddwd m0, m7 pmaddwd m1, m7 packssdw m0, m1 - SWAP m3, m0 + SWAP 3, 0 ALIGN 4 .loop2: +%if HIGH_BIT_DEPTH + UNPACK_UNALIGNED m0, m1, m2, r3+r4 + pmullw m3, m6 +%else ; !HIGH_BIT_DEPTH movu m0, [r3+r4] UNPACK_UNALIGNED m0, m1, [r3+r4+2] pmullw m3, m6 mova m1, m0 pand m0, [pw_00ff] psrlw m1, 8 +%endif ; HIGH_BIT_DEPTH pmaddwd m0, m7 pmaddwd m1, m7 mova m2, [pw_32] @@ -1077,6 +1611,15 @@ ALIGN 4 pmullw m0, m5 paddw m0, m2 psrlw m0, 6 +%if HIGH_BIT_DEPTH + movh [r0], m0 +%if mmsize == 8 + psrlq m0, 32 + movh [r1], m0 +%else + movhps [r1], m0 +%endif +%else ; !HIGH_BIT_DEPTH packuswb m0, m0 movd [r0], m0 %if mmsize==8 @@ -1085,20 +1628,25 @@ ALIGN 4 psrldq m0, 4 %endif movd [r1], m0 +%endif ; HIGH_BIT_DEPTH add r3, r4 add r0, r2 add r1, r2 dec r5d jg .loop2 - REP_RET + RET %if mmsize==8 .width4: -%ifdef ARCH_X86_64 +%if 
ARCH_X86_64 mov t0, r0 mov t1, r1 mov t2, r3 +%if WIN64 + %define multy0 r4m +%else %define multy0 [rsp-8] +%endif mova multy0, m5 %else mov r3m, r3 @@ -1107,58 +1655,70 @@ ALIGN 4 %endif %else .width8: -%ifdef ARCH_X86_64 +%if ARCH_X86_64 %define multy0 m8 - SWAP m8, m5 + SWAP 8, 5 %else %define multy0 r0m mova multy0, m5 %endif %endif + FIX_STRIDES r2 .loopx: +%if HIGH_BIT_DEPTH + UNPACK_UNALIGNED m0, m2, m4, r3 + UNPACK_UNALIGNED m1, m3, m5, r3+mmsize +%else movu m0, [r3] movu m1, [r3+mmsize/2] UNPACK_UNALIGNED m0, m2, [r3+2] UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] - mova m2, m0 - mova m3, m1 + psrlw m2, m0, 8 + psrlw m3, m1, 8 pand m0, [pw_00ff] pand m1, [pw_00ff] - psrlw m2, 8 - psrlw m3, 8 +%endif pmaddwd m0, m7 pmaddwd m2, m7 pmaddwd m1, m7 pmaddwd m3, m7 packssdw m0, m2 packssdw m1, m3 - SWAP m4, m0 - SWAP m5, m1 + SWAP 4, 0 + SWAP 5, 1 add r3, r4 ALIGN 4 .loop4: +%if HIGH_BIT_DEPTH + UNPACK_UNALIGNED m0, m1, m2, r3 + pmaddwd m0, m7 + pmaddwd m1, m7 + packssdw m0, m1 + UNPACK_UNALIGNED m1, m2, m3, r3+mmsize + pmaddwd m1, m7 + pmaddwd m2, m7 + packssdw m1, m2 +%else ; !HIGH_BIT_DEPTH movu m0, [r3] movu m1, [r3+mmsize/2] UNPACK_UNALIGNED m0, m2, [r3+2] UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] - mova m2, m0 - mova m3, m1 + psrlw m2, m0, 8 + psrlw m3, m1, 8 pand m0, [pw_00ff] pand m1, [pw_00ff] - psrlw m2, 8 - psrlw m3, 8 pmaddwd m0, m7 pmaddwd m2, m7 pmaddwd m1, m7 pmaddwd m3, m7 packssdw m0, m2 packssdw m1, m3 +%endif ; HIGH_BIT_DEPTH pmullw m4, m6 pmullw m5, m6 mova m2, [pw_32] - mova m3, m2 + paddw m3, m2, m5 paddw m2, m4 - paddw m3, m5 mova m4, m0 mova m5, m1 pmullw m0, multy0 @@ -1167,52 +1727,65 @@ ALIGN 4 paddw m1, m3 psrlw m0, 6 psrlw m1, 6 +%if HIGH_BIT_DEPTH + movh [r0], m0 + movh [r0+mmsize/2], m1 +%if mmsize==8 + psrlq m0, 32 + psrlq m1, 32 + movh [r1], m0 + movh [r1+mmsize/2], m1 +%else + movhps [r1], m0 + movhps [r1+mmsize/2], m1 +%endif +%else ; !HIGH_BIT_DEPTH packuswb m0, m1 %if mmsize==8 - pshufw m1, m0, 0x8 - pshufw m0, m0, 0xd + pshufw m1, m0, q0020 + pshufw m0, m0, q0031 movd [r0], m1 movd [r1], m0 %else - pshufd m0, m0, 0xd8 + pshufd m0, m0, q3120 movq [r0], m0 movhps [r1], m0 %endif +%endif ; HIGH_BIT_DEPTH add r3, r4 add r0, r2 add r1, r2 dec r5d jg .loop4 %if mmsize!=8 - REP_RET + RET %else sub dword r7m, 4 jg .width8 - REP_RET + RET .width8: -%ifdef ARCH_X86_64 - lea r3, [t2+8] - lea r0, [t0+4] - lea r1, [t1+4] +%if ARCH_X86_64 + lea r3, [t2+8*SIZEOF_PIXEL] + lea r0, [t0+4*SIZEOF_PIXEL] + lea r1, [t1+4*SIZEOF_PIXEL] %else mov r3, r3m mov r0, r0m mov r1, r1m - add r3, 8 - add r0, 4 - add r1, 4 + add r3, 8*SIZEOF_PIXEL + add r0, 4*SIZEOF_PIXEL + add r1, 4*SIZEOF_PIXEL %endif mov r5d, r8m jmp .loopx %endif -%ifdef ARCH_X86_64 ; too many regs for x86_32 +%if ARCH_X86_64 ; too many regs for x86_32 RESET_MM_PERMUTATION -%ifdef WIN64 -%if xmm_regs_used > 6 - %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16 - %assign xmm_regs_used 6 -%endif +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + %assign xmm_regs_used 0 %endif .mc1dy: and t2d, 7 @@ -1221,36 +1794,61 @@ ALIGN 4 jmp .mc1d .mc1dx: movd m5, r5d - mov r6d, 2 + mov r6d, 2*SIZEOF_PIXEL .mc1d: +%if HIGH_BIT_DEPTH && mmsize == 16 + WIN64_SPILL_XMM 8 +%endif mova m4, [pw_8] SPLATW m5, m5 psubw m4, m5 movifnidn r0, r0mp movifnidn r1, r1mp movifnidn r2d, r2m + FIX_STRIDES r2 movifnidn r5d, r8m cmp dword r7m, 4 jg .mc1d_w8 - mov r10, r2 - mov r11, r4 + mov r7, r2 + mov r8, r4 %if mmsize!=8 shr r5d, 1 %endif .loop1d_w4: +%if HIGH_BIT_DEPTH +%if mmsize == 8 + movq 
m0, [r3+0] + movq m2, [r3+8] + movq m1, [r3+r6+0] + movq m3, [r3+r6+8] +%else + movu m0, [r3] + movu m1, [r3+r6] + add r3, r8 + movu m2, [r3] + movu m3, [r3+r6] +%endif + SBUTTERFLY wd, 0, 2, 6 + SBUTTERFLY wd, 1, 3, 7 + SBUTTERFLY wd, 0, 2, 6 + SBUTTERFLY wd, 1, 3, 7 +%if mmsize == 16 + SBUTTERFLY wd, 0, 2, 6 + SBUTTERFLY wd, 1, 3, 7 +%endif +%else ; !HIGH_BIT_DEPTH movq m0, [r3] movq m1, [r3+r6] %if mmsize!=8 - add r3, r11 + add r3, r8 movhps m0, [r3] movhps m1, [r3+r6] %endif - mova m2, m0 - mova m3, m1 + psrlw m2, m0, 8 + psrlw m3, m1, 8 pand m0, [pw_00ff] pand m1, [pw_00ff] - psrlw m2, 8 - psrlw m3, 8 +%endif ; HIGH_BIT_DEPTH pmullw m0, m4 pmullw m1, m5 pmullw m2, m4 @@ -1261,10 +1859,24 @@ ALIGN 4 paddw m2, m3 psrlw m0, 3 psrlw m2, 3 +%if HIGH_BIT_DEPTH +%if mmsize == 8 + xchg r4, r8 + xchg r2, r7 +%endif + movq [r0], m0 + movq [r1], m2 +%if mmsize == 16 + add r0, r7 + add r1, r7 + movhps [r0], m0 + movhps [r1], m2 +%endif +%else ; !HIGH_BIT_DEPTH packuswb m0, m2 %if mmsize==8 - xchg r4, r11 - xchg r2, r10 + xchg r4, r8 + xchg r2, r7 movd [r0], m0 psrlq m0, 32 movd [r1], m0 @@ -1272,24 +1884,25 @@ ALIGN 4 movhlps m1, m0 movd [r0], m0 movd [r1], m1 - add r0, r10 - add r1, r10 + add r0, r7 + add r1, r7 psrldq m0, 4 psrldq m1, 4 movd [r0], m0 movd [r1], m1 %endif +%endif ; HIGH_BIT_DEPTH add r3, r4 add r0, r2 add r1, r2 dec r5d jg .loop1d_w4 - REP_RET + RET .mc1d_w8: - sub r2, 4 - sub r4, 8 - mov r10, 4 - mov r11, 8 + sub r2, 4*SIZEOF_PIXEL + sub r4, 8*SIZEOF_PIXEL + mov r7, 4*SIZEOF_PIXEL + mov r8, 8*SIZEOF_PIXEL %if mmsize==8 shl r5d, 1 %endif @@ -1297,11 +1910,9 @@ ALIGN 4 %endif ; ARCH_X86_64 %endmacro ; MC_CHROMA - -%macro MC_CHROMA_SSSE3 0-1 -INIT_XMM -cglobal mc_chroma_ssse3%1, 0,6,9 - MC_CHROMA_START +%macro MC_CHROMA_SSSE3 0 +cglobal mc_chroma + MC_CHROMA_START 10-cpuflag(avx2) and r5d, 7 and t2d, 7 mov t0d, r5d @@ -1312,18 +1923,18 @@ cglobal mc_chroma_ssse3%1, 0,6,9 sub r5d, t2d imul t2d, t0d ; (x*255+8)*y imul r5d, t0d ; (x*255+8)*(8-y) - movd m6, t2d - movd m7, r5d -%ifidn %1, _cache64 + movd xm6, t2d + movd xm7, r5d +%if cpuflag(cache64) mov t0d, r3d and t0d, 7 %ifdef PIC lea t1, [ch_shuf_adj] - movddup m5, [t1 + t0*4] + movddup xm5, [t1 + t0*4] %else - movddup m5, [ch_shuf_adj + t0*4] + movddup xm5, [ch_shuf_adj + t0*4] %endif - paddb m5, [ch_shuf] + paddb xm5, [ch_shuf] and r3, ~7 %else mova m5, [ch_shuf] @@ -1332,10 +1943,78 @@ cglobal mc_chroma_ssse3%1, 0,6,9 movifnidn r1, r1mp movifnidn r2d, r2m movifnidn r5d, r8m +%if cpuflag(avx2) + vpbroadcastw m6, xm6 + vpbroadcastw m7, xm7 +%else SPLATW m6, m6 SPLATW m7, m7 +%endif +%if ARCH_X86_64 + %define shiftround m8 + mova m8, [pw_512] +%else + %define shiftround [pw_512] +%endif cmp dword r7m, 4 jg .width8 + +%if cpuflag(avx2) +.loop4: + movu xm0, [r3] + movu xm1, [r3+r4] + vinserti128 m0, m0, [r3+r4], 1 + vinserti128 m1, m1, [r3+r4*2], 1 + pshufb m0, m5 + pshufb m1, m5 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + paddw m0, m1 + pmulhrsw m0, shiftround + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [r0], xm0 + movd [r0+r2], xm1 + psrldq xm0, 4 + psrldq xm1, 4 + movd [r1], xm0 + movd [r1+r2], xm1 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loop4 + RET +.width8: + movu xm0, [r3] + vinserti128 m0, m0, [r3+8], 1 + pshufb m0, m5 +.loop8: + movu xm3, [r3+r4] + vinserti128 m3, m3, [r3+r4+8], 1 + pshufb m3, m5 + pmaddubsw m1, m0, m7 + pmaddubsw m2, m3, m6 + pmaddubsw m3, m3, m7 + + movu xm0, [r3+r4*2] + vinserti128 m0, m0, [r3+r4*2+8], 1 + pshufb m0, m5 + pmaddubsw m4, m0, m6 + + paddw m1, m2 + 
paddw m3, m4 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround + packuswb m1, m3 + mova m2, [deinterleave_shufd] + vpermd m1, m2, m1 + vextracti128 xm2, m1, 1 + movq [r0], xm1 + movhps [r1], xm1 + movq [r0+r2], xm2 + movhps [r1+r2], xm2 +%else movu m0, [r3] pshufb m0, m5 .loop4: @@ -1343,42 +2022,44 @@ cglobal mc_chroma_ssse3%1, 0,6,9 pshufb m1, m5 movu m3, [r3+r4*2] pshufb m3, m5 - mova m2, m1 mova m4, m3 pmaddubsw m0, m7 + pmaddubsw m2, m1, m7 pmaddubsw m1, m6 - pmaddubsw m2, m7 pmaddubsw m3, m6 - paddw m0, [pw_32] - paddw m2, [pw_32] paddw m1, m0 paddw m3, m2 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 packuswb m1, m3 - movhlps m3, m1 movd [r0], m1 +%if cpuflag(sse4) + pextrd [r1], m1, 1 + pextrd [r0+r2], m1, 2 + pextrd [r1+r2], m1, 3 +%else + movhlps m3, m1 movd [r0+r2], m3 psrldq m1, 4 psrldq m3, 4 movd [r1], m1 movd [r1+r2], m3 +%endif lea r3, [r3+r4*2] lea r0, [r0+r2*2] lea r1, [r1+r2*2] sub r5d, 2 jg .loop4 - REP_RET - + RET .width8: movu m0, [r3] pshufb m0, m5 movu m1, [r3+8] pshufb m1, m5 -%ifdef ARCH_X86_64 - SWAP m8, m6 - %define mult1 m8 +%if ARCH_X86_64 + SWAP 9, 6 + %define mult1 m9 %else mova r0m, m6 %define mult1 r0m @@ -1394,14 +2075,12 @@ cglobal mc_chroma_ssse3%1, 0,6,9 pmaddubsw m1, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m0, [pw_32] - paddw m1, [pw_32] paddw m0, m2 paddw m1, m3 - psrlw m0, 6 - psrlw m1, 6 + pmulhrsw m0, shiftround ; x + 32 >> 6 + pmulhrsw m1, shiftround packuswb m0, m1 - pshufd m0, m0, 0xd8 + pshufd m0, m0, q3120 movq [r0], m0 movhps [r1], m0 @@ -1415,30 +2094,41 @@ cglobal mc_chroma_ssse3%1, 0,6,9 pmaddubsw m6, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m4, [pw_32] - paddw m6, [pw_32] paddw m2, m4 paddw m3, m6 - psrlw m2, 6 - psrlw m3, 6 + pmulhrsw m2, shiftround + pmulhrsw m3, shiftround packuswb m2, m3 - pshufd m2, m2, 0xd8 + pshufd m2, m2, q3120 movq [r0+r2], m2 movhps [r1+r2], m2 +%endif lea r3, [r3+r4*2] lea r0, [r0+r2*2] lea r1, [r1+r2*2] sub r5d, 2 jg .loop8 - REP_RET + RET %endmacro -INIT_MMX -%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM -MC_CHROMA mmxext -INIT_XMM -MC_CHROMA sse2_misalign -%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD -MC_CHROMA sse2 +%if HIGH_BIT_DEPTH +INIT_MMX mmx2 +MC_CHROMA +INIT_XMM sse2 +MC_CHROMA +INIT_XMM avx +MC_CHROMA +%else ; !HIGH_BIT_DEPTH +INIT_MMX mmx2 +MC_CHROMA +INIT_XMM sse2 +MC_CHROMA +INIT_XMM ssse3 +MC_CHROMA_SSSE3 +INIT_XMM ssse3, cache64 +MC_CHROMA_SSSE3 +INIT_XMM avx +MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64 +INIT_YMM avx2 MC_CHROMA_SSSE3 -MC_CHROMA_SSSE3 _cache64 +%endif ; HIGH_BIT_DEPTH
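
The SSSE3 and AVX2 paths in this patch fold the old "paddw m0, [pw_32] /
psraw m0, 6" rounding into a single pmulhrsw against pw_512 (hence the
"; x + 32 >> 6" comments), and the ssse3 weighter's .fast path similarly
merges its shift into the scale factor (psllw m3, 7), guarded by the int16_t
overflow check noted in its comment. Below is a small self-contained C check
of that rounding identity under the assumptions stated at the top of the file
(log2_denom = 5, offset = 0, weight1 + weight2 = 64); it is an illustrative
sketch, not code from x264:

#include <assert.h>

/* Reference bi-weight: dst = (a*w0 + b*w1 + 32) >> 6. */
static int biweight_ref( int a, int b, int w0, int w1 )
{
    return ( a*w0 + b*w1 + 32 ) >> 6;
}

/* pmulhrsw computes (x*y + 0x4000) >> 15 per 16-bit lane.  With y = 512,
 * (x*512 + 16384) >> 15 == (x + 32) >> 6, so one pmulhrsw against pw_512
 * replaces the paddw/psraw pair used by the MMX/SSE2 paths. */
static int pmulhrsw_model( int x, int y )
{
    return ( x*y + 0x4000 ) >> 15;
}

int main( void )
{
    for( int a = 0; a < 256; a++ )
        for( int b = 0; b < 256; b++ )
            for( int w0 = 0; w0 <= 64; w0++ )
            {
                int w1  = 64 - w0;
                int sum = a*w0 + b*w1;   /* the per-pixel sum pmaddubsw produces */
                assert( pmulhrsw_model( sum, 512 ) == biweight_ref( a, b, w0, w1 ) );
            }
    return 0;
}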