X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fmc-a.asm;h=608efcd97b40d70c9223ab23719dcd95baf5de07;hb=64f4e24909924fceeea6e154d71b7dfbf586c7ea;hp=05455c50a5deb65985455ad47f53c9ada8619c7d;hpb=19ebada1c9a67ac0837936eb9269224cb3ce8dd7;p=x264 diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 05455c50..608efcd9 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -1,12 +1,15 @@ ;***************************************************************************** -;* mc-a.asm: h264 encoder library +;* mc-a.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2003-2008 x264 project +;* Copyright (C) 2003-2016 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Laurent Aimar +;* Dylan Yudaken +;* Holger Lubitz ;* Min Chen +;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -21,96 +24,126 @@ ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at licensing@x264.com. ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_32: times 8 dw 32 -pw_64: times 8 dw 64 -sw_64: dd 64 +ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf_adj: times 8 db 0 + times 8 db 2 + times 8 db 4 + times 8 db 6 +sq_1: times 1 dq 1 SECTION .text +cextern pb_0 +cextern pw_1 +cextern pw_4 +cextern pw_8 +cextern pw_32 +cextern pw_64 +cextern pw_512 +cextern pw_00ff +cextern pw_pixel_max +cextern sw_64 +cextern pd_32 +cextern deinterleave_shufd + ;============================================================================= -; weighted prediction +; implicit weighted biprediction ;============================================================================= -; implicit bipred only: ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 -%ifdef ARCH_X86_64 - %define t0 r0 - %define t1 r1 - %define t2 r2 - %define t3 r3 - %define t4 r4 - %define t5 r5 - %define t6d r10d - %define t7d r11d - %macro AVG_START 0 - PROLOGUE 6,7 - .height_loop: +%if WIN64 + DECLARE_REG_TMP 0,1,2,3,4,5,4,5 + %macro AVG_START 0-1 0 + PROLOGUE 6,7,%1 + %endmacro +%elif UNIX64 + DECLARE_REG_TMP 0,1,2,3,4,5,7,8 + %macro AVG_START 0-1 0 + PROLOGUE 6,9,%1 %endmacro %else - %define t0 r1 - %define t1 r2 - %define t2 r3 - %define t3 r4 - %define t4 r5 - %define t5 r6 - %define t6d r1d - %define t7d r2d - %macro AVG_START 0 - PROLOGUE 0,7 + DECLARE_REG_TMP 1,2,3,4,5,6,1,2 + %macro AVG_START 0-1 0 + PROLOGUE 0,7,%1 mov t0, r0m mov t1, r1m mov t2, r2m mov t3, r3m mov t4, r4m mov t5, r5m - .height_loop: %endmacro %endif -%macro SPLATW 2 -%if mmsize==16 - pshuflw %1, %2, 0 - punpcklqdq %1, %1 -%else - pshufw %1, %2, 0 -%endif +%macro AVG_END 0 + lea t4, [t4+t5*2*SIZEOF_PIXEL] + lea t2, [t2+t3*2*SIZEOF_PIXEL] + lea t0, [t0+t1*2*SIZEOF_PIXEL] + sub eax, 2 + jg .height_loop + RET %endmacro +%if HIGH_BIT_DEPTH + %macro BIWEIGHT_MMX 2 movh m0, %1 movh m1, %2 - punpcklbw m0, m7 - punpcklbw m1, m7 - pmullw m0, m4 - pmullw m1, m5 + punpcklwd m0, m1 + pmaddwd m0, m3 + paddd m0, m4 + psrad m0, 6 +%endmacro + +%macro 
BIWEIGHT_START_MMX 0 + movzx t6d, word r6m + mov t7d, 64 + sub t7d, t6d + shl t7d, 16 + add t6d, t7d + movd m3, t6d + SPLATD m3, m3 + mova m4, [pd_32] + pxor m5, m5 +%endmacro + +%else ;!HIGH_BIT_DEPTH +%macro BIWEIGHT_MMX 2 + movh m0, %1 + movh m1, %2 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, m2 + pmullw m1, m3 paddw m0, m1 - paddw m0, m6 + paddw m0, m4 psraw m0, 6 %endmacro %macro BIWEIGHT_START_MMX 0 - movd m4, r6m - SPLATW m4, m4 ; weight_dst - mova m5, [pw_64 GLOBAL] - psubw m5, m4 ; weight_src - mova m6, [pw_32 GLOBAL] ; rounding - pxor m7, m7 + movd m2, r6m + SPLATW m2, m2 ; weight_dst + mova m3, [pw_64] + psubw m3, m2 ; weight_src + mova m4, [pw_32] ; rounding + pxor m5, m5 %endmacro +%endif ;HIGH_BIT_DEPTH %macro BIWEIGHT_SSSE3 2 movh m0, %1 movh m1, %2 punpcklbw m0, m1 - pmaddubsw m0, m5 - paddw m0, m6 - psraw m0, 6 + pmaddubsw m0, m3 + pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 @@ -119,72 +152,465 @@ SECTION .text sub t7d, t6d shl t7d, 8 add t6d, t7d - movd m5, t6d - mova m6, [pw_32 GLOBAL] - SPLATW m5, m5 ; weight_dst,src + mova m4, [pw_512] + movd xm3, t6d +%if cpuflag(avx2) + vpbroadcastw m3, xm3 +%else + SPLATW m3, m3 ; weight_dst,src +%endif %endmacro +%if HIGH_BIT_DEPTH +%macro BIWEIGHT_ROW 4 + BIWEIGHT [%2], [%3] +%if %4==mmsize/4 + packssdw m0, m0 + CLIPW m0, m5, m7 + movh [%1], m0 +%else + SWAP 0, 6 + BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] + packssdw m6, m0 + CLIPW m6, m5, m7 + mova [%1], m6 +%endif +%endmacro + +%else ;!HIGH_BIT_DEPTH %macro BIWEIGHT_ROW 4 BIWEIGHT [%2], [%3] %if %4==mmsize/2 packuswb m0, m0 movh [%1], m0 %else - SWAP 0, 2 + SWAP 0, 6 BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] - packuswb m2, m0 - mova [%1], m2 + packuswb m6, m0 + mova [%1], m6 %endif %endmacro +%endif ;HIGH_BIT_DEPTH + ;----------------------------------------------------------------------------- -; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight ) +; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight ) ;----------------------------------------------------------------------------- -%macro AVG_WEIGHT 2 -cglobal x264_pixel_avg_weight_w%2_%1, 0,0 +%macro AVG_WEIGHT 1-2 0 +cglobal pixel_avg_weight_w%1 BIWEIGHT_START - AVG_START -%if %2==8 && mmsize==16 + AVG_START %2 +%if HIGH_BIT_DEPTH + mova m7, [pw_pixel_max] +%endif +.height_loop: +%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL) BIWEIGHT [t2], [t4] - SWAP 0, 2 - BIWEIGHT [t2+t3], [t4+t5] - packuswb m2, m0 - movlps [t0], m2 - movhps [t0+t1], m2 + SWAP 0, 6 + BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5] +%if HIGH_BIT_DEPTH + packssdw m6, m0 + CLIPW m6, m5, m7 +%else ;!HIGH_BIT_DEPTH + packuswb m6, m0 +%endif ;HIGH_BIT_DEPTH + movlps [t0], m6 + movhps [t0+SIZEOF_PIXEL*t1], m6 %else %assign x 0 -%rep 1+%2/(mmsize*2) - BIWEIGHT_ROW t0+x, t2+x, t4+x, %2 - BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2 +%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize + BIWEIGHT_ROW t0+x, t2+x, t4+x, %1 + BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1 %assign x x+mmsize %endrep %endif - lea t0, [t0+t1*2] - lea t2, [t2+t3*2] - lea t4, [t4+t5*2] - sub eax, 2 - jg .height_loop - REP_RET + AVG_END %endmacro %define BIWEIGHT BIWEIGHT_MMX %define BIWEIGHT_START BIWEIGHT_START_MMX -INIT_MMX -AVG_WEIGHT mmxext, 4 -AVG_WEIGHT mmxext, 8 -AVG_WEIGHT mmxext, 16 -INIT_XMM -%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext -AVG_WEIGHT sse2, 8 -AVG_WEIGHT sse2, 16 +INIT_MMX mmx2 +AVG_WEIGHT 4 
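; A hedged reference sketch (C, shown here as comments) of the per-pixel math
; the BIWEIGHT macros above implement, under the stated constraints
; log2_denom = 5, offset = 0, w1 + w2 = 64 and assuming 8-bit pixels; the
; helper names are illustrative, not part of x264's API.
;
;   #include <stdint.h>
;   static inline uint8_t clip_uint8( int x ) { return x < 0 ? 0 : x > 255 ? 255 : x; }
;   /* dst[i] = ( src1[i]*w + src2[i]*(64-w) + 32 ) >> 6, with w = i_weight */
;   static void avg_weight_ref( uint8_t *dst, const uint8_t *src1,
;                               const uint8_t *src2, int n, int i_weight )
;   {
;       for( int i = 0; i < n; i++ )
;           dst[i] = clip_uint8( ( src1[i]*i_weight + src2[i]*(64-i_weight) + 32 ) >> 6 );
;   }
;
; The SSSE3/AVX2 paths interleave src1/src2 bytes and pack (w, 64-w) as byte
; pairs so a single pmaddubsw forms both products, and pmulhrsw against pw_512
; performs the final (x+32)>>6 rounding in one instruction.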
+AVG_WEIGHT 8 +AVG_WEIGHT 16 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +AVG_WEIGHT 4, 8 +AVG_WEIGHT 8, 8 +AVG_WEIGHT 16, 8 +%else ;!HIGH_BIT_DEPTH +INIT_XMM sse2 +AVG_WEIGHT 8, 7 +AVG_WEIGHT 16, 7 %define BIWEIGHT BIWEIGHT_SSSE3 %define BIWEIGHT_START BIWEIGHT_START_SSSE3 -INIT_MMX -AVG_WEIGHT ssse3, 4 -INIT_XMM -AVG_WEIGHT ssse3, 8 -AVG_WEIGHT ssse3, 16 +INIT_MMX ssse3 +AVG_WEIGHT 4 +INIT_XMM ssse3 +AVG_WEIGHT 8, 7 +AVG_WEIGHT 16, 7 + +INIT_YMM avx2 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 m0, m0, [t2+t3], 1 + vinserti128 m1, m1, [t4+t5], 1 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], m0, 1 + AVG_END +%endif ;HIGH_BIT_DEPTH + +;============================================================================= +; P frame explicit weighted prediction +;============================================================================= + +%if HIGH_BIT_DEPTH +; width +%macro WEIGHT_START 1 + mova m0, [r4+ 0] ; 1<= mmsize + WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 + %assign x (x+mmsize) +%else + %assign w %3-x +%if w == 20 + %assign w 16 +%endif + WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4 + %assign x (x+w) +%endif +%if x >= %3 + %exitrep +%endif +%endrep +%endmacro + +%endif ; HIGH_BIT_DEPTH + +;----------------------------------------------------------------------------- +;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h ) +;----------------------------------------------------------------------------- + +%macro WEIGHTER 1 +cglobal mc_weight_w%1, 6,6,8 + FIX_STRIDES r1, r3 + WEIGHT_START %1 +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 + ; we can merge the shift step into the scale factor + ; if (m3<<7) doesn't overflow an int16_t + cmp byte [r4+1], 0 + jz .fast +%endif +.loop: + WEIGHT_TWO_ROW r2, r0, %1, 0 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .loop + RET +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 +.fast: + psllw m3, 7 +.fastloop: + WEIGHT_TWO_ROW r2, r0, %1, 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .fastloop + RET +%endif +%endmacro + +INIT_MMX mmx2 +WEIGHTER 4 +WEIGHTER 8 +WEIGHTER 12 +WEIGHTER 16 +WEIGHTER 20 +INIT_XMM sse2 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +%if HIGH_BIT_DEPTH +WEIGHTER 12 +%else +INIT_MMX ssse3 +WEIGHTER 4 +INIT_XMM ssse3 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +INIT_YMM avx2 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 +%endif + +%macro OFFSET_OP 7 + mov%6 m0, [%1] + mov%6 m1, [%2] +%if HIGH_BIT_DEPTH + p%5usw m0, m2 + p%5usw m1, m2 +%ifidn %5,add + pminsw m0, m3 + pminsw m1, m3 +%endif +%else + p%5usb m0, m2 + p%5usb m1, m2 +%endif + mov%7 [%3], m0 + mov%7 [%4], m1 +%endmacro + +%macro OFFSET_TWO_ROW 4 +%assign x 0 +%rep %3 +%if (%3*SIZEOF_PIXEL-x) >= mmsize + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a + %assign x (x+mmsize) +%else +%if HIGH_BIT_DEPTH + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h +%else + OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d +%endif + %exitrep +%endif +%if x >= %3*SIZEOF_PIXEL + %exitrep +%endif +%endrep +%endmacro + +;----------------------------------------------------------------------------- +;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h ) +;----------------------------------------------------------------------------- +%macro OFFSET 2 +cglobal mc_offset%2_w%1, 
6,6 + FIX_STRIDES r1, r3 + mova m2, [r4] +%if HIGH_BIT_DEPTH +%ifidn %2,add + mova m3, [pw_pixel_max] +%endif +%endif +.loop: + OFFSET_TWO_ROW r2, r0, %1, %2 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .loop + RET +%endmacro + +%macro OFFSETPN 1 + OFFSET %1, add + OFFSET %1, sub +%endmacro +INIT_MMX mmx2 +OFFSETPN 4 +OFFSETPN 8 +OFFSETPN 12 +OFFSETPN 16 +OFFSETPN 20 +INIT_XMM sse2 +OFFSETPN 12 +OFFSETPN 16 +OFFSETPN 20 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +OFFSETPN 8 +%endif ;============================================================================= @@ -192,108 +618,342 @@ AVG_WEIGHT ssse3, 16 ;============================================================================= ;----------------------------------------------------------------------------- -; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight ); +; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int weight ); ;----------------------------------------------------------------------------- -%macro AVGH 3 -cglobal x264_pixel_avg_%1x%2_%3,0,0 +%macro AVGH 2 +cglobal pixel_avg_%1x%2 mov eax, %2 cmp dword r6m, 32 - jne x264_pixel_avg_weight_w%1_%3 + jne pixel_avg_weight_w%1 %+ SUFFIX +%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads + jmp pixel_avg_w%1_avx2 +%else %if mmsize == 16 && %1 == 16 test dword r4m, 15 - jz x264_pixel_avg_w%1_sse2 + jz pixel_avg_w%1_sse2 +%endif + jmp pixel_avg_w%1_mmx2 %endif - jmp x264_pixel_avg_w%1_mmxext %endmacro ;----------------------------------------------------------------------------- -; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, -; int height, int weight ); +; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int height, int weight ); ;----------------------------------------------------------------------------- -%macro AVG_END 0 - sub eax, 2 - lea t4, [t4+t5*2] - lea t2, [t2+t3*2] - lea t0, [t0+t1*2] - jg .height_loop - REP_RET -%endmacro - %macro AVG_FUNC 3 -cglobal %1 +cglobal pixel_avg_w%1 AVG_START - %2 m0, [t2] - %2 m1, [t2+t3] - pavgb m0, [t4] - pavgb m1, [t4+t5] - %3 [t0], m0 - %3 [t0+t1], m1 +.height_loop: +%assign x 0 +%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize + %2 m0, [t2+x] + %2 m1, [t2+x+SIZEOF_PIXEL*t3] +%if HIGH_BIT_DEPTH + pavgw m0, [t4+x] + pavgw m1, [t4+x+SIZEOF_PIXEL*t5] +%else ;!HIGH_BIT_DEPTH + pavgb m0, [t4+x] + pavgb m1, [t4+x+SIZEOF_PIXEL*t5] +%endif + %3 [t0+x], m0 + %3 [t0+x+SIZEOF_PIXEL*t1], m1 +%assign x x+mmsize +%endrep AVG_END %endmacro -INIT_MMX -AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd -AVGH 4, 8, mmxext -AVGH 4, 4, mmxext -AVGH 4, 2, mmxext +%if HIGH_BIT_DEPTH + +INIT_MMX mmx2 +AVG_FUNC 4, movq, movq +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movq, movq +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 16, movq, movq +AVGH 16, 16 +AVGH 16, 8 + +INIT_XMM sse2 +AVG_FUNC 4, movq, movq +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movdqu, movdqa +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 + +%else ;!HIGH_BIT_DEPTH + +INIT_MMX mmx2 +AVG_FUNC 4, movd, movd +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 + +AVG_FUNC 8, movq, movq +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 + +AVG_FUNC 16, movq, movq +AVGH 16, 16 +AVGH 16, 8 + +INIT_XMM sse2 
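; A hedged C sketch (as comments) of what one row of AVG_FUNC computes and how
; the AVGH stubs above dispatch; assumes 8-bit pixels, helper name illustrative.
;
;   #include <stdint.h>
;   /* pavgb/pavgw compute a rounded average, (a + b + 1) >> 1.  AVGH takes
;    * this path only when the bipred weight is exactly 32 (equal weights);
;    * any other weight branches to pixel_avg_weight_wN.  The SSE2 width-16
;    * kernel additionally requires src2 (r4m) to be 16-byte aligned, while
;    * the AVX2 build always uses its unaligned path. */
;   static void pixel_avg_row_ref( uint8_t *dst, const uint8_t *src1,
;                                  const uint8_t *src2, int width )
;   {
;       for( int x = 0; x < width; x++ )
;           dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
;   }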
+AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 +INIT_XMM ssse3 +AVGH 16, 16 +AVGH 16, 8 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 +INIT_MMX ssse3 +AVGH 4, 16 +AVGH 4, 8 +AVGH 4, 4 +AVGH 4, 2 +INIT_XMM avx2 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 + +%endif ;HIGH_BIT_DEPTH -AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq -AVGH 8, 16, mmxext -AVGH 8, 8, mmxext -AVGH 8, 4, mmxext -cglobal x264_pixel_avg_w16_mmxext - AVG_START - movq mm0, [t2 ] - movq mm1, [t2+8] - movq mm2, [t2+t3 ] - movq mm3, [t2+t3+8] - pavgb mm0, [t4 ] - pavgb mm1, [t4+8] - pavgb mm2, [t4+t5 ] - pavgb mm3, [t4+t5+8] - movq [t0 ], mm0 - movq [t0+8], mm1 - movq [t0+t1 ], mm2 - movq [t0+t1+8], mm3 - AVG_END -AVGH 16, 16, mmxext -AVGH 16, 8, mmxext +;============================================================================= +; pixel avg2 +;============================================================================= + +%if HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride, +; uint16_t *src1, intptr_t src_stride, +; uint16_t *src2, int height ); +;----------------------------------------------------------------------------- +%macro AVG2_W_ONE 1 +cglobal pixel_avg2_w%1, 6,7,4 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2] + movu m1, [r2+r3*2] +%if cpuflag(avx) || mmsize == 8 + pavgw m0, [r2+r4] + pavgw m1, [r2+r6] +%else + movu m2, [r2+r4] + movu m3, [r2+r6] + pavgw m0, m2 + pavgw m1, m3 +%endif + mova [r0], m0 + mova [r0+r1*2], m1 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] + sub r5d, 2 + jg .height_loop + RET +%endmacro + +%macro AVG2_W_TWO 3 +cglobal pixel_avg2_w%1, 6,7,8 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2] + %2 m1, [r2+mmsize] + movu m2, [r2+r3*2] + %2 m3, [r2+r3*2+mmsize] +%if mmsize == 8 + pavgw m0, [r2+r4] + pavgw m1, [r2+r4+mmsize] + pavgw m2, [r2+r6] + pavgw m3, [r2+r6+mmsize] +%else + movu m4, [r2+r4] + %2 m5, [r2+r4+mmsize] + movu m6, [r2+r6] + %2 m7, [r2+r6+mmsize] + pavgw m0, m4 + pavgw m1, m5 + pavgw m2, m6 + pavgw m3, m7 +%endif + mova [r0], m0 + %3 [r0+mmsize], m1 + mova [r0+r1*2], m2 + %3 [r0+r1*2+mmsize], m3 + lea r2, [r2+r3*4] + lea r0, [r0+r1*4] + sub r5d, 2 + jg .height_loop + RET +%endmacro + +INIT_MMX mmx2 +AVG2_W_ONE 4 +AVG2_W_TWO 8, movu, mova +INIT_XMM sse2 +AVG2_W_ONE 8 +AVG2_W_TWO 10, movd, movd +AVG2_W_TWO 16, movu, mova +INIT_YMM avx2 +AVG2_W_ONE 16 -INIT_XMM -AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa -AVGH 16, 16, sse2 -AVGH 16, 8, sse2 -AVGH 8, 16, sse2 -AVGH 8, 8, sse2 -AVGH 8, 4, sse2 -AVGH 16, 16, ssse3 -AVGH 16, 8, ssse3 -AVGH 8, 16, ssse3 -AVGH 8, 8, ssse3 -AVGH 8, 4, ssse3 INIT_MMX -AVGH 4, 8, ssse3 -AVGH 4, 4, ssse3 -AVGH 4, 2, ssse3 +cglobal pixel_avg2_w10_mmx2, 6,7 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2+ 0] + movu m1, [r2+ 8] + movh m2, [r2+16] + movu m3, [r2+r3*2+ 0] + movu m4, [r2+r3*2+ 8] + movh m5, [r2+r3*2+16] + pavgw m0, [r2+r4+ 0] + pavgw m1, [r2+r4+ 8] + pavgw m2, [r2+r4+16] + pavgw m3, [r2+r6+ 0] + pavgw m4, [r2+r6+ 8] + pavgw m5, [r2+r6+16] + mova [r0+ 0], m0 + mova [r0+ 8], m1 + movh [r0+16], m2 + mova [r0+r1*2+ 0], m3 + mova [r0+r1*2+ 8], m4 + movh [r0+r1*2+16], m5 + lea r2, [r2+r3*2*2] + lea r0, [r0+r1*2*2] + sub r5d, 2 + jg .height_loop + RET +cglobal pixel_avg2_w16_mmx2, 6,7 + sub r4, r2 + lea r6, [r4+r3*2] +.height_loop: + movu m0, [r2+ 0] + movu m1, [r2+ 8] + movu m2, [r2+16] + movu m3, [r2+24] + movu m4, [r2+r3*2+ 0] + movu m5, [r2+r3*2+ 8] + movu m6, 
[r2+r3*2+16] + movu m7, [r2+r3*2+24] + pavgw m0, [r2+r4+ 0] + pavgw m1, [r2+r4+ 8] + pavgw m2, [r2+r4+16] + pavgw m3, [r2+r4+24] + pavgw m4, [r2+r6+ 0] + pavgw m5, [r2+r6+ 8] + pavgw m6, [r2+r6+16] + pavgw m7, [r2+r6+24] + mova [r0+ 0], m0 + mova [r0+ 8], m1 + mova [r0+16], m2 + mova [r0+24], m3 + mova [r0+r1*2+ 0], m4 + mova [r0+r1*2+ 8], m5 + mova [r0+r1*2+16], m6 + mova [r0+r1*2+24], m7 + lea r2, [r2+r3*2*2] + lea r0, [r0+r1*2*2] + sub r5d, 2 + jg .height_loop + RET +cglobal pixel_avg2_w18_mmx2, 6,7 + sub r4, r2 +.height_loop: + movu m0, [r2+ 0] + movu m1, [r2+ 8] + movu m2, [r2+16] + movu m3, [r2+24] + movh m4, [r2+32] + pavgw m0, [r2+r4+ 0] + pavgw m1, [r2+r4+ 8] + pavgw m2, [r2+r4+16] + pavgw m3, [r2+r4+24] + pavgw m4, [r2+r4+32] + mova [r0+ 0], m0 + mova [r0+ 8], m1 + mova [r0+16], m2 + mova [r0+24], m3 + movh [r0+32], m4 + lea r2, [r2+r3*2] + lea r0, [r0+r1*2] + dec r5d + jg .height_loop + RET -;============================================================================= -; pixel avg2 -;============================================================================= +%macro PIXEL_AVG_W18 0 +cglobal pixel_avg2_w18, 6,7 + sub r4, r2 +.height_loop: + movu m0, [r2+ 0] + movd xm2, [r2+32] +%if mmsize == 32 + pavgw m0, [r2+r4+ 0] + movd xm1, [r2+r4+32] + pavgw xm2, xm1 +%else + movu m1, [r2+16] + movu m3, [r2+r4+ 0] + movu m4, [r2+r4+16] + movd m5, [r2+r4+32] + pavgw m0, m3 + pavgw m1, m4 + pavgw m2, m5 + mova [r0+16], m1 +%endif + mova [r0+ 0], m0 + movd [r0+32], xm2 + lea r2, [r2+r3*2] + lea r0, [r0+r1*2] + dec r5d + jg .height_loop + RET +%endmacro + +INIT_XMM sse2 +PIXEL_AVG_W18 +INIT_YMM avx2 +PIXEL_AVG_W18 + +%endif ; HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src1, int src_stride, -; uint8_t *src2, int height ); +; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride, +; uint8_t *src1, intptr_t src_stride, +; uint8_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W8 2 -cglobal x264_pixel_avg2_w%1_mmxext, 6,7 +cglobal pixel_avg2_w%1_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: @@ -301,124 +961,133 @@ cglobal x264_pixel_avg2_w%1_mmxext, 6,7 %2 mm1, [r2+r3] pavgb mm0, [r2+r4] pavgb mm1, [r2+r6] + lea r2, [r2+r3*2] %2 [r0], mm0 %2 [r0+r1], mm1 - sub r5d, 2 - lea r2, [r2+r3*2] lea r0, [r0+r1*2] + sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro +INIT_MMX AVG2_W8 4, movd AVG2_W8 8, movq %macro AVG2_W16 2 -cglobal x264_pixel_avg2_w%1_mmxext, 6,7 - sub r4, r2 - lea r6, [r4+r3] +cglobal pixel_avg2_w%1_mmx2, 6,7 + sub r2, r4 + lea r6, [r2+r3] .height_loop: - movq mm0, [r2] - %2 mm1, [r2+8] - movq mm2, [r2+r3] - %2 mm3, [r2+r3+8] - pavgb mm0, [r2+r4] - pavgb mm1, [r2+r4+8] - pavgb mm2, [r2+r6] - pavgb mm3, [r2+r6+8] + movq mm0, [r4] + %2 mm1, [r4+8] + movq mm2, [r4+r3] + %2 mm3, [r4+r3+8] + pavgb mm0, [r4+r2] + pavgb mm1, [r4+r2+8] + pavgb mm2, [r4+r6] + pavgb mm3, [r4+r6+8] + lea r4, [r4+r3*2] movq [r0], mm0 %2 [r0+8], mm1 movq [r0+r1], mm2 %2 [r0+r1+8], mm3 - lea r2, [r2+r3*2] lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET %endmacro AVG2_W16 12, movd AVG2_W16 16, movq -cglobal x264_pixel_avg2_w20_mmxext, 6,7 - sub r4, r2 - lea r6, [r4+r3] +cglobal pixel_avg2_w20_mmx2, 6,7 + sub r2, r4 + lea r6, [r2+r3] .height_loop: - movq mm0, [r2] - movq mm1, [r2+8] - movd mm2, [r2+16] - movq mm3, [r2+r3] - movq mm4, [r2+r3+8] - movd mm5, [r2+r3+16] - pavgb mm0, 
[r2+r4] - pavgb mm1, [r2+r4+8] - pavgb mm2, [r2+r4+16] - pavgb mm3, [r2+r6] - pavgb mm4, [r2+r6+8] - pavgb mm5, [r2+r6+16] + movq mm0, [r4] + movq mm1, [r4+8] + movd mm2, [r4+16] + movq mm3, [r4+r3] + movq mm4, [r4+r3+8] + movd mm5, [r4+r3+16] + pavgb mm0, [r4+r2] + pavgb mm1, [r4+r2+8] + pavgb mm2, [r4+r2+16] + pavgb mm3, [r4+r6] + pavgb mm4, [r4+r6+8] + pavgb mm5, [r4+r6+16] + lea r4, [r4+r3*2] movq [r0], mm0 movq [r0+8], mm1 movd [r0+16], mm2 movq [r0+r1], mm3 movq [r0+r1+8], mm4 movd [r0+r1+16], mm5 - lea r2, [r2+r3*2] lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET + RET -cglobal x264_pixel_avg2_w16_sse2, 6,7 +INIT_XMM +cglobal pixel_avg2_w16_sse2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: - movdqu xmm0, [r2] - movdqu xmm2, [r2+r3] - movdqu xmm1, [r2+r4] - movdqu xmm3, [r2+r6] - pavgb xmm0, xmm1 - pavgb xmm2, xmm3 - movdqa [r0], xmm0 - movdqa [r0+r1], xmm2 + movu m0, [r2] + movu m2, [r2+r3] + movu m1, [r2+r4] + movu m3, [r2+r6] lea r2, [r2+r3*2] + pavgb m0, m1 + pavgb m2, m3 + mova [r0], m0 + mova [r0+r1], m2 lea r0, [r0+r1*2] - sub r5d, 2 - jg .height_loop - REP_RET + sub r5d, 2 + jg .height_loop + RET -%macro AVG2_W20 1 -cglobal x264_pixel_avg2_w20_%1, 6,7 - sub r4, r2 - lea r6, [r4+r3] +cglobal pixel_avg2_w20_sse2, 6,7 + sub r2, r4 + lea r6, [r2+r3] .height_loop: - movdqu xmm0, [r2] - movdqu xmm2, [r2+r3] - movd mm4, [r2+16] - movd mm5, [r2+r3+16] -%ifidn %1, sse2_misalign - pavgb xmm0, [r2+r4] - pavgb xmm2, [r2+r6] -%else - movdqu xmm1, [r2+r4] - movdqu xmm3, [r2+r6] - pavgb xmm0, xmm1 - pavgb xmm2, xmm3 -%endif - pavgb mm4, [r2+r4+16] - pavgb mm5, [r2+r6+16] - movdqa [r0], xmm0 - movd [r0+16], mm4 - movdqa [r0+r1], xmm2 - movd [r0+r1+16], mm5 - lea r2, [r2+r3*2] + movu m0, [r4] + movu m2, [r4+r3] + movu m1, [r4+r2] + movu m3, [r4+r6] + movd mm4, [r4+16] + movd mm5, [r4+r3+16] + pavgb m0, m1 + pavgb m2, m3 + pavgb mm4, [r4+r2+16] + pavgb mm5, [r4+r6+16] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m2 + movd [r0+16], mm4 + movd [r0+r1+16], mm5 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + +INIT_YMM avx2 +cglobal pixel_avg2_w20, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movu m0, [r4] + movu m1, [r4+r3] + pavgb m0, [r4+r2] + pavgb m1, [r4+r6] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m1 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop - REP_RET -%endmacro - -AVG2_W20 sse2 -AVG2_W20 sse2_misalign + RET ; Cacheline split code for processors with high latencies for loads ; split over cache lines. See sad-a.asm for a more detailed explanation. 
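; A hedged C sketch (as comments) of the shift-and-merge trick used by
; INIT_SHIFT / AVG_CACHELINE_LOOP below: an unaligned 8-byte load that would
; straddle a 64-byte cacheline is rebuilt from two aligned loads.  Names are
; illustrative; base = addr & ~7 and s = 8 * (addr & 7).
;
;   #include <stdint.h>
;   /* psrlq/psllq/por perform the same merge; an MMX shift by 64 yields 0,
;    * which plain C must special-case to avoid undefined behaviour. */
;   static uint64_t load_split_ref( const uint64_t *base, unsigned s )
;   {
;       uint64_t lo = base[0], hi = base[1];
;       return s ? ( lo >> s ) | ( hi << ( 64 - s ) ) : lo;
;   }
;
; AVG_CACHELINE_CHECK only routes a call here when the source offset within
; the cacheline shows that a row would actually cross a line; the SSSE3
; cache64 variant instead uses palignr with a computed jump into the
; fixed-size 48-byte loop bodies generated below.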
@@ -431,25 +1100,11 @@ AVG2_W20 sse2_misalign %macro INIT_SHIFT 2 and eax, 7 shl eax, 3 - movd %1, [sw_64 GLOBAL] + movd %1, [sw_64] movd %2, eax psubw %1, %2 %endmacro -%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set -cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0 - mov eax, r2m - and eax, 0x1f|(%2>>1) - cmp eax, (32-%1)|(%2>>1) - jle x264_pixel_avg2_w%1_%3 -;w12 isn't needed because w16 is just as fast if there's no cacheline split -%if %1 == 12 - jmp x264_pixel_avg2_w16_cache_mmxext -%else - jmp x264_pixel_avg2_w%1_cache_mmxext -%endif -%endmacro - %macro AVG_CACHELINE_START 0 %assign stack_offset 0 INIT_SHIFT mm6, mm7 @@ -463,215 +1118,339 @@ cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0 %endmacro %macro AVG_CACHELINE_LOOP 2 - movq mm0, [r2+8+%1] movq mm1, [r2+%1] - movq mm2, [r2+r4+8+%1] + movq mm0, [r2+8+%1] movq mm3, [r2+r4+%1] - psllq mm0, mm6 + movq mm2, [r2+r4+8+%1] psrlq mm1, mm7 - psllq mm2, mm4 + psllq mm0, mm6 psrlq mm3, mm5 + psllq mm2, mm4 por mm0, mm1 por mm2, mm3 - pavgb mm0, mm2 - %2 [r0+%1], mm0 + pavgb mm2, mm0 + %2 [r0+%1], mm2 %endmacro -x264_pixel_avg2_w8_cache_mmxext: - AVG_CACHELINE_START - AVG_CACHELINE_LOOP 0, movq - add r2, r3 - add r0, r1 - dec r5d - jg .height_loop - RET - -x264_pixel_avg2_w16_cache_mmxext: - AVG_CACHELINE_START - AVG_CACHELINE_LOOP 0, movq - AVG_CACHELINE_LOOP 8, movq - add r2, r3 - add r0, r1 - dec r5d - jg .height_loop - RET - -x264_pixel_avg2_w20_cache_mmxext: +%macro AVG_CACHELINE_FUNC 2 +pixel_avg2_w%1_cache_mmx2: AVG_CACHELINE_START AVG_CACHELINE_LOOP 0, movq +%if %1>8 AVG_CACHELINE_LOOP 8, movq +%if %1>16 AVG_CACHELINE_LOOP 16, movd +%endif +%endif add r2, r3 add r0, r1 dec r5d jg .height_loop RET +%endmacro -%ifndef ARCH_X86_64 -AVG_CACHELINE_CHECK 8, 32, mmxext -AVG_CACHELINE_CHECK 12, 32, mmxext -AVG_CACHELINE_CHECK 16, 32, mmxext -AVG_CACHELINE_CHECK 20, 32, mmxext -AVG_CACHELINE_CHECK 16, 64, mmxext -AVG_CACHELINE_CHECK 20, 64, mmxext +%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set +%if %1 == 12 +;w12 isn't needed because w16 is just as fast if there's no cacheline split +%define cachesplit pixel_avg2_w16_cache_mmx2 +%else +%define cachesplit pixel_avg2_w%1_cache_mmx2 %endif +cglobal pixel_avg2_w%1_cache%2_%3 + mov eax, r2m + and eax, %2-1 + cmp eax, (%2-%1-(%1 % 8)) +%if %1==12||%1==20 + jbe pixel_avg2_w%1_%3 +%else + jb pixel_avg2_w%1_%3 +%endif +%if 0 ; or %1==8 - but the extra branch seems too expensive + ja cachesplit +%if ARCH_X86_64 + test r4b, 1 +%else + test byte r4m, 1 +%endif + jz pixel_avg2_w%1_%3 +%else + or eax, r4m + and eax, 7 + jz pixel_avg2_w%1_%3 + mov eax, r2m +%endif +%if mmsize==16 || (%1==8 && %2==64) + AVG_CACHELINE_FUNC %1, %2 +%else + jmp cachesplit +%endif +%endmacro -AVG_CACHELINE_CHECK 8, 64, mmxext -AVG_CACHELINE_CHECK 12, 64, mmxext +INIT_MMX +AVG_CACHELINE_CHECK 8, 64, mmx2 +AVG_CACHELINE_CHECK 12, 64, mmx2 +%if ARCH_X86_64 == 0 +AVG_CACHELINE_CHECK 16, 64, mmx2 +AVG_CACHELINE_CHECK 20, 64, mmx2 +AVG_CACHELINE_CHECK 8, 32, mmx2 +AVG_CACHELINE_CHECK 12, 32, mmx2 +AVG_CACHELINE_CHECK 16, 32, mmx2 +AVG_CACHELINE_CHECK 20, 32, mmx2 +%endif +INIT_XMM AVG_CACHELINE_CHECK 16, 64, sse2 AVG_CACHELINE_CHECK 20, 64, sse2 +; computed jump assumes this loop is exactly 48 bytes +%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment +ALIGN 16 +avg_w16_align%1_%2_ssse3: +%if %1==0 && %2==0 + movdqa xmm1, [r2] + pavgb xmm1, [r2+r4] + add r2, r3 +%elif %1==0 + movdqa xmm1, [r2+r4+16] + palignr xmm1, [r2+r4], %2 + pavgb xmm1, [r2] + add r2, r3 +%elif %2&15==0 + movdqa xmm1, [r2+16] + palignr 
xmm1, [r2], %1 + pavgb xmm1, [r2+r4] + add r2, r3 +%else + movdqa xmm1, [r2+16] + movdqa xmm2, [r2+r4+16] + palignr xmm1, [r2], %1 + palignr xmm2, [r2+r4], %2&15 + add r2, r3 + pavgb xmm1, xmm2 +%endif + movdqa [r0], xmm1 + add r0, r1 + dec r5d + jg avg_w16_align%1_%2_ssse3 + ret +%if %1==0 + ; make sure the first ones don't end up short + ALIGN 16 + times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop +%endif +%endmacro + +cglobal pixel_avg2_w16_cache64_ssse3 +%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized + mov eax, r2m + and eax, 0x3f + cmp eax, 0x30 + jb x264_pixel_avg2_w16_sse2 + or eax, r4m + and eax, 7 + jz x264_pixel_avg2_w16_sse2 +%endif + PROLOGUE 6, 8 + lea r6, [r4+r2] + and r4, ~0xf + and r6, 0x1f + and r2, ~0xf + lea r6, [r6*3] ;(offset + align*2)*3 + sub r4, r2 + shl r6, 4 ;jump = (offset + align*2)*48 +%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3) +%ifdef PIC + lea r7, [avg_w16_addr] + add r6, r7 +%else + lea r6, [avg_w16_addr + r6] +%endif + TAIL_CALL r6, 1 + +%assign j 0 +%assign k 1 +%rep 16 +AVG16_CACHELINE_LOOP_SSSE3 j, j +AVG16_CACHELINE_LOOP_SSSE3 j, k +%assign j j+1 +%assign k k+1 +%endrep +%endif ; !HIGH_BIT_DEPTH + ;============================================================================= ; pixel copy ;============================================================================= -%macro COPY4 4 - %2 m0, [r2] - %2 m1, [r2+r3] - %2 m2, [r2+r3*2] - %2 m3, [r2+%4] - %1 [r0], m0 - %1 [r0+r1], m1 - %1 [r0+r1*2], m2 - %1 [r0+%3], m3 +%macro COPY1 2 + movu m0, [r2] + movu m1, [r2+r3] + movu m2, [r2+r3*2] + movu m3, [r2+%2] + mova [r0], m0 + mova [r0+r1], m1 + mova [r0+r1*2], m2 + mova [r0+%1], m3 +%endmacro + +%macro COPY2 2-4 0, 1 + movu m0, [r2+%3*mmsize] + movu m1, [r2+%4*mmsize] + movu m2, [r2+r3+%3*mmsize] + movu m3, [r2+r3+%4*mmsize] + mova [r0+%3*mmsize], m0 + mova [r0+%4*mmsize], m1 + mova [r0+r1+%3*mmsize], m2 + mova [r0+r1+%4*mmsize], m3 + movu m0, [r2+r3*2+%3*mmsize] + movu m1, [r2+r3*2+%4*mmsize] + movu m2, [r2+%2+%3*mmsize] + movu m3, [r2+%2+%4*mmsize] + mova [r0+r1*2+%3*mmsize], m0 + mova [r0+r1*2+%4*mmsize], m1 + mova [r0+%1+%3*mmsize], m2 + mova [r0+%1+%4*mmsize], m3 +%endmacro + +%macro COPY4 2 + COPY2 %1, %2, 0, 1 + COPY2 %1, %2, 2, 3 %endmacro -INIT_MMX ;----------------------------------------------------------------------------- -; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) +; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride, +; uint8_t *src, intptr_t i_src_stride, int i_height ) ;----------------------------------------------------------------------------- -cglobal x264_mc_copy_w4_mmx, 4,6 - cmp dword r4m, 4 +INIT_MMX +cglobal mc_copy_w4_mmx, 4,6 + FIX_STRIDES r1, r3 + cmp dword r4m, 4 lea r5, [r3*3] lea r4, [r1*3] je .end - COPY4 movd, movd, r4, r5 +%if HIGH_BIT_DEPTH == 0 + %define mova movd + %define movu movd +%endif + COPY1 r4, r5 lea r2, [r2+r3*4] lea r0, [r0+r1*4] .end: - COPY4 movd, movd, r4, r5 + COPY1 r4, r5 RET -cglobal x264_mc_copy_w8_mmx, 5,7 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - COPY4 movq, movq, r5, r6 - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET - -cglobal x264_mc_copy_w16_mmx, 5,7 - lea r6, [r3*3] - lea r5, [r1*3] -.height_loop: - movq mm0, [r2] - movq mm1, [r2+8] - movq mm2, [r2+r3] - movq mm3, [r2+r3+8] - movq mm4, [r2+r3*2] - movq mm5, [r2+r3*2+8] - movq mm6, [r2+r6] - movq mm7, [r2+r6+8] - movq [r0], mm0 - movq [r0+8], mm1 - movq [r0+r1], mm2 - movq 
[r0+r1+8], mm3 - movq [r0+r1*2], mm4 - movq [r0+r1*2+8], mm5 - movq [r0+r5], mm6 - movq [r0+r5+8], mm7 - lea r2, [r2+r3*4] - lea r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET - -INIT_XMM -%macro COPY_W16_SSE2 2 -cglobal %1, 5,7 +%macro MC_COPY 1 +%assign %%w %1*SIZEOF_PIXEL/mmsize +%if %%w > 0 +cglobal mc_copy_w%1, 5,7 + FIX_STRIDES r1, r3 lea r6, [r3*3] lea r5, [r1*3] .height_loop: - COPY4 movdqa, %2, r5, r6 + COPY %+ %%w r5, r6 lea r2, [r2+r3*4] lea r0, [r0+r1*4] - sub r4d, 4 - jg .height_loop - REP_RET + sub r4d, 4 + jg .height_loop + RET +%endif %endmacro -COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu -; cacheline split with mmx has too much overhead; the speed benefit is near-zero. -; but with SSE3 the overhead is zero, so there's no reason not to include it. -COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu -COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa - - +INIT_MMX mmx +MC_COPY 8 +MC_COPY 16 +INIT_XMM sse +MC_COPY 8 +MC_COPY 16 +INIT_XMM aligned, sse +MC_COPY 16 +%if HIGH_BIT_DEPTH +INIT_YMM avx +MC_COPY 16 +INIT_YMM aligned, avx +MC_COPY 16 +%endif ;============================================================================= ; prefetch ;============================================================================= -; FIXME assumes 64 byte cachelines +; assumes 64 byte cachelines +; FIXME doesn't cover all pixels in high depth and/or 4:4:4 ;----------------------------------------------------------------------------- -; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y, -; uint8_t *pix_uv, int stride_uv, int mb_x ) +; void prefetch_fenc( pixel *pix_y, intptr_t stride_y, +; pixel *pix_uv, intptr_t stride_uv, int mb_x ) ;----------------------------------------------------------------------------- -%ifdef ARCH_X86_64 -cglobal x264_prefetch_fenc_mmxext, 5,5 + +%macro PREFETCH_FENC 1 +%if ARCH_X86_64 +cglobal prefetch_fenc_%1, 5,5 + FIX_STRIDES r1, r3 + and r4d, 3 mov eax, r4d - and eax, 3 - imul eax, r1d - lea r0, [r0+rax*4+64] + imul r4d, r1d + lea r0, [r0+r4*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] - and r4d, 6 - imul r4d, r3d - lea r2, [r2+r4+64] + imul eax, r3d + lea r2, [r2+rax*2+64*SIZEOF_PIXEL] prefetcht0 [r2] prefetcht0 [r2+r3] - ret +%ifidn %1, 422 + lea r2, [r2+r3*2] + prefetcht0 [r2] + prefetcht0 [r2+r3] +%endif + RET %else -cglobal x264_prefetch_fenc_mmxext - mov r2, [esp+20] - mov r1, [esp+8] - mov r0, [esp+4] +cglobal prefetch_fenc_%1, 0,3 + mov r2, r4m + mov r1, r1m + mov r0, r0m + FIX_STRIDES r1 and r2, 3 imul r2, r1 - lea r0, [r0+r2*4+64] + lea r0, [r0+r2*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] - mov r2, [esp+20] - mov r1, [esp+16] - mov r0, [esp+12] - and r2, 6 + mov r2, r4m + mov r1, r3m + mov r0, r2m + FIX_STRIDES r1 + and r2, 3 imul r2, r1 - lea r0, [r0+r2+64] + lea r0, [r0+r2*2+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] +%ifidn %1, 422 + lea r0, [r0+r1*2] + prefetcht0 [r0] + prefetcht0 [r0+r1] +%endif ret %endif ; ARCH_X86_64 +%endmacro + +INIT_MMX mmx2 +PREFETCH_FENC 420 +PREFETCH_FENC 422 ;----------------------------------------------------------------------------- -; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity ) +; void prefetch_ref( pixel *pix, intptr_t stride, int parity ) ;----------------------------------------------------------------------------- -cglobal x264_prefetch_ref_mmxext, 3,3 +INIT_MMX mmx2 +cglobal prefetch_ref, 3,3 + FIX_STRIDES r1 dec r2d and r2d, r1d - lea r0, 
[r0+r2*8+64] + lea r0, [r0+r2*8+64*SIZEOF_PIXEL] lea r2, [r1*3] prefetcht0 [r0] prefetcht0 [r0+r1] @@ -682,7 +1461,7 @@ cglobal x264_prefetch_ref_mmxext, 3,3 prefetcht0 [r0+r1] prefetcht0 [r0+r1*2] prefetcht0 [r0+r2] - ret + RET @@ -690,293 +1469,666 @@ cglobal x264_prefetch_ref_mmxext, 3,3 ; chroma MC ;============================================================================= - %define t0d eax - %define t0 rax -%ifdef ARCH_X86_64 - %define t1d r10d +%if ARCH_X86_64 + DECLARE_REG_TMP 6,7,8 %else - %define t1d r1d + DECLARE_REG_TMP 0,1,2 %endif -%macro MC_CHROMA_START 0 - movifnidn r2d, r2m - movifnidn r3d, r3m +%macro MC_CHROMA_START 1 +%if ARCH_X86_64 + PROLOGUE 0,9,%1 +%else + PROLOGUE 0,6,%1 +%endif + movifnidn r3, r3mp movifnidn r4d, r4m movifnidn r5d, r5m - mov t0d, r5d - mov t1d, r4d + movifnidn t0d, r6m + mov t2d, t0d + mov t1d, r5d sar t0d, 3 sar t1d, 3 - imul t0d, r3d - add t0d, t1d + imul t0d, r4d + lea t0d, [t0+t1*2] + FIX_STRIDES t0d movsxdifnidn t0, t0d - add r2, t0 ; src += (dx>>3) + (dy>>3) * src_stride + add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride %endmacro +%if HIGH_BIT_DEPTH +%macro UNPACK_UNALIGNED 4 + movu %1, [%4+0] + movu %2, [%4+4] + punpckhwd %3, %1, %2 + punpcklwd %1, %2 +%if mmsize == 8 + mova %2, %1 + punpcklwd %1, %3 + punpckhwd %2, %3 +%else + shufps %2, %1, %3, q3131 + shufps %1, %3, q2020 +%endif +%endmacro +%else ; !HIGH_BIT_DEPTH +%macro UNPACK_UNALIGNED 3 +%if mmsize == 8 + punpcklwd %1, %3 +%else + movh %2, %3 + punpcklwd %1, %2 +%endif +%endmacro +%endif ; HIGH_BIT_DEPTH + ;----------------------------------------------------------------------------- -; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride, -; uint8_t *src, int src_stride, -; int dx, int dy, -; int width, int height ) +; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride, +; uint8_t *src, intptr_t src_stride, +; int dx, int dy, +; int width, int height ) ;----------------------------------------------------------------------------- -%macro MC_CHROMA 1 -cglobal x264_mc_chroma_%1, 0,6 -%if mmsize == 16 - cmp dword r6m, 4 - jle x264_mc_chroma_mmxext %+ .skip_prologue -%endif -.skip_prologue: - MC_CHROMA_START - pxor m3, m3 - and r4d, 7 ; dx &= 7 +%macro MC_CHROMA 0 +cglobal mc_chroma + MC_CHROMA_START 0 + FIX_STRIDES r4 + and r5d, 7 +%if ARCH_X86_64 jz .mc1dy - and r5d, 7 ; dy &= 7 +%endif + and t2d, 7 +%if ARCH_X86_64 jz .mc1dx - - movd m5, r4d - movd m6, r5d - SPLATW m5, m5 ; m5 = dx - SPLATW m6, m6 ; m6 = dy - - mova m4, [pw_8 GLOBAL] - mova m0, m4 - psubw m4, m5 ; m4 = 8-dx - psubw m0, m6 ; m0 = 8-dy - - mova m7, m5 - pmullw m5, m0 ; m5 = dx*(8-dy) = cB - pmullw m7, m6 ; m7 = dx*dy = cD - pmullw m6, m4 ; m6 = (8-dx)*dy = cC - pmullw m4, m0 ; m4 = (8-dx)*(8-dy) = cA - - mov r4d, r7m -%ifdef ARCH_X86_64 - mov r10, r0 - mov r11, r2 +%endif + shl r5d, 16 + add t2d, r5d + mov t0d, t2d + shl t2d, 8 + sub t2d, t0d + add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y) + cmp dword r7m, 4 +%if mmsize==8 +.skip_prologue: %else - mov r0, r0m - mov r1, r1m - mov r5, r2 + jl mc_chroma_mmx2 %+ .skip_prologue + WIN64_SPILL_XMM 9 %endif - -.loop2d: - movh m1, [r2+r3] - movh m0, [r2] - punpcklbw m1, m3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4 - punpcklbw m0, m3 - pmullw m1, m6 ; 2nd line * cC - pmullw m0, m4 ; 1st line * cA - paddw m0, m1 ; m0 <- result - - movh m2, [r2+1] - movh m1, [r2+r3+1] - punpcklbw m2, m3 - punpcklbw m1, m3 - - paddw m0, [pw_32 GLOBAL] - - pmullw m2, m5 ; line * cB - pmullw m1, m7 ; line * cD + movd m5, t2d + movifnidn r0, r0mp + movifnidn r1, r1mp + 
movifnidn r2d, r2m + movifnidn r5d, r8m + pxor m6, m6 + punpcklbw m5, m6 +%if mmsize==8 + pshufw m7, m5, q3232 + pshufw m6, m5, q0000 + pshufw m5, m5, q1111 + jge .width4 +%else +%if WIN64 + cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM +%endif + pshufd m7, m5, q1111 + punpcklwd m5, m5 + pshufd m6, m5, q0000 + pshufd m5, m5, q1111 + jg .width8 +%endif +%if HIGH_BIT_DEPTH + add r2, r2 + UNPACK_UNALIGNED m0, m1, m2, r3 +%else + movu m0, [r3] + UNPACK_UNALIGNED m0, m1, [r3+2] + mova m1, m0 + pand m0, [pw_00ff] + psrlw m1, 8 +%endif ; HIGH_BIT_DEPTH + pmaddwd m0, m7 + pmaddwd m1, m7 + packssdw m0, m1 + SWAP 3, 0 +ALIGN 4 +.loop2: +%if HIGH_BIT_DEPTH + UNPACK_UNALIGNED m0, m1, m2, r3+r4 + pmullw m3, m6 +%else ; !HIGH_BIT_DEPTH + movu m0, [r3+r4] + UNPACK_UNALIGNED m0, m1, [r3+r4+2] + pmullw m3, m6 + mova m1, m0 + pand m0, [pw_00ff] + psrlw m1, 8 +%endif ; HIGH_BIT_DEPTH + pmaddwd m0, m7 + pmaddwd m1, m7 + mova m2, [pw_32] + packssdw m0, m1 + paddw m2, m3 + mova m3, m0 + pmullw m0, m5 paddw m0, m2 - paddw m0, m1 psrlw m0, 6 - - packuswb m0, m3 ; 00 00 00 00 px1 px2 px3 px4 - movh [r0], m0 - - add r2, r3 - add r0, r1 ; dst_stride - dec r4d - jnz .loop2d - +%if HIGH_BIT_DEPTH + movh [r0], m0 %if mmsize == 8 - sub dword r6m, 8 - jnz .finish ; width != 8 so assume 4 -%ifdef ARCH_X86_64 - lea r0, [r10+4] ; dst - lea r2, [r11+4] ; src + psrlq m0, 32 + movh [r1], m0 %else - mov r0, r0m - lea r2, [r5+4] - add r0, 4 + movhps [r1], m0 %endif - mov r4d, r7m ; height - jmp .loop2d +%else ; !HIGH_BIT_DEPTH + packuswb m0, m0 + movd [r0], m0 +%if mmsize==8 + psrlq m0, 16 %else - REP_RET -%endif ; mmsize + psrldq m0, 4 +%endif + movd [r1], m0 +%endif ; HIGH_BIT_DEPTH + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop2 + RET +%if mmsize==8 +.width4: +%if ARCH_X86_64 + mov t0, r0 + mov t1, r1 + mov t2, r3 +%if WIN64 + %define multy0 r4m +%else + %define multy0 [rsp-8] +%endif + mova multy0, m5 +%else + mov r3m, r3 + %define multy0 r4m + mova multy0, m5 +%endif +%else +.width8: +%if ARCH_X86_64 + %define multy0 m8 + SWAP 8, 5 +%else + %define multy0 r0m + mova multy0, m5 +%endif +%endif + FIX_STRIDES r2 +.loopx: +%if HIGH_BIT_DEPTH + UNPACK_UNALIGNED m0, m2, m4, r3 + UNPACK_UNALIGNED m1, m3, m5, r3+mmsize +%else + movu m0, [r3] + movu m1, [r3+mmsize/2] + UNPACK_UNALIGNED m0, m2, [r3+2] + UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] + psrlw m2, m0, 8 + psrlw m3, m1, 8 + pand m0, [pw_00ff] + pand m1, [pw_00ff] +%endif + pmaddwd m0, m7 + pmaddwd m2, m7 + pmaddwd m1, m7 + pmaddwd m3, m7 + packssdw m0, m2 + packssdw m1, m3 + SWAP 4, 0 + SWAP 5, 1 + add r3, r4 +ALIGN 4 +.loop4: +%if HIGH_BIT_DEPTH + UNPACK_UNALIGNED m0, m1, m2, r3 + pmaddwd m0, m7 + pmaddwd m1, m7 + packssdw m0, m1 + UNPACK_UNALIGNED m1, m2, m3, r3+mmsize + pmaddwd m1, m7 + pmaddwd m2, m7 + packssdw m1, m2 +%else ; !HIGH_BIT_DEPTH + movu m0, [r3] + movu m1, [r3+mmsize/2] + UNPACK_UNALIGNED m0, m2, [r3+2] + UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] + psrlw m2, m0, 8 + psrlw m3, m1, 8 + pand m0, [pw_00ff] + pand m1, [pw_00ff] + pmaddwd m0, m7 + pmaddwd m2, m7 + pmaddwd m1, m7 + pmaddwd m3, m7 + packssdw m0, m2 + packssdw m1, m3 +%endif ; HIGH_BIT_DEPTH + pmullw m4, m6 + pmullw m5, m6 + mova m2, [pw_32] + paddw m3, m2, m5 + paddw m2, m4 + mova m4, m0 + mova m5, m1 + pmullw m0, multy0 + pmullw m1, multy0 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 6 + psrlw m1, 6 +%if HIGH_BIT_DEPTH + movh [r0], m0 + movh [r0+mmsize/2], m1 +%if mmsize==8 + psrlq m0, 32 + psrlq m1, 32 + movh [r1], m0 + movh [r1+mmsize/2], m1 +%else + movhps [r1], m0 + movhps 
[r1+mmsize/2], m1 +%endif +%else ; !HIGH_BIT_DEPTH + packuswb m0, m1 +%if mmsize==8 + pshufw m1, m0, q0020 + pshufw m0, m0, q0031 + movd [r0], m1 + movd [r1], m0 +%else + pshufd m0, m0, q3120 + movq [r0], m0 + movhps [r1], m0 +%endif +%endif ; HIGH_BIT_DEPTH + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop4 +%if mmsize!=8 + RET +%else + sub dword r7m, 4 + jg .width8 + RET +.width8: +%if ARCH_X86_64 + lea r3, [t2+8*SIZEOF_PIXEL] + lea r0, [t0+4*SIZEOF_PIXEL] + lea r1, [t1+4*SIZEOF_PIXEL] +%else + mov r3, r3m + mov r0, r0m + mov r1, r1m + add r3, 8*SIZEOF_PIXEL + add r0, 4*SIZEOF_PIXEL + add r1, 4*SIZEOF_PIXEL +%endif + mov r5d, r8m + jmp .loopx +%endif + +%if ARCH_X86_64 ; too many regs for x86_32 + RESET_MM_PERMUTATION +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + %assign xmm_regs_used 0 +%endif .mc1dy: - and r5d, 7 - movd m6, r5d - mov r5, r3 ; pel_offset = dx ? 1 : src_stride + and t2d, 7 + movd m5, t2d + mov r6d, r4d ; pel_offset = dx ? 2 : src_stride jmp .mc1d .mc1dx: - movd m6, r4d - mov r5d, 1 + movd m5, r5d + mov r6d, 2*SIZEOF_PIXEL .mc1d: - mova m5, [pw_8 GLOBAL] - SPLATW m6, m6 - mova m7, [pw_4 GLOBAL] - psubw m5, m6 - movifnidn r0d, r0m - movifnidn r1d, r1m - mov r4d, r7m -%if mmsize == 8 - cmp dword r6m, 8 - je .loop1d_w8 +%if HIGH_BIT_DEPTH && mmsize == 16 + WIN64_SPILL_XMM 8 +%endif + mova m4, [pw_8] + SPLATW m5, m5 + psubw m4, m5 + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + FIX_STRIDES r2 + movifnidn r5d, r8m + cmp dword r7m, 4 + jg .mc1d_w8 + mov r7, r2 + mov r8, r4 +%if mmsize!=8 + shr r5d, 1 %endif - .loop1d_w4: - movh m0, [r2+r5] - movh m1, [r2] - punpcklbw m0, m3 - punpcklbw m1, m3 - pmullw m0, m6 - pmullw m1, m5 - paddw m0, m7 - paddw m0, m1 - psrlw m0, 3 - packuswb m0, m3 - movh [r0], m0 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop1d_w4 -.finish: - REP_RET - +%if HIGH_BIT_DEPTH %if mmsize == 8 -.loop1d_w8: - movu m0, [r2+r5] - mova m1, [r2] - mova m2, m0 - mova m4, m1 - punpcklbw m0, m3 - punpcklbw m1, m3 - punpckhbw m2, m3 - punpckhbw m4, m3 - pmullw m0, m6 + movq m0, [r3+0] + movq m2, [r3+8] + movq m1, [r3+r6+0] + movq m3, [r3+r6+8] +%else + movu m0, [r3] + movu m1, [r3+r6] + add r3, r8 + movu m2, [r3] + movu m3, [r3+r6] +%endif + SBUTTERFLY wd, 0, 2, 6 + SBUTTERFLY wd, 1, 3, 7 + SBUTTERFLY wd, 0, 2, 6 + SBUTTERFLY wd, 1, 3, 7 +%if mmsize == 16 + SBUTTERFLY wd, 0, 2, 6 + SBUTTERFLY wd, 1, 3, 7 +%endif +%else ; !HIGH_BIT_DEPTH + movq m0, [r3] + movq m1, [r3+r6] +%if mmsize!=8 + add r3, r8 + movhps m0, [r3] + movhps m1, [r3+r6] +%endif + psrlw m2, m0, 8 + psrlw m3, m1, 8 + pand m0, [pw_00ff] + pand m1, [pw_00ff] +%endif ; HIGH_BIT_DEPTH + pmullw m0, m4 pmullw m1, m5 - pmullw m2, m6 - pmullw m4, m5 - paddw m0, m7 - paddw m2, m7 + pmullw m2, m4 + pmullw m3, m5 + paddw m0, [pw_4] + paddw m2, [pw_4] paddw m0, m1 - paddw m2, m4 + paddw m2, m3 psrlw m0, 3 psrlw m2, 3 +%if HIGH_BIT_DEPTH +%if mmsize == 8 + xchg r4, r8 + xchg r2, r7 +%endif + movq [r0], m0 + movq [r1], m2 +%if mmsize == 16 + add r0, r7 + add r1, r7 + movhps [r0], m0 + movhps [r1], m2 +%endif +%else ; !HIGH_BIT_DEPTH packuswb m0, m2 - mova [r0], m0 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop1d_w8 - REP_RET -%endif ; mmsize +%if mmsize==8 + xchg r4, r8 + xchg r2, r7 + movd [r0], m0 + psrlq m0, 32 + movd [r1], m0 +%else + movhlps m1, m0 + movd [r0], m0 + movd [r1], m1 + add r0, r7 + add r1, r7 + psrldq m0, 4 + psrldq m1, 4 + movd [r0], m0 + movd [r1], m1 +%endif +%endif ; HIGH_BIT_DEPTH + add r3, r4 + add r0, r2 + add r1, 
r2 + dec r5d + jg .loop1d_w4 + RET +.mc1d_w8: + sub r2, 4*SIZEOF_PIXEL + sub r4, 8*SIZEOF_PIXEL + mov r7, 4*SIZEOF_PIXEL + mov r8, 8*SIZEOF_PIXEL +%if mmsize==8 + shl r5d, 1 +%endif + jmp .loop1d_w4 +%endif ; ARCH_X86_64 %endmacro ; MC_CHROMA -INIT_MMX -MC_CHROMA mmxext -INIT_XMM -MC_CHROMA sse2 - -INIT_MMX -cglobal x264_mc_chroma_ssse3, 0,6 - MC_CHROMA_START - and r4d, 7 +%macro MC_CHROMA_SSSE3 0 +cglobal mc_chroma + MC_CHROMA_START 10-cpuflag(avx2) and r5d, 7 - mov t0d, r4d + and t2d, 7 + mov t0d, r5d shl t0d, 8 - sub t0d, r4d - mov r4d, 8 + sub t0d, r5d + mov r5d, 8 add t0d, 8 - sub r4d, r5d - imul r5d, t0d ; (x*255+8)*y - imul r4d, t0d ; (x*255+8)*(8-y) - cmp dword r6m, 4 - jg .width8 - mova m5, [pw_32 GLOBAL] - movd m6, r5d - movd m7, r4d - movifnidn r0d, r0m - movifnidn r1d, r1m - movifnidn r4d, r7m + sub r5d, t2d + imul t2d, t0d ; (x*255+8)*y + imul r5d, t0d ; (x*255+8)*(8-y) + movd xm6, t2d + movd xm7, r5d +%if cpuflag(cache64) + mov t0d, r3d + and t0d, 7 +%ifdef PIC + lea t1, [ch_shuf_adj] + movddup xm5, [t1 + t0*4] +%else + movddup xm5, [ch_shuf_adj + t0*4] +%endif + paddb xm5, [ch_shuf] + and r3, ~7 +%else + mova m5, [ch_shuf] +%endif + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m +%if cpuflag(avx2) + vpbroadcastw m6, xm6 + vpbroadcastw m7, xm7 +%else SPLATW m6, m6 SPLATW m7, m7 - movh m0, [r2] - punpcklbw m0, [r2+1] - add r2, r3 +%endif +%if ARCH_X86_64 + %define shiftround m8 + mova m8, [pw_512] +%else + %define shiftround [pw_512] +%endif + cmp dword r7m, 4 + jg .width8 + +%if cpuflag(avx2) .loop4: - movh m1, [r2] - movh m3, [r2+r3] - punpcklbw m1, [r2+1] - punpcklbw m3, [r2+r3+1] - lea r2, [r2+2*r3] - mova m2, m1 - mova m4, m3 + movu xm0, [r3] + movu xm1, [r3+r4] + vinserti128 m0, m0, [r3+r4], 1 + vinserti128 m1, m1, [r3+r4*2], 1 + pshufb m0, m5 + pshufb m1, m5 pmaddubsw m0, m7 pmaddubsw m1, m6 - pmaddubsw m2, m7 - pmaddubsw m3, m6 - paddw m0, m5 - paddw m2, m5 - paddw m1, m0 - paddw m3, m2 - mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 - packuswb m1, m1 - packuswb m3, m3 - movh [r0], m1 - movh [r0+r1], m3 - sub r4d, 2 - lea r0, [r0+2*r1] + paddw m0, m1 + pmulhrsw m0, shiftround + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [r0], xm0 + movd [r0+r2], xm1 + psrldq xm0, 4 + psrldq xm1, 4 + movd [r1], xm0 + movd [r1+r2], xm1 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 jg .loop4 - REP_RET - -INIT_XMM + RET .width8: - mova m5, [pw_32 GLOBAL] - movd m6, r5d - movd m7, r4d - movifnidn r0d, r0m - movifnidn r1d, r1m - movifnidn r4d, r7m - SPLATW m6, m6 - SPLATW m7, m7 - movh m0, [r2] - movh m1, [r2+1] - punpcklbw m0, m1 - add r2, r3 + movu xm0, [r3] + vinserti128 m0, m0, [r3+8], 1 + pshufb m0, m5 .loop8: - movh m1, [r2] - movh m2, [r2+1] - movh m3, [r2+r3] - movh m4, [r2+r3+1] - punpcklbw m1, m2 - punpcklbw m3, m4 - lea r2, [r2+2*r3] - mova m2, m1 + movu xm3, [r3+r4] + vinserti128 m3, m3, [r3+r4+8], 1 + pshufb m3, m5 + pmaddubsw m1, m0, m7 + pmaddubsw m2, m3, m6 + pmaddubsw m3, m3, m7 + + movu xm0, [r3+r4*2] + vinserti128 m0, m0, [r3+r4*2+8], 1 + pshufb m0, m5 + pmaddubsw m4, m0, m6 + + paddw m1, m2 + paddw m3, m4 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround + packuswb m1, m3 + mova m2, [deinterleave_shufd] + vpermd m1, m2, m1 + vextracti128 xm2, m1, 1 + movq [r0], xm1 + movhps [r1], xm1 + movq [r0+r2], xm2 + movhps [r1+r2], xm2 +%else + movu m0, [r3] + pshufb m0, m5 +.loop4: + movu m1, [r3+r4] + pshufb m1, m5 + movu m3, [r3+r4*2] + pshufb m3, m5 mova m4, m3 pmaddubsw m0, m7 + pmaddubsw m2, m1, m7 pmaddubsw m1, 
m6 - pmaddubsw m2, m7 pmaddubsw m3, m6 - paddw m0, m5 - paddw m2, m5 paddw m1, m0 paddw m3, m2 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 packuswb m1, m3 - movh [r0], m1 - movhps [r0+r1], m1 - sub r4d, 2 - lea r0, [r0+2*r1] + movd [r0], m1 +%if cpuflag(sse4) + pextrd [r1], m1, 1 + pextrd [r0+r2], m1, 2 + pextrd [r1+r2], m1, 3 +%else + movhlps m3, m1 + movd [r0+r2], m3 + psrldq m1, 4 + psrldq m3, 4 + movd [r1], m1 + movd [r1+r2], m3 +%endif + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loop4 + RET +.width8: + movu m0, [r3] + pshufb m0, m5 + movu m1, [r3+8] + pshufb m1, m5 +%if ARCH_X86_64 + SWAP 9, 6 + %define mult1 m9 +%else + mova r0m, m6 + %define mult1 r0m +%endif +.loop8: + movu m2, [r3+r4] + pshufb m2, m5 + movu m3, [r3+r4+8] + pshufb m3, m5 + mova m4, m2 + mova m6, m3 + pmaddubsw m0, m7 + pmaddubsw m1, m7 + pmaddubsw m2, mult1 + pmaddubsw m3, mult1 + paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, shiftround ; x + 32 >> 6 + pmulhrsw m1, shiftround + packuswb m0, m1 + pshufd m0, m0, q3120 + movq [r0], m0 + movhps [r1], m0 + + movu m2, [r3+r4*2] + pshufb m2, m5 + movu m3, [r3+r4*2+8] + pshufb m3, m5 + mova m0, m2 + mova m1, m3 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pmaddubsw m2, mult1 + pmaddubsw m3, mult1 + paddw m2, m4 + paddw m3, m6 + pmulhrsw m2, shiftround + pmulhrsw m3, shiftround + packuswb m2, m3 + pshufd m2, m2, q3120 + movq [r0+r2], m2 + movhps [r1+r2], m2 +%endif + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 jg .loop8 - REP_RET - -; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size + RET +%endmacro +%if HIGH_BIT_DEPTH +INIT_MMX mmx2 +MC_CHROMA +INIT_XMM sse2 +MC_CHROMA +INIT_XMM avx +MC_CHROMA +%else ; !HIGH_BIT_DEPTH +INIT_MMX mmx2 +MC_CHROMA +INIT_XMM sse2 +MC_CHROMA +INIT_XMM ssse3 +MC_CHROMA_SSSE3 +INIT_XMM ssse3, cache64 +MC_CHROMA_SSSE3 +INIT_XMM avx +MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64 +INIT_YMM avx2 +MC_CHROMA_SSSE3 +%endif ; HIGH_BIT_DEPTH
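; A hedged C sketch (as comments) of the bilinear filter the MC_CHROMA paths
; above implement, for one output pixel of one plane; dx/dy are the 1/8-pel
; fractional offsets already reduced to 0..7, and the helper name is
; illustrative, not x264's API.
;
;   #include <stdint.h>
;   static uint8_t mc_chroma_pel_ref( const uint8_t *src, intptr_t stride,
;                                     int dx, int dy )
;   {
;       int cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy;
;       return ( cA*src[0]      + cB*src[1]
;              + cC*src[stride] + cD*src[stride+1] + 32 ) >> 6;
;   }
;
; When dx or dy is 0, the code drops to the cheaper 1-D .mc1d path,
; ( (8-d)*A + d*B + 4 ) >> 3.  The SSSE3/AVX2 variants exploit
; x*255 + 8 == ((8-x) | (x<<8)) to build all four weights as packed bytes for
; pmaddubsw, and pmulhrsw against pw_512 supplies the (v+32)>>6 rounding.
; The even/odd byte de-interleaving (pand pw_00ff / psrlw 8) reflects that the
; chroma source here is interleaved U/V, written out separately to dstu/dstv.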