X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fx86%2Fh264_intrapred.asm;h=50a615b054cc925584ad2bd5484bd6edd2c71652;hb=b829b4ce29185625ab8cbcf0ce7a83cf8181ac3b;hp=dbe6b8ad61fcd847696997d7b2c9ec15a8586893;hpb=b9c7f66e6da8ac77eaa0c3fb6d476e6fc929b3c9;p=ffmpeg

diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index dbe6b8ad61f..50a615b054c 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,21 +5,21 @@
 ;* Copyright (c) 2010 Loren Merritt
 ;* Copyright (c) 2010 Ronald S. Bultje
 ;*
-;* This file is part of FFmpeg.
+;* This file is part of Libav.
 ;*
-;* FFmpeg is free software; you can redistribute it and/or
+;* Libav is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* FFmpeg is distributed in the hope that it will be useful,
+;* Libav is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 %include "x86inc.asm"
@@ -87,31 +87,24 @@ cglobal pred16x16_vertical_sse, 2,3
 ; void pred16x16_horizontal(uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
-%macro PRED16x16_H 1
-cglobal pred16x16_horizontal_%1, 2,3
+%macro PRED16x16_H 0
+cglobal pred16x16_horizontal, 2,3
 mov r2, 8
-%ifidn %1, ssse3
+%if cpuflag(ssse3)
 mova m2, [pb_3]
 %endif
 .loop:
 movd m0, [r0+r1*0-4]
 movd m1, [r0+r1*1-4]
-%ifidn %1, ssse3
+%if cpuflag(ssse3)
 pshufb m0, m2
 pshufb m1, m2
 %else
 punpcklbw m0, m0
 punpcklbw m1, m1
-%ifidn %1, mmxext
- pshufw m0, m0, 0xff
- pshufw m1, m1, 0xff
-%else
- punpckhwd m0, m0
- punpckhwd m1, m1
- punpckhdq m0, m0
- punpckhdq m1, m1
-%endif
+ SPLATW m0, m0, 3
+ SPLATW m1, m1, 3
 mova [r0+r1*0+8], m0
 mova [r0+r1*1+8], m1
 %endif
@@ -124,18 +117,20 @@ cglobal pred16x16_horizontal_%1, 2,3
 REP_RET
 %endmacro
-INIT_MMX
-PRED16x16_H mmx
-PRED16x16_H mmxext
+INIT_MMX mmx
+PRED16x16_H
+INIT_MMX mmx2
+PRED16x16_H
+INIT_XMM ssse3
+PRED16x16_H
 INIT_XMM
-PRED16x16_H ssse3
 ;-----------------------------------------------------------------------------
 ; void pred16x16_dc(uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
-%macro PRED16x16_DC 1
-cglobal pred16x16_dc_%1, 2,7
+%macro PRED16x16_DC 0
+cglobal pred16x16_dc, 2,7
 mov r4, r0
 sub r0, r1
 pxor mm0, mm0
@@ -158,20 +153,10 @@ cglobal pred16x16_dc_%1, 2,7
 add r5d, r6d
 lea r2d, [r2+r5+16]
 shr r2d, 5
-%ifidn %1, mmxext
- movd m0, r2d
- punpcklbw m0, m0
- pshufw m0, m0, 0
-%elifidn %1, sse2
- movd m0, r2d
- punpcklbw m0, m0
- pshuflw m0, m0, 0
- punpcklqdq m0, m0
-%elifidn %1, ssse3
+%if cpuflag(ssse3)
 pxor m1, m1
- movd m0, r2d
- pshufb m0, m1
 %endif
+ SPLATB_REG m0, r2, m1
 %if mmsize==8
 mov r3d, 8
@@ -195,18 +180,20 @@
 REP_RET
 %endmacro
-INIT_MMX
-PRED16x16_DC mmxext
+INIT_MMX mmx2
+PRED16x16_DC
+INIT_XMM sse2
+PRED16x16_DC
+INIT_XMM ssse3
+PRED16x16_DC
 INIT_XMM
-PRED16x16_DC sse2
-PRED16x16_DC ssse3
 ;-----------------------------------------------------------------------------
 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
-%macro PRED16x16_TM_MMX 1
-cglobal pred16x16_tm_vp8_%1, 2,5
+%macro PRED16x16_TM_MMX 0
+cglobal pred16x16_tm_vp8, 2,5
 sub r0, r1
 pxor mm7, mm7
 movq mm0, [r0+0]
@@ -223,12 +210,7 @@ cglobal pred16x16_tm_vp8_%1, 2,5
 movzx r2d, byte [r0+r1-1]
 sub r2d, r3d
 movd mm4, r2d
-%ifidn %1, mmx
- punpcklwd mm4, mm4
- punpckldq mm4, mm4
-%else
- pshufw mm4, mm4, 0
-%endif
+ SPLATW mm4, mm4, 0
 movq mm5, mm4
 movq mm6, mm4
 movq mm7, mm4
@@ -246,8 +228,11 @@ cglobal pred16x16_tm_vp8_%1, 2,5
 REP_RET
 %endmacro
-PRED16x16_TM_MMX mmx
-PRED16x16_TM_MMX mmxext
+INIT_MMX mmx
+PRED16x16_TM_MMX
+INIT_MMX mmx2
+PRED16x16_TM_MMX
+INIT_MMX
 cglobal pred16x16_tm_vp8_sse2, 2,6,6
 sub r0, r1
@@ -288,8 +273,8 @@ cglobal pred16x16_tm_vp8_sse2, 2,6,6
 ; void pred16x16_plane(uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
-%macro H264_PRED16x16_PLANE 3
-cglobal pred16x16_plane_%3_%1, 2, 7, %2
+%macro H264_PRED16x16_PLANE 1
+cglobal pred16x16_plane_%1, 2,9,7
 mov r2, r1 ; +stride
 neg r1 ; -stride
@@ -310,7 +295,10 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 paddw m0, m2
 paddw m1, m3
 %else ; mmsize == 16
-%ifidn %1, sse2
+%if cpuflag(ssse3)
+ movhps m0, [r0+r1 +8]
+ pmaddubsw m0, [plane_shuf] ; H coefficients
+%else ; sse2
 pxor m2, m2
 movh m1, [r0+r1 +8]
 punpcklbw m0, m2
@@ -318,29 +306,22 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 pmullw m0, [pw_m8tom1]
 pmullw m1, [pw_1to8]
 paddw m0, m1
-%else ; ssse3
- movhps m0, [r0+r1 +8]
- pmaddubsw m0, [plane_shuf] ; H coefficients
 %endif
 movhlps m1, m0
 %endif
 paddw m0, m1
-%ifidn %1, mmx
+%if cpuflag(mmx2)
+ PSHUFLW m1, m0, 0xE
+%elif cpuflag(mmx)
 mova m1, m0
 psrlq m1, 32
-%elifidn %1, mmx2
- pshufw m1, m0, 0xE
-%else ; mmsize == 16
- pshuflw m1, m0, 0xE
 %endif
 paddw m0, m1
-%ifidn %1, mmx
+%if cpuflag(mmx2)
+ PSHUFLW m1, m0, 0x1
+%elif cpuflag(mmx)
 mova m1, m0
 psrlq m1, 16
-%elifidn %1, mmx2
- pshufw m1, m0, 0x1
-%else
- pshuflw m1, m0, 0x1
 %endif
 paddw m0, m1 ; sum of H coefficients
@@ -348,8 +329,8 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 lea r3, [r0+r2*4-1]
 add r4, r2
-%ifdef ARCH_X86_64
-%define e_reg r11
+%if ARCH_X86_64
+%define e_reg r8
 %else
 %define e_reg r0
 %endif
@@ -369,9 +350,9 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 lea r5, [r5+r6*4]
 movzx e_reg, byte [r3 ]
-%ifdef ARCH_X86_64
- movzx r10, byte [r4+r2 ]
- sub r10, e_reg
+%if ARCH_X86_64
+ movzx r7, byte [r4+r2 ]
+ sub r7, e_reg
 %else
 movzx r6, byte [r4+r2 ]
 sub r6, e_reg
@@ -385,8 +366,8 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 movzx r4, byte [e_reg+r2 ]
 movzx r6, byte [r3 ]
 sub r6, r4
-%ifdef ARCH_X86_64
- lea r6, [r10+r6*2]
+%if ARCH_X86_64
+ lea r6, [r7+r6*2]
 lea r5, [r5+r6*2]
 add r5, r6
 %else
@@ -395,10 +376,10 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 %endif
 movzx r4, byte [e_reg ]
-%ifdef ARCH_X86_64
- movzx r10, byte [r3 +r2 ]
- sub r10, r4
- sub r5, r10
+%if ARCH_X86_64
+ movzx r7, byte [r3 +r2 ]
+ sub r7, r4
+ sub r5, r7
 %else
 movzx r6, byte [r3 +r2 ]
 sub r6, r4
@@ -409,8 +390,8 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 movzx r4, byte [e_reg+r1 ]
 movzx r6, byte [r3 +r2*2]
 sub r6, r4
-%ifdef ARCH_X86_64
- add r6, r10
+%if ARCH_X86_64
+ add r6, r7
 %endif
 lea r5, [r5+r6*8]
@@ -420,17 +401,17 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 lea r5, [r5+r6*4]
 add r5, r6 ; sum of V coefficients
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r0, r0m
 %endif
-%ifidn %3, h264
+%ifidn %1, h264
 lea r5, [r5*5+32]
 sar r5, 6
-%elifidn %3, rv40
+%elifidn %1, rv40
 lea r5, [r5*5]
 sar r5, 6
-%elifidn %3, svq3
+%elifidn %1, svq3
 test r5, r5
 lea r6, [r5+3]
 cmovs r5, r6
@@ -449,8 +430,8 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 movd r1d, m0
 movsx r1d, r1w
-%ifnidn %3, svq3
-%ifidn %3, h264
+%ifnidn %1, svq3
+%ifidn %1, h264
 lea r1d, [r1d*5+32]
 %else ; rv40
 lea r1d, [r1d*5]
@@ -476,26 +457,10 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 movd m1, r5d
 movd m3, r3d
-%ifidn %1, mmx
- punpcklwd m0, m0
- punpcklwd m1, m1
- punpcklwd m3, m3
- punpckldq m0, m0
- punpckldq m1, m1
- punpckldq m3, m3
-%elifidn %1, mmx2
- pshufw m0, m0, 0x0
- pshufw m1, m1, 0x0
- pshufw m3, m3, 0x0
-%else
- pshuflw m0, m0, 0x0
- pshuflw m1, m1, 0x0
- pshuflw m3, m3, 0x0
- punpcklqdq m0, m0 ; splat H (words)
- punpcklqdq m1, m1 ; splat V (words)
- punpcklqdq m3, m3 ; splat a (words)
-%endif
-%ifidn %3, svq3
+ SPLATW m0, m0, 0 ; H
+ SPLATW m1, m1, 0 ; V
+ SPLATW m3, m3, 0 ; a
+%ifidn %1, svq3
 SWAP 0, 1
 %endif
 mova m2, m0
@@ -568,27 +533,30 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 REP_RET
 %endmacro
-INIT_MMX
-H264_PRED16x16_PLANE mmx, 0, h264
-H264_PRED16x16_PLANE mmx, 0, rv40
-H264_PRED16x16_PLANE mmx, 0, svq3
-H264_PRED16x16_PLANE mmx2, 0, h264
-H264_PRED16x16_PLANE mmx2, 0, rv40
-H264_PRED16x16_PLANE mmx2, 0, svq3
+INIT_MMX mmx
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_MMX mmx2
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_XMM sse2
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_XMM ssse3
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
 INIT_XMM
-H264_PRED16x16_PLANE sse2, 8, h264
-H264_PRED16x16_PLANE sse2, 8, rv40
-H264_PRED16x16_PLANE sse2, 8, svq3
-H264_PRED16x16_PLANE ssse3, 8, h264
-H264_PRED16x16_PLANE ssse3, 8, rv40
-H264_PRED16x16_PLANE ssse3, 8, svq3
 ;-----------------------------------------------------------------------------
 ; void pred8x8_plane(uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
-%macro H264_PRED8x8_PLANE 2
-cglobal pred8x8_plane_%1, 2, 7, %2
+%macro H264_PRED8x8_PLANE 0
+cglobal pred8x8_plane, 2,9,7
 mov r2, r1 ; +stride
 neg r1 ; -stride
@@ -601,52 +569,44 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 pmullw m0, [pw_m4to4]
 pmullw m1, [pw_m4to4+8]
 %else ; mmsize == 16
-%ifidn %1, sse2
+%if cpuflag(ssse3)
+ movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
+ pmaddubsw m0, [plane8_shuf] ; H coefficients
+%else ; sse2
 pxor m2, m2
 movd m1, [r0+r1 +4]
 punpckldq m0, m1
 punpcklbw m0, m2
 pmullw m0, [pw_m4to4]
-%else ; ssse3
- movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
- pmaddubsw m0, [plane8_shuf] ; H coefficients
 %endif
 movhlps m1, m0
 %endif
 paddw m0, m1
-%ifnidn %1, ssse3
-%ifidn %1, mmx
+%if notcpuflag(ssse3)
+%if cpuflag(mmx2)
+ PSHUFLW m1, m0, 0xE
+%elif cpuflag(mmx)
 mova m1, m0
 psrlq m1, 32
-%elifidn %1, mmx2
- pshufw m1, m0, 0xE
-%else ; mmsize == 16
- pshuflw m1, m0, 0xE
 %endif
 paddw m0, m1
 %endif ; !ssse3
-%ifidn %1, mmx
+%if cpuflag(mmx2)
+ PSHUFLW m1, m0, 0x1
+%elif cpuflag(mmx)
 mova m1, m0
 psrlq m1, 16
-%elifidn %1, mmx2
- pshufw m1, m0, 0x1
-%else
- pshuflw m1, m0, 0x1
 %endif
 paddw m0, m1 ; sum of H coefficients
- pmullw m0, [pw_17]
- paddw m0, [pw_16]
- psraw m0, 5
 lea r4, [r0+r2*4-1]
 lea r3, [r0 -1]
 add r4, r2
-%ifdef ARCH_X86_64
-%define e_reg r11
+%if ARCH_X86_64
+%define e_reg r8
 %else
 %define e_reg r0
 %endif
@@ -656,10 +616,10 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 sub r5, e_reg
 movzx e_reg, byte [r3 ]
-%ifdef ARCH_X86_64
- movzx r10, byte [r4+r2 ]
- sub r10, e_reg
- sub r5, r10
+%if ARCH_X86_64
+ movzx r7, byte [r4+r2 ]
+ sub r7, e_reg
+ sub r5, r7
 %else
 movzx r6, byte [r4+r2 ]
 sub r6, e_reg
@@ -670,8 +630,8 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 movzx e_reg, byte [r3+r1 ]
 movzx r6, byte [r4+r2*2 ]
 sub r6, e_reg
-%ifdef ARCH_X86_64
- add r6, r10
+%if ARCH_X86_64
+ add r6, r7
 %endif
 lea r5, [r5+r6*4]
@@ -684,7 +644,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 lea r5, [r5+r6*8]
 sar r5, 5
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r0, r0m
 %endif
@@ -694,6 +654,10 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 shl r3, 4
 movd r1d, m0
 movsx r1d, r1w
+ imul r1d, 17
+ add r1d, 16
+ sar r1d, 5
+ movd m0, r1d
 add r1d, r5d
 sub r3d, r1d
 add r1d, r1d
@@ -701,25 +665,9 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 movd m1, r5d
 movd m3, r3d
-%ifidn %1, mmx
- punpcklwd m0, m0
- punpcklwd m1, m1
- punpcklwd m3, m3
- punpckldq m0, m0
- punpckldq m1, m1
- punpckldq m3, m3
-%elifidn %1, mmx2
- pshufw m0, m0, 0x0
- pshufw m1, m1, 0x0
- pshufw m3, m3, 0x0
-%else
- pshuflw m0, m0, 0x0
- pshuflw m1, m1, 0x0
- pshuflw m3, m3, 0x0
- punpcklqdq m0, m0 ; splat H (words)
- punpcklqdq m1, m1 ; splat V (words)
- punpcklqdq m3, m3 ; splat a (words)
-%endif
+ SPLATW m0, m0, 0 ; H
+ SPLATW m1, m1, 0 ; V
+ SPLATW m3, m3, 0 ; a
 %if mmsize == 8
 mova m2, m0
 %endif
@@ -768,12 +716,15 @@ ALIGN 16
 REP_RET
 %endmacro
-INIT_MMX
-H264_PRED8x8_PLANE mmx, 0
-H264_PRED8x8_PLANE mmx2, 0
+INIT_MMX mmx
+H264_PRED8x8_PLANE
+INIT_MMX mmx2
+H264_PRED8x8_PLANE
+INIT_XMM sse2
+H264_PRED8x8_PLANE
+INIT_XMM ssse3
+H264_PRED8x8_PLANE
 INIT_XMM
-H264_PRED8x8_PLANE sse2, 8
-H264_PRED8x8_PLANE ssse3, 8
 ;-----------------------------------------------------------------------------
 ; void pred8x8_vertical(uint8_t *src, int stride)
@@ -795,31 +746,15 @@ cglobal pred8x8_vertical_mmx, 2,2
 ; void pred8x8_horizontal(uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
-%macro PRED8x8_H 1
-cglobal pred8x8_horizontal_%1, 2,3
+%macro PRED8x8_H 0
+cglobal pred8x8_horizontal, 2,3
 mov r2, 4
-%ifidn %1, ssse3
+%if cpuflag(ssse3)
 mova m2, [pb_3]
 %endif
 .loop:
- movd m0, [r0+r1*0-4]
- movd m1, [r0+r1*1-4]
-%ifidn %1, ssse3
- pshufb m0, m2
- pshufb m1, m2
-%else
- punpcklbw m0, m0
- punpcklbw m1, m1
-%ifidn %1, mmxext
- pshufw m0, m0, 0xff
- pshufw m1, m1, 0xff
-%else
- punpckhwd m0, m0
- punpckhwd m1, m1
- punpckhdq m0, m0
- punpckhdq m1, m1
-%endif
-%endif
+ SPLATB_LOAD m0, r0+r1*0-1, m2
+ SPLATB_LOAD m1, r0+r1*1-1, m2
 mova [r0+r1*0], m0
 mova [r0+r1*1], m1
 lea r0, [r0+r1*2]
@@ -828,15 +763,17 @@ cglobal pred8x8_horizontal_%1, 2,3
 REP_RET
 %endmacro
+INIT_MMX mmx
+PRED8x8_H
+INIT_MMX mmx2
+PRED8x8_H
+INIT_MMX ssse3
+PRED8x8_H
 INIT_MMX
-PRED8x8_H mmx
-PRED8x8_H mmxext
-PRED8x8_H ssse3
 ;-----------------------------------------------------------------------------
 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
-%ifdef CONFIG_GPL
 cglobal pred8x8_top_dc_mmxext, 2,5
 sub r0, r1
 movq mm0, [r0]
@@ -927,7 +864,6 @@ cglobal pred8x8_dc_mmxext, 2,5
 movq [r4+r1*1], m1
 movq [r4+r1*2], m1
 RET
-%endif
 ;-----------------------------------------------------------------------------
 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
@@ -969,8 +905,8 @@ cglobal pred8x8_dc_rv40_mmxext, 2,7
 ;-----------------------------------------------------------------------------
 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
-%macro PRED8x8_TM_MMX 1
-cglobal pred8x8_tm_vp8_%1, 2,6
+%macro PRED8x8_TM_MMX 0
+cglobal pred8x8_tm_vp8, 2,6
 sub r0, r1
 pxor mm7, mm7
 movq mm0, [r0]
@@ -986,15 +922,8 @@ cglobal pred8x8_tm_vp8_%1, 2,6
 sub r3d, r4d
 movd mm2, r2d
 movd mm4, r3d
-%ifidn %1, mmx
- punpcklwd mm2, mm2
- punpcklwd mm4, mm4
- punpckldq mm2, mm2
- punpckldq mm4, mm4
-%else
- pshufw mm2, mm2, 0
- pshufw mm4, mm4, 0
-%endif
+ SPLATW mm2, mm2, 0
+ SPLATW mm4, mm4, 0
 movq mm3, mm2
 movq mm5, mm4
 paddw mm2, mm0
@@ -1011,8 +940,11 @@ cglobal pred8x8_tm_vp8_%1, 2,6
 REP_RET
 %endmacro
-PRED8x8_TM_MMX mmx
-PRED8x8_TM_MMX mmxext
+INIT_MMX mmx
+PRED8x8_TM_MMX
+INIT_MMX mmx2
+PRED8x8_TM_MMX
+INIT_MMX
 cglobal pred8x8_tm_vp8_sse2, 2,6,4
 sub r0, r1
@@ -1083,7 +1015,6 @@ cglobal pred8x8_tm_vp8_ssse3, 2,3,6
 ;-----------------------------------------------------------------------------
 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
 ;-----------------------------------------------------------------------------
-%ifdef CONFIG_GPL
 %macro PRED8x8L_TOP_DC 1
 cglobal pred8x8l_top_dc_%1, 4,4
 sub r0, r3
@@ -1934,6 +1865,9 @@ cglobal pred8x8l_vertical_right_mmxext, 4,5
 %macro PRED8x8L_VERTICAL_RIGHT 1
 cglobal pred8x8l_vertical_right_%1, 4,5,7
+ ; manually spill XMM registers for Win64 because
+ ; the code here is initialized with INIT_MMX
+ WIN64_SPILL_XMM 7
 sub r0, r3
 lea r4, [r0+r3*2]
 movq mm0, [r0+r3*1-8]
@@ -2476,7 +2410,6 @@ PRED8x8L_HORIZONTAL_DOWN sse2
 INIT_MMX
 %define PALIGNR PALIGNR_SSSE3
 PRED8x8L_HORIZONTAL_DOWN ssse3
-%endif
 ;-----------------------------------------------------------------------------
 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
@@ -2511,8 +2444,8 @@ cglobal pred4x4_dc_mmxext, 3,5
 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
 ;-----------------------------------------------------------------------------
-%macro PRED4x4_TM_MMX 1
-cglobal pred4x4_tm_vp8_%1, 3,6
+%macro PRED4x4_TM_MMX 0
+cglobal pred4x4_tm_vp8, 3,6
 sub r0, r2
 pxor mm7, mm7
 movd mm0, [r0]
@@ -2526,14 +2459,14 @@ cglobal pred4x4_tm_vp8_%1, 3,6
 sub r3d, r4d
 movd mm2, r1d
 movd mm4, r3d
-%ifidn %1, mmx
+%if cpuflag(mmx2)
+ pshufw mm2, mm2, 0
+ pshufw mm4, mm4, 0
+%else
 punpcklwd mm2, mm2
 punpcklwd mm4, mm4
 punpckldq mm2, mm2
 punpckldq mm4, mm4
-%else
- pshufw mm2, mm2, 0
- pshufw mm4, mm4, 0
 %endif
 paddw mm2, mm0
 paddw mm4, mm0
@@ -2547,8 +2480,11 @@ cglobal pred4x4_tm_vp8_%1, 3,6
 REP_RET
 %endmacro
-PRED4x4_TM_MMX mmx
-PRED4x4_TM_MMX mmxext
+INIT_MMX mmx
+PRED4x4_TM_MMX
+INIT_MMX mmx2
+PRED4x4_TM_MMX
+INIT_MMX
 cglobal pred4x4_tm_vp8_ssse3, 3,3
 sub r0, r2
@@ -2608,7 +2544,6 @@ cglobal pred4x4_vertical_vp8_mmxext, 3,3
 ;-----------------------------------------------------------------------------
 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
 ;-----------------------------------------------------------------------------
-%ifdef CONFIG_GPL
 INIT_MMX
 cglobal pred4x4_down_left_mmxext, 3,3
 sub r0, r2
@@ -2616,12 +2551,11 @@ cglobal pred4x4_down_left_mmxext, 3,3
 punpckldq m1, [r1]
 movq m2, m1
 movq m3, m1
- movq m4, m1
 psllq m1, 8
 pxor m2, m1
 psrlq m2, 8
- pxor m3, m2
- PRED4x4_LOWPASS m0, m1, m3, m4, m5
+ pxor m2, m3
+ PRED4x4_LOWPASS m0, m1, m2, m3, m4
 lea r1, [r0+r2*2]
 psrlq m0, 8
 movd [r0+r2*1], m0
@@ -2786,4 +2720,3 @@ cglobal pred4x4_down_right_mmxext, 3,3
 psrlq m0, 8
 movh [r0+r2*1], m0
 RET
-%endif
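
Editor's note: the sketch below is not part of the patch. It only illustrates the x86inc.asm "cpuflags" idiom that this conversion adopts, since the patch itself shows it only as scattered hunks. The names BCAST8 and fill_byte are made up for the example; the sketch assumes the x86inc.asm/x86util.asm macro package from the same tree, where INIT_MMX/INIT_XMM select the register size and active cpuflags, cglobal appends the instruction-set suffix to the symbol, and helpers such as SPLATW replace hand-written per-ISA shuffle ladders.

; broadcast the low byte of the second argument across one register and store it
%include "x86inc.asm"
%include "x86util.asm"

%macro BCAST8 0
cglobal fill_byte, 2,2        ; r0 = dst, r1 = value; suffix added by cglobal
    movd       m0, r1d
%if cpuflag(ssse3)
    pxor       m1, m1
    pshufb     m0, m1         ; ssse3: broadcast byte 0 to the whole register
%else
    punpcklbw  m0, m0         ; duplicate byte into a word
    SPLATW     m0, m0, 0      ; pre-ssse3: splat word 0 (x86util helper)
%endif
    mova     [r0], m0
    RET
%endmacro

INIT_MMX mmx2
BCAST8                        ; emits fill_byte_mmx2 (8-byte mmx store)
INIT_XMM ssse3
BCAST8                        ; emits fill_byte_ssse3 (16-byte xmm store)

One macro body therefore yields one function per instruction set, which is exactly how the pred16x16/pred8x8/pred4x4 macros above are instantiated after the change (INIT_MMX mmx2 / PRED16x16_H, INIT_XMM ssse3 / H264_PRED16x16_PLANE h264, and so on).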