X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fx86%2Fh264_chromamc.asm;h=b7b18e03f8b58447371b7238db1ecb0ba0a4a6e9;hb=3aca10bf762a94d7de555cedf1ff0e4f6792bf41;hp=6df82cc523ceca7253a2d640b23ba960812e5758;hpb=d0eb5a1174c2626e26bf75801d2b99e1383884ac;p=ffmpeg diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm index 6df82cc523c..b7b18e03f8b 100644 --- a/libavcodec/x86/h264_chromamc.asm +++ b/libavcodec/x86/h264_chromamc.asm @@ -3,25 +3,24 @@ ;* Copyright (c) 2005 Zoltan Hidvegi , ;* 2005-2008 Loren Merritt ;* -;* This file is part of FFmpeg. +;* This file is part of Libav. ;* -;* FFmpeg is free software; you can redistribute it and/or +;* Libav is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* FFmpeg is distributed in the hope that it will be useful, +;* Libav is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA @@ -61,7 +60,7 @@ rnd_rv40_1d_tbl: times 4 dw 0 cextern pw_3 cextern pw_4 cextern pw_8 -cextern pw_28 +pw_28: times 8 dw 28 cextern pw_32 cextern pw_64 @@ -69,20 +68,20 @@ SECTION .text %macro mv0_pixels_mc8 0 lea r4, [r2*2 ] -.next4rows +.next4rows: movq mm0, [r1 ] movq mm1, [r1+r2] + add r1, r4 CHROMAMC_AVG mm0, [r0 ] CHROMAMC_AVG mm1, [r0+r2] movq [r0 ], mm0 movq [r0+r2], mm1 add r0, r4 - add r1, r4 movq mm0, [r1 ] movq mm1, [r1+r2] + add r1, r4 CHROMAMC_AVG mm0, [r0 ] CHROMAMC_AVG mm1, [r0+r2] - add r1, r4 movq [r0 ], mm0 movq [r0+r2], mm1 add r0, r4 @@ -90,11 +89,24 @@ SECTION .text jne .next4rows %endmacro -%macro chroma_mc8_mmx_func 3 -; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, -; int stride, int h, int mx, int my) -cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 -%ifdef ARCH_X86_64 +%macro chroma_mc8_mmx_func 2-3 +%ifidn %2, rv40 +%ifdef PIC +%define rnd_1d_rv40 r8 +%define rnd_2d_rv40 r8 +%define extra_regs 2 +%else ; no-PIC +%define rnd_1d_rv40 rnd_rv40_1d_tbl +%define rnd_2d_rv40 rnd_rv40_2d_tbl +%define extra_regs 1 +%endif ; PIC +%else +%define extra_regs 0 +%endif ; rv40 +; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, +; int stride, int h, int mx, int my) +cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0 +%if ARCH_X86_64 movsxd r2, r2d %endif mov r6d, r5d @@ -104,21 +116,14 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 mv0_pixels_mc8 REP_RET -.at_least_one_non_zero +.at_least_one_non_zero: %ifidn %2, rv40 -%ifdef PIC -%define rnd_1d_rv40 r11 -%define rnd_2d_rv40 r11 -%else ; no-PIC -%define rnd_1d_rv40 rnd_rv40_1d_tbl -%define rnd_2d_rv40 rnd_rv40_2d_tbl -%endif -%ifdef ARCH_X86_64 - mov r10, r5 - and r10, 6 ; &~1 for mx/my=[0,7] - lea r10, [r10*4+r4] - sar 
r10d, 1 -%define rnd_bias r10 +%if ARCH_X86_64 + mov r7, r5 + and r7, 6 ; &~1 for mx/my=[0,7] + lea r7, [r7*4+r4] + sar r7d, 1 +%define rnd_bias r7 %define dest_reg r0 %else ; x86-32 mov r0, r5 @@ -139,15 +144,15 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 test r4d, r4d mov r6, r2 ; dxy = x ? 1 : stride jne .both_non_zero -.my_is_zero +.my_is_zero: ; mx == 0 XOR my == 0 - 1 dimensional filter only or r4d, r5d ; x + y %ifidn %2, rv40 %ifdef PIC - lea r11, [rnd_rv40_1d_tbl] + lea r8, [rnd_rv40_1d_tbl] %endif -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 mov r5, r0m %endif %endif @@ -160,7 +165,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 pxor m7, m7 psubw m4, m5 ; mm4 = A = 8-x -.next1drow +.next1drow: movq m0, [r1 ] ; mm0 = src[0..7] movq m2, [r1+r6] ; mm1 = src[1..8] @@ -191,14 +196,14 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 jne .next1drow REP_RET -.both_non_zero ; general case, bilinear +.both_non_zero: ; general case, bilinear movd m4, r4d ; x movd m6, r5d ; y %ifidn %2, rv40 %ifdef PIC - lea r11, [rnd_rv40_2d_tbl] + lea r8, [rnd_rv40_2d_tbl] %endif -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 mov r5, r0m %endif %endif @@ -226,7 +231,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 movq m0, [r1 ] ; mm0 = src[0..7] movq m1, [r1+1] ; mm1 = src[1..8] -.next2drow +.next2drow: add r1, r2 movq m2, m0 @@ -277,9 +282,15 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 RET %endmacro -%macro chroma_mc4_mmx_func 3 -cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 -%ifdef ARCH_X86_64 +%macro chroma_mc4_mmx_func 2 +%define extra_regs 0 +%ifidn %2, rv40 +%ifdef PIC +%define extra_regs 1 +%endif ; PIC +%endif ; rv40 +cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0 +%if ARCH_X86_64 movsxd r2, r2d %endif pxor m7, m7 @@ -296,8 +307,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 %ifidn %2, rv40 %ifdef PIC - lea r11, [rnd_rv40_2d_tbl] -%define rnd_2d_rv40 r11 + lea r6, [rnd_rv40_2d_tbl] +%define rnd_2d_rv40 r6 %else %define rnd_2d_rv40 rnd_rv40_2d_tbl %endif @@ -318,7 +329,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 pmullw m6, m2 paddw m6, m0 -.next2rows +.next2rows: movd m0, [r1 ] movd m1, [r1+1] add r1, r2 @@ -362,9 +373,9 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 REP_RET %endmacro -%macro chroma_mc2_mmx_func 3 -cglobal %1_%2_chroma_mc2_%3, 6, 7, 0 -%ifdef ARCH_X86_64 +%macro chroma_mc2_mmx_func 2 +cglobal %1_%2_chroma_mc2, 6, 7, 0 +%if ARCH_X86_64 movsxd r2, r2d %endif @@ -385,7 +396,7 @@ cglobal %1_%2_chroma_mc2_%3, 6, 7, 0 punpcklbw m2, m7 pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2] -.nextrow +.nextrow: add r1, r2 movq m1, m2 pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] @@ -416,43 +427,44 @@ cglobal %1_%2_chroma_mc2_%3, 6, 7, 0 %macro NOTHING 2-3 %endmacro %macro DIRECT_AVG 2 - PAVG %1, %2 + PAVGB %1, %2 %endmacro %macro COPY_AVG 3 movd %2, %3 - PAVG %1, %2 + PAVGB %1, %2 %endmacro -INIT_MMX +INIT_MMX mmx %define CHROMAMC_AVG NOTHING %define CHROMAMC_AVG4 NOTHING -chroma_mc8_mmx_func put, h264, mmx_rnd -chroma_mc8_mmx_func put, vc1, mmx_nornd -chroma_mc8_mmx_func put, rv40, mmx -chroma_mc4_mmx_func put, h264, mmx -chroma_mc4_mmx_func put, rv40, mmx -chroma_mc2_mmx_func put, h264, mmx2 +chroma_mc8_mmx_func put, h264, _rnd +chroma_mc8_mmx_func put, vc1, _nornd +chroma_mc8_mmx_func put, rv40 +chroma_mc4_mmx_func put, h264 +chroma_mc4_mmx_func put, rv40 + +INIT_MMX mmxext +chroma_mc2_mmx_func put, h264 %define CHROMAMC_AVG DIRECT_AVG %define CHROMAMC_AVG4 COPY_AVG -%define PAVG pavgb -chroma_mc8_mmx_func avg, h264, mmx2_rnd -chroma_mc8_mmx_func avg, vc1, mmx2_nornd -chroma_mc8_mmx_func avg, rv40, mmx2 -chroma_mc4_mmx_func avg, h264, mmx2 
-chroma_mc4_mmx_func avg, rv40, mmx2 -chroma_mc2_mmx_func avg, h264, mmx2 - -%define PAVG pavgusb -chroma_mc8_mmx_func avg, h264, 3dnow_rnd -chroma_mc8_mmx_func avg, vc1, 3dnow_nornd -chroma_mc8_mmx_func avg, rv40, 3dnow -chroma_mc4_mmx_func avg, h264, 3dnow -chroma_mc4_mmx_func avg, rv40, 3dnow - -%macro chroma_mc8_ssse3_func 3 -cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 -%ifdef ARCH_X86_64 +chroma_mc8_mmx_func avg, h264, _rnd +chroma_mc8_mmx_func avg, vc1, _nornd +chroma_mc8_mmx_func avg, rv40 +chroma_mc4_mmx_func avg, h264 +chroma_mc4_mmx_func avg, rv40 +chroma_mc2_mmx_func avg, h264 + +INIT_MMX 3dnow +chroma_mc8_mmx_func avg, h264, _rnd +chroma_mc8_mmx_func avg, vc1, _nornd +chroma_mc8_mmx_func avg, rv40 +chroma_mc4_mmx_func avg, h264 +chroma_mc4_mmx_func avg, rv40 + +%macro chroma_mc8_ssse3_func 2-3 +cglobal %1_%2_chroma_mc8%3, 6, 7, 8 +%if ARCH_X86_64 movsxd r2, r2d %endif mov r6d, r5d @@ -462,7 +474,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 mv0_pixels_mc8 REP_RET -.at_least_one_non_zero +.at_least_one_non_zero: test r5d, r5d je .my_is_zero test r4d, r4d @@ -472,8 +484,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 mov r6d, r4d shl r4d, 8 sub r4, r6 - add r4, 8 ; x*288+8 = x<<8 | (8-x) mov r6, 8 + add r4, 8 ; x*288+8 = x<<8 | (8-x) sub r6d, r5d imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) @@ -481,24 +493,23 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 movd m7, r6d movd m6, r4d movdqa m5, [rnd_2d_%2] + movq m0, [r1 ] + movq m1, [r1+1] pshuflw m7, m7, 0 pshuflw m6, m6, 0 + punpcklbw m0, m1 movlhps m7, m7 movlhps m6, m6 - movq m0, [r1 ] - movq m1, [r1 +1] - punpcklbw m0, m1 - add r1, r2 -.next2rows - movq m1, [r1 ] - movq m2, [r1 +1] - movq m3, [r1+r2 ] - movq m4, [r1+r2+1] +.next2rows: + movq m1, [r1+r2*1 ] + movq m2, [r1+r2*1+1] + movq m3, [r1+r2*2 ] + movq m4, [r1+r2*2+1] lea r1, [r1+r2*2] punpcklbw m1, m2 - punpcklbw m3, m4 movdqa m2, m1 + punpcklbw m3, m4 movdqa m4, m3 pmaddubsw m0, m7 pmaddubsw m1, m6 @@ -508,8 +519,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 paddw m2, m5 paddw m1, m0 paddw m3, m2 - movdqa m0, m4 psrlw m1, 6 + movdqa m0, m4 psrlw m3, 6 %ifidn %1, avg movq m2, [r0 ] @@ -524,18 +535,17 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 jg .next2rows REP_RET -.my_is_zero +.my_is_zero: mov r5d, r4d shl r4d, 8 add r4, 8 sub r4, r5 ; 255*x+8 = x<<8 | (8-x) movd m7, r4d - movq m6, [rnd_1d_%2] + movdqa m6, [rnd_1d_%2] pshuflw m7, m7, 0 - movlhps m6, m6 movlhps m7, m7 -.next2xrows +.next2xrows: movq m0, [r1 ] movq m1, [r1 +1] movq m2, [r1+r2 ] @@ -562,22 +572,22 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 jg .next2xrows REP_RET -.mx_is_zero +.mx_is_zero: mov r4d, r5d shl r5d, 8 add r5, 8 sub r5, r4 ; 255*y+8 = y<<8 | (8-y) movd m7, r5d - movq m6, [rnd_1d_%2] + movdqa m6, [rnd_1d_%2] pshuflw m7, m7, 0 - movlhps m6, m6 movlhps m7, m7 -.next2yrows +.next2yrows: movq m0, [r1 ] movq m1, [r1+r2 ] movdqa m2, m1 movq m3, [r1+r2*2] + lea r1, [r1+r2*2] punpcklbw m0, m1 punpcklbw m2, m3 pmaddubsw m0, m7 @@ -596,21 +606,20 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 movhps [r0+r2], m0 sub r3d, 2 lea r0, [r0+r2*2] - lea r1, [r1+r2*2] jg .next2yrows REP_RET %endmacro -%macro chroma_mc4_ssse3_func 3 -cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 -%ifdef ARCH_X86_64 +%macro chroma_mc4_ssse3_func 2 +cglobal %1_%2_chroma_mc4, 6, 7, 0 +%if ARCH_X86_64 movsxd r2, r2d %endif mov r6, r4 shl r4d, 8 sub r4d, r6d - add r4d, 8 ; x*288+8 mov r6, 8 + add r4d, 8 ; x*288+8 sub r6d, r5d imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y 
*(8-x) @@ -618,17 +627,16 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 movd m7, r6d movd m6, r4d movq m5, [pw_32] + movd m0, [r1 ] pshufw m7, m7, 0 + punpcklbw m0, [r1+1] pshufw m6, m6, 0 - movd m0, [r1 ] - punpcklbw m0, [r1 +1] - add r1, r2 -.next2rows - movd m1, [r1 ] - movd m3, [r1+r2 ] - punpcklbw m1, [r1 +1] - punpcklbw m3, [r1+r2+1] +.next2rows: + movd m1, [r1+r2*1 ] + movd m3, [r1+r2*2 ] + punpcklbw m1, [r1+r2*1+1] + punpcklbw m3, [r1+r2*2+1] lea r1, [r1+r2*2] movq m2, m1 movq m4, m3 @@ -640,8 +648,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 paddw m2, m5 paddw m1, m0 paddw m3, m2 - movq m0, m4 psrlw m1, 6 + movq m0, m4 psrlw m3, 6 packuswb m1, m1 packuswb m3, m3 @@ -656,16 +664,15 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 %endmacro %define CHROMAMC_AVG NOTHING -INIT_XMM -chroma_mc8_ssse3_func put, h264, ssse3_rnd -chroma_mc8_ssse3_func put, vc1, ssse3_nornd -INIT_MMX -chroma_mc4_ssse3_func put, h264, ssse3 +INIT_XMM ssse3 +chroma_mc8_ssse3_func put, h264, _rnd +chroma_mc8_ssse3_func put, vc1, _nornd +INIT_MMX ssse3 +chroma_mc4_ssse3_func put, h264 %define CHROMAMC_AVG DIRECT_AVG -%define PAVG pavgb -INIT_XMM -chroma_mc8_ssse3_func avg, h264, ssse3_rnd -chroma_mc8_ssse3_func avg, vc1, ssse3_nornd -INIT_MMX -chroma_mc4_ssse3_func avg, h264, ssse3 +INIT_XMM ssse3 +chroma_mc8_ssse3_func avg, h264, _rnd +chroma_mc8_ssse3_func avg, vc1, _nornd +INIT_MMX ssse3 +chroma_mc4_ssse3_func avg, h264
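
For reference, every put/avg chroma_mc8/mc4/mc2 variant touched by this diff computes the same bilinear chroma interpolation; the MMX/MMXEXT/3dnow/SSSE3 paths differ only in how many pixels they handle per instruction and in the rounding bias (the H.264 2-D case uses pw_32 and a shift by 6, as the mc4 code above shows; rv40 and vc1 take their bias from the rnd tables). Below is a minimal scalar C sketch of the 8-wide "put" kernel, assuming the argument names from the comment in the diff (dst, src, stride, h, mx, my); the helper name is hypothetical and this is not the actual C fallback, just an illustration of the formula the assembly vectorizes.

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of put_*_chroma_mc8: for each output pixel,
 *   dst[x] = ((8-mx)*(8-my)*A + mx*(8-my)*B + (8-mx)*my*C + mx*my*D + 32) >> 6
 * where A,B are the two source pixels on the current row and C,D the two on
 * the row below. mx == 0 or my == 0 degenerates to the 1-D filter handled by
 * the .my_is_zero / .mx_is_zero branches in the assembly. */
static void put_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int h, int mx, int my)
{
    const int a = (8 - mx) * (8 - my);
    const int b =      mx  * (8 - my);
    const int c = (8 - mx) *      my;
    const int d =      mx  *      my;

    for (int i = 0; i < h; i++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (a * src[x]          + b * src[x + 1] +
                      c * src[x + stride] + d * src[x + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}

The avg_ variants compute the same value and then average it with the pixel already in dst (the CHROMAMC_AVG / PAVGB step), and the mc4/mc2 versions apply the identical filter over 4- and 2-pixel-wide blocks. No clamping is needed because the four weights always sum to 64.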
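The SSSE3 paths additionally rely on the coefficient-packing noted in the comments ("x<<8 | (8-x)", "(8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)"): in the general 2-D branch both mx and my are in 1..7, so each partial weight is at most 7*7 = 49 and (8-my)*(mx<<8 | (8-mx)) packs (8-my)*(8-mx) into the low byte and (8-my)*mx into the high byte of every word without carry. One pmaddubsw against the interleaved bytes src[i],src[i+1] then yields the whole top-row contribution per output pixel; a second pmaddubsw against the my*(mx<<8 | (8-mx)) register supplies the bottom-row terms, and the results are summed, biased and shifted as in the scalar sketch above. The small self-check below emulates one pmaddubsw lane in C to verify the identity; it is purely illustrative and uses hypothetical helper names.

#include <assert.h>
#include <stdint.h>

/* Emulates one 16-bit lane of pmaddubsw: first operand is unsigned bytes
 * (the pixels), second is signed bytes (the coefficients), products are
 * summed pairwise. */
static int pmaddubsw_lane(uint8_t p0, uint8_t p1, int8_t c0, int8_t c1)
{
    return p0 * c0 + p1 * c1;
}

int main(void)
{
    for (int mx = 1; mx < 8; mx++)
        for (int my = 1; my < 8; my++) {
            /* r6d in .at_least_one_non_zero: (8-my)*(mx<<8 | (8-mx)).
             * Low byte = (8-my)*(8-mx), high byte = (8-my)*mx; no carry,
             * since each product is at most 7*7 = 49. */
            int packed = (8 - my) * ((mx << 8) | (8 - mx));
            int8_t lo  = (int8_t)(packed & 0xff);
            int8_t hi  = (int8_t)(packed >> 8);
            for (int A = 0; A <= 255; A += 15) {
                int B = 255 - A;   /* two neighbouring source pixels */
                assert(pmaddubsw_lane((uint8_t)A, (uint8_t)B, lo, hi) ==
                       (8 - mx) * (8 - my) * A + mx * (8 - my) * B);
            }
        }
    return 0;
}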