X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fx86%2Fvp8dsp.asm;h=b7355c4a119da1783f45d043eb7e51a6e8c21654;hb=6860b4081d046558c44b1b42f22022ea341a2a73;hp=3bdc8f7ed9cd8019973a047125d31044f317877e;hpb=e3f7bf774caad334b258f4494e77420a7c161b63;p=ffmpeg

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 3bdc8f7ed9c..b7355c4a119 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -3,24 +3,23 @@
 ;* Copyright (c) 2010 Ronald S. Bultje
 ;* Copyright (c) 2010 Jason Garrett-Glaser
 ;*
-;* This file is part of FFmpeg.
+;* This file is part of Libav.
 ;*
-;* FFmpeg is free software; you can redistribute it and/or
+;* Libav is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* FFmpeg is distributed in the hope that it will be useful,
+;* Libav is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "x86inc.asm"
 %include "x86util.asm"
 
 SECTION_RODATA
 
@@ -116,23 +115,25 @@ bilinear_filter_vb_m: times 8 db 7, 1
                       times 8 db 1, 7
 
 %ifdef PIC
-%define fourtap_filter_hw  r11
-%define sixtap_filter_hw   r11
-%define fourtap_filter_hb  r11
-%define sixtap_filter_hb   r11
-%define fourtap_filter_v   r11
-%define sixtap_filter_v    r11
-%define bilinear_filter_vw r11
-%define bilinear_filter_vb r11
+%define fourtap_filter_hw  picregq
+%define sixtap_filter_hw   picregq
+%define fourtap_filter_hb  picregq
+%define sixtap_filter_hb   picregq
+%define fourtap_filter_v   picregq
+%define sixtap_filter_v    picregq
+%define bilinear_filter_vw picregq
+%define bilinear_filter_vb picregq
+%define npicregs 1
 %else
-%define fourtap_filter_hw fourtap_filter_hw_m
-%define sixtap_filter_hw  sixtap_filter_hw_m
-%define fourtap_filter_hb fourtap_filter_hb_m
-%define sixtap_filter_hb  sixtap_filter_hb_m
-%define fourtap_filter_v  fourtap_filter_v_m
-%define sixtap_filter_v   sixtap_filter_v_m
+%define fourtap_filter_hw  fourtap_filter_hw_m
+%define sixtap_filter_hw   sixtap_filter_hw_m
+%define fourtap_filter_hb  fourtap_filter_hb_m
+%define sixtap_filter_hb   sixtap_filter_hb_m
+%define fourtap_filter_v   fourtap_filter_v_m
+%define sixtap_filter_v    sixtap_filter_v_m
 %define bilinear_filter_vw bilinear_filter_vw_m
 %define bilinear_filter_vb bilinear_filter_vb_m
+%define npicregs 0
 %endif
 
 filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
@@ -142,9 +143,15 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
 
+pw_256:  times 8 dw 256
+
 pw_20091: times 4 dw 20091
 pw_17734: times 4 dw 17734
 
+pb_27_63: times 8 db 27, 63
+pb_18_63: times 8 db 18, 63
+pb_9_63:  times 8 db  9, 63
+
 cextern pb_1
 cextern pw_3
 cextern pb_3
 
@@ -169,26
+176,26 @@ SECTION .text ; int height, int mx, int my); ;----------------------------------------------------------------------------- -%macro FILTER_SSSE3 3 -cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 - lea r5d, [r5*3] +%macro FILTER_SSSE3 1 +cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] mova m3, [filter_h6_shuf2] mova m4, [filter_h6_shuf3] %ifdef PIC - lea r11, [sixtap_filter_hb_m] + lea picregq, [sixtap_filter_hb_m] %endif - mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes - mova m6, [sixtap_filter_hb+r5*8-32] - mova m7, [sixtap_filter_hb+r5*8-16] + mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes + mova m6, [sixtap_filter_hb+mxq*8-32] + mova m7, [sixtap_filter_hb+mxq*8-16] -.nextrow - movu m0, [r2-2] +.nextrow: + movu m0, [srcq-2] mova m1, m0 mova m2, m0 -%ifidn %1, 4 +%if mmsize == 8 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the ; shuffle with a memory operand - punpcklbw m0, [r2+3] + punpcklbw m0, [srcq+3] %else pshufb m0, [filter_h6_shuf1] %endif @@ -199,67 +206,65 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 pmaddubsw m2, m7 paddsw m0, m1 paddsw m0, m2 - paddsw m0, [pw_64] - psraw m0, 7 + pmulhrsw m0, [pw_256] packuswb m0, m0 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 - shl r5d, 4 - mova m2, [pw_64] +cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 + mova m2, [pw_256] mova m3, [filter_h2_shuf] mova m4, [filter_h4_shuf] %ifdef PIC - lea r11, [fourtap_filter_hb_m] + lea picregq, [fourtap_filter_hb_m] %endif - mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes - mova m6, [fourtap_filter_hb+r5] + mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes + mova m6, [fourtap_filter_hb+mxq] -.nextrow - movu m0, [r2-1] +.nextrow: + movu m0, [srcq-1] mova m1, m0 pshufb m0, m3 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m6 - paddsw m0, m2 paddsw m0, m1 - psraw m0, 7 + pmulhrsw m0, m2 packuswb m0, m0 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 - shl r6d, 4 +cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 %ifdef PIC - lea r11, [fourtap_filter_hb_m] + lea picregq, [fourtap_filter_hb_m] %endif - mova m5, [fourtap_filter_hb+r6-16] - mova m6, [fourtap_filter_hb+r6] - mova m7, [pw_64] + mova m5, [fourtap_filter_hb+myq-16] + mova m6, [fourtap_filter_hb+myq] + mova m7, [pw_256] ; read 3 lines - sub r2, r3 - movh m0, [r2] - movh m1, [r2+ r3] - movh m2, [r2+2*r3] - add r2, r3 - -.nextrow - movh m3, [r2+2*r3] ; read new row + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+ srcstrideq] + movh m2, [srcq+2*srcstrideq] + add srcq, srcstrideq + +.nextrow: + movh m3, [srcq+2*srcstrideq] ; read new row mova m4, m0 mova m0, m1 punpcklbw m4, m1 @@ -269,84 +274,83 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 pmaddubsw m2, m6 paddsw m4, m2 mova m2, m3 - paddsw m4, m7 - psraw m4, 7 + pmulhrsw m4, m7 packuswb m4, m4 - movh [r0], m4 + movh [dstq], m4 ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add 
srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 - lea r6d, [r6*3] +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + lea myd, [myq*3] %ifdef PIC - lea r11, [sixtap_filter_hb_m] + lea picregq, [sixtap_filter_hb_m] %endif - lea r6, [sixtap_filter_hb+r6*8] + lea myq, [sixtap_filter_hb+myq*8] ; read 5 lines - sub r2, r3 - sub r2, r3 - movh m0, [r2] - movh m1, [r2+r3] - movh m2, [r2+r3*2] - lea r2, [r2+r3*2] - add r2, r3 - movh m3, [r2] - movh m4, [r2+r3] - -.nextrow - movh m5, [r2+2*r3] ; read new row + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] + +.nextrow: + movh m5, [srcq+2*srcstrideq] ; read new row mova m6, m0 punpcklbw m6, m5 mova m0, m1 punpcklbw m1, m2 mova m7, m3 punpcklbw m7, m4 - pmaddubsw m6, [r6-48] - pmaddubsw m1, [r6-32] - pmaddubsw m7, [r6-16] + pmaddubsw m6, [myq-48] + pmaddubsw m1, [myq-32] + pmaddubsw m7, [myq-16] paddsw m6, m1 paddsw m6, m7 mova m1, m2 - paddsw m6, [pw_64] mova m2, m3 - psraw m6, 7 + pmulhrsw m6, [pw_256] mova m3, m4 packuswb m6, m6 mova m4, m5 - movh [r0], m6 + movh [dstq], m6 ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET %endmacro -INIT_MMX -FILTER_SSSE3 4, 0, 0 -INIT_XMM -FILTER_SSSE3 8, 8, 7 +INIT_MMX ssse3 +FILTER_SSSE3 4 +INIT_XMM ssse3 +FILTER_SSSE3 8 ; 4x4 block, H-only 4-tap filter -cglobal put_vp8_epel4_h4_mmxext, 6, 6 - shl r5d, 4 +INIT_MMX mmx2 +cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 %ifdef PIC - lea r11, [fourtap_filter_hw_m] + lea picregq, [fourtap_filter_hw_m] %endif - movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words - movq mm5, [fourtap_filter_hw+r5] + movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words + movq mm5, [fourtap_filter_hw+mxq] movq mm7, [pw_64] pxor mm6, mm6 -.nextrow - movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels +.nextrow: + movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels ; first set of 2 pixels movq mm2, mm1 ; byte ABCD.. @@ -372,29 +376,30 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6 paddsw mm3, mm7 ; rounding psraw mm3, 7 packuswb mm3, mm6 ; clip and word->bytes - movd [r0], mm3 ; store + movd [dstq], mm3 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET ; 4x4 block, H-only 6-tap filter -cglobal put_vp8_epel4_h6_mmxext, 6, 6 - lea r5d, [r5*3] +INIT_MMX mmx2 +cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] %ifdef PIC - lea r11, [sixtap_filter_hw_m] + lea picregq, [sixtap_filter_hw_m] %endif - movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words - movq mm5, [sixtap_filter_hw+r5*8-32] - movq mm6, [sixtap_filter_hw+r5*8-16] + movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words + movq mm5, [sixtap_filter_hw+mxq*8-32] + movq mm6, [sixtap_filter_hw+mxq*8-16] movq mm7, [pw_64] pxor mm3, mm3 -.nextrow - movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels +.nextrow: + movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels ; first set of 2 pixels movq mm2, mm1 ; byte ABCD.. 
@@ -414,7 +419,7 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6 paddd mm1, mm2 ; finish 1st 2px ; second set of 2 pixels, use backup of above - movd mm2, [r2+3] ; byte FGHI (prevent overreads) + movd mm2, [srcq+3] ; byte FGHI (prevent overreads) pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 paddd mm0, mm3 ; add to 2nd 2px cache @@ -429,172 +434,163 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6 paddsw mm1, mm7 ; rounding psraw mm1, 7 packuswb mm1, mm3 ; clip and word->bytes - movd [r0], mm1 ; store + movd [dstq], mm1 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -; 4x4 block, H-only 4-tap filter -INIT_XMM -cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 - shl r5d, 4 +INIT_XMM sse2 +cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 5 %ifdef PIC - lea r11, [fourtap_filter_hw_m] + lea picregq, [fourtap_filter_v_m] %endif - mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words - mova m6, [fourtap_filter_hw+r5] + lea mxq, [fourtap_filter_v+mxq-32] pxor m7, m7 - -.nextrow - movh m0, [r2-1] - punpcklbw m0, m7 ; ABCDEFGH - mova m1, m0 - mova m2, m0 - mova m3, m0 - psrldq m1, 2 ; BCDEFGH - psrldq m2, 4 ; CDEFGH - psrldq m3, 6 ; DEFGH - punpcklwd m0, m1 ; ABBCCDDE - punpcklwd m2, m3 ; CDDEEFFG - pmaddwd m0, m5 - pmaddwd m2, m6 - paddd m0, m2 - - movh m1, [r2+3] - punpcklbw m1, m7 ; ABCDEFGH - mova m2, m1 - mova m3, m1 - mova m4, m1 - psrldq m2, 2 ; BCDEFGH - psrldq m3, 4 ; CDEFGH - psrldq m4, 6 ; DEFGH - punpcklwd m1, m2 ; ABBCCDDE - punpcklwd m3, m4 ; CDDEEFFG - pmaddwd m1, m5 - pmaddwd m3, m6 - paddd m1, m3 - - packssdw m0, m1 - paddsw m0, [pw_64] + mova m4, [pw_64] + mova m5, [mxq+ 0] + mova m6, [mxq+16] +%ifdef m8 + mova m8, [mxq+32] + mova m9, [mxq+48] +%endif +.nextrow: + movq m0, [srcq-1] + movq m1, [srcq-0] + movq m2, [srcq+1] + movq m3, [srcq+2] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + pmullw m0, m5 + pmullw m1, m6 +%ifdef m8 + pmullw m2, m8 + pmullw m3, m9 +%else + pmullw m2, [mxq+32] + pmullw m3, [mxq+48] +%endif + paddsw m0, m1 + paddsw m2, m3 + paddsw m0, m2 + paddsw m0, m4 psraw m0, 7 packuswb m0, m7 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 - lea r5d, [r5*3] +INIT_XMM sse2 +cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] + shl mxd, 4 %ifdef PIC - lea r11, [sixtap_filter_hw_m] + lea picregq, [sixtap_filter_v_m] %endif - lea r5, [sixtap_filter_hw+r5*8] + lea mxq, [sixtap_filter_v+mxq-96] pxor m7, m7 - -.nextrow - movu m0, [r2-2] - mova m6, m0 - mova m4, m0 - punpcklbw m0, m7 ; ABCDEFGHI - mova m1, m0 - mova m2, m0 - mova m3, m0 - psrldq m1, 2 ; BCDEFGH - psrldq m2, 4 ; CDEFGH - psrldq m3, 6 ; DEFGH - psrldq m4, 4 - punpcklbw m4, m7 ; EFGH - mova m5, m4 - psrldq m5, 2 ; FGH - punpcklwd m0, m1 ; ABBCCDDE - punpcklwd m2, m3 ; CDDEEFFG - punpcklwd m4, m5 ; EFFGGHHI - pmaddwd m0, [r5-48] - pmaddwd m2, [r5-32] - pmaddwd m4, [r5-16] - paddd m0, m2 - paddd m0, m4 - - psrldq m6, 4 - mova m4, m6 - punpcklbw m6, m7 ; ABCDEFGHI - mova m1, m6 - mova m2, m6 - mova m3, m6 - psrldq m1, 2 ; BCDEFGH - psrldq m2, 4 ; CDEFGH - psrldq m3, 6 ; DEFGH - psrldq m4, 4 - punpcklbw 
m4, m7 ; EFGH - mova m5, m4 - psrldq m5, 2 ; FGH - punpcklwd m6, m1 ; ABBCCDDE - punpcklwd m2, m3 ; CDDEEFFG - punpcklwd m4, m5 ; EFFGGHHI - pmaddwd m6, [r5-48] - pmaddwd m2, [r5-32] - pmaddwd m4, [r5-16] - paddd m6, m2 - paddd m6, m4 - - packssdw m0, m6 - paddsw m0, [pw_64] + mova m6, [pw_64] +%ifdef m8 + mova m8, [mxq+ 0] + mova m9, [mxq+16] + mova m10, [mxq+32] + mova m11, [mxq+48] + mova m12, [mxq+64] + mova m13, [mxq+80] +%endif +.nextrow: + movq m0, [srcq-2] + movq m1, [srcq-1] + movq m2, [srcq-0] + movq m3, [srcq+1] + movq m4, [srcq+2] + movq m5, [srcq+3] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 +%ifdef m8 + pmullw m0, m8 + pmullw m1, m9 + pmullw m2, m10 + pmullw m3, m11 + pmullw m4, m12 + pmullw m5, m13 +%else + pmullw m0, [mxq+ 0] + pmullw m1, [mxq+16] + pmullw m2, [mxq+32] + pmullw m3, [mxq+48] + pmullw m4, [mxq+64] + pmullw m5, [mxq+80] +%endif + paddsw m1, m4 + paddsw m0, m5 + paddsw m1, m2 + paddsw m0, m3 + paddsw m0, m1 + paddsw m0, m6 psraw m0, 7 packuswb m0, m7 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -%macro FILTER_V 3 +%macro FILTER_V 1 ; 4x4 block, V-only 4-tap filter -cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 - shl r6d, 5 +cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 5 %ifdef PIC - lea r11, [fourtap_filter_v_m] + lea picregq, [fourtap_filter_v_m] %endif - lea r6, [fourtap_filter_v+r6-32] + lea myq, [fourtap_filter_v+myq-32] mova m6, [pw_64] pxor m7, m7 - mova m5, [r6+48] + mova m5, [myq+48] ; read 3 lines - sub r2, r3 - movh m0, [r2] - movh m1, [r2+ r3] - movh m2, [r2+2*r3] - add r2, r3 + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+ srcstrideq] + movh m2, [srcq+2*srcstrideq] + add srcq, srcstrideq punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 -.nextrow +.nextrow: ; first calculate negative taps (to prevent losing positive overflows) - movh m4, [r2+2*r3] ; read new row + movh m4, [srcq+2*srcstrideq] ; read new row punpcklbw m4, m7 mova m3, m4 - pmullw m0, [r6+0] + pmullw m0, [myq+0] pmullw m4, m5 paddsw m4, m0 ; then calculate positive taps mova m0, m1 - pmullw m1, [r6+16] + pmullw m1, [myq+16] paddsw m4, m1 mova m1, m2 - pmullw m2, [r6+32] + pmullw m2, [myq+32] paddsw m4, m2 mova m2, m3 @@ -602,101 +598,100 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 paddsw m4, m6 psraw m4, 7 packuswb m4, m7 - movh [r0], m4 + movh [dstq], m4 ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET ; 4x4 block, V-only 6-tap filter -cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 - shl r6d, 4 - lea r6, [r6*3] +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 + lea myq, [myq*3] %ifdef PIC - lea r11, [sixtap_filter_v_m] + lea picregq, [sixtap_filter_v_m] %endif - lea r6, [sixtap_filter_v+r6-96] + lea myq, [sixtap_filter_v+myq-96] pxor m7, m7 ; read 5 lines - sub r2, r3 - sub r2, r3 - movh m0, [r2] - movh m1, [r2+r3] - movh m2, [r2+r3*2] - lea r2, [r2+r3*2] - add r2, r3 - movh m3, [r2] - movh m4, [r2+r3] + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] punpcklbw m0, m7 punpcklbw m1, 
m7 punpcklbw m2, m7 punpcklbw m3, m7 punpcklbw m4, m7 -.nextrow +.nextrow: ; first calculate negative taps (to prevent losing positive overflows) mova m5, m1 - pmullw m5, [r6+16] + pmullw m5, [myq+16] mova m6, m4 - pmullw m6, [r6+64] + pmullw m6, [myq+64] paddsw m6, m5 ; then calculate positive taps - movh m5, [r2+2*r3] ; read new row + movh m5, [srcq+2*srcstrideq] ; read new row punpcklbw m5, m7 - pmullw m0, [r6+0] + pmullw m0, [myq+0] paddsw m6, m0 mova m0, m1 mova m1, m2 - pmullw m2, [r6+32] + pmullw m2, [myq+32] paddsw m6, m2 mova m2, m3 - pmullw m3, [r6+48] + pmullw m3, [myq+48] paddsw m6, m3 mova m3, m4 mova m4, m5 - pmullw m5, [r6+80] + pmullw m5, [myq+80] paddsw m6, m5 ; round/clip/store paddsw m6, [pw_64] psraw m6, 7 packuswb m6, m7 - movh [r0], m6 + movh [dstq], m6 ; go to next line - add r0, r1 - add r2, r3 - dec r4 ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET %endmacro -INIT_MMX -FILTER_V mmxext, 4, 0 -INIT_XMM -FILTER_V sse2, 8, 8 +INIT_MMX mmx2 +FILTER_V 4 +INIT_XMM sse2 +FILTER_V 8 -%macro FILTER_BILINEAR 3 -cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 - mov r5d, 8*16 - shl r6d, 4 - sub r5d, r6d +%macro FILTER_BILINEAR 1 +cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 %ifdef PIC - lea r11, [bilinear_filter_vw_m] + lea picregq, [bilinear_filter_vw_m] %endif pxor m6, m6 - mova m4, [bilinear_filter_vw+r5-16] - mova m5, [bilinear_filter_vw+r6-16] -.nextrow - movh m0, [r2+r3*0] - movh m1, [r2+r3*1] - movh m3, [r2+r3*2] + mova m5, [bilinear_filter_vw+myq-1*16] + neg myq + mova m4, [bilinear_filter_vw+myq+7*16] +.nextrow: + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m3, [srcq+srcstrideq*2] punpcklbw m0, m6 punpcklbw m1, m6 punpcklbw m3, m6 @@ -711,38 +706,37 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%ifidn %1, mmxext +%if mmsize == 8 packuswb m0, m0 packuswb m2, m2 - movh [r0+r1*0], m0 - movh [r0+r1*1], m2 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m2 %else packuswb m0, m2 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET -cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 - mov r6d, 8*16 - shl r5d, 4 - sub r6d, r5d +cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 %ifdef PIC - lea r11, [bilinear_filter_vw_m] + lea picregq, [bilinear_filter_vw_m] %endif pxor m6, m6 - mova m4, [bilinear_filter_vw+r6-16] - mova m5, [bilinear_filter_vw+r5-16] -.nextrow - movh m0, [r2+r3*0+0] - movh m1, [r2+r3*0+1] - movh m2, [r2+r3*1+0] - movh m3, [r2+r3*1+1] + mova m5, [bilinear_filter_vw+mxq-1*16] + neg mxq + mova m4, [bilinear_filter_vw+mxq+7*16] +.nextrow: + movh m0, [srcq+srcstrideq*0+0] + movh m1, [srcq+srcstrideq*0+1] + movh m2, [srcq+srcstrideq*1+0] + movh m3, [srcq+srcstrideq*1+1] punpcklbw m0, m6 punpcklbw m1, m6 punpcklbw m2, m6 @@ -757,41 +751,41 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%ifidn %1, mmxext +%if mmsize == 8 packuswb m0, m0 packuswb m2, m2 - movh [r0+r1*0], m0 - movh [r0+r1*1], m2 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m2 %else packuswb m0, m2 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, 
[r0+r1*2] - lea r2, [r2+r3*2] - sub r4, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET %endmacro -INIT_MMX -FILTER_BILINEAR mmxext, 4, 0 -INIT_XMM -FILTER_BILINEAR sse2, 8, 7 +INIT_MMX mmx2 +FILTER_BILINEAR 4 +INIT_XMM sse2 +FILTER_BILINEAR 8 %macro FILTER_BILINEAR_SSSE3 1 -cglobal put_vp8_bilinear%1_v_ssse3, 7,7 - shl r6d, 4 +cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 %ifdef PIC - lea r11, [bilinear_filter_vb_m] + lea picregq, [bilinear_filter_vb_m] %endif pxor m4, m4 - mova m3, [bilinear_filter_vb+r6-16] -.nextrow - movh m0, [r2+r3*0] - movh m1, [r2+r3*1] - movh m2, [r2+r3*2] + mova m3, [bilinear_filter_vb+myq-16] +.nextrow: + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m2, [srcq+srcstrideq*2] punpcklbw m0, m1 punpcklbw m1, m2 pmaddubsw m0, m3 @@ -803,31 +797,31 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7 %if mmsize==8 packuswb m0, m0 packuswb m1, m1 - movh [r0+r1*0], m0 - movh [r0+r1*1], m1 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m1 %else packuswb m0, m1 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET -cglobal put_vp8_bilinear%1_h_ssse3, 7,7 - shl r5d, 4 +cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 %ifdef PIC - lea r11, [bilinear_filter_vb_m] + lea picregq, [bilinear_filter_vb_m] %endif pxor m4, m4 mova m2, [filter_h2_shuf] - mova m3, [bilinear_filter_vb+r5-16] -.nextrow - movu m0, [r2+r3*0] - movu m1, [r2+r3*1] + mova m3, [bilinear_filter_vb+mxq-16] +.nextrow: + movu m0, [srcq+srcstrideq*0] + movu m1, [srcq+srcstrideq*1] pshufb m0, m2 pshufb m1, m2 pmaddubsw m0, m3 @@ -839,63 +833,68 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7 %if mmsize==8 packuswb m0, m0 packuswb m1, m1 - movh [r0+r1*0], m0 - movh [r0+r1*1], m1 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m1 %else packuswb m0, m1 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET %endmacro -INIT_MMX +INIT_MMX ssse3 FILTER_BILINEAR_SSSE3 4 -INIT_XMM +INIT_XMM ssse3 FILTER_BILINEAR_SSSE3 8 -cglobal put_vp8_pixels8_mmx, 5,5 +INIT_MMX mmx +cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height .nextrow: - movq mm0, [r2+r3*0] - movq mm1, [r2+r3*1] - lea r2, [r2+r3*2] - movq [r0+r1*0], mm0 - movq [r0+r1*1], mm1 - lea r0, [r0+r1*2] - sub r4d, 2 + movq mm0, [srcq+srcstrideq*0] + movq mm1, [srcq+srcstrideq*1] + lea srcq, [srcq+srcstrideq*2] + movq [dstq+dststrideq*0], mm0 + movq [dstq+dststrideq*1], mm1 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 jg .nextrow REP_RET -cglobal put_vp8_pixels16_mmx, 5,5 +%if ARCH_X86_32 +INIT_MMX mmx +cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height .nextrow: - movq mm0, [r2+r3*0+0] - movq mm1, [r2+r3*0+8] - movq mm2, [r2+r3*1+0] - movq mm3, [r2+r3*1+8] - lea r2, [r2+r3*2] - movq [r0+r1*0+0], mm0 - movq [r0+r1*0+8], mm1 - movq [r0+r1*1+0], mm2 - movq [r0+r1*1+8], mm3 - lea r0, [r0+r1*2] - sub r4d, 2 + movq mm0, [srcq+srcstrideq*0+0] + movq mm1, [srcq+srcstrideq*0+8] + movq mm2, [srcq+srcstrideq*1+0] + 
movq mm3, [srcq+srcstrideq*1+8] + lea srcq, [srcq+srcstrideq*2] + movq [dstq+dststrideq*0+0], mm0 + movq [dstq+dststrideq*0+8], mm1 + movq [dstq+dststrideq*1+0], mm2 + movq [dstq+dststrideq*1+8], mm3 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 jg .nextrow REP_RET +%endif -cglobal put_vp8_pixels16_sse, 5,5,2 +INIT_XMM sse +cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height .nextrow: - movups xmm0, [r2+r3*0] - movups xmm1, [r2+r3*1] - lea r2, [r2+r3*2] - movaps [r0+r1*0], xmm0 - movaps [r0+r1*1], xmm1 - lea r0, [r0+r1*2] - sub r4d, 2 + movups xmm0, [srcq+srcstrideq*0] + movups xmm1, [srcq+srcstrideq*1] + lea srcq, [srcq+srcstrideq*2] + movaps [dstq+dststrideq*0], xmm0 + movaps [dstq+dststrideq*1], xmm1 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 jg .nextrow REP_RET @@ -904,10 +903,10 @@ cglobal put_vp8_pixels16_sse, 5,5,2 ;----------------------------------------------------------------------------- %macro ADD_DC 4 - %4 m2, [r0+%3] - %4 m3, [r0+r2+%3] - %4 m4, [r1+%3] - %4 m5, [r1+r2+%3] + %4 m2, [dst1q+%3] + %4 m3, [dst1q+strideq+%3] + %4 m4, [dst2q+%3] + %4 m5, [dst2q+strideq+%3] paddusb m2, %1 paddusb m3, %1 paddusb m4, %1 @@ -916,22 +915,22 @@ cglobal put_vp8_pixels16_sse, 5,5,2 psubusb m3, %2 psubusb m4, %2 psubusb m5, %2 - %4 [r0+%3], m2 - %4 [r0+r2+%3], m3 - %4 [r1+%3], m4 - %4 [r1+r2+%3], m5 + %4 [dst1q+%3], m2 + %4 [dst1q+strideq+%3], m3 + %4 [dst2q+%3], m4 + %4 [dst2q+strideq+%3], m5 %endmacro -INIT_MMX -cglobal vp8_idct_dc_add_mmx, 3, 3 +INIT_MMX mmx +cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride ; load data - movd m0, [r1] + movd m0, [blockq] ; calculate DC paddw m0, [pw_4] pxor m1, m1 psraw m0, 3 - movd [r1], m1 + movd [blockq], m1 psubw m1, m0 packuswb m0, m0 packuswb m1, m1 @@ -941,24 +940,26 @@ cglobal vp8_idct_dc_add_mmx, 3, 3 punpcklwd m1, m1 ; add DC - lea r1, [r0+r2*2] + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] ADD_DC m0, m1, 0, movh RET -INIT_XMM -cglobal vp8_idct_dc_add_sse4, 3, 3, 6 +INIT_XMM sse4 +cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride ; load data - movd m0, [r1] + movd m0, [blockq] pxor m1, m1 ; calculate DC paddw m0, [pw_4] - movd [r1], m1 - lea r1, [r0+r2*2] - movd m2, [r0] - movd m3, [r0+r2] - movd m4, [r1] - movd m5, [r1+r2] + movd [blockq], m1 + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] + movd m2, [dst1q] + movd m3, [dst1q+strideq] + movd m4, [dst2q] + movd m5, [dst2q+strideq] psraw m0, 3 pshuflw m0, m0, 0 punpcklqdq m0, m0 @@ -969,32 +970,33 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6 paddw m2, m0 paddw m4, m0 packuswb m2, m4 - movd [r0], m2 - pextrd [r0+r2], m2, 1 - pextrd [r1], m2, 2 - pextrd [r1+r2], m2, 3 + movd [dst1q], m2 + pextrd [dst1q+strideq], m2, 1 + pextrd [dst2q], m2, 2 + pextrd [dst2q+strideq], m2, 3 RET ;----------------------------------------------------------------------------- ; void vp8_idct_dc_add4y_(uint8_t *dst, DCTELEM block[4][16], int stride); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal vp8_idct_dc_add4y_mmx, 3, 3 +%if ARCH_X86_32 +INIT_MMX mmx +cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride ; load data - movd m0, [r1+32*0] ; A - movd m1, [r1+32*2] ; C - punpcklwd m0, [r1+32*1] ; A B - punpcklwd m1, [r1+32*3] ; C D + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D punpckldq m0, m1 ; A B C D pxor m6, m6 ; calculate DC paddw m0, [pw_4] - movd [r1+32*0], m6 - movd [r1+32*1], m6 - movd [r1+32*2], m6 - movd 
[r1+32*3], m6 + movd [blockq+32*0], m6 + movd [blockq+32*1], m6 + movd [blockq+32*2], m6 + movd [blockq+32*3], m6 psraw m0, 3 psubw m6, m0 packuswb m0, m0 @@ -1009,27 +1011,29 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3 punpckhbw m7, m7 ; CCCCDDDD ; add DC - lea r1, [r0+r2*2] + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] ADD_DC m0, m6, 0, mova ADD_DC m1, m7, 8, mova RET +%endif -INIT_XMM -cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 +INIT_XMM sse2 +cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride ; load data - movd m0, [r1+32*0] ; A - movd m1, [r1+32*2] ; C - punpcklwd m0, [r1+32*1] ; A B - punpcklwd m1, [r1+32*3] ; C D + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D punpckldq m0, m1 ; A B C D pxor m1, m1 ; calculate DC paddw m0, [pw_4] - movd [r1+32*0], m1 - movd [r1+32*1], m1 - movd [r1+32*2], m1 - movd [r1+32*3], m1 + movd [blockq+32*0], m1 + movd [blockq+32*1], m1 + movd [blockq+32*2], m1 + movd [blockq+32*3], m1 psraw m0, 3 psubw m1, m0 packuswb m0, m0 @@ -1040,7 +1044,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 punpcklbw m1, m1 ; add DC - lea r1, [r0+r2*2] + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] ADD_DC m0, m1, 0, mova RET @@ -1048,22 +1053,22 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 ; void vp8_idct_dc_add4uv_(uint8_t *dst, DCTELEM block[4][16], int stride); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal vp8_idct_dc_add4uv_mmx, 3, 3 +INIT_MMX mmx +cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride ; load data - movd m0, [r1+32*0] ; A - movd m1, [r1+32*2] ; C - punpcklwd m0, [r1+32*1] ; A B - punpcklwd m1, [r1+32*3] ; C D + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D punpckldq m0, m1 ; A B C D pxor m6, m6 ; calculate DC paddw m0, [pw_4] - movd [r1+32*0], m6 - movd [r1+32*1], m6 - movd [r1+32*2], m6 - movd [r1+32*3], m6 + movd [blockq+32*0], m6 + movd [blockq+32*1], m6 + movd [blockq+32*2], m6 + movd [blockq+32*3], m6 psraw m0, 3 psubw m6, m0 packuswb m0, m0 @@ -1078,10 +1083,11 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3 punpckhbw m7, m7 ; CCCCDDDD ; add DC - lea r1, [r0+r2*2] + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] ADD_DC m0, m6, 0, mova - lea r0, [r0+r2*4] - lea r1, [r1+r2*4] + lea dst1q, [dst1q+strideq*4] + lea dst2q, [dst2q+strideq*4] ADD_DC m1, m7, 0, mova RET @@ -1112,34 +1118,33 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3 ; %5/%6 are temporary registers ; we assume m6/m7 have constant words 20091/17734 loaded in them %macro VP8_IDCT_TRANSFORM4x4_1D 6 - SUMSUB_BA m%3, m%1, m%5 ;t0, t1 + SUMSUB_BA w, %3, %1, %5 ;t0, t1 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 - SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 - SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 + SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3 + SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2 SWAP %4, %1 SWAP %4, %3 %endmacro -INIT_MMX -%macro VP8_IDCT_ADD 1 -cglobal vp8_idct_add_%1, 3, 3 +%macro VP8_IDCT_ADD 0 +cglobal vp8_idct_add, 3, 3, 0, dst, block, stride ; load block data - movq m0, [r1+ 0] - movq m1, [r1+ 8] - movq m2, [r1+16] - movq m3, [r1+24] + movq m0, [blockq+ 0] + movq m1, [blockq+ 8] + movq m2, [blockq+16] + movq m3, [blockq+24] movq m6, [pw_20091] movq m7, [pw_17734] -%ifidn %1, sse +%if cpuflag(sse) xorps xmm0, xmm0 - movaps [r1+ 0], xmm0 - movaps [r1+16], xmm0 + movaps [blockq+ 0], xmm0 + movaps [blockq+16], xmm0 %else pxor m4, m4 - movq [r1+ 0], m4 - movq [r1+ 8], m4 
- movq [r1+16], m4 - movq [r1+24], m4 + movq [blockq+ 0], m4 + movq [blockq+ 8], m4 + movq [blockq+16], m4 + movq [blockq+24], m4 %endif ; actual IDCT @@ -1151,53 +1156,69 @@ cglobal vp8_idct_add_%1, 3, 3 ; store pxor m4, m4 - lea r1, [r0+2*r2] - STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 - STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+2*strideq] + STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq + STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq RET %endmacro -VP8_IDCT_ADD mmx -VP8_IDCT_ADD sse +%if ARCH_X86_32 +INIT_MMX mmx +VP8_IDCT_ADD +%endif +INIT_MMX sse +VP8_IDCT_ADD ;----------------------------------------------------------------------------- ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) ;----------------------------------------------------------------------------- %macro SCATTER_WHT 3 - movd r1d, m%1 - movd r2d, m%2 - mov [r0+2*16*(0+%3)], r1w - mov [r0+2*16*(1+%3)], r2w - shr r1d, 16 - shr r2d, 16 + movd dc1d, m%1 + movd dc2d, m%2 + mov [blockq+2*16*(0+%3)], dc1w + mov [blockq+2*16*(1+%3)], dc2w + shr dc1d, 16 + shr dc2d, 16 psrlq m%1, 32 psrlq m%2, 32 - mov [r0+2*16*(4+%3)], r1w - mov [r0+2*16*(5+%3)], r2w - movd r1d, m%1 - movd r2d, m%2 - mov [r0+2*16*(8+%3)], r1w - mov [r0+2*16*(9+%3)], r2w - shr r1d, 16 - shr r2d, 16 - mov [r0+2*16*(12+%3)], r1w - mov [r0+2*16*(13+%3)], r2w + mov [blockq+2*16*(4+%3)], dc1w + mov [blockq+2*16*(5+%3)], dc2w + movd dc1d, m%1 + movd dc2d, m%2 + mov [blockq+2*16*(8+%3)], dc1w + mov [blockq+2*16*(9+%3)], dc2w + shr dc1d, 16 + shr dc2d, 16 + mov [blockq+2*16*(12+%3)], dc1w + mov [blockq+2*16*(13+%3)], dc2w %endmacro %macro HADAMARD4_1D 4 - SUMSUB_BADC m%2, m%1, m%4, m%3 - SUMSUB_BADC m%4, m%2, m%3, m%1 + SUMSUB_BADC w, %2, %1, %4, %3 + SUMSUB_BADC w, %4, %2, %3, %1 SWAP %1, %4, %3 %endmacro -INIT_MMX -cglobal vp8_luma_dc_wht_mmx, 2,3 - movq m0, [r1] - movq m1, [r1+8] - movq m2, [r1+16] - movq m3, [r1+24] +%macro VP8_DC_WHT 0 +cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2 + movq m0, [dc1q] + movq m1, [dc1q+8] + movq m2, [dc1q+16] + movq m3, [dc1q+24] +%if cpuflag(sse) + xorps xmm0, xmm0 + movaps [dc1q+ 0], xmm0 + movaps [dc1q+16], xmm0 +%else + pxor m4, m4 + movq [dc1q+ 0], m4 + movq [dc1q+ 8], m4 + movq [dc1q+16], m4 + movq [dc1q+24], m4 +%endif HADAMARD4_1D 0, 1, 2, 3 TRANSPOSE4x4W 0, 1, 2, 3, 4 paddw m0, [pw_3] @@ -1209,6 +1230,14 @@ cglobal vp8_luma_dc_wht_mmx, 2,3 SCATTER_WHT 0, 1, 0 SCATTER_WHT 2, 3, 2 RET +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +VP8_DC_WHT +%endif +INIT_MMX sse +VP8_DC_WHT ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter_simple_(uint8_t *dst, int stride, int flim); @@ -1332,7 +1361,7 @@ cglobal vp8_luma_dc_wht_mmx, 2,3 psrldq m%2, 4 %if %10 == 8 movd [%5+%8*2], m%1 - movd %5, m%3 + movd %5d, m%3 %endif psrldq m%3, 4 psrldq m%4, 4 @@ -1360,74 +1389,121 @@ cglobal vp8_luma_dc_wht_mmx, 2,3 movd [%7+%9*2], m%4 %endmacro -%macro SPLATB_REG_MMX 2-3 - movd %1, %2 - punpcklbw %1, %1 - punpcklwd %1, %1 - punpckldq %1, %1 -%endmacro +; write 4 or 8 words in the mmx/xmm registers as 8 lines +; 1 and 2 are the registers to write, this can be the same (for SSE2) +; for pre-SSE4: +; 3 is a general-purpose register that we will clobber +; for SSE4: +; 3 is a pointer to the destination's 5th line +; 4 is a pointer to the destination's 4th line +; 5/6 is -stride and +stride +%macro WRITE_2x4W 6 + movd %3d, %1 + punpckhdq %1, %1 + mov [%4+%5*4], %3w + shr %3, 16 + add %4, %6 + mov [%4+%5*4], %3w 
-%macro SPLATB_REG_MMXEXT 2-3 - movd %1, %2 - punpcklbw %1, %1 - pshufw %1, %1, 0x0 -%endmacro + movd %3d, %1 + add %4, %5 + mov [%4+%5*2], %3w + shr %3, 16 + mov [%4+%5 ], %3w -%macro SPLATB_REG_SSE2 2-3 - movd %1, %2 - punpcklbw %1, %1 - pshuflw %1, %1, 0x0 - punpcklqdq %1, %1 -%endmacro + movd %3d, %2 + punpckhdq %2, %2 + mov [%4 ], %3w + shr %3, 16 + mov [%4+%6 ], %3w -%macro SPLATB_REG_SSSE3 3 - movd %1, %2 - pshufb %1, %3 + movd %3d, %2 + add %4, %6 + mov [%4+%6 ], %3w + shr %3, 16 + mov [%4+%6*2], %3w + add %4, %5 %endmacro -%macro SIMPLE_LOOPFILTER 3 -cglobal vp8_%2_loop_filter_simple_%1, 3, %3 -%ifidn %2, h - mov r5, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack +%macro WRITE_8W 5 +%if cpuflag(sse4) + pextrw [%3+%4*4], %1, 0 + pextrw [%2+%4*4], %1, 1 + pextrw [%3+%4*2], %1, 2 + pextrw [%3+%4 ], %1, 3 + pextrw [%3 ], %1, 4 + pextrw [%2 ], %1, 5 + pextrw [%2+%5 ], %1, 6 + pextrw [%2+%5*2], %1, 7 +%else + movd %2d, %1 + psrldq %1, 4 + mov [%3+%4*4], %2w + shr %2, 16 + add %3, %5 + mov [%3+%4*4], %2w + + movd %2d, %1 + psrldq %1, 4 + add %3, %4 + mov [%3+%4*2], %2w + shr %2, 16 + mov [%3+%4 ], %2w + + movd %2d, %1 + psrldq %1, 4 + mov [%3 ], %2w + shr %2, 16 + mov [%3+%5 ], %2w + + movd %2d, %1 + add %3, %5 + mov [%3+%5 ], %2w + shr %2, 16 + mov [%3+%5*2], %2w %endif +%endmacro + +%macro SIMPLE_LOOPFILTER 2 +cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr %if mmsize == 8 ; mmx/mmxext - mov r3, 2 + mov cntrq, 2 %endif -%ifnidn %1, sse2 && mmsize == 16 +%if cpuflag(ssse3) pxor m0, m0 %endif - SPLATB_REG m7, r2, m0 ; splat "flim" into register + SPLATB_REG m7, flim, m0 ; splat "flim" into register ; set up indexes to address 4 rows - mov r2, r1 - neg r1 -%ifidn %2, h - lea r0, [r0+4*r2-2] - sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1 +%if mmsize == 8 + DEFINE_ARGS dst1, mstride, stride, cntr, dst2 +%else + DEFINE_ARGS dst1, mstride, stride, dst3, dst2 +%endif + mov strideq, mstrideq + neg mstrideq +%ifidn %1, h + lea dst1q, [dst1q+4*strideq-2] %endif %if mmsize == 8 ; mmx / mmxext -.next8px +.next8px: %endif -%ifidn %2, v +%ifidn %1, v ; read 4 half/full rows of pixels - mova m0, [r0+r1*2] ; p1 - mova m1, [r0+r1] ; p0 - mova m2, [r0] ; q0 - mova m3, [r0+r2] ; q1 + mova m0, [dst1q+mstrideq*2] ; p1 + mova m1, [dst1q+mstrideq] ; p0 + mova m2, [dst1q] ; q0 + mova m3, [dst1q+ strideq] ; q1 %else ; h - lea r4, [r0+r2] + lea dst2q, [dst1q+ strideq] %if mmsize == 8 ; mmx/mmxext - READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 + READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq %else ; sse2 - READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 + READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q %endif TRANSPOSE4x4W 0, 1, 2, 3, 4 - - mova [rsp], m0 ; store p1 - mova [rsp+mmsize], m3 ; store q1 %endif ; simple_limit @@ -1494,235 +1570,211 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3 psubusb m6, m3 ; p0+f2 ; store -%ifidn %2, v - mova [r0], m4 - mova [r0+r1], m6 +%ifidn %1, v + mova [dst1q], m4 + mova [dst1q+mstrideq], m6 %else ; h - mova m0, [rsp] ; p1 - SWAP 2, 4 ; p0 - SWAP 1, 6 ; q0 - mova m3, [rsp+mmsize] ; q1 + inc dst1q + SBUTTERFLY bw, 6, 4, 0 - TRANSPOSE4x4B 0, 1, 2, 3, 4 %if mmsize == 16 ; sse2 - add r3, r1 ; change from r4*8*stride to r0+8*stride - WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16 +%if cpuflag(sse4) + inc dst2q +%endif + WRITE_8W m6, dst2q, dst1q, mstrideq, strideq + lea dst2q, [dst3q+mstrideq+1] +%if cpuflag(sse4) + inc dst3q +%endif + 
WRITE_8W m4, dst3q, dst2q, mstrideq, strideq %else ; mmx/mmxext - WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 + WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq %endif %endif %if mmsize == 8 ; mmx/mmxext ; next 8 pixels -%ifidn %2, v - add r0, 8 ; advance 8 cols = pixels +%ifidn %1, v + add dst1q, 8 ; advance 8 cols = pixels %else ; h - lea r0, [r0+r2*8] ; advance 8 rows = lines + lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines %endif - dec r3 + dec cntrq jg .next8px -%ifidn %2, v REP_RET -%else ; h - mov rsp, r5 ; restore stack pointer - RET -%endif %else ; sse2 -%ifidn %2, h - mov rsp, r5 ; restore stack pointer -%endif RET %endif %endmacro -INIT_MMX -%define SPLATB_REG SPLATB_REG_MMX -SIMPLE_LOOPFILTER mmx, v, 4 -SIMPLE_LOOPFILTER mmx, h, 6 -%define SPLATB_REG SPLATB_REG_MMXEXT -SIMPLE_LOOPFILTER mmxext, v, 4 -SIMPLE_LOOPFILTER mmxext, h, 6 -INIT_XMM -%define SPLATB_REG SPLATB_REG_SSE2 -SIMPLE_LOOPFILTER sse2, v, 3 -SIMPLE_LOOPFILTER sse2, h, 6 -%define SPLATB_REG SPLATB_REG_SSSE3 -SIMPLE_LOOPFILTER ssse3, v, 3 -SIMPLE_LOOPFILTER ssse3, h, 6 +%if ARCH_X86_32 +INIT_MMX mmx +SIMPLE_LOOPFILTER v, 4 +SIMPLE_LOOPFILTER h, 5 +INIT_MMX mmx2 +SIMPLE_LOOPFILTER v, 4 +SIMPLE_LOOPFILTER h, 5 +%endif + +INIT_XMM sse2 +SIMPLE_LOOPFILTER v, 3 +SIMPLE_LOOPFILTER h, 5 +INIT_XMM ssse3 +SIMPLE_LOOPFILTER v, 3 +SIMPLE_LOOPFILTER h, 5 +INIT_XMM sse4 +SIMPLE_LOOPFILTER h, 5 ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter_inner_(uint8_t *dst, [uint8_t *v,] int stride, ; int flimE, int flimI, int hev_thr); ;----------------------------------------------------------------------------- -%macro INNER_LOOPFILTER 5 -%if %4 == 8 ; chroma -cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 -%define dst8_reg r1 -%define mstride_reg r2 -%define E_reg r3 -%define I_reg r4 -%define hev_thr_reg r5 +%macro INNER_LOOPFILTER 2 +%if %2 == 8 ; chroma +cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr %else ; luma -cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 -%define mstride_reg r1 -%define E_reg r2 -%define I_reg r3 -%define hev_thr_reg r4 -%ifdef m8 ; x86-64, sse2 -%define dst8_reg r4 -%elif mmsize == 16 ; x86-32, sse2 -%define dst8_reg r5 -%else ; x86-32, mmx/mmxext -%define cnt_reg r5 -%endif -%endif -%define dst_reg r0 -%define stride_reg E_reg -%define dst2_reg I_reg -%ifndef m8 -%define stack_reg hev_thr_reg +cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr %endif -%ifnidn %1, sse2 && mmsize == 16 +%if cpuflag(ssse3) pxor m7, m7 %endif - -%ifndef m8 ; mmx/mmxext or sse2 on x86-32 - ; splat function arguments - SPLATB_REG m0, E_reg, m7 ; E - SPLATB_REG m1, I_reg, m7 ; I - SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh - - ; align stack - mov stack_reg, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack -%ifidn %2, v - sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr - ; [3]=hev() result -%else ; h - sub rsp, mmsize * 5 ; extra storage space for transposes +%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr +%ifidn %1, v ; [3]=hev() result +%assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15) +%else ; h ; extra storage space for transposes +%assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15) %endif - -%define flim_E [rsp] -%define flim_I [rsp+mmsize] -%define hev_thr [rsp+mmsize*2] -%define mask_res [rsp+mmsize*3] -%define p0backup [rsp+mmsize*3] -%define q0backup [rsp+mmsize*4] - - mova flim_E, m0 - mova flim_I, m1 - mova hev_thr, m2 - -%else ; 
sse2 on x86-64 - -%define flim_E m9 -%define flim_I m10 -%define hev_thr m11 -%define mask_res m12 -%define p0backup m12 -%define q0backup m8 + ; splat function arguments + SPLATB_REG m0, flimEq, m7 ; E + SPLATB_REG m1, flimIq, m7 ; I + SPLATB_REG m2, hevthrq, m7 ; hev_thresh + + SUB rsp, pad + +%define m_flimE [rsp] +%define m_flimI [rsp+mmsize] +%define m_hevthr [rsp+mmsize*2] +%define m_maskres [rsp+mmsize*3] +%define m_p0backup [rsp+mmsize*3] +%define m_q0backup [rsp+mmsize*4] + + mova m_flimE, m0 + mova m_flimI, m1 + mova m_hevthr, m2 +%else +%define m_flimE m9 +%define m_flimI m10 +%define m_hevthr m11 +%define m_maskres m12 +%define m_p0backup m12 +%define m_q0backup m8 ; splat function arguments - SPLATB_REG flim_E, E_reg, m7 ; E - SPLATB_REG flim_I, I_reg, m7 ; I - SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh + SPLATB_REG m_flimE, flimEq, m7 ; E + SPLATB_REG m_flimI, flimIq, m7 ; I + SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh %endif -%if mmsize == 8 && %4 == 16 ; mmx/mmxext - mov cnt_reg, 2 +%if %2 == 8 ; chroma + DEFINE_ARGS dst1, dst8, mstride, stride, dst2 +%elif mmsize == 8 + DEFINE_ARGS dst1, mstride, stride, dst2, cntr + mov cntrq, 2 +%else + DEFINE_ARGS dst1, mstride, stride, dst2, dst8 %endif - mov stride_reg, mstride_reg - neg mstride_reg -%ifidn %2, h - lea dst_reg, [dst_reg + stride_reg*4-4] -%if %4 == 8 - lea dst8_reg, [dst8_reg+ stride_reg*4-4] + mov strideq, mstrideq + neg mstrideq +%ifidn %1, h + lea dst1q, [dst1q+strideq*4-4] +%if %2 == 8 ; chroma + lea dst8q, [dst8q+strideq*4-4] %endif %endif %if mmsize == 8 -.next8px +.next8px: %endif ; read - lea dst2_reg, [dst_reg + stride_reg] -%ifidn %2, v -%if %4 == 8 && mmsize == 16 + lea dst2q, [dst1q+strideq] +%ifidn %1, v +%if %2 == 8 && mmsize == 16 %define movrow movh %else %define movrow mova %endif - movrow m0, [dst_reg +mstride_reg*4] ; p3 - movrow m1, [dst2_reg+mstride_reg*4] ; p2 - movrow m2, [dst_reg +mstride_reg*2] ; p1 - movrow m5, [dst2_reg] ; q1 - movrow m6, [dst2_reg+ stride_reg] ; q2 - movrow m7, [dst2_reg+ stride_reg*2] ; q3 -%if mmsize == 16 && %4 == 8 - movhps m0, [dst8_reg+mstride_reg*4] - movhps m2, [dst8_reg+mstride_reg*2] - add dst8_reg, stride_reg - movhps m1, [dst8_reg+mstride_reg*4] - movhps m5, [dst8_reg] - movhps m6, [dst8_reg+ stride_reg] - movhps m7, [dst8_reg+ stride_reg*2] - add dst8_reg, mstride_reg + movrow m0, [dst1q+mstrideq*4] ; p3 + movrow m1, [dst2q+mstrideq*4] ; p2 + movrow m2, [dst1q+mstrideq*2] ; p1 + movrow m5, [dst2q] ; q1 + movrow m6, [dst2q+ strideq*1] ; q2 + movrow m7, [dst2q+ strideq*2] ; q3 +%if mmsize == 16 && %2 == 8 + movhps m0, [dst8q+mstrideq*4] + movhps m2, [dst8q+mstrideq*2] + add dst8q, strideq + movhps m1, [dst8q+mstrideq*4] + movhps m5, [dst8q] + movhps m6, [dst8q+ strideq ] + movhps m7, [dst8q+ strideq*2] + add dst8q, mstrideq %endif %elif mmsize == 8 ; mmx/mmxext (h) ; read 8 rows of 8px each - movu m0, [dst_reg +mstride_reg*4] - movu m1, [dst2_reg+mstride_reg*4] - movu m2, [dst_reg +mstride_reg*2] - movu m3, [dst_reg +mstride_reg] - movu m4, [dst_reg] - movu m5, [dst2_reg] - movu m6, [dst2_reg+ stride_reg] + movu m0, [dst1q+mstrideq*4] + movu m1, [dst2q+mstrideq*4] + movu m2, [dst1q+mstrideq*2] + movu m3, [dst1q+mstrideq ] + movu m4, [dst1q] + movu m5, [dst2q] + movu m6, [dst2q+ strideq ] ; 8x8 transpose TRANSPOSE4x4B 0, 1, 2, 3, 7 - mova q0backup, m1 - movu m7, [dst2_reg+ stride_reg*2] + mova m_q0backup, m1 + movu m7, [dst2q+ strideq*2] TRANSPOSE4x4B 4, 5, 6, 7, 1 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 SBUTTERFLY dq, 3, 7, 1 ; 
q2/q3 - mova m1, q0backup - mova q0backup, m2 ; store q0 + mova m1, m_q0backup + mova m_q0backup, m2 ; store q0 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 - mova p0backup, m5 ; store p0 + mova m_p0backup, m5 ; store p0 SWAP 1, 4 SWAP 2, 4 SWAP 6, 3 SWAP 5, 3 %else ; sse2 (h) -%if %4 == 16 - lea dst8_reg, [dst_reg + stride_reg*8] +%if %2 == 16 + lea dst8q, [dst1q+ strideq*8] %endif ; read 16 rows of 8px each, interleave - movh m0, [dst_reg +mstride_reg*4] - movh m1, [dst8_reg+mstride_reg*4] - movh m2, [dst_reg +mstride_reg*2] - movh m5, [dst8_reg+mstride_reg*2] - movh m3, [dst_reg +mstride_reg] - movh m6, [dst8_reg+mstride_reg] - movh m4, [dst_reg] - movh m7, [dst8_reg] + movh m0, [dst1q+mstrideq*4] + movh m1, [dst8q+mstrideq*4] + movh m2, [dst1q+mstrideq*2] + movh m5, [dst8q+mstrideq*2] + movh m3, [dst1q+mstrideq ] + movh m6, [dst8q+mstrideq ] + movh m4, [dst1q] + movh m7, [dst8q] punpcklbw m0, m1 ; A/I punpcklbw m2, m5 ; C/K punpcklbw m3, m6 ; D/L punpcklbw m4, m7 ; E/M - add dst8_reg, stride_reg - movh m1, [dst2_reg+mstride_reg*4] - movh m6, [dst8_reg+mstride_reg*4] - movh m5, [dst2_reg] - movh m7, [dst8_reg] + add dst8q, strideq + movh m1, [dst2q+mstrideq*4] + movh m6, [dst8q+mstrideq*4] + movh m5, [dst2q] + movh m7, [dst8q] punpcklbw m1, m6 ; B/J punpcklbw m5, m7 ; F/N - movh m6, [dst2_reg+ stride_reg] - movh m7, [dst8_reg+ stride_reg] + movh m6, [dst2q+ strideq ] + movh m7, [dst8q+ strideq ] punpcklbw m6, m7 ; G/O ; 8x16 transpose @@ -1730,10 +1782,10 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 %ifdef m8 SWAP 1, 8 %else - mova q0backup, m1 + mova m_q0backup, m1 %endif - movh m7, [dst2_reg+ stride_reg*2] - movh m1, [dst8_reg+ stride_reg*2] + movh m7, [dst2q+ strideq*2] + movh m1, [dst8q+ strideq*2] punpcklbw m7, m1 ; H/P TRANSPOSE4x4B 4, 5, 6, 7, 1 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 @@ -1743,14 +1795,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 SWAP 1, 8 SWAP 2, 8 %else - mova m1, q0backup - mova q0backup, m2 ; store q0 + mova m1, m_q0backup + mova m_q0backup, m2 ; store q0 %endif SBUTTERFLY dq, 1, 5, 2 ; p1/p0 %ifdef m12 SWAP 5, 12 %else - mova p0backup, m5 ; store p0 + mova m_p0backup, m5 ; store p0 %endif SWAP 1, 4 SWAP 2, 4 @@ -1783,8 +1835,8 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 psubusb m6, m5 ; q2-q1 por m6, m4 ; abs(q2-q1) -%ifidn %1, mmx - mova m4, flim_I +%if notcpuflag(mmx2) + mova m4, m_flimI pxor m3, m3 psubusb m0, m4 psubusb m1, m4 @@ -1805,15 +1857,15 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 ; normal_limit and high_edge_variance for p1-p0, q1-q0 SWAP 7, 3 ; now m7 is zero -%ifidn %2, v - movrow m3, [dst_reg +mstride_reg] ; p0 -%if mmsize == 16 && %4 == 8 - movhps m3, [dst8_reg+mstride_reg] +%ifidn %1, v + movrow m3, [dst1q+mstrideq ] ; p0 +%if mmsize == 16 && %2 == 8 + movhps m3, [dst8q+mstrideq ] %endif %elifdef m12 SWAP 3, 12 %else - mova m3, p0backup + mova m3, m_p0backup %endif mova m1, m2 @@ -1823,29 +1875,29 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 psubusb m1, m3 ; p1-p0 psubusb m6, m2 ; p0-p1 por m1, m6 ; abs(p1-p0) -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m6, m1 psubusb m1, m4 - psubusb m6, hev_thr + psubusb m6, m_hevthr pcmpeqb m1, m7 ; abs(p1-p0) <= I pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh pand m0, m1 - mova mask_res, m6 + mova m_maskres, m6 %else ; mmxext/sse2 pmaxub m0, m1 ; max_I SWAP 1, 4 ; max_hev_thresh %endif SWAP 6, 4 ; now m6 is I -%ifidn %2, v - movrow m4, [dst_reg] ; q0 -%if mmsize == 16 && %4 == 8 - movhps m4, [dst8_reg] +%ifidn %1, v + movrow m4, [dst1q] ; q0 +%if mmsize == 16 && %2 == 8 + movhps m4, 
[dst8q] %endif %elifdef m8 SWAP 4, 8 %else - mova m4, q0backup + mova m4, m_q0backup %endif mova m1, m4 SWAP 1, 4 @@ -1854,29 +1906,29 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 psubusb m1, m5 ; q0-q1 psubusb m7, m4 ; q1-q0 por m1, m7 ; abs(q1-q0) -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m7, m1 psubusb m1, m6 - psubusb m7, hev_thr + psubusb m7, m_hevthr pxor m6, m6 pcmpeqb m1, m6 ; abs(q1-q0) <= I pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh - mova m6, mask_res + mova m6, m_maskres pand m0, m1 ; abs([pq][321]-[pq][210]) <= I pand m6, m7 %else ; mmxext/sse2 pxor m7, m7 pmaxub m0, m1 pmaxub m6, m1 - psubusb m0, flim_I - psubusb m6, hev_thr + psubusb m0, m_flimI + psubusb m6, m_hevthr pcmpeqb m0, m7 ; max(abs(..)) <= I pcmpeqb m6, m7 ; !(max(abs..) > thresh) %endif %ifdef m12 SWAP 6, 12 %else - mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) + mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) %endif ; simple_limit @@ -1900,28 +1952,28 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 pand m7, [pb_FE] psrlq m7, 1 ; abs(q1-p1)/2 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 - psubusb m7, flim_E + psubusb m7, m_flimE pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E pand m0, m7 ; normal_limit result ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask %ifdef m8 ; x86-64 && sse2 mova m8, [pb_80] -%define pb_80_var m8 +%define m_pb_80 m8 %else ; x86-32 or mmx/mmxext -%define pb_80_var [pb_80] +%define m_pb_80 [pb_80] %endif mova m1, m4 mova m7, m3 - pxor m1, pb_80_var - pxor m7, pb_80_var + pxor m1, m_pb_80 + pxor m7, m_pb_80 psubsb m1, m7 ; (signed) q0-p0 mova m6, m2 mova m7, m5 - pxor m6, pb_80_var - pxor m7, pb_80_var + pxor m6, m_pb_80 + pxor m7, m_pb_80 psubsb m6, m7 ; (signed) p1-q1 - mova m7, mask_res + mova m7, m_maskres pandn m7, m6 paddsb m7, m1 paddsb m7, m1 @@ -1960,16 +2012,16 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 %ifdef m12 SWAP 6, 12 %else - mova m6, mask_res + mova m6, m_maskres %endif -%ifidn %1, mmx +%if notcpuflag(mmx2) mova m7, [pb_1] %else ; mmxext/sse2 pxor m7, m7 %endif pand m0, m6 pand m1, m6 -%ifidn %1, mmx +%if notcpuflag(mmx2) paddusb m0, m7 pand m1, [pb_FE] pandn m7, m0 @@ -1987,332 +2039,248 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 paddusb m2, m0 ; p1+a ; store -%ifidn %2, v - movrow [dst_reg +mstride_reg*2], m2 - movrow [dst_reg +mstride_reg ], m3 - movrow [dst_reg], m4 - movrow [dst_reg + stride_reg ], m5 -%if mmsize == 16 && %4 == 8 - movhps [dst8_reg+mstride_reg*2], m2 - movhps [dst8_reg+mstride_reg ], m3 - movhps [dst8_reg], m4 - movhps [dst8_reg+ stride_reg ], m5 +%ifidn %1, v + movrow [dst1q+mstrideq*2], m2 + movrow [dst1q+mstrideq ], m3 + movrow [dst1q], m4 + movrow [dst1q+ strideq ], m5 +%if mmsize == 16 && %2 == 8 + movhps [dst8q+mstrideq*2], m2 + movhps [dst8q+mstrideq ], m3 + movhps [dst8q], m4 + movhps [dst8q+ strideq ], m5 %endif %else ; h - add dst_reg, 2 - add dst2_reg, 2 + add dst1q, 2 + add dst2q, 2 ; 4x8/16 transpose TRANSPOSE4x4B 2, 3, 4, 5, 6 %if mmsize == 8 ; mmx/mmxext (h) - WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg + WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq %else ; sse2 (h) - lea dst8_reg, [dst8_reg+mstride_reg+2] - WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 + lea dst8q, [dst8q+mstrideq +2] + WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2 %endif %endif %if mmsize == 8 -%if %4 == 8 ; chroma -%ifidn %2, h - sub dst_reg, 2 +%if %2 == 8 ; chroma +%ifidn %1, h + sub dst1q, 2 %endif 
-    cmp        dst_reg, dst8_reg
-    mov        dst_reg, dst8_reg
+    cmp          dst1q, dst8q
+    mov          dst1q, dst8q
     jnz .next8px
 %else
-%ifidn %2, h
-    lea        dst_reg, [dst_reg + stride_reg*8-2]
+%ifidn %1, h
+    lea          dst1q, [dst1q+ strideq*8-2]
 %else ; v
-    add        dst_reg, 8
+    add          dst1q, 8
 %endif
-    dec        cnt_reg
+    dec          cntrq
     jg .next8px
 %endif
 %endif
 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
-    mov            rsp, stack_reg   ; restore stack pointer
+    ADD            rsp, pad
 %endif
     RET
 %endmacro

-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-INNER_LOOPFILTER mmx,    v, 6, 16, 0
-INNER_LOOPFILTER mmx,    h, 6, 16, 0
-INNER_LOOPFILTER mmx,    v, 6,  8, 0
-INNER_LOOPFILTER mmx,    h, 6,  8, 0
-
-%define SPLATB_REG SPLATB_REG_MMXEXT
-INNER_LOOPFILTER mmxext, v, 6, 16, 0
-INNER_LOOPFILTER mmxext, h, 6, 16, 0
-INNER_LOOPFILTER mmxext, v, 6,  8, 0
-INNER_LOOPFILTER mmxext, h, 6,  8, 0
-
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-INNER_LOOPFILTER sse2,   v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER sse2,   h, 5, 16, 13
-%else
-INNER_LOOPFILTER sse2,   h, 6, 16, 13
-%endif
-INNER_LOOPFILTER sse2,   v, 6,  8, 13
-INNER_LOOPFILTER sse2,   h, 6,  8, 13
+%if ARCH_X86_32
+INIT_MMX mmx
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8

-%define SPLATB_REG SPLATB_REG_SSSE3
-INNER_LOOPFILTER ssse3,  v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER ssse3,  h, 5, 16, 13
-%else
-INNER_LOOPFILTER ssse3,  h, 6, 16, 13
+INIT_MMX mmx2
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
 %endif
-INNER_LOOPFILTER ssse3,  v, 6,  8, 13
-INNER_LOOPFILTER ssse3,  h, 6,  8, 13
+
+INIT_XMM sse2
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
+
+INIT_XMM ssse3
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8

 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                       int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------

-; write 4 or 8 words in the mmx/xmm registers as 8 lines
-; 1 and 2 are the registers to write, this can be the same (for SSE2)
-; for pre-SSE4:
-; 3 is a general-purpose register that we will clobber
-; for SSE4:
-; 3 is a pointer to the destination's 5th line
-; 4 is a pointer to the destination's 4th line
-; 5/6 is -stride and +stride
-; 7 is optimization string
-%macro WRITE_8W 7
-%ifidn %7, sse4
-    pextrw [%4+%5*4], %1, 0
-    pextrw [%3+%5*4], %1, 1
-    pextrw [%4+%5*2], %1, 2
-    pextrw [%4+%5  ], %1, 3
-    pextrw [%4     ], %1, 4
-    pextrw [%3     ], %1, 5
-    pextrw [%3+%6  ], %1, 6
-    pextrw [%3+%6*2], %1, 7
-%else
-    movd       %3, %1
-%if mmsize == 8
-    punpckhdq  %1, %1
-%else
-    psrldq     %1, 4
-%endif
-    mov  [%4+%5*4], %3w
-    shr        %3, 16
-    add        %4, %6
-    mov  [%4+%5*4], %3w
-
-    movd       %3, %1
-%if mmsize == 16
-    psrldq     %1, 4
-%endif
-    add        %4, %5
-    mov  [%4+%5*2], %3w
-    shr        %3, 16
-    mov  [%4+%5  ], %3w
-
-    movd       %3, %2
-%if mmsize == 8
-    punpckhdq  %2, %2
-%else
-    psrldq     %2, 4
-%endif
-    mov  [%4     ], %3w
-    shr        %3, 16
-    mov  [%4+%6  ], %3w
-
-    movd       %3, %2
-    add        %4, %6
-    mov  [%4+%6  ], %3w
-    shr        %3, 16
-    mov  [%4+%6*2], %3w
-%if mmsize == 8
-    add        %4, %5
-%endif
-%endif
-%endmacro
-
-%macro MBEDGE_LOOPFILTER 5
-%if %4 == 8 ; chroma
-cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
-%define dst8_reg    r1
-%define mstride_reg r2
-%define E_reg       r3
-%define I_reg       r4
-%define hev_thr_reg r5
+%macro MBEDGE_LOOPFILTER 2
+%if %2 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
 %else ; luma
-cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
-%define mstride_reg r1
-%define E_reg       r2
-%define I_reg       r3
-%define hev_thr_reg r4
-%ifdef m8 ; x86-64, sse2
-%define dst8_reg    r4
-%elif mmsize == 16 ; x86-32, sse2
-%define dst8_reg    r5
-%else ; x86-32, mmx/mmxext
-%define cnt_reg     r5
-%endif
-%endif
-%define dst_reg     r0
-%define stride_reg  E_reg
-%define dst2_reg    I_reg
-%ifndef m8
-%define stack_reg   hev_thr_reg
+cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
 %endif

-%ifnidn %1, sse2 && mmsize == 16
+%if cpuflag(ssse3)
     pxor           m7, m7
 %endif
-
-%ifndef m8 ; mmx/mmxext or sse2 on x86-32
+
+%ifndef m8       ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+%if mmsize == 16 ;               [3]=hev() result
+                 ;               [4]=filter tmp result
+                 ;               [5]/[6] = p2/q2 backup
+                 ;               [7]=lim_res sign result
+%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15)
+%else ; 8        ; extra storage space for transposes
+%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15)
+%endif
     ; splat function arguments
-    SPLATB_REG     m0, E_reg, m7        ; E
-    SPLATB_REG     m1, I_reg, m7        ; I
-    SPLATB_REG     m2, hev_thr_reg, m7  ; hev_thresh
-
-    ; align stack
-    mov      stack_reg, rsp             ; backup stack pointer
-    and            rsp, ~(mmsize-1)     ; align stack
-    sub            rsp, mmsize * 8      ; stack layout: [0]=E, [1]=I, [2]=hev_thr
-                                        ;               [3]=hev() result
-                                        ;               [4]=filter tmp result
-                                        ;               [5]/[6] = p2/q2 backup
-                                        ;               [7]=lim_res sign result
-
-%define flim_E   [rsp]
-%define flim_I   [rsp+mmsize]
-%define hev_thr  [rsp+mmsize*2]
-%define mask_res [rsp+mmsize*3]
-%define lim_res  [rsp+mmsize*4]
-%define p0backup [rsp+mmsize*3]
-%define q0backup [rsp+mmsize*4]
-%define p2backup [rsp+mmsize*5]
-%define q2backup [rsp+mmsize*6]
-%define lim_sign [rsp+mmsize*7]
-
-    mova        flim_E, m0
-    mova        flim_I, m1
-    mova       hev_thr, m2
+    SPLATB_REG     m0, flimEq, m7   ; E
+    SPLATB_REG     m1, flimIq, m7   ; I
+    SPLATB_REG     m2, hevthrq, m7  ; hev_thresh
+
+    SUB            rsp, pad
+
+%define m_flimE    [rsp]
+%define m_flimI    [rsp+mmsize]
+%define m_hevthr   [rsp+mmsize*2]
+%define m_maskres  [rsp+mmsize*3]
+%define m_limres   [rsp+mmsize*4]
+%define m_p0backup [rsp+mmsize*3]
+%define m_q0backup [rsp+mmsize*4]
+%define m_p2backup [rsp+mmsize*5]
+%define m_q2backup [rsp+mmsize*6]
+%if mmsize == 16
+%define m_limsign  [rsp]
+%else
+%define m_limsign  [rsp+mmsize*7]
+%endif
+    mova      m_flimE, m0
+    mova      m_flimI, m1
+    mova     m_hevthr, m2
 %else ; sse2 on x86-64
-
-%define flim_E   m9
-%define flim_I   m10
-%define hev_thr  m11
-%define mask_res m12
-%define lim_res  m8
-%define p0backup m12
-%define q0backup m8
-%define p2backup m13
-%define q2backup m14
-%define lim_sign m15
+%define m_flimE    m9
+%define m_flimI    m10
+%define m_hevthr   m11
+%define m_maskres  m12
+%define m_limres   m8
+%define m_p0backup m12
+%define m_q0backup m8
+%define m_p2backup m13
+%define m_q2backup m14
+%define m_limsign  m9

     ; splat function arguments
-    SPLATB_REG  flim_E, E_reg, m7        ; E
-    SPLATB_REG  flim_I, I_reg, m7        ; I
-    SPLATB_REG hev_thr, hev_thr_reg, m7  ; hev_thresh
+    SPLATB_REG m_flimE, flimEq, m7   ; E
+    SPLATB_REG m_flimI, flimIq, m7   ; I
+    SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
 %endif

-%if mmsize == 8 && %4 == 16 ; mmx/mmxext
-    mov        cnt_reg, 2
+%if %2 == 8 ; chroma
+    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
+%elif mmsize == 8
+    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
+    mov          cntrq, 2
+%else
+    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
 %endif
-    mov     stride_reg, mstride_reg
-    neg    mstride_reg
-%ifidn %2, h
-    lea        dst_reg, [dst_reg + stride_reg*4-4]
-%if %4 == 8
-    lea       dst8_reg, [dst8_reg+ stride_reg*4-4]
+    mov        strideq, mstrideq
+    neg       mstrideq
+%ifidn %1, h
+    lea          dst1q, [dst1q+strideq*4-4]
+%if %2 == 8 ; chroma
+    lea          dst8q, [dst8q+strideq*4-4]
 %endif
 %endif

 %if mmsize == 8
-.next8px
+.next8px:
 %endif
     ; read
-    lea       dst2_reg, [dst_reg + stride_reg]
-%ifidn %2, v
-%if %4 == 8 && mmsize == 16
+    lea          dst2q, [dst1q+ strideq  ]
+%ifidn %1, v
+%if %2 == 8 && mmsize == 16
 %define movrow movh
 %else
 %define movrow mova
 %endif
-    movrow         m0, [dst_reg +mstride_reg*4] ; p3
-    movrow         m1, [dst2_reg+mstride_reg*4] ; p2
-    movrow         m2, [dst_reg +mstride_reg*2] ; p1
-    movrow         m5, [dst2_reg]               ; q1
-    movrow         m6, [dst2_reg+ stride_reg]   ; q2
-    movrow         m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %4 == 8
-    movhps         m0, [dst8_reg+mstride_reg*4]
-    movhps         m2, [dst8_reg+mstride_reg*2]
-    add      dst8_reg, stride_reg
-    movhps         m1, [dst8_reg+mstride_reg*4]
-    movhps         m5, [dst8_reg]
-    movhps         m6, [dst8_reg+ stride_reg]
-    movhps         m7, [dst8_reg+ stride_reg*2]
-    add      dst8_reg, mstride_reg
+    movrow         m0, [dst1q+mstrideq*4] ; p3
+    movrow         m1, [dst2q+mstrideq*4] ; p2
+    movrow         m2, [dst1q+mstrideq*2] ; p1
+    movrow         m5, [dst2q]            ; q1
+    movrow         m6, [dst2q+ strideq  ] ; q2
+    movrow         m7, [dst2q+ strideq*2] ; q3
+%if mmsize == 16 && %2 == 8
+    movhps         m0, [dst8q+mstrideq*4]
+    movhps         m2, [dst8q+mstrideq*2]
+    add          dst8q, strideq
+    movhps         m1, [dst8q+mstrideq*4]
+    movhps         m5, [dst8q]
+    movhps         m6, [dst8q+ strideq  ]
+    movhps         m7, [dst8q+ strideq*2]
+    add          dst8q, mstrideq
 %endif
 %elif mmsize == 8 ; mmx/mmxext (h)
     ; read 8 rows of 8px each
-    movu           m0, [dst_reg +mstride_reg*4]
-    movu           m1, [dst2_reg+mstride_reg*4]
-    movu           m2, [dst_reg +mstride_reg*2]
-    movu           m3, [dst_reg +mstride_reg]
-    movu           m4, [dst_reg]
-    movu           m5, [dst2_reg]
-    movu           m6, [dst2_reg+ stride_reg]
+    movu           m0, [dst1q+mstrideq*4]
+    movu           m1, [dst2q+mstrideq*4]
+    movu           m2, [dst1q+mstrideq*2]
+    movu           m3, [dst1q+mstrideq  ]
+    movu           m4, [dst1q]
+    movu           m5, [dst2q]
+    movu           m6, [dst2q+ strideq  ]

     ; 8x8 transpose
     TRANSPOSE4x4B   0, 1, 2, 3, 7
-    mova     q0backup, m1
-    movu           m7, [dst2_reg+ stride_reg*2]
+    mova   m_q0backup, m1
+    movu           m7, [dst2q+ strideq*2]
     TRANSPOSE4x4B   4, 5, 6, 7, 1
     SBUTTERFLY     dq, 0, 4, 1     ; p3/p2
     SBUTTERFLY     dq, 2, 6, 1     ; q0/q1
     SBUTTERFLY     dq, 3, 7, 1     ; q2/q3
-    mova           m1, q0backup
-    mova     q0backup, m2          ; store q0
+    mova           m1, m_q0backup
+    mova   m_q0backup, m2          ; store q0
     SBUTTERFLY     dq, 1, 5, 2     ; p1/p0
-    mova     p0backup, m5          ; store p0
+    mova   m_p0backup, m5          ; store p0
     SWAP            1, 4
     SWAP            2, 4
     SWAP            6, 3
     SWAP            5, 3
 %else ; sse2 (h)
-%if %4 == 16
-    lea       dst8_reg, [dst_reg + stride_reg*8]
+%if %2 == 16
+    lea          dst8q, [dst1q+ strideq*8 ]
 %endif

     ; read 16 rows of 8px each, interleave
-    movh           m0, [dst_reg +mstride_reg*4]
-    movh           m1, [dst8_reg+mstride_reg*4]
-    movh           m2, [dst_reg +mstride_reg*2]
-    movh           m5, [dst8_reg+mstride_reg*2]
-    movh           m3, [dst_reg +mstride_reg]
-    movh           m6, [dst8_reg+mstride_reg]
-    movh           m4, [dst_reg]
-    movh           m7, [dst8_reg]
+    movh           m0, [dst1q+mstrideq*4]
+    movh           m1, [dst8q+mstrideq*4]
+    movh           m2, [dst1q+mstrideq*2]
+    movh           m5, [dst8q+mstrideq*2]
+    movh           m3, [dst1q+mstrideq  ]
+    movh           m6, [dst8q+mstrideq  ]
+    movh           m4, [dst1q]
+    movh           m7, [dst8q]
     punpcklbw      m0, m1          ; A/I
     punpcklbw      m2, m5          ; C/K
     punpcklbw      m3, m6          ; D/L
     punpcklbw      m4, m7          ; E/M
-    add      dst8_reg, stride_reg
-    movh           m1, [dst2_reg+mstride_reg*4]
-    movh           m6, [dst8_reg+mstride_reg*4]
-    movh           m5, [dst2_reg]
-    movh           m7, [dst8_reg]
+    add          dst8q, strideq
+    movh           m1, [dst2q+mstrideq*4]
+    movh           m6, [dst8q+mstrideq*4]
+    movh           m5, [dst2q]
+    movh           m7, [dst8q]
     punpcklbw      m1, m6          ; B/J
     punpcklbw      m5, m7          ; F/N
-    movh           m6, [dst2_reg+ stride_reg]
-    movh           m7, [dst8_reg+ stride_reg]
+    movh           m6, [dst2q+ strideq  ]
+    movh           m7, [dst8q+ strideq  ]
     punpcklbw      m6, m7          ; G/O

     ; 8x16 transpose
@@ -2320,10 +2288,10 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %ifdef m8
     SWAP            1, 8
 %else
-    mova     q0backup, m1
+    mova   m_q0backup, m1
 %endif
-    movh           m7, [dst2_reg+ stride_reg*2]
-    movh           m1, [dst8_reg+ stride_reg*2]
+    movh           m7, [dst2q+ strideq*2]
+    movh           m1, [dst8q+ strideq*2]
     punpcklbw      m7, m1          ; H/P
     TRANSPOSE4x4B   4, 5, 6, 7, 1
     SBUTTERFLY     dq, 0, 4, 1     ; p3/p2
@@ -2333,14 +2301,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     SWAP            1, 8
     SWAP            2, 8
 %else
-    mova           m1, q0backup
-    mova     q0backup, m2          ; store q0
+    mova           m1, m_q0backup
+    mova   m_q0backup, m2          ; store q0
 %endif
     SBUTTERFLY     dq, 1, 5, 2     ; p1/p0
 %ifdef m12
     SWAP            5, 12
 %else
-    mova     p0backup, m5          ; store p0
+    mova   m_p0backup, m5          ; store p0
 %endif
     SWAP            1, 4
     SWAP            2, 4
@@ -2358,7 +2326,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     mova           m4, m2
     SWAP            4, 2
     psubusb        m4, m1          ; p1-p2
-    mova     p2backup, m1
+    mova   m_p2backup, m1
     psubusb        m1, m2          ; p2-p1
     por            m1, m4          ; abs(p2-p1)
@@ -2371,12 +2339,12 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     mova           m4, m5
     SWAP            4, 5
     psubusb        m4, m6          ; q1-q2
-    mova     q2backup, m6
+    mova   m_q2backup, m6
     psubusb        m6, m5          ; q2-q1
     por            m6, m4          ; abs(q2-q1)

-%ifidn %1, mmx
-    mova           m4, flim_I
+%if notcpuflag(mmx2)
+    mova           m4, m_flimI
     pxor           m3, m3
     psubusb        m0, m4
     psubusb        m1, m4
@@ -2397,15 +2365,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5

     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP            7, 3           ; now m7 is zero
-%ifidn %2, v
-    movrow         m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %4 == 8
-    movhps         m3, [dst8_reg+mstride_reg]
+%ifidn %1, v
+    movrow         m3, [dst1q+mstrideq  ] ; p0
+%if mmsize == 16 && %2 == 8
+    movhps         m3, [dst8q+mstrideq  ]
 %endif
 %elifdef m12
     SWAP            3, 12
 %else
-    mova           m3, p0backup
+    mova           m3, m_p0backup
 %endif

     mova           m1, m2
@@ -2415,29 +2383,29 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb        m1, m3          ; p1-p0
     psubusb        m6, m2          ; p0-p1
     por            m1, m6          ; abs(p1-p0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova           m6, m1
     psubusb        m1, m4
-    psubusb        m6, hev_thr
+    psubusb        m6, m_hevthr
     pcmpeqb        m1, m7          ; abs(p1-p0) <= I
     pcmpeqb        m6, m7          ; abs(p1-p0) <= hev_thresh
     pand           m0, m1
-    mova     mask_res, m6
+    mova    m_maskres, m6
 %else ; mmxext/sse2
     pmaxub         m0, m1          ; max_I
     SWAP            1, 4           ; max_hev_thresh
 %endif

     SWAP            6, 4           ; now m6 is I
-%ifidn %2, v
-    movrow         m4, [dst_reg]   ; q0
-%if mmsize == 16 && %4 == 8
-    movhps         m4, [dst8_reg]
+%ifidn %1, v
+    movrow         m4, [dst1q]     ; q0
+%if mmsize == 16 && %2 == 8
+    movhps         m4, [dst8q]
 %endif
 %elifdef m8
     SWAP            4, 8
 %else
-    mova           m4, q0backup
+    mova           m4, m_q0backup
 %endif
     mova           m1, m4
     SWAP            1, 4
@@ -2446,29 +2414,29 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb        m1, m5          ; q0-q1
     psubusb        m7, m4          ; q1-q0
     por            m1, m7          ; abs(q1-q0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova           m7, m1
     psubusb        m1, m6
-    psubusb        m7, hev_thr
+    psubusb        m7, m_hevthr
     pxor           m6, m6
     pcmpeqb        m1, m6          ; abs(q1-q0) <= I
     pcmpeqb        m7, m6          ; abs(q1-q0) <= hev_thresh
-    mova           m6, mask_res
+    mova           m6, m_maskres
     pand           m0, m1          ; abs([pq][321]-[pq][210]) <= I
     pand           m6, m7
 %else ; mmxext/sse2
     pxor           m7, m7
     pmaxub         m0, m1
     pmaxub         m6, m1
-    psubusb        m0, flim_I
-    psubusb        m6, hev_thr
+    psubusb        m0, m_flimI
+    psubusb        m6, m_hevthr
     pcmpeqb        m0, m7          ; max(abs(..)) <= I
     pcmpeqb        m6, m7          ; !(max(abs..) > thresh)
 %endif
 %ifdef m12
     SWAP            6, 12
 %else
-    mova     mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
+    mova    m_maskres, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
 %endif

     ; simple_limit
@@ -2492,39 +2460,39 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     pand           m7, [pb_FE]
     psrlq          m7, 1           ; abs(q1-p1)/2
     paddusb        m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
-    psubusb        m7, flim_E
+    psubusb        m7, m_flimE
     pcmpeqb        m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
     pand           m0, m7          ; normal_limit result

     ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
 %ifdef m8 ; x86-64 && sse2
     mova           m8, [pb_80]
-%define pb_80_var m8
+%define m_pb_80 m8
 %else ; x86-32 or mmx/mmxext
-%define pb_80_var [pb_80]
+%define m_pb_80 [pb_80]
 %endif
     mova           m1, m4
     mova           m7, m3
-    pxor           m1, pb_80_var
-    pxor           m7, pb_80_var
+    pxor           m1, m_pb_80
+    pxor           m7, m_pb_80
     psubsb         m1, m7          ; (signed) q0-p0
     mova           m6, m2
     mova           m7, m5
-    pxor           m6, pb_80_var
-    pxor           m7, pb_80_var
+    pxor           m6, m_pb_80
+    pxor           m7, m_pb_80
     psubsb         m6, m7          ; (signed) p1-q1
-    mova           m7, mask_res
+    mova           m7, m_maskres
     paddsb         m6, m1
     paddsb         m6, m1
     paddsb         m6, m1
     pand           m6, m0
 %ifdef m8
-    mova      lim_res, m6          ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
-    pand      lim_res, m7
+    mova     m_limres, m6          ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
+    pand     m_limres, m7
 %else
     mova           m0, m6
     pand           m0, m7
-    mova      lim_res, m0
+    mova     m_limres, m0
 %endif
     pandn          m7, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_common
@@ -2558,24 +2526,53 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb        m4, m1          ; q0-f1

     ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
+%if cpuflag(ssse3)
+    mova           m7, [pb_1]
+%else
     mova           m7, [pw_63]
+%endif
 %ifdef m8
     SWAP            1, 8
 %else
-    mova           m1, lim_res
+    mova           m1, m_limres
 %endif
     pxor           m0, m0
     mova           m6, m1
     pcmpgtb        m0, m1          ; which are negative
+%if cpuflag(ssse3)
+    punpcklbw      m6, m7          ; interleave with "1" for rounding
+    punpckhbw      m1, m7
+%else
     punpcklbw      m6, m0          ; signed byte->word
     punpckhbw      m1, m0
-    mova     lim_sign, m0
-    mova     mask_res, m6          ; backup for later in filter
-    mova      lim_res, m1
+%endif
+    mova    m_limsign, m0
+%if cpuflag(ssse3)
+    mova           m7, [pb_27_63]
+%ifndef m8
+    mova     m_limres, m1
+%endif
+%ifdef m10
+    SWAP            0, 10          ; don't lose lim_sign copy
+%endif
+    mova           m0, m7
+    pmaddubsw      m7, m6
+    SWAP            6, 7
+    pmaddubsw      m0, m1
+    SWAP            1, 0
+%ifdef m10
+    SWAP            0, 10
+%else
+    mova           m0, m_limsign
+%endif
+%else
+    mova    m_maskres, m6          ; backup for later in filter
+    mova     m_limres, m1
     pmullw         m6, [pw_27]
     pmullw         m1, [pw_27]
     paddw          m6, m7
     paddw          m1, m7
+%endif
     psraw          m6, 7
     psraw          m1, 7
     packsswb       m6, m1          ; a0
@@ -2583,18 +2580,39 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubb          m1, m6
     pand           m1, m0          ; -a0
     pandn          m0, m6          ; +a0
+%if cpuflag(ssse3)
+    mova           m6, [pb_18_63]  ; pipelining
+%endif
     psubusb        m3, m1
     paddusb        m4, m1
     paddusb        m3, m0          ; p0+a0
     psubusb        m4, m0          ; q0-a0
-    mova           m6, mask_res
-    mova           m1, lim_res
-    mova           m0, lim_sign
+%if cpuflag(ssse3)
+    SWAP            6, 7
+%ifdef m10
+    SWAP            1, 10
+%else
+    mova           m1, m_limres
+%endif
+    mova           m0, m7
+    pmaddubsw      m7, m6
+    SWAP            6, 7
+    pmaddubsw      m0, m1
+    SWAP            1, 0
+%ifdef m10
+    SWAP            0, 10
+%endif
+    mova           m0, m_limsign
+%else
+    mova           m6, m_maskres
+    mova           m1, m_limres
     pmullw         m6, [pw_18]
     pmullw         m1, [pw_18]
     paddw          m6, m7
     paddw          m1, m7
+%endif
+    mova           m0, m_limsign
     psraw          m6, 7
     psraw          m1, 7
     packsswb       m6, m1          ; a1
@@ -2602,26 +2620,43 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubb          m1, m6
     pand           m1, m0          ; -a1
     pandn          m0, m6          ; +a1
+%if cpuflag(ssse3)
+    mova           m6, [pb_9_63]
+%endif
     psubusb        m2, m1
     paddusb        m5, m1
     paddusb        m2, m0          ; p1+a1
     psubusb        m5, m0          ; q1-a1
+%if cpuflag(ssse3)
+    SWAP            6, 7
+%ifdef m10
+    SWAP            1, 10
+%else
+    mova           m1, m_limres
+%endif
+    mova           m0, m7
+    pmaddubsw      m7, m6
+    SWAP            6, 7
+    pmaddubsw      m0, m1
+    SWAP            1, 0
+%else
 %ifdef m8
     SWAP            6, 12
     SWAP            1, 8
 %else
-    mova           m6, mask_res
-    mova           m1, lim_res
+    mova           m6, m_maskres
+    mova           m1, m_limres
 %endif
     pmullw         m6, [pw_9]
     pmullw         m1, [pw_9]
     paddw          m6, m7
     paddw          m1, m7
-%ifdef m15
-    SWAP            7, 15
+%endif
+%ifdef m9
+    SWAP            7, 9
 %else
-    mova           m7, lim_sign
+    mova           m7, m_limsign
 %endif
     psraw          m6, 7
     psraw          m1, 7
@@ -2634,8 +2669,8 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     SWAP            1, 13
     SWAP            6, 14
 %else
-    mova           m1, p2backup
-    mova           m6, q2backup
+    mova           m1, m_p2backup
+    mova           m6, m_q2backup
 %endif
     psubusb        m1, m0
     paddusb        m6, m0
@@ -2643,110 +2678,102 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb        m6, m7          ; q1-a1

     ; store
-%ifidn %2, v
-    movrow [dst2_reg+mstride_reg*4], m1
-    movrow [dst_reg +mstride_reg*2], m2
-    movrow [dst_reg +mstride_reg ], m3
-    movrow [dst_reg], m4
-    movrow [dst2_reg], m5
-    movrow [dst2_reg+ stride_reg ], m6
-%if mmsize == 16 && %4 == 8
-    add       dst8_reg, mstride_reg
-    movhps [dst8_reg+mstride_reg*2], m1
-    movhps [dst8_reg+mstride_reg ], m2
-    movhps [dst8_reg], m3
-    add       dst8_reg, stride_reg
-    movhps [dst8_reg], m4
-    movhps [dst8_reg+ stride_reg ], m5
-    movhps [dst8_reg+ stride_reg*2], m6
+%ifidn %1, v
+    movrow [dst2q+mstrideq*4], m1
+    movrow [dst1q+mstrideq*2], m2
+    movrow [dst1q+mstrideq  ], m3
+    movrow [dst1q], m4
+    movrow [dst2q], m5
+    movrow [dst2q+ strideq  ], m6
+%if mmsize == 16 && %2 == 8
+    add          dst8q, mstrideq
+    movhps [dst8q+mstrideq*2], m1
+    movhps [dst8q+mstrideq  ], m2
+    movhps [dst8q], m3
+    add          dst8q, strideq
+    movhps [dst8q], m4
+    movhps [dst8q+ strideq  ], m5
+    movhps [dst8q+ strideq*2], m6
 %endif
 %else ; h
-    inc        dst_reg
-    inc       dst2_reg
+    inc          dst1q
+    inc          dst2q

     ; 4x8/16 transpose
-    TRANSPOSE4x4B   1, 2, 3, 4, 0
-    SBUTTERFLY     bw, 5, 6, 0
+    TRANSPOSE4x4B    1, 2, 3, 4, 0
+    SBUTTERFLY      bw, 5, 6, 0

 %if mmsize == 8 ; mmx/mmxext (h)
-    WRITE_4x2D      1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
-    add        dst_reg, 4
-    WRITE_8W       m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4
+    WRITE_4x2D       1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
+    add          dst1q, 4
+    WRITE_2x4W      m5, m6, dst2q, dst1q, mstrideq, strideq
 %else ; sse2 (h)
-    lea       dst8_reg, [dst8_reg+mstride_reg+1]
-    WRITE_4x4D      1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
-    lea        dst_reg, [dst2_reg+mstride_reg+4]
-    lea       dst8_reg, [dst8_reg+mstride_reg+4]
-    WRITE_8W       m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2
-%ifidn %2, sse4
-    lea        dst_reg, [dst8_reg+ stride_reg]
+    lea          dst8q, [dst8q+mstrideq+1]
+    WRITE_4x4D       1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
+    lea          dst1q, [dst2q+mstrideq+4]
+    lea          dst8q, [dst8q+mstrideq+4]
+%if cpuflag(sse4)
+    add          dst2q, 4
 %endif
-    WRITE_8W       m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2
+    WRITE_8W        m5, dst2q, dst1q, mstrideq, strideq
+%if cpuflag(sse4)
+    lea          dst2q, [dst8q+ strideq  ]
+%endif
+    WRITE_8W        m6, dst2q, dst8q, mstrideq, strideq
 %endif
 %endif

 %if mmsize == 8
-%if %4 == 8 ; chroma
-%ifidn %2, h
-    sub        dst_reg, 5
+%if %2 == 8 ; chroma
+%ifidn %1, h
+    sub          dst1q, 5
 %endif
-    cmp        dst_reg, dst8_reg
-    mov        dst_reg, dst8_reg
+    cmp          dst1q, dst8q
+    mov          dst1q, dst8q
     jnz .next8px
 %else
-%ifidn %2, h
-    lea        dst_reg, [dst_reg + stride_reg*8-5]
+%ifidn %1, h
+    lea          dst1q, [dst1q+ strideq*8-5]
 %else ; v
-    add        dst_reg, 8
+    add          dst1q, 8
 %endif
-    dec        cnt_reg
+    dec          cntrq
     jg .next8px
 %endif
 %endif
 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
-    mov            rsp, stack_reg   ; restore stack pointer
+    ADD            rsp, pad
 %endif
     RET
 %endmacro

-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
-MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
-MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
-MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0
-
-%define SPLATB_REG SPLATB_REG_MMXEXT
-MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
-MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0
-
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-MBEDGE_LOOPFILTER sse2,   v, 5, 16, 16
-%ifdef m8
-MBEDGE_LOOPFILTER sse2,   h, 5, 16, 16
-%else
-MBEDGE_LOOPFILTER sse2,   h, 6, 16, 16
-%endif
-MBEDGE_LOOPFILTER sse2,   v, 6,  8, 16
-MBEDGE_LOOPFILTER sse2,   h, 6,  8, 16
-
-%define SPLATB_REG SPLATB_REG_SSSE3
-MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 16
-%ifdef m8
-MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 16
-%else
-MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 16
-%endif
-MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 16
-MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 16
-
-%ifdef m8
-MBEDGE_LOOPFILTER sse4,   h, 5, 16, 16
-%else
-MBEDGE_LOOPFILTER sse4,   h, 6, 16, 16
-%endif
-MBEDGE_LOOPFILTER sse4,   h, 6,  8, 16
+%if ARCH_X86_32
+INIT_MMX mmx
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+
+INIT_MMX mmx2
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+%endif
+
+INIT_XMM sse2
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+
+INIT_XMM ssse3
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+
+INIT_XMM sse4
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER h, 8
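
Editor's note on the SSSE3 branch added above: instead of sign-extending the masked filter value (lim_res) to words, multiplying by pw_27/pw_18/pw_9 and adding pw_63 before the 7-bit shift, the new code interleaves each filter byte with a constant 1 (pb_1) and issues a single pmaddubsw against the byte pairs in pb_27_63, pb_18_63 and pb_9_63, so every word lane directly yields k*w + 63. The following scalar sketch of the three filter_mbedge taps is for orientation only; it is not part of the patch, the helper name is hypothetical, and av_clip_uint8() is the usual libavutil clip.

    /* Illustrative scalar model of the filter_mbedge taps computed above.
     * w is the masked filter value carried in lim_res
     * (3*(q0-p0)+(p1-q1), clamped and masked). */
    #include <stdint.h>
    #include "libavutil/common.h"

    static void mbedge_taps(int w, uint8_t *p2, uint8_t *p1, uint8_t *p0,
                            uint8_t *q0, uint8_t *q1, uint8_t *q2)
    {
        int a0 = (27 * w + 63) >> 7;    /* pw_27 / pb_27_63 path, applied to p0/q0 */
        int a1 = (18 * w + 63) >> 7;    /* pw_18 / pb_18_63 path, applied to p1/q1 */
        int a2 = ( 9 * w + 63) >> 7;    /* pw_9  / pb_9_63  path, applied to p2/q2 */

        *p0 = av_clip_uint8(*p0 + a0);  *q0 = av_clip_uint8(*q0 - a0);
        *p1 = av_clip_uint8(*p1 + a1);  *q1 = av_clip_uint8(*q1 - a1);
        *p2 = av_clip_uint8(*p2 + a2);  *q2 = av_clip_uint8(*q2 - a2);
    }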