;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Alex Izvorski <aizvorksi@gmail.com>
+;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION_RODATA
pw_1: times 8 dw 1
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
times 16 db 0
+mask_ac4: dw 0,-1,-1,-1, 0,-1,-1,-1
+mask_ac8: dw 0,-1,-1,-1,-1,-1,-1,-1
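+; pand masks used by hadamard_ac to drop the DC coefficient(s) from the AC sums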
SECTION .text
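; horizontal add helpers:
; HADDD  sums the packed dwords of %1 into its low dword (%2 = scratch)
; HADDW  first pmaddwds by pw_1 so eight words collapse to four dwords, then HADDD
; HADDUW splits unsigned words into even/odd dwords before summing, so it is
;        safe even when the words exceed the signed range pmaddwd assumes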
%macro HADDD 2 ; sum junk
+%if mmsize == 16
movhlps %2, %1
paddd %1, %2
pshuflw %2, %1, 0xE
paddd %1, %2
+%else
+ mova %2, %1
+ psrlq %2, 32
+ paddd %1, %2
+%endif
%endmacro
%macro HADDW 2 ; reg, tmp
    pmaddwd %1, [pw_1 GLOBAL]
    HADDD   %1, %2
%endmacro
+%macro HADDUW 2
+ mova %2, %1
+ pslld %1, 16
+ psrld %2, 16
+ psrld %1, 16
+ paddd %1, %2
+ HADDD %1, %2
+%endmacro
+
;=============================================================================
; SSD
;=============================================================================
-%macro SSD_INC_1x16P 0
- movq mm1, [r0]
- movq mm2, [r2]
- movq mm3, [r0+8]
- movq mm4, [r2+8]
-
- movq mm5, mm2
- movq mm6, mm4
- psubusb mm2, mm1
- psubusb mm4, mm3
- psubusb mm1, mm5
- psubusb mm3, mm6
- por mm1, mm2
- por mm3, mm4
-
- movq mm2, mm1
- movq mm4, mm3
- punpcklbw mm1, mm7
- punpcklbw mm3, mm7
- punpckhbw mm2, mm7
- punpckhbw mm4, mm7
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
- pmaddwd mm3, mm3
- pmaddwd mm4, mm4
-
- add r0, r1
- add r2, r3
- paddd mm0, mm1
- paddd mm0, mm2
- paddd mm0, mm3
- paddd mm0, mm4
-%endmacro
-
-%macro SSD_INC_1x8P 0
- movq mm1, [r0]
- movq mm2, [r2]
-
- movq mm5, mm2
- psubusb mm2, mm1
- psubusb mm1, mm5
- por mm1, mm2 ; mm1 = 8bit abs diff
-
- movq mm2, mm1
- punpcklbw mm1, mm7
- punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
-
- add r0, r1
- add r2, r3
- paddd mm0, mm1
- paddd mm0, mm2
-%endmacro
-
-%macro SSD_INC_1x4P 0
- movd mm1, [r0]
- movd mm2, [r2]
-
- movq mm5, mm2
- psubusb mm2, mm1
- psubusb mm1, mm5
- por mm1, mm2
- punpcklbw mm1, mm7
- pmaddwd mm1, mm1
-
- add r0, r1
- add r2, r3
- paddd mm0, mm1
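+; SSD over two full-register-wide loads (one row of a block wider than a
+; register, or two rows when the block is exactly one register wide):
+; %1-%4 = load offsets for the two pix1/pix2 pairs, %5 = add into m0 (else
+; initialize m0), %6 = advance r0/r2 by two rows afterwards. needs m7 = 0.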
+%macro SSD_FULL 6
+ mova m1, [r0+%1]
+ mova m2, [r2+%2]
+ mova m3, [r0+%3]
+ mova m4, [r2+%4]
+
+ mova m5, m2
+ mova m6, m4
+ psubusb m2, m1
+ psubusb m4, m3
+ psubusb m1, m5
+ psubusb m3, m6
+ por m1, m2
+ por m3, m4
+
+ mova m2, m1
+ mova m4, m3
+ punpcklbw m1, m7
+ punpcklbw m3, m7
+ punpckhbw m2, m7
+ punpckhbw m4, m7
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+
+%if %6
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+%endif
+ paddd m1, m2
+ paddd m3, m4
+%if %5
+ paddd m0, m1
+%else
+ SWAP m0, m1
+%endif
+ paddd m0, m3
%endmacro
-;-----------------------------------------------------------------------------
-; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-%macro SSD_MMX 2
-cglobal x264_pixel_ssd_%1x%2_mmx, 4,4
- pxor mm7, mm7 ; zero
- pxor mm0, mm0 ; mm0 holds the sum
-%rep %2
- SSD_INC_1x%1P
-%endrep
- movq mm1, mm0
- psrlq mm1, 32
- paddd mm0, mm1
- movd eax, mm0
- RET
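+; SSD over two half-register rows. both operands are unpacked against the same
+; (possibly uninitialized) m7 and then differenced with psubw, so whatever is
+; in m7 cancels out and no zeroing is needed (SSD_QUARTER relies on the same
+; trick). %1-%4 = load offsets, %5 = accumulate into m0, %6 = advance pointers.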
+%macro SSD_HALF 6
+ movh m1, [r0+%1]
+ movh m2, [r2+%2]
+ movh m3, [r0+%3]
+ movh m4, [r2+%4]
+
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ psubw m1, m2
+ psubw m3, m4
+ pmaddwd m1, m1
+ pmaddwd m3, m3
+
+%if %6
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+%endif
+%if %5
+ paddd m0, m1
+%else
+ SWAP m0, m1
+%endif
+ paddd m0, m3
%endmacro
-SSD_MMX 16, 16
-SSD_MMX 16, 8
-SSD_MMX 8, 16
-SSD_MMX 8, 8
-SSD_MMX 8, 4
-SSD_MMX 4, 8
-SSD_MMX 4, 4
-
-%macro SSD_INC_2x16P_SSE2 0
- movdqu xmm1, [r0]
- movdqu xmm2, [r2]
- movdqu xmm3, [r0+r1]
- movdqu xmm4, [r2+r3]
-
- movdqa xmm5, xmm1
- movdqa xmm6, xmm3
- psubusb xmm1, xmm2
- psubusb xmm3, xmm4
- psubusb xmm2, xmm5
- psubusb xmm4, xmm6
- por xmm1, xmm2
- por xmm3, xmm4
-
- movdqa xmm2, xmm1
- movdqa xmm4, xmm3
- punpcklbw xmm1, xmm7
- punpckhbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- punpckhbw xmm4, xmm7
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- pmaddwd xmm4, xmm4
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
-
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm0, xmm1
- paddd xmm0, xmm3
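+; SSD over 4-pixel rows, packing two rows into each register with pinsrd (SSE4.1)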
+%macro SSD_QUARTER 6
+ movd m1, [r0+%1]
+ movd m2, [r2+%2]
+ movd m3, [r0+%3]
+ movd m4, [r2+%4]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ pinsrd m1, [r0+%1], 1
+ pinsrd m2, [r2+%2], 1
+ pinsrd m3, [r0+%3], 1
+ pinsrd m4, [r2+%4], 1
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ psubw m1, m2
+ psubw m3, m4
+ pmaddwd m1, m1
+ pmaddwd m3, m3
+
+%if %6
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+%endif
+%if %5
+ paddd m0, m1
+%else
+ SWAP m0, m1
+%endif
+ paddd m0, m3
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-%macro SSD_SSE2 2
-cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
- pxor xmm7, xmm7
- pxor xmm0, xmm0
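+; dispatch on block width vs. register width: wider than a register uses two
+; SSD_FULLs per row pair, equal width uses one, narrower uses SSD_HALF.
+; m7 only needs zeroing for the SSD_FULL paths.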
+%macro SSD 3
+cglobal x264_pixel_ssd_%1x%2_%3, 4,4
+%if %1 >= mmsize
+ pxor m7, m7
+%endif
+%assign i 0
%rep %2/2
- SSD_INC_2x16P_SSE2
+%if %1 > mmsize
+ SSD_FULL 0, 0, mmsize, mmsize, i, 0
+ SSD_FULL r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/2-1
+%elif %1 == mmsize
+ SSD_FULL 0, 0, r1, r3, i, i<%2/2-1
+%else
+ SSD_HALF 0, 0, r1, r3, i, i<%2/2-1
+%endif
+%assign i i+1
%endrep
- HADDD xmm0, xmm1
- movd eax, xmm0
+ HADDD m0, m1
+ movd eax, m0
RET
%endmacro
-SSD_SSE2 16, 16
-SSD_SSE2 16, 8
+INIT_MMX
+SSD 16, 16, mmx
+SSD 16, 8, mmx
+SSD 8, 16, mmx
+SSD 8, 8, mmx
+SSD 8, 4, mmx
+SSD 4, 8, mmx
+SSD 4, 4, mmx
+INIT_XMM
+SSD 16, 16, sse2
+SSD 16, 8, sse2
+SSD 8, 16, sse2
+SSD 8, 8, sse2
+SSD 8, 4, sse2
+
+cglobal x264_pixel_ssd_4x8_sse4, 4,4
+ SSD_QUARTER 0, 0, r1, r3, 0, 1
+ SSD_QUARTER 0, 0, r1, r3, 1, 0
+ HADDD m0, m1
+ movd eax, m0
+ RET
+cglobal x264_pixel_ssd_4x4_sse4, 4,4
+ SSD_QUARTER 0, 0, r1, r3, 0, 0
+ HADDD m0, m1
+ movd eax, m0
+ RET
;=============================================================================
-; SATD
+; variance
;=============================================================================
-%macro LOAD_DIFF_4P 4 ; dst, tmp, [pix1], [pix2]
- movd %1, %3
- movd %2, %4
- punpcklbw %1, %2
- punpcklbw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro LOAD_DIFF_8P 4 ; dst, tmp, [pix1], [pix2]
- movq %1, %3
- movq %2, %4
- punpcklbw %1, %2
- punpcklbw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro LOAD_DIFF_8x4P 6 ; 4x dest, 2x temp
- LOAD_DIFF_8P %1, %5, [r0], [r2]
- LOAD_DIFF_8P %2, %6, [r0+r1], [r2+r3]
- LOAD_DIFF_8P %3, %5, [r0+2*r1], [r2+2*r3]
- LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5]
-%endmacro
-
-;;; row transform not used, because phaddw is much slower than paddw on a Conroe
-;%macro PHSUMSUB 3
-; movdqa %3, %1
-; phaddw %1, %2
-; phsubw %3, %2
-;%endmacro
-
-;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc
-; PHSUMSUB %1, %2, %5
-; PHSUMSUB %3, %4, %2
-; PHSUMSUB %1, %3, %4
-; PHSUMSUB %5, %2, %3
-;%endmacro
-
-%macro SUMSUB_BADC 4
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
-%endmacro
-
-%macro HADAMARD4_1D 4
- SUMSUB_BADC %1, %2, %3, %4
- SUMSUB_BADC %1, %3, %2, %4
-%endmacro
-
-%macro HADAMARD8_1D 8
- SUMSUB_BADC %1, %5, %2, %6
- SUMSUB_BADC %3, %7, %4, %8
- SUMSUB_BADC %1, %3, %2, %4
- SUMSUB_BADC %5, %7, %6, %8
- SUMSUB_BADC %1, %2, %3, %4
- SUMSUB_BADC %5, %6, %7, %8
-%endmacro
-
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4x2 to not shuffle registers
- mov%1 %5, %3
- punpckh%2 %3, %4
- punpckl%2 %5, %4
+%macro VAR_START 0
+ pxor m5, m5 ; sum
+ pxor m6, m6 ; sum squared
+ pxor m7, m7 ; zero
+%ifdef ARCH_X86_64
+ %define t3d r3d
+%else
+ %define t3d r2d
+%endif
%endmacro
-%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
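+; %1 = log2 of the pixel count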
+%macro VAR_END 1
+%if mmsize == 16
+ movhlps m0, m5
+ paddw m5, m0
+%endif
+ movifnidn r2d, r2m
+ movd r1d, m5
+ movd [r2], m5 ; return sum
+ imul r1d, r1d
+ HADDD m6, m1
+ shr r1d, %1
+ movd eax, m6
+ sub eax, r1d ; sqr - (sum * sum >> shift)
+ RET
%endmacro
-%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
- SBUTTERFLY dqa, dq, %1, %2, %5
- SBUTTERFLY dqa, dq, %3, %4, %2
- SBUTTERFLY dqa, qdq, %1, %3, %4
- SBUTTERFLY dqa, qdq, %5, %2, %3
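+; %1 = offset of the second load (r1 steps two rows per iteration, mmsize
+;      covers one 16-wide row per iteration), %2 = iteration count.
+; the pixel sum accumulates in m5 (via psadbw), the sum of squares in m6.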
+%macro VAR_2ROW 2
+ mov t3d, %2
+.loop:
+ mova m0, [r0]
+ mova m1, m0
+ mova m3, [r0+%1]
+ mova m2, m0
+ punpcklbw m0, m7
+ mova m4, m3
+ punpckhbw m1, m7
+%ifidn %1, r1
+ lea r0, [r0+%1*2]
+%else
+ add r0, r1
+%endif
+ punpckhbw m4, m7
+ psadbw m2, m7
+ paddw m5, m2
+ mova m2, m3
+ punpcklbw m3, m7
+ dec t3d
+ psadbw m2, m7
+ pmaddwd m0, m0
+ paddw m5, m2
+ pmaddwd m1, m1
+ paddd m6, m0
+ pmaddwd m3, m3
+ paddd m6, m1
+ pmaddwd m4, m4
+ paddd m6, m3
+ paddd m6, m4
+ jg .loop
%endmacro
-%macro TRANSPOSE2x4x4W 5 ; abcd-t -> abcd
- SBUTTERFLY dqa, wd, %1, %2, %5
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, dq, %1, %3, %4
- SBUTTERFLY2 dqa, dq, %5, %2, %3
- SBUTTERFLY dqa, qdq, %1, %3, %2
- SBUTTERFLY2 dqa, qdq, %4, %5, %3
-%endmacro
+;-----------------------------------------------------------------------------
+; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_pixel_var_16x16_mmxext, 2,3
+ VAR_START
+ VAR_2ROW 8, 16
+ VAR_END 8
+
+cglobal x264_pixel_var_8x8_mmxext, 2,3
+ VAR_START
+ VAR_2ROW r1, 4
+ VAR_END 6
+
+INIT_XMM
+cglobal x264_pixel_var_16x16_sse2, 2,3
+ VAR_START
+ VAR_2ROW r1, 8
+ VAR_END 8
+
+cglobal x264_pixel_var_8x8_sse2, 2,3
+ VAR_START
+ mov t3d, 4
+.loop:
+ movh m0, [r0]
+ movhps m0, [r0+r1]
+ lea r0, [r0+r1*2]
+ mova m1, m0
+ punpcklbw m0, m7
+ mova m2, m1
+ punpckhbw m1, m7
+ dec t3d
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ psadbw m2, m7
+ paddw m5, m2
+ paddd m6, m0
+ paddd m6, m1
+ jnz .loop
+ VAR_END 6
-%ifdef ARCH_X86_64
-%macro TRANSPOSE8x8W 9 ; abcdefgh-t -> afhdtecb
- SBUTTERFLY dqa, wd, %1, %2, %9
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- SBUTTERFLY dqa, dq, %9, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %9, %4, %5
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
-%endmacro
-%else
-%macro TRANSPOSE8x8W 9 ; abcdefgh -> afhdgecb
- movdqa [%9], %8
- SBUTTERFLY dqa, wd, %1, %2, %8
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- movdqa [%9], %8
- movdqa %8, [16+%9]
- SBUTTERFLY dqa, dq, %8, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %8, %4, %5
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
- movdqa %7, [%9+16]
-%endmacro
-%endif
-%macro ABS1_MMX 2 ; a, tmp
- pxor %2, %2
- psubw %2, %1
- pmaxsw %1, %2
-%endmacro
+;=============================================================================
+; SATD
+;=============================================================================
-%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
- pxor %3, %3
- pxor %4, %4
- psubw %3, %1
- psubw %4, %2
- pmaxsw %1, %3
- pmaxsw %2, %4
-%endmacro
+; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
+; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
+; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
+; whereas phaddw-based transform doesn't care what order the coefs end up in.
-%macro ABS1_SSSE3 2
- pabsw %1, %1
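+; butterfly on adjacent word pairs: the sums end up in %1, the differences in %2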
+%macro PHSUMSUB 3
+ movdqa m%3, m%1
+ phaddw m%1, m%2
+ phsubw m%3, m%2
+ SWAP %2, %3
%endmacro
-%macro ABS2_SSSE3 4
- pabsw %1, %1
- pabsw %2, %2
+%macro HADAMARD4_ROW_PHADD 5
+ PHSUMSUB %1, %2, %5
+ PHSUMSUB %3, %4, %5
+ PHSUMSUB %1, %3, %5
+ PHSUMSUB %2, %4, %5
+ SWAP %3, %4
%endmacro
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-
-%macro ABS4 6
- ABS2 %1, %2, %5, %6
- ABS2 %3, %4, %5, %6
+%macro HADAMARD4_1D 4
+ SUMSUB_BADC %1, %2, %3, %4
+ SUMSUB_BADC %1, %3, %2, %4
%endmacro
%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
- HADAMARD4_1D mm4, mm5, mm6, mm7
- TRANSPOSE4x4W mm4, mm5, mm6, mm7, %1
- HADAMARD4_1D mm4, mm7, %1, mm6
- ABS2 mm4, mm7, mm3, mm5
- ABS2 %1, mm6, mm3, mm5
- paddw %1, mm4
- paddw mm6, mm7
- pavgw %1, mm6
+ %xdefine %%n n%1
+ HADAMARD4_1D m4, m5, m6, m7
+ TRANSPOSE4x4W 4, 5, 6, 7, %%n
+ HADAMARD4_1D m4, m5, m6, m7
+ ABS2 m4, m5, m3, m %+ %%n
+ ABS2 m6, m7, m3, m %+ %%n
+ paddw m6, m4
+ paddw m7, m5
+ pavgw m6, m7
+ SWAP %%n, 6
%endmacro
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
-; clobber: mm3..mm7
+; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
- LOAD_DIFF_4P mm4, mm3, [r0+%2], [r2+%2]
- LOAD_DIFF_4P mm5, mm3, [r0+r1+%2], [r2+r3+%2]
- LOAD_DIFF_4P mm6, mm3, [r0+2*r1+%2], [r2+2*r3+%2]
- LOAD_DIFF_4P mm7, mm3, [r0+r4+%2], [r2+r5+%2]
+ LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2]
+ LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2]
+ LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
+ LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2]
%if %3
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
HADAMARD4x4_SUM %1
%endmacro
-%macro SATD_8x4_START 1
- SATD_4x4_MMX mm0, 0, 0
- SATD_4x4_MMX mm1, 4, %1
-%endmacro
-
-%macro SATD_8x4_INC 1
- SATD_4x4_MMX mm2, 0, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 4, %1
- paddw mm0, mm2
-%endmacro
-
-%macro SATD_16x4_START 1
- SATD_4x4_MMX mm0, 0, 0
- SATD_4x4_MMX mm1, 4, 0
- SATD_4x4_MMX mm2, 8, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 12, %1
- paddw mm0, mm2
-%endmacro
-
-%macro SATD_16x4_INC 1
- SATD_4x4_MMX mm2, 0, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 4, 0
- paddw mm0, mm2
- SATD_4x4_MMX mm2, 8, 0
- paddw mm0, mm1
- SATD_4x4_MMX mm1, 12, %1
- paddw mm0, mm2
-%endmacro
-
%macro SATD_8x4_SSE2 1
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-%if %1
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
+ HADAMARD4_1D m0, m1, m2, m3
+%ifidn %1, ssse3_phadd
+ HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4
+%else
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+ HADAMARD4_1D m0, m1, m2, m3
%endif
- HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
- TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
- HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
- ABS4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- paddusw xmm0, xmm1
- paddusw xmm2, xmm3
- paddusw xmm6, xmm0
- paddusw xmm6, xmm2
+ ABS4 m0, m1, m2, m3, m4, m5
+ paddusw m0, m1
+ paddusw m2, m3
+ paddusw m6, m0
+ paddusw m6, m2
%endmacro
%macro SATD_START_MMX 0
    lea  r4, [3*r1] ; 3*stride1
    lea  r5, [3*r3] ; 3*stride2
%endmacro
%macro SATD_END_MMX 0
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- pshufw mm1, mm0, 10110001b
- paddw mm0, mm1
- movd eax, mm0
- and eax, 0xffff
+ pshufw m1, m0, 01001110b
+ paddw m0, m1
+ pshufw m1, m0, 10110001b
+ paddw m0, m1
+ movd eax, m0
+ and eax, 0xffff
RET
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
+INIT_MMX
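+; internal satd helpers: the caller zeroes m0 and sets r4/r5 = 3*stride1/3*stride2
+; (SATD_START_MMX); each call adds one block's satd into m0.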
+cglobal x264_pixel_satd_16x4_internal_mmxext
+ SATD_4x4_MMX m2, 0, 0
+ SATD_4x4_MMX m1, 4, 0
+ paddw m0, m2
+ SATD_4x4_MMX m2, 8, 0
+ paddw m0, m1
+ SATD_4x4_MMX m1, 12, 0
+ paddw m0, m2
+ paddw m0, m1
+ ret
+
+cglobal x264_pixel_satd_8x8_internal_mmxext
+ SATD_4x4_MMX m2, 0, 0
+ SATD_4x4_MMX m1, 4, 1
+ paddw m0, m2
+ paddw m0, m1
+x264_pixel_satd_8x4_internal_mmxext:
+ SATD_4x4_MMX m2, 0, 0
+ SATD_4x4_MMX m1, 4, 0
+ paddw m0, m2
+ paddw m0, m1
+ ret
+
cglobal x264_pixel_satd_16x16_mmxext, 4,6
SATD_START_MMX
- SATD_16x4_START 1
- SATD_16x4_INC 1
- SATD_16x4_INC 1
- SATD_16x4_INC 0
- paddw mm0, mm1
- pxor mm3, mm3
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- punpcklwd mm0, mm3
- pshufw mm1, mm0, 01001110b
- paddd mm0, mm1
- movd eax, mm0
+ pxor m0, m0
+%rep 3
+ call x264_pixel_satd_16x4_internal_mmxext
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%endrep
+ call x264_pixel_satd_16x4_internal_mmxext
+ HADDUW m0, m1
+ movd eax, m0
RET
cglobal x264_pixel_satd_16x8_mmxext, 4,6
SATD_START_MMX
- SATD_16x4_START 1
- SATD_16x4_INC 0
- paddw mm0, mm1
+ pxor m0, m0
+ call x264_pixel_satd_16x4_internal_mmxext
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call x264_pixel_satd_16x4_internal_mmxext
SATD_END_MMX
cglobal x264_pixel_satd_8x16_mmxext, 4,6
SATD_START_MMX
- SATD_8x4_START 1
- SATD_8x4_INC 1
- SATD_8x4_INC 1
- SATD_8x4_INC 0
- paddw mm0, mm1
+ pxor m0, m0
+ call x264_pixel_satd_8x8_internal_mmxext
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call x264_pixel_satd_8x8_internal_mmxext
SATD_END_MMX
cglobal x264_pixel_satd_8x8_mmxext, 4,6
SATD_START_MMX
- SATD_8x4_START 1
- SATD_8x4_INC 0
- paddw mm0, mm1
+ pxor m0, m0
+ call x264_pixel_satd_8x8_internal_mmxext
SATD_END_MMX
cglobal x264_pixel_satd_8x4_mmxext, 4,6
SATD_START_MMX
- SATD_8x4_START 0
- paddw mm0, mm1
+ pxor m0, m0
+ call x264_pixel_satd_8x4_internal_mmxext
SATD_END_MMX
cglobal x264_pixel_satd_4x8_mmxext, 4,6
SATD_START_MMX
- SATD_4x4_MMX mm0, 0, 1
- SATD_4x4_MMX mm1, 0, 0
- paddw mm0, mm1
+ SATD_4x4_MMX m0, 0, 1
+ SATD_4x4_MMX m1, 0, 0
+ paddw m0, m1
SATD_END_MMX
-cglobal x264_pixel_satd_4x4_mmxext, 4,6
+%macro SATD_W4 1
+INIT_MMX
+cglobal x264_pixel_satd_4x4_%1, 4,6
SATD_START_MMX
- SATD_4x4_MMX mm0, 0, 0
+ SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
+%endmacro
-
+SATD_W4 mmxext
%macro SATD_START_SSE2 0
- pxor xmm6, xmm6
- lea r4, [3*r1]
- lea r5, [3*r3]
+ pxor m6, m6
+ lea r4, [3*r1]
+ lea r5, [3*r3]
%endmacro
%macro SATD_END_SSE2 0
- picgetgot ebx
- psrlw xmm6, 1
- HADDW xmm6, xmm7
- movd eax, xmm6
+ psrlw m6, 1
+ HADDW m6, m7
+ movd eax, m6
RET
%endmacro
%macro BACKUP_POINTERS 0
%ifdef ARCH_X86_64
- mov r10, r0
- mov r11, r2
+ mov r10, r0
+ mov r11, r2
%endif
%endmacro
; int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 1
+INIT_XMM
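+; internal: needs r4/r5 = 3*stride (set by SATD_START_SSE2) and accumulates into m6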
+cglobal x264_pixel_satd_8x8_internal_%1
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
+ SATD_8x4_SSE2 %1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+x264_pixel_satd_8x4_internal_%1:
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
+x264_pixel_satd_4x8_internal_%1:
+ SAVE_MM_PERMUTATION satd_4x8_internal
+ SATD_8x4_SSE2 %1
+ ret
+
cglobal x264_pixel_satd_16x16_%1, 4,6
SATD_START_SSE2
BACKUP_POINTERS
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 0
+ call x264_pixel_satd_8x8_internal_%1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call x264_pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 0
+ call x264_pixel_satd_8x8_internal_%1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
cglobal x264_pixel_satd_16x8_%1, 4,6
SATD_START_SSE2
BACKUP_POINTERS
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 0
+ call x264_pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 0
+ call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
cglobal x264_pixel_satd_8x16_%1, 4,6
SATD_START_SSE2
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 0
+ call x264_pixel_satd_8x8_internal_%1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
cglobal x264_pixel_satd_8x8_%1, 4,6
SATD_START_SSE2
- SATD_8x4_SSE2 1
- SATD_8x4_SSE2 0
+ call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
cglobal x264_pixel_satd_8x4_%1, 4,6
SATD_START_SSE2
- SATD_8x4_SSE2 0
+ call x264_pixel_satd_8x4_internal_%1
+ SATD_END_SSE2
+
+cglobal x264_pixel_satd_4x8_%1, 4,6
+ INIT_XMM
+ LOAD_MM_PERMUTATION satd_4x8_internal
+ %define movh movd
+ SATD_START_SSE2
+ LOAD_DIFF m0, m7, m6, [r0], [r2]
+ LOAD_DIFF m1, m7, m6, [r0+r1], [r2+r3]
+ LOAD_DIFF m2, m7, m6, [r0+2*r1], [r2+2*r3]
+ LOAD_DIFF m3, m7, m6, [r0+r4], [r2+r5]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ LOAD_DIFF m4, m7, m6, [r0], [r2]
+ LOAD_DIFF m5, m7, m6, [r0+r1], [r2+r3]
+ punpcklqdq m0, m4
+ punpcklqdq m1, m5
+ LOAD_DIFF m4, m7, m6, [r0+2*r1], [r2+2*r3]
+ LOAD_DIFF m5, m7, m6, [r0+r4], [r2+r5]
+ punpcklqdq m2, m4
+ punpcklqdq m3, m5
+ %define movh movq
+ call x264_pixel_satd_4x8_internal_%1
SATD_END_SSE2
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_%1
+cglobal x264_pixel_sa8d_8x8_internal_%1
+ lea r10, [r0+4*r1]
+ lea r11, [r2+4*r3]
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m8, m9, r0, r2
+ LOAD_DIFF_8x4P m4, m5, m6, m7, m8, m9, r10, r11
+
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+
+ ABS4 m0, m1, m2, m3, m8, m9
+ ABS4 m4, m5, m6, m7, m8, m9
+ paddusw m0, m1
+ paddusw m2, m3
+ paddusw m4, m5
+ paddusw m6, m7
+ paddusw m0, m2
+ paddusw m4, m6
+ pavgw m0, m4
+ ret
+
+cglobal x264_pixel_sa8d_8x8_%1, 4,6
lea r4, [3*r1]
lea r5, [3*r3]
-.skip_lea:
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm8, xmm9
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
-
- HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
- HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
-
- ABS4 xmm0, xmm1, xmm2, xmm3, xmm6, xmm9
- ABS4 xmm4, xmm5, xmm7, xmm8, xmm6, xmm9
- paddusw xmm0, xmm1
- paddusw xmm2, xmm3
- paddusw xmm4, xmm5
- paddusw xmm7, xmm8
- paddusw xmm0, xmm2
- paddusw xmm4, xmm7
- pavgw xmm0, xmm4
- HADDW xmm0, xmm1
- movd eax, xmm0
- add r10d, eax ; preserve rounding for 16x16
+ call x264_pixel_sa8d_8x8_internal_%1
+ HADDW m0, m1
+ movd eax, m0
add eax, 1
shr eax, 1
ret
-cglobal x264_pixel_sa8d_16x16_%1
- xor r10d, r10d
- call x264_pixel_sa8d_8x8_%1 ; pix[0]
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride]
- neg r4 ; it's already r1*3
- neg r5
- lea r0, [r0+4*r4+8]
- lea r2, [r2+4*r5+8]
- call x264_pixel_sa8d_8x8_%1 ; pix[8]
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride+8]
- mov eax, r10d
+cglobal x264_pixel_sa8d_16x16_%1, 4,6
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
+ add r0, 8
+ add r2, 8
+ mova m10, m0
+ call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
+ lea r0, [r0+8*r1]
+ lea r2, [r2+8*r3]
+ paddusw m10, m0
+ call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
+ sub r0, 8
+ sub r2, 8
+ paddusw m10, m0
+ call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
+ paddusw m0, m10
+ HADDUW m0, m1
+ movd eax, m0
add eax, 1
shr eax, 1
ret
+
%else ; ARCH_X86_32
+cglobal x264_pixel_sa8d_8x8_internal_%1
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m6, m7
+ movdqa [esp+4], m2
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ LOAD_DIFF_8x4P m4, m5, m6, m7, m2, m2
+ movdqa m2, [esp+4]
+
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [esp+4], [esp+20]
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+
+%ifidn %1, sse2
+ movdqa [esp+4], m4
+ movdqa [esp+20], m2
+%endif
+ ABS2 m6, m3, m4, m2
+ ABS2 m0, m7, m4, m2
+ paddusw m0, m6
+ paddusw m7, m3
+%ifidn %1, sse2
+ movdqa m4, [esp+4]
+ movdqa m2, [esp+20]
+%endif
+ ABS2 m5, m1, m6, m3
+ ABS2 m4, m2, m6, m3
+ paddusw m5, m1
+ paddusw m4, m2
+ paddusw m0, m7
+ paddusw m5, m4
+ pavgw m0, m5
+ ret
+%endif ; ARCH
+%endmacro ; SATDS_SSE2
+
+%macro SA8D_16x16_32 1
+%ifndef ARCH_X86_64
cglobal x264_pixel_sa8d_8x8_%1, 4,7
mov r6, esp
and esp, ~15
sub esp, 32
lea r4, [3*r1]
lea r5, [3*r3]
- LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm6, xmm7
- movdqa [esp], xmm2
+ call x264_pixel_sa8d_8x8_internal_%1
+ HADDW m0, m1
+ movd eax, m0
+ add eax, 1
+ shr eax, 1
+ mov esp, r6
+ RET
+
+cglobal x264_pixel_sa8d_16x16_%1, 4,7
+ mov r6, esp
+ and esp, ~15
+ sub esp, 48
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ call x264_pixel_sa8d_8x8_internal_%1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm2, xmm2
- movdqa xmm2, [esp]
-
- HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, esp
- HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm6, xmm4, xmm2, xmm1
-
-%ifidn %1, sse2
- movdqa [esp], xmm6
- movdqa [esp+16], xmm7
+ mova [esp+32], m0
+ call x264_pixel_sa8d_8x8_internal_%1
+ mov r0, [r6+20]
+ mov r2, [r6+28]
+ add r0, 8
+ add r2, 8
+ paddusw m0, [esp+32]
+ mova [esp+32], m0
+ call x264_pixel_sa8d_8x8_internal_%1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%if mmsize == 16
+ paddusw m0, [esp+32]
%endif
- ABS2 xmm2, xmm3, xmm6, xmm7
- ABS2 xmm0, xmm1, xmm6, xmm7
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
-%ifidn %1, sse2
- movdqa xmm6, [esp]
- movdqa xmm7, [esp+16]
+ mova [esp+48-mmsize], m0
+ call x264_pixel_sa8d_8x8_internal_%1
+ paddusw m0, [esp+48-mmsize]
+%if mmsize == 16
+ HADDUW m0, m1
+%else
+ mova m2, [esp+32]
+ pxor m7, m7
+ mova m1, m0
+ mova m3, m2
+ punpcklwd m0, m7
+ punpckhwd m1, m7
+ punpcklwd m2, m7
+ punpckhwd m3, m7
+ paddd m0, m1
+ paddd m2, m3
+ paddd m0, m2
+ HADDD m0, m1
%endif
- ABS2 xmm4, xmm5, xmm2, xmm3
- ABS2 xmm6, xmm7, xmm2, xmm3
- paddusw xmm4, xmm5
- paddusw xmm6, xmm7
- paddusw xmm0, xmm1
- paddusw xmm4, xmm6
- pavgw xmm0, xmm4
- picgetgot ebx
- HADDW xmm0, xmm1
- movd eax, xmm0
- mov ecx, eax ; preserve rounding for 16x16
+ movd eax, m0
add eax, 1
shr eax, 1
mov esp, r6
RET
-%endif ; ARCH
-%endmacro ; SATDS_SSE2
-
-%macro SA8D_16x16_32 1
-%ifndef ARCH_X86_64
-cglobal x264_pixel_sa8d_16x16_%1
- push ebp
- push dword [esp+20] ; stride2
- push dword [esp+20] ; pix2
- push dword [esp+20] ; stride1
- push dword [esp+20] ; pix1
- call x264_pixel_sa8d_8x8_%1
- mov ebp, ecx
- add dword [esp+0], 8 ; pix1+8
- add dword [esp+8], 8 ; pix2+8
- call x264_pixel_sa8d_8x8_%1
- add ebp, ecx
- mov eax, [esp+4]
- mov edx, [esp+12]
- shl eax, 3
- shl edx, 3
- add [esp+0], eax ; pix1+8*stride1+8
- add [esp+8], edx ; pix2+8*stride2+8
- call x264_pixel_sa8d_8x8_%1
- add ebp, ecx
- sub dword [esp+0], 8 ; pix1+8*stride1
- sub dword [esp+8], 8 ; pix2+8*stride2
- call x264_pixel_sa8d_8x8_%1
- lea eax, [ebp+ecx+1]
- shr eax, 1
- add esp, 16
- pop ebp
- ret
%endif ; !ARCH_X86_64
%endmacro ; SA8D_16x16_32
%macro INTRA_SA8D_SSE2 1
%ifdef ARCH_X86_64
+INIT_XMM
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_%1
; 8x8 hadamard
- pxor xmm4, xmm4
- movq xmm0, [r0+0*FENC_STRIDE]
- movq xmm7, [r0+1*FENC_STRIDE]
- movq xmm6, [r0+2*FENC_STRIDE]
- movq xmm3, [r0+3*FENC_STRIDE]
- movq xmm5, [r0+4*FENC_STRIDE]
- movq xmm1, [r0+5*FENC_STRIDE]
- movq xmm8, [r0+6*FENC_STRIDE]
- movq xmm2, [r0+7*FENC_STRIDE]
- punpcklbw xmm0, xmm4
- punpcklbw xmm7, xmm4
- punpcklbw xmm6, xmm4
- punpcklbw xmm3, xmm4
- punpcklbw xmm5, xmm4
- punpcklbw xmm1, xmm4
- punpcklbw xmm8, xmm4
- punpcklbw xmm2, xmm4
- HADAMARD8_1D xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
- TRANSPOSE8x8W xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
- HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ pxor m8, m8
+ movq m0, [r0+0*FENC_STRIDE]
+ movq m1, [r0+1*FENC_STRIDE]
+ movq m2, [r0+2*FENC_STRIDE]
+ movq m3, [r0+3*FENC_STRIDE]
+ movq m4, [r0+4*FENC_STRIDE]
+ movq m5, [r0+5*FENC_STRIDE]
+ movq m6, [r0+6*FENC_STRIDE]
+ movq m7, [r0+7*FENC_STRIDE]
+ punpcklbw m0, m8
+ punpcklbw m1, m8
+ punpcklbw m2, m8
+ punpcklbw m3, m8
+ punpcklbw m4, m8
+ punpcklbw m5, m8
+ punpcklbw m6, m8
+ punpcklbw m7, m8
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
; dc
movzx edi, word [r1+0]
and edi, -16
shl edi, 2
- pxor xmm15, xmm15
- movdqa xmm8, xmm2
- movdqa xmm9, xmm3
- movdqa xmm10, xmm4
- movdqa xmm11, xmm5
- ABS4 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13
- paddusw xmm8, xmm10
- paddusw xmm9, xmm11
+ pxor m15, m15
+ movdqa m8, m2
+ movdqa m9, m3
+ movdqa m10, m4
+ movdqa m11, m5
+ ABS4 m8, m9, m10, m11, m12, m13
+ paddusw m8, m10
+ paddusw m9, m11
%ifidn %1, ssse3
- pabsw xmm10, xmm6
- pabsw xmm11, xmm7
- pabsw xmm15, xmm1
+ pabsw m10, m6
+ pabsw m11, m7
+ pabsw m15, m1
%else
- movdqa xmm10, xmm6
- movdqa xmm11, xmm7
- movdqa xmm15, xmm1
- ABS2 xmm10, xmm11, xmm13, xmm14
- ABS1 xmm15, xmm13
+ movdqa m10, m6
+ movdqa m11, m7
+ movdqa m15, m1
+ ABS2 m10, m11, m13, m14
+ ABS1 m15, m13
%endif
- paddusw xmm10, xmm11
- paddusw xmm8, xmm9
- paddusw xmm15, xmm10
- paddusw xmm15, xmm8
- movdqa xmm14, xmm15 ; 7x8 sum
-
- movdqa xmm8, [r1+0] ; left edge
- movd xmm9, edi
- psllw xmm8, 3
- psubw xmm8, xmm0
- psubw xmm9, xmm0
- ABS1 xmm8, xmm10
- ABS1 xmm9, xmm11 ; 1x8 sum
- paddusw xmm14, xmm8
- paddusw xmm15, xmm9
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
- punpcklwd xmm4, xmm5
- punpcklwd xmm6, xmm7
- punpckldq xmm0, xmm2
- punpckldq xmm4, xmm6
- punpcklqdq xmm0, xmm4 ; transpose
- movdqa xmm1, [r1+16] ; top edge
- movdqa xmm2, xmm15
- psllw xmm1, 3
- psrldq xmm2, 2 ; 8x7 sum
- psubw xmm0, xmm1 ; 8x1 sum
- ABS1 xmm0, xmm1
- paddusw xmm2, xmm0
+ paddusw m10, m11
+ paddusw m8, m9
+ paddusw m15, m10
+ paddusw m15, m8
+ movdqa m14, m15 ; 7x8 sum
+
+ movdqa m8, [r1+0] ; left edge
+ movd m9, edi
+ psllw m8, 3
+ psubw m8, m0
+ psubw m9, m0
+ ABS1 m8, m10
+ ABS1 m9, m11 ; 1x8 sum
+ paddusw m14, m8
+ paddusw m15, m9
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ punpckldq m0, m2
+ punpckldq m4, m6
+ punpcklqdq m0, m4 ; transpose
+ movdqa m1, [r1+16] ; top edge
+ movdqa m2, m15
+ psllw m1, 3
+ psrldq m2, 2 ; 8x7 sum
+ psubw m0, m1 ; 8x1 sum
+ ABS1 m0, m1
+ paddusw m2, m0
; 3x HADDW
- movdqa xmm7, [pw_1 GLOBAL]
- pmaddwd xmm2, xmm7
- pmaddwd xmm14, xmm7
- pmaddwd xmm15, xmm7
- movdqa xmm3, xmm2
- punpckldq xmm2, xmm14
- punpckhdq xmm3, xmm14
- pshufd xmm5, xmm15, 0xf5
- paddd xmm2, xmm3
- paddd xmm5, xmm15
- movdqa xmm3, xmm2
- punpcklqdq xmm2, xmm5
- punpckhqdq xmm3, xmm5
- pavgw xmm3, xmm2
- pxor xmm0, xmm0
- pavgw xmm3, xmm0
- movq [r2], xmm3 ; i8x8_v, i8x8_h
- psrldq xmm3, 8
- movd [r2+8], xmm3 ; i8x8_dc
+ movdqa m7, [pw_1 GLOBAL]
+ pmaddwd m2, m7
+ pmaddwd m14, m7
+ pmaddwd m15, m7
+ movdqa m3, m2
+ punpckldq m2, m14
+ punpckhdq m3, m14
+ pshufd m5, m15, 0xf5
+ paddd m2, m3
+ paddd m5, m15
+ movdqa m3, m2
+ punpcklqdq m2, m5
+ punpckhqdq m3, m5
+ pavgw m3, m2
+ pxor m0, m0
+ pavgw m3, m0
+ movq [r2], m3 ; i8x8_v, i8x8_h
+ psrldq m3, 8
+ movd [r2+8], m3 ; i8x8_dc
ret
%endif ; ARCH_X86_64
-%endmacro ; INTRA_SATDS
+%endmacro ; INTRA_SA8D_SSE2
; in: r0 = fenc
-; out: mm0..mm3 = hadamard coefs
+; out: m0..m3 = hadamard coefs
+INIT_MMX
ALIGN 16
load_hadamard:
- pxor mm7, mm7
- movd mm0, [r0+0*FENC_STRIDE]
- movd mm4, [r0+1*FENC_STRIDE]
- movd mm3, [r0+2*FENC_STRIDE]
- movd mm1, [r0+3*FENC_STRIDE]
- punpcklbw mm0, mm7
- punpcklbw mm4, mm7
- punpcklbw mm3, mm7
- punpcklbw mm1, mm7
- HADAMARD4_1D mm0, mm4, mm3, mm1
- TRANSPOSE4x4W mm0, mm4, mm3, mm1, mm2
- HADAMARD4_1D mm0, mm1, mm2, mm3
+ pxor m7, m7
+ movd m0, [r0+0*FENC_STRIDE]
+ movd m1, [r0+1*FENC_STRIDE]
+ movd m2, [r0+2*FENC_STRIDE]
+ movd m3, [r0+3*FENC_STRIDE]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ HADAMARD4_1D m0, m1, m2, m3
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ HADAMARD4_1D m0, m1, m2, m3
+ SAVE_MM_PERMUTATION load_hadamard
ret
%macro SCALAR_SUMSUB 4
mov qword [sums+8], 0
mov qword [sums+16], 0
%else
- pxor mm7, mm7
- movq [sums+0], mm7
- movq [sums+8], mm7
- movq [sums+16], mm7
+ pxor m7, m7
+ movq [sums+0], m7
+ movq [sums+8], m7
+ movq [sums+16], m7
%endif
%endmacro
-; in: mm1..mm3
-; out: mm7
-; clobber: mm4..mm6
+; in: m1..m3
+; out: m7
+; clobber: m4..m6
%macro SUM3x4 1
%ifidn %1, ssse3
- pabsw mm4, mm1
- pabsw mm5, mm2
- pabsw mm7, mm3
- paddw mm4, mm5
+ pabsw m4, m1
+ pabsw m5, m2
+ pabsw m7, m3
+ paddw m4, m5
%else
- movq mm4, mm1
- movq mm5, mm2
- ABS2 mm4, mm5, mm6, mm7
- movq mm7, mm3
- paddw mm4, mm5
- ABS1 mm7, mm6
+ movq m4, m1
+ movq m5, m2
+ ABS2 m4, m5, m6, m7
+ movq m7, m3
+ paddw m4, m5
+ ABS1 m7, m6
%endif
- paddw mm7, mm4
+ paddw m7, m4
%endmacro
-; in: mm0..mm3 (4x4), mm7 (3x4)
-; out: mm0 v, mm4 h, mm5 dc
-; clobber: mm6
+; in: m0..m3 (4x4), m7 (3x4)
+; out: m0 v, m4 h, m5 dc
+; clobber: m6
%macro SUM4x3 3 ; dc, left, top
- movq mm4, %2
- movd mm5, %1
- psllw mm4, 2
- psubw mm4, mm0
- psubw mm5, mm0
- punpcklwd mm0, mm1
- punpcklwd mm2, mm3
- punpckldq mm0, mm2 ; transpose
- movq mm1, %3
- psllw mm1, 2
- psubw mm0, mm1
- ABS2 mm4, mm5, mm2, mm3 ; 1x4 sum
- ABS1 mm0, mm1 ; 4x1 sum
+ movq m4, %2
+ movd m5, %1
+ psllw m4, 2
+ psubw m4, m0
+ psubw m5, m0
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpckldq m0, m2 ; transpose
+ movq m1, %3
+ psllw m1, 2
+ psubw m0, m1
+ ABS2 m4, m5, m2, m3 ; 1x4 sum
+ ABS1 m0, m1 ; 4x1 sum
%endmacro
%macro INTRA_SATDS_MMX 1
+INIT_MMX
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
SUM3x4 %1
SUM4x3 t0d, [left_1d], [top_1d]
- paddw mm4, mm7
- paddw mm5, mm7
- movq mm1, mm5
- psrlq mm1, 16 ; 4x3 sum
- paddw mm0, mm1
+ paddw m4, m7
+ paddw m5, m7
+ movq m1, m5
+ psrlq m1, 16 ; 4x3 sum
+ paddw m0, m1
- SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
+ SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
%ifndef ARCH_X86_64
mov r2, r2m
%endif
- movd [r2+0], mm0 ; i4x4_v satd
- movd [r2+4], mm4 ; i4x4_h satd
- movd [r2+8], mm5 ; i4x4_dc satd
+ movd [r2+0], m0 ; i4x4_v satd
+ movd [r2+4], m4 ; i4x4_h satd
+ movd [r2+8], m5 ; i4x4_dc satd
%ifndef ARCH_X86_64
ADD esp, 16
%endif
%assign stack_pad 88 + ((stack_offset+88+4)&15)
%endif
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
- SUB rsp, stack_pad
-%define sums rsp+64 ; size 24
-%define top_1d rsp+32 ; size 32
-%define left_1d rsp ; size 32
+ SUB rsp, stack_pad
+%define sums rsp+64 ; size 24
+%define top_1d rsp+32 ; size 32
+%define left_1d rsp ; size 32
movifnidn r1d, r1m
CLEAR_SUMS
SUM3x4 %1
SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+0] ; i16x16_v satd
- paddw mm4, [sums+8] ; i16x16_h satd
- paddw mm5, [sums+16] ; i16x16_dc satd
- movq [sums+0], mm0
- movq [sums+8], mm4
- movq [sums+16], mm5
+ pavgw m4, m7
+ pavgw m5, m7
+ paddw m0, [sums+0] ; i16x16_v satd
+ paddw m4, [sums+8] ; i16x16_h satd
+ paddw m5, [sums+16] ; i16x16_dc satd
+ movq [sums+0], m0
+ movq [sums+8], m4
+ movq [sums+16], m5
add r0, 4
inc r4d
; horizontal sum
movifnidn r2d, r2m
- movq mm2, [sums+16]
- movq mm1, [sums+8]
- movq mm0, [sums+0]
- movq mm7, mm2
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- psrld mm0, 1
- pslld mm7, 16
- psrld mm7, 16
- paddd mm0, mm2
- psubd mm0, mm7
- movd [r2+8], mm2 ; i16x16_dc satd
- movd [r2+4], mm1 ; i16x16_h satd
- movd [r2+0], mm0 ; i16x16_v satd
+ movq m2, [sums+16]
+ movq m1, [sums+8]
+ movq m0, [sums+0]
+ movq m7, m2
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ psrld m0, 1
+ pslld m7, 16
+ psrld m7, 16
+ paddd m0, m2
+ psubd m0, m7
+ movd [r2+8], m2 ; i16x16_dc satd
+ movd [r2+4], m1 ; i16x16_h satd
+ movd [r2+0], m0 ; i16x16_v satd
ADD rsp, stack_pad
RET
SUM3x4 %1
SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+16] ; i4x4_v satd
- paddw mm4, [sums+8] ; i4x4_h satd
- paddw mm5, [sums+0] ; i4x4_dc satd
- movq [sums+16], mm0
- movq [sums+8], mm4
- movq [sums+0], mm5
+ pavgw m4, m7
+ pavgw m5, m7
+ paddw m0, [sums+16] ; i4x4_v satd
+ paddw m4, [sums+8] ; i4x4_h satd
+ paddw m5, [sums+0] ; i4x4_dc satd
+ movq [sums+16], m0
+ movq [sums+8], m4
+ movq [sums+0], m5
add r0, 4
inc r4d
jl .loop_y
; horizontal sum
- movq mm0, [sums+0]
- movq mm1, [sums+8]
- movq mm2, [sums+16]
- movq mm7, mm0
- psrlq mm7, 15
- paddw mm2, mm7
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- psrld mm2, 1
- movd [r2+0], mm0 ; i8x8c_dc satd
- movd [r2+4], mm1 ; i8x8c_h satd
- movd [r2+8], mm2 ; i8x8c_v satd
+ movq m0, [sums+0]
+ movq m1, [sums+8]
+ movq m2, [sums+16]
+ movq m7, m0
+ psrlq m7, 15
+ paddw m2, m7
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ psrld m2, 1
+ movd [r2+0], m0 ; i8x8c_dc satd
+ movd [r2+4], m1 ; i8x8c_h satd
+ movd [r2+8], m2 ; i8x8c_v satd
ADD rsp, 72
RET
+%endmacro ; INTRA_SATDS_MMX
+
+
+%macro ABS_MOV_SSSE3 2
+ pabsw %1, %2
+%endmacro
+
+%macro ABS_MOV_MMX 2
+ pxor %1, %1
+ psubw %1, %2
+ pmaxsw %1, %2
%endmacro
+%define ABS_MOV ABS_MOV_MMX
+
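+; hadamard_ac: sums the absolute 4x4 hadamard coefficients of an 8x8 block and,
+; after a second hadamard stage across the four sub-blocks, the absolute 8x8
+; coefficients; the DC term(s) are masked out of both sums.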
+; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
+; out: [tmp]=hadamard4, m0=satd
+cglobal x264_hadamard_ac_4x4_mmxext
+ movh m0, [r0]
+ movh m1, [r0+r1]
+ movh m2, [r0+r1*2]
+ movh m3, [r0+r2]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ HADAMARD4_1D m0, m1, m2, m3
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ HADAMARD4_1D m0, m1, m2, m3
+ mova [r3], m0
+ mova [r3+8], m1
+ mova [r3+16], m2
+ mova [r3+24], m3
+ ABS1 m0, m4
+ ABS1 m1, m4
+ pand m0, m6
+ ABS1 m2, m4
+ ABS1 m3, m4
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
+ ret
+
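+; in: r3 points at the same row of each of the four stored 4x4 hadamards
+;     (spaced 0x20 apart); does a 4-point hadamard across the blocks and
+;     leaves the absolute values in m0..m3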
+cglobal x264_hadamard_ac_2x2_mmxext
+ mova m0, [r3+0x00]
+ mova m1, [r3+0x20]
+ mova m2, [r3+0x40]
+ mova m3, [r3+0x60]
+ HADAMARD4_1D m0, m1, m2, m3
+ ABS2 m0, m1, m4, m5
+ ABS2 m2, m3, m4, m5
+ SAVE_MM_PERMUTATION x264_hadamard_ac_2x2_mmxext
+ ret
+
+cglobal x264_hadamard_ac_8x8_mmxext
+ mova m6, [mask_ac4 GLOBAL]
+ pxor m7, m7
+ call x264_hadamard_ac_4x4_mmxext
+ add r0, 4
+ add r3, 32
+ mova m5, m0
+ call x264_hadamard_ac_4x4_mmxext
+ lea r0, [r0+4*r1]
+ add r3, 64
+ paddw m5, m0
+ call x264_hadamard_ac_4x4_mmxext
+ sub r0, 4
+ sub r3, 32
+ paddw m5, m0
+ call x264_hadamard_ac_4x4_mmxext
+ paddw m5, m0
+ sub r3, 64
+ mova [rsp+gprsize+8], m5 ; save satd
+ call x264_hadamard_ac_2x2_mmxext
+ add r3, 8
+ pand m6, m0
+ mova m7, m1
+ paddw m6, m2
+ paddw m7, m3
+%rep 2
+ call x264_hadamard_ac_2x2_mmxext
+ add r3, 8
+ paddw m6, m0
+ paddw m7, m1
+ paddw m6, m2
+ paddw m7, m3
+%endrep
+ call x264_hadamard_ac_2x2_mmxext
+ sub r3, 24
+ paddw m6, m0
+ paddw m7, m1
+ paddw m6, m2
+ paddw m7, m3
+ paddw m6, m7
+ mova [rsp+gprsize], m6 ; save sa8d
+ SWAP m0, m6
+ SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
+ ret
+
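+; each 8x8 call stores its two sums at [rsp] (8x8 sum) and [rsp+8] (4x4 satd);
+; the wrapper drops rsp by 16 between calls so the per-block results stack up
+; and can be combined at the end.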
+%macro HADAMARD_AC_WXH_MMX 2
+cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
+ %assign pad 16-gprsize-(stack_offset&15)
+ %define ysub r1
+ sub rsp, 16+128+pad
+ lea r2, [r1*3]
+ lea r3, [rsp+16]
+ call x264_hadamard_ac_8x8_mmxext
+%if %2==16
+ %define ysub r2
+ lea r0, [r0+r1*4]
+ sub rsp, 16
+ call x264_hadamard_ac_8x8_mmxext
+%endif
+%if %1==16
+ neg ysub
+ sub rsp, 16
+ lea r0, [r0+ysub*4+8]
+ neg ysub
+ call x264_hadamard_ac_8x8_mmxext
+%if %2==16
+ lea r0, [r0+r1*4]
+ sub rsp, 16
+ call x264_hadamard_ac_8x8_mmxext
+%endif
+%endif
+ mova m1, [rsp+0x08]
+%if %1*%2 >= 128
+ paddusw m0, [rsp+0x10]
+ paddusw m1, [rsp+0x18]
+%endif
+%if %1*%2 == 256
+ mova m2, [rsp+0x20]
+ paddusw m1, [rsp+0x28]
+ paddusw m2, [rsp+0x30]
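+    ; adding four blocks' sums could overflow 16-bit words, so average the two
+    ; halves instead: (m0^m2) & pw_1 undoes pavgw's round-up, giving an exact
+    ; (a+b)/2 that takes the place of the psrlw 1 used for the smaller sizes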
+ mova m3, m0
+ paddusw m1, [rsp+0x38]
+ pxor m3, m2
+ pand m3, [pw_1 GLOBAL]
+ pavgw m0, m2
+ psubusw m0, m3
+ HADDUW m0, m2
+%else
+ psrlw m0, 1
+ HADDW m0, m2
+%endif
+ psrlw m1, 1
+ HADDW m1, m3
+ movd edx, m0
+ movd eax, m1
+ shr edx, 1
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
+ add rsp, 128+%1*%2/4+pad
+ RET
+%endmacro ; HADAMARD_AC_WXH_MMX
+
+HADAMARD_AC_WXH_MMX 16, 16
+HADAMARD_AC_WXH_MMX 8, 16
+HADAMARD_AC_WXH_MMX 16, 8
+HADAMARD_AC_WXH_MMX 8, 8
+
+%macro HADAMARD_AC_SSE2 1
+INIT_XMM
+; in: r0=pix, r1=stride, r2=stride*3
+; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
+cglobal x264_hadamard_ac_8x8_%1
+%ifdef ARCH_X86_64
+ %define spill0 m8
+ %define spill1 m9
+ %define spill2 m10
+%else
+ %define spill0 [rsp+gprsize]
+ %define spill1 [rsp+gprsize+16]
+ %define spill2 [rsp+gprsize+32]
+%endif
+ pxor m7, m7
+ movh m0, [r0]
+ movh m1, [r0+r1]
+ movh m2, [r0+r1*2]
+ movh m3, [r0+r2]
+ lea r0, [r0+r1*4]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ HADAMARD4_1D m0, m1, m2, m3
+ mova spill0, m3
+ SWAP m3, m7
+ movh m4, [r0]
+ movh m5, [r0+r1]
+ movh m6, [r0+r1*2]
+ movh m7, [r0+r2]
+ punpcklbw m4, m3
+ punpcklbw m5, m3
+ punpcklbw m6, m3
+ punpcklbw m7, m3
+ HADAMARD4_1D m4, m5, m6, m7
+ mova m3, spill0
+%ifdef ARCH_X86_64
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
+%else
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,spill0,spill1
+%endif
+ HADAMARD4_1D m0, m1, m2, m3
+ HADAMARD4_1D m4, m5, m6, m7
+ mova spill0, m1
+ mova spill1, m2
+ mova spill2, m3
+ ABS_MOV m1, m0
+ ABS_MOV m2, m4
+ ABS_MOV m3, m5
+ paddw m1, m2
+ SUMSUB_BA m0, m4
+ pand m1, [mask_ac4 GLOBAL]
+ ABS_MOV m2, spill0
+ paddw m1, m3
+ ABS_MOV m3, spill1
+ paddw m1, m2
+ ABS_MOV m2, spill2
+ paddw m1, m3
+ ABS_MOV m3, m6
+ paddw m1, m2
+ ABS_MOV m2, m7
+ paddw m1, m3
+ mova m3, m7
+ paddw m1, m2
+ mova m2, m6
+ psubw m7, spill2
+ paddw m3, spill2
+ mova [rsp+gprsize+32], m1 ; save satd
+ mova m1, m5
+ psubw m6, spill1
+ paddw m2, spill1
+ psubw m5, spill0
+ paddw m1, spill0
+ mova spill1, m7
+ SBUTTERFLY qdq, 0, 4, 7
+ SBUTTERFLY qdq, 1, 5, 7
+ SBUTTERFLY qdq, 2, 6, 7
+ SUMSUB_BADC m0, m4, m1, m5
+ SUMSUB_BA m2, m6
+ ABS1 m0, m7
+ ABS1 m1, m7
+ pand m0, [mask_ac8 GLOBAL]
+ ABS1 m2, m7
+ ABS1 m4, m7
+ ABS1 m5, m7
+ ABS1 m6, m7
+ mova m7, spill1
+ paddw m0, m4
+ SBUTTERFLY qdq, 3, 7, 4
+ SUMSUB_BA m3, m7
+ paddw m1, m5
+ ABS1 m3, m4
+ ABS1 m7, m4
+ paddw m2, m6
+ paddw m3, m7
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ mova [rsp+gprsize+16], m0 ; save sa8d
+ SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
+ ret
+
+HADAMARD_AC_WXH_SSE2 16, 16, %1
+HADAMARD_AC_WXH_SSE2 8, 16, %1
+HADAMARD_AC_WXH_SSE2 16, 8, %1
+HADAMARD_AC_WXH_SSE2 8, 8, %1
+%endmacro ; HADAMARD_AC_SSE2
+
+; struct { int satd, sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
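+; (satd is returned in the low 32 bits and sa8d in the high 32 bits of the
+;  result; on x86_32 they come back in eax/edx respectively)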
+%macro HADAMARD_AC_WXH_SSE2 3
+cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3
+ %assign pad 16-gprsize-(stack_offset&15)
+ %define ysub r1
+ sub rsp, 48+pad
+ lea r2, [r1*3]
+ call x264_hadamard_ac_8x8_%3
+%if %2==16
+ %define ysub r2
+ lea r0, [r0+r1*4]
+ sub rsp, 32
+ call x264_hadamard_ac_8x8_%3
+%endif
+%if %1==16
+ neg ysub
+ sub rsp, 32
+ lea r0, [r0+ysub*4+8]
+ neg ysub
+ call x264_hadamard_ac_8x8_%3
+%if %2==16
+ lea r0, [r0+r1*4]
+ sub rsp, 32
+ call x264_hadamard_ac_8x8_%3
+%endif
+%endif
+ mova m1, [rsp+0x20]
+%if %1*%2 >= 128
+ paddusw m0, [rsp+0x30]
+ paddusw m1, [rsp+0x40]
+%endif
+%if %1*%2 == 256
+ paddusw m0, [rsp+0x50]
+ paddusw m1, [rsp+0x60]
+ paddusw m0, [rsp+0x70]
+ paddusw m1, [rsp+0x80]
+ psrlw m0, 1
+%endif
+ HADDW m0, m2
+ HADDW m1, m3
+ movd edx, m0
+ movd eax, m1
+ shr edx, 2 - (%1*%2 >> 8)
+ shr eax, 1
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
+ add rsp, 16+%1*%2/2+pad
+ RET
+%endmacro ; HADAMARD_AC_WXH_SSE2
+
; instantiate satds
-; FIXME width4 can benefit from pabsw even if not sse2
-cextern x264_pixel_sa8d_8x8_mmxext
+%ifndef ARCH_X86_64
+cextern x264_pixel_sa8d_8x8_internal_mmxext
SA8D_16x16_32 mmxext
+%endif
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
SA8D_16x16_32 sse2
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
-%ifdef HAVE_SSE3
+HADAMARD_AC_SSE2 sse2
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
+%define ABS_MOV ABS_MOV_SSSE3
+SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
SATDS_SSE2 ssse3
SA8D_16x16_32 ssse3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
-%endif
+HADAMARD_AC_SSE2 ssse3
+SATDS_SSE2 ssse3_phadd
; const uint8_t *pix2, int stride2, int sums[2][4] )
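; for two adjacent 4x4 windows, accumulates s1 = sum(pix1), s2 = sum(pix2),
; ss = sum(pix1*pix1 + pix2*pix2), s12 = sum(pix1*pix2), and stores
; {s1,s2,ss,s12} for each window into sums[2][4]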
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm2, xmm2
- pxor xmm3, xmm3
- pxor xmm4, xmm4
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ pxor m4, m4
%rep 4
- movq xmm5, [r0]
- movq xmm6, [r2]
- punpcklbw xmm5, xmm0
- punpcklbw xmm6, xmm0
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- movdqa xmm7, xmm5
- pmaddwd xmm5, xmm5
- pmaddwd xmm7, xmm6
- pmaddwd xmm6, xmm6
- paddd xmm3, xmm5
- paddd xmm4, xmm7
- paddd xmm3, xmm6
+ movq m5, [r0]
+ movq m6, [r2]
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ paddw m1, m5
+ paddw m2, m6
+ movdqa m7, m5
+ pmaddwd m5, m5
+ pmaddwd m7, m6
+ pmaddwd m6, m6
+ paddd m3, m5
+ paddd m4, m7
+ paddd m3, m6
add r0, r1
add r2, r3
%endrep
- ; PHADDW xmm1, xmm2
- ; PHADDD xmm3, xmm4
- picgetgot eax
- movdqa xmm7, [pw_1 GLOBAL]
- pshufd xmm5, xmm3, 0xb1
- pmaddwd xmm1, xmm7
- pmaddwd xmm2, xmm7
- pshufd xmm6, xmm4, 0xb1
- packssdw xmm1, xmm2
- paddd xmm3, xmm5
- pshufd xmm1, xmm1, 0xd8
- paddd xmm4, xmm6
- pmaddwd xmm1, xmm7
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
- punpckhdq xmm5, xmm4
+ ; PHADDW m1, m2
+ ; PHADDD m3, m4
+ movdqa m7, [pw_1 GLOBAL]
+ pshufd m5, m3, 0xb1
+ pmaddwd m1, m7
+ pmaddwd m2, m7
+ pshufd m6, m4, 0xb1
+ packssdw m1, m2
+ paddd m3, m5
+ pshufd m1, m1, 0xd8
+ paddd m4, m6
+ pmaddwd m1, m7
+ movdqa m5, m3
+ punpckldq m3, m4
+ punpckhdq m5, m4
%ifdef ARCH_X86_64
%define t0 r4
%define t0 eax
mov t0, r4m
%endif
-%ifnidn r4d, r4m
- mov t0, r4m
-%endif
-
- movq [t0+ 0], xmm1
- movq [t0+ 8], xmm3
- psrldq xmm1, 8
- movq [t0+16], xmm1
- movq [t0+24], xmm5
+
+ movq [t0+ 0], m1
+ movq [t0+ 8], m3
+ psrldq m1, 8
+ movq [t0+16], m1
+ movq [t0+24], m5
RET
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
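; per window (following the per-line comments below):
;   vars  = 64*ss  - (s1*s1 + s2*s2)
;   covar = 64*s12 -  s1*s2
;   ssim  = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
;         / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
; the return value is the sum of ssim over the first r2 (width) windows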
cglobal x264_pixel_ssim_end4_sse2, 3,3
- movdqa xmm0, [r0+ 0]
- movdqa xmm1, [r0+16]
- movdqa xmm2, [r0+32]
- movdqa xmm3, [r0+48]
- movdqa xmm4, [r0+64]
- paddd xmm0, [r1+ 0]
- paddd xmm1, [r1+16]
- paddd xmm2, [r1+32]
- paddd xmm3, [r1+48]
- paddd xmm4, [r1+64]
- paddd xmm0, xmm1
- paddd xmm1, xmm2
- paddd xmm2, xmm3
- paddd xmm3, xmm4
- picgetgot r1
- movdqa xmm5, [ssim_c1 GLOBAL]
- movdqa xmm6, [ssim_c2 GLOBAL]
- TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
-
-; s1=mm0, s2=mm3, ss=mm4, s12=mm2
- movdqa xmm1, xmm3
- pslld xmm3, 16
- pmaddwd xmm1, xmm0 ; s1*s2
- por xmm0, xmm3
- pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
- pslld xmm1, 1
- pslld xmm2, 7
- pslld xmm4, 6
- psubd xmm2, xmm1 ; covar*2
- psubd xmm4, xmm0 ; vars
- paddd xmm0, xmm5
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm4, xmm6
- cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
- cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
- cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
- cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
- mulps xmm1, xmm2
- mulps xmm0, xmm4
- divps xmm1, xmm0 ; ssim
-
- cmp r2d, 4
+ movdqa m0, [r0+ 0]
+ movdqa m1, [r0+16]
+ movdqa m2, [r0+32]
+ movdqa m3, [r0+48]
+ movdqa m4, [r0+64]
+ paddd m0, [r1+ 0]
+ paddd m1, [r1+16]
+ paddd m2, [r1+32]
+ paddd m3, [r1+48]
+ paddd m4, [r1+64]
+ paddd m0, m1
+ paddd m1, m2
+ paddd m2, m3
+ paddd m3, m4
+ movdqa m5, [ssim_c1 GLOBAL]
+ movdqa m6, [ssim_c2 GLOBAL]
+ TRANSPOSE4x4D 0, 1, 2, 3, 4
+
+; s1=m0, s2=m1, ss=m2, s12=m3
+ movdqa m4, m1
+ pslld m1, 16
+ pmaddwd m4, m0 ; s1*s2
+ por m0, m1
+ pmaddwd m0, m0 ; s1*s1 + s2*s2
+ pslld m4, 1
+ pslld m3, 7
+ pslld m2, 6
+ psubd m3, m4 ; covar*2
+ psubd m2, m0 ; vars
+ paddd m0, m5
+ paddd m4, m5
+ paddd m3, m6
+ paddd m2, m6
+ cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
+ cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
+ cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
+ cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
+ mulps m4, m3
+ mulps m0, m2
+ divps m4, m0 ; ssim
+
+ cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
- neg r2
-%ifdef PIC64
- lea r3, [mask_ff + 16 GLOBAL]
- movdqu xmm3, [r3 + r2*4]
+ neg r2
+%ifdef PIC
+ lea r3, [mask_ff + 16 GLOBAL]
+ movdqu m1, [r3 + r2*4]
%else
- movdqu xmm3, [mask_ff + r2*4 + 16 GLOBAL]
+ movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
%endif
- pand xmm1, xmm3
+ pand m4, m1
.skip:
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- pshuflw xmm1, xmm0, 0xE
- addss xmm0, xmm1
+ movhlps m0, m4
+ addps m0, m4
+ pshuflw m4, m0, 0xE
+ addss m0, m4
%ifndef ARCH_X86_64
- movd r0m, xmm0
- fld dword r0m
+ movd r0m, m0
+ fld dword r0m
%endif
RET
; Successive Elimination ADS
;=============================================================================
-%macro ADS_START 1 ; unroll_size
+%macro ADS_START 1 ; unroll_size
%ifdef ARCH_X86_64
%define t0 r6
mov r10, rsp
and rsp, ~15
mov t0, rsp
shl r2d, 1
-%endmacro
+%endmacro
%macro ADS_END 1
add r1, 8*%1
%endmacro
ADS_SSE2 sse2
-%ifdef HAVE_SSE3
%define ABS1 ABS1_SSSE3
ADS_SSE2 ssse3
-%endif
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {