;*****************************************************************************
;* pixel.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt
;*          Laurent Aimar
;*          Alex Izvorski
;*          Fiona Glaser
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

pw_1:     times 8 dw 1
ssim_c1:  times 4 dd 416    ; .01*.01*255*255*64
ssim_c2:  times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff:  times 16 db 0xff
          times 16 db 0
mask_ac4: dw 0,-1,-1,-1, 0,-1,-1,-1
mask_ac8: dw 0,-1,-1,-1,-1,-1,-1,-1

SECTION .text

%macro HADDD 2 ; sum junk
%if mmsize == 16
    movhlps %2, %1
    paddd   %1, %2
    pshuflw %2, %1, 0xE
    paddd   %1, %2
%else
    mova    %2, %1
    psrlq   %2, 32
    paddd   %1, %2
%endif
%endmacro

%macro HADDW 2
    pmaddwd %1, [pw_1 GLOBAL]
    HADDD   %1, %2
%endmacro

%macro HADDUW 2
    mova  %2, %1
    pslld %1, 16
    psrld %2, 16
    psrld %1, 16
    paddd %1, %2
    HADDD %1, %2
%endmacro

;=============================================================================
; SSD
;=============================================================================

%macro SSD_FULL 6
    mova      m1, [r0+%1]
    mova      m2, [r2+%2]
    mova      m3, [r0+%3]
    mova      m4, [r2+%4]
    mova      m5, m2
    mova      m6, m4
    psubusb   m2, m1
    psubusb   m4, m3
    psubusb   m1, m5
    psubusb   m3, m6
    por       m1, m2
    por       m3, m4
    mova      m2, m1
    mova      m4, m3
    punpcklbw m1, m7
    punpcklbw m3, m7
    punpckhbw m2, m7
    punpckhbw m4, m7
    pmaddwd   m1, m1
    pmaddwd   m2, m2
    pmaddwd   m3, m3
    pmaddwd   m4, m4
%if %6
    lea       r0, [r0+2*r1]
    lea       r2, [r2+2*r3]
%endif
    paddd     m1, m2
    paddd     m3, m4
%if %5
    paddd     m0, m1
%else
    SWAP      m0, m1
%endif
    paddd     m0, m3
%endmacro

%macro SSD_HALF 6
    movh      m1, [r0+%1]
    movh      m2, [r2+%2]
    movh      m3, [r0+%3]
    movh      m4, [r2+%4]
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    psubw     m1, m2
    psubw     m3, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3
%if %6
    lea       r0, [r0+2*r1]
    lea       r2, [r2+2*r3]
%endif
%if %5
    paddd     m0, m1
%else
    SWAP      m0, m1
%endif
    paddd     m0, m3
%endmacro

%macro SSD_QUARTER 6
    movd      m1, [r0+%1]
    movd      m2, [r2+%2]
    movd      m3, [r0+%3]
    movd      m4, [r2+%4]
    lea       r0, [r0+2*r1]
    lea       r2, [r2+2*r3]
    pinsrd    m1, [r0+%1], 1
    pinsrd    m2, [r2+%2], 1
    pinsrd    m3, [r0+%3], 1
    pinsrd    m4, [r2+%4], 1
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    psubw     m1, m2
    psubw     m3, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3
%if %6
    lea       r0, [r0+2*r1]
    lea       r2, [r2+2*r3]
%endif
%if %5
    paddd     m0, m1
%else
    SWAP      m0, m1
%endif
    paddd     m0, m3
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 3
cglobal x264_pixel_ssd_%1x%2_%3, 4,4
%if %1 >= mmsize
    pxor m7, m7
%endif
%assign i 0
%rep %2/2
%if %1 > mmsize
    SSD_FULL 0,  0, mmsize, mmsize, i, 0
    SSD_FULL r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/2-1
%elif %1 == mmsize
    SSD_FULL 0, 0, r1, r3, i, i<%2/2-1
%else
    SSD_HALF 0, 0, r1, r3, i, i<%2/2-1
%endif
%assign i i+1
%endrep
    HADDD m0, m1
    movd  eax, m0
    RET
%endmacro

INIT_MMX
SSD 16, 16, mmx
SSD 16,  8, mmx
SSD  8, 16, mmx
SSD  8,  8, mmx
SSD  8,  4, mmx
SSD  4,  8, mmx
SSD  4,  4, mmx
INIT_XMM
SSD 16, 16, sse2
SSD 16,  8, sse2
SSD  8, 16, sse2
SSD  8,  8, sse2
SSD  8,  4, sse2

cglobal x264_pixel_ssd_4x8_sse4, 4,4
    SSD_QUARTER 0, 0, r1, r3, 0, 1
    SSD_QUARTER 0, 0, r1, r3, 1, 0
    HADDD m0, m1
    movd  eax, m0
    RET

cglobal x264_pixel_ssd_4x4_sse4, 4,4
    SSD_QUARTER 0, 0, r1, r3, 0, 0
    HADDD m0, m1
    movd  eax, m0
    RET

;=============================================================================
; variance
;=============================================================================

%macro VAR_START 0
    pxor m5, m5 ; sum
    pxor m6, m6 ; sum squared
    pxor m7, m7 ; zero
%ifdef ARCH_X86_64
    %define t3 r3
%else
    %define t3 r2
%endif
%endmacro

%macro VAR_END 1
    HADDW m5, m7
    movd  r1d, m5
    imul  r1d, r1d
    HADDD m6, m1
    shr   r1d, %1
    movd  eax, m6
    sub   eax, r1d ; sqr - (sum * sum >> shift)
    RET
%endmacro

%macro VAR_2ROW 2
    mov t3d, %2
.loop:
    mova      m0, [r0]
    mova      m1, m0
    mova      m3, [r0+%1]
    mova      m4, m3
    punpcklbw m0, m7
    punpckhbw m1, m7
%ifidn %1, r1
    lea       r0, [r0+%1*2]
%else
    add       r0, r1
%endif
    punpcklbw m3, m7
    punpckhbw m4, m7
    paddw     m5, m0
    dec t3d
    pmaddwd   m0, m0
    paddw     m5, m1
    pmaddwd   m1, m1
    paddw     m5, m3
    paddd     m6, m0
    pmaddwd   m3, m3
    paddw     m5, m4
    paddd     m6, m1
    pmaddwd   m4, m4
    paddd     m6, m3
    paddd     m6, m4
    jg .loop
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_pixel_var_16x16_mmxext, 2,3
    VAR_START
    VAR_2ROW 8, 16
    VAR_END 8

cglobal x264_pixel_var_8x8_mmxext, 2,3
    VAR_START
    VAR_2ROW r1, 4
    VAR_END 6

INIT_XMM
cglobal x264_pixel_var_16x16_sse2, 2,3
    VAR_START
    VAR_2ROW r1, 8
    VAR_END 8

cglobal x264_pixel_var_8x8_sse2, 2,3
    VAR_START
    mov t3d, 4
.loop:
    movh      m0, [r0]
    movhps    m0, [r0+r1]
    lea       r0, [r0+r1*2]
    mova      m1, m0
    punpcklbw m0, m7
    punpckhbw m1, m7
    dec t3d
    paddw     m5, m0
    paddw     m5, m1
    pmaddwd   m0, m0
    pmaddwd   m1, m1
    paddd     m6, m0
    paddd     m6, m1
    jnz .loop
    VAR_END 6

;=============================================================================
; SATD
;=============================================================================

; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
; whereas phaddw-based transform doesn't care what order the coefs end up in.
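
; For orientation, a plain-C sketch of the cost that the 4x4 SATD kernels
; below compute: Hadamard-transform the pixel-difference block along rows and
; columns, then sum the absolute transformed coefficients and halve the total
; (the final >>1 corresponds to the pavgw/psrlw halving done in the asm).
; This is an illustrative reference only; the helper names and layout here
; are not taken from x264's C code.
;
; static void hadamard4( int d[4] )
; {
;     int s01 = d[0]+d[1], d01 = d[0]-d[1];
;     int s23 = d[2]+d[3], d23 = d[2]-d[3];
;     d[0] = s01+s23; d[1] = s01-s23;
;     d[2] = d01+d23; d[3] = d01-d23;
; }
;
; static int satd_4x4_ref( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
; {
;     int d[4][4], t[4], sum = 0, i, j;
;     for( i = 0; i < 4; i++, pix1 += stride1, pix2 += stride2 )
;         for( j = 0; j < 4; j++ )
;             d[i][j] = pix1[j] - pix2[j];
;     for( i = 0; i < 4; i++ )          /* horizontal transform */
;         hadamard4( d[i] );
;     for( j = 0; j < 4; j++ )          /* vertical transform + abs sum */
;     {
;         for( i = 0; i < 4; i++ ) t[i] = d[i][j];
;         hadamard4( t );
;         for( i = 0; i < 4; i++ ) sum += abs( t[i] );
;     }
;     return sum >> 1;
; }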
%macro PHSUMSUB 3 movdqa m%3, m%1 phaddw m%1, m%2 phsubw m%3, m%2 SWAP %2, %3 %endmacro %macro HADAMARD4_ROW_PHADD 5 PHSUMSUB %1, %2, %5 PHSUMSUB %3, %4, %5 PHSUMSUB %1, %3, %5 PHSUMSUB %2, %4, %5 SWAP %3, %4 %endmacro %macro HADAMARD4_1D 4 SUMSUB_BADC %1, %2, %3, %4 SUMSUB_BADC %1, %3, %2, %4 %endmacro %macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block) %xdefine %%n n%1 HADAMARD4_1D m4, m5, m6, m7 TRANSPOSE4x4W 4, 5, 6, 7, %%n HADAMARD4_1D m4, m5, m6, m7 ABS2 m4, m5, m3, m %+ %%n ABS2 m6, m7, m3, m %+ %%n paddw m6, m4 paddw m7, m5 pavgw m6, m7 SWAP %%n, 6 %endmacro ; in: r4=3*stride1, r5=3*stride2 ; in: %2 = horizontal offset ; in: %3 = whether we need to increment pix1 and pix2 ; clobber: m3..m7 ; out: %1 = satd %macro SATD_4x4_MMX 3 LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2] LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2] LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2] LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2] %if %3 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif HADAMARD4x4_SUM %1 %endmacro %macro SATD_8x4_SSE2 1 HADAMARD4_1D m0, m1, m2, m3 %ifidn %1, ssse3_phadd HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4 %else TRANSPOSE2x4x4W 0, 1, 2, 3, 4 HADAMARD4_1D m0, m1, m2, m3 %endif ABS4 m0, m1, m2, m3, m4, m5 paddusw m0, m1 paddusw m2, m3 paddusw m6, m0 paddusw m6, m2 %endmacro %macro SATD_START_MMX 0 lea r4, [3*r1] ; 3*stride1 lea r5, [3*r3] ; 3*stride2 %endmacro %macro SATD_END_MMX 0 pshufw m1, m0, 01001110b paddw m0, m1 pshufw m1, m0, 10110001b paddw m0, m1 movd eax, m0 and eax, 0xffff RET %endmacro ; FIXME avoid the spilling of regs to hold 3*stride. ; for small blocks on x86_32, modify pixel pointer instead. ;----------------------------------------------------------------------------- ; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- INIT_MMX cglobal x264_pixel_satd_16x4_internal_mmxext SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 0 paddw m0, m2 SATD_4x4_MMX m2, 8, 0 paddw m0, m1 SATD_4x4_MMX m1, 12, 0 paddw m0, m2 paddw m0, m1 ret cglobal x264_pixel_satd_8x8_internal_mmxext SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 1 paddw m0, m2 paddw m0, m1 x264_pixel_satd_8x4_internal_mmxext: SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 0 paddw m0, m2 paddw m0, m1 ret cglobal x264_pixel_satd_16x16_mmxext, 4,6 SATD_START_MMX pxor m0, m0 %rep 3 call x264_pixel_satd_16x4_internal_mmxext lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endrep call x264_pixel_satd_16x4_internal_mmxext HADDUW m0, m1 movd eax, m0 RET cglobal x264_pixel_satd_16x8_mmxext, 4,6 SATD_START_MMX pxor m0, m0 call x264_pixel_satd_16x4_internal_mmxext lea r0, [r0+4*r1] lea r2, [r2+4*r3] call x264_pixel_satd_16x4_internal_mmxext SATD_END_MMX cglobal x264_pixel_satd_8x16_mmxext, 4,6 SATD_START_MMX pxor m0, m0 call x264_pixel_satd_8x8_internal_mmxext lea r0, [r0+4*r1] lea r2, [r2+4*r3] call x264_pixel_satd_8x8_internal_mmxext SATD_END_MMX cglobal x264_pixel_satd_8x8_mmxext, 4,6 SATD_START_MMX pxor m0, m0 call x264_pixel_satd_8x8_internal_mmxext SATD_END_MMX cglobal x264_pixel_satd_8x4_mmxext, 4,6 SATD_START_MMX pxor m0, m0 call x264_pixel_satd_8x4_internal_mmxext SATD_END_MMX cglobal x264_pixel_satd_4x8_mmxext, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 1 SATD_4x4_MMX m1, 0, 0 paddw m0, m1 SATD_END_MMX %macro SATD_W4 1 INIT_MMX cglobal x264_pixel_satd_4x4_%1, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 0 SATD_END_MMX %endmacro SATD_W4 mmxext %macro SATD_START_SSE2 0 pxor m6, m6 lea r4, [3*r1] lea r5, [3*r3] %endmacro %macro SATD_END_SSE2 0 psrlw m6, 1 HADDW m6, m7 
movd eax, m6 RET %endmacro %macro BACKUP_POINTERS 0 %ifdef ARCH_X86_64 mov r10, r0 mov r11, r2 %endif %endmacro %macro RESTORE_AND_INC_POINTERS 0 %ifdef ARCH_X86_64 lea r0, [r10+8] lea r2, [r11+8] %else mov r0, r0m mov r2, r2m add r0, 8 add r2, 8 %endif %endmacro ;----------------------------------------------------------------------------- ; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- %macro SATDS_SSE2 1 INIT_XMM cglobal x264_pixel_satd_8x8_internal_%1 LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5 SATD_8x4_SSE2 %1 lea r0, [r0+4*r1] lea r2, [r2+4*r3] x264_pixel_satd_8x4_internal_%1: LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5 x264_pixel_satd_4x8_internal_%1: SAVE_MM_PERMUTATION satd_4x8_internal SATD_8x4_SSE2 %1 ret cglobal x264_pixel_satd_16x16_%1, 4,6 SATD_START_SSE2 BACKUP_POINTERS call x264_pixel_satd_8x8_internal_%1 lea r0, [r0+4*r1] lea r2, [r2+4*r3] call x264_pixel_satd_8x8_internal_%1 RESTORE_AND_INC_POINTERS call x264_pixel_satd_8x8_internal_%1 lea r0, [r0+4*r1] lea r2, [r2+4*r3] call x264_pixel_satd_8x8_internal_%1 SATD_END_SSE2 cglobal x264_pixel_satd_16x8_%1, 4,6 SATD_START_SSE2 BACKUP_POINTERS call x264_pixel_satd_8x8_internal_%1 RESTORE_AND_INC_POINTERS call x264_pixel_satd_8x8_internal_%1 SATD_END_SSE2 cglobal x264_pixel_satd_8x16_%1, 4,6 SATD_START_SSE2 call x264_pixel_satd_8x8_internal_%1 lea r0, [r0+4*r1] lea r2, [r2+4*r3] call x264_pixel_satd_8x8_internal_%1 SATD_END_SSE2 cglobal x264_pixel_satd_8x8_%1, 4,6 SATD_START_SSE2 call x264_pixel_satd_8x8_internal_%1 SATD_END_SSE2 cglobal x264_pixel_satd_8x4_%1, 4,6 SATD_START_SSE2 call x264_pixel_satd_8x4_internal_%1 SATD_END_SSE2 cglobal x264_pixel_satd_4x8_%1, 4,6 INIT_XMM LOAD_MM_PERMUTATION satd_4x8_internal %define movh movd SATD_START_SSE2 LOAD_DIFF m0, m7, m6, [r0], [r2] LOAD_DIFF m1, m7, m6, [r0+r1], [r2+r3] LOAD_DIFF m2, m7, m6, [r0+2*r1], [r2+2*r3] LOAD_DIFF m3, m7, m6, [r0+r4], [r2+r5] lea r0, [r0+4*r1] lea r2, [r2+4*r3] LOAD_DIFF m4, m7, m6, [r0], [r2] LOAD_DIFF m5, m7, m6, [r0+r1], [r2+r3] punpcklqdq m0, m4 punpcklqdq m1, m5 LOAD_DIFF m4, m7, m6, [r0+2*r1], [r2+2*r3] LOAD_DIFF m5, m7, m6, [r0+r4], [r2+r5] punpcklqdq m2, m4 punpcklqdq m3, m5 %define movh movq call x264_pixel_satd_4x8_internal_%1 SATD_END_SSE2 %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- ; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- cglobal x264_pixel_sa8d_8x8_internal_%1 lea r10, [r0+4*r1] lea r11, [r2+4*r3] LOAD_DIFF_8x4P m0, m1, m2, m3, m8, m9, r0, r2 LOAD_DIFF_8x4P m4, m5, m6, m7, m8, m9, r10, r11 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 ABS4 m0, m1, m2, m3, m8, m9 ABS4 m4, m5, m6, m7, m8, m9 paddusw m0, m1 paddusw m2, m3 paddusw m4, m5 paddusw m6, m7 paddusw m0, m2 paddusw m4, m6 pavgw m0, m4 ret cglobal x264_pixel_sa8d_8x8_%1, 4,6 lea r4, [3*r1] lea r5, [3*r3] call x264_pixel_sa8d_8x8_internal_%1 HADDW m0, m1 movd eax, m0 add eax, 1 shr eax, 1 ret cglobal x264_pixel_sa8d_16x16_%1, 4,6 lea r4, [3*r1] lea r5, [3*r3] call x264_pixel_sa8d_8x8_internal_%1 ; pix[0] add r0, 8 add r2, 8 mova m10, m0 call x264_pixel_sa8d_8x8_internal_%1 ; pix[8] lea r0, [r0+8*r1] lea r2, [r2+8*r3] paddusw m10, m0 call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8] sub r0, 8 sub r2, 8 paddusw m10, m0 call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride] 
paddusw m0, m10 HADDUW m0, m1 movd eax, m0 add eax, 1 shr eax, 1 ret %else ; ARCH_X86_32 cglobal x264_pixel_sa8d_8x8_internal_%1 LOAD_DIFF_8x4P m0, m1, m2, m3, m6, m7 movdqa [esp+4], m2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] LOAD_DIFF_8x4P m4, m5, m6, m7, m2, m2 movdqa m2, [esp+4] HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [esp+4], [esp+20] HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 %ifidn %1, sse2 movdqa [esp+4], m4 movdqa [esp+20], m2 %endif ABS2 m6, m3, m4, m2 ABS2 m0, m7, m4, m2 paddusw m0, m6 paddusw m7, m3 %ifidn %1, sse2 movdqa m4, [esp+4] movdqa m2, [esp+20] %endif ABS2 m5, m1, m6, m3 ABS2 m4, m2, m6, m3 paddusw m5, m1 paddusw m4, m2 paddusw m0, m7 paddusw m5, m4 pavgw m0, m5 ret %endif ; ARCH %endmacro ; SATDS_SSE2 %macro SA8D_16x16_32 1 %ifndef ARCH_X86_64 cglobal x264_pixel_sa8d_8x8_%1, 4,7 mov r6, esp and esp, ~15 sub esp, 32 lea r4, [3*r1] lea r5, [3*r3] call x264_pixel_sa8d_8x8_internal_%1 HADDW m0, m1 movd eax, m0 add eax, 1 shr eax, 1 mov esp, r6 RET cglobal x264_pixel_sa8d_16x16_%1, 4,7 mov r6, esp and esp, ~15 sub esp, 48 lea r4, [3*r1] lea r5, [3*r3] call x264_pixel_sa8d_8x8_internal_%1 lea r0, [r0+4*r1] lea r2, [r2+4*r3] mova [esp+32], m0 call x264_pixel_sa8d_8x8_internal_%1 mov r0, [r6+20] mov r2, [r6+28] add r0, 8 add r2, 8 paddusw m0, [esp+32] mova [esp+32], m0 call x264_pixel_sa8d_8x8_internal_%1 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %if mmsize == 16 paddusw m0, [esp+32] %endif mova [esp+48-mmsize], m0 call x264_pixel_sa8d_8x8_internal_%1 paddusw m0, [esp+48-mmsize] %if mmsize == 16 HADDUW m0, m1 %else mova m2, [esp+32] pxor m7, m7 mova m1, m0 mova m3, m2 punpcklwd m0, m7 punpckhwd m1, m7 punpcklwd m2, m7 punpckhwd m3, m7 paddd m0, m1 paddd m2, m3 paddd m0, m2 HADDD m0, m1 %endif movd eax, m0 add eax, 1 shr eax, 1 mov esp, r6 RET %endif ; !ARCH_X86_64 %endmacro ; SA8D_16x16_32 ;============================================================================= ; INTRA SATD ;============================================================================= %macro INTRA_SA8D_SSE2 1 %ifdef ARCH_X86_64 INIT_XMM ;----------------------------------------------------------------------------- ; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res ) ;----------------------------------------------------------------------------- cglobal x264_intra_sa8d_x3_8x8_core_%1 ; 8x8 hadamard pxor m8, m8 movq m0, [r0+0*FENC_STRIDE] movq m1, [r0+1*FENC_STRIDE] movq m2, [r0+2*FENC_STRIDE] movq m3, [r0+3*FENC_STRIDE] movq m4, [r0+4*FENC_STRIDE] movq m5, [r0+5*FENC_STRIDE] movq m6, [r0+6*FENC_STRIDE] movq m7, [r0+7*FENC_STRIDE] punpcklbw m0, m8 punpcklbw m1, m8 punpcklbw m2, m8 punpcklbw m3, m8 punpcklbw m4, m8 punpcklbw m5, m8 punpcklbw m6, m8 punpcklbw m7, m8 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 ; dc movzx edi, word [r1+0] add di, word [r1+16] add edi, 8 and edi, -16 shl edi, 2 pxor m15, m15 movdqa m8, m2 movdqa m9, m3 movdqa m10, m4 movdqa m11, m5 ABS4 m8, m9, m10, m11, m12, m13 paddusw m8, m10 paddusw m9, m11 %ifidn %1, ssse3 pabsw m10, m6 pabsw m11, m7 pabsw m15, m1 %else movdqa m10, m6 movdqa m11, m7 movdqa m15, m1 ABS2 m10, m11, m13, m14 ABS1 m15, m13 %endif paddusw m10, m11 paddusw m8, m9 paddusw m15, m10 paddusw m15, m8 movdqa m14, m15 ; 7x8 sum movdqa m8, [r1+0] ; left edge movd m9, edi psllw m8, 3 psubw m8, m0 psubw m9, m0 ABS1 m8, m10 ABS1 m9, m11 ; 1x8 sum paddusw m14, m8 paddusw m15, m9 punpcklwd m0, m1 punpcklwd m2, m3 punpcklwd m4, 
m5 punpcklwd m6, m7 punpckldq m0, m2 punpckldq m4, m6 punpcklqdq m0, m4 ; transpose movdqa m1, [r1+16] ; top edge movdqa m2, m15 psllw m1, 3 psrldq m2, 2 ; 8x7 sum psubw m0, m1 ; 8x1 sum ABS1 m0, m1 paddusw m2, m0 ; 3x HADDW movdqa m7, [pw_1 GLOBAL] pmaddwd m2, m7 pmaddwd m14, m7 pmaddwd m15, m7 movdqa m3, m2 punpckldq m2, m14 punpckhdq m3, m14 pshufd m5, m15, 0xf5 paddd m2, m3 paddd m5, m15 movdqa m3, m2 punpcklqdq m2, m5 punpckhqdq m3, m5 pavgw m3, m2 pxor m0, m0 pavgw m3, m0 movq [r2], m3 ; i8x8_v, i8x8_h psrldq m3, 8 movd [r2+8], m3 ; i8x8_dc ret %endif ; ARCH_X86_64 %endmacro ; INTRA_SA8D_SSE2 ; in: r0 = fenc ; out: m0..m3 = hadamard coefs INIT_MMX ALIGN 16 load_hadamard: pxor m7, m7 movd m0, [r0+0*FENC_STRIDE] movd m1, [r0+1*FENC_STRIDE] movd m2, [r0+2*FENC_STRIDE] movd m3, [r0+3*FENC_STRIDE] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 HADAMARD4_1D m0, m1, m2, m3 TRANSPOSE4x4W 0, 1, 2, 3, 4 HADAMARD4_1D m0, m1, m2, m3 SAVE_MM_PERMUTATION load_hadamard ret %macro SCALAR_SUMSUB 4 add %1, %2 add %3, %4 add %2, %2 add %4, %4 sub %2, %1 sub %4, %3 %endmacro %macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp %ifnidn %1, 0 shl %1d, 5 ; log(FDEC_STRIDE) %endif movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE] movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE] movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE] movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE] %ifnidn %1, 0 shr %1d, 5 %endif SCALAR_SUMSUB %2d, %3d, %4d, %5d SCALAR_SUMSUB %2d, %4d, %3d, %5d mov [left_1d+2*%1+0], %2w mov [left_1d+2*%1+2], %3w mov [left_1d+2*%1+4], %4w mov [left_1d+2*%1+6], %5w %endmacro %macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp movzx %2d, byte [r1+%1-FDEC_STRIDE+0] movzx %3d, byte [r1+%1-FDEC_STRIDE+1] movzx %4d, byte [r1+%1-FDEC_STRIDE+2] movzx %5d, byte [r1+%1-FDEC_STRIDE+3] SCALAR_SUMSUB %2d, %3d, %4d, %5d SCALAR_SUMSUB %2d, %4d, %3d, %5d mov [top_1d+2*%1+0], %2w mov [top_1d+2*%1+2], %3w mov [top_1d+2*%1+4], %4w mov [top_1d+2*%1+6], %5w %endmacro %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op pxor %7, %7 pshufw %4, %1, 01001110b pshufw %5, %2, 01001110b pshufw %6, %3, 01001110b paddw %1, %4 paddw %2, %5 paddw %3, %6 punpcklwd %1, %7 punpcklwd %2, %7 punpcklwd %3, %7 pshufw %4, %1, 01001110b pshufw %5, %2, 01001110b pshufw %6, %3, 01001110b %8 %1, %4 %8 %2, %5 %8 %3, %6 %endmacro %macro CLEAR_SUMS 0 %ifdef ARCH_X86_64 mov qword [sums+0], 0 mov qword [sums+8], 0 mov qword [sums+16], 0 %else pxor m7, m7 movq [sums+0], m7 movq [sums+8], m7 movq [sums+16], m7 %endif %endmacro ; in: m1..m3 ; out: m7 ; clobber: m4..m6 %macro SUM3x4 1 %ifidn %1, ssse3 pabsw m4, m1 pabsw m5, m2 pabsw m7, m3 paddw m4, m5 %else movq m4, m1 movq m5, m2 ABS2 m4, m5, m6, m7 movq m7, m3 paddw m4, m5 ABS1 m7, m6 %endif paddw m7, m4 %endmacro ; in: m0..m3 (4x4), m7 (3x4) ; out: m0 v, m4 h, m5 dc ; clobber: m6 %macro SUM4x3 3 ; dc, left, top movq m4, %2 movd m5, %1 psllw m4, 2 psubw m4, m0 psubw m5, m0 punpcklwd m0, m1 punpcklwd m2, m3 punpckldq m0, m2 ; transpose movq m1, %3 psllw m1, 2 psubw m0, m1 ABS2 m4, m5, m2, m3 ; 1x4 sum ABS1 m0, m1 ; 4x1 sum %endmacro %macro INTRA_SATDS_MMX 1 INIT_MMX ;----------------------------------------------------------------------------- ; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal x264_intra_satd_x3_4x4_%1, 2,6 %ifdef ARCH_X86_64 ; stack is 16 byte aligned because abi says so %define top_1d rsp-8 ; size 8 %define left_1d rsp-16 ; size 8 %define t0 r10 %else ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs 
+ return address, so it's still aligned SUB esp, 16 %define top_1d esp+8 %define left_1d esp %define t0 r2 %endif call load_hadamard SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5 mov t0d, r0d SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5 lea t0d, [t0d + r0d + 4] and t0d, -8 shl t0d, 1 ; dc SUM3x4 %1 SUM4x3 t0d, [left_1d], [top_1d] paddw m4, m7 paddw m5, m7 movq m1, m5 psrlq m1, 16 ; 4x3 sum paddw m0, m1 SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw %ifndef ARCH_X86_64 mov r2, r2m %endif movd [r2+0], m0 ; i4x4_v satd movd [r2+4], m4 ; i4x4_h satd movd [r2+8], m5 ; i4x4_dc satd %ifndef ARCH_X86_64 ADD esp, 16 %endif RET %ifdef ARCH_X86_64 %define t0 r10 %define t2 r11 %else %define t0 r0 %define t2 r2 %endif ;----------------------------------------------------------------------------- ; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal x264_intra_satd_x3_16x16_%1, 0,7 %ifdef ARCH_X86_64 %assign stack_pad 88 %else %assign stack_pad 88 + ((stack_offset+88+4)&15) %endif ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call SUB rsp, stack_pad %define sums rsp+64 ; size 24 %define top_1d rsp+32 ; size 32 %define left_1d rsp ; size 32 movifnidn r1d, r1m CLEAR_SUMS ; 1D hadamards xor t2d, t2d mov t0d, 12 .loop_edge: SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6 add t2d, r3d SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6 add t2d, r3d sub t0d, 4 jge .loop_edge shr t2d, 1 add t2d, 8 and t2d, -16 ; dc ; 2D hadamards movifnidn r0d, r0m xor r3d, r3d .loop_y: xor r4d, r4d .loop_x: call load_hadamard SUM3x4 %1 SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4] pavgw m4, m7 pavgw m5, m7 paddw m0, [sums+0] ; i16x16_v satd paddw m4, [sums+8] ; i16x16_h satd paddw m5, [sums+16] ; i16x16_dc satd movq [sums+0], m0 movq [sums+8], m4 movq [sums+16], m5 add r0, 4 inc r4d cmp r4d, 4 jl .loop_x add r0, 4*FENC_STRIDE-16 inc r3d cmp r3d, 4 jl .loop_y ; horizontal sum movifnidn r2d, r2m movq m2, [sums+16] movq m1, [sums+8] movq m0, [sums+0] movq m7, m2 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd psrld m0, 1 pslld m7, 16 psrld m7, 16 paddd m0, m2 psubd m0, m7 movd [r2+8], m2 ; i16x16_dc satd movd [r2+4], m1 ; i16x16_h satd movd [r2+0], m0 ; i16x16_v satd ADD rsp, stack_pad RET ;----------------------------------------------------------------------------- ; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal x264_intra_satd_x3_8x8c_%1, 0,6 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call SUB rsp, 72 %define sums rsp+48 ; size 24 %define dc_1d rsp+32 ; size 16 %define top_1d rsp+16 ; size 16 %define left_1d rsp ; size 16 movifnidn r1d, r1m CLEAR_SUMS ; 1D hadamards mov t0d, 4 .loop_edge: SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5 SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5 sub t0d, 4 jge .loop_edge ; dc movzx t2d, word [left_1d+0] movzx r3d, word [top_1d+0] movzx r4d, word [left_1d+8] movzx r5d, word [top_1d+8] add t2d, r3d lea r3, [r4 + r5] lea t2, [2*t2 + 8] lea r3, [2*r3 + 8] lea r4, [4*r4 + 8] lea r5, [4*r5 + 8] and t2d, -16 ; tl and r3d, -16 ; br and r4d, -16 ; bl and r5d, -16 ; tr mov [dc_1d+ 0], t2d ; tl mov [dc_1d+ 4], r5d ; tr mov [dc_1d+ 8], r4d ; bl mov [dc_1d+12], r3d ; br lea r5, [dc_1d] ; 2D hadamards movifnidn r0d, r0m movifnidn r2d, r2m xor r3d, r3d .loop_y: xor r4d, r4d .loop_x: call load_hadamard SUM3x4 %1 
SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4] pavgw m4, m7 pavgw m5, m7 paddw m0, [sums+16] ; i4x4_v satd paddw m4, [sums+8] ; i4x4_h satd paddw m5, [sums+0] ; i4x4_dc satd movq [sums+16], m0 movq [sums+8], m4 movq [sums+0], m5 add r0, 4 inc r4d cmp r4d, 2 jl .loop_x add r0, 4*FENC_STRIDE-8 add r5, 8 inc r3d cmp r3d, 2 jl .loop_y ; horizontal sum movq m0, [sums+0] movq m1, [sums+8] movq m2, [sums+16] movq m7, m0 psrlq m7, 15 paddw m2, m7 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd psrld m2, 1 movd [r2+0], m0 ; i8x8c_dc satd movd [r2+4], m1 ; i8x8c_h satd movd [r2+8], m2 ; i8x8c_v satd ADD rsp, 72 RET %endmacro ; INTRA_SATDS_MMX %macro ABS_MOV_SSSE3 2 pabsw %1, %2 %endmacro %macro ABS_MOV_MMX 2 pxor %1, %1 psubw %1, %2 pmaxsw %1, %2 %endmacro %define ABS_MOV ABS_MOV_MMX ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0 ; out: [tmp]=hadamard4, m0=satd cglobal x264_hadamard_ac_4x4_mmxext movh m0, [r0] movh m1, [r0+r1] movh m2, [r0+r1*2] movh m3, [r0+r2] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 HADAMARD4_1D m0, m1, m2, m3 TRANSPOSE4x4W 0, 1, 2, 3, 4 HADAMARD4_1D m0, m1, m2, m3 mova [r3], m0 mova [r3+8], m1 mova [r3+16], m2 mova [r3+24], m3 ABS1 m0, m4 ABS1 m1, m4 pand m0, m6 ABS1 m2, m4 ABS1 m3, m4 paddw m0, m1 paddw m2, m3 paddw m0, m2 SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext ret cglobal x264_hadamard_ac_2x2_mmxext mova m0, [r3+0x00] mova m1, [r3+0x20] mova m2, [r3+0x40] mova m3, [r3+0x60] HADAMARD4_1D m0, m1, m2, m3 ABS2 m0, m1, m4, m5 ABS2 m2, m3, m4, m5 SAVE_MM_PERMUTATION x264_hadamard_ac_2x2_mmxext ret cglobal x264_hadamard_ac_8x8_mmxext mova m6, [mask_ac4 GLOBAL] pxor m7, m7 call x264_hadamard_ac_4x4_mmxext add r0, 4 add r3, 32 mova m5, m0 call x264_hadamard_ac_4x4_mmxext lea r0, [r0+4*r1] add r3, 64 paddw m5, m0 call x264_hadamard_ac_4x4_mmxext sub r0, 4 sub r3, 32 paddw m5, m0 call x264_hadamard_ac_4x4_mmxext paddw m5, m0 sub r3, 64 mova [rsp+gprsize+8], m5 ; save satd call x264_hadamard_ac_2x2_mmxext add r3, 8 pand m6, m0 mova m7, m1 paddw m6, m2 paddw m7, m3 %rep 2 call x264_hadamard_ac_2x2_mmxext add r3, 8 paddw m6, m0 paddw m7, m1 paddw m6, m2 paddw m7, m3 %endrep call x264_hadamard_ac_2x2_mmxext sub r3, 24 paddw m6, m0 paddw m7, m1 paddw m6, m2 paddw m7, m3 paddw m6, m7 mova [rsp+gprsize], m6 ; save sa8d SWAP m0, m6 SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext ret %macro HADAMARD_AC_WXH_MMX 2 cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4 %assign pad 16-gprsize-(stack_offset&15) %define ysub r1 sub rsp, 16+128+pad lea r2, [r1*3] lea r3, [rsp+16] call x264_hadamard_ac_8x8_mmxext %if %2==16 %define ysub r2 lea r0, [r0+r1*4] sub rsp, 16 call x264_hadamard_ac_8x8_mmxext %endif %if %1==16 neg ysub sub rsp, 16 lea r0, [r0+ysub*4+8] neg ysub call x264_hadamard_ac_8x8_mmxext %if %2==16 lea r0, [r0+r1*4] sub rsp, 16 call x264_hadamard_ac_8x8_mmxext %endif %endif mova m1, [rsp+0x08] %if %1*%2 >= 128 paddusw m0, [rsp+0x10] paddusw m1, [rsp+0x18] %endif %if %1*%2 == 256 mova m2, [rsp+0x20] paddusw m1, [rsp+0x28] paddusw m2, [rsp+0x30] mova m3, m0 paddusw m1, [rsp+0x38] pxor m3, m2 pand m3, [pw_1 GLOBAL] pavgw m0, m2 psubusw m0, m3 HADDUW m0, m2 %else psrlw m0, 1 HADDW m0, m2 %endif psrlw m1, 1 HADDW m1, m3 movd edx, m0 movd eax, m1 shr edx, 1 %ifdef ARCH_X86_64 shl rdx, 32 add rax, rdx %endif add rsp, 128+%1*%2/4+pad RET %endmacro ; HADAMARD_AC_WXH_MMX HADAMARD_AC_WXH_MMX 16, 16 HADAMARD_AC_WXH_MMX 8, 16 HADAMARD_AC_WXH_MMX 16, 8 HADAMARD_AC_WXH_MMX 8, 8 %macro HADAMARD_AC_SSE2 1 INIT_XMM ; in: r0=pix, r1=stride, r2=stride*3 ; out: 
[esp+16]=sa8d, [esp+32]=satd, r0+=stride*4 cglobal x264_hadamard_ac_8x8_%1 %ifdef ARCH_X86_64 %define spill0 m8 %define spill1 m9 %define spill2 m10 %else %define spill0 [rsp+gprsize] %define spill1 [rsp+gprsize+16] %define spill2 [rsp+gprsize+32] %endif pxor m7, m7 movh m0, [r0] movh m1, [r0+r1] movh m2, [r0+r1*2] movh m3, [r0+r2] lea r0, [r0+r1*4] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 HADAMARD4_1D m0, m1, m2, m3 mova spill0, m3 SWAP m3, m7 movh m4, [r0] movh m5, [r0+r1] movh m6, [r0+r1*2] movh m7, [r0+r2] punpcklbw m4, m3 punpcklbw m5, m3 punpcklbw m6, m3 punpcklbw m7, m3 HADAMARD4_1D m4, m5, m6, m7 mova m3, spill0 %ifdef ARCH_X86_64 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 %else TRANSPOSE8x8W 0,1,2,3,4,5,6,7,spill0,spill1 %endif HADAMARD4_1D m0, m1, m2, m3 HADAMARD4_1D m4, m5, m6, m7 mova spill0, m1 mova spill1, m2 mova spill2, m3 ABS_MOV m1, m0 ABS_MOV m2, m4 ABS_MOV m3, m5 paddw m1, m2 SUMSUB_BA m0, m4 pand m1, [mask_ac4 GLOBAL] ABS_MOV m2, spill0 paddw m1, m3 ABS_MOV m3, spill1 paddw m1, m2 ABS_MOV m2, spill2 paddw m1, m3 ABS_MOV m3, m6 paddw m1, m2 ABS_MOV m2, m7 paddw m1, m3 mova m3, m7 paddw m1, m2 mova m2, m6 psubw m7, spill2 paddw m3, spill2 mova [rsp+gprsize+32], m1 ; save satd mova m1, m5 psubw m6, spill1 paddw m2, spill1 psubw m5, spill0 paddw m1, spill0 mova spill1, m7 SBUTTERFLY qdq, 0, 4, 7 SBUTTERFLY qdq, 1, 5, 7 SBUTTERFLY qdq, 2, 6, 7 SUMSUB_BADC m0, m4, m1, m5 SUMSUB_BA m2, m6 ABS1 m0, m7 ABS1 m1, m7 pand m0, [mask_ac8 GLOBAL] ABS1 m2, m7 ABS1 m4, m7 ABS1 m5, m7 ABS1 m6, m7 mova m7, spill1 paddw m0, m4 SBUTTERFLY qdq, 3, 7, 4 SUMSUB_BA m3, m7 paddw m1, m5 ABS1 m3, m4 ABS1 m7, m4 paddw m2, m6 paddw m3, m7 paddw m0, m1 paddw m2, m3 paddw m0, m2 mova [rsp+gprsize+16], m0 ; save sa8d SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1 ret HADAMARD_AC_WXH_SSE2 16, 16, %1 HADAMARD_AC_WXH_SSE2 8, 16, %1 HADAMARD_AC_WXH_SSE2 16, 8, %1 HADAMARD_AC_WXH_SSE2 8, 8, %1 %endmacro ; HADAMARD_AC_SSE2 ; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) %macro HADAMARD_AC_WXH_SSE2 3 cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3 %assign pad 16-gprsize-(stack_offset&15) %define ysub r1 sub rsp, 48+pad lea r2, [r1*3] call x264_hadamard_ac_8x8_%3 %if %2==16 %define ysub r2 lea r0, [r0+r1*4] sub rsp, 32 call x264_hadamard_ac_8x8_%3 %endif %if %1==16 neg ysub sub rsp, 32 lea r0, [r0+ysub*4+8] neg ysub call x264_hadamard_ac_8x8_%3 %if %2==16 lea r0, [r0+r1*4] sub rsp, 32 call x264_hadamard_ac_8x8_%3 %endif %endif mova m1, [rsp+0x20] %if %1*%2 >= 128 paddusw m0, [rsp+0x30] paddusw m1, [rsp+0x40] %endif %if %1*%2 == 256 paddusw m0, [rsp+0x50] paddusw m1, [rsp+0x60] paddusw m0, [rsp+0x70] paddusw m1, [rsp+0x80] psrlw m0, 1 %endif HADDW m0, m2 HADDW m1, m3 movd edx, m0 movd eax, m1 shr edx, 2 - (%1*%2 >> 8) shr eax, 1 %ifdef ARCH_X86_64 shl rdx, 32 add rax, rdx %endif add rsp, 16+%1*%2/2+pad RET %endmacro ; HADAMARD_AC_WXH_SSE2 ; instantiate satds %ifndef ARCH_X86_64 cextern x264_pixel_sa8d_8x8_internal_mmxext SA8D_16x16_32 mmxext %endif %define ABS1 ABS1_MMX %define ABS2 ABS2_MMX SATDS_SSE2 sse2 SA8D_16x16_32 sse2 INTRA_SA8D_SSE2 sse2 INTRA_SATDS_MMX mmxext HADAMARD_AC_SSE2 sse2 %define ABS1 ABS1_SSSE3 %define ABS2 ABS2_SSSE3 %define ABS_MOV ABS_MOV_SSSE3 SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3. 
SATDS_SSE2 ssse3 SA8D_16x16_32 ssse3 INTRA_SA8D_SSE2 ssse3 INTRA_SATDS_MMX ssse3 HADAMARD_AC_SSE2 ssse3 SATDS_SSE2 ssse3_phadd ;============================================================================= ; SSIM ;============================================================================= ;----------------------------------------------------------------------------- ; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, ; const uint8_t *pix2, int stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4 pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 pxor m4, m4 %rep 4 movq m5, [r0] movq m6, [r2] punpcklbw m5, m0 punpcklbw m6, m0 paddw m1, m5 paddw m2, m6 movdqa m7, m5 pmaddwd m5, m5 pmaddwd m7, m6 pmaddwd m6, m6 paddd m3, m5 paddd m4, m7 paddd m3, m6 add r0, r1 add r2, r3 %endrep ; PHADDW m1, m2 ; PHADDD m3, m4 movdqa m7, [pw_1 GLOBAL] pshufd m5, m3, 0xb1 pmaddwd m1, m7 pmaddwd m2, m7 pshufd m6, m4, 0xb1 packssdw m1, m2 paddd m3, m5 pshufd m1, m1, 0xd8 paddd m4, m6 pmaddwd m1, m7 movdqa m5, m3 punpckldq m3, m4 punpckhdq m5, m4 %ifdef ARCH_X86_64 %define t0 r4 %else %define t0 eax mov t0, r4m %endif movq [t0+ 0], m1 movq [t0+ 8], m3 psrldq m1, 8 movq [t0+16], m1 movq [t0+24], m5 RET ;----------------------------------------------------------------------------- ; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width ) ;----------------------------------------------------------------------------- cglobal x264_pixel_ssim_end4_sse2, 3,3 movdqa m0, [r0+ 0] movdqa m1, [r0+16] movdqa m2, [r0+32] movdqa m3, [r0+48] movdqa m4, [r0+64] paddd m0, [r1+ 0] paddd m1, [r1+16] paddd m2, [r1+32] paddd m3, [r1+48] paddd m4, [r1+64] paddd m0, m1 paddd m1, m2 paddd m2, m3 paddd m3, m4 movdqa m5, [ssim_c1 GLOBAL] movdqa m6, [ssim_c2 GLOBAL] TRANSPOSE4x4D 0, 1, 2, 3, 4 ; s1=m0, s2=m1, ss=m2, s12=m3 movdqa m4, m1 pslld m1, 16 pmaddwd m4, m0 ; s1*s2 por m0, m1 pmaddwd m0, m0 ; s1*s1 + s2*s2 pslld m4, 1 pslld m3, 7 pslld m2, 6 psubd m3, m4 ; covar*2 psubd m2, m0 ; vars paddd m0, m5 paddd m4, m5 paddd m3, m6 paddd m2, m6 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1) cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) cvtdq2ps m2, m2 ; (float)(vars + ssim_c2) mulps m4, m3 mulps m0, m2 divps m4, m0 ; ssim cmp r2d, 4 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level neg r2 %ifdef PIC lea r3, [mask_ff + 16 GLOBAL] movdqu m1, [r3 + r2*4] %else movdqu m1, [mask_ff + r2*4 + 16 GLOBAL] %endif pand m4, m1 .skip: movhlps m0, m4 addps m0, m4 pshuflw m4, m0, 0xE addss m0, m4 %ifndef ARCH_X86_64 movd r0m, m0 fld dword r0m %endif RET ;============================================================================= ; Successive Elimination ADS ;============================================================================= %macro ADS_START 1 ; unroll_size %ifdef ARCH_X86_64 %define t0 r6 mov r10, rsp %else %define t0 r4 mov rbp, rsp %endif mov r0d, r5m sub rsp, r0 sub rsp, %1*4-1 and rsp, ~15 mov t0, rsp shl r2d, 1 %endmacro %macro ADS_END 1 add r1, 8*%1 add r3, 8*%1 add t0, 4*%1 sub r0d, 4*%1 jg .loop jmp ads_mvs %endmacro %define ABS1 ABS1_MMX ;----------------------------------------------------------------------------- ; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta, ; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) 
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext, 4,7
    movq     mm6, [r0]
    movq     mm4, [r0+8]
    pshufw   mm7, mm6, 0
    pshufw   mm6, mm6, 0xAA
    pshufw   mm5, mm4, 0
    pshufw   mm4, mm4, 0xAA
    ADS_START 1
.loop:
    movq     mm0, [r1]
    movq     mm1, [r1+16]
    psubw    mm0, mm7
    psubw    mm1, mm6
    ABS1     mm0, mm2
    ABS1     mm1, mm3
    movq     mm2, [r1+r2]
    movq     mm3, [r1+r2+16]
    psubw    mm2, mm5
    psubw    mm3, mm4
    paddw    mm0, mm1
    ABS1     mm2, mm1
    ABS1     mm3, mm1
    paddw    mm0, mm2
    paddw    mm0, mm3
%ifdef ARCH_X86_64
    pshufw   mm1, [r10+8], 0
%else
    pshufw   mm1, [ebp+stack_offset+28], 0
%endif
    paddusw  mm0, [r3]
    psubusw  mm1, mm0
    packsswb mm1, mm1
    movd     [t0], mm1
    ADS_END 1

cglobal x264_pixel_ads2_mmxext, 4,7
    movq     mm6, [r0]
    pshufw   mm5, r6m, 0
    pshufw   mm7, mm6, 0
    pshufw   mm6, mm6, 0xAA
    ADS_START 1
.loop:
    movq     mm0, [r1]
    movq     mm1, [r1+r2]
    psubw    mm0, mm7
    psubw    mm1, mm6
    ABS1     mm0, mm2
    ABS1     mm1, mm3
    paddw    mm0, mm1
    paddusw  mm0, [r3]
    movq     mm4, mm5
    psubusw  mm4, mm0
    packsswb mm4, mm4
    movd     [t0], mm4
    ADS_END 1

cglobal x264_pixel_ads1_mmxext, 4,7
    pshufw   mm7, [r0], 0
    pshufw   mm6, r6m, 0
    ADS_START 2
.loop:
    movq     mm0, [r1]
    movq     mm1, [r1+8]
    psubw    mm0, mm7
    psubw    mm1, mm7
    ABS1     mm0, mm2
    ABS1     mm1, mm3
    paddusw  mm0, [r3]
    paddusw  mm1, [r3+8]
    movq     mm4, mm6
    movq     mm5, mm6
    psubusw  mm4, mm0
    psubusw  mm5, mm1
    packsswb mm4, mm5
    movq     [t0], mm4
    ADS_END 2

%macro ADS_SSE2 1
cglobal x264_pixel_ads4_%1, 4,7
    movdqa     xmm4, [r0]
    pshuflw    xmm7, xmm4, 0
    pshuflw    xmm6, xmm4, 0xAA
    pshufhw    xmm5, xmm4, 0
    pshufhw    xmm4, xmm4, 0xAA
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpckhqdq xmm5, xmm5
    punpckhqdq xmm4, xmm4
%ifdef ARCH_X86_64
    pshuflw    xmm8, r6m, 0
    punpcklqdq xmm8, xmm8
    ADS_START 2
    movdqu     xmm10, [r1]
    movdqu     xmm11, [r1+r2]
.loop:
    movdqa     xmm0, xmm10
    movdqu     xmm1, [r1+16]
    movdqa     xmm10, xmm1
    psubw      xmm0, xmm7
    psubw      xmm1, xmm6
    ABS1       xmm0, xmm2
    ABS1       xmm1, xmm3
    movdqa     xmm2, xmm11
    movdqu     xmm3, [r1+r2+16]
    movdqa     xmm11, xmm3
    psubw      xmm2, xmm5
    psubw      xmm3, xmm4
    paddw      xmm0, xmm1
    movdqu     xmm9, [r3]
    ABS1       xmm2, xmm1
    ABS1       xmm3, xmm1
    paddw      xmm0, xmm2
    paddw      xmm0, xmm3
    paddusw    xmm0, xmm9
    movdqa     xmm1, xmm8
    psubusw    xmm1, xmm0
    packsswb   xmm1, xmm1
    movq       [t0], xmm1
%else
    ADS_START 2
.loop:
    movdqu     xmm0, [r1]
    movdqu     xmm1, [r1+16]
    psubw      xmm0, xmm7
    psubw      xmm1, xmm6
    ABS1       xmm0, xmm2
    ABS1       xmm1, xmm3
    movdqu     xmm2, [r1+r2]
    movdqu     xmm3, [r1+r2+16]
    psubw      xmm2, xmm5
    psubw      xmm3, xmm4
    paddw      xmm0, xmm1
    ABS1       xmm2, xmm1
    ABS1       xmm3, xmm1
    paddw      xmm0, xmm2
    paddw      xmm0, xmm3
    movd       xmm1, [ebp+stack_offset+28]
    movdqu     xmm2, [r3]
    pshuflw    xmm1, xmm1, 0
    punpcklqdq xmm1, xmm1
    paddusw    xmm0, xmm2
    psubusw    xmm1, xmm0
    packsswb   xmm1, xmm1
    movq       [t0], xmm1
%endif ; ARCH
    ADS_END 2

cglobal x264_pixel_ads2_%1, 4,7
    movq       xmm6, [r0]
    movd       xmm5, r6m
    pshuflw    xmm7, xmm6, 0
    pshuflw    xmm6, xmm6, 0xAA
    pshuflw    xmm5, xmm5, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpcklqdq xmm5, xmm5
    ADS_START 2
.loop:
    movdqu     xmm0, [r1]
    movdqu     xmm1, [r1+r2]
    psubw      xmm0, xmm7
    psubw      xmm1, xmm6
    movdqu     xmm4, [r3]
    ABS1       xmm0, xmm2
    ABS1       xmm1, xmm3
    paddw      xmm0, xmm1
    paddusw    xmm0, xmm4
    movdqa     xmm1, xmm5
    psubusw    xmm1, xmm0
    packsswb   xmm1, xmm1
    movq       [t0], xmm1
    ADS_END 2

cglobal x264_pixel_ads1_%1, 4,7
    movd       xmm7, [r0]
    movd       xmm6, r6m
    pshuflw    xmm7, xmm7, 0
    pshuflw    xmm6, xmm6, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    ADS_START 4
.loop:
    movdqu     xmm0, [r1]
    movdqu     xmm1, [r1+16]
    psubw      xmm0, xmm7
    psubw      xmm1, xmm7
    movdqu     xmm2, [r3]
    movdqu     xmm3, [r3+16]
    ABS1       xmm0, xmm4
    ABS1       xmm1, xmm5
    paddusw    xmm0, xmm2
    paddusw    xmm1, xmm3
    movdqa     xmm4, xmm6
    movdqa     xmm5, xmm6
    psubusw    xmm4, xmm0
    psubusw    xmm5, xmm1
    packsswb   xmm4, xmm5
    movdqa     [t0], xmm4
    ADS_END 4
%endmacro

ADS_SSE2 sse2
%define ABS1 ABS1_SSSE3
ADS_SSE2 ssse3

; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv=0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i=0; i<width; i+=4 )
;     {
;         uint32_t mask = *(uint32_t*)(masks+i);
;         if( !mask ) continue;
;         for( j=0; j<4; j++ )
;             if( mask & (255<<j*8) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }
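
; For orientation, a plain-C sketch of what the ads4 kernels above compute per
; candidate position (ads2/ads1 are the same idea with 2 or 1 DC terms). The
; asm instead writes one mask byte per candidate (nonzero if the estimate is
; under thresh) and then jumps to ads_mvs, which compacts the mask into the
; mvs list as described in the comment above. Illustrative reference only;
; the function name and loop layout here are not taken from x264's C code,
; while enc_dc, sums, delta, cost_mvx, width and thresh follow the prototype.
;
; static int ads4_ref( int enc_dc[4], uint16_t *sums, int delta,
;                      uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
; {
;     int nmv = 0, i;
;     for( i = 0; i < width; i++, sums++ )
;     {
;         int ads = abs( enc_dc[0] - sums[0] )
;                 + abs( enc_dc[1] - sums[8] )
;                 + abs( enc_dc[2] - sums[delta] )
;                 + abs( enc_dc[3] - sums[delta+8] )
;                 + cost_mvx[i];
;         if( ads < thresh )
;             mvs[nmv++] = i;
;     }
;     return nmv;
; }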