--- .gitignore
+++ .gitignore
config.h
config.mak
x264
+checkasm
gtk/test
gtk/x264_gtk_encode
gtk/x264_icon.h
--- Makefile
+++ Makefile
include config.mak
+all: default
+
SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
common/frame.c common/dct.c common/cpu.c common/cabac.c \
common/common.c common/mdate.c common/set.c \
endif
# MMX/SSE optims
-ifeq ($(ARCH),X86)
ifneq ($(AS),)
-SRCS += common/i386/mc-c.c common/i386/predict-c.c
-ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \
- common/i386/pixel-a.asm common/i386/mc-a.asm \
- common/i386/mc-a2.asm common/i386/predict-a.asm \
- common/i386/pixel-sse2.asm common/i386/quant-a.asm \
- common/i386/deblock-a.asm
+X86SRC0 = dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
+ pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
+ cpu-32.asm dct-32.asm
+X86SRC = $(X86SRC0:%=common/x86/%)
+
+ifeq ($(ARCH),X86)
+SRCS += common/x86/mc-c.c common/x86/predict-c.c
+ASMSRC = $(X86SRC) common/x86/pixel-32.asm
OBJASM = $(ASMSRC:%.asm=%.o)
-ASFLAGS += -Icommon/i386/
-endif
+ASFLAGS += -Icommon/x86/
+$(OBJASM): common/x86/x86inc.asm common/x86/x86inc-32.asm
endif
-# MMX/SSE optims
ifeq ($(ARCH),X86_64)
-ifneq ($(AS),)
-SRCS += common/i386/mc-c.c common/i386/predict-c.c
-ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
- common/amd64/pixel-a.asm common/amd64/mc-a.asm \
- common/amd64/mc-a2.asm common/amd64/predict-a.asm \
- common/amd64/pixel-sse2.asm common/amd64/quant-a.asm \
- common/amd64/deblock-a.asm
+SRCS += common/x86/mc-c.c common/x86/predict-c.c
+ASMSRC = $(X86SRC:-32.asm=-64.asm)
OBJASM = $(ASMSRC:%.asm=%.o)
-ASFLAGS += -Icommon/amd64
+ASFLAGS += -Icommon/x86/ -DARCH_X86_64
+$(OBJASM): common/x86/x86inc.asm common/x86/x86inc-64.asm
endif
endif
DEP = depend
.PHONY: all default fprofiled clean distclean install install-gtk uninstall dox test testclean
-all: default
default: $(DEP) x264$(EXE)
checkasm: tools/checkasm.o libx264.a
$(CC) -o $@ $+ $(LDFLAGS)
-common/amd64/*.o: common/amd64/amd64inc.asm
-common/i386/*.o: common/i386/i386inc.asm
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
# delete local/anonymous symbols, so they don't show up in oprofile
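A note on the two GNU make substitution references introduced above: $(X86SRC0:%=common/x86/%) prefixes each bare filename with common/x86/, and $(X86SRC:-32.asm=-64.asm) rewrites only the names that end in -32.asm, passing the rest through unchanged. On X86_64 the assembled list therefore expands to:

    common/x86/dct-a.asm    common/x86/deblock-a.asm  common/x86/mc-a.asm
    common/x86/mc-a2.asm    common/x86/pixel-a.asm    common/x86/predict-a.asm
    common/x86/quant-a.asm  common/x86/sad-a.asm
    common/x86/cpu-64.asm   common/x86/dct-64.asm

The common/amd64 sources deleted below are the ones folded into these unified common/x86 files.
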
--- common/amd64/dct-a.asm
+++ /dev/null
-;*****************************************************************************
-;* dct.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2003 x264 project
-;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
-;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
-;* Min Chen <chenm001.163.com> (converted to nasm)
-;* Loren Merritt <lorenm@u.washington.edu> (dct8)
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-;*****************************************************************************
-;* *
-;* Revision history: *
-;* *
-;* 2004.04.28 portab all 4x4 function to nasm (CM) *
-;* *
-;*****************************************************************************
-
-BITS 64
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "amd64inc.asm"
-
-%macro MMX_ZERO 1
- pxor %1, %1
-%endmacro
-
-%macro MMX_LOAD_DIFF_4P 5
- movd %1, %4
- punpcklbw %1, %3
- movd %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endmacro
-
-%macro MMX_LOAD_DIFF_8P 5
- movq %1, %4
- punpcklbw %1, %3
- movq %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endmacro
-
-%macro MMX_SUMSUB_BA 2
- paddw %1, %2
- paddw %2, %2
- psubw %2, %1
-%endmacro
-
-%macro MMX_SUMSUB_BADC 4
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
-%endmacro
-
-%macro MMX_SUMSUB2_AB 3
- movq %3, %1
- paddw %1, %1
- paddw %1, %2
- psubw %3, %2
- psubw %3, %2
-%endmacro
-
-%macro MMX_SUMSUBD2_AB 4
- movq %4, %1
- movq %3, %2
- psraw %2, 1
- psraw %4, 1
- paddw %1, %2
- psubw %4, %3
-%endmacro
-
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-;-----------------------------------------------------------------------------
-; input ABCD output ADTC
-;-----------------------------------------------------------------------------
-%macro MMX_TRANSPOSE 5
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
-%endmacro
-
-;-----------------------------------------------------------------------------
-; input ABCDEFGH output AFHDTECB
-;-----------------------------------------------------------------------------
-%macro SSE2_TRANSPOSE8x8 9
- SBUTTERFLY dqa, wd, %1, %2, %9
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- SBUTTERFLY dqa, dq, %9, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %9, %4, %5
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
-%endmacro
-
-%macro MMX_STORE_DIFF_4P 5
- paddw %1, %3
- psraw %1, 6
- movd %2, %5
- punpcklbw %2, %4
- paddsw %1, %2
- packuswb %1, %1
- movd %5, %1
-%endmacro
-
-%macro MMX_STORE_DIFF_8P 4
- psraw %1, 6
- movq %2, %4
- punpcklbw %2, %3
- paddsw %1, %2
- packuswb %1, %1
- movq %4, %1
-%endmacro
-
-;=============================================================================
-; Constants
-;=============================================================================
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-pw_32: times 8 dw 32
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; void x264_dct4x4dc_mmx( int16_t d[4][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx
- movq mm0, [parm1q+ 0]
- movq mm1, [parm1q+ 8]
- movq mm2, [parm1q+16]
- movq mm3, [parm1q+24]
-
- MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
- MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
-
- MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
-
- MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
- MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
-
- movq mm6, [pw_1 GLOBAL]
- paddw mm0, mm6
- paddw mm2, mm6
- psraw mm0, 1
- movq [parm1q+ 0],mm0
- psraw mm2, 1
- movq [parm1q+ 8],mm2
- paddw mm3, mm6
- paddw mm4, mm6
- psraw mm3, 1
- movq [parm1q+16],mm3
- psraw mm4, 1
- movq [parm1q+24],mm4
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_idct4x4dc_mmx( int16_t d[4][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_idct4x4dc_mmx
- movq mm0, [parm1q+ 0]
- movq mm1, [parm1q+ 8]
- movq mm2, [parm1q+16]
- movq mm3, [parm1q+24]
-
- MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
- MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
-
- MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
-
- MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
- MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
-
- movq [parm1q+ 0], mm0
- movq [parm1q+ 8], mm2
- movq [parm1q+16], mm3
- movq [parm1q+24], mm4
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-cglobal x264_sub4x4_dct_mmx
- MMX_ZERO mm7
-
- ; Load 4 lines
- MMX_LOAD_DIFF_4P mm0, mm6, mm7, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
- MMX_LOAD_DIFF_4P mm1, mm6, mm7, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
- MMX_LOAD_DIFF_4P mm2, mm6, mm7, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
- MMX_LOAD_DIFF_4P mm3, mm6, mm7, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
-
- MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
-
- MMX_SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12
- MMX_SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12
-
- ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
- MMX_TRANSPOSE mm2, mm0, mm3, mm4, mm1
-
- MMX_SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12
-
- MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
- MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
-
- movq [parm1q+ 0], mm1
- movq [parm1q+ 8], mm2
- movq [parm1q+16], mm3
- movq [parm1q+24], mm0
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx
- ; Load dct coeffs
- movq mm0, [parm2q+ 0] ; dct
- movq mm1, [parm2q+ 8]
- movq mm2, [parm2q+16]
- movq mm3, [parm2q+24]
-
- MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
- MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
-
- MMX_SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13
-
- ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0
- MMX_TRANSPOSE mm1, mm4, mm0, mm2, mm3
-
- MMX_SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02
- MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
-
- MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
-
- MMX_ZERO mm7
- movq mm6, [pw_32 GLOBAL]
-
- MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [parm1q+0*FDEC_STRIDE]
- MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [parm1q+1*FDEC_STRIDE]
- MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [parm1q+2*FDEC_STRIDE]
- MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [parm1q+3*FDEC_STRIDE]
-
- ret
-
-
-
-; =============================================================================
-; 8x8 Transform
-; =============================================================================
-
-; in: ABCDEFGH
-; out: FBCGEDHI
-%macro DCT8_1D 10
- MMX_SUMSUB_BA %8, %1 ; %8=s07, %1=d07
- MMX_SUMSUB_BA %7, %2 ; %7=s16, %2=d16
- MMX_SUMSUB_BA %6, %3 ; %6=s25, %3=d25
- MMX_SUMSUB_BA %5, %4 ; %5=s34, %4=d34
-
- MMX_SUMSUB_BA %5, %8 ; %5=a0, %8=a2
- MMX_SUMSUB_BA %6, %7 ; %6=a1, %7=a3
-
- movdqa %9, %1
- psraw %9, 1
- paddw %9, %1
- paddw %9, %2
- paddw %9, %3 ; %9=a4
-
- movdqa %10, %4
- psraw %10, 1
- paddw %10, %4
- paddw %10, %2
- psubw %10, %3 ; %10=a7
-
- MMX_SUMSUB_BA %4, %1
- psubw %1, %3
- psubw %4, %2
- psraw %3, 1
- psraw %2, 1
- psubw %1, %3 ; %1=a5
- psubw %4, %2 ; %4=a6
-
- MMX_SUMSUB_BA %6, %5 ; %6=b0, %5=b4
-
- movdqa %2, %10
- psraw %2, 2
- paddw %2, %9 ; %2=b1
- psraw %9, 2
- psubw %9, %10 ; %9=b7
-
- movdqa %3, %7
- psraw %3, 1
- paddw %3, %8 ; %3=b2
- psraw %8, 1
- psubw %8, %7 ; %8=b6
-
- movdqa %7, %4
- psraw %7, 2
- paddw %7, %1 ; %7=b3
- psraw %1, 2
- psubw %4, %1 ; %4=b5
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_sse2
- MMX_ZERO xmm9
-
- MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
- MMX_LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
- MMX_LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
- MMX_LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
- MMX_LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE]
- MMX_LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE]
- MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE]
- MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE]
-
- DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
- SSE2_TRANSPOSE8x8 xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
- DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9
-
- movdqa [parm1q+0x00], xmm4
- movdqa [parm1q+0x10], xmm3
- movdqa [parm1q+0x20], xmm8
- movdqa [parm1q+0x30], xmm2
- movdqa [parm1q+0x40], xmm0
- movdqa [parm1q+0x50], xmm6
- movdqa [parm1q+0x60], xmm1
- movdqa [parm1q+0x70], xmm7
-
- ret
-
-
-; in: ABCDEFGH
-; out: IBHDEACG
-%macro IDCT8_1D 10
- MMX_SUMSUB_BA %5, %1 ; %5=a0, %1=a2
- movdqa %10, %3
- psraw %3, 1
- psubw %3, %7 ; %3=a4
- psraw %7, 1
- paddw %7, %10 ; %7=a6
-
- movdqa %9, %2
- psraw %9, 1
- paddw %9, %2
- paddw %9, %4
- paddw %9, %6 ; %9=a7
-
- movdqa %10, %6
- psraw %10, 1
- paddw %10, %6
- paddw %10, %8
- psubw %10, %2 ; %10=a5
-
- psubw %2, %4
- psubw %6, %4
- paddw %2, %8
- psubw %6, %8
- psraw %4, 1
- psraw %8, 1
- psubw %2, %4 ; %2=a3
- psubw %6, %8 ; %6=a1
-
- MMX_SUMSUB_BA %7, %5 ; %7=b0, %5=b6
- MMX_SUMSUB_BA %3, %1 ; %3=b2, %1=b4
-
- movdqa %4, %9
- psraw %4, 2
- paddw %4, %6 ; %4=b1
- psraw %6, 2
- psubw %9, %6 ; %9=b7
-
- movdqa %8, %10
- psraw %8, 2
- paddw %8, %2 ; %8=b3
- psraw %2, 2
- psubw %2, %10 ; %2=b5
-
- MMX_SUMSUB_BA %9, %7 ; %9=c0, %7=c7
- MMX_SUMSUB_BA %2, %3 ; %2=c1, %3=c6
- MMX_SUMSUB_BA %8, %1 ; %8=c2, %1=c5
- MMX_SUMSUB_BA %4, %5 ; %4=c3, %5=c4
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
-;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2
- movdqa xmm0, [parm2q+0x00]
- movdqa xmm1, [parm2q+0x10]
- movdqa xmm2, [parm2q+0x20]
- movdqa xmm3, [parm2q+0x30]
- movdqa xmm4, [parm2q+0x40]
- movdqa xmm5, [parm2q+0x50]
- movdqa xmm6, [parm2q+0x60]
- movdqa xmm7, [parm2q+0x70]
-
- IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
- SSE2_TRANSPOSE8x8 xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
- paddw xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end
- IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2
-
- MMX_ZERO xmm15
- MMX_STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE]
- MMX_STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE]
- MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE]
- MMX_STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE]
- MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE]
- MMX_STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE]
- MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE]
- MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
-
- ret
-
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
-; uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-%macro SUB_NxN_DCT 6
-cglobal %1
- call %2
- add parm1q, %3
- add parm2q, %4-%5*FENC_STRIDE
- add parm3q, %4-%5*FDEC_STRIDE
- call %2
- add parm1q, %3
- add parm2q, %4*FENC_STRIDE-%6
- add parm3q, %4*FDEC_STRIDE-%6
- call %2
- add parm1q, %3
- add parm2q, %4-%5*FENC_STRIDE
- add parm3q, %4-%5*FDEC_STRIDE
- jmp %2
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
-;-----------------------------------------------------------------------------
-%macro ADD_NxN_IDCT 6
-cglobal %1
- call %2
- add parm1q, %4-%5*FDEC_STRIDE
- add parm2q, %3
- call %2
- add parm1q, %4*FDEC_STRIDE-%6
- add parm2q, %3
- call %2
- add parm1q, %4-%5*FDEC_STRIDE
- add parm2q, %3
- jmp %2
-%endmacro
-
-SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx, 32, 4, 0, 4
-ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx, 32, 4, 0, 4
-
-SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx, 32, 4, 4, 12
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx, 32, 4, 4, 12
-
-SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
-
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_field_sse2
- punpcklwd xmm0, [parm2q]
- punpckhwd xmm1, [parm2q]
- punpcklwd xmm2, [parm2q+16]
- punpckhwd xmm3, [parm2q+16]
- psrad xmm0, 16
- psrad xmm1, 16
- psrad xmm2, 16
- psrad xmm3, 16
- movq [parm1q ], xmm0
- movdqa [parm1q+16], xmm1
- movdqa [parm1q+32], xmm2
- movhlps xmm0, xmm0
- movdqa [parm1q+48], xmm3
- movq [parm1q+12], xmm0
- movd [parm1q+ 8], xmm1
- ret
-
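
In the dct-a.asm removal above, MMX_SUMSUB_BA leaves a+b and b-a in its two registers (paddw %1,%2 gives a+b; paddw %2,%2 then psubw %2,%1 gives 2b-(a+b) = b-a), and the s03/s12/d03/d12 comments are the row sums and differences of the 4x4 forward transform. A minimal scalar sketch of one 1-D pass, with illustrative names (this is a sketch, not x264's C reference code):

    #include <stdint.h>

    /* One pass of the H.264 4x4 forward transform core, matching the
       s03/s12/d03/d12 register comments in x264_sub4x4_dct_mmx above.
       Writes transposed, as MMX_TRANSPOSE does between the two passes. */
    static void dct4x4_1d( const int16_t in[4][4], int16_t out[4][4] )
    {
        for( int i = 0; i < 4; i++ )
        {
            int s03 = in[i][0] + in[i][3];
            int s12 = in[i][1] + in[i][2];
            int d03 = in[i][0] - in[i][3];
            int d12 = in[i][1] - in[i][2];
            out[0][i] =   s03 +   s12;
            out[1][i] = 2*d03 +   d12;
            out[2][i] =   s03 -   s12;
            out[3][i] =   d03 - 2*d12;
        }
    }
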
--- common/amd64/deblock-a.asm
+++ /dev/null
-;*****************************************************************************
-;* deblock-a.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2005 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-BITS 64
-
-%include "amd64inc.asm"
-
-SECTION_RODATA
-pb_01: times 16 db 0x01
-pb_03: times 16 db 0x03
-pb_a1: times 16 db 0xa1
-
-SECTION .text
-
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
- [base], [base+stride], [base+stride*2], [base3], \
- [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
-; in: 8 rows of 4 bytes in %1..%8
-; out: 4 rows of 8 bytes in mm0..mm3
-%macro TRANSPOSE4x8_LOAD 8
- movd mm0, %1
- movd mm2, %2
- movd mm1, %3
- movd mm3, %4
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
- movq mm2, mm0
- punpcklwd mm0, mm1
- punpckhwd mm2, mm1
-
- movd mm4, %5
- movd mm6, %6
- movd mm5, %7
- movd mm7, %8
- punpcklbw mm4, mm6
- punpcklbw mm5, mm7
- movq mm6, mm4
- punpcklwd mm4, mm5
- punpckhwd mm6, mm5
-
- movq mm1, mm0
- movq mm3, mm2
- punpckldq mm0, mm4
- punpckhdq mm1, mm4
- punpckldq mm2, mm6
- punpckhdq mm3, mm6
-%endmacro
-
-; in: 4 rows of 8 bytes in mm0..mm3
-; out: 8 rows of 4 bytes in %1..%8
-%macro TRANSPOSE8x4_STORE 8
- movq mm4, mm0
- movq mm5, mm1
- movq mm6, mm2
- punpckhdq mm4, mm4
- punpckhdq mm5, mm5
- punpckhdq mm6, mm6
-
- punpcklbw mm0, mm1
- punpcklbw mm2, mm3
- movq mm1, mm0
- punpcklwd mm0, mm2
- punpckhwd mm1, mm2
- movd %1, mm0
- punpckhdq mm0, mm0
- movd %2, mm0
- movd %3, mm1
- punpckhdq mm1, mm1
- movd %4, mm1
-
- punpckhdq mm3, mm3
- punpcklbw mm4, mm5
- punpcklbw mm6, mm3
- movq mm5, mm4
- punpcklwd mm4, mm6
- punpckhwd mm5, mm6
- movd %5, mm4
- punpckhdq mm4, mm4
- movd %6, mm4
- movd %7, mm5
- punpckhdq mm5, mm5
- movd %8, mm5
-%endmacro
-
-%macro SBUTTERFLY 4
- movq %4, %2
- punpckl%1 %2, %3
- punpckh%1 %4, %3
-%endmacro
-
-; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
-; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
-%macro TRANSPOSE6x8_MEM 9
- movq mm0, %1
- movq mm1, %3
- movq mm2, %5
- movq mm3, %7
- SBUTTERFLY bw, mm0, %2, mm4
- SBUTTERFLY bw, mm1, %4, mm5
- SBUTTERFLY bw, mm2, %6, mm6
- movq [%9+0x10], mm5
- SBUTTERFLY bw, mm3, %8, mm7
- SBUTTERFLY wd, mm0, mm1, mm5
- SBUTTERFLY wd, mm2, mm3, mm1
- punpckhdq mm0, mm2
- movq [%9+0x00], mm0
- SBUTTERFLY wd, mm4, [%9+0x10], mm3
- SBUTTERFLY wd, mm6, mm7, mm2
- SBUTTERFLY dq, mm4, mm6, mm0
- SBUTTERFLY dq, mm5, mm1, mm7
- punpckldq mm3, mm2
- movq [%9+0x10], mm5
- movq [%9+0x20], mm7
- movq [%9+0x30], mm4
- movq [%9+0x40], mm0
- movq [%9+0x50], mm3
-%endmacro
-
-; out: %4 = |%1-%2|>%3
-; clobbers: %5
-%macro DIFF_GT 6
- mov%1 %6, %3
- mov%1 %5, %2
- psubusb %6, %2
- psubusb %5, %3
- por %5, %6
- psubusb %5, %4
-%endmacro
-%macro DIFF_GT_MMX 5
- DIFF_GT q, %1, %2, %3, %4, %5
-%endmacro
-%macro DIFF_GT_SSE2 5
- DIFF_GT dqa, %1, %2, %3, %4, %5
-%endmacro
-
-; out: %4 = |%1-%2|>%3
-; clobbers: %5
-%macro DIFF_GT2 6
- mov%1 %6, %3
- mov%1 %5, %2
- psubusb %6, %2
- psubusb %5, %3
- psubusb %6, %4
- psubusb %5, %4
- pcmpeqb %5, %6
-%endmacro
-%macro DIFF_GT2_MMX 5
- DIFF_GT2 q, %1, %2, %3, %4, %5
-%endmacro
-%macro DIFF_GT2_SSE2 5
- DIFF_GT2 dqa, %1, %2, %3, %4, %5
-%endmacro
-
-; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1
-; out: mm5=beta-1, mm7=mask
-; clobbers: mm4,mm6
-%macro LOAD_MASK_MMX 2
- movd mm4, %1
- movd mm5, %2
- pshufw mm4, mm4, 0
- pshufw mm5, mm5, 0
- packuswb mm4, mm4 ; 8x alpha-1
- packuswb mm5, mm5 ; 8x beta-1
- DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1
- DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1
- por mm7, mm4
- DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1
- por mm7, mm4
- pxor mm6, mm6
- pcmpeqb mm7, mm6
-%endmacro
-%macro LOAD_MASK_SSE2 2
- movd xmm4, %1
- movd xmm5, %2
- pshuflw xmm4, xmm4, 0
- pshuflw xmm5, xmm5, 0
- punpcklqdq xmm4, xmm4
- punpcklqdq xmm5, xmm5
- packuswb xmm4, xmm4 ; 16x alpha-1
- packuswb xmm5, xmm5 ; 16x beta-1
- DIFF_GT_SSE2 xmm1, xmm2, xmm4, xmm7, xmm6 ; |p0-q0| > alpha-1
- DIFF_GT_SSE2 xmm0, xmm1, xmm5, xmm4, xmm6 ; |p1-p0| > beta-1
- por xmm7, xmm4
- DIFF_GT_SSE2 xmm3, xmm2, xmm5, xmm4, xmm6 ; |q1-q0| > beta-1
- por xmm7, xmm4
- pxor xmm6, xmm6
- pcmpeqb xmm7, xmm6
-%endmacro
-
-; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
-; out: mm1=p0' mm2=q0'
-; clobbers: mm0,3-6
-%macro DEBLOCK_P0_Q0 2
- mov%1 %2m5, %2m1
- pxor %2m5, %2m2 ; p0^q0
- pand %2m5, [pb_01 GLOBAL] ; (p0^q0)&1
- pcmpeqb %2m4, %2m4
- pxor %2m3, %2m4
- pavgb %2m3, %2m0 ; (p1 - q1 + 256)>>1
- pavgb %2m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
- pxor %2m4, %2m1
- pavgb %2m4, %2m2 ; (q0 - p0 + 256)>>1
- pavgb %2m3, %2m5
- paddusb %2m3, %2m4 ; d+128+33
- mov%1 %2m6, [pb_a1 GLOBAL]
- psubusb %2m6, %2m3
- psubusb %2m3, [pb_a1 GLOBAL]
- pminub %2m6, %2m7
- pminub %2m3, %2m7
- psubusb %2m1, %2m6
- psubusb %2m2, %2m3
- paddusb %2m1, %2m3
- paddusb %2m2, %2m6
-%endmacro
-%macro DEBLOCK_P0_Q0_MMX 0
- DEBLOCK_P0_Q0 q, m
-%endmacro
-%macro DEBLOCK_P0_Q0_SSE2 0
- DEBLOCK_P0_Q0 dqa, xm
-%endmacro
-
-; in: mm1=p0 mm2=q0
-; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
-; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
-; clobbers: q2, tmp, tc0
-%macro LUMA_Q1_SSE2 6
- movdqa %6, xmm1
- pavgb %6, xmm2
- pavgb %2, %6 ; avg(p2,avg(p0,q0))
- pxor %6, %3
- pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
- psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
- movdqa %6, %1
- psubusb %6, %5
- paddusb %5, %1
- pmaxub %2, %6
- pminub %2, %5
- movdqa %4, %2
-%endmacro
-
-
-SECTION .text
-;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_luma_sse2
- ; rdi = pix
- movsxd rsi, esi ; stride
- dec edx ; alpha-1
- dec ecx ; beta-1
- movd xmm8, [r8] ; tc0
- mov r8, rdi
- sub r8, rsi
- sub r8, rsi
- sub r8, rsi ; pix-3*stride
-
- movdqa xmm0, [r8+rsi] ; p1
- movdqa xmm1, [r8+2*rsi] ; p0
- movdqa xmm2, [rdi] ; q0
- movdqa xmm3, [rdi+rsi] ; q1
- LOAD_MASK_SSE2 edx, ecx
-
- punpcklbw xmm8, xmm8
- punpcklbw xmm8, xmm8 ; xmm8 = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
- pcmpeqb xmm9, xmm9
- pcmpeqb xmm9, xmm8
- pandn xmm9, xmm7
- pand xmm8, xmm9
-
- movdqa xmm3, [r8] ; p2
- DIFF_GT2_SSE2 xmm1, xmm3, xmm5, xmm6, xmm7 ; |p2-p0| > beta-1
- pand xmm6, xmm9
- movdqa xmm7, xmm8
- psubb xmm7, xmm6
- pand xmm6, xmm8
- LUMA_Q1_SSE2 xmm0, xmm3, [r8], [r8+rsi], xmm6, xmm4
-
- movdqa xmm4, [rdi+2*rsi] ; q2
- DIFF_GT2_SSE2 xmm2, xmm4, xmm5, xmm6, xmm3 ; |q2-q0| > beta-1
- pand xmm6, xmm9
- pand xmm8, xmm6
- psubb xmm7, xmm6
- movdqa xmm3, [rdi+rsi]
- LUMA_Q1_SSE2 xmm3, xmm4, [rdi+2*rsi], [rdi+rsi], xmm8, xmm6
-
- DEBLOCK_P0_Q0_SSE2
- movdqa [r8+2*rsi], xmm1
- movdqa [rdi], xmm2
-
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_sse2
- movsxd r10, esi
- lea r11, [r10+r10*2]
- lea rax, [rdi-4]
- lea r9, [rdi-4+r11]
- sub rsp, 0x68
- %define pix_tmp rsp
-
- ; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
- lea rax, [rax+r10*8]
- lea r9, [r9 +r10*8]
- TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
-
- ; vertical filter
- ; alpha, beta, tc0 are still in edx, ecx, r8
- ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
- lea rdi, [pix_tmp+0x30]
- mov esi, 0x10
- call x264_deblock_v_luma_sse2
-
- ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
- add rax, 2
- add r9, 2
- movq mm0, [pix_tmp+0x18]
- movq mm1, [pix_tmp+0x28]
- movq mm2, [pix_tmp+0x38]
- movq mm3, [pix_tmp+0x48]
- TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
-
- shl r10, 3
- sub rax, r10
- sub r9, r10
- shr r10, 3
- movq mm0, [pix_tmp+0x10]
- movq mm1, [pix_tmp+0x20]
- movq mm2, [pix_tmp+0x30]
- movq mm3, [pix_tmp+0x40]
- TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
-
- add rsp, 0x68
- ret
-
-
-%macro CHROMA_V_START 0
- ; rdi = pix
- movsxd rsi, esi ; stride
- dec edx ; alpha-1
- dec ecx ; beta-1
- mov rax, rdi
- sub rax, rsi
- sub rax, rsi
-%endmacro
-
-%macro CHROMA_H_START 0
- movsxd rsi, esi
- dec edx
- dec ecx
- sub rdi, 2
- lea r9, [rsi+rsi*2]
- mov rax, rdi
- add rdi, r9
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_mmxext
- CHROMA_V_START
-
- movq mm0, [rax]
- movq mm1, [rax+rsi]
- movq mm2, [rdi]
- movq mm3, [rdi+rsi]
-
- LOAD_MASK_MMX edx, ecx
- movd mm6, [r8] ; tc0
- punpcklbw mm6, mm6
- pand mm7, mm6
- DEBLOCK_P0_Q0_MMX
-
- movq [rax+rsi], mm1
- movq [rdi], mm2
- ret
-
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_mmxext
- CHROMA_H_START
-
- TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
- movq [rsp-8], mm0
- movq [rsp-16], mm3
-
- LOAD_MASK_MMX edx, ecx
- movd mm6, [r8] ; tc0
- punpcklbw mm6, mm6
- pand mm7, mm6
- DEBLOCK_P0_Q0_MMX
-
- movq mm0, [rsp-8]
- movq mm3, [rsp-16]
- TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
- ret
-
-
-; in: %1=p0 %2=p1 %3=q1
-; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
-%macro CHROMA_INTRA_P0 3
- movq mm4, %1
- pxor mm4, %3
- pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1
- pavgb %1, %3
- psubusb %1, mm4
- pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
-%endmacro
-
-%macro CHROMA_INTRA_BODY 0
- LOAD_MASK_MMX edx, ecx
- movq mm5, mm1
- movq mm6, mm2
- CHROMA_INTRA_P0 mm1, mm0, mm3
- CHROMA_INTRA_P0 mm2, mm3, mm0
- psubb mm1, mm5
- psubb mm2, mm6
- pand mm1, mm7
- pand mm2, mm7
- paddb mm1, mm5
- paddb mm2, mm6
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_intra_mmxext
- CHROMA_V_START
-
- movq mm0, [rax]
- movq mm1, [rax+rsi]
- movq mm2, [rdi]
- movq mm3, [rdi+rsi]
-
- CHROMA_INTRA_BODY
-
- movq [rax+rsi], mm1
- movq [rdi], mm2
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_intra_mmxext
- CHROMA_H_START
- TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
- CHROMA_INTRA_BODY
- TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
- ret
-
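
In the deblock-a.asm removal above, the pavgb/psubusb chain in DEBLOCK_P0_Q0 evaluates the H.264 normal-strength edge filter entirely in unsigned bytes; the "64+2+(p1-q1)>>2" and "d+128+33" comments track the bias that the averaging trick introduces and that pb_a1 (0xa1 = 128+33) removes again. The scalar operation it implements is, as a sketch:

    #include <stdint.h>

    static inline int clip3( int v, int lo, int hi )
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* Scalar form of DEBLOCK_P0_Q0 for one pixel position; tc is the
       per-pixel limit after the tc0/mask logic in the calling functions. */
    static void deblock_p0q0( uint8_t *p0, uint8_t *q0, int p1, int q1, int tc )
    {
        int delta = clip3( ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
        *p0 = (uint8_t)clip3( *p0 + delta, 0, 255 );
        *q0 = (uint8_t)clip3( *q0 - delta, 0, 255 );
    }
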
--- common/amd64/mc-a.asm
+++ /dev/null
-;*****************************************************************************
-;* mc.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2003 x264 project
-;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $
-;*
-;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
-;* Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-;*****************************************************************************
-;* *
-;* Revision history: *
-;* *
-;* 2004.05.17 portab mc_copy_w4/8/16 (CM) *
-;* *
-;*****************************************************************************
-
-BITS 64
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "amd64inc.asm"
-
-;=============================================================================
-; Constants
-;=============================================================================
-
-SECTION_RODATA
-
-pw_4: times 4 dw 4
-pw_8: times 4 dw 8
-pw_32: times 4 dw 32
-pw_64: times 4 dw 64
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-;=============================================================================
-; pixel avg
-;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src, int src_stride,
-; int height );
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_w4_mmxext
-.height_loop:
- movd mm0, [parm3q]
- movd mm1, [parm3q+parm4q]
- pavgb mm0, [parm1q]
- pavgb mm1, [parm1q+parm2q]
- movd [parm1q], mm0
- movd [parm1q+parm2q], mm1
- sub temp1d, 2
- lea parm3q, [parm3q+parm4q*2]
- lea parm1q, [parm1q+parm2q*2]
- jg .height_loop
- rep ret
-
-cglobal x264_pixel_avg_w8_mmxext
-.height_loop:
- movq mm0, [parm3q]
- movq mm1, [parm3q+parm4q]
- pavgb mm0, [parm1q]
- pavgb mm1, [parm1q+parm2q]
- movq [parm1q], mm0
- movq [parm1q+parm2q], mm1
- sub temp1d, 2
- lea parm3q, [parm3q+parm4q*2]
- lea parm1q, [parm1q+parm2q*2]
- jg .height_loop
- rep ret
-
-cglobal x264_pixel_avg_w16_mmxext
-.height_loop:
- movq mm0, [parm3q ]
- movq mm1, [parm3q+8]
- movq mm2, [parm3q+parm4q ]
- movq mm3, [parm3q+parm4q+8]
- pavgb mm0, [parm1q ]
- pavgb mm1, [parm1q+8]
- pavgb mm2, [parm1q+parm2q ]
- pavgb mm3, [parm1q+parm2q+8]
- movq [parm1q ], mm0
- movq [parm1q+8], mm1
- movq [parm1q+parm2q ], mm2
- movq [parm1q+parm2q+8], mm3
- sub temp1d, 2
- lea parm3q, [parm3q+parm4q*2]
- lea parm1q, [parm1q+parm2q*2]
- jg .height_loop
- rep ret
-
-cglobal x264_pixel_avg_w16_sse2
-.height_loop:
- movdqu xmm0, [parm3q]
- movdqu xmm1, [parm3q+parm4q]
- pavgb xmm0, [parm1q]
- pavgb xmm1, [parm1q+parm2q]
- movdqa [parm1q], xmm0
- movdqa [parm1q+parm2q], xmm1
- sub temp1d, 2
- lea parm3q, [parm3q+parm4q*2]
- lea parm1q, [parm1q+parm2q*2]
- jg .height_loop
- rep ret
-
-%macro AVGH 2
-cglobal x264_pixel_avg_%1x%2_mmxext
- mov temp1d, %2
- jmp x264_pixel_avg_w%1_mmxext
-%endmacro
-
-AVGH 16, 16
-AVGH 16, 8
-AVGH 8, 16
-AVGH 8, 8
-AVGH 8, 4
-AVGH 4, 8
-AVGH 4, 4
-AVGH 4, 2
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src_stride,
-; uint8_t *src2, int height );
-;-----------------------------------------------------------------------------
-%macro AVG2_START 0
-%ifdef WIN64
- mov temp1d, parm6d
- mov temp2q, parm5q
-%endif
- sub parm5q, parm3q
-%endmacro
-
-cglobal x264_pixel_avg2_w4_mmxext
- AVG2_START
- lea r10, [temp2q+parm4q]
-.height_loop:
- movd mm0, [parm3q]
- movd mm1, [parm3q+parm4q]
- pavgb mm0, [parm3q+temp2q]
- pavgb mm1, [parm3q+r10]
- movd [parm1q], mm0
- movd [parm1q+parm2q], mm1
- sub temp1d, 2
- lea parm3q, [parm3q+parm4q*2]
- lea parm1q, [parm1q+parm2q*2]
- jg .height_loop
- rep ret
-
-cglobal x264_pixel_avg2_w8_mmxext
- AVG2_START
- lea r10, [temp2q+parm4q]
-.height_loop:
- movq mm0, [parm3q]
- movq mm1, [parm3q+parm4q]
- pavgb mm0, [parm3q+temp2q]
- pavgb mm1, [parm3q+r10]
- movq [parm1q], mm0
- movq [parm1q+parm2q], mm1
- sub temp1d, 2
- lea parm3q, [parm3q+parm4q*2]
- lea parm1q, [parm1q+parm2q*2]
- jg .height_loop
- rep ret
-
-cglobal x264_pixel_avg2_w16_mmxext
- AVG2_START
-.height_loop:
- movq mm0, [parm3q]
- movq mm1, [parm3q+8]
- pavgb mm0, [parm3q+temp2q]
- pavgb mm1, [parm3q+temp2q+8]
- movq [parm1q], mm0
- movq [parm1q+8], mm1
- add parm3q, parm4q
- add parm1q, parm2q
- dec temp1d
- jg .height_loop
- rep ret
-
-cglobal x264_pixel_avg2_w20_mmxext
- AVG2_START
-.height_loop:
- movq mm0, [parm3q]
- movq mm1, [parm3q+8]
- movd mm2, [parm3q+16]
- pavgb mm0, [parm3q+temp2q]
- pavgb mm1, [parm3q+temp2q+8]
- pavgb mm2, [parm3q+temp2q+16]
- movq [parm1q], mm0
- movq [parm1q+8], mm1
- movd [parm1q+16], mm2
- add parm3q, parm4q
- add parm1q, parm2q
- dec temp1d
- jg .height_loop
- rep ret
-
-
-
-;=============================================================================
-; weighted prediction
-;=============================================================================
-; implicit bipred only:
-; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-
-%macro BIWEIGHT_4P_MMX 2
- movd mm0, %1
- movd mm1, %2
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- pmullw mm0, mm4
- pmullw mm1, mm5
- paddw mm0, mm1
- paddw mm0, mm6
- psraw mm0, 6
- pmaxsw mm0, mm7
- packuswb mm0, mm0
- movd %1, mm0
-%endmacro
-
-%macro BIWEIGHT_START_MMX 0
-; mov rdi, rdi ; dst
-; movsxd rsi, esi ; i_dst
-; mov rdx, rdx ; src
-; movsxd rcx, ecx ; i_src
-; movsxd r8, r8d ; i_weight_dst
-; movsxd r9, r9d ; i_height
- mov r11d, parm6d ; i_height
-
- movd mm4, parm5d
- pshufw mm4, mm4, 0 ; weight_dst
- movq mm5, [pw_64 GLOBAL]
- psubw mm5, mm4 ; weight_src
- movq mm6, [pw_32 GLOBAL] ; rounding
- pxor mm7, mm7
-
- ALIGN 4
- .height_loop
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_w16_mmxext
- BIWEIGHT_START_MMX
-
- BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
- BIWEIGHT_4P_MMX [parm1q+ 4], [parm3q+ 4]
- BIWEIGHT_4P_MMX [parm1q+ 8], [parm3q+ 8]
- BIWEIGHT_4P_MMX [parm1q+12], [parm3q+12]
-
- add parm1q, parm2q
- add parm3q, parm4q
- dec r11d
- jg .height_loop
- rep ret
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_w8_mmxext
- BIWEIGHT_START_MMX
-
- BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
- BIWEIGHT_4P_MMX [parm1q+4], [parm3q+4]
-
- add parm1q, parm2q
- add parm3q, parm4q
- dec r11d
- jg .height_loop
- rep ret
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_4x4_mmxext
- BIWEIGHT_START_MMX
- BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
- BIWEIGHT_4P_MMX [parm1q+parm2q ], [parm3q+parm4q ]
- BIWEIGHT_4P_MMX [parm1q+parm2q*2], [parm3q+parm4q*2]
- add parm1q, parm2q
- add parm3q, parm4q
- BIWEIGHT_4P_MMX [parm1q+parm2q*2], [parm3q+parm4q*2]
- ret
-
-
-
-;=============================================================================
-; pixel copy
-;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w4_mmx
- mov eax, parm5d ; i_height
-
-ALIGN 4
-.height_loop
- mov r10d, [parm3q]
- mov r11d, [parm3q+parm4q]
- mov [parm1q], r10d
- mov [parm1q+parm2q], r11d
- lea parm3q, [parm3q+parm4q*2]
- lea parm1q, [parm1q+parm2q*2]
- dec eax
- dec eax
- jg .height_loop
- rep ret
-
-;-----------------------------------------------------------------------------
-; void x264_mc_copy_w8_mmx( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w8_mmx
- mov eax, parm5d ; i_height
-
- lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride
- lea r11, [parm2q+parm2q*2] ; 3 * i_dst_stride
-
-ALIGN 4
-.height_loop
- movq mm0, [parm3q]
- movq mm1, [parm3q+parm4q]
- movq mm2, [parm3q+parm4q*2]
- movq mm3, [parm3q+r10]
- movq [parm1q], mm0
- movq [parm1q+parm2q], mm1
- movq [parm1q+parm2q*2], mm2
- movq [parm1q+r11], mm3
- lea parm3q, [parm3q+parm4q*4]
- lea parm1q, [parm1q+parm2q*4]
-
- sub eax, byte 4
- jg .height_loop
- rep ret
-
-;-----------------------------------------------------------------------------
-; void x264_mc_copy_w16_mmx( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w16_mmx
- mov eax, parm5d ; i_height
-
- lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride
- lea r11, [parm2q+parm2q*2] ; 3 * i_dst_stride
-
-ALIGN 4
-.height_loop
- movq mm0, [parm3q]
- movq mm1, [parm3q+8]
- movq mm2, [parm3q+parm4q]
- movq mm3, [parm3q+parm4q+8]
- movq mm4, [parm3q+parm4q*2]
- movq mm5, [parm3q+parm4q*2+8]
- movq mm6, [parm3q+r10]
- movq mm7, [parm3q+r10+8]
- movq [parm1q], mm0
- movq [parm1q+8], mm1
- movq [parm1q+parm2q], mm2
- movq [parm1q+parm2q+8], mm3
- movq [parm1q+parm2q*2], mm4
- movq [parm1q+parm2q*2+8], mm5
- movq [parm1q+r11], mm6
- movq [parm1q+r11+8], mm7
- lea parm3q, [parm3q+parm4q*4]
- lea parm1q, [parm1q+parm2q*4]
- sub eax, byte 4
- jg .height_loop
- rep ret
-
-
-;-----------------------------------------------------------------------------
-; void x264_mc_copy_w16_sse2( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w16_sse2
- mov eax, parm5d ; i_height
-
-ALIGN 4
-.height_loop
- movdqu xmm0, [parm3q]
- movdqu xmm1, [parm3q+parm4q]
- movdqu [parm1q], xmm0
- movdqu [parm1q+parm2q], xmm1
- sub eax, byte 2
- lea parm3q, [parm3q+parm4q*2]
- lea parm1q, [parm1q+parm2q*2]
- jg .height_loop
- rep ret
-
-
-
-;=============================================================================
-; chroma MC
-;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride,
-; int dx, int dy,
-; int i_width, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_chroma_mmxext
- mov r10d, parm6d
- mov r11d, parm5d
- sar r10d, 3
- sar r11d, 3
- imul r10d, parm4d
- pxor mm3, mm3
- add r10d, r11d
- movsxd r10, r10d
- mov r11d, parm8d
- add parm3q, r10 ; src += (dx>>3) + (dy>>3) * src_stride
- and parm5d, 7 ; dx &= 7
- je .mc1d
- and parm6d, 7 ; dy &= 7
- je .mc1d
-
- movd mm0, parm5d
- movd mm1, parm6d
-
- pshufw mm5, mm0, 0 ; mm5 = dx
- pshufw mm6, mm1, 0 ; mm6 = dy
-
- movq mm4, [pw_8 GLOBAL]
- movq mm0, mm4
-
- psubw mm4, mm5 ; mm4 = 8-dx
- psubw mm0, mm6 ; mm0 = 8-dy
-
- movq mm7, mm5
- pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB
- pmullw mm7, mm6 ; mm7 = dx*dy = cD
- pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC
- pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA
-
- mov rax, parm3q
- mov r10, parm1q
-
-ALIGN 4
-.height_loop
-
- movd mm1, [rax+parm4q]
- movd mm0, [rax]
- punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4
- punpcklbw mm0, mm3
- pmullw mm1, mm6 ; 2nd line * cC
- pmullw mm0, mm4 ; 1st line * cA
-
- paddw mm0, mm1 ; mm0 <- result
-
- movd mm2, [rax+1]
- movd mm1, [rax+parm4q+1]
- punpcklbw mm2, mm3
- punpcklbw mm1, mm3
-
- paddw mm0, [pw_32 GLOBAL]
-
- pmullw mm2, mm5 ; line * cB
- pmullw mm1, mm7 ; line * cD
- paddw mm0, mm2
- paddw mm0, mm1
- psrlw mm0, 6
-
- packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4
- movd [r10], mm0
-
- add rax, parm4q
- add r10, parm2q ; i_dst_stride
- dec r11d
- jnz .height_loop
-
- sub parm7d, 8
- jnz .finish ; width != 8 so assume 4
-
- mov r10, parm1q ; dst
- mov rax, parm3q ; src
- mov r11d, parm8d ; i_height
- add r10, 4
- add rax, 4
- jmp .height_loop
-
-.finish
- rep ret
-
-ALIGN 4
-.mc1d
-%define pel_offset temp1q
- mov eax, parm5d
- or eax, parm6d
- and eax, 7
- cmp parm5d, 0
- mov pel_offset, 1
- cmove pel_offset, parm4q ; pel_offset = dx ? 1 : src_stride
- movd mm6, eax
- movq mm5, [pw_8 GLOBAL]
- pshufw mm6, mm6, 0
- movq mm7, [pw_4 GLOBAL]
- psubw mm5, mm6
-
- cmp parm7d, 8
- je .height_loop1_w8
-
-ALIGN 4
-.height_loop1_w4
- movd mm0, [parm3q+pel_offset]
- movd mm1, [parm3q]
- punpcklbw mm0, mm3
- punpcklbw mm1, mm3
- pmullw mm0, mm6
- pmullw mm1, mm5
- paddw mm0, mm7
- paddw mm0, mm1
- psrlw mm0, 3
- packuswb mm0, mm3
- movd [parm1q], mm0
- add parm3q, parm4q
- add parm1q, parm2q
- dec r11d
- jnz .height_loop1_w4
- rep ret
-
-ALIGN 4
-.height_loop1_w8
- movq mm0, [parm3q+pel_offset]
- movq mm1, [parm3q]
- movq mm2, mm0
- movq mm4, mm1
- punpcklbw mm0, mm3
- punpcklbw mm1, mm3
- punpckhbw mm2, mm3
- punpckhbw mm4, mm3
- pmullw mm0, mm6
- pmullw mm1, mm5
- pmullw mm2, mm6
- pmullw mm4, mm5
- paddw mm0, mm7
- paddw mm2, mm7
- paddw mm0, mm1
- paddw mm2, mm4
- psrlw mm0, 3
- psrlw mm2, 3
- packuswb mm0, mm2
- movq [parm1q], mm0
- add parm3q, parm4q
- add parm1q, parm2q
- dec r11d
- jnz .height_loop1_w8
- rep ret
-
-
-
-;-----------------------------------------------------------------------------
-; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
-; uint8_t *pix_uv, int stride_uv, int mb_x )
-;-----------------------------------------------------------------------------
-cglobal x264_prefetch_fenc_mmxext
- mov eax, parm5d
- and eax, 3
- imul eax, parm2d
- lea parm1q, [parm1q+rax*4+64]
- prefetcht0 [parm1q]
- prefetcht0 [parm1q+parm2q]
- lea parm1q, [parm1q+parm2q*2]
- prefetcht0 [parm1q]
- prefetcht0 [parm1q+parm2q]
-
- mov eax, parm5d
- and eax, 6
- imul eax, parm4d
- lea parm3q, [parm3q+rax+64]
- prefetcht0 [parm3q]
- prefetcht0 [parm3q+parm4q]
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
-;-----------------------------------------------------------------------------
-cglobal x264_prefetch_ref_mmxext
- dec parm3d
- and parm3d, parm2d
- lea parm1q, [parm1q+parm3q*8+64]
- lea rax, [parm2q*3]
- prefetcht0 [parm1q]
- prefetcht0 [parm1q+parm2q]
- prefetcht0 [parm1q+parm2q*2]
- prefetcht0 [parm1q+rax]
- lea parm1q, [parm1q+parm2q*4]
- prefetcht0 [parm1q]
- prefetcht0 [parm1q+parm2q]
- prefetcht0 [parm1q+parm2q*2]
- prefetcht0 [parm1q+rax]
- ret
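
x264_mc_chroma_mmxext above is eighth-pel bilinear chroma interpolation: the weights cA..cD derived from dx and dy always sum to 64, hence the +32 rounding constant and the final >>6. A scalar sketch of the 2-D path (assuming, as the asm arranges on entry, that src has already been offset by (dx>>3) + (dy>>3)*stride):

    #include <stdint.h>

    /* Scalar equivalent of the 2-D path of x264_mc_chroma_mmxext (sketch).
       dx, dy are the fractional positions, 0..7 after masking. */
    static void mc_chroma( uint8_t *dst, int i_dst,
                           const uint8_t *src, int i_src,
                           int dx, int dy, int width, int height )
    {
        int cA = (8-dx)*(8-dy), cB = dx*(8-dy);
        int cC = (8-dx)*dy,     cD = dx*dy;
        for( int y = 0; y < height; y++, dst += i_dst, src += i_src )
            for( int x = 0; x < width; x++ )
                dst[x] = ( cA*src[x]       + cB*src[x+1] +
                           cC*src[x+i_src] + cD*src[x+i_src+1] + 32 ) >> 6;
    }
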
--- common/amd64/mc-a2.asm
+++ /dev/null
-;*****************************************************************************
-;* mc-a2.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2005 x264 project
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-BITS 64
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "amd64inc.asm"
-
-;=============================================================================
-; Read only data
-;=============================================================================
-
-SECTION_RODATA
-
-pw_1: times 4 dw 1
-pw_16: times 4 dw 16
-pw_32: times 4 dw 32
-
-;=============================================================================
-; Macros
-;=============================================================================
-
-%macro LOAD_ADD 3
- movd %1, %2
- movd mm7, %3
- punpcklbw %1, mm0
- punpcklbw mm7, mm0
- paddw %1, mm7
-%endmacro
-
-%macro FILT_V 0
- psubw mm1, mm2 ; a-b
- psubw mm4, mm5
- psubw mm2, mm3 ; b-c
- psubw mm5, mm6
- psllw mm2, 2
- psllw mm5, 2
- psubw mm1, mm2 ; a-5*b+4*c
- psubw mm4, mm5
- psllw mm3, 4
- psllw mm6, 4
- paddw mm1, mm3 ; a-5*b+20*c
- paddw mm4, mm6
-%endmacro
-
-%macro FILT_H 0
- psubw mm1, mm2 ; a-b
- psubw mm4, mm5
- psraw mm1, 2 ; (a-b)/4
- psraw mm4, 2
- psubw mm1, mm2 ; (a-b)/4-b
- psubw mm4, mm5
- paddw mm1, mm3 ; (a-b)/4-b+c
- paddw mm4, mm6
- psraw mm1, 2 ; ((a-b)/4-b+c)/4
- psraw mm4, 2
- paddw mm1, mm3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- paddw mm4, mm6
-%endmacro
-
-%macro FILT_PACK 1
- paddw mm1, mm7
- paddw mm4, mm7
- psraw mm1, %1
- psraw mm4, %1
- packuswb mm1, mm4
-%endmacro
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
-; int i_stride, int i_width, int i_height );
-;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_mmxext
-
-%ifdef WIN64
- push rdi
- pushreg rdi
- push rsi
- pushreg rsi
-%endif
- push rbp
- pushreg rbp
- push rbx
- pushreg rbx
- mov rbp, rsp
- setframe rbp, 0
- endprolog
-
-%ifdef WIN64
- mov rdi, parm1q
- mov rsi, parm2q
- mov rdx, parm3q
- mov rcx, parm4q
- movsxd r8, dword [rbp+72]
- movsxd r9, dword [rbp+80]
- mov ebx, dword [rbp+88]
-%else
- mov ebx, dword [rbp+24]
-%endif
- %define dsth rdi
- %define dstv rsi
- %define dstc rdx
- %define src rcx
- %define stride r8
- %define width r9
- %define height ebx
- %define stride3 r10
- %define stride5 r11
- %define x rax
- %define tbuffer rsp + 8
-
- lea stride3, [stride*3]
- lea stride5, [stride*5]
- sub src, stride
- sub src, stride
-
- lea rax, [stride*2 + 24]
- sub rsp, rax
-
- pxor mm0, mm0
-
-.loopy:
-
- xor x, x
-ALIGN 16
-.vertical_filter:
-
- prefetcht0 [src + stride5 + 32]
-
- LOAD_ADD mm1, [src ], [src + stride5 ] ; a0
- LOAD_ADD mm2, [src + stride ], [src + stride*4 ] ; b0
- LOAD_ADD mm3, [src + stride*2 ], [src + stride3 ] ; c0
- LOAD_ADD mm4, [src + 4], [src + stride5 + 4] ; a1
- LOAD_ADD mm5, [src + stride + 4], [src + stride*4 + 4] ; b1
- LOAD_ADD mm6, [src + stride*2 + 4], [src + stride3 + 4] ; c1
-
- FILT_V
-
- movq mm7, [pw_16 GLOBAL]
- movq [tbuffer + x*2], mm1
- movq [tbuffer + x*2 + 8], mm4
- paddw mm1, mm7
- paddw mm4, mm7
- psraw mm1, 5
- psraw mm4, 5
- packuswb mm1, mm4
- movntq [dstv + x], mm1
-
- add x, 8
- add src, 8
- cmp x, width
- jle .vertical_filter
-
- pshufw mm2, [tbuffer], 0
- movq [tbuffer - 8], mm2 ; pad left
- ; no need to pad right, since vertical_filter already did 4 extra pixels
-
- sub src, x
- xor x, x
- movq mm7, [pw_32 GLOBAL]
-.center_filter:
-
- movq mm1, [tbuffer + x*2 - 4 ]
- movq mm2, [tbuffer + x*2 - 2 ]
- movq mm3, [tbuffer + x*2 ]
- movq mm4, [tbuffer + x*2 + 4 ]
- movq mm5, [tbuffer + x*2 + 6 ]
- paddw mm3, [tbuffer + x*2 + 2 ] ; c0
- paddw mm2, mm4 ; b0
- paddw mm1, mm5 ; a0
- movq mm6, [tbuffer + x*2 + 8 ]
- paddw mm4, [tbuffer + x*2 + 14] ; a1
- paddw mm5, [tbuffer + x*2 + 12] ; b1
- paddw mm6, [tbuffer + x*2 + 10] ; c1
-
- FILT_H
- FILT_PACK 6
- movntq [dstc + x], mm1
-
- add x, 8
- cmp x, width
- jl .center_filter
-
- lea src, [src + stride*2]
- xor x, x
-.horizontal_filter:
-
- movd mm1, [src + x - 2]
- movd mm2, [src + x - 1]
- movd mm3, [src + x ]
- movd mm6, [src + x + 1]
- movd mm4, [src + x + 2]
- movd mm5, [src + x + 3]
- punpcklbw mm1, mm0
- punpcklbw mm2, mm0
- punpcklbw mm3, mm0
- punpcklbw mm6, mm0
- punpcklbw mm4, mm0
- punpcklbw mm5, mm0
- paddw mm3, mm6 ; c0
- paddw mm2, mm4 ; b0
- paddw mm1, mm5 ; a0
- movd mm7, [src + x + 7]
- movd mm6, [src + x + 6]
- punpcklbw mm7, mm0
- punpcklbw mm6, mm0
- paddw mm4, mm7 ; c1
- paddw mm5, mm6 ; b1
- movd mm7, [src + x + 5]
- movd mm6, [src + x + 4]
- punpcklbw mm7, mm0
- punpcklbw mm6, mm0
- paddw mm6, mm7 ; a1
-
- movq mm7, [pw_1 GLOBAL]
- FILT_H
- FILT_PACK 1
- movntq [dsth + x], mm1
-
- add x, 8
- cmp x, width
- jl .horizontal_filter
-
- sub src, stride
- add dsth, stride
- add dstv, stride
- add dstc, stride
- dec height
- jg .loopy
-
- mov rsp, rbp
- pop rbx
- pop rbp
-%ifdef WIN64
- pop rsi
- pop rdi
-%endif
- ret
-
-
-
-;-----------------------------------------------------------------------------
-; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
-; uint8_t *src, int i_src, int w, int h)
-;-----------------------------------------------------------------------------
-cglobal x264_plane_copy_mmxext
- movsxd parm2q, parm2d
- movsxd parm4q, parm4d
- add parm5d, 3
- and parm5d, ~3
- sub parm2q, parm5q
- sub parm4q, parm5q
- ; shuffle regs because movsd needs dst=rdi, src=rsi, w=ecx
- xchg rsi, rdx
- mov rax, parm4q
-.loopy:
- mov ecx, parm5d
- sub ecx, 64
- jl .endx
-.loopx:
- prefetchnta [rsi+256]
- movq mm0, [rsi ]
- movq mm1, [rsi+ 8]
- movq mm2, [rsi+16]
- movq mm3, [rsi+24]
- movq mm4, [rsi+32]
- movq mm5, [rsi+40]
- movq mm6, [rsi+48]
- movq mm7, [rsi+56]
- movntq [rdi ], mm0
- movntq [rdi+ 8], mm1
- movntq [rdi+16], mm2
- movntq [rdi+24], mm3
- movntq [rdi+32], mm4
- movntq [rdi+40], mm5
- movntq [rdi+48], mm6
- movntq [rdi+56], mm7
- add rsi, 64
- add rdi, 64
- sub ecx, 64
- jge .loopx
-.endx:
- prefetchnta [rsi+256]
- add ecx, 64
- shr ecx, 2
- rep movsd
- add rdi, rdx
- add rsi, rax
- sub parm6d, 1
- jg .loopy
- emms
- ret
-
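
The hpel filter above is the 6-tap (1,-5,20,20,-5,1) Wiener interpolator factored into adds and shifts: with a, b and c the outer, middle and inner pixel-pair sums (the LOAD_ADD pairs), FILT_V computes a-5*b+20*c at full 16-bit precision for the later center pass, while FILT_H's ((a-b)/4-b+c)/4+c equals (a-5*b+20*c)/16 up to shift truncation, keeping the horizontal pass in range. One vertical-pass output pixel, as a scalar sketch:

    #include <stdint.h>

    /* Scalar form of one dstv pixel in x264_hpel_filter_mmxext (sketch):
       6-tap vertical filter with the same +16 round and >>5 as FILT_PACK 5. */
    static uint8_t hpel_v( const uint8_t *src, int stride )
    {
        int a = src[-2*stride] + src[3*stride];
        int b = src[-1*stride] + src[2*stride];
        int c = src[ 0        ] + src[1*stride];
        int v = ( a - 5*b + 20*c + 16 ) >> 5;
        return (uint8_t)( v < 0 ? 0 : v > 255 ? 255 : v );
    }
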
--- common/amd64/pixel-a.asm
+++ /dev/null
-;*****************************************************************************
-;* pixel.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2003 x264 project
-;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
-;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-BITS 64
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "amd64inc.asm"
-
-; sad
-
-%macro SAD_INC_2x16P 0
- movq mm1, [parm1q]
- movq mm2, [parm1q+8]
- movq mm3, [parm1q+parm2q]
- movq mm4, [parm1q+parm2q+8]
- psadbw mm1, [parm3q]
- psadbw mm2, [parm3q+8]
- psadbw mm3, [parm3q+parm4q]
- psadbw mm4, [parm3q+parm4q+8]
- lea parm1q, [parm1q+2*parm2q]
- paddw mm1, mm2
- paddw mm3, mm4
- lea parm3q, [parm3q+2*parm4q]
- paddw mm0, mm1
- paddw mm0, mm3
-%endmacro
-
-%macro SAD_INC_2x8P 0
- movq mm1, [parm1q]
- movq mm2, [parm1q+parm2q]
- psadbw mm1, [parm3q]
- psadbw mm2, [parm3q+parm4q]
- lea parm1q, [parm1q+2*parm2q]
- paddw mm0, mm1
- paddw mm0, mm2
- lea parm3q, [parm3q+2*parm4q]
-%endmacro
-
-%macro SAD_INC_2x4P 0
- movd mm1, [parm1q]
- movd mm2, [parm3q]
- punpckldq mm1, [parm1q+parm2q]
- punpckldq mm2, [parm3q+parm4q]
- psadbw mm1, mm2
- paddw mm0, mm1
- lea parm1q, [parm1q+2*parm2q]
- lea parm3q, [parm3q+2*parm4q]
-%endmacro
-
-; sad x3 / x4
-
-%macro SAD_X3_START_1x8P 0
- movq mm3, [parm1q]
- movq mm0, [parm2q]
- movq mm1, [parm3q]
- movq mm2, [parm4q]
- psadbw mm0, mm3
- psadbw mm1, mm3
- psadbw mm2, mm3
-%endmacro
-
-%macro SAD_X3_1x8P 2
- movq mm3, [parm1q+%1]
- movq mm4, [parm2q+%2]
- movq mm5, [parm3q+%2]
- movq mm6, [parm4q+%2]
- psadbw mm4, mm3
- psadbw mm5, mm3
- psadbw mm6, mm3
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
-%endmacro
-
-%macro SAD_X3_START_2x4P 3
- movd mm3, [parm1q]
- movd %1, [parm2q]
- movd %2, [parm3q]
- movd %3, [parm4q]
- punpckldq mm3, [parm1q+FENC_STRIDE]
- punpckldq %1, [parm2q+parm5q]
- punpckldq %2, [parm3q+parm5q]
- punpckldq %3, [parm4q+parm5q]
- psadbw %1, mm3
- psadbw %2, mm3
- psadbw %3, mm3
-%endmacro
-
-%macro SAD_X3_2x16P 1
-%if %1
- SAD_X3_START_1x8P
-%else
- SAD_X3_1x8P 0, 0
-%endif
- SAD_X3_1x8P 8, 8
- SAD_X3_1x8P FENC_STRIDE, parm5q
- SAD_X3_1x8P FENC_STRIDE+8, parm5q+8
- add parm1q, 2*FENC_STRIDE
- lea parm2q, [parm2q+2*parm5q]
- lea parm3q, [parm3q+2*parm5q]
- lea parm4q, [parm4q+2*parm5q]
-%endmacro
-
-%macro SAD_X3_2x8P 1
-%if %1
- SAD_X3_START_1x8P
-%else
- SAD_X3_1x8P 0, 0
-%endif
- SAD_X3_1x8P FENC_STRIDE, parm5q
- add parm1q, 2*FENC_STRIDE
- lea parm2q, [parm2q+2*parm5q]
- lea parm3q, [parm3q+2*parm5q]
- lea parm4q, [parm4q+2*parm5q]
-%endmacro
-
-%macro SAD_X3_2x4P 1
-%if %1
- SAD_X3_START_2x4P mm0, mm1, mm2
-%else
- SAD_X3_START_2x4P mm4, mm5, mm6
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
-%endif
- add parm1q, 2*FENC_STRIDE
- lea parm2q, [parm2q+2*parm5q]
- lea parm3q, [parm3q+2*parm5q]
- lea parm4q, [parm4q+2*parm5q]
-%endmacro
-
-%macro SAD_X4_START_1x8P 0
- movq mm7, [parm1q]
- movq mm0, [parm2q]
- movq mm1, [parm3q]
- movq mm2, [parm4q]
- movq mm3, [parm5q]
- psadbw mm0, mm7
- psadbw mm1, mm7
- psadbw mm2, mm7
- psadbw mm3, mm7
-%endmacro
-
-%macro SAD_X4_1x8P 2
- movq mm7, [parm1q+%1]
- movq mm4, [parm2q+%2]
- movq mm5, [parm3q+%2]
- movq mm6, [parm4q+%2]
- psadbw mm4, mm7
- psadbw mm5, mm7
- psadbw mm6, mm7
- psadbw mm7, [parm5q+%2]
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
- paddw mm3, mm7
-%endmacro
-
-%macro SAD_X4_START_2x4P 0
- movd mm7, [parm1q]
- movd mm0, [parm2q]
- movd mm1, [parm3q]
- movd mm2, [parm4q]
- movd mm3, [parm5q]
- punpckldq mm7, [parm1q+FENC_STRIDE]
- punpckldq mm0, [parm2q+parm6q]
- punpckldq mm1, [parm3q+parm6q]
- punpckldq mm2, [parm4q+parm6q]
- punpckldq mm3, [parm5q+parm6q]
- psadbw mm0, mm7
- psadbw mm1, mm7
- psadbw mm2, mm7
- psadbw mm3, mm7
-%endmacro
-
-%macro SAD_X4_INC_2x4P 0
- movd mm7, [parm1q]
- movd mm4, [parm2q]
- movd mm5, [parm3q]
- punpckldq mm7, [parm1q+FENC_STRIDE]
- punpckldq mm4, [parm2q+parm6q]
- punpckldq mm5, [parm3q+parm6q]
- psadbw mm4, mm7
- psadbw mm5, mm7
- paddw mm0, mm4
- paddw mm1, mm5
- movd mm4, [parm4q]
- movd mm5, [parm5q]
- punpckldq mm4, [parm4q+parm6q]
- punpckldq mm5, [parm5q+parm6q]
- psadbw mm4, mm7
- psadbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
-%endmacro
-
-%macro SAD_X4_2x16P 1
-%if %1
- SAD_X4_START_1x8P
-%else
- SAD_X4_1x8P 0, 0
-%endif
- SAD_X4_1x8P 8, 8
- SAD_X4_1x8P FENC_STRIDE, parm6q
- SAD_X4_1x8P FENC_STRIDE+8, parm6q+8
- add parm1q, 2*FENC_STRIDE
- lea parm2q, [parm2q+2*parm6q]
- lea parm3q, [parm3q+2*parm6q]
- lea parm4q, [parm4q+2*parm6q]
- lea parm5q, [parm5q+2*parm6q]
-%endmacro
-
-%macro SAD_X4_2x8P 1
-%if %1
- SAD_X4_START_1x8P
-%else
- SAD_X4_1x8P 0, 0
-%endif
- SAD_X4_1x8P FENC_STRIDE, parm6q
- add parm1q, 2*FENC_STRIDE
- lea parm2q, [parm2q+2*parm6q]
- lea parm3q, [parm3q+2*parm6q]
- lea parm4q, [parm4q+2*parm6q]
- lea parm5q, [parm5q+2*parm6q]
-%endmacro
-
-%macro SAD_X4_2x4P 1
-%if %1
- SAD_X4_START_2x4P
-%else
- SAD_X4_INC_2x4P
-%endif
- add parm1q, 2*FENC_STRIDE
- lea parm2q, [parm2q+2*parm6q]
- lea parm3q, [parm3q+2*parm6q]
- lea parm4q, [parm4q+2*parm6q]
- lea parm5q, [parm5q+2*parm6q]
-%endmacro
-
-%macro SAD_X3_END 0
- movd [parm6q+0], mm0
- movd [parm6q+4], mm1
- movd [parm6q+8], mm2
- ret
-%endmacro
-
-%macro SAD_X4_END 0
- mov rax, parm7q
- movd [rax+0], mm0
- movd [rax+4], mm1
- movd [rax+8], mm2
- movd [rax+12], mm3
- ret
-%endmacro
-
-; ssd
-
-%macro SSD_INC_1x16P 0
- movq mm1, [parm1q]
- movq mm2, [parm3q]
- movq mm3, [parm1q+8]
- movq mm4, [parm3q+8]
-
- movq mm5, mm2
- movq mm6, mm4
- psubusb mm2, mm1
- psubusb mm4, mm3
- psubusb mm1, mm5
- psubusb mm3, mm6
- por mm1, mm2
- por mm3, mm4
-
- movq mm2, mm1
- movq mm4, mm3
- punpcklbw mm1, mm7
- punpcklbw mm3, mm7
- punpckhbw mm2, mm7
- punpckhbw mm4, mm7
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
- pmaddwd mm3, mm3
- pmaddwd mm4, mm4
-
- add parm1q, parm2q
- add parm3q, parm4q
- paddd mm0, mm1
- paddd mm0, mm2
- paddd mm0, mm3
- paddd mm0, mm4
-%endmacro
-
-%macro SSD_INC_1x8P 0
- movq mm1, [parm1q]
- movq mm2, [parm3q]
-
- movq mm5, mm2
- psubusb mm2, mm1
- psubusb mm1, mm5
- por mm1, mm2 ; mm1 = 8bit abs diff
-
- movq mm2, mm1
- punpcklbw mm1, mm7
- punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
-
- add parm1q, parm2q
- add parm3q, parm4q
- paddd mm0, mm1
- paddd mm0, mm2
-%endmacro
-
-%macro SSD_INC_1x4P 0
- movd mm1, [parm1q]
- movd mm2, [parm3q]
-
- movq mm5, mm2
- psubusb mm2, mm1
- psubusb mm1, mm5
- por mm1, mm2
- punpcklbw mm1, mm7
- pmaddwd mm1, mm1
-
- add parm1q, parm2q
- add parm3q, parm4q
- paddd mm0, mm1
-%endmacro
-
-; satd
-
-%macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2]
- movd %1, %3
- movd %2, %4
- punpcklbw %1, %2
- punpcklbw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro HADAMARD4_SUB_BADC 4 ; %1=a+b  %2=b-a  %3=c+d  %4=d-c
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
-%endmacro
-
-%macro HADAMARD4x4 4
- HADAMARD4_SUB_BADC %1, %2, %3, %4
- HADAMARD4_SUB_BADC %1, %3, %2, %4
-%endmacro
-
-%macro SBUTTERFLYwd 3
- movq %3, %1
- punpcklwd %1, %2
- punpckhwd %3, %2
-%endmacro
-
-%macro SBUTTERFLYdq 3
- movq %3, %1
- punpckldq %1, %2
- punpckhdq %3, %2
-%endmacro
-
-%macro TRANSPOSE4x4 5 ; abcd-t -> adtc
- SBUTTERFLYwd %1, %2, %5
- SBUTTERFLYwd %3, %4, %2
- SBUTTERFLYdq %1, %3, %4
- SBUTTERFLYdq %5, %2, %3
-%endmacro
-
-%macro MMX_ABS 2 ; mma, tmp
- pxor %2, %2
- psubw %2, %1
- pmaxsw %1, %2
-%endmacro
-
-%macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1
- pxor %3, %3
- pxor %4, %4
- psubw %3, %1
- psubw %4, %2
- pmaxsw %1, %3
- pmaxsw %2, %4
-%endmacro
-
-%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
- HADAMARD4x4 mm4, mm5, mm6, mm7
- TRANSPOSE4x4 mm4, mm5, mm6, mm7, %1
- HADAMARD4x4 mm4, mm7, %1, mm6
- MMX_ABS_TWO mm4, mm7, mm3, mm5
- MMX_ABS_TWO %1, mm6, mm3, mm5
- paddw %1, mm4
- paddw mm6, mm7
- pavgw %1, mm6
-%endmacro
-
-; in: r10=3*stride1, r11=3*stride2
-; in: %2 = horizontal offset
-; in: %3 = whether we need to increment pix1 and pix2
-; clobber: mm3..mm7
-; out: %1 = satd
-%macro LOAD_DIFF_HADAMARD_SUM 3
- LOAD_DIFF_4P mm4, mm3, [parm1q+%2], [parm3q+%2]
- LOAD_DIFF_4P mm5, mm3, [parm1q+parm2q+%2], [parm3q+parm4q+%2]
- LOAD_DIFF_4P mm6, mm3, [parm1q+2*parm2q+%2], [parm3q+2*parm4q+%2]
- LOAD_DIFF_4P mm7, mm3, [parm1q+r10+%2], [parm3q+r11+%2]
-%if %3
- lea parm1q, [parm1q+4*parm2q]
- lea parm3q, [parm3q+4*parm4q]
-%endif
- HADAMARD4x4_SUM %1
-%endmacro
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-%macro SAD_START 0
- pxor mm0, mm0
-%endmacro
-
-%macro SAD_END 0
- movd eax, mm0
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-%macro SAD 2
-cglobal x264_pixel_sad_%1x%2_mmxext
- SAD_START
-%rep %2/2
- SAD_INC_2x%1P
-%endrep
- SAD_END
-%endmacro
-
-SAD 16, 16
-SAD 16, 8
-SAD 8, 16
-SAD 8, 8
-SAD 8, 4
-SAD 4, 8
-SAD 4, 4
-
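-; Reference only (editor's sketch, not from the original source): the plain-C
-; equivalent of the SAD kernels instantiated above, using the documented
-; (pix1, stride1, pix2, stride2) argument order. psadbw sums eight absolute
-; byte differences per instruction, so each macro handles two rows at a time.
-;
-; static int sad_wxh( uint8_t *pix1, int i_stride1,
-;                     uint8_t *pix2, int i_stride2, int w, int h )
-; {
-;     int sum = 0;
-;     for( int y = 0; y < h; y++, pix1 += i_stride1, pix2 += i_stride2 )
-;         for( int x = 0; x < w; x++ )
-;             sum += abs( pix1[x] - pix2[x] );
-;     return sum;
-; }
-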
-;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
-;-----------------------------------------------------------------------------
-%macro SAD_X 3
-cglobal x264_pixel_sad_x%1_%2x%3_mmxext
- SAD_X%1_2x%2P 1
-%rep %3/2-1
- SAD_X%1_2x%2P 0
-%endrep
- SAD_X%1_END
-%endmacro
-
-SAD_X 3, 16, 16
-SAD_X 3, 16, 8
-SAD_X 3, 8, 16
-SAD_X 3, 8, 8
-SAD_X 3, 8, 4
-SAD_X 3, 4, 8
-SAD_X 3, 4, 4
-SAD_X 4, 16, 16
-SAD_X 4, 16, 8
-SAD_X 4, 8, 16
-SAD_X 4, 8, 8
-SAD_X 4, 8, 4
-SAD_X 4, 4, 8
-SAD_X 4, 4, 4
-
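-; Reference only (editor's sketch): sad_x3/x4 score three or four candidate
-; blocks against one encoded block per call, so each fenc row is loaded once
-; instead of three or four times. fenc uses the compile-time FENC_STRIDE;
-; the candidates share a single runtime stride. In terms of the sad_wxh
-; sketch above:
-;
-; static void sad_x3_wxh( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-;                         uint8_t *pix2, int i_stride, int scores[3],
-;                         int w, int h )
-; {
-;     scores[0] = sad_wxh( fenc, FENC_STRIDE, pix0, i_stride, w, h );
-;     scores[1] = sad_wxh( fenc, FENC_STRIDE, pix1, i_stride, w, h );
-;     scores[2] = sad_wxh( fenc, FENC_STRIDE, pix2, i_stride, w, h );
-; }
-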
-
-%macro SSD_START 0
- pxor mm7, mm7 ; zero
- pxor mm0, mm0 ; mm0 holds the sum
-%endmacro
-
-%macro SSD_END 0
- movq mm1, mm0
- psrlq mm1, 32
- paddd mm0, mm1
- movd eax, mm0
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-%macro SSD 2
-cglobal x264_pixel_ssd_%1x%2_mmx
- SSD_START
-%rep %2
- SSD_INC_1x%1P
-%endrep
- SSD_END
-%endmacro
-
-SSD 16, 16
-SSD 16, 8
-SSD 8, 16
-SSD 8, 8
-SSD 8, 4
-SSD 4, 8
-SSD 4, 4
-
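-; Reference only (editor's sketch): plain-C equivalent of the SSD kernels
-; above. The asm computes |a-b| with two saturating byte subtractions and
-; an or, then squares and horizontally adds with pmaddwd.
-;
-; static int ssd_wxh( uint8_t *pix1, int i_stride1,
-;                     uint8_t *pix2, int i_stride2, int w, int h )
-; {
-;     int sum = 0;
-;     for( int y = 0; y < h; y++, pix1 += i_stride1, pix2 += i_stride2 )
-;         for( int x = 0; x < w; x++ )
-;         {
-;             int d = pix1[x] - pix2[x];
-;             sum += d * d;
-;         }
-;     return sum;
-; }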
-
-
-%macro SATD_START 0
- lea r10, [3*parm2q] ; 3*stride1
- lea r11, [3*parm4q] ; 3*stride2
-%endmacro
-
-%macro SATD_END 0
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- pshufw mm1, mm0, 10110001b
- paddw mm0, mm1
- movd eax, mm0
- and eax, 0xffff
- ret
-%endmacro
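-
-; Reference only (editor's sketch): what the satd functions below compute.
-; SATD is the sum of absolute values of the 4x4 Hadamard transform of the
-; pixel differences, halved; larger sizes sum this over 4x4 sub-blocks.
-; (The pavgw in HADAMARD4x4_SUM performs a rounded halving, so results can
-; differ from a plain >>1 by rounding.)
-;
-; static int satd_4x4( uint8_t *pix1, int i_stride1,
-;                      uint8_t *pix2, int i_stride2 )
-; {
-;     int16_t d[4][4];
-;     int sum = 0;
-;     for( int i = 0; i < 4; i++, pix1 += i_stride1, pix2 += i_stride2 )
-;     {   /* horizontal 4-point Hadamard of the differences */
-;         int a0 = pix1[0] - pix2[0], a1 = pix1[1] - pix2[1];
-;         int a2 = pix1[2] - pix2[2], a3 = pix1[3] - pix2[3];
-;         int b0 = a0 + a1, b1 = a0 - a1, b2 = a2 + a3, b3 = a2 - a3;
-;         d[i][0] = b0 + b2; d[i][1] = b1 + b3;
-;         d[i][2] = b0 - b2; d[i][3] = b1 - b3;
-;     }
-;     for( int i = 0; i < 4; i++ )
-;     {   /* vertical pass, then sum of absolute coefficients */
-;         int b0 = d[0][i] + d[1][i], b1 = d[0][i] - d[1][i];
-;         int b2 = d[2][i] + d[3][i], b3 = d[2][i] - d[3][i];
-;         sum += abs( b0 + b2 ) + abs( b1 + b3 )
-;              + abs( b0 - b2 ) + abs( b1 - b3 );
-;     }
-;     return sum >> 1;
-; }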
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_4x4_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_4x8_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
- LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x4_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x8_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm2
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_16x8_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
- paddw mm0, mm2
-
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm2
- LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
- paddw mm0, mm2
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x16_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
- paddw mm0, mm2
-
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
- paddw mm1, mm2
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
- paddw mm0, mm2
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_16x16_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
- paddw mm0, mm2
-
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm2
- LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
- paddw mm0, mm2
-
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm2
- LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
- paddw mm0, mm2
-
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm2
- LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
- paddw mm0, mm2
- paddw mm0, mm1
-
- pxor mm3, mm3
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- punpcklwd mm0, mm3
- pshufw mm1, mm0, 01001110b
- paddd mm0, mm1
- movd eax, mm0
- ret
-
-
-; in: parm1 = fenc
-; out: mm0..mm3 = hadamard coefs
-ALIGN 16
-load_hadamard:
- pxor mm7, mm7
- movd mm0, [parm1q+0*FENC_STRIDE]
- movd mm4, [parm1q+1*FENC_STRIDE]
- movd mm3, [parm1q+2*FENC_STRIDE]
- movd mm1, [parm1q+3*FENC_STRIDE]
- punpcklbw mm0, mm7
- punpcklbw mm4, mm7
- punpcklbw mm3, mm7
- punpcklbw mm1, mm7
- HADAMARD4x4 mm0, mm4, mm3, mm1
- TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2
- HADAMARD4x4 mm0, mm1, mm2, mm3
- ret
-
-%macro SCALAR_SUMSUB 4 ; %1=a+b  %2=b-a  %3=c+d  %4=d-c
- add %1, %2
- add %3, %4
- add %2, %2
- add %4, %4
- sub %2, %1
- sub %4, %3
-%endmacro
-
-%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
- pxor %7, %7
- pshufw %4, %1, 01001110b
- pshufw %5, %2, 01001110b
- pshufw %6, %3, 01001110b
- paddw %1, %4
- paddw %2, %5
- paddw %3, %6
- punpcklwd %1, %7
- punpcklwd %2, %7
- punpcklwd %3, %7
- pshufw %4, %1, 01001110b
- pshufw %5, %2, 01001110b
- pshufw %6, %3, 01001110b
- %8 %1, %4
- %8 %2, %5
- %8 %3, %6
-%endmacro
-
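-; Reference only (editor's sketch): the straightforward equivalent of the
-; intra_satd_x3 functions below; predict_4x4_v/h/dc stand in for x264's
-; prediction functions. The asm avoids redoing a full SATD per mode: V, H
-; and DC predictions are constant along columns, rows, or the whole block,
-; so they only disturb the first row/column of Hadamard coefficients, which
-; can be patched in from 1D Hadamards of the edge pixels.
-;
-; static void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] )
-; {
-;     predict_4x4_v( fdec );
-;     res[0] = satd_4x4( fenc, FENC_STRIDE, fdec, FDEC_STRIDE );
-;     predict_4x4_h( fdec );
-;     res[1] = satd_4x4( fenc, FENC_STRIDE, fdec, FDEC_STRIDE );
-;     predict_4x4_dc( fdec );
-;     res[2] = satd_4x4( fenc, FENC_STRIDE, fdec, FDEC_STRIDE );
-; }
-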
-;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
-;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_4x4_mmxext
-%define top_1d  rsp-8  ; size 8
-%define left_1d rsp-16 ; size 8
- call load_hadamard
-
- movzx r8d, byte [parm2q-1+0*FDEC_STRIDE]
- movzx r9d, byte [parm2q-1+1*FDEC_STRIDE]
- movzx r10d, byte [parm2q-1+2*FDEC_STRIDE]
- movzx r11d, byte [parm2q-1+3*FDEC_STRIDE]
- SCALAR_SUMSUB r8d, r9d, r10d, r11d
- SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 1x4 hadamard
- mov [left_1d+0], r8w
- mov [left_1d+2], r9w
- mov [left_1d+4], r10w
- mov [left_1d+6], r11w
- mov eax, r8d ; dc
-
- movzx r8d, byte [parm2q-FDEC_STRIDE+0]
- movzx r9d, byte [parm2q-FDEC_STRIDE+1]
- movzx r10d, byte [parm2q-FDEC_STRIDE+2]
- movzx r11d, byte [parm2q-FDEC_STRIDE+3]
- SCALAR_SUMSUB r8d, r9d, r10d, r11d
- SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 4x1 hadamard
- lea rax, [rax + r8 + 4] ; dc
- mov [top_1d+0], r8w
- mov [top_1d+2], r9w
- mov [top_1d+4], r10w
- mov [top_1d+6], r11w
- and eax, -8
- shl eax, 1
-
- movq mm4, mm1
- movq mm5, mm2
- MMX_ABS_TWO mm4, mm5, mm6, mm7
- movq mm7, mm3
- paddw mm4, mm5
- MMX_ABS mm7, mm6
- paddw mm7, mm4 ; 3x4 sum
-
- movq mm4, [left_1d]
- movd mm5, eax
- psllw mm4, 2
- psubw mm4, mm0
- psubw mm5, mm0
- punpcklwd mm0, mm1
- punpcklwd mm2, mm3
- punpckldq mm0, mm2 ; transpose
- movq mm1, [top_1d]
- psllw mm1, 2
- psubw mm0, mm1
- MMX_ABS mm4, mm3 ; 1x4 sum
- MMX_ABS mm5, mm2 ; 1x4 sum
- MMX_ABS mm0, mm1 ; 4x1 sum
- paddw mm4, mm7
- paddw mm5, mm7
- movq mm1, mm5
- psrlq mm1, 16 ; 4x3 sum
- paddw mm0, mm1
-
- SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
- movd [parm3q+0], mm0 ; i4x4_v satd
- movd [parm3q+4], mm4 ; i4x4_h satd
- movd [parm3q+8], mm5 ; i4x4_dc satd
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
-;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_16x16_mmxext
- sub rsp, 96
-%define sums rsp+64 ; size 24
-%define top_1d rsp+32 ; size 32
-%define left_1d rsp ; size 32
-
- mov qword [sums+0], 0
- mov qword [sums+8], 0
- mov qword [sums+16], 0
-
- ; 1D hadamards
- xor ecx, ecx
- mov eax, 12
-.loop_edge:
- ; left
-    shl  eax, 5 ; log2(FDEC_STRIDE)
- movzx r8d, byte [parm2q+rax-1+0*FDEC_STRIDE]
- movzx r9d, byte [parm2q+rax-1+1*FDEC_STRIDE]
- movzx r10d, byte [parm2q+rax-1+2*FDEC_STRIDE]
- movzx r11d, byte [parm2q+rax-1+3*FDEC_STRIDE]
- shr eax, 5
- SCALAR_SUMSUB r8d, r9d, r10d, r11d
- SCALAR_SUMSUB r8d, r10d, r9d, r11d
- add ecx, r8d
- mov [left_1d+2*rax+0], r8w
- mov [left_1d+2*rax+2], r9w
- mov [left_1d+2*rax+4], r10w
- mov [left_1d+2*rax+6], r11w
-
- ; top
- movzx r8d, byte [parm2q+rax-FDEC_STRIDE+0]
- movzx r9d, byte [parm2q+rax-FDEC_STRIDE+1]
- movzx r10d, byte [parm2q+rax-FDEC_STRIDE+2]
- movzx r11d, byte [parm2q+rax-FDEC_STRIDE+3]
- SCALAR_SUMSUB r8d, r9d, r10d, r11d
- SCALAR_SUMSUB r8d, r10d, r9d, r11d
- add ecx, r8d
- mov [top_1d+2*rax+0], r8w
- mov [top_1d+2*rax+2], r9w
- mov [top_1d+2*rax+4], r10w
- mov [top_1d+2*rax+6], r11w
- sub eax, 4
- jge .loop_edge
-
- ; dc
- shr ecx, 1
- add ecx, 8
- and ecx, -16
-
- ; 2D hadamards
- xor eax, eax
-.loop_y:
- xor esi, esi
-.loop_x:
- call load_hadamard
-
- movq mm4, mm1
- movq mm5, mm2
- MMX_ABS_TWO mm4, mm5, mm6, mm7
- movq mm7, mm3
- paddw mm4, mm5
- MMX_ABS mm7, mm6
- paddw mm7, mm4 ; 3x4 sum
-
- movq mm4, [left_1d+8*rax]
- movd mm5, ecx
- psllw mm4, 2
- psubw mm4, mm0
- psubw mm5, mm0
- punpcklwd mm0, mm1
- punpcklwd mm2, mm3
- punpckldq mm0, mm2 ; transpose
- movq mm1, [top_1d+8*rsi]
- psllw mm1, 2
- psubw mm0, mm1
- MMX_ABS mm4, mm3 ; 1x4 sum
- MMX_ABS mm5, mm2 ; 1x4 sum
- MMX_ABS mm0, mm1 ; 4x1 sum
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+0] ; i4x4_v satd
- paddw mm4, [sums+8] ; i4x4_h satd
- paddw mm5, [sums+16] ; i4x4_dc satd
- movq [sums+0], mm0
- movq [sums+8], mm4
- movq [sums+16], mm5
-
- add parm1q, 4
- inc esi
- cmp esi, 4
- jl .loop_x
- add parm1q, 4*FENC_STRIDE-16
- inc eax
- cmp eax, 4
- jl .loop_y
-
-; horizontal sum
- movq mm2, [sums+16]
- movq mm1, [sums+8]
- movq mm0, [sums+0]
- movq mm7, mm2
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- psrld mm0, 1
- pslld mm7, 16
- psrld mm7, 16
- paddd mm0, mm2
- psubd mm0, mm7
- movd [parm3q+8], mm2 ; i16x16_dc satd
- movd [parm3q+4], mm1 ; i16x16_h satd
- movd [parm3q+0], mm0 ; i16x16_v satd
- add rsp, 96
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
-;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_8x8c_mmxext
- sub rsp, 64
-%define sums rsp+32 ; size 24
-%define top_1d rsp+16 ; size 16
-%define left_1d rsp ; size 16
-
- mov qword [sums+0], 0
- mov qword [sums+8], 0
- mov qword [sums+16], 0
-
- ; 1D hadamards
- mov eax, 4
-.loop_edge:
- ; left
-    shl  eax, 5 ; log2(FDEC_STRIDE)
- movzx r8d, byte [parm2q+rax-1+0*FDEC_STRIDE]
- movzx r9d, byte [parm2q+rax-1+1*FDEC_STRIDE]
- movzx r10d, byte [parm2q+rax-1+2*FDEC_STRIDE]
- movzx r11d, byte [parm2q+rax-1+3*FDEC_STRIDE]
- shr eax, 5
- SCALAR_SUMSUB r8d, r9d, r10d, r11d
- SCALAR_SUMSUB r8d, r10d, r9d, r11d
- mov [left_1d+2*rax+0], r8w
- mov [left_1d+2*rax+2], r9w
- mov [left_1d+2*rax+4], r10w
- mov [left_1d+2*rax+6], r11w
-
- ; top
- movzx r8d, byte [parm2q+rax-FDEC_STRIDE+0]
- movzx r9d, byte [parm2q+rax-FDEC_STRIDE+1]
- movzx r10d, byte [parm2q+rax-FDEC_STRIDE+2]
- movzx r11d, byte [parm2q+rax-FDEC_STRIDE+3]
- SCALAR_SUMSUB r8d, r9d, r10d, r11d
- SCALAR_SUMSUB r8d, r10d, r9d, r11d
- mov [top_1d+2*rax+0], r8w
- mov [top_1d+2*rax+2], r9w
- mov [top_1d+2*rax+4], r10w
- mov [top_1d+2*rax+6], r11w
- sub eax, 4
- jge .loop_edge
-
- ; dc
- movzx r8d, word [left_1d+0]
- movzx r9d, word [top_1d+0]
- movzx r10d, word [left_1d+8]
- movzx r11d, word [top_1d+8]
- add r8d, r9d
- lea r9, [r10 + r11]
- lea r8, [2*r8 + 8]
- lea r9, [2*r9 + 8]
- lea r10, [4*r10 + 8]
- lea r11, [4*r11 + 8]
- and r8d, -16 ; tl
- and r9d, -16 ; br
- and r10d, -16 ; bl
- and r11d, -16 ; tr
- shl r9, 16
- mov r9w, r10w
- shl r9, 16
- mov r9w, r11w
- shl r9, 16
- mov r9w, r8w
-
- ; 2D hadamards
- xor eax, eax
-.loop_y:
- xor esi, esi
-.loop_x:
- call load_hadamard
-
- movq mm4, mm1
- movq mm5, mm2
- MMX_ABS_TWO mm4, mm5, mm6, mm7
- movq mm7, mm3
- paddw mm4, mm5
- MMX_ABS mm7, mm6
- paddw mm7, mm4 ; 3x4 sum
-
- movq mm4, [left_1d+8*rax]
- movzx ecx, r9w
- shr r9, 16
- movd mm5, ecx
- psllw mm4, 2
- psubw mm4, mm0
- psubw mm5, mm0
- punpcklwd mm0, mm1
- punpcklwd mm2, mm3
- punpckldq mm0, mm2 ; transpose
- movq mm1, [top_1d+8*rsi]
- psllw mm1, 2
- psubw mm0, mm1
- MMX_ABS mm4, mm3 ; 1x4 sum
- MMX_ABS mm5, mm2 ; 1x4 sum
- MMX_ABS mm0, mm1 ; 4x1 sum
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+16] ; i4x4_v satd
- paddw mm4, [sums+8] ; i4x4_h satd
- paddw mm5, [sums+0] ; i4x4_dc satd
- movq [sums+16], mm0
- movq [sums+8], mm4
- movq [sums+0], mm5
-
- add parm1q, 4
- inc esi
- cmp esi, 2
- jl .loop_x
- add parm1q, 4*FENC_STRIDE-8
- inc eax
- cmp eax, 2
- jl .loop_y
-
-; horizontal sum
- movq mm0, [sums+0]
- movq mm1, [sums+8]
- movq mm2, [sums+16]
- movq mm7, mm0
- psrlq mm7, 15
- paddw mm2, mm7
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- psrld mm2, 1
- movd [parm3q+0], mm0 ; i8x8c_dc satd
- movd [parm3q+4], mm1 ; i8x8c_h satd
- movd [parm3q+8], mm2 ; i8x8c_v satd
- add rsp, 64
- ret
-
-
-
-; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
-; {
-; int nmv=0, i, j;
-; *(uint32_t*)(masks+width) = 0;
-; for( i=0; i<width; i+=8 )
-; {
-; uint64_t mask = *(uint64_t*)(masks+i);
-; if( !mask ) continue;
-; for( j=0; j<8; j++ )
-;             if( mask & (255ULL<<j*8) )
-; mvs[nmv++] = i+j;
-; }
-; return nmv;
-; }
-cglobal x264_pixel_ads_mvs
- ; mvs = parm5q
- ; masks = rsp
- ; width = r10
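-    ; note: reached only via jmp from the ads kernels below (ADS_END), so the
-    ; stack frame and mask buffer from ADS_START are still live; hence the
-    ; 'leave' before each ret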
- mov dword [rsp+r10], 0
- xor eax, eax
- xor esi, esi
-.loopi:
- mov rdi, [rsp+rsi]
- test rdi, rdi
- jz .nexti
- xor ecx, ecx
-%macro TEST 1 ; branchless: always store i+j, advance nmv only if mask byte %1 is set
- mov [parm5q+rax*2], si
- test edi, 0xff<<(%1*8)
- setne cl
- add eax, ecx
- inc esi
-%endmacro
- TEST 0
- TEST 1
- TEST 2
- TEST 3
- shr rdi, 32
- TEST 0
- TEST 1
- TEST 2
- TEST 3
- cmp esi, r10d
- jl .loopi
- leave
- ret
-.nexti:
- add esi, 8
- cmp esi, r10d
- jl .loopi
- leave
- ret
-
-%macro ADS_START 0
- push rbp
- mov rbp, rsp
- sub rsp, parm6q
- sub rsp, 4
- and rsp, ~15
- mov rax, rsp
- mov r10d, parm6d
- shl parm3q, 1
-%endmacro
-
-%macro ADS_END 1
- add parm2q, 8*%1
- add parm4q, 8*%1
- add rax, 4*%1
- sub parm6d, 4*%1
- jg .loop
- jmp x264_pixel_ads_mvs
-%endmacro
-
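-; Reference only (editor's sketch): plain-C equivalent of the ads kernels,
-; in the argument order documented below. Each position compares the four
-; sub-block sums of the encoded block against precomputed sums at that
-; candidate, adds the mv cost, and stores a byte that is nonzero iff the
-; total is under the threshold; x264_pixel_ads_mvs then compacts the mask
-; buffer into an mv list.
-;
-; static void ads4( int enc_dc[4], uint16_t *sums, int delta,
-;                   uint16_t *cost_mvx, uint8_t *masks, int width, int thresh )
-; {
-;     for( int i = 0; i < width; i++ )
-;     {
-;         int ads = abs( enc_dc[0] - sums[i] )
-;                 + abs( enc_dc[1] - sums[i+8] )
-;                 + abs( enc_dc[2] - sums[i+delta] )
-;                 + abs( enc_dc[3] - sums[i+delta+8] )
-;                 + cost_mvx[i];
-;         masks[i] = ads < thresh ? 0xff : 0; /* asm stores saturated thresh-ads */
-;     }
-; }
-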
-;-----------------------------------------------------------------------------
-; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ads4_mmxext
- movq mm6, [parm1q]
- movq mm4, [parm1q+8]
- pshufw mm7, mm6, 0
- pshufw mm6, mm6, 0xAA
- pshufw mm5, mm4, 0
- pshufw mm4, mm4, 0xAA
- ADS_START
-.loop:
- movq mm0, [parm2q]
- movq mm1, [parm2q+16]
- psubw mm0, mm7
- psubw mm1, mm6
- MMX_ABS mm0, mm2
- MMX_ABS mm1, mm3
- movq mm2, [parm2q+parm3q]
- movq mm3, [parm2q+parm3q+16]
- psubw mm2, mm5
- psubw mm3, mm4
- paddw mm0, mm1
- MMX_ABS mm2, mm1
- MMX_ABS mm3, mm1
- paddw mm0, mm2
- paddw mm0, mm3
- pshufw mm1, [rbp+16], 0
- paddusw mm0, [parm4q]
- psubusw mm1, mm0
- packsswb mm1, mm1
- movd [rax], mm1
- ADS_END 1
-
-cglobal x264_pixel_ads2_mmxext
- movq mm6, [parm1q]
- pshufw mm5, parm7q, 0
- pshufw mm7, mm6, 0
- pshufw mm6, mm6, 0xAA
- ADS_START
-.loop:
- movq mm0, [parm2q]
- movq mm1, [parm2q+parm3q]
- psubw mm0, mm7
- psubw mm1, mm6
- MMX_ABS mm0, mm2
- MMX_ABS mm1, mm3
- paddw mm0, mm1
- paddusw mm0, [parm4q]
- movq mm4, mm5
- psubusw mm4, mm0
- packsswb mm4, mm4
- movd [rax], mm4
- ADS_END 1
-
-cglobal x264_pixel_ads1_mmxext
- pshufw mm7, [parm1q], 0
- pshufw mm6, parm7q, 0
- ADS_START
-.loop:
- movq mm0, [parm2q]
- movq mm1, [parm2q+8]
- psubw mm0, mm7
- psubw mm1, mm7
- MMX_ABS mm0, mm2
- MMX_ABS mm1, mm3
- paddusw mm0, [parm4q]
- paddusw mm1, [parm4q+8]
- movq mm4, mm6
- movq mm5, mm6
- psubusw mm4, mm0
- psubusw mm5, mm1
- packsswb mm4, mm5
- movq [rax], mm4
- ADS_END 2
-
-%macro ADS_SSE2 1
-cglobal x264_pixel_ads4_%1
- movdqa xmm4, [parm1q]
- pshuflw xmm8, parm7q, 0
- pshuflw xmm7, xmm4, 0
- pshuflw xmm6, xmm4, 0xAA
- pshufhw xmm5, xmm4, 0
- pshufhw xmm4, xmm4, 0xAA
- punpcklqdq xmm8, xmm8
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
- punpckhqdq xmm5, xmm5
- punpckhqdq xmm4, xmm4
- ADS_START
- movdqu xmm10, [parm2q]
- movdqu xmm11, [parm2q+parm3q]
-.loop:
- movdqa xmm0, xmm10
- movdqu xmm1, [parm2q+16]
- movdqa xmm10, xmm1
- psubw xmm0, xmm7
- psubw xmm1, xmm6
- MMX_ABS xmm0, xmm2
- MMX_ABS xmm1, xmm3
- movdqa xmm2, xmm11
- movdqu xmm3, [parm2q+parm3q+16]
- movdqa xmm11, xmm3
- psubw xmm2, xmm5
- psubw xmm3, xmm4
- paddw xmm0, xmm1
- movdqu xmm9, [parm4q]
- MMX_ABS xmm2, xmm1
- MMX_ABS xmm3, xmm1
- paddw xmm0, xmm2
- paddw xmm0, xmm3
- paddusw xmm0, xmm9
- movdqa xmm1, xmm8
- psubusw xmm1, xmm0
- packsswb xmm1, xmm1
- movq [rax], xmm1
- ADS_END 2
-
-cglobal x264_pixel_ads2_%1
- movq xmm6, [parm1q]
- pshuflw xmm8, parm7q, 0
- pshuflw xmm7, xmm6, 0
- pshuflw xmm6, xmm6, 0xAA
- punpcklqdq xmm8, xmm8
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
- ADS_START
-.loop:
- movdqu xmm0, [parm2q]
- movdqu xmm1, [parm2q+parm3q]
- psubw xmm0, xmm7
- psubw xmm1, xmm6
- movdqu xmm9, [parm4q]
- MMX_ABS xmm0, xmm2
- MMX_ABS xmm1, xmm3
- paddw xmm0, xmm1
- paddusw xmm0, xmm9
- movdqa xmm4, xmm8
- psubusw xmm4, xmm0
- packsswb xmm4, xmm4
- movq [rax], xmm4
- ADS_END 2
-
-cglobal x264_pixel_ads1_%1
- pshuflw xmm7, [parm1q], 0
- pshuflw xmm8, parm7q, 0
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm8, xmm8
- ADS_START
-.loop:
- movdqu xmm0, [parm2q]
- movdqu xmm1, [parm2q+16]
- psubw xmm0, xmm7
- psubw xmm1, xmm7
- movdqu xmm9, [parm4q]
- movdqu xmm10, [parm4q+16]
- MMX_ABS xmm0, xmm2
- MMX_ABS xmm1, xmm3
- paddusw xmm0, xmm9
- paddusw xmm1, xmm10
- movdqa xmm4, xmm8
- movdqa xmm5, xmm8
- psubusw xmm4, xmm0
- psubusw xmm5, xmm1
- packsswb xmm4, xmm5
- movdqa [rax], xmm4
- ADS_END 4
-%endmacro
-
-ADS_SSE2 sse2
-%ifdef HAVE_SSE3
-%macro MMX_ABS 2 ; redefined for ssse3: pabsw needs no scratch, %2 unused
- pabsw %1, %1
-%endmacro
-ADS_SSE2 ssse3
-%endif
+++ /dev/null
-;*****************************************************************************
-;* pixel-sse2.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2005 x264 project
-;*
-;* Authors: Alex Izvorski <aizvorksi@gmail.com>
-;* Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-BITS 64
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "amd64inc.asm"
-
-SECTION_RODATA
-
-pw_1: times 8 dw 1
-ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
-ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
-mask_ff: times 16 db 0xff
- times 16 db 0
-sw_64: dq 64
-
-SECTION .text
-
-%macro HADDD 2 ; sum junk
- movhlps %2, %1
- paddd %1, %2
- pshuflw %2, %1, 0xE
- paddd %1, %2
-%endmacro
-
-%macro HADDW 2
- pmaddwd %1, [pw_1 GLOBAL]
- HADDD %1, %2
-%endmacro
-
-%macro SAD_END_SSE2 0
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- movd eax, xmm0
- ret
-%endmacro
-
-%macro SAD_W16 1
-;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x16_%1
- movdqu xmm0, [rdx]
- movdqu xmm1, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- movdqu xmm2, [rdx]
- movdqu xmm3, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- psadbw xmm0, [rdi]
- psadbw xmm1, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- movdqu xmm4, [rdx]
- paddw xmm0, xmm1
- psadbw xmm2, [rdi]
- psadbw xmm3, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- movdqu xmm5, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- paddw xmm2, xmm3
- movdqu xmm6, [rdx]
- movdqu xmm7, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- paddw xmm0, xmm2
- psadbw xmm4, [rdi]
- psadbw xmm5, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- movdqu xmm1, [rdx]
- paddw xmm4, xmm5
- psadbw xmm6, [rdi]
- psadbw xmm7, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- movdqu xmm2, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- paddw xmm6, xmm7
- movdqu xmm3, [rdx]
- paddw xmm0, xmm4
- movdqu xmm4, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- paddw xmm0, xmm6
- psadbw xmm1, [rdi]
- psadbw xmm2, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- movdqu xmm5, [rdx]
- paddw xmm1, xmm2
- psadbw xmm3, [rdi]
- psadbw xmm4, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- movdqu xmm6, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- paddw xmm3, xmm4
- movdqu xmm7, [rdx]
- paddw xmm0, xmm1
- movdqu xmm1, [rdx+rcx]
- paddw xmm0, xmm3
- psadbw xmm5, [rdi]
- psadbw xmm6, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- paddw xmm5, xmm6
- psadbw xmm7, [rdi]
- psadbw xmm1, [rdi+rsi]
- paddw xmm7, xmm1
- paddw xmm0, xmm5
- paddw xmm0, xmm7
- SAD_END_SSE2
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x8_%1
- movdqu xmm0, [rdx]
- movdqu xmm2, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- movdqu xmm3, [rdx]
- movdqu xmm4, [rdx+rcx]
- psadbw xmm0, [rdi]
- psadbw xmm2, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- psadbw xmm3, [rdi]
- psadbw xmm4, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- lea rdx, [rdx+2*rcx]
- paddw xmm0, xmm2
- paddw xmm3, xmm4
- paddw xmm0, xmm3
- movdqu xmm1, [rdx]
- movdqu xmm2, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- movdqu xmm3, [rdx]
- movdqu xmm4, [rdx+rcx]
- psadbw xmm1, [rdi]
- psadbw xmm2, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- psadbw xmm3, [rdi]
- psadbw xmm4, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- lea rdx, [rdx+2*rcx]
- paddw xmm1, xmm2
- paddw xmm3, xmm4
- paddw xmm0, xmm1
- paddw xmm0, xmm3
- SAD_END_SSE2
-%endmacro
-
-SAD_W16 sse2
-%ifdef HAVE_SSE3
-%define movdqu lddqu
-SAD_W16 sse3
-%undef movdqu
-%endif
-
-
-; sad x3 / x4
-
-%macro SAD_X3_START_1x16P 0
- movdqa xmm3, [parm1q]
- movdqu xmm0, [parm2q]
- movdqu xmm1, [parm3q]
- movdqu xmm2, [parm4q]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
-%endmacro
-
-%macro SAD_X3_1x16P 2
- movdqa xmm3, [parm1q+%1]
- movdqu xmm4, [parm2q+%2]
- movdqu xmm5, [parm3q+%2]
- movdqu xmm6, [parm4q+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm6, xmm3
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
-%endmacro
-
-%macro SAD_X3_2x16P 1
-%if %1
- SAD_X3_START_1x16P
-%else
- SAD_X3_1x16P 0, 0
-%endif
- SAD_X3_1x16P FENC_STRIDE, parm5q
- add parm1q, 2*FENC_STRIDE
- lea parm2q, [parm2q+2*parm5q]
- lea parm3q, [parm3q+2*parm5q]
- lea parm4q, [parm4q+2*parm5q]
-%endmacro
-
-%macro SAD_X4_START_1x16P 0
- movdqa xmm7, [parm1q]
- movdqu xmm0, [parm2q]
- movdqu xmm1, [parm3q]
- movdqu xmm2, [parm4q]
- movdqu xmm3, [parm5q]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
- psadbw xmm3, xmm7
-%endmacro
-
-%macro SAD_X4_1x16P 2
- movdqa xmm7, [parm1q+%1]
- movdqu xmm4, [parm2q+%2]
- movdqu xmm5, [parm3q+%2]
- movdqu xmm6, [parm4q+%2]
- movdqu xmm8, [parm5q+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm8, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm8
-%endmacro
-
-%macro SAD_X4_2x16P 1
-%if %1
- SAD_X4_START_1x16P
-%else
- SAD_X4_1x16P 0, 0
-%endif
- SAD_X4_1x16P FENC_STRIDE, parm6q
- add parm1q, 2*FENC_STRIDE
- lea parm2q, [parm2q+2*parm6q]
- lea parm3q, [parm3q+2*parm6q]
- lea parm4q, [parm4q+2*parm6q]
- lea parm5q, [parm5q+2*parm6q]
-%endmacro
-
-%macro SAD_X3_END 0
- movhlps xmm4, xmm0
- movhlps xmm5, xmm1
- movhlps xmm6, xmm2
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- movd [parm6q+0], xmm0
- movd [parm6q+4], xmm1
- movd [parm6q+8], xmm2
- ret
-%endmacro
-
-%macro SAD_X4_END 0
- mov rax, parm7q
- psllq xmm1, 32
- psllq xmm3, 32
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movhlps xmm1, xmm0
- movhlps xmm3, xmm2
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movq [rax+0], xmm0
- movq [rax+8], xmm2
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
-;-----------------------------------------------------------------------------
-%macro SAD_X 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4
- SAD_X%1_2x%2P 1
-%rep %3/2-1
- SAD_X%1_2x%2P 0
-%endrep
- SAD_X%1_END
-%endmacro
-
-SAD_X 3, 16, 16, sse2
-SAD_X 3, 16, 8, sse2
-SAD_X 4, 16, 16, sse2
-SAD_X 4, 16, 8, sse2
-
-%ifdef HAVE_SSE3
-%define movdqu lddqu
-SAD_X 3, 16, 16, sse3
-SAD_X 3, 16, 8, sse3
-SAD_X 4, 16, 16, sse3
-SAD_X 4, 16, 8, sse3
-%undef movdqu
-%endif
-
-
-; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
-; unless the unaligned data spans the border between 2 cachelines, in which
-; case it's really slow. The exact numbers may differ, but all Intel CPUs
-; have a large penalty for cacheline splits.
-; (8-byte alignment exactly halfway between two cachelines is ok though.)
-; LDDQU was supposed to fix this, but it only works on Pentium 4.
-; So in the split case we load aligned data and explicitly perform the
-; alignment between registers, as on archs that have only aligned loads,
-; except complicated by the fact that PALIGNR takes only an immediate, not
-; a variable alignment.
-; It is also possible to hoist the realignment to the macroblock level (keep
-; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
-; needed for that method often makes it slower.
-
-; sad 16x16 costs on Core2:
-; good offsets: 49 cycles (50/64 of all mvs)
-; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
-; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
-; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
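-
-; Reference only (editor's sketch): the split test used by the cache64
-; functions below, in C. A 16-byte row load at address p crosses a 64-byte
-; cacheline iff (p&63) > 48; masking with 0x37 instead of 0x3f forgives the
-; 8-byte-aligned halfway case mentioned above.
-;
-; static int sad16_is_cacheline_split( intptr_t p )
-; {
-;     return ( p & 0x37 ) > 0x30;
-; }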
-
-; computed jump assumes this loop is exactly 64 bytes
-%macro SAD16_CACHELINE_LOOP 1 ; alignment
-ALIGN 16
-sad_w16_align%1:
- movdqa xmm1, [rdx+16]
- movdqa xmm2, [rdx+rcx+16]
- palignr xmm1, [rdx], %1
- palignr xmm2, [rdx+rcx], %1
- psadbw xmm1, [rdi]
- psadbw xmm2, [rdi+rsi]
- paddw xmm0, xmm1
- paddw xmm0, xmm2
- lea rdx, [rdx+2*rcx]
- lea rdi, [rdi+2*rsi]
- dec eax
- jg sad_w16_align%1
- ret
-%endmacro
-
-%macro SAD16_CACHELINE_FUNC 1 ; height
-cglobal x264_pixel_sad_16x%1_cache64_ssse3
- mov eax, parm3d
- and eax, 0x37
- cmp eax, 0x30
- jle x264_pixel_sad_16x%1_sse2
- mov eax, parm3d
- and eax, 15
- shl eax, 6
-%ifdef __PIC__
- lea r10, [sad_w16_align1 - 64 GLOBAL]
- add r10, rax
-%else
- lea r10, [sad_w16_align1 - 64 + rax]
-%endif
- and parm3q, ~15
- mov eax, %1/2
- pxor xmm0, xmm0
- call r10
- SAD_END_SSE2
-%endmacro
-
-%macro SAD8_CACHELINE_FUNC 1 ; height
-cglobal x264_pixel_sad_8x%1_cache64_mmxext
- mov eax, parm3d
- and eax, 0x3f
- cmp eax, 0x38
- jle x264_pixel_sad_8x%1_mmxext
- and eax, 7
- shl eax, 3
- movd mm6, [sw_64 GLOBAL]
- movd mm7, eax
- psubw mm6, mm7
- and parm3q, ~7
- mov eax, %1/2
- pxor mm0, mm0
-.loop:
- movq mm1, [parm3q+8]
- movq mm2, [parm3q+parm4q+8]
- movq mm3, [parm3q]
- movq mm4, [parm3q+parm4q]
- psllq mm1, mm6
- psllq mm2, mm6
- psrlq mm3, mm7
- psrlq mm4, mm7
- por mm1, mm3
- por mm2, mm4
- psadbw mm1, [parm1q]
- psadbw mm2, [parm1q+parm2q]
- paddw mm0, mm1
- paddw mm0, mm2
- lea parm3q, [parm3q+2*parm4q]
- lea parm1q, [parm1q+2*parm2q]
- dec eax
- jg .loop
- movd eax, mm0
- ret
-%endmacro
-
-
-; sad_x3/x4_cache64: check each mv.
-; if they're all within a cacheline, use normal sad_x3/x4.
-; otherwise, send them individually to sad_cache64.
-%macro CHECK_SPLIT 2 ; pix, width
- mov eax, %1
- and eax, 0x37|%2
- cmp eax, 0x30|%2
- jg .split
-%endmacro
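-
-; Reference only (editor's sketch): the dispatch in C, using the predicate
-; sketched earlier (CHECK_SPLIT folds the block width into the constants):
-;
-; if( is_split(pix0) || is_split(pix1) || is_split(pix2) )
-;     { /* score each candidate separately via the cache64 single sad */ }
-; else
-;     sad_x3_normal( fenc, pix0, pix1, pix2, i_stride, scores );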
-
-%macro SADX3_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver
-cglobal x264_pixel_sad_x3_%1x%2_cache64_%4
- CHECK_SPLIT parm2d, %1
- CHECK_SPLIT parm3d, %1
- CHECK_SPLIT parm4d, %1
- jmp x264_pixel_sad_x3_%1x%2_%3
-.split:
- push parm4q
- push parm3q
- mov parm3q, parm2q
- mov parm2q, FENC_STRIDE
- mov parm4q, parm5q
- mov parm5q, parm1q
- call x264_pixel_sad_%1x%2_cache64_%4
- mov [parm6q], eax
- pop parm3q
- mov parm1q, parm5q
- call x264_pixel_sad_%1x%2_cache64_%4
- mov [parm6q+4], eax
- pop parm3q
- mov parm1q, parm5q
- call x264_pixel_sad_%1x%2_cache64_%4
- mov [parm6q+8], eax
- ret
-%endmacro
-
-%macro SADX4_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver
-cglobal x264_pixel_sad_x4_%1x%2_cache64_%4
- CHECK_SPLIT parm2d, %1
- CHECK_SPLIT parm3d, %1
- CHECK_SPLIT parm4d, %1
- CHECK_SPLIT parm5d, %1
- jmp x264_pixel_sad_x4_%1x%2_%3
-.split:
- mov r11, parm7q
- push parm5q
- push parm4q
- push parm3q
- mov parm3q, parm2q
- mov parm2q, FENC_STRIDE
- mov parm4q, parm6q
- mov parm5q, parm1q
- call x264_pixel_sad_%1x%2_cache64_%4
- mov [r11], eax
- pop parm3q
- mov parm1q, parm5q
- call x264_pixel_sad_%1x%2_cache64_%4
- mov [r11+4], eax
- pop parm3q
- mov parm1q, parm5q
- call x264_pixel_sad_%1x%2_cache64_%4
- mov [r11+8], eax
- pop parm3q
- mov parm1q, parm5q
- call x264_pixel_sad_%1x%2_cache64_%4
- mov [r11+12], eax
- ret
-%endmacro
-
-%macro SADX34_CACHELINE_FUNC 4
- SADX3_CACHELINE_FUNC %1, %2, %3, %4
- SADX4_CACHELINE_FUNC %1, %2, %3, %4
-%endmacro
-
-cextern x264_pixel_sad_8x16_mmxext
-cextern x264_pixel_sad_8x8_mmxext
-cextern x264_pixel_sad_8x4_mmxext
-cextern x264_pixel_sad_x3_8x16_mmxext
-cextern x264_pixel_sad_x3_8x8_mmxext
-cextern x264_pixel_sad_x4_8x16_mmxext
-cextern x264_pixel_sad_x4_8x8_mmxext
-
-; instantiate the aligned sads
-
-SAD8_CACHELINE_FUNC 4
-SAD8_CACHELINE_FUNC 8
-SAD8_CACHELINE_FUNC 16
-SADX34_CACHELINE_FUNC 8, 16, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, mmxext, mmxext
-
-%ifdef HAVE_SSE3
-
-SAD16_CACHELINE_FUNC 8
-SAD16_CACHELINE_FUNC 16
-%assign i 1
-%rep 15
-SAD16_CACHELINE_LOOP i
-%assign i i+1
-%endrep
-
-SADX34_CACHELINE_FUNC 16, 16, sse2, ssse3
-SADX34_CACHELINE_FUNC 16, 8, sse2, ssse3
-
-%endif ; HAVE_SSE3
-
-
-; ssd
-
-%macro SSD_INC_2x16P_SSE2 0
- movdqu xmm1, [rdi]
- movdqu xmm2, [rdx]
- movdqu xmm3, [rdi+rsi]
- movdqu xmm4, [rdx+rcx]
-
- movdqa xmm5, xmm1
- movdqa xmm6, xmm3
- psubusb xmm1, xmm2
- psubusb xmm3, xmm4
- psubusb xmm2, xmm5
- psubusb xmm4, xmm6
- por xmm1, xmm2
- por xmm3, xmm4
-
- movdqa xmm2, xmm1
- movdqa xmm4, xmm3
- punpcklbw xmm1, xmm7
- punpckhbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- punpckhbw xmm4, xmm7
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- pmaddwd xmm4, xmm4
-
- lea rdi, [rdi+2*rsi]
- lea rdx, [rdx+2*rcx]
-
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm0, xmm1
- paddd xmm0, xmm3
-%endmacro
-
-%macro SSD_START_SSE2 0
- pxor xmm7, xmm7 ; zero
-    pxor    xmm0, xmm0      ; xmm0 holds the sum
-%endmacro
-
-%macro SSD_END_SSE2 0
- HADDD xmm0, xmm1
- movd eax, xmm0
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssd_16x16_sse2
- SSD_START_SSE2
-%rep 8
- SSD_INC_2x16P_SSE2
-%endrep
- SSD_END_SSE2
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssd_16x8_sse2
- SSD_START_SSE2
-%rep 4
- SSD_INC_2x16P_SSE2
-%endrep
- SSD_END_SSE2
-
-
-
-%macro SUMSUB_BADC 4 ; %1=a+b  %2=b-a  %3=c+d  %4=d-c
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
-%endmacro
-
-%macro HADAMARD1x4 4
- SUMSUB_BADC %1, %2, %3, %4
- SUMSUB_BADC %1, %3, %2, %4
-%endmacro
-
-%macro HADAMARD1x8 8
- SUMSUB_BADC %1, %5, %2, %6
- SUMSUB_BADC %3, %7, %4, %8
- SUMSUB_BADC %1, %3, %2, %4
- SUMSUB_BADC %5, %7, %6, %8
- SUMSUB_BADC %1, %2, %3, %4
- SUMSUB_BADC %5, %6, %7, %8
-%endmacro
-
-;;; row transform not used, because phaddw is much slower than paddw on a Conroe
-;%macro PHSUMSUB 3
-; movdqa %3, %1
-; phaddw %1, %2
-; phsubw %3, %2
-;%endmacro
-
-;%macro HADAMARD4x1_SSSE3 5 ; ABCD-T -> ADTC
-; PHSUMSUB %1, %2, %5
-; PHSUMSUB %3, %4, %2
-; PHSUMSUB %1, %3, %4
-; PHSUMSUB %5, %2, %3
-;%endmacro
-
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers
- mov%1 %5, %3
- punpckh%2 %3, %4
- punpckl%2 %5, %4
-%endmacro
-
-%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
- SBUTTERFLY dqa, dq, %1, %2, %5
- SBUTTERFLY dqa, dq, %3, %4, %2
- SBUTTERFLY dqa, qdq, %1, %3, %4
- SBUTTERFLY dqa, qdq, %5, %2, %3
-%endmacro
-
-%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
- SBUTTERFLY dqa, wd, %1, %2, %5
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, dq, %1, %3, %4
- SBUTTERFLY2 dqa, dq, %5, %2, %3
- SBUTTERFLY dqa, qdq, %1, %3, %2
- SBUTTERFLY2 dqa, qdq, %4, %5, %3
-%endmacro
-
-%macro TRANSPOSE8x8 9 ; ABCDEFGH-T -> AFHDTECB
- SBUTTERFLY dqa, wd, %1, %2, %9
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- SBUTTERFLY dqa, dq, %9, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %9, %4, %5
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
-%endmacro
-
-%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
- movq %1, %3
- movq %2, %4
- punpcklbw %1, %2
- punpcklbw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro LOAD_DIFF_4x8P 6 ; 4x dest, 2x temp
- LOAD_DIFF_8P %1, %5, [parm1q], [parm3q]
- LOAD_DIFF_8P %2, %6, [parm1q+parm2q], [parm3q+parm4q]
- LOAD_DIFF_8P %3, %5, [parm1q+2*parm2q], [parm3q+2*parm4q]
- LOAD_DIFF_8P %4, %6, [parm1q+r10], [parm3q+r11]
-%endmacro
-
-%macro SUM1x8_SSE2 3 ; 01 junk sum
- pxor %2, %2
- psubw %2, %1
- pmaxsw %1, %2
- paddusw %3, %1
-%endmacro
-
-%macro SUM4x4_SSE2 4 ; 02 13 junk sum
- pxor %3, %3
- psubw %3, %1
- pmaxsw %1, %3
-
- pxor %3, %3
- psubw %3, %2
- pmaxsw %2, %3
-
- paddusw %4, %1
- paddusw %4, %2
-%endmacro
-
-%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
- pxor %3, %3
- pxor %6, %6
- psubw %3, %1
- psubw %6, %4
- pmaxsw %1, %3
- pmaxsw %4, %6
- pxor %3, %3
- pxor %6, %6
- psubw %3, %2
- psubw %6, %5
- pmaxsw %2, %3
- pmaxsw %5, %6
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
-%endmacro
-
-%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . sum
- pabsw %1, %1
- pabsw %2, %2
- pabsw %4, %4
- pabsw %5, %5
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
-%endmacro
-
-%macro SATD_TWO_SSE2 0
- LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- lea parm1q, [parm1q+4*parm2q]
- lea parm3q, [parm3q+4*parm4q]
- HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
- TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
- HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
- SUM8x4 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
-%endmacro
-
-%macro SATD_START 0
- pxor xmm6, xmm6
- lea r10, [3*parm2q]
- lea r11, [3*parm4q]
-%endmacro
-
-%macro SATD_END 0
- psrlw xmm6, 1
- HADDW xmm6, xmm7
- movd eax, xmm6
- ret
-%endmacro
-
-%macro SATDS 1
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_16x16_%1
- SATD_START
- mov r8, rdi
- mov r9, rdx
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- lea rdi, [r8+8]
- lea rdx, [r9+8]
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x16_%1
- SATD_START
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_16x8_%1
- SATD_START
- mov r8, rdi
- mov r9, rdx
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- lea rdi, [r8+8]
- lea rdx, [r9+8]
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x8_%1
- SATD_START
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x4_%1
- SATD_START
- SATD_TWO_SSE2
- SATD_END
-
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_%1
- lea r10, [3*parm2q]
- lea r11, [3*parm4q]
- LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8
- lea parm1q, [parm1q+4*parm2q]
- lea parm3q, [parm3q+4*parm4q]
- LOAD_DIFF_4x8P xmm4, xmm5, xmm6, xmm7, xmm8, xmm8
-
- HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
- HADAMARD1x8 xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
-
- pxor xmm10, xmm10
- SUM8x4 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
- SUM8x4 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
- psrlw xmm10, 1
- HADDW xmm10, xmm0
- movd eax, xmm10
- add r8d, eax ; preserve rounding for 16x16
- add eax, 1
- shr eax, 1
- ret
-
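-; Reference only (editor's sketch): sa8d is SATD with an 8x8 rather than a
-; 4x4 Hadamard. The asm halves once with psrlw and once with the final
-; add/shr, i.e. roughly (sum + 2) >> 2. hadamard8 is any unnormalized
-; 8-point Hadamard; sign/order permutations don't change the sum of
-; absolute values.
-;
-; static void hadamard8( int16_t v[8] )
-; {
-;     for( int s = 1; s < 8; s <<= 1 )          /* 3 butterfly stages */
-;         for( int i = 0; i < 8; i += 2*s )
-;             for( int j = i; j < i+s; j++ )
-;             {
-;                 int a = v[j], b = v[j+s];
-;                 v[j]   = a + b;
-;                 v[j+s] = a - b;
-;             }
-; }
-;
-; static int sa8d_8x8( uint8_t *pix1, int i_stride1,
-;                      uint8_t *pix2, int i_stride2 )
-; {
-;     int16_t d[8][8], col[8];
-;     int sum = 0;
-;     for( int y = 0; y < 8; y++, pix1 += i_stride1, pix2 += i_stride2 )
-;         for( int x = 0; x < 8; x++ )
-;             d[y][x] = pix1[x] - pix2[x];
-;     for( int y = 0; y < 8; y++ )
-;         hadamard8( d[y] );                     /* rows */
-;     for( int x = 0; x < 8; x++ )
-;     {                                          /* columns */
-;         for( int y = 0; y < 8; y++ ) col[y] = d[y][x];
-;         hadamard8( col );
-;         for( int y = 0; y < 8; y++ ) sum += abs( col[y] );
-;     }
-;     return ( sum + 2 ) >> 2;
-; }
-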
-;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-;; violates calling convention
-cglobal x264_pixel_sa8d_16x16_%1
- xor r8d, r8d
- call x264_pixel_sa8d_8x8_%1 ; pix[0]
- lea parm1q, [parm1q+4*parm2q]
- lea parm3q, [parm3q+4*parm4q]
- call x264_pixel_sa8d_8x8_%1 ; pix[8*stride]
- lea r10, [3*parm2q-2]
- lea r11, [3*parm4q-2]
- shl r10, 2
- shl r11, 2
- sub parm1q, r10
- sub parm3q, r11
- call x264_pixel_sa8d_8x8_%1 ; pix[8]
- lea parm1q, [parm1q+4*parm2q]
- lea parm3q, [parm3q+4*parm4q]
- call x264_pixel_sa8d_8x8_%1 ; pix[8*stride+8]
- mov eax, r8d
- add eax, 1
- shr eax, 1
- ret
-%endmacro ; SATDS
-
-%define SUM8x4 SUM8x4_SSE2
-SATDS sse2
-%ifdef HAVE_SSE3
-%define SUM8x4 SUM8x4_SSSE3
-SATDS ssse3
-%endif
-
-
-
-;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
-;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_sse2
- ; 8x8 hadamard
- pxor xmm4, xmm4
- movq xmm0, [parm1q+0*FENC_STRIDE]
- movq xmm7, [parm1q+1*FENC_STRIDE]
- movq xmm6, [parm1q+2*FENC_STRIDE]
- movq xmm3, [parm1q+3*FENC_STRIDE]
- movq xmm5, [parm1q+4*FENC_STRIDE]
- movq xmm1, [parm1q+5*FENC_STRIDE]
- movq xmm8, [parm1q+6*FENC_STRIDE]
- movq xmm2, [parm1q+7*FENC_STRIDE]
- punpcklbw xmm0, xmm4
- punpcklbw xmm7, xmm4
- punpcklbw xmm6, xmm4
- punpcklbw xmm3, xmm4
- punpcklbw xmm5, xmm4
- punpcklbw xmm1, xmm4
- punpcklbw xmm8, xmm4
- punpcklbw xmm2, xmm4
- HADAMARD1x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
- TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
- HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
-
- ; dc
- movzx edi, word [parm2q+0]
- add di, word [parm2q+16]
- add edi, 8
- and edi, -16
- shl edi, 2
-
- pxor xmm15, xmm15
- movdqa xmm8, xmm2
- movdqa xmm9, xmm3
- movdqa xmm10, xmm4
- movdqa xmm11, xmm5
- SUM8x4_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
- movdqa xmm8, xmm6
- movdqa xmm9, xmm7
- SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
- movdqa xmm8, xmm1
- SUM1x8_SSE2 xmm8, xmm10, xmm15
- movdqa xmm14, xmm15 ; 7x8 sum
-
- movdqa xmm8, [parm2q+0] ; left edge
- movd xmm9, edi
- psllw xmm8, 3
- psubw xmm8, xmm0
- psubw xmm9, xmm0
- SUM1x8_SSE2 xmm8, xmm10, xmm14
- SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
- punpcklwd xmm4, xmm5
- punpcklwd xmm6, xmm7
- punpckldq xmm0, xmm2
- punpckldq xmm4, xmm6
- punpcklqdq xmm0, xmm4 ; transpose
- movdqa xmm1, [parm2q+16] ; top edge
- movdqa xmm2, xmm15
- psllw xmm1, 3
- psrldq xmm2, 2 ; 8x7 sum
- psubw xmm0, xmm1 ; 8x1 sum
- SUM1x8_SSE2 xmm0, xmm1, xmm2
-
- HADDW xmm14, xmm3
- movd eax, xmm14
- add eax, 2
- shr eax, 2
- mov [parm3q+4], eax ; i8x8_h sa8d
- HADDW xmm15, xmm4
- movd eax, xmm15
- add eax, 2
- shr eax, 2
- mov [parm3q+8], eax ; i8x8_dc sa8d
- HADDW xmm2, xmm5
- movd eax, xmm2
- add eax, 2
- shr eax, 2
- mov [parm3q+0], eax ; i8x8_v sa8d
-
- ret
-
-
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_4x4x2_core_sse2
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm2, xmm2
- pxor xmm3, xmm3
- pxor xmm4, xmm4
- movdqa xmm8, [pw_1 GLOBAL]
-%rep 4
- movq xmm5, [parm1q]
- movq xmm6, [parm3q]
- punpcklbw xmm5, xmm0
- punpcklbw xmm6, xmm0
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- movdqa xmm7, xmm5
- pmaddwd xmm5, xmm5
- pmaddwd xmm7, xmm6
- pmaddwd xmm6, xmm6
- paddd xmm3, xmm5
- paddd xmm4, xmm7
- paddd xmm3, xmm6
- add parm1q, parm2q
- add parm3q, parm4q
-%endrep
- ; PHADDW xmm1, xmm2
- ; PHADDD xmm3, xmm4
- pshufd xmm5, xmm3, 0xB1
- pmaddwd xmm1, xmm8
- pmaddwd xmm2, xmm8
- pshufd xmm6, xmm4, 0xB1
- packssdw xmm1, xmm2
- paddd xmm3, xmm5
- pshufd xmm1, xmm1, 0xD8
- paddd xmm4, xmm6
- pmaddwd xmm1, xmm8
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
- punpckhdq xmm5, xmm4
- movq [parm5q+ 0], xmm1
- movq [parm5q+ 8], xmm3
- psrldq xmm1, 8
- movq [parm5q+16], xmm1
- movq [parm5q+24], xmm5
- ret
-
-;-----------------------------------------------------------------------------
-; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_end4_sse2
- movdqa xmm0, [parm1q+ 0]
- movdqa xmm1, [parm1q+16]
- movdqa xmm2, [parm1q+32]
- movdqa xmm3, [parm1q+48]
- movdqa xmm4, [parm1q+64]
- paddd xmm0, [parm2q+ 0]
- paddd xmm1, [parm2q+16]
- paddd xmm2, [parm2q+32]
- paddd xmm3, [parm2q+48]
- paddd xmm4, [parm2q+64]
- paddd xmm0, xmm1
- paddd xmm1, xmm2
- paddd xmm2, xmm3
- paddd xmm3, xmm4
- movdqa xmm5, [ssim_c1 GLOBAL]
- movdqa xmm6, [ssim_c2 GLOBAL]
- TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
-
-; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2
- movdqa xmm1, xmm3
- pslld xmm3, 16
- pmaddwd xmm1, xmm0 ; s1*s2
- por xmm0, xmm3
- pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
- pslld xmm1, 1
- pslld xmm2, 7
- pslld xmm4, 6
- psubd xmm2, xmm1 ; covar*2
- psubd xmm4, xmm0 ; vars
- paddd xmm0, xmm5
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm4, xmm6
- cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
- cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
- cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
- cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
- mulps xmm1, xmm2
- mulps xmm0, xmm4
- divps xmm1, xmm0 ; ssim
-
- neg parm3q
-%ifdef __PIC__
- lea rax, [mask_ff + 16 GLOBAL]
- movdqu xmm3, [rax + parm3q*4]
-%else
- movdqu xmm3, [mask_ff + parm3q*4 + 16]
-%endif
- pand xmm1, xmm3
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- pshuflw xmm1, xmm0, 0xE
- addss xmm0, xmm1
- ret
-
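-; Reference only (editor's sketch): the per-window formula evaluated by
-; ssim_end4 above, in C. s1/s2 are pixel sums, ss the sum of squares and
-; s12 the cross sum, as produced by ssim_4x4x2_core; ssim_c1/ssim_c2 are
-; (.01*255)^2*64 and (.03*255)^2*64*63 to match the fixed-point scaling.
-;
-; static float ssim_end1( int s1, int s2, int ss, int s12 )
-; {
-;     int vars  = ss*64 - s1*s1 - s2*s2;
-;     int covar = s12*64 - s1*s2;
-;     return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
-;          / ( (float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2) );
-; }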
int max_extended_cap;
int cache;
+#ifndef ARCH_X86_64
if( !x264_cpu_cpuid_test() )
return 0;
+#endif
x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
if( eax == 0 )
#include "common.h"
#ifdef HAVE_MMX
-# include "i386/dct.h"
+# include "x86/dct.h"
#endif
#ifdef ARCH_PPC
# include "ppc/dct.h"
if( cpu&X264_CPU_MMX )
pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmx;
#endif
-#ifdef ARCH_X86_64
if( cpu&X264_CPU_SSE2 )
pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
#endif
-#endif
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
-#ifdef ARCH_X86_64
void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-#else
+#ifdef ARCH_X86
void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
-
-#ifdef ARCH_X86_64
+#ifdef ARCH_X86
+ pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
+ pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
+#endif
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_v_luma = x264_deblock_v_luma_sse2;
pf->deblock_h_luma = x264_deblock_h_luma_sse2;
}
-#else
- pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
- pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
-#endif
}
#endif
+++ /dev/null
-;*****************************************************************************
-;* dct.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2003 x264 project
-;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
-;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
-;* Min Chen <chenm001.163.com> (converted to nasm)
-;* Christian Heine <sennindemokrit@gmx.net> (dct8/idct8 functions)
-;* Loren Merritt <lorenm@u.washington.edu> (misc)
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-;*****************************************************************************
-;* *
-;* Revision history: *
-;* *
-;* 2004.04.28 ported all 4x4 functions to nasm (CM) *
-;* 2005.08.24 added mmxext optimized dct8/idct8 functions (CH) *
-;* *
-;*****************************************************************************
-
-BITS 32
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "i386inc.asm"
-
-%macro MMX_ZERO 1
- pxor %1, %1
-%endmacro
-
-%macro MMX_LOAD_DIFF_4P 5
- movd %1, %4
- punpcklbw %1, %3
- movd %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endmacro
-
-%macro MMX_SUMSUB_BA 2
-    paddw   %1, %2      ; %1 = a+b
-    paddw   %2, %2
-    psubw   %2, %1      ; %2 = b-a
-%endmacro
-
-%macro MMX_SUMSUB_BADC 4
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
-%endmacro
-
-%macro MMX_SUMSUB2_AB 3
-    movq    %3, %1
-    paddw   %1, %1
-    paddw   %1, %2      ; %1 = 2*a + b
-    psubw   %3, %2
-    psubw   %3, %2      ; %3 = a - 2*b
-%endmacro
-
-%macro MMX_SUMSUBD2_AB 4
-    movq    %4, %1
-    movq    %3, %2
-    psraw   %2, 1
-    psraw   %4, 1
-    paddw   %1, %2      ; %1 = a + b/2
-    psubw   %4, %3      ; %4 = a/2 - b
-%endmacro
-
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-;-----------------------------------------------------------------------------
-; input ABCD output ADTC
-;-----------------------------------------------------------------------------
-%macro MMX_TRANSPOSE 5
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
-%endmacro
-
-%macro MMX_STORE_DIFF_4P 5
- paddw %1, %3
- psraw %1, 6
- movd %2, %5
- punpcklbw %2, %4
- paddsw %1, %2
- packuswb %1, %1
- movd %5, %1
-%endmacro
-
-;=============================================================================
-; Local Data (Read Only)
-;=============================================================================
-
-SECTION_RODATA
-
-;-----------------------------------------------------------------------------
-; Various memory constants (trigonometric values or rounding values)
-;-----------------------------------------------------------------------------
-
-ALIGN 16
-pw_32: times 8 dw 32
-pw_1: times 4 dw 1
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_dct4x4dc_mmx( int16_t d[4][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx
- mov eax, [esp+ 4]
- movq mm0, [eax+ 0]
- movq mm1, [eax+ 8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
-
- picgetgot edx
-
- MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
- MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
-
- MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
-
- MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
- MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
-
- movq mm6, [pw_1 GLOBAL]
- paddw mm0, mm6
- paddw mm2, mm6
- psraw mm0, 1
- movq [eax+ 0], mm0
- psraw mm2, 1
- movq [eax+ 8], mm2
- paddw mm3, mm6
- paddw mm4, mm6
- psraw mm3, 1
- movq [eax+16], mm3
- psraw mm4, 1
- movq [eax+24], mm4
- ret
-
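What the deleted x264_dct4x4dc_mmx computes, in scalar form: a 4-point Hadamard butterfly per direction around the in-register transpose, then a (x+1)>>1 scaling via the pw_1/psraw pair. A C sketch mirroring x264's C fallback (the function name here is illustrative):

    #include <stdint.h>

    static void dct4x4dc_ref( int16_t d[4][4] )
    {
        int16_t tmp[4][4];
        for( int i = 0; i < 4; i++ )
        {   /* vertical butterflies */
            int s01 = d[0][i] + d[1][i], d01 = d[0][i] - d[1][i];
            int s23 = d[2][i] + d[3][i], d23 = d[2][i] - d[3][i];
            tmp[0][i] = s01 + s23;
            tmp[1][i] = s01 - s23;
            tmp[2][i] = d01 - d23;
            tmp[3][i] = d01 + d23;
        }
        for( int i = 0; i < 4; i++ )
        {   /* horizontal butterflies plus the rounding */
            int s01 = tmp[i][0] + tmp[i][1], d01 = tmp[i][0] - tmp[i][1];
            int s23 = tmp[i][2] + tmp[i][3], d23 = tmp[i][2] - tmp[i][3];
            d[i][0] = ( s01 + s23 + 1 ) >> 1;
            d[i][1] = ( s01 - s23 + 1 ) >> 1;
            d[i][2] = ( d01 - d23 + 1 ) >> 1;
            d[i][3] = ( d01 + d23 + 1 ) >> 1;
        }
    }
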
-;-----------------------------------------------------------------------------
-; void __cdecl x264_idct4x4dc_mmx( int16_t d[4][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_idct4x4dc_mmx
- mov eax, [esp+ 4]
- movq mm0, [eax+ 0]
- movq mm1, [eax+ 8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
-
- MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
- MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
-
- MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
-
- MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
- MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
-
- movq [eax+ 0], mm0
- movq [eax+ 8], mm2
- movq [eax+16], mm3
- movq [eax+24], mm4
- ret
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-cglobal x264_sub4x4_dct_mmx
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+12] ; pix2
-
- MMX_ZERO mm7
-
- ; Load 4 lines
- MMX_LOAD_DIFF_4P mm0, mm6, mm7, [eax+0*FENC_STRIDE], [ecx+0*FDEC_STRIDE]
- MMX_LOAD_DIFF_4P mm1, mm6, mm7, [eax+1*FENC_STRIDE], [ecx+1*FDEC_STRIDE]
- MMX_LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*FENC_STRIDE], [ecx+2*FDEC_STRIDE]
- MMX_LOAD_DIFF_4P mm3, mm6, mm7, [eax+3*FENC_STRIDE], [ecx+3*FDEC_STRIDE]
-
- MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
-
- MMX_SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12
- MMX_SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12
-
- ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
- MMX_TRANSPOSE mm2, mm0, mm3, mm4, mm1
-
- MMX_SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12
-
- MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
- MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
-
- mov eax, [esp+ 4] ; dct
- movq [eax+ 0], mm1
- movq [eax+ 8], mm2
- movq [eax+16], mm3
- movq [eax+24], mm0
-
- ret
-
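Scalar sketch of the forward transform above: subtract the prediction, then run the s03/s12 butterfly twice (rows, then columns via the transposed temporary). FENC_STRIDE (16) and FDEC_STRIDE (32) are x264's fixed plane strides; the function name is illustrative:

    #include <stdint.h>

    #define FENC_STRIDE 16
    #define FDEC_STRIDE 32

    static void sub4x4_dct_ref( int16_t dct[4][4],
                                const uint8_t *pix1, const uint8_t *pix2 )
    {
        int16_t d[4][4], tmp[4][4];

        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
                d[y][x] = pix1[y*FENC_STRIDE+x] - pix2[y*FDEC_STRIDE+x];

        for( int i = 0; i < 4; i++ )
        {
            int s03 = d[i][0] + d[i][3], d03 = d[i][0] - d[i][3];
            int s12 = d[i][1] + d[i][2], d12 = d[i][1] - d[i][2];
            tmp[0][i] =   s03 +   s12;   /* mm1 = s03+s12 above   */
            tmp[1][i] = 2*d03 +   d12;   /* mm2 = 2.d03+d12 above */
            tmp[2][i] =   s03 -   s12;
            tmp[3][i] =   d03 - 2*d12;
        }
        for( int i = 0; i < 4; i++ )
        {
            int s03 = tmp[i][0] + tmp[i][3], d03 = tmp[i][0] - tmp[i][3];
            int s12 = tmp[i][1] + tmp[i][2], d12 = tmp[i][1] - tmp[i][2];
            dct[i][0] =   s03 +   s12;
            dct[i][1] = 2*d03 +   d12;
            dct[i][2] =   s03 -   s12;
            dct[i][3] =   d03 - 2*d12;
        }
    }
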
-;-----------------------------------------------------------------------------
-; void __cdecl x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx
- ; Load dct coeffs
- mov eax, [esp+ 8] ; dct
- movq mm0, [eax+ 0]
- movq mm1, [eax+ 8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
-
- mov eax, [esp+ 4] ; p_dst
-
- picgetgot edx
-
- MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
- MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( s13 = 1+(3>>1), d13 = (1>>1)-3 )
-
- MMX_SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13
-
- ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0
- MMX_TRANSPOSE mm1, mm4, mm0, mm2, mm3
-
- MMX_SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02
- MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( s13 = 1+(3>>1), d13 = (1>>1)-3 )
-
- MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
-
- MMX_ZERO mm7
- movq mm6, [pw_32 GLOBAL]
-
- MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax+0*FDEC_STRIDE]
- MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+1*FDEC_STRIDE]
- MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+2*FDEC_STRIDE]
- MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+3*FDEC_STRIDE]
-
- ret
-
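The inverse direction, with the pw_32/psraw 6 rounding and the packuswb saturation of MMX_STORE_DIFF_4P written out (sketch; same illustrative naming):

    #include <stdint.h>

    #define FDEC_STRIDE 32

    static inline uint8_t clip_uint8( int x )
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;
    }

    static void add4x4_idct_ref( uint8_t *p_dst, int16_t dct[4][4] )
    {
        int16_t tmp[4][4], d[4][4];

        for( int i = 0; i < 4; i++ )
        {
            int s02 =  dct[0][i]     +  dct[2][i];
            int d02 =  dct[0][i]     -  dct[2][i];
            int s13 =  dct[1][i]     + (dct[3][i]>>1);
            int d13 = (dct[1][i]>>1) -  dct[3][i];
            tmp[i][0] = s02 + s13;
            tmp[i][1] = d02 + d13;
            tmp[i][2] = d02 - d13;
            tmp[i][3] = s02 - s13;
        }
        for( int i = 0; i < 4; i++ )
        {
            int s02 =  tmp[0][i]     +  tmp[2][i];
            int d02 =  tmp[0][i]     -  tmp[2][i];
            int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
            int d13 = (tmp[1][i]>>1) -  tmp[3][i];
            d[0][i] = ( s02 + s13 + 32 ) >> 6;   /* pw_32 + psraw 6 */
            d[1][i] = ( d02 + d13 + 32 ) >> 6;
            d[2][i] = ( d02 - d13 + 32 ) >> 6;
            d[3][i] = ( s02 - s13 + 32 ) >> 6;
        }
        for( int y = 0; y < 4; y++, p_dst += FDEC_STRIDE )
            for( int x = 0; x < 4; x++ )
                p_dst[x] = clip_uint8( p_dst[x] + d[y][x] );
    }
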
-
-
-; =============================================================================
-; 8x8 Transform
-; =============================================================================
-
-; -----------------------------------------------------------------------------
-; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2)
-; -----------------------------------------------------------------------------
-%macro MMX_LOAD_DIFF_8P 7
- movq %1, %5
- movq %2, %1
- punpcklbw %1, %7
- punpckhbw %2, %7
- movq %3, %6
- movq %4, %3
- punpcklbw %3, %7
- punpckhbw %4, %7
- psubw %1, %3
- psubw %2, %4
-%endmacro
-
-%macro MMX_LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4
- movq %2, %3
- movq %1, %4
- MMX_SUMSUB_BA %1, %2
-%endmacro
-
-%macro MMX_STORE_DIFF_8P 4
- psraw %1, 6
- movq %3, %2
- punpcklbw %3, %4
- paddsw %1, %3
- packuswb %1, %1
- movq %2, %1
-%endmacro
-
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
-;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_sub_8x8_mmx:
-
- mov edx, [esp+ 4] ; diff
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+12] ; pix2
-
- MMX_ZERO mm7
-
- %assign disp 0
- %rep 8
- MMX_LOAD_DIFF_8P mm0, mm1, mm2, mm3, [eax], [ecx], mm7
- movq [edx+disp], mm0
- movq [edx+disp+8], mm1
- add eax, FENC_STRIDE
- add ecx, FDEC_STRIDE
- %assign disp disp+16
- %endrep
-
- ret
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
-ALIGN 16
-x264_ydct8_mmx:
-
- mov eax, [esp+04] ; dest
-
- ;-------------------------------------------------------------------------
- ; vertical dct ( compute 4 columns at a time -> 2 loops )
- ;-------------------------------------------------------------------------
-
- %assign disp 0
- %rep 2
-
- MMX_LOADSUMSUB mm2, mm3, [eax+disp+0*16], [eax+disp+7*16] ; mm2 = s07, mm3 = d07
- MMX_LOADSUMSUB mm1, mm5, [eax+disp+1*16], [eax+disp+6*16] ; mm1 = s16, mm5 = d16
- MMX_LOADSUMSUB mm0, mm6, [eax+disp+2*16], [eax+disp+5*16] ; mm0 = s25, mm6 = d25
- MMX_LOADSUMSUB mm4, mm7, [eax+disp+3*16], [eax+disp+4*16] ; mm4 = s34, mm7 = d34
-
- MMX_SUMSUB_BA mm4, mm2 ; mm4 = a0, mm2 = a2
- MMX_SUMSUB_BA mm0, mm1 ; mm0 = a1, mm1 = a3
- MMX_SUMSUB_BA mm0, mm4 ; mm0 = dst0, mm4 = dst4
-
- movq [eax+disp+0*16], mm0
- movq [eax+disp+4*16], mm4
-
- movq mm0, mm1 ; a3
- psraw mm0, 1 ; a3>>1
- paddw mm0, mm2 ; a2 + (a3>>1)
- psraw mm2, 1 ; a2>>1
- psubw mm2, mm1 ; (a2>>1) - a3
-
- movq [eax+disp+2*16], mm0
- movq [eax+disp+6*16], mm2
-
- movq mm0, mm6
- psraw mm0, 1
- paddw mm0, mm6 ; d25+(d25>>1)
- movq mm1, mm3
- psubw mm1, mm7 ; a5 = d07-d34-(d25+(d25>>1))
- psubw mm1, mm0
-
- movq mm0, mm5
- psraw mm0, 1
- paddw mm0, mm5 ; d16+(d16>>1)
- movq mm2, mm3
- paddw mm2, mm7 ; a6 = d07+d34-(d16+(d16>>1))
- psubw mm2, mm0
-
- movq mm0, mm3
- psraw mm0, 1
- paddw mm0, mm3 ; d07+(d07>>1)
- paddw mm0, mm5
- paddw mm0, mm6 ; a4 = d16+d25+(d07+(d07>>1))
-
- movq mm3, mm7
- psraw mm3, 1
- paddw mm3, mm7 ; d34+(d34>>1)
- paddw mm3, mm5
- psubw mm3, mm6 ; a7 = d16-d25+(d34+(d34>>1))
-
- movq mm7, mm3
- psraw mm7, 2
- paddw mm7, mm0 ; a4 + (a7>>2)
-
- movq mm6, mm2
- psraw mm6, 2
- paddw mm6, mm1 ; a5 + (a6>>2)
-
- psraw mm0, 2
- psraw mm1, 2
- psubw mm0, mm3 ; (a4>>2) - a7
- psubw mm2, mm1 ; a6 - (a5>>2)
-
- movq [eax+disp+1*16], mm7
- movq [eax+disp+3*16], mm6
- movq [eax+disp+5*16], mm2
- movq [eax+disp+7*16], mm0
-
- %assign disp disp+8
- %endrep
-
- ret
-
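The a0..a7 names in the comments above follow the H.264 8x8 forward butterfly. As a 1-D kernel in C (the asm applies it to four columns at a time, twice per block; sketch):

    #include <stdint.h>

    static void dct8_1d_ref( int16_t dst[8], const int16_t s[8] )
    {
        int s07 = s[0] + s[7], d07 = s[0] - s[7];
        int s16 = s[1] + s[6], d16 = s[1] - s[6];
        int s25 = s[2] + s[5], d25 = s[2] - s[5];
        int s34 = s[3] + s[4], d34 = s[3] - s[4];
        int a0 = s07 + s34;
        int a1 = s16 + s25;
        int a2 = s07 - s34;
        int a3 = s16 - s25;
        int a4 = d16 + d25 + d07 + (d07>>1);
        int a5 = d07 - d34 - (d25 + (d25>>1));
        int a6 = d07 + d34 - (d16 + (d16>>1));
        int a7 = d16 - d25 + d34 + (d34>>1);
        dst[0] = a0 + a1;
        dst[1] = a4 + (a7>>2);
        dst[2] = a2 + (a3>>1);
        dst[3] = a5 + (a6>>2);
        dst[4] = a0 - a1;
        dst[5] = a6 - (a5>>2);
        dst[6] = (a2>>1) - a3;
        dst[7] = (a4>>2) - a7;
    }
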
-;-----------------------------------------------------------------------------
-; void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
-ALIGN 16
-x264_yidct8_mmx:
-
- mov eax, [esp+04] ; dest
-
- ;-------------------------------------------------------------------------
- ; vertical idct ( compute 4 columns at a time -> 2 loops )
- ;-------------------------------------------------------------------------
-
- %assign disp 0
- %rep 2
-
- movq mm1, [eax+disp+1*16] ; mm1 = d1
- movq mm3, [eax+disp+3*16] ; mm3 = d3
- movq mm5, [eax+disp+5*16] ; mm5 = d5
- movq mm7, [eax+disp+7*16] ; mm7 = d7
-
- movq mm4, mm7
- psraw mm4, 1
- movq mm0, mm5
- psubw mm0, mm7
- psubw mm0, mm4
- psubw mm0, mm3 ; mm0 = e1
-
- movq mm6, mm3
- psraw mm6, 1
- movq mm2, mm7
- psubw mm2, mm6
- psubw mm2, mm3
- paddw mm2, mm1 ; mm2 = e3
-
- movq mm4, mm5
- psraw mm4, 1
- paddw mm4, mm5
- paddw mm4, mm7
- psubw mm4, mm1 ; mm4 = e5
-
- movq mm6, mm1
- psraw mm6, 1
- paddw mm6, mm1
- paddw mm6, mm5
- paddw mm6, mm3 ; mm6 = e7
-
- movq mm1, mm0
- movq mm3, mm4
- movq mm5, mm2
- movq mm7, mm6
- psraw mm6, 2
- psraw mm3, 2
- psraw mm5, 2
- psraw mm0, 2
- paddw mm1, mm6 ; mm1 = f1
- paddw mm3, mm2 ; mm3 = f3
- psubw mm5, mm4 ; mm5 = f5
- psubw mm7, mm0 ; mm7 = f7
-
- movq mm2, [eax+disp+2*16] ; mm2 = d2
- movq mm6, [eax+disp+6*16] ; mm6 = d6
- movq mm4, mm2
- movq mm0, mm6
- psraw mm4, 1
- psraw mm6, 1
- psubw mm4, mm0 ; mm4 = a4
- paddw mm6, mm2 ; mm6 = a6
-
- movq mm2, [eax+disp+0*16] ; mm2 = d0
- movq mm0, [eax+disp+4*16] ; mm0 = d4
- MMX_SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2
-
- MMX_SUMSUB_BADC mm6, mm0, mm4, mm2 ; mm6 = f0, mm0 = f6
- ; mm4 = f2, mm2 = f4
-
- MMX_SUMSUB_BADC mm7, mm6, mm5, mm4 ; mm7 = g0, mm6 = g7
- ; mm5 = g1, mm4 = g6
- MMX_SUMSUB_BADC mm3, mm2, mm1, mm0 ; mm3 = g2, mm2 = g5
- ; mm1 = g3, mm0 = g4
-
- movq [eax+disp+0*16], mm7
- movq [eax+disp+1*16], mm5
- movq [eax+disp+2*16], mm3
- movq [eax+disp+3*16], mm1
- movq [eax+disp+4*16], mm0
- movq [eax+disp+5*16], mm2
- movq [eax+disp+6*16], mm4
- movq [eax+disp+7*16], mm6
-
- %assign disp disp+8
- %endrep
-
- ret
-
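The matching inverse kernel, using the e/f/g intermediate names from the comments above (C sketch):

    #include <stdint.h>

    static void idct8_1d_ref( int16_t dst[8], const int16_t d[8] )
    {
        int a0 =  d[0] + d[4],     a2 = d[0] - d[4];
        int a4 = (d[2]>>1) - d[6], a6 = (d[6]>>1) + d[2];
        int e1 = -d[3] + d[5] - d[7] - (d[7]>>1);
        int e3 =  d[1] + d[7] - d[3] - (d[3]>>1);
        int e5 = -d[1] + d[7] + d[5] + (d[5]>>1);
        int e7 =  d[3] + d[5] + d[1] + (d[1]>>1);
        int f0 = a0 + a6, f6 = a0 - a6;
        int f2 = a2 + a4, f4 = a2 - a4;
        int f1 = e1 + (e7>>2);
        int f3 = e3 + (e5>>2);
        int f5 = (e3>>2) - e5;
        int f7 = e7 - (e1>>2);
        dst[0] = f0 + f7;  dst[7] = f0 - f7;   /* g0, g7 */
        dst[1] = f2 + f5;  dst[6] = f2 - f5;   /* g1, g6 */
        dst[2] = f4 + f3;  dst[5] = f4 - f3;   /* g2, g5 */
        dst[3] = f6 + f1;  dst[4] = f6 - f1;   /* g3, g4 */
    }
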
-;-----------------------------------------------------------------------------
-; void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
-;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_add_8x8_mmx:
- mov eax, [esp+4] ; dst
- mov edx, [esp+8] ; src
-
- MMX_ZERO mm7
-
- %assign disp 0
- %rep 8
- movq mm0, [eax]
- movq mm2, [edx+disp]
- movq mm3, [edx+disp+8]
- movq mm1, mm0
- psraw mm2, 6
- psraw mm3, 6
- punpcklbw mm0, mm7
- punpckhbw mm1, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- packuswb mm0, mm1
- movq [eax], mm0
- add eax, FDEC_STRIDE
- %assign disp disp+16
- %endrep
- ret
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_transpose_8x8_mmx( int16_t src[8][8] );
-;-----------------------------------------------------------------------------
-ALIGN 16
-x264_transpose_8x8_mmx:
- mov eax, [esp+4]
-
- movq mm0, [eax ]
- movq mm1, [eax+ 16]
- movq mm2, [eax+ 32]
- movq mm3, [eax+ 48]
- MMX_TRANSPOSE mm0, mm1, mm2, mm3, mm4
- movq [eax ], mm0
- movq [eax+ 16], mm3
- movq [eax+ 32], mm4
- movq [eax+ 48], mm2
-
- movq mm0, [eax+ 72]
- movq mm1, [eax+ 88]
- movq mm2, [eax+104]
- movq mm3, [eax+120]
- MMX_TRANSPOSE mm0, mm1, mm2, mm3, mm4
- movq [eax+ 72], mm0
- movq [eax+ 88], mm3
- movq [eax+104], mm4
- movq [eax+120], mm2
-
- movq mm0, [eax+ 8]
- movq mm1, [eax+ 24]
- movq mm2, [eax+ 40]
- movq mm3, [eax+ 56]
- MMX_TRANSPOSE mm0, mm1, mm2, mm3, mm4
- movq mm1, [eax+ 64]
- movq mm5, [eax+ 80]
- movq mm6, [eax+ 96]
- movq mm7, [eax+112]
-
- movq [eax+ 64], mm0
- movq [eax+ 80], mm3
- movq [eax+ 96], mm4
- movq [eax+112], mm2
- MMX_TRANSPOSE mm1, mm5, mm6, mm7, mm4
- movq [eax+ 8], mm1
- movq [eax+ 24], mm7
- movq [eax+ 40], mm4
- movq [eax+ 56], mm6
-
- ret
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_mmx
- push dword [esp+12]
- push dword [esp+12]
- push dword [esp+12]
- call x264_pixel_sub_8x8_mmx
- call x264_ydct8_mmx
- call x264_transpose_8x8_mmx
- add esp, 12
- jmp x264_ydct8_mmx
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
-;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_mmx
- mov eax, [esp+8]
- add word [eax], 32
- push eax
- call x264_yidct8_mmx
- call x264_transpose_8x8_mmx
- call x264_yidct8_mmx
- add esp, 4
- jmp x264_pixel_add_8x8_mmx
-
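The two wrappers above are pure composition: a vertical pass, a transpose, and a second vertical pass (which is the horizontal pass of the separable transform). The add word [eax], 32 in the idct wrapper is the rounding trick: the inverse transform is linear and the DC basis reaches every sample with weight 1, so biasing dct[0][0] by 32 gives all 64 outputs the +32 they need before x264_pixel_add_8x8_mmx's >>6. In C terms (sketch, calling the routines defined above):

    #include <stdint.h>

    void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
    void x264_ydct8_mmx( int16_t dest[8][8] );
    void x264_yidct8_mmx( int16_t dest[8][8] );
    void x264_transpose_8x8_mmx( int16_t src[8][8] );
    void x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );

    static void sub8x8_dct8_sketch( int16_t dct[8][8],
                                    uint8_t *pix1, uint8_t *pix2 )
    {
        x264_pixel_sub_8x8_mmx( (int16_t*)dct, pix1, pix2 );
        x264_ydct8_mmx( dct );          /* vertical pass */
        x264_transpose_8x8_mmx( dct );
        x264_ydct8_mmx( dct );          /* horizontal pass */
    }

    static void add8x8_idct8_sketch( uint8_t *dst, int16_t dct[8][8] )
    {
        dct[0][0] += 32;                /* fold the (x+32)>>6 rounding bias
                                           into the DC coefficient */
        x264_yidct8_mmx( dct );
        x264_transpose_8x8_mmx( dct );
        x264_yidct8_mmx( dct );
        x264_pixel_add_8x8_mmx( dst, dct );  /* >>6, add prediction, clip */
    }
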
-%macro IDCT8_1D 8
- movdqa %1, %3
- movdqa %5, %7
- psraw %3, 1
- psraw %7, 1
- psubw %3, %5
- paddw %7, %1
- movdqa %5, %2
- psraw %5, 1
- paddw %5, %2
- paddw %5, %4
- paddw %5, %6
- movdqa %1, %6
- psraw %1, 1
- paddw %1, %6
- paddw %1, %8
- psubw %1, %2
- psubw %2, %4
- psubw %6, %4
- paddw %2, %8
- psubw %6, %8
- psraw %4, 1
- psraw %8, 1
- psubw %2, %4
- psubw %6, %8
- movdqa %4, %5
- movdqa %8, %1
- psraw %4, 2
- psraw %8, 2
- paddw %4, %6
- paddw %8, %2
- psraw %6, 2
- psraw %2, 2
- psubw %5, %6
- psubw %2, %1
- movdqa %1, [eax+0x00]
- movdqa %6, [eax+0x40]
- MMX_SUMSUB_BA %6, %1
- MMX_SUMSUB_BA %7, %6
- MMX_SUMSUB_BA %3, %1
- MMX_SUMSUB_BA %5, %7
- MMX_SUMSUB_BA %2, %3
- MMX_SUMSUB_BA %8, %1
- MMX_SUMSUB_BA %4, %6
-%endmacro
-
-%macro TRANSPOSE8 9
- movdqa [%9], %8
- SBUTTERFLY dqa, wd, %1, %2, %8
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- movdqa [%9], %8
- movdqa %8, [16+%9]
- SBUTTERFLY dqa, dq, %8, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %8, %4, %5
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
- movdqa %7, [%9+16]
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
-;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2
- mov ecx, [esp+4]
- mov eax, [esp+8]
- movdqa xmm1, [eax+0x10]
- movdqa xmm2, [eax+0x20]
- movdqa xmm3, [eax+0x30]
- movdqa xmm5, [eax+0x50]
- movdqa xmm6, [eax+0x60]
- movdqa xmm7, [eax+0x70]
- IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8 xmm4, xmm1, xmm7, xmm3, xmm5, xmm0, xmm2, xmm6, eax
- picgetgot edx
- paddw xmm4, [pw_32 GLOBAL]
- movdqa [eax+0x00], xmm4
- movdqa [eax+0x40], xmm2
- IDCT8_1D xmm4, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7, xmm1
- movdqa [eax+0x60], xmm6
- movdqa [eax+0x70], xmm7
- pxor xmm7, xmm7
- MMX_STORE_DIFF_8P xmm2, [ecx+FDEC_STRIDE*0], xmm6, xmm7
- MMX_STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*1], xmm6, xmm7
- MMX_STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*2], xmm6, xmm7
- MMX_STORE_DIFF_8P xmm3, [ecx+FDEC_STRIDE*3], xmm6, xmm7
- MMX_STORE_DIFF_8P xmm5, [ecx+FDEC_STRIDE*4], xmm6, xmm7
- MMX_STORE_DIFF_8P xmm4, [ecx+FDEC_STRIDE*5], xmm6, xmm7
- movdqa xmm0, [eax+0x60]
- movdqa xmm1, [eax+0x70]
- MMX_STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7
- MMX_STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7
- ret
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
-; uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-%macro SUB_NxN_DCT 4
-cglobal %1
- mov edx, [esp+12]
- mov ecx, [esp+ 8]
- mov eax, [esp+ 4]
- add edx, %4
- add ecx, %4
- add eax, %3
- push edx
- push ecx
- push eax
- call %2
- add dword [esp+0], %3
- add dword [esp+4], %4*FENC_STRIDE-%4
- add dword [esp+8], %4*FDEC_STRIDE-%4
- call %2
- add dword [esp+0], %3
- add dword [esp+4], %4
- add dword [esp+8], %4
- call %2
- add esp, 12
- jmp %2
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
-;-----------------------------------------------------------------------------
-%macro ADD_NxN_IDCT 4
-cglobal %1
- mov ecx, [esp+8]
- mov eax, [esp+4]
- add ecx, %3
- add eax, %4
- push ecx
- push eax
- call %2
- add dword [esp+0], %4*FDEC_STRIDE-%4
- add dword [esp+4], %3
- call %2
- add dword [esp+0], %4
- add dword [esp+4], %3
- call %2
- add esp, 8
- jmp %2
-%endmacro
-
-SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx, 32, 4
-ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx, 32, 4
-
-SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx, 128, 8
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx, 128, 8
-
-SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx, 128, 8
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8
-
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8
-
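SUB_NxN_DCT and ADD_NxN_IDCT tile a transform over the four quadrants of a block: three calls with the dct pointer advanced by %3 bytes and the pixel pointers stepped by %4, then a tail jmp that handles quadrant 0 with the caller's original arguments. The 8x8-from-4x4 instance is equivalent to this C (sketch, reusing sub4x4_dct_ref and the stride constants from the earlier sketch):

    static void sub8x8_dct_ref( int16_t dct[4][4][4],
                                uint8_t *pix1, uint8_t *pix2 )
    {
        sub4x4_dct_ref( dct[0], pix1,                 pix2 );
        sub4x4_dct_ref( dct[1], pix1+4,               pix2+4 );
        sub4x4_dct_ref( dct[2], pix1+4*FENC_STRIDE,   pix2+4*FDEC_STRIDE );
        sub4x4_dct_ref( dct[3], pix1+4*FENC_STRIDE+4, pix2+4*FDEC_STRIDE+4 );
    }

ADD_NxN_IDCT expands the same way with (pix, dct) arguments, and the 16x16 variants apply the 8x8 routines to the four 8x8 quadrants in turn.
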
-;-----------------------------------------------------------------------------
-; void __cdecl x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_field_mmx
- mov edx, [esp+8]
- mov ecx, [esp+4]
- punpcklwd mm0, [edx]
- punpckhwd mm1, [edx]
- punpcklwd mm2, [edx+8]
- punpckhwd mm3, [edx+8]
- punpcklwd mm4, [edx+16]
- punpckhwd mm5, [edx+16]
- punpcklwd mm6, [edx+24]
- punpckhwd mm7, [edx+24]
- psrad mm0, 16
- psrad mm1, 16
- psrad mm2, 16
- psrad mm3, 16
- psrad mm4, 16
- psrad mm5, 16
- psrad mm6, 16
- psrad mm7, 16
- movq [ecx ], mm0
- movq [ecx+16], mm2
- movq [ecx+24], mm3
- movq [ecx+32], mm4
- movq [ecx+40], mm5
- movq [ecx+48], mm6
- movq [ecx+56], mm7
- movq [ecx+12], mm1
- movd [ecx+ 8], mm2
- ret
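
Decoding the interleave/shift/store sequence above: each punpcklwd + psrad 16 pair sign-extends two int16 coefficients to int32, and the two overlapping stores at [ecx+12] and [ecx+8] produce the field-scan permutation. A scalar sketch (the scan table is reconstructed from the stores; treat it as illustrative):

    #include <stdint.h>

    static void zigzag_scan_4x4_field_ref( int level[16], const int16_t dct[16] )
    {
        /* a straight copy except that coefficient 4 (first of row 1)
           is pulled forward to position 2 */
        static const uint8_t scan[16] =
            { 0, 1, 4, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
        for( int i = 0; i < 16; i++ )
            level[i] = dct[scan[i]];
    }
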
+++ /dev/null
-;*****************************************************************************
-;* deblock-a.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2005 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-BITS 32
-
-%include "i386inc.asm"
-
-SECTION_RODATA
-pb_01: times 8 db 0x01
-pb_03: times 8 db 0x03
-pb_a1: times 8 db 0xa1
-
-SECTION .text
-
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
- [base], [base+stride], [base+stride*2], [base3], \
- [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
-; in: 8 rows of 4 bytes in %1..%8
-; out: 4 rows of 8 bytes in mm0..mm3
-%macro TRANSPOSE4x8_LOAD 8
- movd mm0, %1
- movd mm2, %2
- movd mm1, %3
- movd mm3, %4
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
- movq mm2, mm0
- punpcklwd mm0, mm1
- punpckhwd mm2, mm1
-
- movd mm4, %5
- movd mm6, %6
- movd mm5, %7
- movd mm7, %8
- punpcklbw mm4, mm6
- punpcklbw mm5, mm7
- movq mm6, mm4
- punpcklwd mm4, mm5
- punpckhwd mm6, mm5
-
- movq mm1, mm0
- movq mm3, mm2
- punpckldq mm0, mm4
- punpckhdq mm1, mm4
- punpckldq mm2, mm6
- punpckhdq mm3, mm6
-%endmacro
-
-; in: 4 rows of 8 bytes in mm0..mm3
-; out: 8 rows of 4 bytes in %1..%8
-%macro TRANSPOSE8x4_STORE 8
- movq mm4, mm0
- movq mm5, mm1
- movq mm6, mm2
- punpckhdq mm4, mm4
- punpckhdq mm5, mm5
- punpckhdq mm6, mm6
-
- punpcklbw mm0, mm1
- punpcklbw mm2, mm3
- movq mm1, mm0
- punpcklwd mm0, mm2
- punpckhwd mm1, mm2
- movd %1, mm0
- punpckhdq mm0, mm0
- movd %2, mm0
- movd %3, mm1
- punpckhdq mm1, mm1
- movd %4, mm1
-
- punpckhdq mm3, mm3
- punpcklbw mm4, mm5
- punpcklbw mm6, mm3
- movq mm5, mm4
- punpcklwd mm4, mm6
- punpckhwd mm5, mm6
- movd %5, mm4
- punpckhdq mm4, mm4
- movd %6, mm4
- movd %7, mm5
- punpckhdq mm5, mm5
- movd %8, mm5
-%endmacro
-
-%macro SBUTTERFLY 4
- movq %4, %2
- punpckl%1 %2, %3
- punpckh%1 %4, %3
-%endmacro
-
-; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
-; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
-%macro TRANSPOSE6x8_MEM 9
- movq mm0, %1
- movq mm1, %3
- movq mm2, %5
- movq mm3, %7
- SBUTTERFLY bw, mm0, %2, mm4
- SBUTTERFLY bw, mm1, %4, mm5
- SBUTTERFLY bw, mm2, %6, mm6
- movq [%9+0x10], mm5
- SBUTTERFLY bw, mm3, %8, mm7
- SBUTTERFLY wd, mm0, mm1, mm5
- SBUTTERFLY wd, mm2, mm3, mm1
- punpckhdq mm0, mm2
- movq [%9+0x00], mm0
- SBUTTERFLY wd, mm4, [%9+0x10], mm3
- SBUTTERFLY wd, mm6, mm7, mm2
- SBUTTERFLY dq, mm4, mm6, mm0
- SBUTTERFLY dq, mm5, mm1, mm7
- punpckldq mm3, mm2
- movq [%9+0x10], mm5
- movq [%9+0x20], mm7
- movq [%9+0x30], mm4
- movq [%9+0x40], mm0
- movq [%9+0x50], mm3
-%endmacro
-
-; out: %4 = |%1-%2|>%3
-; clobbers: %5
-%macro DIFF_GT_MMX 5
- movq %5, %2
- movq %4, %1
- psubusb %5, %1
- psubusb %4, %2
- por %4, %5
- psubusb %4, %3
-%endmacro
-
-%macro DIFF_GT2_MMX 5
- movq %5, %2
- movq %4, %1
- psubusb %5, %1
- psubusb %4, %2
- psubusb %5, %3
- psubusb %4, %3
- pcmpeqb %4, %5
-%endmacro
-
-; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1
-; out: mm5=beta-1, mm7=mask
-; clobbers: mm4,mm6
-%macro LOAD_MASK_MMX 2
- movd mm4, %1
- movd mm5, %2
- pshufw mm4, mm4, 0
- pshufw mm5, mm5, 0
- packuswb mm4, mm4 ; 8x alpha-1
- packuswb mm5, mm5 ; 8x beta-1
- DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1
- DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1
- por mm7, mm4
- DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1
- por mm7, mm4
- pxor mm6, mm6
- pcmpeqb mm7, mm6
-%endmacro
-
-; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
-; out: mm1=p0' mm2=q0'
-; clobbers: mm0,3-6
-%macro DEBLOCK_P0_Q0_MMX 0
- movq mm5, mm1
- pxor mm5, mm2 ; p0^q0
- pand mm5, [pb_01 GLOBAL] ; (p0^q0)&1
- pcmpeqb mm4, mm4
- pxor mm3, mm4
- pavgb mm3, mm0 ; (p1 - q1 + 256)>>1
- pavgb mm3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
- pxor mm4, mm1
- pavgb mm4, mm2 ; (q0 - p0 + 256)>>1
- pavgb mm3, mm5
- paddusb mm3, mm4 ; d+128+33
- movq mm6, [pb_a1 GLOBAL]
- psubusb mm6, mm3
- psubusb mm3, [pb_a1 GLOBAL]
- pminub mm6, mm7
- pminub mm3, mm7
- psubusb mm1, mm6
- psubusb mm2, mm3
- paddusb mm1, mm3
- paddusb mm2, mm6
-%endmacro
-
-; in: mm1=p0 mm2=q0
-; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
-; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
-; clobbers: q2, tmp, tc0
-%macro LUMA_Q1_MMX 6
- movq %6, mm1
- pavgb %6, mm2
- pavgb %2, %6 ; avg(p2,avg(p0,q0))
- pxor %6, %3
- pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
- psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
- movq %6, %1
- psubusb %6, %5
- paddusb %5, %1
- pmaxub %2, %6
- pminub %2, %5
- movq %4, %2
-%endmacro
-
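Taken together, LOAD_MASK_MMX, DEBLOCK_P0_Q0_MMX and LUMA_Q1_MMX implement the standard bS<4 luma filter. Per pixel, the arithmetic is the following (C sketch; the alpha/beta tests are the strict compares LOAD_MASK_MMX builds from alpha-1/beta-1, and tc0 < 0 marks pixels the tc mask disables):

    #include <stdint.h>
    #include <stdlib.h>

    static inline int clip3( int v, int lo, int hi )
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* pix points at q0; xstride steps across the edge */
    static void deblock_luma_pixel( uint8_t *pix, int xstride,
                                    int alpha, int beta, int tc0 )
    {
        int p2 = pix[-3*xstride], p1 = pix[-2*xstride], p0 = pix[-1*xstride];
        int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride], q2 = pix[ 2*xstride];
        int tc = tc0, delta;

        if( tc0 < 0 )
            return;
        if( abs(p0-q0) >= alpha || abs(p1-p0) >= beta || abs(q1-q0) >= beta )
            return;
        if( abs(p2-p0) < beta )   /* LUMA_Q1 on the p side; tc extends by 1 */
        {
            pix[-2*xstride] = clip3( ( p2 + ((p0+q0+1)>>1) ) >> 1,
                                     p1-tc0, p1+tc0 );
            tc++;
        }
        if( abs(q2-q0) < beta )   /* LUMA_Q1 on the q side */
        {
            pix[ 1*xstride] = clip3( ( q2 + ((p0+q0+1)>>1) ) >> 1,
                                     q1-tc0, q1+tc0 );
            tc++;
        }
        /* DEBLOCK_P0_Q0: the pavgb chains compute this delta in 8 bits */
        delta = clip3( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc );
        pix[-1*xstride] = clip3( p0 + delta, 0, 255 );
        pix[ 0*xstride] = clip3( q0 - delta, 0, 255 );
    }
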
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_v8_luma_mmxext
- picpush ebx
- picgetgot ebx
- push edi
- push esi
- mov edi, [picesp+12] ; pix
- mov esi, [picesp+16] ; stride
- mov edx, [picesp+20] ; alpha
- mov ecx, [picesp+24] ; beta
- dec edx
- dec ecx
- mov eax, edi
- sub eax, esi
- sub eax, esi
- sub eax, esi ; pix-3*stride
- sub esp, 16
-
- movq mm0, [eax+esi] ; p1
- movq mm1, [eax+2*esi] ; p0
- movq mm2, [edi] ; q0
- movq mm3, [edi+esi] ; q1
- LOAD_MASK_MMX edx, ecx
-
- mov ecx, [picesp+44] ; tc0, use only the low 16 bits
- movd mm4, [ecx]
- punpcklbw mm4, mm4
- punpcklbw mm4, mm4 ; tc = 4x tc0[1], 4x tc0[0]
- movq [esp+8], mm4 ; tc
- pcmpeqb mm3, mm3
- pcmpgtb mm4, mm3
- pand mm4, mm7
- movq [esp+0], mm4 ; mask
-
- movq mm3, [eax] ; p2
- DIFF_GT2_MMX mm1, mm3, mm5, mm6, mm7 ; |p2-p0| > beta-1
- pand mm6, mm4
- pand mm4, [esp+8] ; tc
- movq mm7, mm4
- psubb mm7, mm6
- pand mm6, mm4
- LUMA_Q1_MMX mm0, mm3, [eax], [eax+esi], mm6, mm4
-
- movq mm4, [edi+2*esi] ; q2
- DIFF_GT2_MMX mm2, mm4, mm5, mm6, mm3 ; |q2-q0| > beta-1
- movq mm5, [esp+0] ; mask
- pand mm6, mm5
- movq mm5, [esp+8] ; tc
- pand mm5, mm6
- psubb mm7, mm6
- movq mm3, [edi+esi]
- LUMA_Q1_MMX mm3, mm4, [edi+2*esi], [edi+esi], mm5, mm6
-
- DEBLOCK_P0_Q0_MMX
- movq [eax+2*esi], mm1
- movq [edi], mm2
-
- add esp, 16
- pop esi
- pop edi
- picpop ebx
- ret
-
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_mmxext
- push ebx
- push ebp
- mov eax, [esp+12] ; pix
- mov ebx, [esp+16] ; stride
- lea ebp, [ebx+ebx*2]
- sub eax, 4
- lea ecx, [eax+ebp]
- sub esp, 96
-%define pix_tmp esp
-
- ; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(eax, ecx, ebx, ebp), pix_tmp
- lea eax, [eax+ebx*8]
- lea ecx, [ecx+ebx*8]
- TRANSPOSE6x8_MEM PASS8ROWS(eax, ecx, ebx, ebp), pix_tmp+8
-
- ; vertical filter
- push dword [esp+124] ; tc0
- push dword [esp+124] ; beta
- push dword [esp+124] ; alpha
- push dword 16
- push dword pix_tmp
- add dword [esp], 0x40 ; pix_tmp+0x30
- call x264_deblock_v8_luma_mmxext
-
- add dword [esp ], 8 ; pix_tmp+0x38
- add dword [esp+16], 2 ; tc0+2
- call x264_deblock_v8_luma_mmxext
- add esp, 20
-
- ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
- mov eax, [esp+108] ; pix
- sub eax, 2
- lea ecx, [eax+ebp]
-
- movq mm0, [pix_tmp+0x10]
- movq mm1, [pix_tmp+0x20]
- movq mm2, [pix_tmp+0x30]
- movq mm3, [pix_tmp+0x40]
- TRANSPOSE8x4_STORE PASS8ROWS(eax, ecx, ebx, ebp)
-
- lea eax, [eax+ebx*8]
- lea ecx, [ecx+ebx*8]
- movq mm0, [pix_tmp+0x18]
- movq mm1, [pix_tmp+0x28]
- movq mm2, [pix_tmp+0x38]
- movq mm3, [pix_tmp+0x48]
- TRANSPOSE8x4_STORE PASS8ROWS(eax, ecx, ebx, ebp)
-
- add esp, 96
- pop ebp
- pop ebx
- ret
-
-
-%macro CHROMA_V_START 0
- push edi
- push esi
- mov edi, [esp+12] ; pix
- mov esi, [esp+16] ; stride
- mov edx, [esp+20] ; alpha
- mov ecx, [esp+24] ; beta
- dec edx
- dec ecx
- mov eax, edi
- sub eax, esi
- sub eax, esi
-%endmacro
-
-%macro CHROMA_H_START 0
- push edi
- push esi
- push ebp
- mov edi, [esp+16]
- mov esi, [esp+20]
- mov edx, [esp+24]
- mov ecx, [esp+28]
- dec edx
- dec ecx
- sub edi, 2
- mov ebp, esi
- add ebp, esi
- add ebp, esi
- mov eax, edi
- add edi, ebp
-%endmacro
-
-%macro CHROMA_END 0
- pop esi
- pop edi
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_mmxext
- CHROMA_V_START
- push ebx
- mov ebx, [esp+32] ; tc0
-
- movq mm0, [eax]
- movq mm1, [eax+esi]
- movq mm2, [edi]
- movq mm3, [edi+esi]
-
- LOAD_MASK_MMX edx, ecx
- movd mm6, [ebx]
- punpcklbw mm6, mm6
- pand mm7, mm6
- picgetgot ebx ; no need to push ebx, it's already been done
- DEBLOCK_P0_Q0_MMX
-
- movq [eax+esi], mm1
- movq [edi], mm2
-
- pop ebx
- CHROMA_END
-
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_mmxext
- CHROMA_H_START
- push ebx
- mov ebx, [esp+36] ; tc0
- sub esp, 16
-
- TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp)
- movq [esp+8], mm0
- movq [esp+0], mm3
-
- LOAD_MASK_MMX edx, ecx
- movd mm6, [ebx]
- punpcklbw mm6, mm6
- pand mm7, mm6
- picgetgot ebx ; no need to push ebx, it's already been done
- DEBLOCK_P0_Q0_MMX
-
- movq mm0, [esp+8]
- movq mm3, [esp+0]
- TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp)
-
- add esp, 16
- pop ebx
- pop ebp
- CHROMA_END
-
-
-; in: %1=p0 %2=p1 %3=q1
-; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
-%macro CHROMA_INTRA_P0 3
- movq mm4, %1
- pxor mm4, %3
- pand mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1
- pavgb %1, %3
- psubusb %1, mm4
- pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
-%endmacro
-
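CHROMA_INTRA_P0's double-pavgb form is exact: avg(p0,q1) minus the carry bit (p0^q1)&1 equals floor((p0+q1)/2), and averaging that with p1 reproduces the spec formula. Scalar form (sketch):

    #include <stdint.h>

    static inline uint8_t chroma_intra_p0( int p0, int p1, int q1 )
    {
        return ( 2*p1 + p0 + q1 + 2 ) >> 2;
    }
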
-%macro CHROMA_INTRA_BODY 0
- LOAD_MASK_MMX edx, ecx
- movq mm5, mm1
- movq mm6, mm2
- CHROMA_INTRA_P0 mm1, mm0, mm3
- CHROMA_INTRA_P0 mm2, mm3, mm0
- psubb mm1, mm5
- psubb mm2, mm6
- pand mm1, mm7
- pand mm2, mm7
- paddb mm1, mm5
- paddb mm2, mm6
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_intra_mmxext
- CHROMA_V_START
- picpush ebx
- picgetgot ebx
- movq mm0, [eax]
- movq mm1, [eax+esi]
- movq mm2, [edi]
- movq mm3, [edi+esi]
- CHROMA_INTRA_BODY
- movq [eax+esi], mm1
- movq [edi], mm2
- picpop ebx
- CHROMA_END
-
-;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_intra_mmxext
- CHROMA_H_START
- picpush ebx
- picgetgot ebx
- TRANSPOSE4x8_LOAD PASS8ROWS(eax, edi, esi, ebp)
- CHROMA_INTRA_BODY
- TRANSPOSE8x4_STORE PASS8ROWS(eax, edi, esi, ebp)
- picpop ebx
- pop ebp ; needed because of CHROMA_H_START
- CHROMA_END
-
+++ /dev/null
-;*****************************************************************************
-;* mc.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2003 x264 project
-;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $
-;*
-;* Authors: Min Chen <chenm001@163.com> (converted to nasm)
-;* Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-;*****************************************************************************
-;* *
-;* Revision history: *
-;* *
-;* 2004.05.17 ported mc_copy_w4/8/16 to nasm (CM) *
-;* *
-;*****************************************************************************
-
-BITS 32
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "i386inc.asm"
-
-;=============================================================================
-; Constants
-;=============================================================================
-
-SECTION_RODATA
-
-ALIGN 16
-pw_4: times 4 dw 4
-pw_8: times 4 dw 8
-pw_32: times 4 dw 32
-pw_64: times 4 dw 64
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-;=============================================================================
-; pixel avg
-;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
-; uint8_t *src, int src_stride,
-; int height );
-;-----------------------------------------------------------------------------
-%macro AVG_START 1
-cglobal %1
- push ebx
- mov eax, [esp+12] ; dst
- mov ebx, [esp+16] ; dst_stride
- mov ecx, [esp+20] ; src
- mov edx, [esp+24] ; src_stride
- ; esi = height
-.height_loop:
-%endmacro
-
-%macro AVG_END 0
- sub esi, 2
- lea eax, [eax+ebx*2]
- lea ecx, [ecx+edx*2]
- jg .height_loop
- pop ebx
- pop esi
- ret
-%endmacro
-
-AVG_START x264_pixel_avg_w4_mmxext
- movd mm0, [ecx]
- movd mm1, [ecx+edx]
- pavgb mm0, [eax]
- pavgb mm1, [eax+ebx]
- movd [eax], mm0
- movd [eax+ebx], mm1
-AVG_END
-
-AVG_START x264_pixel_avg_w8_mmxext
- movq mm0, [ecx]
- movq mm1, [ecx+edx]
- pavgb mm0, [eax]
- pavgb mm1, [eax+ebx]
- movq [eax], mm0
- movq [eax+ebx], mm1
-AVG_END
-
-AVG_START x264_pixel_avg_w16_mmxext
- movq mm0, [ecx]
- movq mm1, [ecx+8]
- movq mm2, [ecx+edx]
- movq mm3, [ecx+edx+8]
- pavgb mm0, [eax]
- pavgb mm1, [eax+8]
- pavgb mm2, [eax+ebx]
- pavgb mm3, [eax+ebx+8]
- movq [eax], mm0
- movq [eax+8], mm1
- movq [eax+ebx], mm2
- movq [eax+ebx+8], mm3
-AVG_END
-
-AVG_START x264_pixel_avg_w16_sse2
- movdqu xmm0, [ecx]
- movdqu xmm1, [ecx+edx]
- pavgb xmm0, [eax]
- pavgb xmm1, [eax+ebx]
- movdqa [eax], xmm0
- movdqa [eax+ebx], xmm1
-AVG_END
-
-%macro AVGH 2
-cglobal x264_pixel_avg_%1x%2_mmxext
- push esi
- mov esi, %2
- jmp x264_pixel_avg_w%1_mmxext
-%endmacro
-
-AVGH 16, 16
-AVGH 16, 8
-AVGH 8, 16
-AVGH 8, 8
-AVGH 8, 4
-AVGH 4, 8
-AVGH 4, 4
-AVGH 4, 2
-
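Every width/height pairing above reduces to one scalar loop; pavgb computes exactly the per-byte (a+b+1)>>1 below, averaging the source into dst in place (sketch):

    #include <stdint.h>

    static void pixel_avg_ref( uint8_t *dst, int dst_stride,
                               const uint8_t *src, int src_stride,
                               int width, int height )
    {
        for( int y = 0; y < height; y++, dst += dst_stride, src += src_stride )
            for( int x = 0; x < width; x++ )
                dst[x] = ( dst[x] + src[x] + 1 ) >> 1;
    }
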
-%macro AVG2_START 1
-cglobal %1
- push ebx
- push esi
- push edi
- push ebp
- mov eax, [esp+20] ; dst
- mov ebx, [esp+24] ; dst_stride
- mov ecx, [esp+28] ; src1
- mov edx, [esp+32] ; src_stride
- mov edi, [esp+36] ; src2
- mov esi, [esp+40] ; height
- sub edi, ecx
- lea ebp, [edi+edx]
-.height_loop:
-%endmacro
-
-%macro AVG2_END 0
- sub esi, 2
- lea eax, [eax+ebx*2]
- lea ecx, [ecx+edx*2]
- jg .height_loop
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
-%endmacro
-
-AVG2_START x264_pixel_avg2_w4_mmxext
- movd mm0, [ecx]
- movd mm1, [ecx+edx]
- pavgb mm0, [ecx+edi]
- pavgb mm1, [ecx+ebp]
- movd [eax], mm0
- movd [eax+ebx], mm1
-AVG2_END
-
-AVG2_START x264_pixel_avg2_w8_mmxext
- movq mm0, [ecx]
- movq mm1, [ecx+edx]
- pavgb mm0, [ecx+edi]
- pavgb mm1, [ecx+ebp]
- movq [eax], mm0
- movq [eax+ebx], mm1
-AVG2_END
-
-AVG2_START x264_pixel_avg2_w16_mmxext
- movq mm0, [ecx]
- movq mm1, [ecx+8]
- movq mm2, [ecx+edx]
- movq mm3, [ecx+edx+8]
- pavgb mm0, [ecx+edi]
- pavgb mm1, [ecx+edi+8]
- pavgb mm2, [ecx+ebp]
- pavgb mm3, [ecx+ebp+8]
- movq [eax], mm0
- movq [eax+8], mm1
- movq [eax+ebx], mm2
- movq [eax+ebx+8], mm3
-AVG2_END
-
-AVG2_START x264_pixel_avg2_w20_mmxext
- movq mm0, [ecx]
- movq mm1, [ecx+8]
- movd mm2, [ecx+16]
- movq mm3, [ecx+edx]
- movq mm4, [ecx+edx+8]
- movd mm5, [ecx+edx+16]
- pavgb mm0, [ecx+edi]
- pavgb mm1, [ecx+edi+8]
- pavgb mm2, [ecx+edi+16]
- pavgb mm3, [ecx+ebp]
- pavgb mm4, [ecx+ebp+8]
- pavgb mm5, [ecx+ebp+16]
- movq [eax], mm0
- movq [eax+8], mm1
- movd [eax+16], mm2
- movq [eax+ebx], mm3
- movq [eax+ebx+8], mm4
- movd [eax+ebx+16], mm5
-AVG2_END
-
-
-
-;=============================================================================
-; weighted prediction
-;=============================================================================
-; implicit bipred only:
-; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-
-%macro BIWEIGHT_4P_MMX 2
- movd mm0, %1
- movd mm1, %2
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- pmullw mm0, mm4
- pmullw mm1, mm5
- paddw mm0, mm1
- paddw mm0, mm6
- psraw mm0, 6
- pmaxsw mm0, mm7
- packuswb mm0, mm0
- movd %1, mm0
-%endmacro
-
-%macro BIWEIGHT_START_MMX 0
- push edi
- push esi
- picgetgot ecx
- movq mm5, [pw_64 GLOBAL]
- movq mm6, [pw_32 GLOBAL] ; rounding
- mov edi, [esp+12] ; dst
- mov esi, [esp+16] ; i_dst
- mov edx, [esp+20] ; src
- mov ecx, [esp+24] ; i_src
- pshufw mm4, [esp+28], 0 ; weight_dst
- pxor mm7, mm7
- psubw mm5, mm4 ; weight_src
-%endmacro
-
-%macro BIWEIGHT_END_MMX 0
- pop esi
- pop edi
- ret
-%endmacro
-
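With log2_denom fixed at 5, zero offset, and weights summing to 64 as stated above, each sample is one rounded dot product. The pmaxsw against zero before packuswb matters because implicit-bipred weights can be negative, so the intermediate can underflow. Per sample (sketch):

    #include <stdint.h>

    static inline uint8_t biweight_ref( int dst, int src, int i_weight_dst )
    {
        int v = ( dst*i_weight_dst + src*(64-i_weight_dst) + 32 ) >> 6;
        return v < 0 ? 0 : v > 255 ? 255 : v;   /* pmaxsw + packuswb */
    }
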
-;-----------------------------------------------------------------------------
-; void __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_w16_mmxext
- BIWEIGHT_START_MMX
- mov eax, [esp+32] ; i_height
- ALIGN 4
- .height_loop
-
- BIWEIGHT_4P_MMX [edi ], [edx ]
- BIWEIGHT_4P_MMX [edi+ 4], [edx+ 4]
- BIWEIGHT_4P_MMX [edi+ 8], [edx+ 8]
- BIWEIGHT_4P_MMX [edi+12], [edx+12]
-
- add edi, esi
- add edx, ecx
- dec eax
- jg .height_loop
- BIWEIGHT_END_MMX
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_w8_mmxext
- BIWEIGHT_START_MMX
- mov eax, [esp+32]
- ALIGN 4
- .height_loop
-
- BIWEIGHT_4P_MMX [edi ], [edx ]
- BIWEIGHT_4P_MMX [edi+4 ], [edx+4 ]
- BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ]
- BIWEIGHT_4P_MMX [edi+esi+4], [edx+ecx+4]
-
- lea edi, [edi+esi*2]
- lea edx, [edx+ecx*2]
- sub eax, byte 2
- jg .height_loop
- BIWEIGHT_END_MMX
-
-;-----------------------------------------------------------------------------
-; void __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_4x4_mmxext
- BIWEIGHT_START_MMX
- BIWEIGHT_4P_MMX [edi ], [edx ]
- BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ]
- BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2]
- add edi, esi
- add edx, ecx
- BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2]
- BIWEIGHT_END_MMX
-
-
-
-;=============================================================================
-; pixel copy
-;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void x264_mc_copy_w4_mmx( uint8_t *src, int i_src_stride,
-; uint8_t *dst, int i_dst_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w4_mmx
- push ebx
- push esi
- push edi
-
- mov esi, [esp+24] ; src
- mov edi, [esp+16] ; dst
- mov ebx, [esp+28] ; i_src_stride
- mov edx, [esp+20] ; i_dst_stride
- mov ecx, [esp+32] ; i_height
-ALIGN 4
-.height_loop
- mov eax, [esi]
- mov [edi], eax
- mov eax, [esi+ebx]
- mov [edi+edx], eax
- lea esi, [esi+ebx*2]
- lea edi, [edi+edx*2]
- dec ecx
- dec ecx
- jg .height_loop
-
- pop edi
- pop esi
- pop ebx
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_mc_copy_w8_mmx( uint8_t *src, int i_src_stride,
-; uint8_t *dst, int i_dst_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w8_mmx
- push ebx
- push esi
- push edi
-
- mov esi, [esp+24] ; src
- mov edi, [esp+16] ; dst
- mov ebx, [esp+28] ; i_src_stride
- mov edx, [esp+20] ; i_dst_stride
- mov ecx, [esp+32] ; i_height
-ALIGN 4
-.height_loop
- movq mm0, [esi]
- movq [edi], mm0
- movq mm1, [esi+ebx]
- movq [edi+edx], mm1
- movq mm2, [esi+ebx*2]
- movq [edi+edx*2], mm2
- lea esi, [esi+ebx*2]
- lea edi, [edi+edx*2]
- movq mm3, [esi+ebx]
- movq [edi+edx], mm3
- lea esi, [esi+ebx*2]
- lea edi, [edi+edx*2]
-
- sub ecx, byte 4
- jg .height_loop
-
- pop edi
- pop esi
- pop ebx
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_mc_copy_w16_mmx( uint8_t *src, int i_src_stride,
-; uint8_t *dst, int i_dst_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w16_mmx
- push ebx
- push esi
- push edi
-
- mov esi, [esp+24] ; src
- mov edi, [esp+16] ; dst
- mov ebx, [esp+28] ; i_src_stride
- mov edx, [esp+20] ; i_dst_stride
- mov ecx, [esp+32] ; i_height
-
-ALIGN 4
-.height_loop
- movq mm0, [esi]
- movq mm1, [esi+8]
- movq [edi], mm0
- movq [edi+8], mm1
- movq mm2, [esi+ebx]
- movq mm3, [esi+ebx+8]
- movq [edi+edx], mm2
- movq [edi+edx+8], mm3
- movq mm4, [esi+ebx*2]
- movq mm5, [esi+ebx*2+8]
- movq [edi+edx*2], mm4
- movq [edi+edx*2+8], mm5
- lea esi, [esi+ebx*2]
- lea edi, [edi+edx*2]
- movq mm6, [esi+ebx]
- movq mm7, [esi+ebx+8]
- movq [edi+edx], mm6
- movq [edi+edx+8], mm7
- lea esi, [esi+ebx*2]
- lea edi, [edi+edx*2]
- sub ecx, byte 4
- jg .height_loop
-
- pop edi
- pop esi
- pop ebx
- ret
-
-
-;-----------------------------------------------------------------------------
-; void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w16_sse2
- push ebx
- push esi
- push edi
-
- mov esi, [esp+24] ; src
- mov edi, [esp+16] ; dst
- mov ebx, [esp+28] ; i_src_stride
- mov edx, [esp+20] ; i_dst_stride
- mov ecx, [esp+32] ; i_height
-
-ALIGN 4
-.height_loop
- movdqu xmm0, [esi]
- movdqu xmm1, [esi+ebx]
- movdqu [edi], xmm0
- movdqu [edi+edx], xmm1
- dec ecx
- dec ecx
- lea esi, [esi+ebx*2]
- lea edi, [edi+edx*2]
- jg .height_loop
-
- pop edi
- pop esi
- pop ebx
- ret
-
-
-
-;=============================================================================
-; chroma MC
-;=============================================================================
-
-;-----------------------------------------------------------------------------
-; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride,
-; int dx, int dy,
-; int i_width, int i_height )
-;-----------------------------------------------------------------------------
-
-cglobal x264_mc_chroma_mmxext
- picpush ebx
- picgetgot ebx
- push edi
-
- mov ecx, [picesp+4+24]
- mov edx, [picesp+4+20]
- mov eax, ecx
- mov edi, edx
- sar ecx, 3
- sar edx, 3
- imul ecx, [picesp+4+16]
- add ecx, edx
- add [picesp+4+12], ecx ; src += (dx>>3) + (dy>>3) * src_stride
-
- pxor mm3, mm3
-
- and edi, 7
- and eax, 7
- movd mm5, edi
- movd mm6, eax
- pshufw mm5, mm5, 0 ; mm5 = dx&7
- pshufw mm6, mm6, 0 ; mm6 = dy&7
-
- movq mm4, [pw_8 GLOBAL]
- movq mm0, mm4
-
- psubw mm4, mm5 ; mm4 = 8-dx
- psubw mm0, mm6 ; mm0 = 8-dy
-
- movq mm7, mm5
- pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB
- pmullw mm7, mm6 ; mm7 = dx*dy = cD
- pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC
- pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA
-
- mov eax, [picesp+4+12] ; src
- mov edi, [picesp+4+4] ; dst
- mov ecx, [picesp+4+16] ; i_src_stride
- mov edx, [picesp+4+32] ; i_height
-
-ALIGN 4
-.height_loop
-
- movd mm1, [eax+ecx]
- movd mm0, [eax]
- punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4
- punpcklbw mm0, mm3
- pmullw mm1, mm6 ; 2nd line * cC
- pmullw mm0, mm4 ; 1st line * cA
-
- paddw mm0, mm1 ; mm0 <- result
-
- movd mm2, [eax+1]
- movd mm1, [eax+ecx+1]
- punpcklbw mm2, mm3
- punpcklbw mm1, mm3
-
- paddw mm0, [pw_32 GLOBAL]
-
- pmullw mm2, mm5 ; line * cB
- pmullw mm1, mm7 ; line * cD
- paddw mm0, mm2
- paddw mm0, mm1
-
- psrlw mm0, 6
- packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4
- movd [edi], mm0
-
- add eax, ecx
- add edi, [picesp+4+8]
-
- dec edx
- jnz .height_loop
-
- sub [picesp+4+28], dword 8
- jnz .finish ; width != 8 so assume 4
-
- mov edi, [picesp+4+4] ; dst
- mov eax, [picesp+4+12] ; src
- mov edx, [picesp+4+32] ; i_height
- add edi, 4
- add eax, 4
- jmp .height_loop
-
-.finish
- pop edi
- picpop ebx
- ret
-
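The cA..cD weights computed above are plain bilinear interpolation at 1/8-pel resolution; they always sum to 64. A scalar sketch of the whole routine (the asm only supports widths 4 and 8, running its 4-wide loop twice for width 8):

    #include <stdint.h>

    static void mc_chroma_ref( uint8_t *dst, int i_dst_stride,
                               const uint8_t *src, int i_src_stride,
                               int dx, int dy, int i_width, int i_height )
    {
        const int d8x = dx & 7, d8y = dy & 7;
        const int cA = (8-d8x) * (8-d8y);
        const int cB =    d8x  * (8-d8y);
        const int cC = (8-d8x) *    d8y;
        const int cD =    d8x  *    d8y;

        src += (dy >> 3) * i_src_stride + (dx >> 3);
        for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
            for( int x = 0; x < i_width; x++ )
                dst[x] = ( cA*src[x]              + cB*src[x+1] +
                           cC*src[x+i_src_stride] + cD*src[x+i_src_stride+1]
                           + 32 ) >> 6;
    }
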
-
-
-; prefetches tuned for 64 byte cachelines (K7/K8/Core2)
-; TODO add 32 and 128 byte versions for P3/P4
-
-;-----------------------------------------------------------------------------
-; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
-; uint8_t *pix_uv, int stride_uv, int mb_x )
-;-----------------------------------------------------------------------------
-cglobal x264_prefetch_fenc_mmxext
- mov eax, [esp+20]
- mov ecx, [esp+8]
- mov edx, [esp+4]
- and eax, 3
- imul eax, ecx
- lea edx, [edx+eax*4+64]
- prefetcht0 [edx]
- prefetcht0 [edx+ecx]
- lea edx, [edx+ecx*2]
- prefetcht0 [edx]
- prefetcht0 [edx+ecx]
-
- mov eax, [esp+20]
- mov ecx, [esp+16]
- mov edx, [esp+12]
- and eax, 6
- imul eax, ecx
- lea edx, [edx+eax+64]
- prefetcht0 [edx]
- prefetcht0 [edx+ecx]
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
-;-----------------------------------------------------------------------------
-cglobal x264_prefetch_ref_mmxext
- mov eax, [esp+12]
- mov ecx, [esp+8]
- mov edx, [esp+4]
- sub eax, 1
- and eax, ecx
- lea edx, [edx+eax*8+64]
- lea eax, [ecx*3]
- prefetcht0 [edx]
- prefetcht0 [edx+ecx]
- prefetcht0 [edx+ecx*2]
- prefetcht0 [edx+eax]
- lea edx, [edx+ecx*4]
- prefetcht0 [edx]
- prefetcht0 [edx+ecx]
- prefetcht0 [edx+ecx*2]
- prefetcht0 [edx+eax]
- ret
+++ /dev/null
-;*****************************************************************************
-;* pixel.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2003 x264 project
-;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
-;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-BITS 32
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "i386inc.asm"
-
-; sad
-
-%macro SAD_INC_2x16P 0
- movq mm1, [eax]
- movq mm2, [eax+8]
- movq mm3, [eax+ebx]
- movq mm4, [eax+ebx+8]
- psadbw mm1, [ecx]
- psadbw mm2, [ecx+8]
- psadbw mm3, [ecx+edx]
- psadbw mm4, [ecx+edx+8]
- lea eax, [eax+2*ebx]
- paddw mm1, mm2
- paddw mm3, mm4
- lea ecx, [ecx+2*edx]
- paddw mm0, mm1
- paddw mm0, mm3
-%endmacro
-
-%macro SAD_INC_2x8P 0
- movq mm1, [eax]
- movq mm2, [eax+ebx]
- psadbw mm1, [ecx]
- psadbw mm2, [ecx+edx]
- lea eax, [eax+2*ebx]
- paddw mm0, mm1
- paddw mm0, mm2
- lea ecx, [ecx+2*edx]
-%endmacro
-
-%macro SAD_INC_2x4P 0
- movd mm1, [eax]
- movd mm2, [ecx]
- punpckldq mm1, [eax+ebx]
- punpckldq mm2, [ecx+edx]
- psadbw mm1, mm2
- paddw mm0, mm1
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
-%endmacro
-
-; sad x3 / x4
-
-%macro SAD_X3_START 0
- push edi
- push esi
- mov edi, [esp+12]
- mov eax, [esp+16]
- mov ecx, [esp+20]
- mov edx, [esp+24]
- mov esi, [esp+28]
-%endmacro
-
-%macro SAD_X3_START_1x8P 0
- movq mm3, [edi]
- movq mm0, [eax]
- movq mm1, [ecx]
- movq mm2, [edx]
- psadbw mm0, mm3
- psadbw mm1, mm3
- psadbw mm2, mm3
-%endmacro
-
-%macro SAD_X3_1x8P 2
- movq mm3, [edi+%1]
- movq mm4, [eax+%2]
- movq mm5, [ecx+%2]
- movq mm6, [edx+%2]
- psadbw mm4, mm3
- psadbw mm5, mm3
- psadbw mm6, mm3
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
-%endmacro
-
-%macro SAD_X3_START_2x4P 3
- movd mm3, [edi]
- movd %1, [eax]
- movd %2, [ecx]
- movd %3, [edx]
- punpckldq mm3, [edi+FENC_STRIDE]
- punpckldq %1, [eax+esi]
- punpckldq %2, [ecx+esi]
- punpckldq %3, [edx+esi]
- psadbw %1, mm3
- psadbw %2, mm3
- psadbw %3, mm3
-%endmacro
-
-%macro SAD_X3_2x16P 1
-%if %1
- SAD_X3_START
- SAD_X3_START_1x8P
-%else
- SAD_X3_1x8P 0, 0
-%endif
- SAD_X3_1x8P 8, 8
- SAD_X3_1x8P FENC_STRIDE, esi
- SAD_X3_1x8P FENC_STRIDE+8, esi+8
- add edi, 2*FENC_STRIDE
- lea eax, [eax+2*esi]
- lea ecx, [ecx+2*esi]
- lea edx, [edx+2*esi]
-%endmacro
-
-%macro SAD_X3_2x8P 1
-%if %1
- SAD_X3_START
- SAD_X3_START_1x8P
-%else
- SAD_X3_1x8P 0, 0
-%endif
- SAD_X3_1x8P FENC_STRIDE, esi
- add edi, 2*FENC_STRIDE
- lea eax, [eax+2*esi]
- lea ecx, [ecx+2*esi]
- lea edx, [edx+2*esi]
-%endmacro
-
-%macro SAD_X3_2x4P 1
-%if %1
- SAD_X3_START
- SAD_X3_START_2x4P mm0, mm1, mm2
-%else
- SAD_X3_START_2x4P mm4, mm5, mm6
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
-%endif
- add edi, 2*FENC_STRIDE
- lea eax, [eax+2*esi]
- lea ecx, [ecx+2*esi]
- lea edx, [edx+2*esi]
-%endmacro
-
-%macro SAD_X4_START 0
- push edi
- push esi
- push ebx
- mov edi, [esp+16]
- mov eax, [esp+20]
- mov ebx, [esp+24]
- mov ecx, [esp+28]
- mov edx, [esp+32]
- mov esi, [esp+36]
-%endmacro
-
-%macro SAD_X4_START_1x8P 0
- movq mm7, [edi]
- movq mm0, [eax]
- movq mm1, [ebx]
- movq mm2, [ecx]
- movq mm3, [edx]
- psadbw mm0, mm7
- psadbw mm1, mm7
- psadbw mm2, mm7
- psadbw mm3, mm7
-%endmacro
-
-%macro SAD_X4_1x8P 2
- movq mm7, [edi+%1]
- movq mm4, [eax+%2]
- movq mm5, [ebx+%2]
- movq mm6, [ecx+%2]
- psadbw mm4, mm7
- psadbw mm5, mm7
- psadbw mm6, mm7
- psadbw mm7, [edx+%2]
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
- paddw mm3, mm7
-%endmacro
-
-%macro SAD_X4_START_2x4P 0
- movd mm7, [edi]
- movd mm0, [eax]
- movd mm1, [ebx]
- movd mm2, [ecx]
- movd mm3, [edx]
- punpckldq mm7, [edi+FENC_STRIDE]
- punpckldq mm0, [eax+esi]
- punpckldq mm1, [ebx+esi]
- punpckldq mm2, [ecx+esi]
- punpckldq mm3, [edx+esi]
- psadbw mm0, mm7
- psadbw mm1, mm7
- psadbw mm2, mm7
- psadbw mm3, mm7
-%endmacro
-
-%macro SAD_X4_INC_2x4P 0
- movd mm7, [edi]
- movd mm4, [eax]
- movd mm5, [ebx]
- punpckldq mm7, [edi+FENC_STRIDE]
- punpckldq mm4, [eax+esi]
- punpckldq mm5, [ebx+esi]
- psadbw mm4, mm7
- psadbw mm5, mm7
- paddw mm0, mm4
- paddw mm1, mm5
- movd mm4, [ecx]
- movd mm5, [edx]
- punpckldq mm4, [ecx+esi]
- punpckldq mm5, [edx+esi]
- psadbw mm4, mm7
- psadbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
-%endmacro
-
-%macro SAD_X4_2x16P 1
-%if %1
- SAD_X4_START
- SAD_X4_START_1x8P
-%else
- SAD_X4_1x8P 0, 0
-%endif
- SAD_X4_1x8P 8, 8
- SAD_X4_1x8P FENC_STRIDE, esi
- SAD_X4_1x8P FENC_STRIDE+8, esi+8
- add edi, 2*FENC_STRIDE
- lea eax, [eax+2*esi]
- lea ebx, [ebx+2*esi]
- lea ecx, [ecx+2*esi]
- lea edx, [edx+2*esi]
-%endmacro
-
-%macro SAD_X4_2x8P 1
-%if %1
- SAD_X4_START
- SAD_X4_START_1x8P
-%else
- SAD_X4_1x8P 0, 0
-%endif
- SAD_X4_1x8P FENC_STRIDE, esi
- add edi, 2*FENC_STRIDE
- lea eax, [eax+2*esi]
- lea ebx, [ebx+2*esi]
- lea ecx, [ecx+2*esi]
- lea edx, [edx+2*esi]
-%endmacro
-
-%macro SAD_X4_2x4P 1
-%if %1
- SAD_X4_START
- SAD_X4_START_2x4P
-%else
- SAD_X4_INC_2x4P
-%endif
- add edi, 2*FENC_STRIDE
- lea eax, [eax+2*esi]
- lea ebx, [ebx+2*esi]
- lea ecx, [ecx+2*esi]
- lea edx, [edx+2*esi]
-%endmacro
-
-%macro SAD_X3_END 0
- mov eax, [esp+32]
- movd [eax+0], mm0
- movd [eax+4], mm1
- movd [eax+8], mm2
- pop esi
- pop edi
- ret
-%endmacro
-
-%macro SAD_X4_END 0
- mov eax, [esp+40]
- movd [eax+0], mm0
- movd [eax+4], mm1
- movd [eax+8], mm2
- movd [eax+12], mm3
- pop ebx
- pop esi
- pop edi
- ret
-%endmacro
-
-; ssd
-
-%macro SSD_INC_1x16P 0
- movq mm1, [eax]
- movq mm2, [ecx]
- movq mm3, [eax+8]
- movq mm4, [ecx+8]
-
- movq mm5, mm2
- movq mm6, mm4
- psubusb mm2, mm1
- psubusb mm4, mm3
- psubusb mm1, mm5
- psubusb mm3, mm6
- por mm1, mm2
- por mm3, mm4
-
- movq mm2, mm1
- movq mm4, mm3
- punpcklbw mm1, mm7
- punpcklbw mm3, mm7
- punpckhbw mm2, mm7
- punpckhbw mm4, mm7
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
- pmaddwd mm3, mm3
- pmaddwd mm4, mm4
-
- add eax, ebx
- add ecx, edx
- paddd mm0, mm1
- paddd mm0, mm2
- paddd mm0, mm3
- paddd mm0, mm4
-%endmacro
-
-%macro SSD_INC_1x8P 0
- movq mm1, [eax]
- movq mm2, [ecx]
-
- movq mm5, mm2
- psubusb mm2, mm1
- psubusb mm1, mm5
- por mm1, mm2 ; mm1 = 8bit abs diff
-
- movq mm2, mm1
- punpcklbw mm1, mm7
- punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
-
- add eax, ebx
- add ecx, edx
- paddd mm0, mm1
- paddd mm0, mm2
-%endmacro
-
-%macro SSD_INC_1x4P 0
- movd mm1, [eax]
- movd mm2, [ecx]
-
- movq mm5, mm2
- psubusb mm2, mm1
- psubusb mm1, mm5
- por mm1, mm2
- punpcklbw mm1, mm7
- pmaddwd mm1, mm1
-
- add eax, ebx
- add ecx, edx
- paddd mm0, mm1
-%endmacro
-
-; satd
-
-%macro SUMSUB_BADC 4
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
-%endmacro
-
-%macro HADAMARD4x4 4
- SUMSUB_BADC %1, %2, %3, %4
- SUMSUB_BADC %1, %3, %2, %4
-%endmacro
-
-%macro SBUTTERFLYwd 3
- movq %3, %1
- punpcklwd %1, %2
- punpckhwd %3, %2
-%endmacro
-
-%macro SBUTTERFLYdq 3
- movq %3, %1
- punpckldq %1, %2
- punpckhdq %3, %2
-%endmacro
-
-%macro TRANSPOSE4x4 5 ; abcd-t -> adtc
- SBUTTERFLYwd %1, %2, %5
- SBUTTERFLYwd %3, %4, %2
- SBUTTERFLYdq %1, %3, %4
- SBUTTERFLYdq %5, %2, %3
-%endmacro
-
-%macro MMX_ABS 2 ; mma, tmp
- pxor %2, %2
- psubw %2, %1
- pmaxsw %1, %2
-%endmacro
-
-%macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1
- pxor %3, %3
- pxor %4, %4
- psubw %3, %1
- psubw %4, %2
- pmaxsw %1, %3
- pmaxsw %2, %4
-%endmacro
-
-%macro HADAMARD4x4_SUM 1 ; %1 - dest (row sum of one block)
- HADAMARD4x4 mm4, mm5, mm6, mm7
- TRANSPOSE4x4 mm4, mm5, mm6, mm7, %1
- HADAMARD4x4 mm4, mm7, %1, mm6
- MMX_ABS_TWO mm4, mm7, mm3, mm5
- MMX_ABS_TWO %1, mm6, mm3, mm5
- paddw %1, mm4
- paddw mm6, mm7
- pavgw %1, mm6
-%endmacro
-
-%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy
- movd %1, [eax+ebx*%4+%3]
- movd %2, [ecx+edx*%4+%3]
- punpcklbw %1, %2
- punpcklbw %2, %2
- psubw %1, %2
-%endmacro
-
-; in: %2 = horizontal offset
-; in: %3 = whether we need to increment pix1 and pix2
-; clobber: mm3..mm7
-; out: %1 = satd
-%macro LOAD_DIFF_HADAMARD_SUM 3
-%if %3
- LOAD_DIFF_4P mm4, mm3, %2, 0
- LOAD_DIFF_4P mm5, mm3, %2, 1
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm6, mm3, %2, 0
- LOAD_DIFF_4P mm7, mm3, %2, 1
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
-%else
- LOAD_DIFF_4P mm4, mm3, %2, 0
- LOAD_DIFF_4P mm6, mm3, %2, 2
- add eax, ebx
- add ecx, edx
- LOAD_DIFF_4P mm5, mm3, %2, 0
- LOAD_DIFF_4P mm7, mm3, %2, 2
-%endif
- HADAMARD4x4_SUM %1
-%endmacro
-
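Per 4x4 block, LOAD_DIFF_HADAMARD_SUM yields half the sum of absolute values of the 2-D Hadamard-transformed differences. The pavgw halving in HADAMARD4x4_SUM is exact: all four outputs of a 4-point Hadamard share the parity of their input sum, so each lane's absolute sum is even and the +1 rounding never fires. Scalar sketch of the 4x4 case:

    #include <stdint.h>
    #include <stdlib.h>

    static int satd_4x4_ref( const uint8_t *pix1, int i_stride1,
                             const uint8_t *pix2, int i_stride2 )
    {
        int16_t d[4][4], tmp[4][4];
        int sum = 0;

        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
                d[y][x] = pix1[y*i_stride1+x] - pix2[y*i_stride2+x];

        for( int i = 0; i < 4; i++ )
        {
            int s01 = d[i][0] + d[i][1], d01 = d[i][0] - d[i][1];
            int s23 = d[i][2] + d[i][3], d23 = d[i][2] - d[i][3];
            tmp[0][i] = s01 + s23;
            tmp[1][i] = s01 - s23;
            tmp[2][i] = d01 + d23;
            tmp[3][i] = d01 - d23;
        }
        for( int i = 0; i < 4; i++ )
        {
            int s01 = tmp[i][0] + tmp[i][1], d01 = tmp[i][0] - tmp[i][1];
            int s23 = tmp[i][2] + tmp[i][3], d23 = tmp[i][2] - tmp[i][3];
            sum += abs(s01+s23) + abs(s01-s23) + abs(d01+d23) + abs(d01-d23);
        }
        return sum / 2;
    }
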
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-%macro SAD_START 0
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-
- pxor mm0, mm0
-%endmacro
-%macro SAD_END 0
- movd eax, mm0
-
- pop ebx
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-%macro SAD 2
-cglobal x264_pixel_sad_%1x%2_mmxext
- SAD_START
-%rep %2/2
- SAD_INC_2x%1P
-%endrep
- SAD_END
-%endmacro
-
-SAD 16, 16
-SAD 16, 8
-SAD 8, 16
-SAD 8, 8
-SAD 8, 4
-SAD 4, 8
-SAD 4, 4
-
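Every SAD size above is the same scalar loop; one psadbw covers eight columns of a row per instruction. Reference form (sketch):

    #include <stdint.h>
    #include <stdlib.h>

    static int sad_ref( const uint8_t *pix1, int i_stride1,
                        const uint8_t *pix2, int i_stride2,
                        int width, int height )
    {
        int sum = 0;
        for( int y = 0; y < height; y++, pix1 += i_stride1, pix2 += i_stride2 )
            for( int x = 0; x < width; x++ )
                sum += abs( pix1[x] - pix2[x] );
        return sum;
    }
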
-;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
-;-----------------------------------------------------------------------------
-%macro SAD_X 3
-cglobal x264_pixel_sad_x%1_%2x%3_mmxext
- SAD_X%1_2x%2P 1
-%rep %3/2-1
- SAD_X%1_2x%2P 0
-%endrep
- SAD_X%1_END
-%endmacro
-
-SAD_X 3, 16, 16
-SAD_X 3, 16, 8
-SAD_X 3, 8, 16
-SAD_X 3, 8, 8
-SAD_X 3, 8, 4
-SAD_X 3, 4, 8
-SAD_X 3, 4, 4
-SAD_X 4, 16, 16
-SAD_X 4, 16, 8
-SAD_X 4, 8, 16
-SAD_X 4, 8, 8
-SAD_X 4, 8, 4
-SAD_X 4, 4, 8
-SAD_X 4, 4, 4
-
-
-%macro SSD_START 0
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-
- pxor mm7, mm7 ; zero
- pxor mm0, mm0 ; mm0 holds the sum
-%endmacro
-
-%macro SSD_END 0
- movq mm1, mm0
- psrlq mm1, 32
- paddd mm0, mm1
- movd eax, mm0
-
- pop ebx
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-%macro SSD 2
-cglobal x264_pixel_ssd_%1x%2_mmx
- SSD_START
-%rep %2
- SSD_INC_1x%1P
-%endrep
- SSD_END
-%endmacro
-
-SSD 16, 16
-SSD 16, 8
-SSD 8, 16
-SSD 8, 8
-SSD 8, 4
-SSD 4, 8
-SSD 4, 4
-
-
-
-%macro SATD_START 0
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-%endmacro
-
-%macro SATD_END 0
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- pshufw mm1, mm0, 10110001b
- paddw mm0, mm1
- movd eax, mm0
- and eax, 0xffff
- pop ebx
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_4x4_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_4x8_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
- LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x4_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
- sub eax, ebx
- sub ecx, edx
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x8_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
- LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
-
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+16] ; pix2
- LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm2
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_16x8_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
- LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
-
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+16] ; pix2
- LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm2
-
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+16] ; pix2
- LOAD_DIFF_HADAMARD_SUM mm2, 8, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 8, 0
- paddw mm0, mm2
-
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+16] ; pix2
- LOAD_DIFF_HADAMARD_SUM mm2, 12, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
- paddw mm0, mm2
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x16_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
- LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
- paddw mm0, mm2
-
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+16] ; pix2
- LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
- paddw mm0, mm2
- LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm2
- paddw mm0, mm1
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_16x16_mmxext
- SATD_START
- LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
- LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
- LOAD_DIFF_HADAMARD_SUM mm2, 0, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
- paddw mm0, mm2
-
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+16] ; pix2
- LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
- paddw mm0, mm2
- LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
- paddw mm0, mm2
-
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+16] ; pix2
- LOAD_DIFF_HADAMARD_SUM mm2, 8, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 8, 1
- paddw mm0, mm2
- LOAD_DIFF_HADAMARD_SUM mm2, 8, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 8, 0
- paddw mm0, mm2
-
- mov eax, [esp+ 8] ; pix1
- mov ecx, [esp+16] ; pix2
- LOAD_DIFF_HADAMARD_SUM mm2, 12, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
- paddw mm0, mm2
- LOAD_DIFF_HADAMARD_SUM mm2, 12, 1
- paddw mm0, mm1
- LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
- paddw mm0, mm2
- paddw mm0, mm1
-
- pxor mm3, mm3
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- punpcklwd mm0, mm3
- pshufw mm1, mm0, 01001110b
- paddd mm0, mm1
- movd eax, mm0
- pop ebx
- ret
-
-
-%macro LOAD_DIFF_4x8P 1 ; dx
- LOAD_DIFF_4P mm0, mm7, %1, 0
- LOAD_DIFF_4P mm1, mm7, %1, 1
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm2, mm7, %1, 0
- LOAD_DIFF_4P mm3, mm7, %1, 1
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm4, mm7, %1, 0
- LOAD_DIFF_4P mm5, mm7, %1, 1
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm6, mm7, %1, 0
- movq [spill], mm6
- LOAD_DIFF_4P mm7, mm6, %1, 1
- movq mm6, [spill]
-%endmacro
-
-%macro HADAMARD1x8 8
- SUMSUB_BADC %1, %5, %2, %6
- SUMSUB_BADC %3, %7, %4, %8
- SUMSUB_BADC %1, %3, %2, %4
- SUMSUB_BADC %5, %7, %6, %8
- SUMSUB_BADC %1, %2, %3, %4
- SUMSUB_BADC %5, %6, %7, %8
-%endmacro
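-; three rounds of butterflies = one 8-point 1D hadamard transform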
-
-%macro SUM4x8_MM 0
- movq [spill], mm6
- movq [spill+8], mm7
- MMX_ABS_TWO mm0, mm1, mm6, mm7
- MMX_ABS_TWO mm2, mm3, mm6, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- movq mm6, [spill]
- movq mm7, [spill+8]
- MMX_ABS_TWO mm4, mm5, mm2, mm3
- MMX_ABS_TWO mm6, mm7, mm2, mm3
- paddw mm4, mm6
- paddw mm5, mm7
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm0, mm1
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_mmxext
- SATD_START
- sub esp, 0x70
-%define args esp+0x74
-%define spill esp+0x60 ; +16
-%define trans esp+0 ; +96
- LOAD_DIFF_4x8P 0
- HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
-
- movq [spill], mm0
- TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
- movq [trans+0x00], mm4
- movq [trans+0x08], mm7
- movq [trans+0x10], mm0
- movq [trans+0x18], mm6
- movq mm0, [spill]
- TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
- movq [trans+0x20], mm0
- movq [trans+0x28], mm3
- movq [trans+0x30], mm4
- movq [trans+0x38], mm2
-
- mov eax, [args+4]
- mov ecx, [args+12]
- LOAD_DIFF_4x8P 4
- HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
-
- movq [spill], mm7
- TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7
- movq [trans+0x40], mm0
- movq [trans+0x48], mm3
- movq [trans+0x50], mm7
- movq [trans+0x58], mm2
- movq mm7, [spill]
- TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
- movq mm5, [trans+0x00]
- movq mm1, [trans+0x08]
- movq mm2, [trans+0x10]
- movq mm3, [trans+0x18]
-
- HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
- SUM4x8_MM
- movq [trans], mm0
-
- movq mm0, [trans+0x20]
- movq mm1, [trans+0x28]
- movq mm2, [trans+0x30]
- movq mm3, [trans+0x38]
- movq mm4, [trans+0x40]
- movq mm5, [trans+0x48]
- movq mm6, [trans+0x50]
- movq mm7, [trans+0x58]
-
- HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
- SUM4x8_MM
-
- pavgw mm0, [esp]
- pshufw mm1, mm0, 01001110b
- paddw mm0, mm1
- pshufw mm1, mm0, 10110001b
- paddw mm0, mm1
- movd eax, mm0
- and eax, 0xffff
- mov ecx, eax ; preserve rounding for 16x16
- add eax, 1
- shr eax, 1
- add esp, 0x70
- pop ebx
- ret
-%undef args
-%undef spill
-%undef trans
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-;; violates calling convention
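-;; (x264_pixel_sa8d_8x8_mmxext also returns its unrounded sum in ecx, see
-;; "preserve rounding for 16x16" above, so the four 8x8 sums can be added
-;; here and rounded only once; hence ecx is clobbered across the calls)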
-cglobal x264_pixel_sa8d_16x16_mmxext
- push esi
- push edi
- push ebp
- mov esi, [esp+28] ; stride2
- mov edi, [esp+20] ; stride1
- push esi
- push dword [esp+28] ; pix2
- push edi
- push dword [esp+28] ; pix1
- call x264_pixel_sa8d_8x8_mmxext
- mov ebp, ecx
- shl edi, 3
- shl esi, 3
- add [esp+0], edi ; pix1+8*stride1
- add [esp+8], esi ; pix2+8*stride2
- call x264_pixel_sa8d_8x8_mmxext
- add ebp, ecx
- add dword [esp+0], 8 ; pix1+8*stride1+8
- add dword [esp+8], 8 ; pix2+8*stride2+8
- call x264_pixel_sa8d_8x8_mmxext
- add ebp, ecx
- sub [esp+0], edi ; pix1+8
- sub [esp+8], esi ; pix2+8
- call x264_pixel_sa8d_8x8_mmxext
- lea eax, [ebp+ecx+1]
- shr eax, 1
- add esp, 16
- pop ebp
- pop edi
- pop esi
- ret
-
-
-; in: fenc
-; out: mm0..mm3 = hadamard coefs
-%macro LOAD_HADAMARD 1
- pxor mm7, mm7
- movd mm0, [%1+0*FENC_STRIDE]
- movd mm4, [%1+1*FENC_STRIDE]
- movd mm3, [%1+2*FENC_STRIDE]
- movd mm1, [%1+3*FENC_STRIDE]
- punpcklbw mm0, mm7
- punpcklbw mm4, mm7
- punpcklbw mm3, mm7
- punpcklbw mm1, mm7
- HADAMARD4x4 mm0, mm4, mm3, mm1
- TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2
- HADAMARD4x4 mm0, mm1, mm2, mm3
-%endmacro
-
-%macro SCALAR_SUMSUB 4
- add %1, %2
- add %3, %4
- add %2, %2
- add %4, %4
- sub %2, %1
- sub %4, %3
-%endmacro
-
-%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
- pxor %7, %7
- pshufw %4, %1, 01001110b
- pshufw %5, %2, 01001110b
- pshufw %6, %3, 01001110b
- paddusw %1, %4
- paddusw %2, %5
- paddusw %3, %6
- punpcklwd %1, %7
- punpcklwd %2, %7
- punpcklwd %3, %7
- pshufw %4, %1, 01001110b
- pshufw %5, %2, 01001110b
- pshufw %6, %3, 01001110b
- %8 %1, %4
- %8 %2, %5
- %8 %3, %6
-%endmacro
-
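-; the intra_satd_x3 functions exploit the linearity of the hadamard
-; transform: for the v/h/dc prediction modes the predicted block's
-; transform is nonzero only along one edge (or only in the dc term), so
-; instead of transforming each predicted block we transform fenc once
-; and subtract 4x the 1D hadamard of the corresponding edge (or the
-; precomputed dc value) before summing absolute values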
-;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
-;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_4x4_mmxext
- push ebx
- push edi
- push esi
- sub esp, 16
-%define args esp+32
-%define top_1d esp+8 ; +8
-%define left_1d esp+0 ; +8
-
- mov eax, [args+0] ; fenc
- LOAD_HADAMARD eax
-
- mov edi, [args+4] ; fdec
- movzx eax, byte [edi-1+0*FDEC_STRIDE]
- movzx ebx, byte [edi-1+1*FDEC_STRIDE]
- movzx ecx, byte [edi-1+2*FDEC_STRIDE]
- movzx edx, byte [edi-1+3*FDEC_STRIDE]
- SCALAR_SUMSUB eax, ebx, ecx, edx
- SCALAR_SUMSUB eax, ecx, ebx, edx ; 1x4 hadamard
- mov [left_1d+0], ax
- mov [left_1d+2], bx
- mov [left_1d+4], cx
- mov [left_1d+6], dx
- mov esi, eax ; dc
-
- movzx eax, byte [edi-FDEC_STRIDE+0]
- movzx ebx, byte [edi-FDEC_STRIDE+1]
- movzx ecx, byte [edi-FDEC_STRIDE+2]
- movzx edx, byte [edi-FDEC_STRIDE+3]
- SCALAR_SUMSUB eax, ebx, ecx, edx
- SCALAR_SUMSUB eax, ecx, ebx, edx ; 4x1 hadamard
- mov [top_1d+0], ax
- mov [top_1d+2], bx
- mov [top_1d+4], cx
- mov [top_1d+6], dx
- lea esi, [esi + eax + 4] ; dc
- and esi, -8
- shl esi, 1
-
- movq mm4, mm1
- movq mm5, mm2
- MMX_ABS_TWO mm4, mm5, mm6, mm7
- movq mm7, mm3
- paddw mm4, mm5
- MMX_ABS mm7, mm6
- paddw mm7, mm4 ; 3x4 sum
-
- movq mm4, [left_1d]
- movd mm5, esi
- psllw mm4, 2
- psubw mm4, mm0
- psubw mm5, mm0
- punpcklwd mm0, mm1
- punpcklwd mm2, mm3
- punpckldq mm0, mm2 ; transpose
- movq mm1, [top_1d]
- psllw mm1, 2
- psubw mm0, mm1
- MMX_ABS mm4, mm3 ; 1x4 sum
- MMX_ABS mm5, mm2 ; 1x4 sum
- MMX_ABS mm0, mm1 ; 4x1 sum
- paddw mm4, mm7
- paddw mm5, mm7
- movq mm1, mm5
- psrlq mm1, 16 ; 4x3 sum
- paddw mm0, mm1
-
- SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
- mov eax, [args+8] ; res
- movd [eax+0], mm0 ; i4x4_v satd
- movd [eax+4], mm4 ; i4x4_h satd
- movd [eax+8], mm5 ; i4x4_dc satd
-
- add esp, 16
- pop esi
- pop edi
- pop ebx
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
-;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_16x16_mmxext
- push ebx
- push ebp
- push edi
- push esi
- sub esp, 88
-%define args esp+108
-%define sums esp+64 ; +24
-%define top_1d esp+32 ; +32
-%define left_1d esp+0 ; +32
-
- pxor mm0, mm0
- movq [sums+0], mm0
- movq [sums+8], mm0
- movq [sums+16], mm0
-
- ; 1D hadamards
- mov edi, [args+4] ; fdec
- xor ebp, ebp
- mov esi, 12
-.loop_edge:
- ; left
-    shl  esi, 5 ; *= FDEC_STRIDE (log2(FDEC_STRIDE) == 5)
- movzx eax, byte [edi+esi-1+0*FDEC_STRIDE]
- movzx ebx, byte [edi+esi-1+1*FDEC_STRIDE]
- movzx ecx, byte [edi+esi-1+2*FDEC_STRIDE]
- movzx edx, byte [edi+esi-1+3*FDEC_STRIDE]
- shr esi, 5
- SCALAR_SUMSUB eax, ebx, ecx, edx
- SCALAR_SUMSUB eax, ecx, ebx, edx
- add ebp, eax
- mov [left_1d+2*esi+0], ax
- mov [left_1d+2*esi+2], bx
- mov [left_1d+2*esi+4], cx
- mov [left_1d+2*esi+6], dx
-
- ; top
- movzx eax, byte [edi+esi-FDEC_STRIDE+0]
- movzx ebx, byte [edi+esi-FDEC_STRIDE+1]
- movzx ecx, byte [edi+esi-FDEC_STRIDE+2]
- movzx edx, byte [edi+esi-FDEC_STRIDE+3]
- SCALAR_SUMSUB eax, ebx, ecx, edx
- SCALAR_SUMSUB eax, ecx, ebx, edx
- add ebp, eax
- mov [top_1d+2*esi+0], ax
- mov [top_1d+2*esi+2], bx
- mov [top_1d+2*esi+4], cx
- mov [top_1d+2*esi+6], dx
- sub esi, 4
- jge .loop_edge
-
- ; dc
- shr ebp, 1
- add ebp, 8
- and ebp, -16
-
- ; 2D hadamards
- mov eax, [args+0] ; fenc
- xor edi, edi
-.loop_y:
- xor esi, esi
-.loop_x:
- LOAD_HADAMARD eax
-
- movq mm4, mm1
- movq mm5, mm2
- MMX_ABS_TWO mm4, mm5, mm6, mm7
- movq mm7, mm3
- paddw mm4, mm5
- MMX_ABS mm7, mm6
- paddw mm7, mm4 ; 3x4 sum
-
- movq mm4, [left_1d+8*edi]
- movd mm5, ebp
- psllw mm4, 2
- psubw mm4, mm0
- psubw mm5, mm0
- punpcklwd mm0, mm1
- punpcklwd mm2, mm3
- punpckldq mm0, mm2 ; transpose
- movq mm1, [top_1d+8*esi]
- psllw mm1, 2
- psubw mm0, mm1
- MMX_ABS mm4, mm3 ; 1x4 sum
- MMX_ABS mm5, mm2 ; 1x4 sum
- MMX_ABS mm0, mm1 ; 4x1 sum
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+0] ; i4x4_v satd
- paddw mm4, [sums+8] ; i4x4_h satd
- paddw mm5, [sums+16] ; i4x4_dc satd
- movq [sums+0], mm0
- movq [sums+8], mm4
- movq [sums+16], mm5
-
- add eax, 4
- inc esi
- cmp esi, 4
- jl .loop_x
- add eax, 4*FENC_STRIDE-16
- inc edi
- cmp edi, 4
- jl .loop_y
-
-; horizontal sum
- movq mm2, [sums+16]
- movq mm0, [sums+0]
- movq mm1, [sums+8]
- movq mm7, mm2
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- psrld mm0, 1
- pslld mm7, 16
- psrld mm7, 16
- paddd mm0, mm2
- psubd mm0, mm7
- mov eax, [args+8] ; res
- movd [eax+0], mm0 ; i16x16_v satd
- movd [eax+4], mm1 ; i16x16_h satd
- movd [eax+8], mm2 ; i16x16_dc satd
-
- add esp, 88
- pop esi
- pop edi
- pop ebp
- pop ebx
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
-;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_8x8c_mmxext
- push ebx
- push ebp
- push edi
- push esi
- sub esp, 72
-%define args esp+92
-%define sums esp+48 ; +24
-%define dc_1d esp+32 ; +16
-%define top_1d esp+16 ; +16
-%define left_1d esp+0 ; +16
-
- pxor mm0, mm0
- movq [sums+0], mm0
- movq [sums+8], mm0
- movq [sums+16], mm0
-
- ; 1D hadamards
- mov edi, [args+4] ; fdec
- xor ebp, ebp
- mov esi, 12
-.loop_edge:
- ; left
-    shl  esi, 5 ; *= FDEC_STRIDE (log2(FDEC_STRIDE) == 5)
- movzx eax, byte [edi+esi-1+0*FDEC_STRIDE]
- movzx ebx, byte [edi+esi-1+1*FDEC_STRIDE]
- movzx ecx, byte [edi+esi-1+2*FDEC_STRIDE]
- movzx edx, byte [edi+esi-1+3*FDEC_STRIDE]
- shr esi, 5
- SCALAR_SUMSUB eax, ebx, ecx, edx
- SCALAR_SUMSUB eax, ecx, ebx, edx
- mov [left_1d+2*esi+0], ax
- mov [left_1d+2*esi+2], bx
- mov [left_1d+2*esi+4], cx
- mov [left_1d+2*esi+6], dx
-
- ; top
- movzx eax, byte [edi+esi-FDEC_STRIDE+0]
- movzx ebx, byte [edi+esi-FDEC_STRIDE+1]
- movzx ecx, byte [edi+esi-FDEC_STRIDE+2]
- movzx edx, byte [edi+esi-FDEC_STRIDE+3]
- SCALAR_SUMSUB eax, ebx, ecx, edx
- SCALAR_SUMSUB eax, ecx, ebx, edx
- mov [top_1d+2*esi+0], ax
- mov [top_1d+2*esi+2], bx
- mov [top_1d+2*esi+4], cx
- mov [top_1d+2*esi+6], dx
- sub esi, 4
- jge .loop_edge
-
- ; dc
- movzx eax, word [left_1d+0]
- movzx ebx, word [top_1d+0]
- movzx ecx, word [left_1d+8]
- movzx edx, word [top_1d+8]
- add eax, ebx
- lea ebx, [ecx + edx]
- lea eax, [2*eax + 8]
- lea ebx, [2*ebx + 8]
- lea ecx, [4*ecx + 8]
- lea edx, [4*edx + 8]
- and eax, -16
- and ebx, -16
- and ecx, -16
- and edx, -16
- mov [dc_1d+ 0], eax ; tl
- mov [dc_1d+ 4], edx ; tr
- mov [dc_1d+ 8], ecx ; bl
- mov [dc_1d+12], ebx ; br
- lea ebp, [dc_1d]
-
- ; 2D hadamards
- mov eax, [args+0] ; fenc
- xor edi, edi
-.loop_y:
- xor esi, esi
-.loop_x:
- LOAD_HADAMARD eax
-
- movq mm4, mm1
- movq mm5, mm2
- MMX_ABS_TWO mm4, mm5, mm6, mm7
- movq mm7, mm3
- paddw mm4, mm5
- MMX_ABS mm7, mm6
- paddw mm7, mm4 ; 3x4 sum
-
- movq mm4, [left_1d+8*edi]
- movd mm5, [ebp]
- psllw mm4, 2
- psubw mm4, mm0
- psubw mm5, mm0
- punpcklwd mm0, mm1
- punpcklwd mm2, mm3
- punpckldq mm0, mm2 ; transpose
- movq mm1, [top_1d+8*esi]
- psllw mm1, 2
- psubw mm0, mm1
- MMX_ABS mm4, mm3 ; 1x4 sum
- MMX_ABS mm5, mm2 ; 1x4 sum
- MMX_ABS mm0, mm1 ; 4x1 sum
- pavgw mm4, mm7
- pavgw mm5, mm7
- paddw mm0, [sums+16] ; i4x4_v satd
- paddw mm4, [sums+8] ; i4x4_h satd
- paddw mm5, [sums+0] ; i4x4_dc satd
- movq [sums+16], mm0
- movq [sums+8], mm4
- movq [sums+0], mm5
-
- add eax, 4
- add ebp, 4
- inc esi
- cmp esi, 2
- jl .loop_x
- add eax, 4*FENC_STRIDE-8
- inc edi
- cmp edi, 2
- jl .loop_y
-
-; horizontal sum
- movq mm0, [sums+0]
- movq mm1, [sums+8]
- movq mm2, [sums+16]
- movq mm6, mm0
- psrlq mm6, 15
- paddw mm2, mm6
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm7, paddd
- psrld mm2, 1
- mov eax, [args+8] ; res
- movd [eax+0], mm0 ; i8x8c_dc satd
- movd [eax+4], mm1 ; i8x8c_h satd
- movd [eax+8], mm2 ; i8x8c_v satd
-
- add esp, 72
- pop esi
- pop edi
- pop ebp
- pop ebx
- ret
-
-%macro LOAD_4x8P 1 ; dx
- pxor mm7, mm7
- movd mm6, [eax+%1+7*FENC_STRIDE]
- movd mm0, [eax+%1+0*FENC_STRIDE]
- movd mm1, [eax+%1+1*FENC_STRIDE]
- movd mm2, [eax+%1+2*FENC_STRIDE]
- movd mm3, [eax+%1+3*FENC_STRIDE]
- movd mm4, [eax+%1+4*FENC_STRIDE]
- movd mm5, [eax+%1+5*FENC_STRIDE]
- punpcklbw mm6, mm7
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- movq [spill], mm6
- punpcklbw mm2, mm7
- punpcklbw mm3, mm7
- movd mm6, [eax+%1+6*FENC_STRIDE]
- punpcklbw mm4, mm7
- punpcklbw mm5, mm7
- punpcklbw mm6, mm7
- movq mm7, [spill]
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
-;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_mmxext
- mov eax, [esp+4]
- mov ecx, [esp+8]
- sub esp, 0x70
-%define args esp+0x74
-%define spill esp+0x60 ; +16
-%define trans esp+0 ; +96
-%define sum    esp+0 ; +32 (aliases trans: reuses bytes of trans that are already consumed)
- LOAD_4x8P 0
- HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
-
- movq [spill], mm0
- TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
- movq [trans+0x00], mm4
- movq [trans+0x08], mm7
- movq [trans+0x10], mm0
- movq [trans+0x18], mm6
- movq mm0, [spill]
- TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
- movq [trans+0x20], mm0
- movq [trans+0x28], mm3
- movq [trans+0x30], mm4
- movq [trans+0x38], mm2
-
- LOAD_4x8P 4
- HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
-
- movq [spill], mm7
- TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7
- movq [trans+0x40], mm0
- movq [trans+0x48], mm3
- movq [trans+0x50], mm7
- movq [trans+0x58], mm2
- movq mm7, [spill]
- TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
- movq mm5, [trans+0x00]
- movq mm1, [trans+0x08]
- movq mm2, [trans+0x10]
- movq mm3, [trans+0x18]
-
- HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
-
- movq [spill+0], mm5
- movq [spill+8], mm7
- MMX_ABS_TWO mm0, mm1, mm5, mm7
- MMX_ABS_TWO mm2, mm3, mm5, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- paddw mm0, mm1
- MMX_ABS_TWO mm4, mm6, mm2, mm3
- movq mm5, [spill+0]
- movq mm7, [spill+8]
- paddw mm0, mm4
- paddw mm0, mm6
- MMX_ABS mm7, mm1
- paddw mm0, mm7 ; 7x4 sum
- movq mm6, mm5
- movq mm7, [ecx+8] ; left bottom
- psllw mm7, 3
- psubw mm6, mm7
- MMX_ABS_TWO mm5, mm6, mm2, mm3
- paddw mm5, mm0
- paddw mm6, mm0
- movq [sum+0], mm5 ; dc
- movq [sum+8], mm6 ; left
-
- movq mm0, [trans+0x20]
- movq mm1, [trans+0x28]
- movq mm2, [trans+0x30]
- movq mm3, [trans+0x38]
- movq mm4, [trans+0x40]
- movq mm5, [trans+0x48]
- movq mm6, [trans+0x50]
- movq mm7, [trans+0x58]
-
- HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
-
- movd [sum+0x10], mm0
- movd [sum+0x12], mm1
- movd [sum+0x14], mm2
- movd [sum+0x16], mm3
- movd [sum+0x18], mm4
- movd [sum+0x1a], mm5
- movd [sum+0x1c], mm6
- movd [sum+0x1e], mm7
-
- movq [spill], mm0
- movq [spill+8], mm1
- MMX_ABS_TWO mm2, mm3, mm0, mm1
- MMX_ABS_TWO mm4, mm5, mm0, mm1
- paddw mm2, mm3
- paddw mm4, mm5
- paddw mm2, mm4
- movq mm0, [spill]
- movq mm1, [spill+8]
- MMX_ABS_TWO mm6, mm7, mm4, mm5
- MMX_ABS mm1, mm4
- paddw mm2, mm7
- paddw mm1, mm6
- paddw mm2, mm1 ; 7x4 sum
- movq mm1, mm0
-
- movq mm7, [ecx+0]
- psllw mm7, 3 ; left top
-
- movzx edx, word [ecx+0]
- add dx, [ecx+16]
- lea edx, [4*edx+32]
- and edx, -64
- movd mm6, edx ; dc
-
- psubw mm1, mm7
- psubw mm0, mm6
- MMX_ABS_TWO mm0, mm1, mm5, mm6
- movq mm3, [sum+0] ; dc
- paddw mm0, mm2
- paddw mm1, mm2
- movq mm2, mm0
- paddw mm0, mm3
- paddw mm1, [sum+8] ; h
- psrlq mm2, 16
- paddw mm2, mm3
-
- movq mm3, [ecx+16] ; top left
- movq mm4, [ecx+24] ; top right
- psllw mm3, 3
- psllw mm4, 3
- psubw mm3, [sum+16]
- psubw mm4, [sum+24]
- MMX_ABS_TWO mm3, mm4, mm5, mm6
- paddw mm2, mm3
- paddw mm2, mm4 ; v
-
- SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
- mov eax, [args+8]
- movd ecx, mm2
- movd edx, mm1
- add ecx, 2
- add edx, 2
- shr ecx, 2
- shr edx, 2
- mov [eax+0], ecx ; i8x8_v satd
- mov [eax+4], edx ; i8x8_h satd
- movd ecx, mm0
- add ecx, 2
- shr ecx, 2
- mov [eax+8], ecx ; i8x8_dc satd
-
- add esp, 0x70
- ret
-%undef args
-%undef spill
-%undef trans
-%undef sum
-
-
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_4x4x2_core_mmxext
- push ebx
- push edi
- mov ebx, [esp+16]
- mov edx, [esp+24]
- mov edi, 4
- pxor mm0, mm0
-.loop:
- mov eax, [esp+12]
- mov ecx, [esp+20]
- add eax, edi
- add ecx, edi
- pxor mm1, mm1
- pxor mm2, mm2
- pxor mm3, mm3
- pxor mm4, mm4
-%rep 4
- movd mm5, [eax]
- movd mm6, [ecx]
- punpcklbw mm5, mm0
- punpcklbw mm6, mm0
- paddw mm1, mm5
- paddw mm2, mm6
- movq mm7, mm5
- pmaddwd mm5, mm5
- pmaddwd mm7, mm6
- pmaddwd mm6, mm6
- paddd mm3, mm5
- paddd mm4, mm7
- paddd mm3, mm6
- add eax, ebx
- add ecx, edx
-%endrep
- mov eax, [esp+28]
- lea eax, [eax+edi*4]
- pshufw mm5, mm1, 0xE
- pshufw mm6, mm2, 0xE
- paddusw mm1, mm5
- paddusw mm2, mm6
- punpcklwd mm1, mm2
- pshufw mm2, mm1, 0xE
- pshufw mm5, mm3, 0xE
- pshufw mm6, mm4, 0xE
- paddusw mm1, mm2
- paddd mm3, mm5
- paddd mm4, mm6
- punpcklwd mm1, mm0
- punpckldq mm3, mm4
- movq [eax+0], mm1
- movq [eax+8], mm3
- sub edi, 4
- jge .loop
- pop edi
- pop ebx
- emms
- ret
-
-
-
-; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
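-; entered only via the jmp at the end of ADS_END (below): ebp is still
-; ADS_START's frame pointer and esp points at the mask buffer it carved
-; out of the stack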
-cglobal x264_pixel_ads_mvs
- mov ebx, [ebp+24] ; mvs
- mov ecx, esp ; masks
- mov edi, [ebp+28] ; width
- mov dword [ecx+edi], 0
- push esi
- push ebp
- xor eax, eax
- xor esi, esi
-.loopi:
- mov ebp, [ecx+esi]
- mov edx, [ecx+esi+4]
- or edx, ebp
- jz .nexti
- xor edx, edx
-%macro TEST 1
- mov [ebx+eax*2], si
- test ebp, 0xff<<(%1*8)
- setne dl
- add eax, edx
- inc esi
-%endmacro
- TEST 0
- TEST 1
- TEST 2
- TEST 3
- mov ebp, [ecx+esi]
- TEST 0
- TEST 1
- TEST 2
- TEST 3
- cmp esi, edi
- jl .loopi
- jmp .end
-.nexti:
- add esi, 8
- cmp esi, edi
- jl .loopi
-.end:
- pop ebp
- pop esi
- mov edi, [ebp-8]
- mov ebx, [ebp-4]
- leave
- ret
-
-%macro ADS_START 0
- push ebp
- mov ebp, esp
- push ebx
- push edi
- mov eax, [ebp+12] ; sums
- mov ebx, [ebp+16] ; delta
- mov ecx, [ebp+20] ; cost_mvx
- mov edx, [ebp+28] ; width
- sub esp, edx
- sub esp, 4
- and esp, ~15
- mov edi, esp
- shl ebx, 1
-%endmacro
-
-%macro ADS_END 1
- add eax, 8*%1
- add ecx, 8*%1
- add edi, 4*%1
- sub edx, 4*%1
- jg .loop
- jmp x264_pixel_ads_mvs
-%endmacro
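-; each ads kernel stores one byte per candidate mv, nonzero iff the
-; dc-based cost bound beats thresh (psubusw saturates to zero otherwise),
-; then jumps to x264_pixel_ads_mvs to compact the surviving mv indices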
-
-;-----------------------------------------------------------------------------
-; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ads4_mmxext
- mov eax, [esp+4]
- movq mm6, [eax]
- movq mm4, [eax+8]
- pshufw mm7, mm6, 0
- pshufw mm6, mm6, 0xAA
- pshufw mm5, mm4, 0
- pshufw mm4, mm4, 0xAA
- ADS_START
-.loop:
- movq mm0, [eax]
- movq mm1, [eax+16]
- psubw mm0, mm7
- psubw mm1, mm6
- MMX_ABS mm0, mm2
- MMX_ABS mm1, mm3
- movq mm2, [eax+ebx]
- movq mm3, [eax+ebx+16]
- psubw mm2, mm5
- psubw mm3, mm4
- paddw mm0, mm1
- MMX_ABS mm2, mm1
- MMX_ABS mm3, mm1
- paddw mm0, mm2
- paddw mm0, mm3
- pshufw mm1, [ebp+32], 0
- paddusw mm0, [ecx]
- psubusw mm1, mm0
- packsswb mm1, mm1
- movd [edi], mm1
- ADS_END 1
-
-cglobal x264_pixel_ads2_mmxext
- mov eax, [esp+4]
- movq mm6, [eax]
- pshufw mm5, [esp+28], 0
- pshufw mm7, mm6, 0
- pshufw mm6, mm6, 0xAA
- ADS_START
-.loop:
- movq mm0, [eax]
- movq mm1, [eax+ebx]
- psubw mm0, mm7
- psubw mm1, mm6
- MMX_ABS mm0, mm2
- MMX_ABS mm1, mm3
- paddw mm0, mm1
- paddusw mm0, [ecx]
- movq mm4, mm5
- psubusw mm4, mm0
- packsswb mm4, mm4
- movd [edi], mm4
- ADS_END 1
-
-cglobal x264_pixel_ads1_mmxext
- mov eax, [esp+4]
- pshufw mm7, [eax], 0
- pshufw mm6, [esp+28], 0
- ADS_START
-.loop:
- movq mm0, [eax]
- movq mm1, [eax+8]
- psubw mm0, mm7
- psubw mm1, mm7
- MMX_ABS mm0, mm2
- MMX_ABS mm1, mm3
- paddusw mm0, [ecx]
- paddusw mm1, [ecx+8]
- movq mm4, mm6
- movq mm5, mm6
- psubusw mm4, mm0
- psubusw mm5, mm1
- packsswb mm4, mm5
- movq [edi], mm4
- ADS_END 2
-
-%macro ADS_SSE2 1
-cglobal x264_pixel_ads4_%1
- mov eax, [esp+4] ; enc_dc
- movdqa xmm4, [eax]
- pshuflw xmm7, xmm4, 0
- pshuflw xmm6, xmm4, 0xAA
- pshufhw xmm5, xmm4, 0
- pshufhw xmm4, xmm4, 0xAA
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
- punpckhqdq xmm5, xmm5
- punpckhqdq xmm4, xmm4
- ADS_START
-.loop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax+16]
- psubw xmm0, xmm7
- psubw xmm1, xmm6
- MMX_ABS xmm0, xmm2
- MMX_ABS xmm1, xmm3
- movdqu xmm2, [eax+ebx]
- movdqu xmm3, [eax+ebx+16]
- psubw xmm2, xmm5
- psubw xmm3, xmm4
- paddw xmm0, xmm1
- MMX_ABS xmm2, xmm1
- MMX_ABS xmm3, xmm1
- paddw xmm0, xmm2
- paddw xmm0, xmm3
- movd xmm1, [ebp+32] ; thresh
- movdqu xmm2, [ecx]
- pshuflw xmm1, xmm1, 0
- punpcklqdq xmm1, xmm1
- paddusw xmm0, xmm2
- psubusw xmm1, xmm0
- packsswb xmm1, xmm1
- movq [edi], xmm1
- ADS_END 2
-
-cglobal x264_pixel_ads2_%1
- mov eax, [esp+4] ; enc_dc
- movq xmm6, [eax]
- movd xmm5, [esp+28] ; thresh
- pshuflw xmm7, xmm6, 0
- pshuflw xmm6, xmm6, 0xAA
- pshuflw xmm5, xmm5, 0
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
- punpcklqdq xmm5, xmm5
- ADS_START
-.loop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax+ebx]
- psubw xmm0, xmm7
- psubw xmm1, xmm6
- movdqu xmm4, [ecx]
- MMX_ABS xmm0, xmm2
- MMX_ABS xmm1, xmm3
- paddw xmm0, xmm1
- paddusw xmm0, xmm4
- movdqa xmm1, xmm5
- psubusw xmm1, xmm0
- packsswb xmm1, xmm1
- movq [edi], xmm1
- ADS_END 2
-
-cglobal x264_pixel_ads1_%1
- mov eax, [esp+4] ; enc_dc
- movd xmm7, [eax]
- movd xmm6, [esp+28] ; thresh
- pshuflw xmm7, xmm7, 0
- pshuflw xmm6, xmm6, 0
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
- ADS_START
-.loop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax+16]
- psubw xmm0, xmm7
- psubw xmm1, xmm7
- movdqu xmm2, [ecx]
- movdqu xmm3, [ecx+16]
- MMX_ABS xmm0, xmm4
- MMX_ABS xmm1, xmm5
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
- movdqa xmm4, xmm6
- movdqa xmm5, xmm6
- psubusw xmm4, xmm0
- psubusw xmm5, xmm1
- packsswb xmm4, xmm5
- movdqa [edi], xmm4
- ADS_END 4
-%endmacro
-
-ADS_SSE2 sse2
-%ifdef HAVE_SSE3
-%macro MMX_ABS 2
- pabsw %1, %1
-%endmacro
-ADS_SSE2 ssse3
-%endif
+++ /dev/null
-;*****************************************************************************
-;* pixel-sse2.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2005 x264 project
-;*
-;* Authors: Alex Izvorski <aizvorksi@gmail.com>
-;* Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-BITS 32
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "i386inc.asm"
-
-SECTION_RODATA
-
-pw_1: times 8 dw 1
-ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
-ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
-mask_ff: times 16 db 0xff
- times 16 db 0
-
-
-SECTION .text
-
-%macro HADDW 2 ; sum junk
- ; ebx is no longer used at this point, so no push needed
- picgetgot ebx
- pmaddwd %1, [pw_1 GLOBAL]
- movhlps %2, %1
- paddd %1, %2
- pshuflw %2, %1, 0xE
- paddd %1, %2
-%endmacro
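-; pmaddwd against a vector of ones sums adjacent word pairs into dwords;
-; the movhlps/pshuflw folds then reduce the four dwords to a single sum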
-
-%macro SAD_START_SSE2 0
- push ebx
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-%endmacro
-
-%macro SAD_END_SSE2 0
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- movd eax, xmm0
- pop ebx
- ret
-%endmacro
-
-%macro SAD_W16 1
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x16_%1
- SAD_START_SSE2
- movdqu xmm0, [ecx]
- movdqu xmm1, [ecx+edx]
- lea ecx, [ecx+2*edx]
- movdqu xmm2, [ecx]
- movdqu xmm3, [ecx+edx]
- lea ecx, [ecx+2*edx]
- psadbw xmm0, [eax]
- psadbw xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movdqu xmm4, [ecx]
- paddw xmm0, xmm1
- psadbw xmm2, [eax]
- psadbw xmm3, [eax+ebx]
- lea eax, [eax+2*ebx]
- movdqu xmm5, [ecx+edx]
- lea ecx, [ecx+2*edx]
- paddw xmm2, xmm3
- movdqu xmm6, [ecx]
- movdqu xmm7, [ecx+edx]
- lea ecx, [ecx+2*edx]
- paddw xmm0, xmm2
- psadbw xmm4, [eax]
- psadbw xmm5, [eax+ebx]
- lea eax, [eax+2*ebx]
- movdqu xmm1, [ecx]
- paddw xmm4, xmm5
- psadbw xmm6, [eax]
- psadbw xmm7, [eax+ebx]
- lea eax, [eax+2*ebx]
- movdqu xmm2, [ecx+edx]
- lea ecx, [ecx+2*edx]
- paddw xmm6, xmm7
- movdqu xmm3, [ecx]
- paddw xmm0, xmm4
- movdqu xmm4, [ecx+edx]
- lea ecx, [ecx+2*edx]
- paddw xmm0, xmm6
- psadbw xmm1, [eax]
- psadbw xmm2, [eax+ebx]
- lea eax, [eax+2*ebx]
- movdqu xmm5, [ecx]
- paddw xmm1, xmm2
- psadbw xmm3, [eax]
- psadbw xmm4, [eax+ebx]
- lea eax, [eax+2*ebx]
- movdqu xmm6, [ecx+edx]
- lea ecx, [ecx+2*edx]
- paddw xmm3, xmm4
- movdqu xmm7, [ecx]
- paddw xmm0, xmm1
- movdqu xmm1, [ecx+edx]
- paddw xmm0, xmm3
- psadbw xmm5, [eax]
- psadbw xmm6, [eax+ebx]
- lea eax, [eax+2*ebx]
- paddw xmm5, xmm6
- psadbw xmm7, [eax]
- psadbw xmm1, [eax+ebx]
- paddw xmm7, xmm1
- paddw xmm0, xmm5
- paddw xmm0, xmm7
- SAD_END_SSE2
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x8_%1
- SAD_START_SSE2
- movdqu xmm0, [ecx]
- movdqu xmm2, [ecx+edx]
- lea ecx, [ecx+2*edx]
- movdqu xmm3, [ecx]
- movdqu xmm4, [ecx+edx]
- psadbw xmm0, [eax]
- psadbw xmm2, [eax+ebx]
- lea eax, [eax+2*ebx]
- psadbw xmm3, [eax]
- psadbw xmm4, [eax+ebx]
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- paddw xmm0, xmm2
- paddw xmm3, xmm4
- paddw xmm0, xmm3
- movdqu xmm1, [ecx]
- movdqu xmm2, [ecx+edx]
- lea ecx, [ecx+2*edx]
- movdqu xmm3, [ecx]
- movdqu xmm4, [ecx+edx]
- psadbw xmm1, [eax]
- psadbw xmm2, [eax+ebx]
- lea eax, [eax+2*ebx]
- psadbw xmm3, [eax]
- psadbw xmm4, [eax+ebx]
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- paddw xmm1, xmm2
- paddw xmm3, xmm4
- paddw xmm0, xmm1
- paddw xmm0, xmm3
- SAD_END_SSE2
-%endmacro
-
-SAD_W16 sse2
-%ifdef HAVE_SSE3
-%define movdqu lddqu
-SAD_W16 sse3
-%undef movdqu
-%endif
-
-
-; sad x3 / x4
-
-%macro SAD_X3_START_1x16P 0
- push edi
- push esi
- mov edi, [esp+12]
- mov eax, [esp+16]
- mov ecx, [esp+20]
- mov edx, [esp+24]
- mov esi, [esp+28]
- movdqa xmm3, [edi]
- movdqu xmm0, [eax]
- movdqu xmm1, [ecx]
- movdqu xmm2, [edx]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
-%endmacro
-
-%macro SAD_X3_1x16P 2
- movdqa xmm3, [edi+%1]
- movdqu xmm4, [eax+%2]
- movdqu xmm5, [ecx+%2]
- movdqu xmm6, [edx+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm6, xmm3
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
-%endmacro
-
-%macro SAD_X3_2x16P 1
-%if %1
- SAD_X3_START_1x16P
-%else
- SAD_X3_1x16P 0, 0
-%endif
- SAD_X3_1x16P FENC_STRIDE, esi
- add edi, 2*FENC_STRIDE
- lea eax, [eax+2*esi]
- lea ecx, [ecx+2*esi]
- lea edx, [edx+2*esi]
-%endmacro
-
-%macro SAD_X4_START_1x16P 0
- push edi
- push esi
- push ebx
- mov edi, [esp+16]
- mov eax, [esp+20]
- mov ebx, [esp+24]
- mov ecx, [esp+28]
- mov edx, [esp+32]
- mov esi, [esp+36]
- movdqa xmm7, [edi]
- movdqu xmm0, [eax]
- movdqu xmm1, [ebx]
- movdqu xmm2, [ecx]
- movdqu xmm3, [edx]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
- psadbw xmm3, xmm7
-%endmacro
-
-%macro SAD_X4_1x16P 2
- movdqa xmm7, [edi+%1]
- movdqu xmm4, [eax+%2]
- movdqu xmm5, [ebx+%2]
- movdqu xmm6, [ecx+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm4
- psadbw xmm6, xmm7
- movdqu xmm4, [edx+%2]
- paddw xmm1, xmm5
- psadbw xmm4, xmm7
- paddw xmm2, xmm6
- paddw xmm3, xmm4
-%endmacro
-
-%macro SAD_X4_2x16P 1
-%if %1
- SAD_X4_START_1x16P
-%else
- SAD_X4_1x16P 0, 0
-%endif
- SAD_X4_1x16P FENC_STRIDE, esi
- add edi, 2*FENC_STRIDE
- lea eax, [eax+2*esi]
- lea ebx, [ebx+2*esi]
- lea ecx, [ecx+2*esi]
- lea edx, [edx+2*esi]
-%endmacro
-
-%macro SAD_X3_END 0
- mov eax, [esp+32]
- pshufd xmm4, xmm0, 2
- pshufd xmm5, xmm1, 2
- pshufd xmm6, xmm2, 2
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- movd [eax+0], xmm0
- movd [eax+4], xmm1
- movd [eax+8], xmm2
- pop esi
- pop edi
- ret
-%endmacro
-
-%macro SAD_X4_END 0
- mov eax, [esp+40]
- psllq xmm1, 32
- psllq xmm3, 32
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- pshufd xmm1, xmm0, 14
- pshufd xmm3, xmm2, 14
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movq [eax+0], xmm0
- movq [eax+8], xmm2
- pop ebx
- pop esi
- pop edi
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
-;-----------------------------------------------------------------------------
-%macro SAD_X 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4
- SAD_X%1_2x%2P 1
-%rep %3/2-1
- SAD_X%1_2x%2P 0
-%endrep
- SAD_X%1_END
-%endmacro
-
-SAD_X 3, 16, 16, sse2
-SAD_X 3, 16, 8, sse2
-SAD_X 4, 16, 16, sse2
-SAD_X 4, 16, 8, sse2
-
-%ifdef HAVE_SSE3
-%define movdqu lddqu
-SAD_X 3, 16, 16, sse3
-SAD_X 3, 16, 8, sse3
-SAD_X 4, 16, 16, sse3
-SAD_X 4, 16, 8, sse3
-%undef movdqu
-%endif
-
-
-; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
-; unless the unaligned data spans the border between 2 cachelines, in which
-; case it's really slow. The exact numbers may differ, but all Intel CPUs
-; have a large penalty for cacheline splits.
-; (8-byte alignment exactly halfway between two cachelines is ok, though.)
-; LDDQU was supposed to fix this, but it only works on Pentium 4.
-; So in the split case we load aligned data and explicitly perform the
-; alignment between registers, as on archs that have only aligned loads,
-; except complicated by the fact that PALIGNR takes only an immediate,
-; not a variable alignment.
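-; e.g. with pix2 5 bytes past a 16-byte boundary, the dqword at pix2 is
-; rebuilt as ([pix2-5] psrldq 5) por ([pix2+11] pslldq 11); the loops
-; below bake each possible alignment into one of 15 copies and pick the
-; right copy with a computed jump of loop_size * alignment bytes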
-
-; computed jump assumes this loop is exactly 80 bytes
-%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
-ALIGN 16
-sad_w16_align%1_sse2:
- movdqa xmm1, [ecx+16]
- movdqa xmm2, [ecx+edx+16]
- movdqa xmm3, [ecx]
- movdqa xmm4, [ecx+edx]
- pslldq xmm1, 16-%1
- pslldq xmm2, 16-%1
- psrldq xmm3, %1
- psrldq xmm4, %1
- por xmm1, xmm3
- por xmm2, xmm4
- psadbw xmm1, [eax]
- psadbw xmm2, [eax+ebx]
- paddw xmm0, xmm1
- paddw xmm0, xmm2
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- dec esi
- jg sad_w16_align%1_sse2
- ret
-%endmacro
-
-; computed jump assumes this loop is exactly 64 bytes
-%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
-ALIGN 16
-sad_w16_align%1_ssse3:
- movdqa xmm1, [ecx+16]
- movdqa xmm2, [ecx+edx+16]
- palignr xmm1, [ecx], %1
- palignr xmm2, [ecx+edx], %1
- psadbw xmm1, [eax]
- psadbw xmm2, [eax+ebx]
- paddw xmm0, xmm1
- paddw xmm0, xmm2
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- dec esi
- jg sad_w16_align%1_ssse3
- ret
-%endmacro
-
-%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
-cglobal x264_pixel_sad_16x%2_cache64_%1
- mov eax, [esp+12]
- and eax, 0x37
- cmp eax, 0x30
- jle x264_pixel_sad_16x%2_sse2
- mov eax, [esp+12]
- push ebx
- push edi
- push esi
- and eax, 15
-%ifidn %1, ssse3
- shl eax, 6
-%else
- lea eax, [eax*5]
- shl eax, 4
-%endif
- picgetgot ebx
- lea edi, [sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1) + eax GLOBAL]
- mov eax, [esp+16]
- mov ebx, [esp+20]
- mov ecx, [esp+24]
- mov edx, [esp+28]
- and ecx, ~15
- mov esi, %2/2
- pxor xmm0, xmm0
- call edi
- pop esi
- pop edi
- SAD_END_SSE2
-%endmacro
-
-%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
- mov eax, [esp+12]
- and eax, 0x17|%2|(%4>>1)
- cmp eax, 0x10|%2|(%4>>1)
- jle x264_pixel_sad_%1x%2_mmxext
- push ebx
- push esi
- and eax, 7
- shl eax, 3
- mov ecx, 64
- sub ecx, eax
- movd mm7, eax
- movd mm6, ecx
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov ecx, [esp+20]
- mov edx, [esp+24]
- and ecx, ~7
- mov esi, %3
- pxor mm0, mm0
-%endmacro
-
-%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_16x%1_cache%2_mmxext
- SAD_CACHELINE_START_MMX2 16, %1, %1, %2
-.loop:
- movq mm1, [ecx]
- movq mm2, [ecx+8]
- movq mm3, [ecx+16]
- movq mm4, mm2
- psrlq mm1, mm7
- psllq mm2, mm6
- psllq mm3, mm6
- psrlq mm4, mm7
- por mm1, mm2
- por mm3, mm4
- psadbw mm1, [eax]
- psadbw mm3, [eax+8]
- paddw mm0, mm1
- paddw mm0, mm3
- add ecx, edx
- add eax, ebx
- dec esi
- jg .loop
- pop esi
- pop ebx
- movd eax, mm0
- ret
-%endmacro
-
-%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_8x%1_cache%2_mmxext
- SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
-.loop:
- movq mm1, [ecx+8]
- movq mm2, [ecx+edx+8]
- movq mm3, [ecx]
- movq mm4, [ecx+edx]
- psllq mm1, mm6
- psllq mm2, mm6
- psrlq mm3, mm7
- psrlq mm4, mm7
- por mm1, mm3
- por mm2, mm4
- psadbw mm1, [eax]
- psadbw mm2, [eax+ebx]
- paddw mm0, mm1
- paddw mm0, mm2
- lea ecx, [ecx+2*edx]
- lea eax, [eax+2*ebx]
- dec esi
- jg .loop
- pop esi
- pop ebx
- movd eax, mm0
- ret
-%endmacro
-
-
-; sad_x3/x4_cache64: check each mv.
-; if they're all within a cacheline, use normal sad_x3/x4.
-; otherwise, send them individually to sad_cache64.
-%macro CHECK_SPLIT 3 ; pix, width, cacheline
- mov eax, %1
- and eax, 0x17|%2|(%3>>1)
- cmp eax, 0x10|%2|(%3>>1)
- jg .split
-%endmacro
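-; mask arithmetic, e.g. for width 16 / cacheline 64: (pix & 0x37) > 0x30
-; iff the load starts at a non-8-aligned offset in the last 16 bytes of
-; a 64-byte line; bit 3 is left out of the mask because 8-byte-aligned
-; loads don't pay the split penalty (as noted above)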
-
-%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
- CHECK_SPLIT [esp+8], %1, %3
- CHECK_SPLIT [esp+12], %1, %3
- CHECK_SPLIT [esp+16], %1, %3
- jmp x264_pixel_sad_x3_%1x%2_%4
-.split:
- push edi
- mov edi, [esp+28]
- push dword [esp+24]
- push dword [esp+16]
-    push dword 16 ; FENC_STRIDE
- push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+32]
- mov [edi], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+36]
- mov [edi+4], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [edi+8], eax
- add esp, 16
- pop edi
- ret
-%endmacro
-
-%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
- CHECK_SPLIT [esp+8], %1, %3
- CHECK_SPLIT [esp+12], %1, %3
- CHECK_SPLIT [esp+16], %1, %3
- CHECK_SPLIT [esp+20], %1, %3
- jmp x264_pixel_sad_x4_%1x%2_%4
-.split:
- push edi
- mov edi, [esp+32]
- push dword [esp+28]
- push dword [esp+16]
-    push dword 16 ; FENC_STRIDE
- push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+32]
- mov [edi], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+36]
- mov [edi+4], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+40]
- mov [edi+8], eax
- mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [edi+12], eax
- add esp, 16
- pop edi
- ret
-%endmacro
-
-%macro SADX34_CACHELINE_FUNC 5
- SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
- SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
-%endmacro
-
-cextern x264_pixel_sad_16x16_mmxext
-cextern x264_pixel_sad_16x8_mmxext
-cextern x264_pixel_sad_8x16_mmxext
-cextern x264_pixel_sad_8x8_mmxext
-cextern x264_pixel_sad_8x4_mmxext
-cextern x264_pixel_sad_x3_16x16_mmxext
-cextern x264_pixel_sad_x3_16x8_mmxext
-cextern x264_pixel_sad_x3_8x16_mmxext
-cextern x264_pixel_sad_x3_8x8_mmxext
-cextern x264_pixel_sad_x4_16x16_mmxext
-cextern x264_pixel_sad_x4_16x8_mmxext
-cextern x264_pixel_sad_x4_8x16_mmxext
-cextern x264_pixel_sad_x4_8x8_mmxext
-
-; instantiate the aligned sads
-
-SAD16_CACHELINE_FUNC sse2, 8
-SAD16_CACHELINE_FUNC sse2, 16
-%assign i 1
-%rep 15
-SAD16_CACHELINE_LOOP_SSE2 i
-%assign i i+1
-%endrep
-
-SAD16_CACHELINE_FUNC_MMX2 16, 32
-SAD8_CACHELINE_FUNC_MMX2 4, 32
-SAD8_CACHELINE_FUNC_MMX2 8, 32
-SAD8_CACHELINE_FUNC_MMX2 16, 32
-SAD16_CACHELINE_FUNC_MMX2 8, 64
-SAD16_CACHELINE_FUNC_MMX2 16, 64
-SAD8_CACHELINE_FUNC_MMX2 4, 64
-SAD8_CACHELINE_FUNC_MMX2 8, 64
-SAD8_CACHELINE_FUNC_MMX2 16, 64
-SAD16_CACHELINE_FUNC_MMX2 8, 32
-
-SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
-SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
-
-%ifdef HAVE_SSE3
-
-SAD16_CACHELINE_FUNC ssse3, 8
-SAD16_CACHELINE_FUNC ssse3, 16
-%assign i 1
-%rep 15
-SAD16_CACHELINE_LOOP_SSSE3 i
-%assign i i+1
-%endrep
-
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
-SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3
-
-%endif ; HAVE_SSE3
-
-
-%macro SSD_INC_2x16P_SSE2 0
- movdqu xmm1, [eax]
- movdqu xmm2, [ecx]
- movdqu xmm3, [eax+ebx]
- movdqu xmm4, [ecx+edx]
-
- movdqa xmm5, xmm1
- movdqa xmm6, xmm3
- psubusb xmm1, xmm2
- psubusb xmm3, xmm4
- psubusb xmm2, xmm5
- psubusb xmm4, xmm6
- por xmm1, xmm2
- por xmm3, xmm4
-
- movdqa xmm2, xmm1
- movdqa xmm4, xmm3
- punpcklbw xmm1, xmm7
- punpckhbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- punpckhbw xmm4, xmm7
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- pmaddwd xmm4, xmm4
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
-
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm0, xmm1
- paddd xmm0, xmm3
-%endmacro
-
-%macro SSD_START_SSE2 0
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-
- pxor xmm7, xmm7 ; zero
-    pxor xmm0, xmm0 ; xmm0 holds the sum
-%endmacro
-
-%macro SSD_END_SSE2 0
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddd xmm0, xmm1
-
- movdqa xmm1, xmm0
- psrldq xmm1, 4
- paddd xmm0, xmm1
-
- movd eax, xmm0
-
- pop ebx
- ret
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssd_16x16_sse2
- SSD_START_SSE2
-%rep 8
- SSD_INC_2x16P_SSE2
-%endrep
- SSD_END_SSE2
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssd_16x8_sse2
- SSD_START_SSE2
-%rep 4
- SSD_INC_2x16P_SSE2
-%endrep
- SSD_END_SSE2
-
-
-
-%macro SUMSUB_BADC 4
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
-%endmacro
-
-%macro HADAMARD1x4 4
- SUMSUB_BADC %1, %2, %3, %4
- SUMSUB_BADC %1, %3, %2, %4
-%endmacro
-
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers
- mov%1 %5, %3
- punpckh%2 %3, %4
- punpckl%2 %5, %4
-%endmacro
-
-%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
- SBUTTERFLY dqa, dq, %1, %2, %5
- SBUTTERFLY dqa, dq, %3, %4, %2
- SBUTTERFLY dqa, qdq, %1, %3, %4
- SBUTTERFLY dqa, qdq, %5, %2, %3
-%endmacro
-
-%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
- SBUTTERFLY dqa, wd, %1, %2, %5
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, dq, %1, %3, %4
- SBUTTERFLY2 dqa, dq, %5, %2, %3
- SBUTTERFLY dqa, qdq, %1, %3, %2
- SBUTTERFLY2 dqa, qdq, %4, %5, %3
-%endmacro
-
-%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
- movq %1, %3
- movq %2, %4
- punpcklbw %1, %2
- punpcklbw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro SUM4x4_SSE2 4 ; 02 13 junk sum
- pxor %3, %3
- psubw %3, %1
- pmaxsw %1, %3
-
- pxor %3, %3
- psubw %3, %2
- pmaxsw %2, %3
-
- paddusw %4, %1
- paddusw %4, %2
-%endmacro
-
-%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
- pxor %3, %3
- pxor %6, %6
- psubw %3, %1
- psubw %6, %4
- pmaxsw %1, %3
- pmaxsw %4, %6
- pxor %3, %3
- pxor %6, %6
- psubw %3, %2
- psubw %6, %5
- pmaxsw %2, %3
- pmaxsw %5, %6
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
-%endmacro
-
-%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . sum
- pabsw %1, %1
- pabsw %2, %2
- pabsw %4, %4
- pabsw %5, %5
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
-%endmacro
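-; ssse3's pabsw replaces the three-instruction sse2 abs (zero, subtract,
-; max) with a single op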
-
-%macro SATD_TWO_SSE2 0
- LOAD_DIFF_8P xmm0, xmm4, [eax], [ecx]
- LOAD_DIFF_8P xmm1, xmm5, [eax+ebx], [ecx+edx]
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- LOAD_DIFF_8P xmm2, xmm4, [eax], [ecx]
- LOAD_DIFF_8P xmm3, xmm5, [eax+ebx], [ecx+edx]
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
-
- HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
- TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
- HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
- SUM8x4 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
-%endmacro
-
-%macro SATD_START 0
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-
- pxor xmm6, xmm6
-%endmacro
-
-%macro SATD_END 0
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
- psrlw xmm6, 1
- HADDW xmm6, xmm7
- movd eax, xmm6
- pop ebx
- ret
-%endmacro
-
-%macro SATDS 1
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_16x16_%1
- SATD_START
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- mov eax, [esp+ 8]
- mov ecx, [esp+16]
- add eax, 8
- add ecx, 8
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x16_%1
- SATD_START
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_16x8_%1
- SATD_START
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- mov eax, [esp+ 8]
- mov ecx, [esp+16]
- add eax, 8
- add ecx, 8
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x8_%1
- SATD_START
- SATD_TWO_SSE2
- SATD_TWO_SSE2
- SATD_END
-
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_satd_8x4_%1
- SATD_START
- SATD_TWO_SSE2
- SATD_END
-%endmacro ; SATDS
-
-%define SUM8x4 SUM8x4_SSE2
-SATDS sse2
-%ifdef HAVE_SSE3
-%define SUM8x4 SUM8x4_SSSE3
-SATDS ssse3
-%endif
-
-
-
-;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_4x4x2_core_sse2
- push ebx
- mov eax, [esp+ 8]
- mov ebx, [esp+12]
- mov ecx, [esp+16]
- mov edx, [esp+20]
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm2, xmm2
- pxor xmm3, xmm3
- pxor xmm4, xmm4
-%rep 4
- movq xmm5, [eax]
- movq xmm6, [ecx]
- punpcklbw xmm5, xmm0
- punpcklbw xmm6, xmm0
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- movdqa xmm7, xmm5
- pmaddwd xmm5, xmm5
- pmaddwd xmm7, xmm6
- pmaddwd xmm6, xmm6
- paddd xmm3, xmm5
- paddd xmm4, xmm7
- paddd xmm3, xmm6
- add eax, ebx
- add ecx, edx
-%endrep
-    ; equivalent to SSSE3 "phaddw xmm1, xmm2" / "phaddd xmm3, xmm4",
-    ; emulated below with SSE2 shuffles and adds
- mov eax, [esp+24]
- picgetgot ebx
- movdqa xmm7, [pw_1 GLOBAL]
- pshufd xmm5, xmm3, 0xB1
- pmaddwd xmm1, xmm7
- pmaddwd xmm2, xmm7
- pshufd xmm6, xmm4, 0xB1
- packssdw xmm1, xmm2
- paddd xmm3, xmm5
- pshufd xmm1, xmm1, 0xD8
- paddd xmm4, xmm6
- pmaddwd xmm1, xmm7
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
- punpckhdq xmm5, xmm4
- movq [eax+ 0], xmm1
- movq [eax+ 8], xmm3
- psrldq xmm1, 8
- movq [eax+16], xmm1
- movq [eax+24], xmm5
- pop ebx
- ret
-
-;-----------------------------------------------------------------------------
-; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
-;-----------------------------------------------------------------------------
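-; evaluates, per 4-wide group (see the float comments below):
-;   ssim = (2*s1*s2 + c1)*(2*covar + c2) / ((s1*s1 + s2*s2 + c1)*(vars + c2))
-; where 2*covar = 2*(64*s12 - s1*s2) and vars = 64*ss - (s1*s1 + s2*s2);
-; the sample count (64) is folded into the constants ssim_c1/ssim_c2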
-cglobal x264_pixel_ssim_end4_sse2
- mov eax, [esp+ 4]
- mov ecx, [esp+ 8]
- mov edx, [esp+12]
- picpush ebx
- picgetgot ebx
- movdqa xmm0, [eax+ 0]
- movdqa xmm1, [eax+16]
- movdqa xmm2, [eax+32]
- movdqa xmm3, [eax+48]
- movdqa xmm4, [eax+64]
- paddd xmm0, [ecx+ 0]
- paddd xmm1, [ecx+16]
- paddd xmm2, [ecx+32]
- paddd xmm3, [ecx+48]
- paddd xmm4, [ecx+64]
- paddd xmm0, xmm1
- paddd xmm1, xmm2
- paddd xmm2, xmm3
- paddd xmm3, xmm4
- movdqa xmm5, [ssim_c1 GLOBAL]
- movdqa xmm6, [ssim_c2 GLOBAL]
- TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
-
-; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2
- movdqa xmm1, xmm3
- pslld xmm3, 16
- pmaddwd xmm1, xmm0 ; s1*s2
- por xmm0, xmm3
- pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
- pslld xmm1, 1
- pslld xmm2, 7
- pslld xmm4, 6
- psubd xmm2, xmm1 ; covar*2
- psubd xmm4, xmm0 ; vars
- paddd xmm0, xmm5
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm4, xmm6
- cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
- cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
- cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
- cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
- mulps xmm1, xmm2
- mulps xmm0, xmm4
- divps xmm1, xmm0 ; ssim
-
- neg edx
- movdqu xmm3, [mask_ff + edx*4 + 16 GLOBAL]
- pand xmm1, xmm3
- movhlps xmm0, xmm1
- addps xmm0, xmm1
- pshuflw xmm1, xmm0, 0xE
- addss xmm0, xmm1
-
- movd [picesp+4], xmm0
- fld dword [picesp+4]
- picpop ebx
- ret
-
+++ /dev/null
-;*****************************************************************************
-;* predict-a.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2005 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-BITS 32
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "i386inc.asm"
-
-%macro STORE8x8 2
- movq [edx + 0*FDEC_STRIDE], %1
- movq [edx + 1*FDEC_STRIDE], %1
- movq [edx + 2*FDEC_STRIDE], %1
- movq [edx + 3*FDEC_STRIDE], %1
- movq [edx + 4*FDEC_STRIDE], %2
- movq [edx + 5*FDEC_STRIDE], %2
- movq [edx + 6*FDEC_STRIDE], %2
- movq [edx + 7*FDEC_STRIDE], %2
-%endmacro
-
-%macro STORE16x16 2
- mov eax, 4
-.loop:
- movq [edx + 0*FDEC_STRIDE], %1
- movq [edx + 1*FDEC_STRIDE], %1
- movq [edx + 2*FDEC_STRIDE], %1
- movq [edx + 3*FDEC_STRIDE], %1
- movq [edx + 0*FDEC_STRIDE + 8], %2
- movq [edx + 1*FDEC_STRIDE + 8], %2
- movq [edx + 2*FDEC_STRIDE + 8], %2
- movq [edx + 3*FDEC_STRIDE + 8], %2
- add edx, 4*FDEC_STRIDE
- dec eax
- jg .loop
- nop
-%endmacro
-
-%macro STORE16x16_SSE2 1
- mov eax, 4
-.loop:
- movdqa [edx + 0*FDEC_STRIDE], %1
- movdqa [edx + 1*FDEC_STRIDE], %1
- movdqa [edx + 2*FDEC_STRIDE], %1
- movdqa [edx + 3*FDEC_STRIDE], %1
- add edx, 4*FDEC_STRIDE
- dec eax
- jg .loop
- nop
-%endmacro
-
-SECTION_RODATA
-
-ALIGN 16
-pb_1: times 16 db 1
-pw_2: times 4 dw 2
-pw_4: times 4 dw 4
-pw_8: times 8 dw 8
-pw_76543210:
-pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
-pb_00s_ff: times 8 db 0
-pb_0s_ff: times 7 db 0
- db 0xff
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-; dest, left, right, src, tmp
-; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
-%macro PRED8x8_LOWPASS0 6
- mov%6 %5, %2
- pavgb %2, %3
- pxor %3, %5
- mov%6 %1, %4
- pand %3, [pb_1 GLOBAL]
- psubusb %2, %3
- pavgb %1, %2
-%endmacro
-%macro PRED8x8_LOWPASS 5
- PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
-%endmacro
-%macro PRED8x8_LOWPASS_XMM 5
- PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
-%endmacro
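-; pavgb rounds upward, so composing two averages would over-round by the
-; bit the inner avg of the neighbors discards; subtracting that bit,
-; ((left^right) & 1), makes the result exactly (t[n-1]+2*t[n]+t[n+1]+2)>>2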
-
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_ddl_mmxext( uint8_t *src )
-;-----------------------------------------------------------------------------
-cglobal predict_4x4_ddl_mmxext
- mov eax, [esp + 4]
- picgetgot ecx
- movq mm3, [eax - FDEC_STRIDE ]
- movq mm1, [eax - FDEC_STRIDE - 1]
- movq mm2, mm3
- movq mm4, [pb_0s_ff GLOBAL]
- psrlq mm2, 8
- pand mm4, mm3
- por mm2, mm4
- PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
-%assign Y 0
-%rep 4
- psrlq mm0, 8
- movd [eax + Y * FDEC_STRIDE], mm0
-%assign Y (Y+1)
-%endrep
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_4x4_vl_mmxext( uint8_t *src )
-;-----------------------------------------------------------------------------
-cglobal predict_4x4_vl_mmxext
- mov eax, [esp + 4]
- picgetgot ecx
- movq mm1, [eax - FDEC_STRIDE]
- movq mm3, mm1
- movq mm2, mm1
- psrlq mm3, 8
- psrlq mm2, 16
- movq mm4, mm3
- pavgb mm4, mm1
- PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
- movd [eax + 0*FDEC_STRIDE], mm4
- movd [eax + 1*FDEC_STRIDE], mm0
- psrlq mm4, 8
- psrlq mm0, 8
- movd [eax + 2*FDEC_STRIDE], mm4
- movd [eax + 3*FDEC_STRIDE], mm0
-
- ret
-
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_v_mmxext
- mov eax, [esp+8]
- mov edx, [esp+4]
- movq mm0, [eax+16]
- STORE8x8 mm0, mm0
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_dc_mmxext
- picgetgot ecx
- mov eax, [esp + 8]
- mov edx, [esp + 4]
- pxor mm0, mm0
- pxor mm1, mm1
- psadbw mm0, [eax+7]
- psadbw mm1, [eax+16]
- paddw mm0, [pw_8 GLOBAL]
- paddw mm0, mm1
- psrlw mm0, 4
- pshufw mm0, mm0, 0
- packuswb mm0, mm0
- STORE8x8 mm0, mm0
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_top_mmxext( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-%macro PRED8x8_DC 2
-cglobal %1
- picgetgot ecx
- mov eax, [esp + 8]
- mov edx, [esp + 4]
- pxor mm0, mm0
- psadbw mm0, [eax+%2]
- paddw mm0, [pw_4 GLOBAL]
- psrlw mm0, 3
- pshufw mm0, mm0, 0
- packuswb mm0, mm0
- STORE8x8 mm0, mm0
- ret
-%endmacro
-
-PRED8x8_DC predict_8x8_dc_top_mmxext, 16
-PRED8x8_DC predict_8x8_dc_left_mmxext, 7
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_mmxext
- picgetgot ecx
- mov eax, [esp + 8]
- mov edx, [esp + 4]
- movq mm1, [eax + 15]
- movq mm2, [eax + 17]
- movq mm3, [eax + 23]
- movq mm4, [eax + 25]
- PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 16], mm7
- PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 24], mm6
-
-%assign Y 7
-%rep 6
- movq [edx + Y*FDEC_STRIDE], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
-%assign Y (Y-1)
-%endrep
- movq [edx + Y*FDEC_STRIDE], mm1
- psllq mm1, 8
- psrlq mm0, 56
- por mm1, mm0
-%assign Y (Y-1)
- movq [edx + Y*FDEC_STRIDE], mm1
-
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_mmxext
- picgetgot ecx
- mov eax, [esp + 8]
- mov edx, [esp + 4]
- movq mm1, [eax + 7]
- movq mm2, [eax + 9]
- movq mm3, [eax + 15]
- movq mm4, [eax + 17]
- PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 8], mm7
- PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 16], mm6
-
-%assign Y 7
-%rep 6
- movq [edx + Y*FDEC_STRIDE], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
-%assign Y (Y-1)
-%endrep
- movq [edx + Y*FDEC_STRIDE], mm0
- psrlq mm0, 8
- psllq mm1, 56
- por mm0, mm1
-%assign Y (Y-1)
- movq [edx + Y*FDEC_STRIDE], mm0
-
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-
-; fills only some pixels:
-; f01234567
-; 0........
-; 1,,,,,,,,
-; 2 .......
-; 3 ,,,,,,,
-; 4 ......
-; 5 ,,,,,,
-; 6 .....
-; 7 ,,,,,
-
-cglobal predict_8x8_vr_core_mmxext
- picgetgot ecx
- mov eax, [esp + 8]
- mov edx, [esp + 4]
- movq mm2, [eax + 16]
- movq mm3, [eax + 15]
- movq mm1, [eax + 14]
- movq mm4, mm3
- pavgb mm3, mm2
- PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
-
-%assign Y 0
-%rep 3
- movq [edx + Y *FDEC_STRIDE], mm3
- movq [edx + (Y+1)*FDEC_STRIDE], mm0
- psllq mm3, 8
- psllq mm0, 8
-%assign Y (Y+2)
-%endrep
- movq [edx + Y *FDEC_STRIDE], mm3
- movq [edx + (Y+1)*FDEC_STRIDE], mm0
-
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8c_v_mmx( uint8_t *src )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8c_v_mmx
- mov edx, [esp + 4]
- movq mm0, [edx - FDEC_STRIDE]
- STORE8x8 mm0, mm0
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8c_dc_core_mmxext
- picgetgot ecx
-
- mov edx, [esp + 4]
-
- movq mm0, [edx - FDEC_STRIDE]
- pxor mm1, mm1
- pxor mm2, mm2
- punpckhbw mm1, mm0
- punpcklbw mm0, mm2
- psadbw mm1, mm2 ; s1
- psadbw mm0, mm2 ; s0
-
- paddw mm0, [esp + 8]
- pshufw mm2, [esp + 12], 0
- psrlw mm0, 3
- paddw mm1, [pw_2 GLOBAL]
- movq mm3, mm2
- pshufw mm1, mm1, 0
- pshufw mm0, mm0, 0 ; dc0 (w)
- paddw mm3, mm1
- psrlw mm3, 3 ; dc3 (w)
- psrlw mm2, 2 ; dc2 (w)
- psrlw mm1, 2 ; dc1 (w)
-
- packuswb mm0, mm1 ; dc0,dc1 (b)
- packuswb mm2, mm3 ; dc2,dc3 (b)
-
- STORE8x8 mm0, mm2
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8c_p_core_mmxext
- picgetgot ecx
-
- mov edx, [esp + 4]
- pshufw mm0, [esp + 8], 0
- pshufw mm2, [esp +12], 0
- pshufw mm4, [esp +16], 0
- movq mm1, mm2
- pmullw mm2, [pw_3210 GLOBAL]
- psllw mm1, 2
- paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
- paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
-
- mov eax, 8
-ALIGN 4
-.loop:
- movq mm5, mm0
- movq mm6, mm1
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [edx], mm5
-
- paddsw mm0, mm4
- paddsw mm1, mm4
- add edx, FDEC_STRIDE
- dec eax
- jg .loop
-
- nop
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_mmxext
- picgetgot ecx
-
- mov edx, [esp + 4]
- pshufw mm0, [esp + 8], 0
- pshufw mm2, [esp +12], 0
- pshufw mm4, [esp +16], 0
- movq mm5, mm2
- movq mm1, mm2
- pmullw mm5, [pw_3210 GLOBAL]
- psllw mm2, 3
- psllw mm1, 2
- movq mm3, mm2
- paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
- paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
- paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
- paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
-
- mov eax, 16
-ALIGN 4
-.loop:
- movq mm5, mm0
- movq mm6, mm1
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [edx], mm5
-
- movq mm5, mm2
- movq mm6, mm3
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [edx+8], mm5
-
- paddsw mm0, mm4
- paddsw mm1, mm4
- paddsw mm2, mm4
- paddsw mm3, mm4
- add edx, FDEC_STRIDE
- dec eax
- jg .loop
-
- nop
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_sse2
- picgetgot ecx
-
- mov edx, [esp + 4 ]
- movd xmm0, [esp + 8 ]
- movd xmm1, [esp + 12]
- movd xmm2, [esp + 16]
- pshuflw xmm0, xmm0, 0
- pshuflw xmm1, xmm1, 0
- pshuflw xmm2, xmm2, 0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- movdqa xmm3, xmm1
- pmullw xmm3, [pw_76543210 GLOBAL]
- psllw xmm1, 3
- paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
- paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
-
- mov eax, 16
-ALIGN 4
-.loop:
- movdqa xmm3, xmm0
- movdqa xmm4, xmm1
- psraw xmm3, 5
- psraw xmm4, 5
- packuswb xmm3, xmm4
- movdqa [edx], xmm3
-
- paddsw xmm0, xmm2
- paddsw xmm1, xmm2
- add edx, FDEC_STRIDE
- dec eax
- jg .loop
-
- nop
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_v_mmx( uint8_t *src )
-;-----------------------------------------------------------------------------
-cglobal predict_16x16_v_mmx
- mov edx, [esp + 4]
- movq mm0, [edx - FDEC_STRIDE]
- movq mm1, [edx + 8 - FDEC_STRIDE]
- STORE16x16 mm0, mm1
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_v_sse2( uint8_t *src )
-;-----------------------------------------------------------------------------
-cglobal predict_16x16_v_sse2
- mov edx, [esp + 4]
- movdqa xmm0, [edx - FDEC_STRIDE]
- STORE16x16_SSE2 xmm0
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
-;-----------------------------------------------------------------------------
-
-%macro PRED16x16_DC 2
- mov edx, [esp+4]
- pxor mm0, mm0
- pxor mm1, mm1
- psadbw mm0, [edx - FDEC_STRIDE]
- psadbw mm1, [edx - FDEC_STRIDE + 8]
- paddusw mm0, mm1
- paddusw mm0, %1
- psrlw mm0, %2 ; dc
- pshufw mm0, mm0, 0
- packuswb mm0, mm0 ; dc in bytes
- STORE16x16 mm0, mm0
-%endmacro
-
-cglobal predict_16x16_dc_core_mmxext
- PRED16x16_DC [esp+8], 5
- ret
-
-cglobal predict_16x16_dc_top_mmxext
- picgetgot ecx
- PRED16x16_DC [pw_8 GLOBAL], 4
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
-;-----------------------------------------------------------------------------
-
-%macro PRED16x16_DC_SSE2 2
- mov edx, [esp+4]
- pxor xmm0, xmm0
- psadbw xmm0, [edx - FDEC_STRIDE]
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- paddusw xmm0, %1
- psrlw xmm0, %2 ; dc
- pshuflw xmm0, xmm0, 0
- punpcklqdq xmm0, xmm0
- packuswb xmm0, xmm0 ; dc in bytes
- STORE16x16_SSE2 xmm0
-%endmacro
-
-cglobal predict_16x16_dc_core_sse2
- movd xmm2, [esp+8]
- PRED16x16_DC_SSE2 xmm2, 5
- ret
-
-cglobal predict_16x16_dc_top_sse2
- picgetgot ecx
- PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_sse2
- mov edx, [esp + 8]
- mov eax, [esp + 4]
- picgetgot ecx
- movdqu xmm3, [edx + 8]
- movdqu xmm1, [edx + 7]
- movdqa xmm2, xmm3
- psrldq xmm2, 1
- PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-%assign Y 7
-%rep 3
- movq [eax + Y * FDEC_STRIDE], xmm0
- movq [eax + (Y-1) * FDEC_STRIDE], xmm1
- psrldq xmm0, 2
- psrldq xmm1, 2
-%assign Y (Y-2)
-%endrep
- movq [eax + 1 * FDEC_STRIDE], xmm0
- movq [eax + 0 * FDEC_STRIDE], xmm1
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_sse2
- mov edx, [esp + 8]
- mov eax, [esp + 4]
- picgetgot ecx
- movdqa xmm3, [edx + 16]
- movdqu xmm2, [edx + 17]
- movdqa xmm1, xmm3
- pslldq xmm1, 1
- PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
-%assign Y 0
-%rep 8
- psrldq xmm0, 1
- movq [eax + Y * FDEC_STRIDE], xmm0
-%assign Y (Y+1)
-%endrep
- ret
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8_vl_sse2
- mov edx, [esp + 8]
- mov eax, [esp + 4]
- picgetgot ecx
- movdqa xmm4, [edx + 16]
- movdqa xmm2, xmm4
- movdqa xmm1, xmm4
- movdqa xmm3, xmm4
- psrldq xmm2, 1
- pslldq xmm1, 1
- pavgb xmm3, xmm2
- PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
-; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
-; xmm3: (t0 + t1 + 1) >> 1
-%assign Y 0
-%rep 3
- psrldq xmm0, 1
- movq [eax + Y * FDEC_STRIDE], xmm3
- movq [eax + (Y+1) * FDEC_STRIDE], xmm0
- psrldq xmm3, 1
-%assign Y (Y+2)
-%endrep
- psrldq xmm0, 1
- movq [eax + Y * FDEC_STRIDE], xmm3
- movq [eax + (Y+1) * FDEC_STRIDE], xmm0
- ret
+++ /dev/null
-;*****************************************************************************
-;* quant-a.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2005 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
-;*****************************************************************************
-
-BITS 32
-
-%include "i386inc.asm"
-
-SECTION_RODATA
-pd_1: times 2 dd 1
-
-SECTION .text
-
-%macro QUANT_AC_START 0
- mov eax, [esp+ 4] ; dct
- mov ecx, [esp+ 8] ; mf
- mov edx, [esp+12] ; bias
-%endmacro
-
-%macro MMX_QUANT_DC_START 0
- mov eax, [esp+ 4] ; dct
- movd mm6, [esp+ 8] ; mf
- movd mm7, [esp+12] ; bias
- pshufw mm6, mm6, 0
- pshufw mm7, mm7, 0
-%endmacro
-
-%macro SSE2_QUANT_DC_START 0
- mov eax, [esp+ 4] ; dct
- movd xmm6, [esp+ 8] ; mf
- movd xmm7, [esp+12] ; bias
- pshuflw xmm6, xmm6, 0
- pshuflw xmm7, xmm7, 0
- punpcklqdq xmm6, xmm6
- punpcklqdq xmm7, xmm7
-%endmacro
-
-%macro QUANT_ONE 5
-;;; %1 (m64) dct[y][x]
-;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
-;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
-
- mov%1 %2m0, %3 ; load dct coeffs
- pxor %2m1, %2m1
- pcmpgtw %2m1, %2m0 ; sign(coeff)
- pxor %2m0, %2m1
- psubw %2m0, %2m1 ; abs(coeff)
- paddusw %2m0, %5 ; round
- pmulhuw %2m0, %4 ; divide
- pxor %2m0, %2m1 ; restore sign
- psubw %2m0, %2m1
- mov%1 %3, %2m0 ; store
-%endmacro
-%macro MMX_QUANT_1x4 3
- QUANT_ONE q, m, %1, %2, %3
-%endmacro
-%macro SSE2_QUANT_1x8 3
- QUANT_ONE dqa, xm, %1, %2, %3
-%endmacro
-
-%macro SSSE3_QUANT_1x8 3
- movdqa xmm1, %1 ; load dct coeffs
- pabsw xmm0, xmm1
- paddusw xmm0, %3 ; round
- pmulhuw xmm0, %2 ; divide
- psignw xmm0, xmm1 ; restore sign
- movdqa %1, xmm0 ; store
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
-;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_mmxext
- MMX_QUANT_DC_START
- MMX_QUANT_1x4 [eax], mm6, mm7
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_mmxext
- MMX_QUANT_DC_START
-%assign x 0
-%rep 4
- MMX_QUANT_1x4 [eax+x], mm6, mm7
-%assign x (x+8)
-%endrep
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_mmx
- QUANT_AC_START
-%assign x 0
-%rep 4
- MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x]
-%assign x (x+8)
-%endrep
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_8x8_mmx( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
-;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_mmx
- QUANT_AC_START
-%assign x 0
-%rep 16
- MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x]
-%assign x (x+8)
-%endrep
- ret
-
-%macro QUANT_SSE 1
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias )
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_%1
- SSE2_QUANT_DC_START
-%assign x 0
-%rep 2
- QUANT_1x8 [eax+x], xmm6, xmm7
-%assign x (x+16)
-%endrep
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_%1
- QUANT_AC_START
-%assign x 0
-%rep 2
- QUANT_1x8 [eax+x], [ecx+x], [edx+x]
-%assign x (x+16)
-%endrep
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
-;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_%1
- QUANT_AC_START
-%assign x 0
-%rep 8
- QUANT_1x8 [eax+x], [ecx+x], [edx+x]
-%assign x (x+16)
-%endrep
- ret
-%endmacro
-
-%define QUANT_1x8 SSE2_QUANT_1x8
-QUANT_SSE sse2
-%ifdef HAVE_SSE3
-%define QUANT_1x8 SSSE3_QUANT_1x8
-QUANT_SSE ssse3
-%endif
-
-
-;=============================================================================
-; dequant
-;=============================================================================
-
-%macro DEQUANT16_L_1x4 3
-;;; %1 dct[y][x]
-;;; %2,%3 dequant_mf[i_mf][y][x]
-;;; mm5 i_qbits
-
- movq mm1, %2
- movq mm2, %3
- movq mm0, %1
- packssdw mm1, mm2
- pmullw mm0, mm1
- psllw mm0, mm5
- movq %1, mm0
-%endmacro
-
-%macro DEQUANT16_R_1x4 3
-;;; %1 dct[y][x]
-;;; %2,%3 dequant_mf[i_mf][y][x]
-;;; mm5 -i_qbits
-;;; mm6 f as words
-
- movq mm1, %2
- movq mm2, %3
- movq mm0, %1
- packssdw mm1, mm2
- pmullw mm0, mm1
- paddw mm0, mm6
- psraw mm0, mm5
- movq %1, mm0
-%endmacro
-
-%macro DEQUANT32_R_1x4 3
-;;; %1 dct[y][x]
-;;; %2,%3 dequant_mf[i_mf][y][x]
-;;; mm5 -i_qbits
-;;; mm6 f as dwords
-;;; mm7 0
-
- movq mm0, %1
- movq mm1, mm0
- punpcklwd mm0, mm0
- punpckhwd mm1, mm1
-
- movq mm2, mm0
- movq mm3, mm1
- pmulhw mm0, %2
- pmulhw mm1, %3
- pmullw mm2, %2
- pmullw mm3, %3
- pslld mm0, 16
- pslld mm1, 16
- paddd mm0, mm2
- paddd mm1, mm3
-
- paddd mm0, mm6
- paddd mm1, mm6
- psrad mm0, mm5
- psrad mm1, mm5
-
- packssdw mm0, mm1
- movq %1, mm0
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
-;-----------------------------------------------------------------------------
-%macro DEQUANT_WxH 3
-cglobal %1
- mov edx, [esp+12] ; i_qp
- imul eax, edx, 0x2b
- shr eax, 8 ; i_qbits = i_qp / 6
- lea ecx, [eax+eax*2]
- sub edx, ecx
- sub edx, ecx ; i_mf = i_qp % 6
- shl edx, %3+2
- add edx, [esp+8] ; dequant_mf[i_mf]
- mov ecx, [esp+4] ; dct
-
- sub eax, %3
- jl .rshift32 ; negative qbits => rightshift
-
-.lshift:
- movd mm5, eax
-
- mov eax, 8*(%2-1)
-.loopl16
-%rep 2
- DEQUANT16_L_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
- sub eax, byte 8
-%endrep
- jge .loopl16
-
- nop
- ret
-
-.rshift32:
- neg eax
- movd mm5, eax
- picgetgot eax
- movq mm6, [pd_1 GLOBAL]
- pxor mm7, mm7
- pslld mm6, mm5
- psrld mm6, 1
-
- mov eax, 8*(%2-1)
-.loopr32
-%rep 2
- DEQUANT32_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
- sub eax, byte 8
-%endrep
- jge .loopr32
-
- nop
- ret
-%endmacro
-
-DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4
-DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6
#include "clip1.h"
#ifdef HAVE_MMX
-#include "i386/mc.h"
+#include "x86/mc.h"
#endif
#ifdef ARCH_PPC
#include "ppc/mc.h"
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
- if( cpu&X264_CPU_MMXEXT )
- pf->mc_chroma = x264_mc_chroma_mmxext;
#endif
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
const int b_interlaced = h->sh.b_mbaff;
const int stride = frame->i_stride[0] << b_interlaced;
const int width = frame->i_width[0];
- int start = (mb_y*16 >> b_interlaced) - 8;
+ int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
- int offs = start*stride - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
+ int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
int x, y;
if( mb_y & b_interlaced )
#include "clip1.h"
#ifdef HAVE_MMX
-# include "i386/pixel.h"
+# include "x86/pixel.h"
#endif
#ifdef ARCH_PPC
# include "ppc/pixel.h"
INIT2( ssd, _sse2 );
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
-
-#ifdef ARCH_X86_64
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
+#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
}
INIT5( satd_x3, _ssse3 );
INIT5( satd_x4, _ssse3 );
INIT_ADS( _ssse3 );
-#ifdef ARCH_X86_64
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+ pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
+ pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
+ pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
+#ifdef ARCH_X86_64
+ pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
if( cpu&X264_CPU_CACHELINE_SPLIT )
{
#undef HAVE_MMX /* not finished now */
#endif
#ifdef HAVE_MMX
-# include "i386/predict.h"
+# include "x86/predict.h"
#endif
#ifdef ARCH_PPC
# include "ppc/predict.h"
#include "common.h"
#ifdef HAVE_MMX
-#include "i386/quant.h"
+#include "x86/quant.h"
#endif
#ifdef ARCH_PPC
# include "ppc/quant.h"
;*****************************************************************************
-;* cpu.asm: h264 encoder library
+;* cpu-32.asm: h264 encoder library
;*****************************************************************************
-;* Copyright (C) 2003 x264 project
-;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
-BITS 32
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "i386inc.asm"
-
-;=============================================================================
-; Code
-;=============================================================================
+%include "x86inc.asm"
SECTION .text
;-----------------------------------------------------------------------------
-; int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported
+; int x264_cpu_cpuid_test( void )
+; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid_test
pushfd
push ebp
push esi
push edi
-
pushfd
pop eax
mov ebx, eax
pushfd
pop eax
xor eax, ebx
-
pop edi
pop esi
pop ebp
ret
;-----------------------------------------------------------------------------
-; int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid
-
- push ebp
- mov ebp, esp
- push ebx
- push esi
- push edi
-
- mov eax, [ebp + 8]
+cglobal x264_cpu_cpuid, 0,6
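+    ; r0m-r4m are x86inc's macros for the five stack arguments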
+ mov eax, r0m
cpuid
-
- mov esi, [ebp + 12]
+ mov esi, r1m
mov [esi], eax
-
- mov esi, [ebp + 16]
+ mov esi, r2m
mov [esi], ebx
-
- mov esi, [ebp + 20]
+ mov esi, r3m
mov [esi], ecx
-
- mov esi, [ebp + 24]
+ mov esi, r4m
mov [esi], edx
-
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
+ RET
;-----------------------------------------------------------------------------
-; void __cdecl x264_emms( void )
+; void x264_emms( void )
;-----------------------------------------------------------------------------
cglobal x264_emms
emms
mov edx, [ebp+12]
mov [esp], edx
call ecx
- mov esp, ebp
- pop ebp
+ leave
ret
;*****************************************************************************
-;* cpu.asm: h264 encoder library
+;* cpu-64.asm: h264 encoder library
;*****************************************************************************
-;* Copyright (C) 2003 x264 project
-;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
-BITS 64
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "amd64inc.asm"
-
-;=============================================================================
-; Code
-;=============================================================================
+%include "x86inc.asm"
SECTION .text
;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid_test( void ) return 0 if unsupported
-;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid_test
- firstpush rbx
- pushreg rbx
- push rbp
- pushreg rbp
- mov rbp, rsp
- setframe rbp, 0
- endprolog
-
- pushfq
- pop rax
- mov ebx, eax
- xor eax, 0x200000
- push rax
-
- popfq
- pushfq
- pop rax
- xor eax, ebx
-
- lea rsp, [rbp]
- pop rbp
- pop rbx
- ret
- endfunc
-
-;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid
firstpush rbx
pushreg rbx
endprolog
-
+
mov r10, parm4q
mov r11, parm3q
mov r9, parm2q
%ifdef WIN64
mov r8, [rsp+40+8]
-%endif
-
+%endif
+
mov eax, parm1d
cpuid
endfunc
;-----------------------------------------------------------------------------
-; void x264_emms( void )
+; void x264_emms( void )
;-----------------------------------------------------------------------------
cglobal x264_emms
emms
--- /dev/null
+;*****************************************************************************
+;* dct-32.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003-2008 x264 project
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Min Chen <chenm001.163.com> (converted to nasm)
+;* Christian Heine <sennindemokrit@gmx.net> (dct8/idct8 functions)
+;* Loren Merritt <lorenm@u.washington.edu> (misc)
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+
+pw_32: times 8 dw 32
+
+SECTION .text
+
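+; in-place butterfly: %1 = a+b, %2 = b-a (doubling %2 first avoids a temp register)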
+%macro SUMSUB_BA 2
+ paddw %1, %2
+ paddw %2, %2
+ psubw %2, %1
+%endmacro
+
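+; two interleaved butterflies: %1 = a+b, %2 = b-a, %3 = c+d, %4 = d-c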
+%macro SUMSUB_BADC 4
+ paddw %1, %2
+ paddw %3, %4
+ paddw %2, %2
+ paddw %4, %4
+ psubw %2, %1
+ psubw %4, %3
+%endmacro
+
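+; one transpose step: %3 = interleave-lo(%3,%4), %5 = interleave-hi(old %3,%4)
+; %1 = mov size suffix, %2 = unpack granularity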
+%macro SBUTTERFLY 5
+ mov%1 %5, %3
+ punpckl%2 %3, %4
+ punpckh%2 %5, %4
+%endmacro
+
+; input ABCD output ADTC
+%macro TRANSPOSE4x4W 5
+ SBUTTERFLY q, wd, %1, %2, %5
+ SBUTTERFLY q, wd, %3, %4, %2
+ SBUTTERFLY q, dq, %1, %3, %4
+ SBUTTERFLY q, dq, %5, %2, %3
+%endmacro
+
+; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2)
+%macro LOAD_DIFF_8P 7
+ movq %1, %5
+ movq %2, %1
+ punpcklbw %1, %7
+ punpckhbw %2, %7
+ movq %3, %6
+ movq %4, %3
+ punpcklbw %3, %7
+ punpckhbw %4, %7
+ psubw %1, %3
+ psubw %2, %4
+%endmacro
+
+%macro LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4
+ movq %2, %3
+ movq %1, %4
+ SUMSUB_BA %1, %2
+%endmacro
+
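+; scale the residual in %1 by >>6, add it to the 8 pixels at %2 (zero-extended via %4=0),
+; clip to [0,255] and store back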
+%macro STORE_DIFF_8P 4
+ psraw %1, 6
+ movq %3, %2
+ punpcklbw %3, %4
+ paddsw %1, %3
+ packuswb %1, %1
+ movq %2, %1
+%endmacro
+
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_sub_8x8_mmx:
+ pxor mm7, mm7
+ %assign i 0
+ %rep 8
+ LOAD_DIFF_8P mm0, mm1, mm2, mm3, [r1], [r2], mm7
+ movq [r0+i], mm0
+ movq [r0+i+8], mm1
+ add r1, FENC_STRIDE
+ add r2, FDEC_STRIDE
+ %assign i i+16
+ %endrep
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_ydct8_mmx( int16_t dest[8][8] );
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_ydct8_mmx:
+ ;-------------------------------------------------------------------------
+ ; vertical dct ( compute 4 columns at a time -> 2 loops )
+ ;-------------------------------------------------------------------------
+ %assign i 0
+ %rep 2
+
+ LOADSUMSUB mm2, mm3, [r0+i+0*16], [r0+i+7*16] ; mm2 = s07, mm3 = d07
+ LOADSUMSUB mm1, mm5, [r0+i+1*16], [r0+i+6*16] ; mm1 = s16, mm5 = d16
+ LOADSUMSUB mm0, mm6, [r0+i+2*16], [r0+i+5*16] ; mm0 = s25, mm6 = d25
+ LOADSUMSUB mm4, mm7, [r0+i+3*16], [r0+i+4*16] ; mm4 = s34, mm7 = d34
+
+ SUMSUB_BA mm4, mm2 ; mm4 = a0, mm2 = a2
+ SUMSUB_BA mm0, mm1 ; mm0 = a1, mm1 = a3
+ SUMSUB_BA mm0, mm4 ; mm0 = dst0, mm1 = dst4
+
+ movq [r0+i+0*16], mm0
+ movq [r0+i+4*16], mm4
+
+ movq mm0, mm1 ; a3
+ psraw mm0, 1 ; a3>>1
+ paddw mm0, mm2 ; a2 + (a3>>1)
+ psraw mm2, 1 ; a2>>1
+ psubw mm2, mm1 ; (a2>>1) - a3
+
+ movq [r0+i+2*16], mm0
+ movq [r0+i+6*16], mm2
+
+ movq mm0, mm6
+ psraw mm0, 1
+ paddw mm0, mm6 ; d25+(d25>>1)
+ movq mm1, mm3
+ psubw mm1, mm7 ; a5 = d07-d34-(d25+(d25>>1))
+ psubw mm1, mm0
+
+ movq mm0, mm5
+ psraw mm0, 1
+ paddw mm0, mm5 ; d16+(d16>>1)
+ movq mm2, mm3
+ paddw mm2, mm7 ; a6 = d07+d34-(d16+(d16>>1))
+ psubw mm2, mm0
+
+ movq mm0, mm3
+ psraw mm0, 1
+ paddw mm0, mm3 ; d07+(d07>>1)
+ paddw mm0, mm5
+ paddw mm0, mm6 ; a4 = d16+d25+(d07+(d07>>1))
+
+ movq mm3, mm7
+ psraw mm3, 1
+ paddw mm3, mm7 ; d34+(d34>>1)
+ paddw mm3, mm5
+ psubw mm3, mm6 ; a7 = d16-d25+(d34+(d34>>1))
+
+ movq mm7, mm3
+ psraw mm7, 2
+ paddw mm7, mm0 ; a4 + (a7>>2)
+
+ movq mm6, mm2
+ psraw mm6, 2
+ paddw mm6, mm1 ; a5 + (a6>>2)
+
+ psraw mm0, 2
+ psraw mm1, 2
+ psubw mm0, mm3 ; (a4>>2) - a7
+ psubw mm2, mm1 ; a6 - (a5>>2)
+
+ movq [r0+i+1*16], mm7
+ movq [r0+i+3*16], mm6
+ movq [r0+i+5*16], mm2
+ movq [r0+i+7*16], mm0
+
+ %assign i i+8
+ %endrep
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_yidct8_mmx( int16_t dest[8][8] );
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_yidct8_mmx:
+ ;-------------------------------------------------------------------------
+ ; vertical idct ( compute 4 columns at a time -> 2 loops )
+ ;-------------------------------------------------------------------------
+ %assign i 0
+ %rep 2
+
+ movq mm1, [r0+i+1*16] ; mm1 = d1
+ movq mm3, [r0+i+3*16] ; mm3 = d3
+ movq mm5, [r0+i+5*16] ; mm5 = d5
+ movq mm7, [r0+i+7*16] ; mm7 = d7
+
+ movq mm4, mm7
+ psraw mm4, 1
+ movq mm0, mm5
+ psubw mm0, mm7
+ psubw mm0, mm4
+ psubw mm0, mm3 ; mm0 = e1
+
+ movq mm6, mm3
+ psraw mm6, 1
+ movq mm2, mm7
+ psubw mm2, mm6
+ psubw mm2, mm3
+ paddw mm2, mm1 ; mm2 = e3
+
+ movq mm4, mm5
+ psraw mm4, 1
+ paddw mm4, mm5
+ paddw mm4, mm7
+ psubw mm4, mm1 ; mm4 = e5
+
+ movq mm6, mm1
+ psraw mm6, 1
+ paddw mm6, mm1
+ paddw mm6, mm5
+ paddw mm6, mm3 ; mm6 = e7
+
+ movq mm1, mm0
+ movq mm3, mm4
+ movq mm5, mm2
+ movq mm7, mm6
+ psraw mm6, 2
+ psraw mm3, 2
+ psraw mm5, 2
+ psraw mm0, 2
+ paddw mm1, mm6 ; mm1 = f1
+ paddw mm3, mm2 ; mm3 = f3
+ psubw mm5, mm4 ; mm5 = f5
+ psubw mm7, mm0 ; mm7 = f7
+
+ movq mm2, [r0+i+2*16] ; mm2 = d2
+ movq mm6, [r0+i+6*16] ; mm6 = d6
+ movq mm4, mm2
+ movq mm0, mm6
+ psraw mm4, 1
+ psraw mm6, 1
+ psubw mm4, mm0 ; mm4 = a4
+ paddw mm6, mm2 ; mm6 = a6
+
+ movq mm2, [r0+i+0*16] ; mm2 = d0
+ movq mm0, [r0+i+4*16] ; mm0 = d4
+ SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2
+
+ SUMSUB_BADC mm6, mm0, mm4, mm2 ; mm6 = f0, mm0 = f6
+ ; mm4 = f2, mm2 = f4
+
+ SUMSUB_BADC mm7, mm6, mm5, mm4 ; mm7 = g0, mm6 = g7
+ ; mm5 = g1, mm4 = g6
+ SUMSUB_BADC mm3, mm2, mm1, mm0 ; mm3 = g2, mm2 = g5
+ ; mm1 = g3, mm0 = g4
+
+ movq [r0+i+0*16], mm7
+ movq [r0+i+1*16], mm5
+ movq [r0+i+2*16], mm3
+ movq [r0+i+3*16], mm1
+ movq [r0+i+4*16], mm0
+ movq [r0+i+5*16], mm2
+ movq [r0+i+6*16], mm4
+ movq [r0+i+7*16], mm6
+
+ %assign i i+8
+ %endrep
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_add_8x8_mmx:
+ pxor mm7, mm7
+ %assign i 0
+ %rep 8
+ movq mm0, [r0]
+ movq mm2, [r1+i]
+ movq mm3, [r1+i+8]
+ movq mm1, mm0
+ psraw mm2, 6
+ psraw mm3, 6
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+ paddw mm0, mm2
+ paddw mm1, mm3
+ packuswb mm0, mm1
+ movq [r0], mm0
+ add r0, FDEC_STRIDE
+ %assign i i+16
+ %endrep
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_transpose_8x8_mmx( int16_t src[8][8] );
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_transpose_8x8_mmx:
+ movq mm0, [r0 ]
+ movq mm1, [r0+ 16]
+ movq mm2, [r0+ 32]
+ movq mm3, [r0+ 48]
+ TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
+ movq [r0 ], mm0
+ movq [r0+ 16], mm3
+ movq [r0+ 32], mm4
+ movq [r0+ 48], mm2
+
+ movq mm0, [r0+ 72]
+ movq mm1, [r0+ 88]
+ movq mm2, [r0+104]
+ movq mm3, [r0+120]
+ TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
+ movq [r0+ 72], mm0
+ movq [r0+ 88], mm3
+ movq [r0+104], mm4
+ movq [r0+120], mm2
+
+ movq mm0, [r0+ 8]
+ movq mm1, [r0+ 24]
+ movq mm2, [r0+ 40]
+ movq mm3, [r0+ 56]
+ TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
+ movq mm1, [r0+ 64]
+ movq mm5, [r0+ 80]
+ movq mm6, [r0+ 96]
+ movq mm7, [r0+112]
+
+ movq [r0+ 64], mm0
+ movq [r0+ 80], mm3
+ movq [r0+ 96], mm4
+ movq [r0+112], mm2
+ TRANSPOSE4x4W mm1, mm5, mm6, mm7, mm4
+ movq [r0+ 8], mm1
+ movq [r0+ 24], mm7
+ movq [r0+ 40], mm4
+ movq [r0+ 56], mm6
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
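+; two vertical 1-D dct passes with a transpose in between; the jmp makes the second ydct8 pass a tail call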
+cglobal x264_sub8x8_dct8_mmx, 3,3
+ call x264_pixel_sub_8x8_mmx
+ call x264_ydct8_mmx
+ call x264_transpose_8x8_mmx
+ jmp x264_ydct8_mmx
+
+;-----------------------------------------------------------------------------
+; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
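+; the +32 added to the DC coefficient below supplies the rounding for the final >>6
+; (same trick as the pw_32 add in the sse2 version)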
+cglobal x264_add8x8_idct8_mmx, 0,1
+ mov r0, r1m
+ add word [r0], 32
+ call x264_yidct8_mmx
+ call x264_transpose_8x8_mmx
+ call x264_yidct8_mmx
+ mov r1, r0
+ mov r0, r0m
+ jmp x264_pixel_add_8x8_mmx
+
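+; 1-D 8-point idct; rows 0 and 4 are loaded from [eax+0x00]/[eax+0x40] inside the macro
+; because only 8 xmm registers are available in 32-bit mode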
+%macro IDCT8_1D 8
+ movdqa %1, %3
+ movdqa %5, %7
+ psraw %3, 1
+ psraw %7, 1
+ psubw %3, %5
+ paddw %7, %1
+ movdqa %5, %2
+ psraw %5, 1
+ paddw %5, %2
+ paddw %5, %4
+ paddw %5, %6
+ movdqa %1, %6
+ psraw %1, 1
+ paddw %1, %6
+ paddw %1, %8
+ psubw %1, %2
+ psubw %2, %4
+ psubw %6, %4
+ paddw %2, %8
+ psubw %6, %8
+ psraw %4, 1
+ psraw %8, 1
+ psubw %2, %4
+ psubw %6, %8
+ movdqa %4, %5
+ movdqa %8, %1
+ psraw %4, 2
+ psraw %8, 2
+ paddw %4, %6
+ paddw %8, %2
+ psraw %6, 2
+ psraw %2, 2
+ psubw %5, %6
+ psubw %2, %1
+ movdqa %1, [eax+0x00]
+ movdqa %6, [eax+0x40]
+ SUMSUB_BA %6, %1
+ SUMSUB_BA %7, %6
+ SUMSUB_BA %3, %1
+ SUMSUB_BA %5, %7
+ SUMSUB_BA %2, %3
+ SUMSUB_BA %8, %1
+ SUMSUB_BA %4, %6
+%endmacro
+
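+; 8x8 word transpose using [%9] and [%9+16] as scratch, since there is no spare
+; xmm register to hold the ninth temporary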
+%macro TRANSPOSE8 9
+ movdqa [%9], %8
+ SBUTTERFLY dqa, wd, %1, %2, %8
+ movdqa [%9+16], %8
+ movdqa %8, [%9]
+ SBUTTERFLY dqa, wd, %3, %4, %2
+ SBUTTERFLY dqa, wd, %5, %6, %4
+ SBUTTERFLY dqa, wd, %7, %8, %6
+ SBUTTERFLY dqa, dq, %1, %3, %8
+ movdqa [%9], %8
+ movdqa %8, [16+%9]
+ SBUTTERFLY dqa, dq, %8, %2, %3
+ SBUTTERFLY dqa, dq, %5, %7, %2
+ SBUTTERFLY dqa, dq, %4, %6, %7
+ SBUTTERFLY dqa, qdq, %1, %5, %6
+ SBUTTERFLY dqa, qdq, %8, %4, %5
+ movdqa [%9+16], %8
+ movdqa %8, [%9]
+ SBUTTERFLY dqa, qdq, %8, %2, %4
+ SBUTTERFLY dqa, qdq, %3, %7, %2
+ movdqa %7, [%9+16]
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+cglobal x264_add8x8_idct8_sse2
+ mov ecx, [esp+4]
+ mov eax, [esp+8]
+ movdqa xmm1, [eax+0x10]
+ movdqa xmm2, [eax+0x20]
+ movdqa xmm3, [eax+0x30]
+ movdqa xmm5, [eax+0x50]
+ movdqa xmm6, [eax+0x60]
+ movdqa xmm7, [eax+0x70]
+ IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ TRANSPOSE8 xmm4, xmm1, xmm7, xmm3, xmm5, xmm0, xmm2, xmm6, eax
+ picgetgot edx
+ paddw xmm4, [pw_32 GLOBAL]
+ movdqa [eax+0x00], xmm4
+ movdqa [eax+0x40], xmm2
+ IDCT8_1D xmm4, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7, xmm1
+ movdqa [eax+0x60], xmm6
+ movdqa [eax+0x70], xmm7
+ pxor xmm7, xmm7
+ STORE_DIFF_8P xmm2, [ecx+FDEC_STRIDE*0], xmm6, xmm7
+ STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*1], xmm6, xmm7
+ STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*2], xmm6, xmm7
+ STORE_DIFF_8P xmm3, [ecx+FDEC_STRIDE*3], xmm6, xmm7
+ STORE_DIFF_8P xmm5, [ecx+FDEC_STRIDE*4], xmm6, xmm7
+ STORE_DIFF_8P xmm4, [ecx+FDEC_STRIDE*5], xmm6, xmm7
+ movdqa xmm0, [eax+0x60]
+ movdqa xmm1, [eax+0x70]
+ STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7
+ STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
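+; processes three sub-blocks through pushed copies of the arguments, then pops them
+; and tail-calls %2 on the caller's original arguments for the remaining sub-block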
+%macro SUB_NxN_DCT 4
+cglobal %1
+ mov edx, [esp+12]
+ mov ecx, [esp+ 8]
+ mov eax, [esp+ 4]
+ add edx, %4
+ add ecx, %4
+ add eax, %3
+ push edx
+ push ecx
+ push eax
+ call %2
+ add dword [esp+0], %3
+ add dword [esp+4], %4*FENC_STRIDE-%4
+ add dword [esp+8], %4*FDEC_STRIDE-%4
+ call %2
+ add dword [esp+0], %3
+ add dword [esp+4], %4
+ add dword [esp+8], %4
+ call %2
+ add esp, 12
+ jmp %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+;-----------------------------------------------------------------------------
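+; same pushed-args/tail-call pattern as SUB_NxN_DCT, with two arguments instead of three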
+%macro ADD_NxN_IDCT 4
+cglobal %1
+ mov ecx, [esp+8]
+ mov eax, [esp+4]
+ add ecx, %3
+ add eax, %4
+ push ecx
+ push eax
+ call %2
+ add dword [esp+0], %4*FDEC_STRIDE-%4
+ add dword [esp+4], %3
+ call %2
+ add dword [esp+0], %4
+ add dword [esp+4], %3
+ call %2
+ add esp, 8
+ jmp %2
+%endmacro
+
+SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx, 128, 8
+ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8
+
+ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8
+
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
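+; punpck?wd leaves each coeff in the high word of a dword (the low word is don't-care),
+; so psrad 16 sign-extends int16 -> int32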
+cglobal x264_zigzag_scan_4x4_field_mmx
+ mov edx, [esp+8]
+ mov ecx, [esp+4]
+ punpcklwd mm0, [edx]
+ punpckhwd mm1, [edx]
+ punpcklwd mm2, [edx+8]
+ punpckhwd mm3, [edx+8]
+ punpcklwd mm4, [edx+16]
+ punpckhwd mm5, [edx+16]
+ punpcklwd mm6, [edx+24]
+ punpckhwd mm7, [edx+24]
+ psrad mm0, 16
+ psrad mm1, 16
+ psrad mm2, 16
+ psrad mm3, 16
+ psrad mm4, 16
+ psrad mm5, 16
+ psrad mm6, 16
+ psrad mm7, 16
+ movq [ecx ], mm0
+ movq [ecx+16], mm2
+ movq [ecx+24], mm3
+ movq [ecx+32], mm4
+ movq [ecx+40], mm5
+ movq [ecx+48], mm6
+ movq [ecx+56], mm7
+ movq [ecx+12], mm1
+ movd [ecx+ 8], mm2
+ ret
--- /dev/null
+;*****************************************************************************
+;* dct-64.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003-2008 x264 project
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Min Chen <chenm001.163.com> (converted to nasm)
+;* Loren Merritt <lorenm@u.washington.edu> (dct8)
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+pw_32: times 8 dw 32
+
+SECTION .text
+
+%macro LOAD_DIFF_8P 5
+ movq %1, %4
+ punpcklbw %1, %3
+ movq %2, %5
+ punpcklbw %2, %3
+ psubw %1, %2
+%endmacro
+
+%macro SUMSUB_BA 2
+ paddw %1, %2
+ paddw %2, %2
+ psubw %2, %1
+%endmacro
+
+%macro SBUTTERFLY 5
+ mov%1 %5, %3
+ punpckl%2 %3, %4
+ punpckh%2 %5, %4
+%endmacro
+
+;-----------------------------------------------------------------------------
+; input ABCDEFGH output AFHDTECB
+;-----------------------------------------------------------------------------
+%macro TRANSPOSE8x8W 9
+ SBUTTERFLY dqa, wd, %1, %2, %9
+ SBUTTERFLY dqa, wd, %3, %4, %2
+ SBUTTERFLY dqa, wd, %5, %6, %4
+ SBUTTERFLY dqa, wd, %7, %8, %6
+ SBUTTERFLY dqa, dq, %1, %3, %8
+ SBUTTERFLY dqa, dq, %9, %2, %3
+ SBUTTERFLY dqa, dq, %5, %7, %2
+ SBUTTERFLY dqa, dq, %4, %6, %7
+ SBUTTERFLY dqa, qdq, %1, %5, %6
+ SBUTTERFLY dqa, qdq, %9, %4, %5
+ SBUTTERFLY dqa, qdq, %8, %2, %4
+ SBUTTERFLY dqa, qdq, %3, %7, %2
+%endmacro
+
+%macro STORE_DIFF_8P 4
+ psraw %1, 6
+ movq %2, %4
+ punpcklbw %2, %3
+ paddsw %1, %2
+ packuswb %1, %1
+ movq %4, %1
+%endmacro
+
+
+; in: ABCDEFGH
+; out: FBCGEDHI
+%macro DCT8_1D 10
+ SUMSUB_BA %8, %1 ; %8=s07, %1=d07
+ SUMSUB_BA %7, %2 ; %7=s16, %2=d16
+ SUMSUB_BA %6, %3 ; %6=s25, %3=d25
+ SUMSUB_BA %5, %4 ; %5=s34, %4=d34
+
+ SUMSUB_BA %5, %8 ; %5=a0, %8=a2
+ SUMSUB_BA %6, %7 ; %6=a1, %7=a3
+
+ movdqa %9, %1
+ psraw %9, 1
+ paddw %9, %1
+ paddw %9, %2
+ paddw %9, %3 ; %9=a4
+
+ movdqa %10, %4
+ psraw %10, 1
+ paddw %10, %4
+ paddw %10, %2
+ psubw %10, %3 ; %10=a7
+
+ SUMSUB_BA %4, %1
+ psubw %1, %3
+ psubw %4, %2
+ psraw %3, 1
+ psraw %2, 1
+ psubw %1, %3 ; %1=a5
+ psubw %4, %2 ; %4=a6
+
+ SUMSUB_BA %6, %5 ; %6=b0, %5=b4
+
+ movdqa %2, %10
+ psraw %2, 2
+ paddw %2, %9 ; %2=b1
+ psraw %9, 2
+ psubw %9, %10 ; %9=b7
+
+ movdqa %3, %7
+ psraw %3, 1
+ paddw %3, %8 ; %3=b2
+ psraw %8, 1
+ psubw %8, %7 ; %8=b6
+
+ movdqa %7, %4
+ psraw %7, 2
+ paddw %7, %1 ; %7=b3
+ psraw %1, 2
+ psubw %4, %1 ; %4=b5
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
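+; x86-64 only: with xmm8/xmm9 available, both 1-D passes and the transpose stay entirely in registers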
+cglobal x264_sub8x8_dct8_sse2
+ pxor xmm9, xmm9
+ LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
+ LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
+ LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
+ LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
+ LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE]
+ LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE]
+ LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE]
+ LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE]
+
+ DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
+ TRANSPOSE8x8W xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
+ DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9
+
+ movdqa [parm1q+0x00], xmm4
+ movdqa [parm1q+0x10], xmm3
+ movdqa [parm1q+0x20], xmm8
+ movdqa [parm1q+0x30], xmm2
+ movdqa [parm1q+0x40], xmm0
+ movdqa [parm1q+0x50], xmm6
+ movdqa [parm1q+0x60], xmm1
+ movdqa [parm1q+0x70], xmm7
+ ret
+
+
+; in: ABCDEFGH
+; out: IBHDEACG
+%macro IDCT8_1D 10
+ SUMSUB_BA %5, %1 ; %5=a0, %1=a2
+ movdqa %10, %3
+ psraw %3, 1
+ psubw %3, %7 ; %3=a4
+ psraw %7, 1
+ paddw %7, %10 ; %7=a6
+
+ movdqa %9, %2
+ psraw %9, 1
+ paddw %9, %2
+ paddw %9, %4
+ paddw %9, %6 ; %9=a7
+
+ movdqa %10, %6
+ psraw %10, 1
+ paddw %10, %6
+ paddw %10, %8
+ psubw %10, %2 ; %10=a5
+
+ psubw %2, %4
+ psubw %6, %4
+ paddw %2, %8
+ psubw %6, %8
+ psraw %4, 1
+ psraw %8, 1
+ psubw %2, %4 ; %2=a3
+ psubw %6, %8 ; %6=a1
+
+ SUMSUB_BA %7, %5 ; %7=b0, %5=b6
+ SUMSUB_BA %3, %1 ; %3=b2, %1=b4
+
+ movdqa %4, %9
+ psraw %4, 2
+ paddw %4, %6 ; %4=b1
+ psraw %6, 2
+ psubw %9, %6 ; %9=b7
+
+ movdqa %8, %10
+ psraw %8, 2
+ paddw %8, %2 ; %8=b3
+ psraw %2, 2
+ psubw %2, %10 ; %2=b5
+
+ SUMSUB_BA %9, %7 ; %9=c0, %7=c7
+ SUMSUB_BA %2, %3 ; %2=c1, %3=c6
+ SUMSUB_BA %8, %1 ; %8=c2, %1=c5
+ SUMSUB_BA %4, %5 ; %4=c3, %5=c4
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+cglobal x264_add8x8_idct8_sse2
+ movdqa xmm0, [parm2q+0x00]
+ movdqa xmm1, [parm2q+0x10]
+ movdqa xmm2, [parm2q+0x20]
+ movdqa xmm3, [parm2q+0x30]
+ movdqa xmm4, [parm2q+0x40]
+ movdqa xmm5, [parm2q+0x50]
+ movdqa xmm6, [parm2q+0x60]
+ movdqa xmm7, [parm2q+0x70]
+
+ IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
+ TRANSPOSE8x8W xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
+ paddw xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end
+ IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2
+
+ pxor xmm15, xmm15
+ STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE]
+ STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE]
+ STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE]
+ STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE]
+ STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE]
+ STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE]
+ STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE]
+ STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
+ ret
+
+
--- /dev/null
+;*****************************************************************************
+;* dct-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003-2008 x264 project
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;* Min Chen <chenm001.163.com>
+;* Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+pw_32: times 8 dw 32
+
+SECTION .text
+
+%macro LOAD_DIFF_4P 5
+ movd %1, %4
+ punpcklbw %1, %3
+ movd %2, %5
+ punpcklbw %2, %3
+ psubw %1, %2
+%endmacro
+
+%macro SUMSUB_BA 2
+ paddw %1, %2
+ paddw %2, %2
+ psubw %2, %1
+%endmacro
+
+%macro SUMSUB_BADC 4
+ paddw %1, %2
+ paddw %3, %4
+ paddw %2, %2
+ paddw %4, %4
+ psubw %2, %1
+ psubw %4, %3
+%endmacro
+
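+; %1 = 2*a + b, %3 = a - 2*b (the +/-2 weightings of the 4x4 dct)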
+%macro SUMSUB2_AB 3
+ movq %3, %1
+ paddw %1, %1
+ paddw %1, %2
+ psubw %3, %2
+ psubw %3, %2
+%endmacro
+
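+; %1 = a + (b>>1), %4 = (a>>1) - b; %3 is clobbered as a temp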
+%macro SUMSUBD2_AB 4
+ movq %4, %1
+ movq %3, %2
+ psraw %2, 1
+ psraw %4, 1
+ paddw %1, %2
+ psubw %4, %3
+%endmacro
+
+%macro SBUTTERFLY 5
+ mov%1 %5, %3
+ punpckl%2 %3, %4
+ punpckh%2 %5, %4
+%endmacro
+
+;-----------------------------------------------------------------------------
+; input ABCD output ADTC
+;-----------------------------------------------------------------------------
+%macro TRANSPOSE4x4W 5
+ SBUTTERFLY q, wd, %1, %2, %5
+ SBUTTERFLY q, wd, %3, %4, %2
+ SBUTTERFLY q, dq, %1, %3, %4
+ SBUTTERFLY q, dq, %5, %2, %3
+%endmacro
+
+%macro STORE_DIFF_4P 5
+ paddw %1, %3
+ psraw %1, 6
+ movd %2, %5
+ punpcklbw %2, %4
+ paddsw %1, %2
+ packuswb %1, %1
+ movd %5, %1
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_dct4x4dc_mmx( int16_t d[4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_dct4x4dc_mmx, 1,1,1
+ movq mm0, [r0+ 0]
+ movq mm1, [r0+ 8]
+ movq mm2, [r0+16]
+ movq mm3, [r0+24]
+
+ SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
+ SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
+
+ TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
+
+ SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
+ SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
+
+ movq mm6, [pw_1 GLOBAL]
+ paddw mm0, mm6
+ paddw mm2, mm6
+ psraw mm0, 1
+ movq [r0+ 0], mm0
+ psraw mm2, 1
+ movq [r0+ 8], mm2
+ paddw mm3, mm6
+ paddw mm4, mm6
+ psraw mm3, 1
+ movq [r0+16], mm3
+ psraw mm4, 1
+ movq [r0+24], mm4
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_idct4x4dc_mmx( int16_t d[4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_idct4x4dc_mmx, 1,1
+ movq mm0, [r0+ 0]
+ movq mm1, [r0+ 8]
+ movq mm2, [r0+16]
+ movq mm3, [r0+24]
+
+ SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
+ SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
+
+ TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
+
+ SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
+ SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
+
+ movq [r0+ 0], mm0
+ movq [r0+ 8], mm2
+ movq [r0+16], mm3
+ movq [r0+24], mm4
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+cglobal x264_sub4x4_dct_mmx, 3,3
+.skip_prologue:
+ pxor mm7, mm7
+
+ ; Load 4 lines
+ LOAD_DIFF_4P mm0, mm6, mm7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF_4P mm1, mm6, mm7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF_4P mm2, mm6, mm7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF_4P mm3, mm6, mm7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+
+ SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
+
+ SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12
+ SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12
+
+ ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
+ TRANSPOSE4x4W mm2, mm0, mm3, mm4, mm1
+
+ SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12
+
+ SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
+ SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
+
+ movq [r0+ 0], mm1
+ movq [r0+ 8], mm2
+ movq [r0+16], mm3
+ movq [r0+24], mm0
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_add4x4_idct_mmx, 2,2,1
+.skip_prologue:
+ ; Load dct coeffs
+ movq mm0, [r1+ 0] ; dct
+ movq mm1, [r1+ 8]
+ movq mm2, [r1+16]
+ movq mm3, [r1+24]
+
+ SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
+    SUMSUBD2_AB  mm1, mm3, mm5, mm4    ; mm1=s13  mm4=d13 ( i.e. d1 + (d3>>1) and (d1>>1) - d3 )
+
+ SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13
+
+ ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0
+ TRANSPOSE4x4W mm1, mm4, mm0, mm2, mm3
+
+ SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02
+    SUMSUBD2_AB  mm2, mm0, mm5, mm4    ; mm2=s13  mm4=d13 ( i.e. d1 + (d3>>1) and (d1>>1) - d3 )
+
+ SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
+
+ pxor mm7, mm7
+ movq mm6, [pw_32 GLOBAL]
+
+ STORE_DIFF_4P mm2, mm0, mm6, mm7, [r0+0*FDEC_STRIDE]
+ STORE_DIFF_4P mm4, mm0, mm6, mm7, [r0+1*FDEC_STRIDE]
+ STORE_DIFF_4P mm1, mm0, mm6, mm7, [r0+2*FDEC_STRIDE]
+ STORE_DIFF_4P mm3, mm0, mm6, mm7, [r0+3*FDEC_STRIDE]
+
+ RET
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
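+; the add amounts compensate for the pointer advances left behind by the callee,
+; stepping through all four sub-blocks; the .skip_prologue entries let the nested
+; calls reuse the already-loaded argument registers, and the last call is a jmp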
+%macro SUB_NxN_DCT 6
+cglobal %1, 3,3
+.skip_prologue:
+ call %2
+ add r0, %3
+ add r1, %4-%5*FENC_STRIDE
+ add r2, %4-%5*FDEC_STRIDE
+ call %2
+ add r0, %3
+ add r1, %4*FENC_STRIDE-%6
+ add r2, %4*FDEC_STRIDE-%6
+ call %2
+ add r0, %3
+ add r1, %4-%5*FENC_STRIDE
+ add r2, %4-%5*FDEC_STRIDE
+ jmp %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+;-----------------------------------------------------------------------------
+%macro ADD_NxN_IDCT 6
+cglobal %1, 2,2,1
+.skip_prologue:
+ call %2
+ add r0, %4-%5*FDEC_STRIDE
+ add r1, %3
+ call %2
+ add r0, %4*FDEC_STRIDE-%6
+ add r1, %3
+ call %2
+ add r0, %4-%5*FDEC_STRIDE
+ add r1, %3
+ jmp %2
+%endmacro
+
+SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 4
+ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 4
+
+SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 4, 4, 12
+ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 4, 4, 12
+
+%ifdef ARCH_X86_64
+cextern x264_sub8x8_dct8_sse2
+cextern x264_add8x8_idct8_sse2
+SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8
+ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
+%endif
+
+
+;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_zigzag_scan_4x4_field_sse2, 2,2
+ punpcklwd xmm0, [r1]
+ punpckhwd xmm1, [r1]
+ punpcklwd xmm2, [r1+16]
+ punpckhwd xmm3, [r1+16]
+ psrad xmm0, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
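+ ; xmm0..3 = dct[0..3], dct[4..7], dct[8..11], dct[12..15], sign-extended
+ ; to 32 bits. The stores below emit them in field-scan order, which
+ ; differs from raster order only at positions 2..4: the overlapping
+ ; movq/movd stores produce level[] = { c0,c1,c4,c2,c3,c5,...,c15 }.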
+ movq [r0 ], xmm0
+ movdqa [r0+16], xmm1
+ movdqa [r0+32], xmm2
+ movhlps xmm0, xmm0
+ movdqa [r0+48], xmm3
+ movq [r0+12], xmm0
+ movd [r0+ 8], xmm1
+ RET
+
* dct.h: h264 encoder library
*****************************************************************************
* Copyright (C) 2003 Laurent Aimar
- * $Id: dct.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
--- /dev/null
+;*****************************************************************************
+;* deblock-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005-2008 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+pb_01: times 16 db 0x01
+pb_03: times 16 db 0x03
+pb_a1: times 16 db 0xa1
+
+SECTION .text
+
+%macro INIT_MMX 0
+ %undef movq
+ %define m0 mm0
+ %define m1 mm1
+ %define m2 mm2
+ %define m3 mm3
+ %define m4 mm4
+ %define m5 mm5
+ %define m6 mm6
+ %define m7 mm7
+ %undef m8
+ %undef m9
+%endmacro
+
+%macro INIT_XMM 0
+ %define movq movdqa
+ %define m0 xmm0
+ %define m1 xmm1
+ %define m2 xmm2
+ %define m3 xmm3
+ %define m4 xmm4
+ %define m5 xmm5
+ %define m6 xmm6
+ %define m7 xmm7
+ %define m8 xmm8
+ %define m9 xmm9
+%endmacro
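+
+; INIT_MMX/INIT_XMM let one macro body serve both instruction sets: the
+; filter code is written against the generic names m0..m9 (and movq, which
+; becomes movdqa in the XMM case), remapped before each instantiation.
+; m8/m9 exist only as XMM registers on x86-64, so MMX paths spill instead.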
+
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+ [base], [base+stride], [base+stride*2], [base3], \
+ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+
+; in: 8 rows of 4 bytes in %1..%8
+; out: 4 rows of 8 bytes in m0..m3
+%macro TRANSPOSE4x8_LOAD 8
+ movd m0, %1
+ movd m2, %2
+ movd m1, %3
+ movd m3, %4
+ punpcklbw m0, m2
+ punpcklbw m1, m3
+ movq m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+
+ movd m4, %5
+ movd m6, %6
+ movd m5, %7
+ movd m7, %8
+ punpcklbw m4, m6
+ punpcklbw m5, m7
+ movq m6, m4
+ punpcklwd m4, m5
+ punpckhwd m6, m5
+
+ movq m1, m0
+ movq m3, m2
+ punpckldq m0, m4
+ punpckhdq m1, m4
+ punpckldq m2, m6
+ punpckhdq m3, m6
+%endmacro
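+
+; This is the standard interleave transpose: punpcklbw pairs bytes from two
+; rows, punpcklwd/punpckhwd pair those 16-bit pairs, and punpckldq/punpckhdq
+; assemble the final rows; log2(n) merge stages instead of n^2 element moves.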
+
+; in: 4 rows of 8 bytes in m0..m3
+; out: 8 rows of 4 bytes in %1..%8
+%macro TRANSPOSE8x4_STORE 8
+ movq m4, m0
+ movq m5, m1
+ movq m6, m2
+ punpckhdq m4, m4
+ punpckhdq m5, m5
+ punpckhdq m6, m6
+
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ movq m1, m0
+ punpcklwd m0, m2
+ punpckhwd m1, m2
+ movd %1, m0
+ punpckhdq m0, m0
+ movd %2, m0
+ movd %3, m1
+ punpckhdq m1, m1
+ movd %4, m1
+
+ punpckhdq m3, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m3
+ movq m5, m4
+ punpcklwd m4, m6
+ punpckhwd m5, m6
+ movd %5, m4
+ punpckhdq m4, m4
+ movd %6, m4
+ movd %7, m5
+ punpckhdq m5, m5
+ movd %8, m5
+%endmacro
+
+%macro SBUTTERFLY 4
+ movq %4, %2
+ punpckl%1 %2, %3
+ punpckh%1 %4, %3
+%endmacro
+
+; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
+; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
+%macro TRANSPOSE6x8_MEM 9
+ movq m0, %1
+ movq m1, %3
+ movq m2, %5
+ movq m3, %7
+ SBUTTERFLY bw, m0, %2, m4
+ SBUTTERFLY bw, m1, %4, m5
+ SBUTTERFLY bw, m2, %6, m6
+ movq [%9+0x10], m5
+ SBUTTERFLY bw, m3, %8, m7
+ SBUTTERFLY wd, m0, m1, m5
+ SBUTTERFLY wd, m2, m3, m1
+ punpckhdq m0, m2
+ movq [%9+0x00], m0
+ SBUTTERFLY wd, m4, [%9+0x10], m3
+ SBUTTERFLY wd, m6, m7, m2
+ SBUTTERFLY dq, m4, m6, m0
+ SBUTTERFLY dq, m5, m1, m7
+ punpckldq m3, m2
+ movq [%9+0x10], m5
+ movq [%9+0x20], m7
+ movq [%9+0x30], m4
+ movq [%9+0x40], m0
+ movq [%9+0x50], m3
+%endmacro
+
+; out: %4 = |%1-%2|>%3
+; clobbers: %5
+%macro DIFF_GT 5
+ movq %5, %2
+ movq %4, %1
+ psubusb %5, %1
+ psubusb %4, %2
+ por %4, %5
+ psubusb %4, %3
+%endmacro
+
+; out: %4 = 0xff where |%1-%2| <= %3, 0 elsewhere (a full byte mask, unlike DIFF_GT)
+; clobbers: %5
+%macro DIFF_GT2 5
+ movq %5, %2
+ movq %4, %1
+ psubusb %5, %1
+ psubusb %4, %2
+ psubusb %5, %3
+ psubusb %4, %3
+ pcmpeqb %4, %5
+%endmacro
+
+%macro SPLATW 1
+%ifidn m0, xmm0
+ pshuflw %1, %1, 0
+ punpcklqdq %1, %1
+%else
+ pshufw %1, %1, 0
+%endif
+%endmacro
+
+; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
+; out: m5=beta-1, m7=mask
+; clobbers: m4,m6
+%macro LOAD_MASK 2
+ movd m4, %1
+ movd m5, %2
+ SPLATW m4
+ SPLATW m5
+ packuswb m4, m4 ; 16x alpha-1
+ packuswb m5, m5 ; 16x beta-1
+ DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
+ DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
+ por m7, m4
+ DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
+ por m7, m4
+ pxor m6, m6
+ pcmpeqb m7, m6
+%endmacro
+
+; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
+; out: m1=p0' m2=q0'
+; clobbers: m0,3-6
+%macro DEBLOCK_P0_Q0 0
+ movq m5, m1
+ pxor m5, m2 ; p0^q0
+ pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
+ pcmpeqb m4, m4
+ pxor m3, m4
+ pavgb m3, m0 ; (p1 - q1 + 256)>>1
+ pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+((p1-q1)>>2)
+ pxor m4, m1
+ pavgb m4, m2 ; (q0 - p0 + 256)>>1
+ pavgb m3, m5
+ paddusb m3, m4 ; d+128+33
+ movq m6, [pb_a1 GLOBAL]
+ psubusb m6, m3
+ psubusb m3, [pb_a1 GLOBAL]
+ pminub m6, m7
+ pminub m3, m7
+ psubusb m1, m6
+ psubusb m2, m3
+ paddusb m1, m3
+ paddusb m2, m6
+%endmacro
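+
+; For reference, this is the spec's standard-strength p0/q0 update:
+;   delta = clip3( -tc, tc, ((q0 - p0)*4 + (p1 - q1) + 4) >> 3 )
+;   p0' = clip(p0 + delta),  q0' = clip(q0 - delta)
+; computed entirely in 8 bits: the pavgb chain builds a biased delta, the
+; pminub against m7 clips it to +/-tc, and the saturating add/sub applies it.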
+
+; in: m1=p0 m2=q0
+; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
+; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+; clobbers: q2, tmp, tc0
+%macro LUMA_Q1 6
+ movq %6, m1
+ pavgb %6, m2
+ pavgb %2, %6 ; avg(p2,avg(p0,q0))
+ pxor %6, %3
+ pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
+ psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
+ movq %6, %1
+ psubusb %6, %5
+ paddusb %5, %1
+ pmaxub %2, %6
+ pminub %2, %5
+ movq %4, %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+%ifdef ARCH_X86_64
+INIT_XMM
+cglobal x264_deblock_v_luma_sse2
+ movd m8, [r4] ; tc0
+ lea r4, [r1*3]
+ dec r2d ; alpha-1
+ neg r4
+ dec r3d ; beta-1
+ add r4, r0 ; pix-3*stride
+
+ movdqa m0, [r4+r1] ; p1
+ movdqa m1, [r4+2*r1] ; p0
+ movdqa m2, [r0] ; q0
+ movdqa m3, [r0+r1] ; q1
+ LOAD_MASK r2d, r3d
+
+ punpcklbw m8, m8
+ punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
+ pcmpeqb m9, m9
+ pcmpeqb m9, m8
+ pandn m9, m7
+ pand m8, m9
+
+ movdqa m3, [r4] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m9
+ movdqa m7, m8
+ psubb m7, m6
+ pand m6, m8
+ LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+
+ movdqa m4, [r0+2*r1] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ pand m6, m9
+ pand m8, m6
+ psubb m7, m6
+ movdqa m3, [r0+r1]
+ LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
+
+ DEBLOCK_P0_Q0
+ movdqa [r4+2*r1], m1
+ movdqa [r0], m2
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_deblock_h_luma_sse2
+ movsxd r10, esi
+ lea r11, [r10+r10*2]
+ lea rax, [r0-4]
+ lea r9, [r0-4+r11]
+ sub rsp, 0x68
+ %define pix_tmp rsp
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
+ lea rax, [rax+r10*8]
+ lea r9, [r9 +r10*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
+
+ ; vertical filter
+ ; alpha, beta, tc0 are still in r2d, r3d, r4
+ ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+ lea r0, [pix_tmp+0x30]
+ mov esi, 0x10
+ call x264_deblock_v_luma_sse2
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ add rax, 2
+ add r9, 2
+ movq m0, [pix_tmp+0x18]
+ movq m1, [pix_tmp+0x28]
+ movq m2, [pix_tmp+0x38]
+ movq m3, [pix_tmp+0x48]
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+
+ shl r10, 3
+ sub rax, r10
+ sub r9, r10
+ shr r10, 3
+ movq m0, [pix_tmp+0x10]
+ movq m1, [pix_tmp+0x20]
+ movq m2, [pix_tmp+0x30]
+ movq m3, [pix_tmp+0x40]
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+
+ add rsp, 0x68
+ ret
+
+%else
+
+%macro DEBLOCK_LUMA 3
+;-----------------------------------------------------------------------------
+; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_%2_luma_%1, 5,5,1
+ lea r4, [r1*3]
+ dec r2 ; alpha-1
+ neg r4
+ dec r3 ; beta-1
+ add r4, r0 ; pix-3*stride
+
+ movq m0, [r4+r1] ; p1
+ movq m1, [r4+2*r1] ; p0
+ movq m2, [r0] ; q0
+ movq m3, [r0+r1] ; q1
+ LOAD_MASK r2, r3
+
+ mov r3, r4m
+%if %3 == 16
+ mov r2, esp
+ and esp, -16
+ sub esp, 32
+%else
+ sub esp, 16
+%endif
+
+ movd m4, [r3] ; tc0
+ punpcklbw m4, m4
+ punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
+ movq [esp+%3], m4 ; tc
+ pcmpeqb m3, m3
+ pcmpgtb m4, m3
+ pand m4, m7
+ movq [esp], m4 ; mask
+
+ movq m3, [r4] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m4
+ pand m4, [esp+%3] ; tc
+ movq m7, m4
+ psubb m7, m6
+ pand m6, m4
+ LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+
+ movq m4, [r0+2*r1] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ movq m5, [esp] ; mask
+ pand m6, m5
+ movq m5, [esp+%3] ; tc
+ pand m5, m6
+ psubb m7, m6
+ movq m3, [r0+r1]
+ LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
+
+ DEBLOCK_P0_Q0
+ movq [r4+2*r1], m1
+ movq [r0], m2
+
+%if %3 == 16
+ mov esp, r2
+%else
+ add esp, 16
+%endif
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_deblock_h_luma_%1, 0,6
+ mov r0, r0m
+ mov r3, r1m
+ lea r4, [r3*3]
+ sub r0, 4
+ lea r1, [r0+r4]
+ SUB esp, 0x6c
+ lea r5, [esp+12]
+ and r5, -16
+%define pix_tmp r5
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
+ lea r0, [r0+r3*8]
+ lea r1, [r1+r3*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
+
+ ; vertical filter
+ lea r0, [pix_tmp+0x30]
+ PUSH dword r4m
+ PUSH dword r3m
+ PUSH dword r2m
+ PUSH dword 16
+ PUSH dword r0
+ call x264_deblock_%2_luma_%1
+%ifidn %2, v8
+ add dword [esp ], 8 ; pix_tmp+0x38
+ add dword [esp+16], 2 ; tc0+2
+ call x264_deblock_%2_luma_%1
+%endif
+ ADD esp, 20
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ mov r0, r0m
+ sub r0, 2
+ lea r1, [r0+r4]
+
+ movq m0, [pix_tmp+0x10]
+ movq m1, [pix_tmp+0x20]
+ movq m2, [pix_tmp+0x30]
+ movq m3, [pix_tmp+0x40]
+ TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
+
+ lea r0, [r0+r3*8]
+ lea r1, [r1+r3*8]
+ movq m0, [pix_tmp+0x18]
+ movq m1, [pix_tmp+0x28]
+ movq m2, [pix_tmp+0x38]
+ movq m3, [pix_tmp+0x48]
+ TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
+
+ ADD esp, 0x6c
+ RET
+%endmacro ; DEBLOCK_LUMA
+
+INIT_MMX
+DEBLOCK_LUMA mmxext, v8, 8
+INIT_XMM
+DEBLOCK_LUMA sse2, v, 16
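+
+; One body, two widths: the mmxext instance filters 8 pixels per call (v8),
+; so the horizontal wrapper's %ifidn v8 branch above calls it a second time
+; with the stacked pix pointer and tc0 advanced; the sse2 instance covers
+; all 16 pixels at once.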
+
+%endif ; ARCH
+
+
+
+INIT_MMX
+
+%macro CHROMA_V_START 0
+ dec r2d ; alpha-1
+ dec r3d ; beta-1
+ mov t5, r0
+ sub t5, r1
+ sub t5, r1
+%endmacro
+
+%macro CHROMA_H_START 0
+ dec r2d
+ dec r3d
+ sub r0, 2
+ lea t6, [r1*3]
+ mov t5, r0
+ add r0, t6
+%endmacro
+
+%define t5 r5
+%define t6 r6
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_v_chroma_mmxext, 5,6
+ CHROMA_V_START
+
+ movq m0, [t5]
+ movq m1, [t5+r1]
+ movq m2, [r0]
+ movq m3, [r0+r1]
+
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ pand m7, m6
+ picgetgot r4
+ DEBLOCK_P0_Q0
+
+ movq [t5+r1], m1
+ movq [r0], m2
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_h_chroma_mmxext, 5,7
+%ifdef ARCH_X86_64
+ %define buf0 [rsp-16]
+ %define buf1 [rsp-8]
+%else
+ %define buf0 r0m
+ %define buf1 r2m
+%endif
+ CHROMA_H_START
+
+ TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
+ movq buf0, m0
+ movq buf1, m3
+
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ pand m7, m6
+ picgetgot r4
+ DEBLOCK_P0_Q0
+
+ movq m0, buf0
+ movq m3, buf1
+ TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
+ RET
+
+
+
+; in: %1=p0 %2=p1 %3=q1
+; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
+%macro CHROMA_INTRA_P0 3
+ movq m4, %1
+ pxor m4, %3
+ pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
+ pavgb %1, %3
+ psubusb %1, m4
+ pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
+%endmacro
+
+%macro CHROMA_INTRA_BODY 0
+ LOAD_MASK r2d, r3d
+ movq m5, m1
+ movq m6, m2
+ CHROMA_INTRA_P0 m1, m0, m3
+ CHROMA_INTRA_P0 m2, m3, m0
+ psubb m1, m5
+ psubb m2, m6
+ pand m1, m7
+ pand m2, m7
+ paddb m1, m5
+ paddb m2, m6
+%endmacro
+
+%define t5 r4
+%define t6 r5
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_v_chroma_intra_mmxext, 4,5,1
+ CHROMA_V_START
+
+ movq m0, [t5]
+ movq m1, [t5+r1]
+ movq m2, [r0]
+ movq m3, [r0+r1]
+
+ CHROMA_INTRA_BODY
+
+ movq [t5+r1], m1
+ movq [r0], m2
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_h_chroma_intra_mmxext, 4,6,1
+ CHROMA_H_START
+ TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
+ CHROMA_INTRA_BODY
+ TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
+ RET
+
--- /dev/null
+;*****************************************************************************
+;* mc-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003-2008 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Laurent Aimar <fenrir@via.ecp.fr>
+;* Min Chen <chenm001.163.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+
+pw_4: times 4 dw 4
+pw_8: times 4 dw 8
+pw_32: times 4 dw 32
+pw_64: times 4 dw 64
+
+SECTION .text
+
+;=============================================================================
+; pixel avg
+;=============================================================================
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
+; uint8_t *src, int src_stride );
+;-----------------------------------------------------------------------------
+%macro AVGH 2
+%assign function_align 8 ; the whole function fits in 8 bytes, so a larger align just wastes space
+cglobal x264_pixel_avg_%1x%2_mmxext
+ mov eax, %2
+ jmp x264_pixel_avg_w%1_mmxext
+%assign function_align 16
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
+; uint8_t *src, int src_stride,
+; int height );
+;-----------------------------------------------------------------------------
+%ifdef ARCH_X86_64
+ %define t0 r0
+ %define t1 r1
+ %define t2 r2
+ %define t3 r3
+ %macro AVG_START 1
+ cglobal %1, 4,5
+ .height_loop:
+ %endmacro
+%else
+ %define t0 r1
+ %define t1 r2
+ %define t2 r3
+ %define t3 r4
+ %macro AVG_START 1
+ cglobal %1, 0,5
+ mov t0, r0m
+ mov t1, r1m
+ mov t2, r2m
+ mov t3, r3m
+ .height_loop:
+ %endmacro
+%endif
+
+%macro AVG_END 0
+ sub eax, 2
+ lea t2, [t2+t3*2]
+ lea t0, [t0+t1*2]
+ jg .height_loop
+ REP_RET
+%endmacro
+
+AVG_START x264_pixel_avg_w4_mmxext
+ movd mm0, [t2]
+ movd mm1, [t2+t3]
+ pavgb mm0, [t0]
+ pavgb mm1, [t0+t1]
+ movd [t0], mm0
+ movd [t0+t1], mm1
+AVG_END
+
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
+
+AVG_START x264_pixel_avg_w8_mmxext
+ movq mm0, [t2]
+ movq mm1, [t2+t3]
+ pavgb mm0, [t0]
+ pavgb mm1, [t0+t1]
+ movq [t0], mm0
+ movq [t0+t1], mm1
+AVG_END
+
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+
+AVG_START x264_pixel_avg_w16_mmxext
+ movq mm0, [t2 ]
+ movq mm1, [t2+8]
+ movq mm2, [t2+t3 ]
+ movq mm3, [t2+t3+8]
+ pavgb mm0, [t0 ]
+ pavgb mm1, [t0+8]
+ pavgb mm2, [t0+t1 ]
+ pavgb mm3, [t0+t1+8]
+ movq [t0 ], mm0
+ movq [t0+8], mm1
+ movq [t0+t1 ], mm2
+ movq [t0+t1+8], mm3
+AVG_END
+
+AVGH 16, 16
+AVGH 16, 8
+
+AVG_START x264_pixel_avg_w16_sse2
+ movdqu xmm0, [t2]
+ movdqu xmm1, [t2+t3]
+ pavgb xmm0, [t0]
+ pavgb xmm1, [t0+t1]
+ movdqa [t0], xmm0
+ movdqa [t0+t1], xmm1
+AVG_END
+
+
+
+;=============================================================================
+; pixel avg2
+;=============================================================================
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
+; uint8_t *src1, int src_stride,
+; uint8_t *src2, int height );
+;-----------------------------------------------------------------------------
+%macro AVG2_W8 2
+cglobal x264_pixel_avg2_w%1_mmxext, 6,7
+ sub r4, r2
+ lea r6, [r4+r3]
+.height_loop:
+ %2 mm0, [r2]
+ %2 mm1, [r2+r3]
+ pavgb mm0, [r2+r4]
+ pavgb mm1, [r2+r6]
+ %2 [r0], mm0
+ %2 [r0+r1], mm1
+ sub r5d, 2
+ lea r2, [r2+r3*2]
+ lea r0, [r0+r1*2]
+ jg .height_loop
+ REP_RET
+%endmacro
+
+AVG2_W8 4, movd
+AVG2_W8 8, movq
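+
+; Note the pointer trick: src2 is converted to an offset from src1
+; (sub r4, r2), so advancing r2 walks both sources and only one pointer
+; needs updating per row; r6 folds in the extra stride for the second row
+; of each iteration.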
+
+%macro AVG2_W16 2
+cglobal x264_pixel_avg2_w%1_mmxext, 6,7
+ sub r4, r2
+ lea r6, [r4+r3]
+.height_loop:
+ movq mm0, [r2]
+ %2 mm1, [r2+8]
+ movq mm2, [r2+r3]
+ %2 mm3, [r2+r3+8]
+ pavgb mm0, [r2+r4]
+ pavgb mm1, [r2+r4+8]
+ pavgb mm2, [r2+r6]
+ pavgb mm3, [r2+r6+8]
+ movq [r0], mm0
+ %2 [r0+8], mm1
+ movq [r0+r1], mm2
+ %2 [r0+r1+8], mm3
+ lea r2, [r2+r3*2]
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ REP_RET
+%endmacro
+
+AVG2_W16 12, movd
+AVG2_W16 16, movq
+
+cglobal x264_pixel_avg2_w20_mmxext, 6,7
+ sub r4, r2
+ lea r6, [r4+r3]
+.height_loop:
+ movq mm0, [r2]
+ movq mm1, [r2+8]
+ movd mm2, [r2+16]
+ movq mm3, [r2+r3]
+ movq mm4, [r2+r3+8]
+ movd mm5, [r2+r3+16]
+ pavgb mm0, [r2+r4]
+ pavgb mm1, [r2+r4+8]
+ pavgb mm2, [r2+r4+16]
+ pavgb mm3, [r2+r6]
+ pavgb mm4, [r2+r6+8]
+ pavgb mm5, [r2+r6+16]
+ movq [r0], mm0
+ movq [r0+8], mm1
+ movd [r0+16], mm2
+ movq [r0+r1], mm3
+ movq [r0+r1+8], mm4
+ movd [r0+r1+16], mm5
+ lea r2, [r2+r3*2]
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ REP_RET
+
+
+
+;=============================================================================
+; pixel copy
+;=============================================================================
+
+%macro COPY4 3
+ %1 mm0, [r2]
+ %1 mm1, [r2+r3]
+ %1 mm2, [r2+r3*2]
+ %1 mm3, [r2+%3]
+ %1 [r0], mm0
+ %1 [r0+r1], mm1
+ %1 [r0+r1*2], mm2
+ %1 [r0+%2], mm3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
+; uint8_t *src, int i_src_stride, int i_height )
+;-----------------------------------------------------------------------------
+cglobal x264_mc_copy_w4_mmx, 4,6
+ cmp r4m, dword 4
+ lea r5, [r3*3]
+ lea r4, [r1*3]
+ je .end
+ COPY4 movd, r4, r5
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+.end:
+ COPY4 movd, r4, r5
+ RET
+
+cglobal x264_mc_copy_w8_mmx, 5,7
+ lea r6, [r3*3]
+ lea r5, [r1*3]
+.height_loop:
+ COPY4 movq, r5, r6
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+ sub r4d, 4
+ jg .height_loop
+ REP_RET
+
+cglobal x264_mc_copy_w16_mmx, 5,7
+ lea r6, [r3*3]
+ lea r5, [r1*3]
+.height_loop:
+ movq mm0, [r2]
+ movq mm1, [r2+8]
+ movq mm2, [r2+r3]
+ movq mm3, [r2+r3+8]
+ movq mm4, [r2+r3*2]
+ movq mm5, [r2+r3*2+8]
+ movq mm6, [r2+r6]
+ movq mm7, [r2+r6+8]
+ movq [r0], mm0
+ movq [r0+8], mm1
+ movq [r0+r1], mm2
+ movq [r0+r1+8], mm3
+ movq [r0+r1*2], mm4
+ movq [r0+r1*2+8], mm5
+ movq [r0+r5], mm6
+ movq [r0+r5+8], mm7
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+ sub r4d, 4
+ jg .height_loop
+ REP_RET
+
+
+
+;=============================================================================
+; weighted prediction
+;=============================================================================
+; implicit bipred only:
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
+
+%macro BIWEIGHT_4P_MMX 2
+ movd mm0, %1
+ movd mm1, %2
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ pmullw mm0, mm4
+ pmullw mm1, mm5
+ paddw mm0, mm1
+ paddw mm0, mm6
+ psraw mm0, 6
+ pmaxsw mm0, mm7
+ packuswb mm0, mm0
+ movd %1, mm0
+%endmacro
+
+%macro BIWEIGHT_START_MMX 1
+%ifidn r4m, r4d
+ movd mm4, r4m
+ pshufw mm4, mm4, 0 ; weight_dst
+%else
+ pshufw mm4, r4m, 0
+%endif
+ picgetgot r4
+ movq mm5, [pw_64 GLOBAL]
+ psubw mm5, mm4 ; weight_src
+ movq mm6, [pw_32 GLOBAL] ; rounding
+ pxor mm7, mm7
+%if %1
+%ifidn r5m, r5d
+ %define t0 r5d
+%else
+ %define t0 r4d
+ mov r4d, r5m
+%endif
+%endif
+.height_loop:
+%endmacro
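+
+; So each pixel becomes (dst*i_weight + src*(64 - i_weight) + 32) >> 6,
+; the implicit-bipred rounding described above: mm4/mm5 hold the two
+; broadcast weights and mm6 the rounding constant used by BIWEIGHT_4P_MMX.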
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_avg_weight_w16_mmxext, 4,5
+ BIWEIGHT_START_MMX 1
+ BIWEIGHT_4P_MMX [r0 ], [r2 ]
+ BIWEIGHT_4P_MMX [r0+ 4], [r2+ 4]
+ BIWEIGHT_4P_MMX [r0+ 8], [r2+ 8]
+ BIWEIGHT_4P_MMX [r0+12], [r2+12]
+ add r0, r1
+ add r2, r3
+ dec t0
+ jg .height_loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_avg_weight_w8_mmxext, 4,5
+ BIWEIGHT_START_MMX 1
+ BIWEIGHT_4P_MMX [r0 ], [r2 ]
+ BIWEIGHT_4P_MMX [r0+4], [r2+4]
+ add r0, r1
+ add r2, r3
+ dec t0
+ jg .height_loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4
+ BIWEIGHT_START_MMX 0
+ BIWEIGHT_4P_MMX [r0 ], [r2 ]
+ BIWEIGHT_4P_MMX [r0+r1 ], [r2+r3 ]
+ BIWEIGHT_4P_MMX [r0+r1*2], [r2+r3*2]
+ add r0, r1
+ add r2, r3
+ BIWEIGHT_4P_MMX [r0+r1*2], [r2+r3*2]
+ RET
+
+
+
+;=============================================================================
+; prefetch
+;=============================================================================
+; FIXME assumes 64 byte cachelines
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
+; uint8_t *pix_uv, int stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+%ifdef ARCH_X86_64
+cglobal x264_prefetch_fenc_mmxext, 5,5
+ mov eax, r4d
+ and eax, 3
+ imul eax, r1d
+ lea r0, [r0+rax*4+64]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+ lea r0, [r0+r1*2]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+
+ and r4d, 6
+ imul r4d, r3d
+ lea r2, [r2+r4+64]
+ prefetcht0 [r2]
+ prefetcht0 [r2+r3]
+ ret
+
+%else
+cglobal x264_prefetch_fenc_mmxext
+ mov r2, [esp+20]
+ mov r1, [esp+8]
+ mov r0, [esp+4]
+ and r2, 3
+ imul r2, r1
+ lea r0, [r0+r2*4+64]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+ lea r0, [r0+r1*2]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+
+ mov r2, [esp+20]
+ mov r1, [esp+16]
+ mov r0, [esp+12]
+ and r2, 6
+ imul r2, r1
+ lea r0, [r0+r2+64]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+ ret
+%endif ; ARCH_X86_64
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+;-----------------------------------------------------------------------------
+cglobal x264_prefetch_ref_mmxext, 3,3
+ dec r2d
+ and r2d, r1d
+ lea r0, [r0+r2*8+64]
+ lea r2, [r1*3]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+ prefetcht0 [r0+r1*2]
+ prefetcht0 [r0+r2]
+ lea r0, [r0+r1*4]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+ prefetcht0 [r0+r1*2]
+ prefetcht0 [r0+r2]
+ ret
+
+
+
+;=============================================================================
+; chroma MC
+;=============================================================================
+
+;-----------------------------------------------------------------------------
+; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride,
+; uint8_t *src, int i_src_stride,
+; int dx, int dy,
+; int i_width, int i_height )
+;-----------------------------------------------------------------------------
+cglobal x264_mc_chroma_mmxext, 0,6,1
+%ifdef ARCH_X86_64
+ %define t0 r10d
+%else
+ %define t0 r1d
+%endif
+ movifnidn r2d, r2m
+ movifnidn r3d, r3m
+ movifnidn r4d, r4m
+ movifnidn r5d, r5m
+ mov eax, r5d
+ mov t0, r4d
+ sar eax, 3
+ sar t0, 3
+ imul eax, r3d
+ pxor mm3, mm3
+ add eax, t0
+ movsxdifnidn rax, eax
+ add r2, rax ; src += (dx>>3) + (dy>>3) * src_stride
+ and r4d, 7 ; dx &= 7
+ je .mc1d
+ and r5d, 7 ; dy &= 7
+ je .mc1d
+
+ movd mm0, r4d
+ movd mm1, r5d
+ pshufw mm5, mm0, 0 ; mm5 = dx
+ pshufw mm6, mm1, 0 ; mm6 = dy
+
+ movq mm4, [pw_8 GLOBAL]
+ movq mm0, mm4
+ psubw mm4, mm5 ; mm4 = 8-dx
+ psubw mm0, mm6 ; mm0 = 8-dy
+
+ movq mm7, mm5
+ pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB
+ pmullw mm7, mm6 ; mm7 = dx*dy = cD
+ pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC
+ pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA
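+
+ ; For reference, this is plain bilinear chroma interpolation:
+ ;   dst = ( cA*A + cB*B + cC*C + cD*D + 32 ) >> 6
+ ; where A..D are the 2x2 neighborhood (A=src[x,y], B=src[x+1,y],
+ ; C=src[x,y+1], D=src[x+1,y+1]) and the weights above sum to 64.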
+
+ mov r4d, r7m
+%ifdef ARCH_X86_64
+ mov r10, r0
+ mov r11, r2
+%else
+ mov r0, r0m
+ mov r1, r1m
+ mov r5, r2
+%endif
+
+ALIGN 4
+.height_loop:
+
+ movd mm1, [r2+r3]
+ movd mm0, [r2]
+ punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4
+ punpcklbw mm0, mm3
+ pmullw mm1, mm6 ; 2nd line * cC
+ pmullw mm0, mm4 ; 1st line * cA
+ paddw mm0, mm1 ; mm0 <- result
+
+ movd mm2, [r2+1]
+ movd mm1, [r2+r3+1]
+ punpcklbw mm2, mm3
+ punpcklbw mm1, mm3
+
+ paddw mm0, [pw_32 GLOBAL]
+
+ pmullw mm2, mm5 ; line * cB
+ pmullw mm1, mm7 ; line * cD
+ paddw mm0, mm2
+ paddw mm0, mm1
+ psrlw mm0, 6
+
+ packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4
+ movd [r0], mm0
+
+ add r2, r3
+ add r0, r1 ; i_dst_stride
+ dec r4d
+ jnz .height_loop
+
+ sub dword r6m, 8
+ jnz .finish ; width != 8 so assume 4
+
+%ifdef ARCH_X86_64
+ lea r0, [r10+4] ; dst
+ lea r2, [r11+4] ; src
+%else
+ mov r0, r0m
+ lea r2, [r5+4]
+ add r0, 4
+%endif
+ mov r4d, r7m ; i_height
+ jmp .height_loop
+
+.finish:
+ REP_RET
+
+ALIGN 4
+.mc1d:
+ mov eax, r4d
+ or eax, r5d
+ and eax, 7
+ cmp r4d, 0
+ mov r5d, 1
+ cmove r5, r3 ; pel_offset = dx ? 1 : src_stride
+ movd mm6, eax
+ movq mm5, [pw_8 GLOBAL]
+ pshufw mm6, mm6, 0
+ movq mm7, [pw_4 GLOBAL]
+ psubw mm5, mm6
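+ ; dx or dy is 0 here, so the 2-D filter degenerates to a 2-tap one:
+ ;   dst = ( (8-d)*src[0] + d*src[pel_offset] + 4 ) >> 3
+ ; mm5/mm6 hold the two weights and mm7 the pw_4 rounding constant.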
+
+ cmp dword r6m, 8
+ movifnidn r0d, r0m
+ movifnidn r1d, r1m
+ mov r4d, r7m
+ je .height_loop1_w8
+
+ALIGN 4
+.height_loop1_w4:
+ movd mm0, [r2+r5]
+ movd mm1, [r2]
+ punpcklbw mm0, mm3
+ punpcklbw mm1, mm3
+ pmullw mm0, mm6
+ pmullw mm1, mm5
+ paddw mm0, mm7
+ paddw mm0, mm1
+ psrlw mm0, 3
+ packuswb mm0, mm3
+ movd [r0], mm0
+ add r2, r3
+ add r0, r1
+ dec r4d
+ jnz .height_loop1_w4
+ REP_RET
+
+ALIGN 4
+.height_loop1_w8:
+ movq mm0, [r2+r5]
+ movq mm1, [r2]
+ movq mm2, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm3
+ punpcklbw mm1, mm3
+ punpckhbw mm2, mm3
+ punpckhbw mm4, mm3
+ pmullw mm0, mm6
+ pmullw mm1, mm5
+ pmullw mm2, mm6
+ pmullw mm4, mm5
+ paddw mm0, mm7
+ paddw mm2, mm7
+ paddw mm0, mm1
+ paddw mm2, mm4
+ psrlw mm0, 3
+ psrlw mm2, 3
+ packuswb mm0, mm2
+ movq [r0], mm0
+ add r2, r3
+ add r0, r1
+ dec r4d
+ jnz .height_loop1_w8
+ REP_RET
+
;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
-;* Copyright (C) 2005 x264 project
+;* Copyright (C) 2005-2008 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Mathieu Monnier <manao@melix.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
-BITS 32
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "i386inc.asm"
-
-;=============================================================================
-; Read only data
-;=============================================================================
+%include "x86inc.asm"
SECTION_RODATA
-ALIGN 16
pw_1: times 4 dw 1
pw_16: times 4 dw 16
pw_32: times 4 dw 32
-;=============================================================================
-; Macros
-;=============================================================================
+SECTION .text
%macro LOAD_ADD 3
movd %1, %2
packuswb mm1, mm4
%endmacro
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
;-----------------------------------------------------------------------------
; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
; int i_stride, int i_width, int i_height );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_mmxext
- push ebp
- mov ebp, esp
- push ebx
- push esi
- push edi
- picgetgot ebx
-
- %define tdsth ebp + 8
- %define tdstv ebp + 12
- %define tdstc ebp + 16
- %define tsrc ebp + 20
- %define tstride ebp + 24
- %define twidth ebp + 28
- %define theight ebp + 32
- %define tpw_1 ebp - 36
- %define tpw_16 ebp - 28
- %define tpw_32 ebp - 20
- %define tbuffer esp + 8
-
- %define x eax
- %define dsth ebx
- %define dstv ebx
- %define dstc ebx
- %define src ecx
- %define src3 edx
- %define stride esi
- %define width edi
-
- mov stride, [tstride]
- mov width, [twidth]
- lea eax, [stride*2 + 24 + 24]
- sub esp, eax
+cglobal x264_hpel_filter_mmxext, 0,7
+ %define x r0
+ %define xd r0d
+ %define dsth r1
+ %define dstv r1
+ %define dstc r1
+ %define src r2
+ %define src3 r3
+ %define stride r4
+ %define width r5d
+ %define tbuffer rsp+8
+
+%ifdef ARCH_X86_64
+ PUSH rbp
+ PUSH r12
+ PUSH r13
+ PUSH r14
+ %define tdsth r10 ; FIXME r8,9
+ %define tdstv r11
+ %define tdstc r12
+ %define tsrc r13
+ %define theight r14d
+ mov tdsth, r0
+ mov tdstv, r1
+ mov tdstc, r2
+ mov tsrc, r3
+ mov theight, r6m
+%else
+ %define tdsth [rbp + 20]
+ %define tdstv [rbp + 24]
+ %define tdstc [rbp + 28]
+ %define tsrc [rbp + 32]
+ %define theight [rbp + 44]
+%endif
+
+ movifnidn r4d, r4m
+ movifnidn r5d, r5m
+ mov rbp, rsp
+ lea rax, [stride*2 + 24]
+ sub rsp, rax
pxor mm0, mm0
- ; mov globals onto the stack, to free up ebx
+ %define tpw_1 [pw_1 GLOBAL]
+ %define tpw_16 [pw_16 GLOBAL]
+ %define tpw_32 [pw_32 GLOBAL]
+%ifdef PIC32
+ ; mov globals onto the stack, to free up PIC pointer
+ %define tpw_1 [ebp - 24]
+ %define tpw_16 [ebp - 16]
+ %define tpw_32 [ebp - 8]
+ picgetgot ebx
+ sub esp, 24
movq mm1, [pw_1 GLOBAL]
movq mm2, [pw_16 GLOBAL]
movq mm3, [pw_32 GLOBAL]
- movq [tpw_1], mm1
- movq [tpw_16], mm2
- movq [tpw_32], mm3
+ movq tpw_1, mm1
+ movq tpw_16, mm2
+ movq tpw_32, mm3
+%endif
.loopy:
- mov src, [tsrc]
- mov dstv, [tdstv]
+ mov src, tsrc
+ mov dstv, tdstv
lea src3, [src + stride]
sub src, stride
sub src, stride
- xor x, x
+ xor xd, xd
ALIGN 16
.vertical_filter:
FILT_V
- movq mm7, [tpw_16]
+ movq mm7, tpw_16
movq [tbuffer + x*2], mm1
movq [tbuffer + x*2 + 8], mm4
paddw mm1, mm7
packuswb mm1, mm4
movntq [dstv + x], mm1
- add x, 8
+ add xd, 8
add src, 8
add src3, 8
- cmp x, width
+ cmp xd, width
jle .vertical_filter
pshufw mm2, [tbuffer], 0
movq [tbuffer - 8], mm2 ; pad left
; no need to pad right, since vertical_filter already did 4 extra pixels
- mov dstc, [tdstc]
- xor x, x
- movq mm7, [tpw_32]
+ mov dstc, tdstc
+ xor xd, xd
+ movq mm7, tpw_32
.center_filter:
movq mm1, [tbuffer + x*2 - 4 ]
FILT_PACK 6
movntq [dstc + x], mm1
- add x, 8
- cmp x, width
+ add xd, 8
+ cmp xd, width
jl .center_filter
- mov dsth, [tdsth]
- mov src, [tsrc]
- xor x, x
+ mov dsth, tdsth
+ mov src, tsrc
+ xor xd, xd
.horizontal_filter:
movd mm1, [src + x - 2]
punpcklbw mm6, mm0
paddw mm6, mm7 ; a1
- movq mm7, [tpw_1]
+ movq mm7, tpw_1
FILT_H
FILT_PACK 1
movntq [dsth + x], mm1
- add x, 8
- cmp x, width
+ add xd, 8
+ cmp xd, width
jl .horizontal_filter
- add [tsrc], stride
- add [tdsth], stride
- add [tdstv], stride
- add [tdstc], stride
- dec dword [theight]
+ add tsrc, stride
+ add tdsth, stride
+ add tdstv, stride
+ add tdstc, stride
+ dec dword theight
jg .loopy
- lea esp, [ebp-12]
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
-
+ mov rsp, rbp
+%ifdef ARCH_X86_64
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+%endif
+ RET
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
; uint8_t *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
-cglobal x264_plane_copy_mmxext
- push edi
- push esi
- push ebx
- mov edi, [esp+16] ; dst
- mov ebx, [esp+20] ; i_dst
- mov esi, [esp+24] ; src
- mov eax, [esp+28] ; i_src
- mov edx, [esp+32] ; w
- add edx, 3
- and edx, ~3
- sub ebx, edx
- sub eax, edx
+cglobal x264_plane_copy_mmxext, 6,7
+ movsxdifnidn r1, r1d
+ movsxdifnidn r3, r3d
+ add r4d, 3
+ and r4d, ~3
+ mov r6d, r4d
+ and r6d, ~15
+ sub r1, r6
+ sub r3, r6
.loopy:
- mov ecx, edx
- sub ecx, 64
+ mov r6d, r4d
+ sub r6d, 64
jl .endx
.loopx:
- prefetchnta [esi+256]
- movq mm0, [esi ]
- movq mm1, [esi+ 8]
- movq mm2, [esi+16]
- movq mm3, [esi+24]
- movq mm4, [esi+32]
- movq mm5, [esi+40]
- movq mm6, [esi+48]
- movq mm7, [esi+56]
- movntq [edi ], mm0
- movntq [edi+ 8], mm1
- movntq [edi+16], mm2
- movntq [edi+24], mm3
- movntq [edi+32], mm4
- movntq [edi+40], mm5
- movntq [edi+48], mm6
- movntq [edi+56], mm7
- add esi, 64
- add edi, 64
- sub ecx, 64
+ prefetchnta [r2+256]
+ movq mm0, [r2 ]
+ movq mm1, [r2+ 8]
+ movq mm2, [r2+16]
+ movq mm3, [r2+24]
+ movq mm4, [r2+32]
+ movq mm5, [r2+40]
+ movq mm6, [r2+48]
+ movq mm7, [r2+56]
+ movntq [r0 ], mm0
+ movntq [r0+ 8], mm1
+ movntq [r0+16], mm2
+ movntq [r0+24], mm3
+ movntq [r0+32], mm4
+ movntq [r0+40], mm5
+ movntq [r0+48], mm6
+ movntq [r0+56], mm7
+ add r2, 64
+ add r0, 64
+ sub r6d, 64
jge .loopx
.endx:
- prefetchnta [esi+256]
- add ecx, 64
- shr ecx, 2
- rep movsd
- add edi, ebx
- add esi, eax
- sub dword [esp+36], 1
+ prefetchnta [r2+256]
+ add r6d, 48
+ jl .end16
+.loop16:
+ movq mm0, [r2 ]
+ movq mm1, [r2+8]
+ movntq [r0 ], mm0
+ movntq [r0+8], mm1
+ add r2, 16
+ add r0, 16
+ sub r6d, 16
+ jge .loop16
+.end16:
+ add r6d, 12
+ jl .end4
+.loop4:
+ movd mm2, [r2+r6]
+ movd [r0+r6], mm2
+ sub r6d, 4
+ jge .loop4
+.end4:
+ add r2, r3
+ add r0, r1
+ dec r5d
jg .loopy
- pop ebx
- pop esi
- pop edi
emms
- ret
+ RET
/*****************************************************************************
- * mc.c: h264 encoder library (Motion Compensation)
+ * mc-c.c: h264 encoder library (Motion Compensation)
*****************************************************************************
- * Copyright (C) 2003 Laurent Aimar
- * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $
+ * Copyright (C) 2003-2008 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ * Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
-extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
-extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
+extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
+extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
+extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int dx, int dy, int i_width, int i_height );
+extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int i_stride, int i_width, int i_height );
NULL,
x264_pixel_avg2_w4_mmxext,
x264_pixel_avg2_w8_mmxext,
- x264_pixel_avg2_w16_mmxext,
+ x264_pixel_avg2_w12_mmxext,
x264_pixel_avg2_w16_mmxext,
x264_pixel_avg2_w20_mmxext,
};
pf->mc_luma = mc_luma_mmxext;
pf->get_ref = get_ref_mmxext;
+ pf->mc_chroma = x264_mc_chroma_mmxext;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
* mc.h: h264 encoder library
*****************************************************************************
* Copyright (C) 2003 Laurent Aimar
- * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
--- /dev/null
+;*****************************************************************************
+;* pixel-32.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003-2008 x264 project
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;* Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION .text
+
+%macro SUMSUB_BADC 4
+ paddw %1, %2
+ paddw %3, %4
+ paddw %2, %2
+ paddw %4, %4
+ psubw %2, %1
+ psubw %4, %3
+%endmacro
+
+%macro SBUTTERFLY 5
+ mov%1 %5, %3
+ punpckl%2 %3, %4
+ punpckh%2 %5, %4
+%endmacro
+
+%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
+ SBUTTERFLY q, wd, %1, %2, %5
+ SBUTTERFLY q, wd, %3, %4, %2
+ SBUTTERFLY q, dq, %1, %3, %4
+ SBUTTERFLY q, dq, %5, %2, %3
+%endmacro
+
+%macro ABS1 2 ; mma, tmp
+ pxor %2, %2
+ psubw %2, %1
+ pmaxsw %1, %2
+%endmacro
+
+%macro ABS2 4 ; mma, mmb, tmp0, tmp1
+ pxor %3, %3
+ pxor %4, %4
+ psubw %3, %1
+ psubw %4, %2
+ pmaxsw %1, %3
+ pmaxsw %2, %4
+%endmacro
+
+%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy
+ movd %1, [eax+ebx*%4+%3]
+ movd %2, [ecx+edx*%4+%3]
+ punpcklbw %1, %2
+ punpcklbw %2, %2
+ psubw %1, %2
+%endmacro
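+
+; Note the unpack trick in LOAD_DIFF_4P: interleaving pix1 with pix2 gives
+; words (p_i | q_i<<8), interleaving pix2 with itself gives (q_i | q_i<<8),
+; so a single psubw yields the exact 16-bit differences p_i - q_i without a
+; separate zero-extension of each row.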
+
+%macro LOAD_DIFF_4x8P 1 ; dx
+ LOAD_DIFF_4P mm0, mm7, %1, 0
+ LOAD_DIFF_4P mm1, mm7, %1, 1
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ LOAD_DIFF_4P mm2, mm7, %1, 0
+ LOAD_DIFF_4P mm3, mm7, %1, 1
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ LOAD_DIFF_4P mm4, mm7, %1, 0
+ LOAD_DIFF_4P mm5, mm7, %1, 1
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ LOAD_DIFF_4P mm6, mm7, %1, 0
+ movq [spill], mm6
+ LOAD_DIFF_4P mm7, mm6, %1, 1
+ movq mm6, [spill]
+%endmacro
+
+%macro HADAMARD8_1D 8
+ SUMSUB_BADC %1, %5, %2, %6
+ SUMSUB_BADC %3, %7, %4, %8
+ SUMSUB_BADC %1, %3, %2, %4
+ SUMSUB_BADC %5, %7, %6, %8
+ SUMSUB_BADC %1, %2, %3, %4
+ SUMSUB_BADC %5, %6, %7, %8
+%endmacro
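+
+; An 8-point Hadamard transform factors into 3 stages of 4 paired
+; add/subtracts (12 butterflies, issued two at a time by SUMSUB_BADC);
+; sa8d applies it to the rows and columns of the 8x8 difference block and
+; sums the absolute values of the resulting coefficients.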
+
+%macro SUM4x8_MM 0
+ movq [spill], mm6
+ movq [spill+8], mm7
+ ABS2 mm0, mm1, mm6, mm7
+ ABS2 mm2, mm3, mm6, mm7
+ paddw mm0, mm2
+ paddw mm1, mm3
+ movq mm6, [spill]
+ movq mm7, [spill+8]
+ ABS2 mm4, mm5, mm2, mm3
+ ABS2 mm6, mm7, mm2, mm3
+ paddw mm4, mm6
+ paddw mm5, mm7
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm0, mm1
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_sa8d_8x8_mmxext
+ push ebx
+ mov eax, [esp+ 8] ; pix1
+ mov ebx, [esp+12] ; stride1
+ mov ecx, [esp+16] ; pix2
+ mov edx, [esp+20] ; stride2
+ sub esp, 0x70
+%define args esp+0x74
+%define spill esp+0x60 ; +16
+%define trans esp+0 ; +96
+ LOAD_DIFF_4x8P 0
+ HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movq [spill], mm0
+ TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
+ movq [trans+0x00], mm4
+ movq [trans+0x08], mm7
+ movq [trans+0x10], mm0
+ movq [trans+0x18], mm6
+ movq mm0, [spill]
+ TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
+ movq [trans+0x20], mm0
+ movq [trans+0x28], mm3
+ movq [trans+0x30], mm4
+ movq [trans+0x38], mm2
+
+ mov eax, [args+4]
+ mov ecx, [args+12]
+ LOAD_DIFF_4x8P 4
+ HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movq [spill], mm7
+ TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
+ movq [trans+0x40], mm0
+ movq [trans+0x48], mm3
+ movq [trans+0x50], mm7
+ movq [trans+0x58], mm2
+ movq mm7, [spill]
+ TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
+ movq mm5, [trans+0x00]
+ movq mm1, [trans+0x08]
+ movq mm2, [trans+0x10]
+ movq mm3, [trans+0x18]
+
+ HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
+ SUM4x8_MM
+ movq [trans], mm0
+
+ movq mm0, [trans+0x20]
+ movq mm1, [trans+0x28]
+ movq mm2, [trans+0x30]
+ movq mm3, [trans+0x38]
+ movq mm4, [trans+0x40]
+ movq mm5, [trans+0x48]
+ movq mm6, [trans+0x50]
+ movq mm7, [trans+0x58]
+
+ HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ SUM4x8_MM
+
+ pavgw mm0, [esp]
+ pshufw mm1, mm0, 01001110b
+ paddw mm0, mm1
+ pshufw mm1, mm0, 10110001b
+ paddw mm0, mm1
+ movd eax, mm0
+ and eax, 0xffff
+ mov ecx, eax ; preserve rounding for 16x16
+ add eax, 1
+ shr eax, 1
+ add esp, 0x70
+ pop ebx
+ ret
+%undef args
+%undef spill
+%undef trans
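+
+; Normalization note: pavgw halves the two half-block sums once with
+; rounding, and the final (eax+1)>>1 halves again, so the result is
+; effectively the coefficient sum divided by 4, the gain of the 2-D
+; Hadamard; ecx keeps the pre-rounded sum so a 16x16 caller rounds once.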
+
+%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
+ pxor %7, %7
+ pshufw %4, %1, 01001110b
+ pshufw %5, %2, 01001110b
+ pshufw %6, %3, 01001110b
+ paddusw %1, %4
+ paddusw %2, %5
+ paddusw %3, %6
+ punpcklwd %1, %7
+ punpcklwd %2, %7
+ punpcklwd %3, %7
+ pshufw %4, %1, 01001110b
+ pshufw %5, %2, 01001110b
+ pshufw %6, %3, 01001110b
+ %8 %1, %4
+ %8 %2, %5
+ %8 %3, %6
+%endmacro
+
+%macro LOAD_4x8P 1 ; dx
+ pxor mm7, mm7
+ movd mm6, [eax+%1+7*FENC_STRIDE]
+ movd mm0, [eax+%1+0*FENC_STRIDE]
+ movd mm1, [eax+%1+1*FENC_STRIDE]
+ movd mm2, [eax+%1+2*FENC_STRIDE]
+ movd mm3, [eax+%1+3*FENC_STRIDE]
+ movd mm4, [eax+%1+4*FENC_STRIDE]
+ movd mm5, [eax+%1+5*FENC_STRIDE]
+ punpcklbw mm6, mm7
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ movq [spill], mm6
+ punpcklbw mm2, mm7
+ punpcklbw mm3, mm7
+ movd mm6, [eax+%1+6*FENC_STRIDE]
+ punpcklbw mm4, mm7
+ punpcklbw mm5, mm7
+ punpcklbw mm6, mm7
+ movq mm7, [spill]
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
+;-----------------------------------------------------------------------------
+cglobal x264_intra_sa8d_x3_8x8_core_mmxext
+ mov eax, [esp+4]
+ mov ecx, [esp+8]
+ sub esp, 0x70
+%define args esp+0x74
+%define spill esp+0x60 ; +16
+%define trans esp+0 ; +96
+%define sum esp+0 ; +32
+ LOAD_4x8P 0
+ HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movq [spill], mm0
+ TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
+ movq [trans+0x00], mm4
+ movq [trans+0x08], mm7
+ movq [trans+0x10], mm0
+ movq [trans+0x18], mm6
+ movq mm0, [spill]
+ TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
+ movq [trans+0x20], mm0
+ movq [trans+0x28], mm3
+ movq [trans+0x30], mm4
+ movq [trans+0x38], mm2
+
+ LOAD_4x8P 4
+ HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movq [spill], mm7
+ TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
+ movq [trans+0x40], mm0
+ movq [trans+0x48], mm3
+ movq [trans+0x50], mm7
+ movq [trans+0x58], mm2
+ movq mm7, [spill]
+ TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
+ movq mm5, [trans+0x00]
+ movq mm1, [trans+0x08]
+ movq mm2, [trans+0x10]
+ movq mm3, [trans+0x18]
+
+ HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
+
+ movq [spill+0], mm5
+ movq [spill+8], mm7
+ ABS2 mm0, mm1, mm5, mm7
+ ABS2 mm2, mm3, mm5, mm7
+ paddw mm0, mm2
+ paddw mm1, mm3
+ paddw mm0, mm1
+ ABS2 mm4, mm6, mm2, mm3
+ movq mm5, [spill+0]
+ movq mm7, [spill+8]
+ paddw mm0, mm4
+ paddw mm0, mm6
+ ABS1 mm7, mm1
+ paddw mm0, mm7 ; 7x4 sum
+ movq mm6, mm5
+ movq mm7, [ecx+8] ; left bottom
+ psllw mm7, 3
+ psubw mm6, mm7
+ ABS2 mm5, mm6, mm2, mm3
+ paddw mm5, mm0
+ paddw mm6, mm0
+ movq [sum+0], mm5 ; dc
+ movq [sum+8], mm6 ; left
+
+ movq mm0, [trans+0x20]
+ movq mm1, [trans+0x28]
+ movq mm2, [trans+0x30]
+ movq mm3, [trans+0x38]
+ movq mm4, [trans+0x40]
+ movq mm5, [trans+0x48]
+ movq mm6, [trans+0x50]
+ movq mm7, [trans+0x58]
+
+ HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movd [sum+0x10], mm0
+ movd [sum+0x12], mm1
+ movd [sum+0x14], mm2
+ movd [sum+0x16], mm3
+ movd [sum+0x18], mm4
+ movd [sum+0x1a], mm5
+ movd [sum+0x1c], mm6
+ movd [sum+0x1e], mm7
+
+ movq [spill], mm0
+ movq [spill+8], mm1
+ ABS2 mm2, mm3, mm0, mm1
+ ABS2 mm4, mm5, mm0, mm1
+ paddw mm2, mm3
+ paddw mm4, mm5
+ paddw mm2, mm4
+ movq mm0, [spill]
+ movq mm1, [spill+8]
+ ABS2 mm6, mm7, mm4, mm5
+ ABS1 mm1, mm4
+ paddw mm2, mm7
+ paddw mm1, mm6
+ paddw mm2, mm1 ; 7x4 sum
+ movq mm1, mm0
+
+ movq mm7, [ecx+0]
+ psllw mm7, 3 ; left top
+
+ movzx edx, word [ecx+0]
+ add dx, [ecx+16]
+ lea edx, [4*edx+32]
+ and edx, -64
+ movd mm6, edx ; dc
+
+ psubw mm1, mm7
+ psubw mm0, mm6
+ ABS2 mm0, mm1, mm5, mm6
+ movq mm3, [sum+0] ; dc
+ paddw mm0, mm2
+ paddw mm1, mm2
+ movq mm2, mm0
+ paddw mm0, mm3
+ paddw mm1, [sum+8] ; h
+ psrlq mm2, 16
+ paddw mm2, mm3
+
+ movq mm3, [ecx+16] ; top left
+ movq mm4, [ecx+24] ; top right
+ psllw mm3, 3
+ psllw mm4, 3
+ psubw mm3, [sum+16]
+ psubw mm4, [sum+24]
+ ABS2 mm3, mm4, mm5, mm6
+ paddw mm2, mm3
+ paddw mm2, mm4 ; v
+
+ SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
+ mov eax, [args+8]
+ movd ecx, mm2
+ movd edx, mm1
+ add ecx, 2
+ add edx, 2
+ shr ecx, 2
+ shr edx, 2
+ mov [eax+0], ecx ; i8x8_v satd
+ mov [eax+4], edx ; i8x8_h satd
+ movd ecx, mm0
+ add ecx, 2
+ shr ecx, 2
+ mov [eax+8], ecx ; i8x8_dc satd
+
+ add esp, 0x70
+ ret
+%undef args
+%undef spill
+%undef trans
+%undef sum
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
+; const uint8_t *pix2, int stride2, int sums[2][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_ssim_4x4x2_core_mmxext
+ push ebx
+ push edi
+ mov ebx, [esp+16]
+ mov edx, [esp+24]
+ mov edi, 4
+ pxor mm0, mm0
+.loop:
+ mov eax, [esp+12]
+ mov ecx, [esp+20]
+ add eax, edi
+ add ecx, edi
+ pxor mm1, mm1
+ pxor mm2, mm2
+ pxor mm3, mm3
+ pxor mm4, mm4
+%rep 4
+ movd mm5, [eax]
+ movd mm6, [ecx]
+ punpcklbw mm5, mm0
+ punpcklbw mm6, mm0
+ paddw mm1, mm5
+ paddw mm2, mm6
+ movq mm7, mm5
+ pmaddwd mm5, mm5
+ pmaddwd mm7, mm6
+ pmaddwd mm6, mm6
+ paddd mm3, mm5
+ paddd mm4, mm7
+ paddd mm3, mm6
+ add eax, ebx
+ add ecx, edx
+%endrep
+ mov eax, [esp+28]
+ lea eax, [eax+edi*4]
+ pshufw mm5, mm1, 0xE
+ pshufw mm6, mm2, 0xE
+ paddusw mm1, mm5
+ paddusw mm2, mm6
+ punpcklwd mm1, mm2
+ pshufw mm2, mm1, 0xE
+ pshufw mm5, mm3, 0xE
+ pshufw mm6, mm4, 0xE
+ paddusw mm1, mm2
+ paddd mm3, mm5
+ paddd mm4, mm6
+ punpcklwd mm1, mm0
+ punpckldq mm3, mm4
+ movq [eax+0], mm1
+ movq [eax+8], mm3
+ sub edi, 4
+ jge .loop
+ pop edi
+ pop ebx
+ emms
+ ret
+
--- /dev/null
+;*****************************************************************************
+;* pixel.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003-2008 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Laurent Aimar <fenrir@via.ecp.fr>
+;* Alex Izvorski <aizvorksi@gmail.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
+ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
+mask_ff: times 16 db 0xff
+ times 16 db 0
+
+SECTION .text
+
+%macro HADDD 2 ; sum junk
+ movhlps %2, %1
+ paddd %1, %2
+ pshuflw %2, %1, 0xE
+ paddd %1, %2
+%endmacro
+
+%macro HADDW 2
+ pmaddwd %1, [pw_1 GLOBAL]
+ HADDD %1, %2
+%endmacro
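+
+; HADDW reduces 8 words to one dword: pmaddwd against pw_1 widens and
+; pairwise-adds the words, then HADDD folds the upper half onto the lower
+; with movhlps and the second dword onto the first with pshuflw, leaving
+; the scalar sum in the low dword of %1.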
+
+;=============================================================================
+; SSD
+;=============================================================================
+
+%macro SSD_INC_1x16P 0
+ movq mm1, [r0]
+ movq mm2, [r2]
+ movq mm3, [r0+8]
+ movq mm4, [r2+8]
+
+ movq mm5, mm2
+ movq mm6, mm4
+ psubusb mm2, mm1
+ psubusb mm4, mm3
+ psubusb mm1, mm5
+ psubusb mm3, mm6
+ por mm1, mm2
+ por mm3, mm4
+
+ movq mm2, mm1
+ movq mm4, mm3
+ punpcklbw mm1, mm7
+ punpcklbw mm3, mm7
+ punpckhbw mm2, mm7
+ punpckhbw mm4, mm7
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+ pmaddwd mm4, mm4
+
+ add r0, r1
+ add r2, r3
+ paddd mm0, mm1
+ paddd mm0, mm2
+ paddd mm0, mm3
+ paddd mm0, mm4
+%endmacro
+
+%macro SSD_INC_1x8P 0
+ movq mm1, [r0]
+ movq mm2, [r2]
+
+ movq mm5, mm2
+ psubusb mm2, mm1
+ psubusb mm1, mm5
+ por mm1, mm2 ; mm1 = 8bit abs diff
+
+ movq mm2, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+
+ add r0, r1
+ add r2, r3
+ paddd mm0, mm1
+ paddd mm0, mm2
+%endmacro
+
+%macro SSD_INC_1x4P 0
+ movd mm1, [r0]
+ movd mm2, [r2]
+
+ movq mm5, mm2
+ psubusb mm2, mm1
+ psubusb mm1, mm5
+ por mm1, mm2
+ punpcklbw mm1, mm7
+ pmaddwd mm1, mm1
+
+ add r0, r1
+ add r2, r3
+ paddd mm0, mm1
+%endmacro
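+
+; |a-b| per byte is built from two saturating subtractions: psubusb gives
+; max(a-b,0) and max(b-a,0), one of which is always zero, so their por is
+; the absolute difference; unpacking to words then lets pmaddwd square and
+; pairwise-accumulate in a single instruction.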
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+%macro SSD_MMX 2
+cglobal x264_pixel_ssd_%1x%2_mmx, 4,4
+ pxor mm7, mm7 ; zero
+ pxor mm0, mm0 ; mm0 holds the sum
+%rep %2
+ SSD_INC_1x%1P
+%endrep
+ movq mm1, mm0
+ psrlq mm1, 32
+ paddd mm0, mm1
+ movd eax, mm0
+ RET
+%endmacro
+
+SSD_MMX 16, 16
+SSD_MMX 16, 8
+SSD_MMX 8, 16
+SSD_MMX 8, 8
+SSD_MMX 8, 4
+SSD_MMX 4, 8
+SSD_MMX 4, 4
+
+%macro SSD_INC_2x16P_SSE2 0
+ movdqu xmm1, [r0]
+ movdqu xmm2, [r2]
+ movdqu xmm3, [r0+r1]
+ movdqu xmm4, [r2+r3]
+
+ movdqa xmm5, xmm1
+ movdqa xmm6, xmm3
+ psubusb xmm1, xmm2
+ psubusb xmm3, xmm4
+ psubusb xmm2, xmm5
+ psubusb xmm4, xmm6
+ por xmm1, xmm2
+ por xmm3, xmm4
+
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm3
+ punpcklbw xmm1, xmm7
+ punpckhbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm4, xmm7
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ pmaddwd xmm4, xmm4
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm0, xmm1
+ paddd xmm0, xmm3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+%macro SSD_SSE2 2
+cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
+ pxor xmm7, xmm7
+ pxor xmm0, xmm0
+%rep %2/2
+ SSD_INC_2x16P_SSE2
+%endrep
+ HADDD xmm0, xmm1
+ movd eax, xmm0
+ RET
+%endmacro
+
+SSD_SSE2 16, 16
+SSD_SSE2 16, 8
+
+
+
+;=============================================================================
+; SATD
+;=============================================================================
+
+%macro LOAD_DIFF_4P 4 ; dst, tmp, [pix1], [pix2]
+ movd %1, %3
+ movd %2, %4
+ punpcklbw %1, %2
+ punpcklbw %2, %2
+ psubw %1, %2
+%endmacro
+
+%macro LOAD_DIFF_8P 4 ; dst, tmp, [pix1], [pix2]
+ movq %1, %3
+ movq %2, %4
+ punpcklbw %1, %2
+ punpcklbw %2, %2
+ psubw %1, %2
+%endmacro
+
+%macro LOAD_DIFF_8x4P 6 ; 4x dest, 2x temp
+ LOAD_DIFF_8P %1, %5, [r0], [r2]
+ LOAD_DIFF_8P %2, %6, [r0+r1], [r2+r3]
+ LOAD_DIFF_8P %3, %5, [r0+2*r1], [r2+2*r3]
+ LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5]
+%endmacro
+
+;;; row transform not used, because phaddw is much slower than paddw on a Conroe
+;%macro PHSUMSUB 3
+; movdqa %3, %1
+; phaddw %1, %2
+; phsubw %3, %2
+;%endmacro
+
+;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc
+; PHSUMSUB %1, %2, %5
+; PHSUMSUB %3, %4, %2
+; PHSUMSUB %1, %3, %4
+; PHSUMSUB %5, %2, %3
+;%endmacro
+
+%macro SUMSUB_BADC 4
+ paddw %1, %2
+ paddw %3, %4
+ paddw %2, %2
+ paddw %4, %4
+ psubw %2, %1
+ psubw %4, %3
+%endmacro
+
+%macro HADAMARD4_1D 4
+ SUMSUB_BADC %1, %2, %3, %4
+ SUMSUB_BADC %1, %3, %2, %4
+%endmacro
+
+%macro HADAMARD8_1D 8
+ SUMSUB_BADC %1, %5, %2, %6
+ SUMSUB_BADC %3, %7, %4, %8
+ SUMSUB_BADC %1, %3, %2, %4
+ SUMSUB_BADC %5, %7, %6, %8
+ SUMSUB_BADC %1, %2, %3, %4
+ SUMSUB_BADC %5, %6, %7, %8
+%endmacro
+
+%macro SBUTTERFLY 5
+ mov%1 %5, %3
+ punpckl%2 %3, %4
+ punpckh%2 %5, %4
+%endmacro
+
+%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4x2 to not shuffle registers
+ mov%1 %5, %3
+ punpckh%2 %3, %4
+ punpckl%2 %5, %4
+%endmacro
+
+%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
+ SBUTTERFLY q, wd, %1, %2, %5
+ SBUTTERFLY q, wd, %3, %4, %2
+ SBUTTERFLY q, dq, %1, %3, %4
+ SBUTTERFLY q, dq, %5, %2, %3
+%endmacro
+
+%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
+ SBUTTERFLY dqa, dq, %1, %2, %5
+ SBUTTERFLY dqa, dq, %3, %4, %2
+ SBUTTERFLY dqa, qdq, %1, %3, %4
+ SBUTTERFLY dqa, qdq, %5, %2, %3
+%endmacro
+
+%macro TRANSPOSE2x4x4W 5 ; abcd-t -> abcd
+ SBUTTERFLY dqa, wd, %1, %2, %5
+ SBUTTERFLY dqa, wd, %3, %4, %2
+ SBUTTERFLY dqa, dq, %1, %3, %4
+ SBUTTERFLY2 dqa, dq, %5, %2, %3
+ SBUTTERFLY dqa, qdq, %1, %3, %2
+ SBUTTERFLY2 dqa, qdq, %4, %5, %3
+%endmacro
+
+%ifdef ARCH_X86_64
+%macro TRANSPOSE8x8W 9 ; abcdefgh-t -> afhdtecb
+ SBUTTERFLY dqa, wd, %1, %2, %9
+ SBUTTERFLY dqa, wd, %3, %4, %2
+ SBUTTERFLY dqa, wd, %5, %6, %4
+ SBUTTERFLY dqa, wd, %7, %8, %6
+ SBUTTERFLY dqa, dq, %1, %3, %8
+ SBUTTERFLY dqa, dq, %9, %2, %3
+ SBUTTERFLY dqa, dq, %5, %7, %2
+ SBUTTERFLY dqa, dq, %4, %6, %7
+ SBUTTERFLY dqa, qdq, %1, %5, %6
+ SBUTTERFLY dqa, qdq, %9, %4, %5
+ SBUTTERFLY dqa, qdq, %8, %2, %4
+ SBUTTERFLY dqa, qdq, %3, %7, %2
+%endmacro
+%else
+%macro TRANSPOSE8x8W 9 ; abcdefgh -> afhdgecb
+ movdqa [%9], %8
+ SBUTTERFLY dqa, wd, %1, %2, %8
+ movdqa [%9+16], %8
+ movdqa %8, [%9]
+ SBUTTERFLY dqa, wd, %3, %4, %2
+ SBUTTERFLY dqa, wd, %5, %6, %4
+ SBUTTERFLY dqa, wd, %7, %8, %6
+ SBUTTERFLY dqa, dq, %1, %3, %8
+ movdqa [%9], %8
+ movdqa %8, [16+%9]
+ SBUTTERFLY dqa, dq, %8, %2, %3
+ SBUTTERFLY dqa, dq, %5, %7, %2
+ SBUTTERFLY dqa, dq, %4, %6, %7
+ SBUTTERFLY dqa, qdq, %1, %5, %6
+ SBUTTERFLY dqa, qdq, %8, %4, %5
+ movdqa [%9+16], %8
+ movdqa %8, [%9]
+ SBUTTERFLY dqa, qdq, %8, %2, %4
+ SBUTTERFLY dqa, qdq, %3, %7, %2
+ movdqa %7, [%9+16]
+%endmacro
+%endif
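+
+; the x86_32 variant has only 8 xmm registers for a 9-register transpose,
+; so it spills through 32 bytes of scratch memory whose address is passed
+; as %9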
+
+%macro ABS1_MMX 2 ; a, tmp
+ pxor %2, %2
+ psubw %2, %1
+ pmaxsw %1, %2
+%endmacro
+
+%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
+ pxor %3, %3
+ pxor %4, %4
+ psubw %3, %1
+ psubw %4, %2
+ pmaxsw %1, %3
+ pmaxsw %2, %4
+%endmacro
+
+%macro ABS1_SSSE3 2
+ pabsw %1, %1
+%endmacro
+
+%macro ABS2_SSSE3 4
+ pabsw %1, %1
+ pabsw %2, %2
+%endmacro
+
+%define ABS1 ABS1_MMX
+%define ABS2 ABS2_MMX
+
+%macro ABS4 6
+ ABS2 %1, %2, %5, %6
+ ABS2 %3, %4, %5, %6
+%endmacro
+
+%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
+ HADAMARD4_1D mm4, mm5, mm6, mm7
+ TRANSPOSE4x4W mm4, mm5, mm6, mm7, %1
+ HADAMARD4_1D mm4, mm7, %1, mm6
+ ABS2 mm4, mm7, mm3, mm5
+ ABS2 %1, mm6, mm3, mm5
+ paddw %1, mm4
+ paddw mm6, mm7
+ pavgw %1, mm6
+%endmacro
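+
+; each 4x4 block contributes roughly
+; ( sum(abs(hadamard4x4( pix1 - pix2 ))) + 1 ) >> 1
+; the pavgw halves with per-word rounding, keeping the accumulator in 16 bits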
+
+; in: r4=3*stride1, r5=3*stride2
+; in: %2 = horizontal offset
+; in: %3 = whether we need to increment pix1 and pix2
+; clobber: mm3..mm7
+; out: %1 = satd
+%macro SATD_4x4_MMX 3
+ LOAD_DIFF_4P mm4, mm3, [r0+%2], [r2+%2]
+ LOAD_DIFF_4P mm5, mm3, [r0+r1+%2], [r2+r3+%2]
+ LOAD_DIFF_4P mm6, mm3, [r0+2*r1+%2], [r2+2*r3+%2]
+ LOAD_DIFF_4P mm7, mm3, [r0+r4+%2], [r2+r5+%2]
+%if %3
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%endif
+ HADAMARD4x4_SUM %1
+%endmacro
+
+%macro SATD_8x4_START 1
+ SATD_4x4_MMX mm0, 0, 0
+ SATD_4x4_MMX mm1, 4, %1
+%endmacro
+
+%macro SATD_8x4_INC 1
+ SATD_4x4_MMX mm2, 0, 0
+ paddw mm0, mm1
+ SATD_4x4_MMX mm1, 4, %1
+ paddw mm0, mm2
+%endmacro
+
+%macro SATD_16x4_START 1
+ SATD_4x4_MMX mm0, 0, 0
+ SATD_4x4_MMX mm1, 4, 0
+ SATD_4x4_MMX mm2, 8, 0
+ paddw mm0, mm1
+ SATD_4x4_MMX mm1, 12, %1
+ paddw mm0, mm2
+%endmacro
+
+%macro SATD_16x4_INC 1
+ SATD_4x4_MMX mm2, 0, 0
+ paddw mm0, mm1
+ SATD_4x4_MMX mm1, 4, 0
+ paddw mm0, mm2
+ SATD_4x4_MMX mm2, 8, 0
+ paddw mm0, mm1
+ SATD_4x4_MMX mm1, 12, %1
+ paddw mm0, mm2
+%endmacro
+
+%macro SATD_8x4_SSE2 1
+ LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+%if %1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%endif
+ HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
+ TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
+ HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
+ ABS4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ paddusw xmm0, xmm1
+ paddusw xmm2, xmm3
+ paddusw xmm6, xmm0
+ paddusw xmm6, xmm2
+%endmacro
+
+%macro SATD_START_MMX 0
+ lea r4, [3*r1] ; 3*stride1
+ lea r5, [3*r3] ; 3*stride2
+%endmacro
+
+%macro SATD_END_MMX 0
+ pshufw mm1, mm0, 01001110b
+ paddw mm0, mm1
+ pshufw mm1, mm0, 10110001b
+ paddw mm0, mm1
+ movd eax, mm0
+ and eax, 0xffff
+ RET
+%endmacro
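+
+; SATD_END_MMX folds the four word sums in mm0 (swap dwords, add; swap
+; words, add) and masks the result to 16 bits. satd_16x16 below doesn't use
+; it: its total can overflow a word, so it widens to dwords before folding.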
+
+; FIXME avoid the spilling of regs to hold 3*stride.
+; for small blocks on x86_32, modify pixel pointer instead.
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_satd_16x16_mmxext, 4,6
+ SATD_START_MMX
+ SATD_16x4_START 1
+ SATD_16x4_INC 1
+ SATD_16x4_INC 1
+ SATD_16x4_INC 0
+ paddw mm0, mm1
+ pxor mm3, mm3
+ pshufw mm1, mm0, 01001110b
+ paddw mm0, mm1
+ punpcklwd mm0, mm3
+ pshufw mm1, mm0, 01001110b
+ paddd mm0, mm1
+ movd eax, mm0
+ RET
+
+cglobal x264_pixel_satd_16x8_mmxext, 4,6
+ SATD_START_MMX
+ SATD_16x4_START 1
+ SATD_16x4_INC 0
+ paddw mm0, mm1
+ SATD_END_MMX
+
+cglobal x264_pixel_satd_8x16_mmxext, 4,6
+ SATD_START_MMX
+ SATD_8x4_START 1
+ SATD_8x4_INC 1
+ SATD_8x4_INC 1
+ SATD_8x4_INC 0
+ paddw mm0, mm1
+ SATD_END_MMX
+
+cglobal x264_pixel_satd_8x8_mmxext, 4,6
+ SATD_START_MMX
+ SATD_8x4_START 1
+ SATD_8x4_INC 0
+ paddw mm0, mm1
+ SATD_END_MMX
+
+cglobal x264_pixel_satd_8x4_mmxext, 4,6
+ SATD_START_MMX
+ SATD_8x4_START 0
+ paddw mm0, mm1
+ SATD_END_MMX
+
+cglobal x264_pixel_satd_4x8_mmxext, 4,6
+ SATD_START_MMX
+ SATD_4x4_MMX mm0, 0, 1
+ SATD_4x4_MMX mm1, 0, 0
+ paddw mm0, mm1
+ SATD_END_MMX
+
+cglobal x264_pixel_satd_4x4_mmxext, 4,6
+ SATD_START_MMX
+ SATD_4x4_MMX mm0, 0, 0
+ SATD_END_MMX
+
+
+
+%macro SATD_START_SSE2 0
+ pxor xmm6, xmm6
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+%endmacro
+
+%macro SATD_END_SSE2 0
+ picgetgot ebx
+ psrlw xmm6, 1
+ HADDW xmm6, xmm7
+ movd eax, xmm6
+ RET
+%endmacro
+
+%macro BACKUP_POINTERS 0
+%ifdef ARCH_X86_64
+ mov r10, r0
+ mov r11, r2
+%endif
+%endmacro
+
+%macro RESTORE_AND_INC_POINTERS 0
+%ifdef ARCH_X86_64
+ lea r0, [r10+8]
+ lea r2, [r11+8]
+%else
+ mov r0, r0m
+ mov r2, r2m
+ add r0, 8
+ add r2, 8
+%endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+%macro SATDS_SSE2 1
+cglobal x264_pixel_satd_16x16_%1, 4,6
+ SATD_START_SSE2
+ BACKUP_POINTERS
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 0
+ RESTORE_AND_INC_POINTERS
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 0
+ SATD_END_SSE2
+
+cglobal x264_pixel_satd_16x8_%1, 4,6
+ SATD_START_SSE2
+ BACKUP_POINTERS
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 0
+ RESTORE_AND_INC_POINTERS
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 0
+ SATD_END_SSE2
+
+cglobal x264_pixel_satd_8x16_%1, 4,6
+ SATD_START_SSE2
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 0
+ SATD_END_SSE2
+
+cglobal x264_pixel_satd_8x8_%1, 4,6
+ SATD_START_SSE2
+ SATD_8x4_SSE2 1
+ SATD_8x4_SSE2 0
+ SATD_END_SSE2
+
+cglobal x264_pixel_satd_8x4_%1, 4,6
+ SATD_START_SSE2
+ SATD_8x4_SSE2 0
+ SATD_END_SSE2
+
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_sa8d_8x8_%1
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+.skip_lea:
+ LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm8, xmm9
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
+
+ HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+ HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
+
+ ABS4 xmm0, xmm1, xmm2, xmm3, xmm6, xmm9
+ ABS4 xmm4, xmm5, xmm7, xmm8, xmm6, xmm9
+ paddusw xmm0, xmm1
+ paddusw xmm2, xmm3
+ paddusw xmm4, xmm5
+ paddusw xmm7, xmm8
+ paddusw xmm0, xmm2
+ paddusw xmm4, xmm7
+ pavgw xmm0, xmm4
+ HADDW xmm0, xmm1
+ movd eax, xmm0
+ add r10d, eax ; preserve rounding for 16x16
+ add eax, 1
+ shr eax, 1
+ ret
+
+cglobal x264_pixel_sa8d_16x16_%1
+ xor r10d, r10d
+ call x264_pixel_sa8d_8x8_%1 ; pix[0]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride]
+ neg r4 ; it's already r1*3
+ neg r5
+ lea r0, [r0+4*r4+8]
+ lea r2, [r2+4*r5+8]
+ call x264_pixel_sa8d_8x8_%1 ; pix[8]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride+8]
+ mov eax, r10d
+ add eax, 1
+ shr eax, 1
+ ret
+%else ; ARCH_X86_32
+cglobal x264_pixel_sa8d_8x8_%1, 4,7
+ mov r6, esp
+ and esp, ~15
+ sub esp, 32
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm6, xmm7
+ movdqa [esp], xmm2
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm2, xmm2
+ movdqa xmm2, [esp]
+
+ HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, esp
+ HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm6, xmm4, xmm2, xmm1
+
+%ifidn %1, sse2
+ movdqa [esp], xmm6
+ movdqa [esp+16], xmm7
+%endif
+ ABS2 xmm2, xmm3, xmm6, xmm7
+ ABS2 xmm0, xmm1, xmm6, xmm7
+ paddusw xmm0, xmm2
+ paddusw xmm1, xmm3
+%ifidn %1, sse2
+ movdqa xmm6, [esp]
+ movdqa xmm7, [esp+16]
+%endif
+ ABS2 xmm4, xmm5, xmm2, xmm3
+ ABS2 xmm6, xmm7, xmm2, xmm3
+ paddusw xmm4, xmm5
+ paddusw xmm6, xmm7
+ paddusw xmm0, xmm1
+ paddusw xmm4, xmm6
+ pavgw xmm0, xmm4
+ picgetgot ebx
+ HADDW xmm0, xmm1
+ movd eax, xmm0
+ mov ecx, eax ; preserve rounding for 16x16
+ add eax, 1
+ shr eax, 1
+ mov esp, r6
+ RET
+%endif ; ARCH
+%endmacro ; SATDS_SSE2
+
+%macro SA8D_16x16_32 1
+%ifndef ARCH_X86_64
+cglobal x264_pixel_sa8d_16x16_%1
+ push ebp
+ push dword [esp+20] ; stride2
+ push dword [esp+20] ; pix2
+ push dword [esp+20] ; stride1
+ push dword [esp+20] ; pix1
+ call x264_pixel_sa8d_8x8_%1
+ mov ebp, ecx
+ add dword [esp+0], 8 ; pix1+8
+ add dword [esp+8], 8 ; pix2+8
+ call x264_pixel_sa8d_8x8_%1
+ add ebp, ecx
+ mov eax, [esp+4]
+ mov edx, [esp+12]
+ shl eax, 3
+ shl edx, 3
+ add [esp+0], eax ; pix1+8*stride1+8
+ add [esp+8], edx ; pix2+8*stride2+8
+ call x264_pixel_sa8d_8x8_%1
+ add ebp, ecx
+ sub dword [esp+0], 8 ; pix1+8*stride1
+ sub dword [esp+8], 8 ; pix2+8*stride2
+ call x264_pixel_sa8d_8x8_%1
+ lea eax, [ebp+ecx+1]
+ shr eax, 1
+ add esp, 16
+ pop ebp
+ ret
+%endif ; !ARCH_X86_64
+%endmacro ; SA8D_16x16_32
+
+
+
+;=============================================================================
+; INTRA SATD
+;=============================================================================
+
+%macro INTRA_SA8D_SSE2 1
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
+;-----------------------------------------------------------------------------
+cglobal x264_intra_sa8d_x3_8x8_core_%1
+ ; 8x8 hadamard
+ pxor xmm4, xmm4
+ movq xmm0, [r0+0*FENC_STRIDE]
+ movq xmm7, [r0+1*FENC_STRIDE]
+ movq xmm6, [r0+2*FENC_STRIDE]
+ movq xmm3, [r0+3*FENC_STRIDE]
+ movq xmm5, [r0+4*FENC_STRIDE]
+ movq xmm1, [r0+5*FENC_STRIDE]
+ movq xmm8, [r0+6*FENC_STRIDE]
+ movq xmm2, [r0+7*FENC_STRIDE]
+ punpcklbw xmm0, xmm4
+ punpcklbw xmm7, xmm4
+ punpcklbw xmm6, xmm4
+ punpcklbw xmm3, xmm4
+ punpcklbw xmm5, xmm4
+ punpcklbw xmm1, xmm4
+ punpcklbw xmm8, xmm4
+ punpcklbw xmm2, xmm4
+ HADAMARD8_1D xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
+ TRANSPOSE8x8W xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
+ HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+
+ ; dc
+ movzx edi, word [r1+0]
+ add di, word [r1+16]
+ add edi, 8
+ and edi, -16
+ shl edi, 2
+
+ pxor xmm15, xmm15
+ movdqa xmm8, xmm2
+ movdqa xmm9, xmm3
+ movdqa xmm10, xmm4
+ movdqa xmm11, xmm5
+ ABS4 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13
+ paddusw xmm8, xmm10
+ paddusw xmm9, xmm11
+%ifidn %1, ssse3
+ pabsw xmm10, xmm6
+ pabsw xmm11, xmm7
+ pabsw xmm15, xmm1
+%else
+ movdqa xmm10, xmm6
+ movdqa xmm11, xmm7
+ movdqa xmm15, xmm1
+ ABS2 xmm10, xmm11, xmm13, xmm14
+ ABS1 xmm15, xmm13
+%endif
+ paddusw xmm10, xmm11
+ paddusw xmm8, xmm9
+ paddusw xmm15, xmm10
+ paddusw xmm15, xmm8
+ movdqa xmm14, xmm15 ; 7x8 sum
+
+ movdqa xmm8, [r1+0] ; left edge
+ movd xmm9, edi
+ psllw xmm8, 3
+ psubw xmm8, xmm0
+ psubw xmm9, xmm0
+ ABS1 xmm8, xmm10
+ ABS1 xmm9, xmm11 ; 1x8 sum
+ paddusw xmm14, xmm8
+ paddusw xmm15, xmm9
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+ punpcklwd xmm4, xmm5
+ punpcklwd xmm6, xmm7
+ punpckldq xmm0, xmm2
+ punpckldq xmm4, xmm6
+ punpcklqdq xmm0, xmm4 ; transpose
+ movdqa xmm1, [r1+16] ; top edge
+ movdqa xmm2, xmm15
+ psllw xmm1, 3
+ psrldq xmm2, 2 ; 8x7 sum
+ psubw xmm0, xmm1 ; 8x1 sum
+ ABS1 xmm0, xmm1
+ paddusw xmm2, xmm0
+
+ ; 3x HADDW
+ movdqa xmm7, [pw_1 GLOBAL]
+ pmaddwd xmm2, xmm7
+ pmaddwd xmm14, xmm7
+ pmaddwd xmm15, xmm7
+ movdqa xmm3, xmm2
+ punpckldq xmm2, xmm14
+ punpckhdq xmm3, xmm14
+ pshufd xmm5, xmm15, 0xf5
+ paddd xmm2, xmm3
+ paddd xmm5, xmm15
+ movdqa xmm3, xmm2
+ punpcklqdq xmm2, xmm5
+ punpckhqdq xmm3, xmm5
+ pavgw xmm3, xmm2
+ pxor xmm0, xmm0
+ pavgw xmm3, xmm0
+ movq [r2], xmm3 ; i8x8_v, i8x8_h
+ psrldq xmm3, 8
+ movd [r2+8], xmm3 ; i8x8_dc
+ ret
+%endif ; ARCH_X86_64
+%endmacro ; INTRA_SATDS
+
+; in: r0 = fenc
+; out: mm0..mm3 = hadamard coefs
+ALIGN 16
+load_hadamard:
+ pxor mm7, mm7
+ movd mm0, [r0+0*FENC_STRIDE]
+ movd mm4, [r0+1*FENC_STRIDE]
+ movd mm3, [r0+2*FENC_STRIDE]
+ movd mm1, [r0+3*FENC_STRIDE]
+ punpcklbw mm0, mm7
+ punpcklbw mm4, mm7
+ punpcklbw mm3, mm7
+ punpcklbw mm1, mm7
+ HADAMARD4_1D mm0, mm4, mm3, mm1
+ TRANSPOSE4x4W mm0, mm4, mm3, mm1, mm2
+ HADAMARD4_1D mm0, mm1, mm2, mm3
+ ret
+
+%macro SCALAR_SUMSUB 4
+ add %1, %2
+ add %3, %4
+ add %2, %2
+ add %4, %4
+ sub %2, %1
+ sub %4, %3
+%endmacro
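+
+; SCALAR_SUMSUB is the GPR analogue of SUMSUB_BADC above:
+; (a,b) -> (a+b, b-a) and (c,d) -> (c+d, d-c) on scalar registers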
+
+%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
+%ifnidn %1, 0
+ shl %1d, 5 ; log2(FDEC_STRIDE)
+%endif
+ movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
+ movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
+ movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
+ movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
+%ifnidn %1, 0
+ shr %1d, 5
+%endif
+ SCALAR_SUMSUB %2d, %3d, %4d, %5d
+ SCALAR_SUMSUB %2d, %4d, %3d, %5d
+ mov [left_1d+2*%1+0], %2w
+ mov [left_1d+2*%1+2], %3w
+ mov [left_1d+2*%1+4], %4w
+ mov [left_1d+2*%1+6], %5w
+%endmacro
+
+%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
+ movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
+ movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
+ movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
+ movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
+ SCALAR_SUMSUB %2d, %3d, %4d, %5d
+ SCALAR_SUMSUB %2d, %4d, %3d, %5d
+ mov [top_1d+2*%1+0], %2w
+ mov [top_1d+2*%1+2], %3w
+ mov [top_1d+2*%1+4], %4w
+ mov [top_1d+2*%1+6], %5w
+%endmacro
+
+%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
+ pxor %7, %7
+ pshufw %4, %1, 01001110b
+ pshufw %5, %2, 01001110b
+ pshufw %6, %3, 01001110b
+ paddw %1, %4
+ paddw %2, %5
+ paddw %3, %6
+ punpcklwd %1, %7
+ punpcklwd %2, %7
+ punpcklwd %3, %7
+ pshufw %4, %1, 01001110b
+ pshufw %5, %2, 01001110b
+ pshufw %6, %3, 01001110b
+ %8 %1, %4
+ %8 %2, %5
+ %8 %3, %6
+%endmacro
+
+%macro CLEAR_SUMS 0
+%ifdef ARCH_X86_64
+ mov qword [sums+0], 0
+ mov qword [sums+8], 0
+ mov qword [sums+16], 0
+%else
+ pxor mm7, mm7
+ movq [sums+0], mm7
+ movq [sums+8], mm7
+ movq [sums+16], mm7
+%endif
+%endmacro
+
+; in: mm1..mm3
+; out: mm7
+; clobber: mm4..mm6
+%macro SUM3x4 1
+%ifidn %1, ssse3
+ pabsw mm4, mm1
+ pabsw mm5, mm2
+ pabsw mm7, mm3
+ paddw mm4, mm5
+%else
+ movq mm4, mm1
+ movq mm5, mm2
+ ABS2 mm4, mm5, mm6, mm7
+ movq mm7, mm3
+ paddw mm4, mm5
+ ABS1 mm7, mm6
+%endif
+ paddw mm7, mm4
+%endmacro
+
+; in: mm0..mm3 (4x4), mm7 (3x4)
+; out: mm0 v, mm4 h, mm5 dc
+; clobber: mm6
+%macro SUM4x3 3 ; dc, left, top
+ movq mm4, %2
+ movd mm5, %1
+ psllw mm4, 2
+ psubw mm4, mm0
+ psubw mm5, mm0
+ punpcklwd mm0, mm1
+ punpcklwd mm2, mm3
+ punpckldq mm0, mm2 ; transpose
+ movq mm1, %3
+ psllw mm1, 2
+ psubw mm0, mm1
+ ABS2 mm4, mm5, mm2, mm3 ; 1x4 sum
+ ABS1 mm0, mm1 ; 4x1 sum
+%endmacro
+
+%macro INTRA_SATDS_MMX 1
+;-----------------------------------------------------------------------------
+; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+;-----------------------------------------------------------------------------
+cglobal x264_intra_satd_x3_4x4_%1, 2,6
+%ifdef ARCH_X86_64
+ ; stack is 16 byte aligned because abi says so
+ %define top_1d rsp-8 ; size 8
+ %define left_1d rsp-16 ; size 8
+ %define t0 r10
+ %define t0d r10d
+%else
+ ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
+ SUB esp, 16
+ %define top_1d esp+8
+ %define left_1d esp
+ %define t0 r2
+ %define t0d r2d
+%endif
+
+ call load_hadamard
+ SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
+ mov t0d, r0d
+ SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
+ lea t0d, [t0d + r0d + 4]
+ and t0d, -8
+ shl t0d, 1 ; dc
+
+ SUM3x4 %1
+ SUM4x3 t0d, [left_1d], [top_1d]
+ paddw mm4, mm7
+ paddw mm5, mm7
+ movq mm1, mm5
+ psrlq mm1, 16 ; 4x3 sum
+ paddw mm0, mm1
+
+ SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
+%ifndef ARCH_X86_64
+ mov r2, r2m
+%endif
+ movd [r2+0], mm0 ; i4x4_v satd
+ movd [r2+4], mm4 ; i4x4_h satd
+ movd [r2+8], mm5 ; i4x4_dc satd
+%ifndef ARCH_X86_64
+ ADD esp, 16
+%endif
+ RET
+
+%ifdef ARCH_X86_64
+ %define t0 r10
+ %define t0d r10d
+ %define t2 r11
+ %define t2w r11w
+ %define t2d r11d
+%else
+ %define t0 r0
+ %define t0d r0d
+ %define t2 r2
+ %define t2w r2w
+ %define t2d r2d
+%endif
+
+;-----------------------------------------------------------------------------
+; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+;-----------------------------------------------------------------------------
+cglobal x264_intra_satd_x3_16x16_%1, 0,7
+%ifdef ARCH_X86_64
+ %assign stack_pad 88
+%else
+ %assign stack_pad 88 + ((stack_offset+88+4)&15)
+%endif
+ ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
+ SUB rsp, stack_pad
+%define sums rsp+64 ; size 24
+%define top_1d rsp+32 ; size 32
+%define left_1d rsp ; size 32
+ movifnidn r1d, r1m
+ CLEAR_SUMS
+
+ ; 1D hadamards
+ xor t2d, t2d
+ mov t0d, 12
+.loop_edge:
+ SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
+ add t2d, r3d
+ SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6
+ add t2d, r3d
+ sub t0d, 4
+ jge .loop_edge
+ shr t2d, 1
+ add t2d, 8
+ and t2d, -16 ; dc
+
+ ; 2D hadamards
+ movifnidn r0d, r0m
+ xor r3d, r3d
+.loop_y:
+ xor r4d, r4d
+.loop_x:
+ call load_hadamard
+
+ SUM3x4 %1
+ SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
+ pavgw mm4, mm7
+ pavgw mm5, mm7
+ paddw mm0, [sums+0] ; i16x16_v satd
+ paddw mm4, [sums+8] ; i16x16_h satd
+ paddw mm5, [sums+16] ; i16x16_dc satd
+ movq [sums+0], mm0
+ movq [sums+8], mm4
+ movq [sums+16], mm5
+
+ add r0, 4
+ inc r4d
+ cmp r4d, 4
+ jl .loop_x
+ add r0, 4*FENC_STRIDE-16
+ inc r3d
+ cmp r3d, 4
+ jl .loop_y
+
+; horizontal sum
+ movifnidn r2d, r2m
+ movq mm2, [sums+16]
+ movq mm1, [sums+8]
+ movq mm0, [sums+0]
+ movq mm7, mm2
+ SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
+ psrld mm0, 1
+ pslld mm7, 16
+ psrld mm7, 16
+ paddd mm0, mm2
+ psubd mm0, mm7
+ movd [r2+8], mm2 ; i16x16_dc satd
+ movd [r2+4], mm1 ; i16x16_h satd
+ movd [r2+0], mm0 ; i16x16_v satd
+ ADD rsp, stack_pad
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+;-----------------------------------------------------------------------------
+cglobal x264_intra_satd_x3_8x8c_%1, 0,6
+ ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
+ SUB rsp, 72
+%define sums rsp+48 ; size 24
+%define dc_1d rsp+32 ; size 16
+%define top_1d rsp+16 ; size 16
+%define left_1d rsp ; size 16
+ movifnidn r1d, r1m
+ CLEAR_SUMS
+
+ ; 1D hadamards
+ mov t0d, 4
+.loop_edge:
+ SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
+ SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5
+ sub t0d, 4
+ jge .loop_edge
+
+ ; dc
+ movzx t2d, word [left_1d+0]
+ movzx r3d, word [top_1d+0]
+ movzx r4d, word [left_1d+8]
+ movzx r5d, word [top_1d+8]
+ add t2d, r3d
+ lea r3, [r4 + r5]
+ lea t2, [2*t2 + 8]
+ lea r3, [2*r3 + 8]
+ lea r4, [4*r4 + 8]
+ lea r5, [4*r5 + 8]
+ and t2d, -16 ; tl
+ and r3d, -16 ; br
+ and r4d, -16 ; bl
+ and r5d, -16 ; tr
+ mov [dc_1d+ 0], t2d ; tl
+ mov [dc_1d+ 4], r5d ; tr
+ mov [dc_1d+ 8], r4d ; bl
+ mov [dc_1d+12], r3d ; br
+ lea r5, [dc_1d]
+
+ ; 2D hadamards
+ movifnidn r0d, r0m
+ movifnidn r2d, r2m
+ xor r3d, r3d
+.loop_y:
+ xor r4d, r4d
+.loop_x:
+ call load_hadamard
+
+ SUM3x4 %1
+ SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
+ pavgw mm4, mm7
+ pavgw mm5, mm7
+ paddw mm0, [sums+16] ; i8x8c_v satd
+ paddw mm4, [sums+8] ; i8x8c_h satd
+ paddw mm5, [sums+0] ; i8x8c_dc satd
+ movq [sums+16], mm0
+ movq [sums+8], mm4
+ movq [sums+0], mm5
+
+ add r0, 4
+ inc r4d
+ cmp r4d, 2
+ jl .loop_x
+ add r0, 4*FENC_STRIDE-8
+ add r5, 8
+ inc r3d
+ cmp r3d, 2
+ jl .loop_y
+
+; horizontal sum
+ movq mm0, [sums+0]
+ movq mm1, [sums+8]
+ movq mm2, [sums+16]
+ movq mm7, mm0
+ psrlq mm7, 15
+ paddw mm2, mm7
+ SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
+ psrld mm2, 1
+ movd [r2+0], mm0 ; i8x8c_dc satd
+ movd [r2+4], mm1 ; i8x8c_h satd
+ movd [r2+8], mm2 ; i8x8c_v satd
+ ADD rsp, 72
+ RET
+%endmacro
+
+; instantiate satds
+; FIXME width4 can benefit from pabsw even if not sse2
+
+cextern x264_pixel_sa8d_8x8_mmxext
+SA8D_16x16_32 mmxext
+
+%define ABS1 ABS1_MMX
+%define ABS2 ABS2_MMX
+SATDS_SSE2 sse2
+SA8D_16x16_32 sse2
+INTRA_SA8D_SSE2 sse2
+INTRA_SATDS_MMX mmxext
+%ifdef HAVE_SSE3
+%define ABS1 ABS1_SSSE3
+%define ABS2 ABS2_SSSE3
+SATDS_SSE2 ssse3
+SA8D_16x16_32 ssse3
+INTRA_SA8D_SSE2 ssse3
+INTRA_SATDS_MMX ssse3
+%endif
+
+
+
+;=============================================================================
+; SSIM
+;=============================================================================
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
+; const uint8_t *pix2, int stride2, int sums[2][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+%rep 4
+ movq xmm5, [r0]
+ movq xmm6, [r2]
+ punpcklbw xmm5, xmm0
+ punpcklbw xmm6, xmm0
+ paddw xmm1, xmm5
+ paddw xmm2, xmm6
+ movdqa xmm7, xmm5
+ pmaddwd xmm5, xmm5
+ pmaddwd xmm7, xmm6
+ pmaddwd xmm6, xmm6
+ paddd xmm3, xmm5
+ paddd xmm4, xmm7
+ paddd xmm3, xmm6
+ add r0, r1
+ add r2, r3
+%endrep
+ ; PHADDW xmm1, xmm2
+ ; PHADDD xmm3, xmm4
+ picgetgot eax
+ movdqa xmm7, [pw_1 GLOBAL]
+ pshufd xmm5, xmm3, 0xb1
+ pmaddwd xmm1, xmm7
+ pmaddwd xmm2, xmm7
+ pshufd xmm6, xmm4, 0xb1
+ packssdw xmm1, xmm2
+ paddd xmm3, xmm5
+ pshufd xmm1, xmm1, 0xd8
+ paddd xmm4, xmm6
+ pmaddwd xmm1, xmm7
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+ punpckhdq xmm5, xmm4
+
+%ifdef ARCH_X86_64
+ %define t0 r4
+%else
+ %define t0 eax
+ mov t0, r4m
+%endif
+
+ movq [t0+ 0], xmm1
+ movq [t0+ 8], xmm3
+ psrldq xmm1, 8
+ movq [t0+16], xmm1
+ movq [t0+24], xmm5
+ RET
+
+;-----------------------------------------------------------------------------
+; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_ssim_end4_sse2, 3,3
+ movdqa xmm0, [r0+ 0]
+ movdqa xmm1, [r0+16]
+ movdqa xmm2, [r0+32]
+ movdqa xmm3, [r0+48]
+ movdqa xmm4, [r0+64]
+ paddd xmm0, [r1+ 0]
+ paddd xmm1, [r1+16]
+ paddd xmm2, [r1+32]
+ paddd xmm3, [r1+48]
+ paddd xmm4, [r1+64]
+ paddd xmm0, xmm1
+ paddd xmm1, xmm2
+ paddd xmm2, xmm3
+ paddd xmm3, xmm4
+ picgetgot r1
+ movdqa xmm5, [ssim_c1 GLOBAL]
+ movdqa xmm6, [ssim_c2 GLOBAL]
+ TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
+
+; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2
+ movdqa xmm1, xmm3
+ pslld xmm3, 16
+ pmaddwd xmm1, xmm0 ; s1*s2
+ por xmm0, xmm3
+ pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
+ pslld xmm1, 1
+ pslld xmm2, 7
+ pslld xmm4, 6
+ psubd xmm2, xmm1 ; covar*2
+ psubd xmm4, xmm0 ; vars
+ paddd xmm0, xmm5
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm4, xmm6
+ cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
+ cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
+ cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
+ cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
+ mulps xmm1, xmm2
+ mulps xmm0, xmm4
+ divps xmm1, xmm0 ; ssim
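+
+; i.e. per the operand comments above, this evaluates
+; ssim = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
+;      / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
+; for four adjacent windows in parallel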
+
+ cmp r2d, 4
+ je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
+ neg r2
+%ifdef PIC64
+ lea r3, [mask_ff + 16 GLOBAL]
+ movdqu xmm3, [r3 + r2*4]
+%else
+ movdqu xmm3, [mask_ff + r2*4 + 16 GLOBAL]
+%endif
+ pand xmm1, xmm3
+.skip:
+ movhlps xmm0, xmm1
+ addps xmm0, xmm1
+ pshuflw xmm1, xmm0, 0xE
+ addss xmm0, xmm1
+%ifndef ARCH_X86_64
+ movd r0m, xmm0
+ fld dword r0m
+%endif
+ RET
+
+
+
+;=============================================================================
+; Successive Elimination ADS
+;=============================================================================
+
+%macro ADS_START 1 ; unroll_size
+%ifdef ARCH_X86_64
+ %define t0 r6
+ mov r10, rsp
+%else
+ %define t0 r4
+ PUSH rbp
+ mov rbp, rsp
+%endif
+ mov r0d, r5m
+ sub rsp, r0
+ sub rsp, %1*4-1
+ and rsp, ~15
+ mov t0, rsp
+ shl r2d, 1
+%endmacro
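+
+; ADS_START saves the incoming stack pointer (r10 / ebp), reserves width
+; bytes plus unroll padding of 16-byte-aligned stack for the output mask
+; buffer, and doubles delta so it can index the uint16_t sums directly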
+
+%macro ADS_END 1
+ add r1, 8*%1
+ add r3, 8*%1
+ add t0, 4*%1
+ sub r0d, 4*%1
+ jg .loop
+ jmp x264_pixel_ads_mvs
+%endmacro
+
+%define ABS1 ABS1_MMX
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_ads4_mmxext, 4,5
+ movq mm6, [r0]
+ movq mm4, [r0+8]
+ pshufw mm7, mm6, 0
+ pshufw mm6, mm6, 0xAA
+ pshufw mm5, mm4, 0
+ pshufw mm4, mm4, 0xAA
+ ADS_START 1
+.loop:
+ movq mm0, [r1]
+ movq mm1, [r1+16]
+ psubw mm0, mm7
+ psubw mm1, mm6
+ ABS1 mm0, mm2
+ ABS1 mm1, mm3
+ movq mm2, [r1+r2]
+ movq mm3, [r1+r2+16]
+ psubw mm2, mm5
+ psubw mm3, mm4
+ paddw mm0, mm1
+ ABS1 mm2, mm1
+ ABS1 mm3, mm1
+ paddw mm0, mm2
+ paddw mm0, mm3
+%ifdef ARCH_X86_64
+ pshufw mm1, [r10+8], 0
+%else
+ pshufw mm1, [ebp+stack_offset+28], 0
+%endif
+ paddusw mm0, [r3]
+ psubusw mm1, mm0
+ packsswb mm1, mm1
+ movd [t0], mm1
+ ADS_END 1
+
+cglobal x264_pixel_ads2_mmxext, 4,5
+ movq mm6, [r0]
+ pshufw mm5, r6m, 0
+ pshufw mm7, mm6, 0
+ pshufw mm6, mm6, 0xAA
+ ADS_START 1
+.loop:
+ movq mm0, [r1]
+ movq mm1, [r1+r2]
+ psubw mm0, mm7
+ psubw mm1, mm6
+ ABS1 mm0, mm2
+ ABS1 mm1, mm3
+ paddw mm0, mm1
+ paddusw mm0, [r3]
+ movq mm4, mm5
+ psubusw mm4, mm0
+ packsswb mm4, mm4
+ movd [t0], mm4
+ ADS_END 1
+
+cglobal x264_pixel_ads1_mmxext, 4,5
+ pshufw mm7, [r0], 0
+ pshufw mm6, r6m, 0
+ ADS_START 2
+.loop:
+ movq mm0, [r1]
+ movq mm1, [r1+8]
+ psubw mm0, mm7
+ psubw mm1, mm7
+ ABS1 mm0, mm2
+ ABS1 mm1, mm3
+ paddusw mm0, [r3]
+ paddusw mm1, [r3+8]
+ movq mm4, mm6
+ movq mm5, mm6
+ psubusw mm4, mm0
+ psubusw mm5, mm1
+ packsswb mm4, mm5
+ movq [t0], mm4
+ ADS_END 2
+
+%macro ADS_SSE2 1
+cglobal x264_pixel_ads4_%1, 4,5
+ movdqa xmm4, [r0]
+ pshuflw xmm7, xmm4, 0
+ pshuflw xmm6, xmm4, 0xAA
+ pshufhw xmm5, xmm4, 0
+ pshufhw xmm4, xmm4, 0xAA
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpckhqdq xmm5, xmm5
+ punpckhqdq xmm4, xmm4
+%ifdef ARCH_X86_64
+ pshuflw xmm8, r6m, 0
+ punpcklqdq xmm8, xmm8
+ ADS_START 2
+ movdqu xmm10, [r1]
+ movdqu xmm11, [r1+r2]
+.loop:
+ movdqa xmm0, xmm10
+ movdqu xmm1, [r1+16]
+ movdqa xmm10, xmm1
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ ABS1 xmm0, xmm2
+ ABS1 xmm1, xmm3
+ movdqa xmm2, xmm11
+ movdqu xmm3, [r1+r2+16]
+ movdqa xmm11, xmm3
+ psubw xmm2, xmm5
+ psubw xmm3, xmm4
+ paddw xmm0, xmm1
+ movdqu xmm9, [r3]
+ ABS1 xmm2, xmm1
+ ABS1 xmm3, xmm1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm3
+ paddusw xmm0, xmm9
+ movdqa xmm1, xmm8
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [t0], xmm1
+%else
+ ADS_START 2
+.loop:
+ movdqu xmm0, [r1]
+ movdqu xmm1, [r1+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ ABS1 xmm0, xmm2
+ ABS1 xmm1, xmm3
+ movdqu xmm2, [r1+r2]
+ movdqu xmm3, [r1+r2+16]
+ psubw xmm2, xmm5
+ psubw xmm3, xmm4
+ paddw xmm0, xmm1
+ ABS1 xmm2, xmm1
+ ABS1 xmm3, xmm1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm3
+ movd xmm1, [ebp+stack_offset+28]
+ movdqu xmm2, [r3]
+ pshuflw xmm1, xmm1, 0
+ punpcklqdq xmm1, xmm1
+ paddusw xmm0, xmm2
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [t0], xmm1
+%endif ; ARCH
+ ADS_END 2
+
+cglobal x264_pixel_ads2_%1, 4,5
+ movq xmm6, [r0]
+ movd xmm5, r6m
+ pshuflw xmm7, xmm6, 0
+ pshuflw xmm6, xmm6, 0xAA
+ pshuflw xmm5, xmm5, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpcklqdq xmm5, xmm5
+ ADS_START 2
+.loop:
+ movdqu xmm0, [r1]
+ movdqu xmm1, [r1+r2]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ movdqu xmm4, [r3]
+ ABS1 xmm0, xmm2
+ ABS1 xmm1, xmm3
+ paddw xmm0, xmm1
+ paddusw xmm0, xmm4
+ movdqa xmm1, xmm5
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [t0], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads1_%1, 4,5
+ movd xmm7, [r0]
+ movd xmm6, r6m
+ pshuflw xmm7, xmm7, 0
+ pshuflw xmm6, xmm6, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ ADS_START 4
+.loop:
+ movdqu xmm0, [r1]
+ movdqu xmm1, [r1+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm7
+ movdqu xmm2, [r3]
+ movdqu xmm3, [r3+16]
+ ABS1 xmm0, xmm4
+ ABS1 xmm1, xmm5
+ paddusw xmm0, xmm2
+ paddusw xmm1, xmm3
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm6
+ psubusw xmm4, xmm0
+ psubusw xmm5, xmm1
+ packsswb xmm4, xmm5
+ movdqa [t0], xmm4
+ ADS_END 4
+%endmacro
+
+ADS_SSE2 sse2
+%ifdef HAVE_SSE3
+%define ABS1 ABS1_SSSE3
+ADS_SSE2 ssse3
+%endif
+
+; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+; {
+; int nmv=0, i, j;
+; *(uint32_t*)(masks+width) = 0;
+; for( i=0; i<width; i+=8 )
+; {
+; uint64_t mask = *(uint64_t*)(masks+i);
+; if( !mask ) continue;
+; for( j=0; j<8; j++ )
+; if( mask & ((uint64_t)255<<(j*8)) )
+; mvs[nmv++] = i+j;
+; }
+; return nmv;
+; }
+%ifdef ARCH_X86_64
+cglobal x264_pixel_ads_mvs
+ ; mvs = r4
+ ; masks = rsp
+ ; width = r5
+ ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
+ mov dword [rsp+r5], 0
+ xor eax, eax
+ xor esi, esi
+ jmp .loopi
+.loopi0:
+ add esi, 8
+ cmp esi, r5d
+ jge .end
+.loopi:
+ mov rdi, [rsp+rsi]
+ test rdi, rdi
+ jz .loopi0
+ xor ecx, ecx
+%macro TEST 1
+ mov [r4+rax*2], si
+ test edi, 0xff<<(%1*8)
+ setne cl
+ add eax, ecx
+ inc esi
+%endmacro
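+; the store above is unconditional, but eax only advances when the mask
+; byte is set, so accepted mv indices are compacted without branching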
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ shr rdi, 32
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ cmp esi, r5d
+ jl .loopi
+.end:
+ mov rsp, r10
+ ret
+
+%else
+cglobal x264_pixel_ads_mvs
+ ; no PROLOGUE, inherit from x264_pixel_ads1
+ mov ebx, [ebp+stack_offset+20] ; mvs
+ mov edi, [ebp+stack_offset+24] ; width
+ mov dword [esp+edi], 0
+ push ebp
+ xor eax, eax
+ xor esi, esi
+ jmp .loopi
+.loopi0:
+ add esi, 8
+ cmp esi, edi
+ jge .end
+.loopi:
+ mov ebp, [esp+esi+4]
+ mov edx, [esp+esi+8]
+ mov ecx, ebp
+ or ecx, edx
+ jz .loopi0
+ xor ecx, ecx
+%macro TEST 2
+ mov [ebx+eax*2], si
+ test %2, 0xff<<(%1*8)
+ setne cl
+ add eax, ecx
+ inc esi
+%endmacro
+ TEST 0, ebp
+ TEST 1, ebp
+ TEST 2, ebp
+ TEST 3, ebp
+ TEST 0, edx
+ TEST 1, edx
+ TEST 2, edx
+ TEST 3, edx
+ cmp esi, edi
+ jl .loopi
+.end:
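+ ; ebp held the pre-buffer stack pointer saved by ADS_START; popping the
+ ; copy pushed above into esp unwinds the mask buffer in one step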
+ pop esp
+ pop ebp
+ RET
+%endif ; ARCH
+
/*****************************************************************************
 * pixel.h: h264 encoder library
*****************************************************************************
- * Copyright (C) 2003 Laurent Aimar
- * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ * Copyright (C) 2003-2008 Laurent Aimar
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ * Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
#undef DECL_X4
void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_ssse3( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_ssse3( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * );
void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
;*****************************************************************************
;* predict-a.asm: h264 encoder library
;*****************************************************************************
-;* Copyright (C) 2005 x264 project
+;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
-BITS 64
-
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-%include "amd64inc.asm"
+%include "x86inc.asm"
%macro STORE8x8 2
- movq [parm1q + 0*FDEC_STRIDE], %1
- movq [parm1q + 1*FDEC_STRIDE], %1
- movq [parm1q + 2*FDEC_STRIDE], %1
- movq [parm1q + 3*FDEC_STRIDE], %1
- movq [parm1q + 4*FDEC_STRIDE], %2
- movq [parm1q + 5*FDEC_STRIDE], %2
- movq [parm1q + 6*FDEC_STRIDE], %2
- movq [parm1q + 7*FDEC_STRIDE], %2
+ movq [r0 + 0*FDEC_STRIDE], %1
+ movq [r0 + 1*FDEC_STRIDE], %1
+ movq [r0 + 2*FDEC_STRIDE], %1
+ movq [r0 + 3*FDEC_STRIDE], %1
+ movq [r0 + 4*FDEC_STRIDE], %2
+ movq [r0 + 5*FDEC_STRIDE], %2
+ movq [r0 + 6*FDEC_STRIDE], %2
+ movq [r0 + 7*FDEC_STRIDE], %2
%endmacro
%macro STORE16x16 2
- mov eax, 4
+ mov r1d, 4
.loop:
- movq [parm1q + 0*FDEC_STRIDE], %1
- movq [parm1q + 1*FDEC_STRIDE], %1
- movq [parm1q + 2*FDEC_STRIDE], %1
- movq [parm1q + 3*FDEC_STRIDE], %1
- movq [parm1q + 0*FDEC_STRIDE + 8], %2
- movq [parm1q + 1*FDEC_STRIDE + 8], %2
- movq [parm1q + 2*FDEC_STRIDE + 8], %2
- movq [parm1q + 3*FDEC_STRIDE + 8], %2
- add parm1q, 4*FDEC_STRIDE
- dec eax
+ movq [r0 + 0*FDEC_STRIDE], %1
+ movq [r0 + 1*FDEC_STRIDE], %1
+ movq [r0 + 2*FDEC_STRIDE], %1
+ movq [r0 + 3*FDEC_STRIDE], %1
+ movq [r0 + 0*FDEC_STRIDE + 8], %2
+ movq [r0 + 1*FDEC_STRIDE + 8], %2
+ movq [r0 + 2*FDEC_STRIDE + 8], %2
+ movq [r0 + 3*FDEC_STRIDE + 8], %2
+ add r0, 4*FDEC_STRIDE
+ dec r1d
jg .loop
- nop
%endmacro
%macro STORE16x16_SSE2 1
- mov eax, 4
+ mov r1d, 4
.loop:
- movdqa [parm1q + 0*FDEC_STRIDE], %1
- movdqa [parm1q + 1*FDEC_STRIDE], %1
- movdqa [parm1q + 2*FDEC_STRIDE], %1
- movdqa [parm1q + 3*FDEC_STRIDE], %1
- add parm1q, 4*FDEC_STRIDE
- dec eax
+ movdqa [r0 + 0*FDEC_STRIDE], %1
+ movdqa [r0 + 1*FDEC_STRIDE], %1
+ movdqa [r0 + 2*FDEC_STRIDE], %1
+ movdqa [r0 + 3*FDEC_STRIDE], %1
+ add r0, 4*FDEC_STRIDE
+ dec r1d
jg .loop
- nop
%endmacro
SECTION_RODATA
pb_0s_ff: times 7 db 0
db 0xff
-;=============================================================================
-; Code
-;=============================================================================
-
SECTION .text
; dest, left, right, src, tmp
;-----------------------------------------------------------------------------
; void predict_4x4_ddl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_4x4_ddl_mmxext
- sub parm1q, FDEC_STRIDE
- movq mm3, [parm1q]
- movq mm1, [parm1q-1]
+cglobal predict_4x4_ddl_mmxext, 1,1,1
+ sub r0, FDEC_STRIDE
+ movq mm3, [r0]
+ movq mm1, [r0-1]
movq mm2, mm3
movq mm4, [pb_0s_ff GLOBAL]
psrlq mm2, 8
%assign Y 1
%rep 4
psrlq mm0, 8
- movd [parm1q+Y*FDEC_STRIDE], mm0
+ movd [r0+Y*FDEC_STRIDE], mm0
%assign Y (Y+1)
%endrep
- ret
+ RET
;-----------------------------------------------------------------------------
; void predict_4x4_vl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_4x4_vl_mmxext
- movq mm1, [parm1q-FDEC_STRIDE]
+cglobal predict_4x4_vl_mmxext, 1,1,1
+ movq mm1, [r0-FDEC_STRIDE]
movq mm3, mm1
movq mm2, mm1
psrlq mm3, 8
PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
- movd [parm1q+0*FDEC_STRIDE], mm4
- movd [parm1q+1*FDEC_STRIDE], mm0
+ movd [r0+0*FDEC_STRIDE], mm4
+ movd [r0+1*FDEC_STRIDE], mm0
psrlq mm4, 8
psrlq mm0, 8
- movd [parm1q+2*FDEC_STRIDE], mm4
- movd [parm1q+3*FDEC_STRIDE], mm0
+ movd [r0+2*FDEC_STRIDE], mm4
+ movd [r0+3*FDEC_STRIDE], mm0
- ret
+ RET
;-----------------------------------------------------------------------------
; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_v_mmxext
- movq mm0, [parm2q+16]
+cglobal predict_8x8_v_mmxext, 2,2
+ movq mm0, [r1+16]
STORE8x8 mm0, mm0
- ret
+ RET
;-----------------------------------------------------------------------------
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
-cglobal predict_8x8_dc_mmxext
+cglobal predict_8x8_dc_mmxext, 2,2,1
pxor mm0, mm0
pxor mm1, mm1
- psadbw mm0, [parm2q+7]
- psadbw mm1, [parm2q+16]
+ psadbw mm0, [r1+7]
+ psadbw mm1, [r1+16]
paddw mm0, [pw_8 GLOBAL]
paddw mm0, mm1
psrlw mm0, 4
pshufw mm0, mm0, 0
packuswb mm0, mm0
STORE8x8 mm0, mm0
- ret
+ RET
;-----------------------------------------------------------------------------
; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
-cglobal predict_8x8_dc_top_mmxext
+%macro PRED8x8_DC 2
+cglobal %1, 2,2,1
pxor mm0, mm0
- psadbw mm0, [parm2q+16]
+ psadbw mm0, [r1+%2]
paddw mm0, [pw_4 GLOBAL]
psrlw mm0, 3
pshufw mm0, mm0, 0
packuswb mm0, mm0
STORE8x8 mm0, mm0
- ret
+ RET
+%endmacro
+
+PRED8x8_DC predict_8x8_dc_top_mmxext, 16
+PRED8x8_DC predict_8x8_dc_left_mmxext, 7
+
+%ifndef ARCH_X86_64
+; sse2 is faster even on amd, so there's no sense in spending exe size on these
+; functions if we know sse2 is available.
;-----------------------------------------------------------------------------
-; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge );
+; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_dc_left_mmxext
- pxor mm0, mm0
- psadbw mm0, [parm2q+7]
- paddw mm0, [pw_4 GLOBAL]
- psrlw mm0, 3
- pshufw mm0, mm0, 0
- packuswb mm0, mm0
- STORE8x8 mm0, mm0
- ret
+cglobal predict_8x8_ddl_mmxext, 2,2,1
+ movq mm5, [r1+16]
+ movq mm2, [r1+17]
+ movq mm3, [r1+23]
+ movq mm4, [r1+25]
+ movq mm1, mm5
+ psllq mm1, 8
+ PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
+ PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6
+
+%assign Y 7
+%rep 6
+ movq [r0+Y*FDEC_STRIDE], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+%assign Y (Y-1)
+%endrep
+ movq [r0+Y*FDEC_STRIDE], mm1
+ psllq mm1, 8
+ psrlq mm0, 56
+ por mm1, mm0
+%assign Y (Y-1)
+ movq [r0+Y*FDEC_STRIDE], mm1
+ RET
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+cglobal predict_8x8_ddr_mmxext, 2,2,1
+ movq mm1, [r1+7]
+ movq mm2, [r1+9]
+ movq mm3, [r1+15]
+ movq mm4, [r1+17]
+ PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7
+ PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6
+
+%assign Y 7
+%rep 6
+ movq [r0+Y*FDEC_STRIDE], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+%assign Y (Y-1)
+%endrep
+ movq [r0+Y*FDEC_STRIDE], mm0
+ psrlq mm0, 8
+ psllq mm1, 56
+ por mm0, mm1
+%assign Y (Y-1)
+ movq [r0+Y*FDEC_STRIDE], mm0
+ RET
+
+%endif ; !ARCH_X86_64
;-----------------------------------------------------------------------------
; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl_sse2
- movdqa xmm3, [parm2q+16]
- movdqu xmm2, [parm2q+17]
+cglobal predict_8x8_ddl_sse2, 2,2,1
+ movdqa xmm3, [r1+16]
+ movdqu xmm2, [r1+17]
movdqa xmm1, xmm3
pslldq xmm1, 1
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
%assign Y 0
%rep 8
psrldq xmm0, 1
- movq [parm1q+Y*FDEC_STRIDE], xmm0
+ movq [r0+Y*FDEC_STRIDE], xmm0
%assign Y (Y+1)
%endrep
- ret
+ RET
;-----------------------------------------------------------------------------
; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddr_sse2
- movdqu xmm3, [parm2q+8]
- movdqu xmm1, [parm2q+7]
+cglobal predict_8x8_ddr_sse2, 2,2,1
+ movdqu xmm3, [r1+8]
+ movdqu xmm1, [r1+7]
movdqa xmm2, xmm3
psrldq xmm2, 1
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
psrldq xmm1, 1
%assign Y 7
%rep 3
- movq [parm1q+Y*FDEC_STRIDE], xmm0
- movq [parm1q+(Y-1)*FDEC_STRIDE], xmm1
+ movq [r0+Y*FDEC_STRIDE], xmm0
+ movq [r0+(Y-1)*FDEC_STRIDE], xmm1
psrldq xmm0, 2
psrldq xmm1, 2
%assign Y (Y-2)
%endrep
- movq [parm1q+1*FDEC_STRIDE], xmm0
- movq [parm1q+0*FDEC_STRIDE], xmm1
+ movq [r0+1*FDEC_STRIDE], xmm0
+ movq [r0+0*FDEC_STRIDE], xmm1
- ret
+ RET
;-----------------------------------------------------------------------------
; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vl_sse2
- movdqa xmm4, [parm2q+16]
+cglobal predict_8x8_vl_sse2, 2,2,1
+ movdqa xmm4, [r1+16]
movdqa xmm2, xmm4
movdqa xmm1, xmm4
movdqa xmm3, xmm4
%assign Y 0
%rep 3
psrldq xmm0, 1
- movq [parm1q+ Y *FDEC_STRIDE], xmm3
- movq [parm1q+(Y+1)*FDEC_STRIDE], xmm0
+ movq [r0+ Y *FDEC_STRIDE], xmm3
+ movq [r0+(Y+1)*FDEC_STRIDE], xmm0
psrldq xmm3, 1
%assign Y (Y+2)
%endrep
psrldq xmm0, 1
- movq [parm1q+ Y *FDEC_STRIDE], xmm3
- movq [parm1q+(Y+1)*FDEC_STRIDE], xmm0
+ movq [r0+ Y *FDEC_STRIDE], xmm3
+ movq [r0+(Y+1)*FDEC_STRIDE], xmm0
- ret
+ RET
;-----------------------------------------------------------------------------
; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
; 6 .....
; 7 ,,,,,
-cglobal predict_8x8_vr_core_mmxext
- movq mm2, [parm2q+16]
- movq mm3, [parm2q+15]
- movq mm1, [parm2q+14]
+cglobal predict_8x8_vr_core_mmxext, 2,2,1
+ movq mm2, [r1+16]
+ movq mm3, [r1+15]
+ movq mm1, [r1+14]
movq mm4, mm3
pavgb mm3, mm2
PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
%assign Y 0
%rep 3
- movq [parm1q+ Y *FDEC_STRIDE], mm3
- movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
+ movq [r0+ Y *FDEC_STRIDE], mm3
+ movq [r0+(Y+1)*FDEC_STRIDE], mm0
psllq mm3, 8
psllq mm0, 8
%assign Y (Y+2)
%endrep
- movq [parm1q+ Y *FDEC_STRIDE], mm3
- movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
+ movq [r0+ Y *FDEC_STRIDE], mm3
+ movq [r0+(Y+1)*FDEC_STRIDE], mm0
- ret
+ RET
;-----------------------------------------------------------------------------
; void predict_8x8c_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_8x8c_v_mmx
- movq mm0, [parm1q - FDEC_STRIDE]
+cglobal predict_8x8c_v_mmx, 1,1
+ movq mm0, [r0 - FDEC_STRIDE]
STORE8x8 mm0, mm0
- ret
+ RET
;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
-cglobal predict_8x8c_dc_core_mmxext
- movq mm0, [parm1q - FDEC_STRIDE]
+cglobal predict_8x8c_dc_core_mmxext, 1,1,1
+ movq mm0, [r0 - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
punpckhbw mm1, mm0
psadbw mm1, mm2 ; s1
psadbw mm0, mm2 ; s0
- movd mm4, parm2d
- movd mm5, parm3d
+%ifdef ARCH_X86_64
+ movd mm4, r1d
+ movd mm5, r2d
paddw mm0, mm4
pshufw mm2, mm5, 0
+%else
+ paddw mm0, r1m
+ pshufw mm2, r2m, 0
+%endif
psrlw mm0, 3
paddw mm1, [pw_2 GLOBAL]
movq mm3, mm2
packuswb mm2, mm3 ; dc2,dc3 (b)
STORE8x8 mm0, mm2
- ret
+ RET
-;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
-;-----------------------------------------------------------------------------
-cglobal predict_8x8c_p_core_mmxext
- movd mm0, parm2d
- movd mm2, parm3d
- movd mm4, parm4d
+%macro LOAD_PLANE_ARGS 0
+%ifdef ARCH_X86_64
+ movd mm0, r1d
+ movd mm2, r2d
+ movd mm4, r3d
pshufw mm0, mm0, 0
pshufw mm2, mm2, 0
pshufw mm4, mm4, 0
+%else
+ pshufw mm0, r1m, 0
+ pshufw mm2, r2m, 0
+ pshufw mm4, r3m, 0
+%endif
+%endmacro
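+
+; (on x86_32 pshufw broadcasts the low word straight from the stack
+; arguments, so the plane parameters never touch a GPR)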
+
+;-----------------------------------------------------------------------------
+; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+;-----------------------------------------------------------------------------
+cglobal predict_8x8c_p_core_mmxext, 1,2,1
+ LOAD_PLANE_ARGS
movq mm1, mm2
pmullw mm2, [pw_3210 GLOBAL]
psllw mm1, 2
paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
- mov eax, 8
+ mov r1d, 8
ALIGN 4
.loop:
movq mm5, mm0
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
- movq [parm1q], mm5
+ movq [r0], mm5
paddsw mm0, mm4
paddsw mm1, mm4
- add parm1q, FDEC_STRIDE
- dec eax
+ add r0, FDEC_STRIDE
+ dec r1d
jg .loop
-
- nop
- ret
+ REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_mmxext
- movd mm0, parm2d
- movd mm2, parm3d
- movd mm4, parm4d
- pshufw mm0, mm0, 0
- pshufw mm2, mm2, 0
- pshufw mm4, mm4, 0
+cglobal predict_16x16_p_core_mmxext, 1,2,1
+ LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
pmullw mm5, [pw_3210 GLOBAL]
paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
- mov eax, 16
+ mov r1d, 16
ALIGN 4
.loop:
movq mm5, mm0
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
- movq [parm1q], mm5
+ movq [r0], mm5
movq mm5, mm2
movq mm6, mm3
psraw mm5, 5
psraw mm6, 5
packuswb mm5, mm6
- movq [parm1q+8], mm5
+ movq [r0+8], mm5
paddsw mm0, mm4
paddsw mm1, mm4
paddsw mm2, mm4
paddsw mm3, mm4
- add parm1q, FDEC_STRIDE
- dec eax
+ add r0, FDEC_STRIDE
+ dec r1d
jg .loop
-
- nop
- ret
+ REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_sse2
- movd xmm0, parm2d
- movd xmm1, parm3d
- movd xmm2, parm4d
+cglobal predict_16x16_p_core_sse2, 1,2,1
+ movd xmm0, r1m
+ movd xmm1, r2m
+ movd xmm2, r3m
pshuflw xmm0, xmm0, 0
pshuflw xmm1, xmm1, 0
pshuflw xmm2, xmm2, 0
paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
- mov eax, 16
+ mov r1d, 16
ALIGN 4
.loop:
movdqa xmm3, xmm0
psraw xmm3, 5
psraw xmm4, 5
packuswb xmm3, xmm4
- movdqa [parm1q], xmm3
+ movdqa [r0], xmm3
paddsw xmm0, xmm2
paddsw xmm1, xmm2
- add parm1q, FDEC_STRIDE
- dec eax
+ add r0, FDEC_STRIDE
+ dec r1d
jg .loop
-
- nop
- ret
+ REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_v_mmx
- movq mm0, [parm1q - FDEC_STRIDE]
- movq mm1, [parm1q - FDEC_STRIDE + 8]
+cglobal predict_16x16_v_mmx, 1,2
+ movq mm0, [r0 - FDEC_STRIDE]
+ movq mm1, [r0 - FDEC_STRIDE + 8]
STORE16x16 mm0, mm1
- ret
+ REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_v_sse2( uint8_t *src )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_v_sse2
- movdqa xmm0, [parm1q - FDEC_STRIDE]
+cglobal predict_16x16_v_sse2, 1,2
+ movdqa xmm0, [r0 - FDEC_STRIDE]
STORE16x16_SSE2 xmm0
- ret
+ REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
%macro PRED16x16_DC 2
pxor mm0, mm0
pxor mm1, mm1
- psadbw mm0, [parm1q - FDEC_STRIDE]
- psadbw mm1, [parm1q - FDEC_STRIDE + 8]
+ psadbw mm0, [r0 - FDEC_STRIDE]
+ psadbw mm1, [r0 - FDEC_STRIDE + 8]
paddusw mm0, mm1
paddusw mm0, %1
psrlw mm0, %2 ; dc
STORE16x16 mm0, mm0
%endmacro
-cglobal predict_16x16_dc_core_mmxext
- movd mm2, parm2d
+cglobal predict_16x16_dc_core_mmxext, 1,2
+%ifdef ARCH_X86_64
+ movd mm2, r1d
PRED16x16_DC mm2, 5
- ret
+%else
+ PRED16x16_DC r1m, 5
+%endif
+ REP_RET
-cglobal predict_16x16_dc_top_mmxext
+cglobal predict_16x16_dc_top_mmxext, 1,2,1
PRED16x16_DC [pw_8 GLOBAL], 4
- ret
+ REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
%macro PRED16x16_DC_SSE2 2
pxor xmm0, xmm0
- psadbw xmm0, [parm1q - FDEC_STRIDE]
+ psadbw xmm0, [r0 - FDEC_STRIDE]
movhlps xmm1, xmm0
paddw xmm0, xmm1
paddusw xmm0, %1
STORE16x16_SSE2 xmm0
%endmacro
-cglobal predict_16x16_dc_core_sse2
- movd xmm2, parm2d
+cglobal predict_16x16_dc_core_sse2, 1,2
+ movd xmm2, r1m
PRED16x16_DC_SSE2 xmm2, 5
- ret
+ REP_RET
-cglobal predict_16x16_dc_top_sse2
+cglobal predict_16x16_dc_top_sse2, 1,2,1
PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
- ret
+ REP_RET
t=e; e+=f; f-=t;\
t=g; g+=h; h-=t;
+#define INTRA_SA8D_X3(cpu) \
+void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )\
+{\
+ PREDICT_8x8_LOAD_TOP\
+ PREDICT_8x8_LOAD_LEFT\
+ int t;\
+ DECLARE_ALIGNED( int16_t, sa8d_1d[2][8], 16 );\
+ SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\
+ SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\
+ SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\
+ sa8d_1d[0][0] = l0;\
+ sa8d_1d[0][1] = l1;\
+ sa8d_1d[0][2] = l2;\
+ sa8d_1d[0][3] = l3;\
+ sa8d_1d[0][4] = l4;\
+ sa8d_1d[0][5] = l5;\
+ sa8d_1d[0][6] = l6;\
+ sa8d_1d[0][7] = l7;\
+ SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);\
+ SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);\
+ SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);\
+ sa8d_1d[1][0] = t0;\
+ sa8d_1d[1][1] = t1;\
+ sa8d_1d[1][2] = t2;\
+ sa8d_1d[1][3] = t3;\
+ sa8d_1d[1][4] = t4;\
+ sa8d_1d[1][5] = t5;\
+ sa8d_1d[1][6] = t6;\
+ sa8d_1d[1][7] = t7;\
+ x264_intra_sa8d_x3_8x8_core_##cpu( fenc, sa8d_1d, res );\
+}
+
#ifdef ARCH_X86_64
-void x264_intra_sa8d_x3_8x8_sse2( uint8_t *fenc, uint8_t edge[33], int res[3] )
-#else
-void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *fenc, uint8_t edge[33], int res[3] )
-#endif
-{
- PREDICT_8x8_LOAD_TOP
- PREDICT_8x8_LOAD_LEFT
- int t;
- DECLARE_ALIGNED( int16_t, sa8d_1d[2][8], 16 );
- SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);
- SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);
- SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);
- sa8d_1d[0][0] = l0;
- sa8d_1d[0][1] = l1;
- sa8d_1d[0][2] = l2;
- sa8d_1d[0][3] = l3;
- sa8d_1d[0][4] = l4;
- sa8d_1d[0][5] = l5;
- sa8d_1d[0][6] = l6;
- sa8d_1d[0][7] = l7;
- SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);
- SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);
- SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);
- sa8d_1d[1][0] = t0;
- sa8d_1d[1][1] = t1;
- sa8d_1d[1][2] = t2;
- sa8d_1d[1][3] = t3;
- sa8d_1d[1][4] = t4;
- sa8d_1d[1][5] = t5;
- sa8d_1d[1][6] = t6;
- sa8d_1d[1][7] = t7;
-#ifdef ARCH_X86_64
- x264_intra_sa8d_x3_8x8_core_sse2( fenc, sa8d_1d, res );
+INTRA_SA8D_X3(sse2)
+INTRA_SA8D_X3(ssse3)
#else
- x264_intra_sa8d_x3_8x8_core_mmxext( fenc, sa8d_1d, res );
+INTRA_SA8D_X3(mmxext)
#endif
-}
/****************************************************************************
* Exported functions:
* predict.h: h264 encoder library
*****************************************************************************
* Copyright (C) 2003 Laurent Aimar
- * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
;*****************************************************************************
;* quant-a.asm: h264 encoder library
;*****************************************************************************
-;* Copyright (C) 2005 x264 project
+;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
-BITS 64
-
-%include "amd64inc.asm"
+%include "x86inc.asm"
SECTION_RODATA
pd_1: times 2 dd 1
SECTION .text
%macro MMX_QUANT_DC_START 0
- movd mm6, parm2d ; mf
- movd mm7, parm3d ; bias
+ movd mm6, r1m ; mf
+ movd mm7, r2m ; bias
pshufw mm6, mm6, 0
pshufw mm7, mm7, 0
%endmacro
%macro SSE2_QUANT_DC_START 0
- movd xmm6, parm2d ; mf
- movd xmm7, parm3d ; bias
+ movd xmm6, r1m ; mf
+ movd xmm7, r2m ; bias
pshuflw xmm6, xmm6, 0
pshuflw xmm7, xmm7, 0
punpcklqdq xmm6, xmm6
;-----------------------------------------------------------------------------
; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_mmxext
+cglobal x264_quant_2x2_dc_mmxext, 1,1
MMX_QUANT_DC_START
- MMX_QUANT_1x4 [parm1q], mm6, mm7
- ret
-
-%macro QUANT_SSE 1
-;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias )
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_%1
- SSE2_QUANT_DC_START
-%assign x 0
-%rep 2
- QUANT_1x8 [parm1q+x], xmm6, xmm7
-%assign x (x+16)
-%endrep
- ret
+ MMX_QUANT_1x4 [r0], mm6, mm7
+ RET
;-----------------------------------------------------------------------------
-; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_%1
+%macro QUANT_DC 6
+cglobal %1, 1,1
+ %2
%assign x 0
-%rep 2
- QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]
-%assign x (x+16)
+%rep %5
+ %3 [r0+x], %4m6, %4m7
+%assign x x+%6
%endrep
- ret
+ RET
+%endmacro
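+; (%4 is the register-name prefix: 'm' forms mm6/mm7, 'xm' forms xmm6/xmm7)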
;-----------------------------------------------------------------------------
-; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
+; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_%1
+%macro QUANT_AC 4
+cglobal %1, 3,3
%assign x 0
-%rep 8
- QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]
-%assign x (x+16)
+%rep %3
+ %2 [r0+x], [r1+x], [r2+x]
+%assign x x+%4
%endrep
- ret
+ RET
%endmacro
-%define QUANT_1x8 SSE2_QUANT_1x8
-QUANT_SSE sse2
+%ifndef ARCH_X86_64 ; not needed because sse2 is faster
+QUANT_DC x264_quant_4x4_dc_mmxext, MMX_QUANT_DC_START, MMX_QUANT_1x4, m, 4, 8
+QUANT_AC x264_quant_4x4_mmx, MMX_QUANT_1x4, 4, 8
+QUANT_AC x264_quant_8x8_mmx, MMX_QUANT_1x4, 16, 8
+%endif
+
+QUANT_DC x264_quant_4x4_dc_sse2, SSE2_QUANT_DC_START, SSE2_QUANT_1x8, xm, 2, 16
+QUANT_AC x264_quant_4x4_sse2, SSE2_QUANT_1x8, 2, 16
+QUANT_AC x264_quant_8x8_sse2, SSE2_QUANT_1x8, 8, 16
+
%ifdef HAVE_SSE3
-%define QUANT_1x8 SSSE3_QUANT_1x8
-QUANT_SSE ssse3
+QUANT_DC x264_quant_4x4_dc_ssse3, SSE2_QUANT_DC_START, SSSE3_QUANT_1x8, xm, 2, 16
+QUANT_AC x264_quant_4x4_ssse3, SSSE3_QUANT_1x8, 2, 16
+QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
%endif
movq %1, mm0
%endmacro
+%macro DEQUANT_LOOP 2
+ mov t0d, 8*(%2-2)
+%%loop:
+ %1 [r0+t0+8], [r1+t0*2+16], [r1+t0*2+24]
+ %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8]
+ sub t0d, 16
+ jge %%loop
+ rep ret
+%endmacro
+
;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
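+; (i_qp/6 below is computed as (i_qp*0x2b)>>8, which is exact over the
+; valid qp range)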
%macro DEQUANT_WxH 3
-cglobal %1
-; mov rdi, rdi ; dct
-; mov rsi, rsi ; dequant_mf
-; mov edx, edx ; i_qp
-
- imul eax, edx, 0x2b
- shr eax, 8 ; i_qbits = i_qp / 6
- lea ecx, [eax+eax*2]
- sub edx, ecx
- sub edx, ecx ; i_mf = i_qp % 6
- shl edx, %3+2
- movsxd rdx, edx
- add rsi, rdx ; dequant_mf[i_mf]
-
- sub eax, %3
+cglobal %1, 0,3
+%ifdef ARCH_X86_64
+ %define t0 r4
+ %define t0d r4d
+ imul r4d, r2d, 0x2b
+ shr r4d, 8 ; i_qbits = i_qp / 6
+ lea r3d, [r4d*3]
+ sub r2, r3
+ sub r2, r3 ; i_mf = i_qp % 6
+ shl r2, %3+2
+ add r1, r2 ; dequant_mf[i_mf]
+%else
+ %define t0 r2
+ %define t0d r2d
+ mov r1, r2m ; i_qp
+ imul r2, r1, 0x2b
+ shr r2, 8 ; i_qbits = i_qp / 6
+ lea r0, [r2*3]
+ sub r1, r0
+ sub r1, r0 ; i_mf = i_qp % 6
+ shl r1, %3+2
+ add r1, r1m ; dequant_mf[i_mf]
+ mov r0, r0m ; dct
+%endif
+
+ sub t0d, %3
jl .rshift32 ; negative qbits => rightshift
.lshift:
- movd mm5, eax
-
-%rep %2
- DEQUANT16_L_1x4 [rdi], [rsi], [rsi+8]
- add rsi, byte 16
- add rdi, byte 8
-%endrep
-
- ret
+ movd mm5, t0d
+ DEQUANT_LOOP DEQUANT16_L_1x4, %2
.rshift32:
- neg eax
- movd mm5, eax
+ neg t0d
+ movd mm5, t0d
+ picgetgot t0d
movq mm6, [pd_1 GLOBAL]
pxor mm7, mm7
pslld mm6, mm5
psrld mm6, 1
-
-%rep %2
- DEQUANT32_R_1x4 [rdi], [rsi], [rsi+8]
- add rsi, byte 16
- add rdi, byte 8
-%endrep
-
- ret
+ DEQUANT_LOOP DEQUANT32_R_1x4, %2
%endmacro
DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4
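+
+; The i_qbits computation above divides by 6 with a reciprocal multiply:
+; (q*0x2b)>>8 == q/6 holds for all 0 <= q <= 95 (43/256 ~= 1/6), which more
+; than covers the qp range. A minimal C check of the identity (illustrative
+; only):
+;
+;   for( int q = 0; q < 96; q++ )
+;       assert( (q*0x2b)>>8 == q/6 );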
--- /dev/null
+;*****************************************************************************
+;* sad-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003-2008 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Laurent Aimar <fenrir@via.ecp.fr>
+;* Alex Izvorski <aizvorksi@gmail.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+sw_64: dq 64
+
+SECTION .text
+
+;=============================================================================
+; SAD MMX
+;=============================================================================
+
+%macro SAD_INC_2x16P 0
+ movq mm1, [r0]
+ movq mm2, [r0+8]
+ movq mm3, [r0+r1]
+ movq mm4, [r0+r1+8]
+ psadbw mm1, [r2]
+ psadbw mm2, [r2+8]
+ psadbw mm3, [r2+r3]
+ psadbw mm4, [r2+r3+8]
+ lea r0, [r0+2*r1]
+ paddw mm1, mm2
+ paddw mm3, mm4
+ lea r2, [r2+2*r3]
+ paddw mm0, mm1
+ paddw mm0, mm3
+%endmacro
+
+%macro SAD_INC_2x8P 0
+ movq mm1, [r0]
+ movq mm2, [r0+r1]
+ psadbw mm1, [r2]
+ psadbw mm2, [r2+r3]
+ lea r0, [r0+2*r1]
+ paddw mm0, mm1
+ paddw mm0, mm2
+ lea r2, [r2+2*r3]
+%endmacro
+
+%macro SAD_INC_2x4P 0
+ movd mm1, [r0]
+ movd mm2, [r2]
+ punpckldq mm1, [r0+r1]
+ punpckldq mm2, [r2+r3]
+ psadbw mm1, mm2
+ paddw mm0, mm1
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+%macro SAD 2
+cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
+ pxor mm0, mm0
+%rep %2/2
+ SAD_INC_2x%1P
+%endrep
+ movd eax, mm0
+ RET
+%endmacro
+
+SAD 16, 16
+SAD 16, 8
+SAD 8, 16
+SAD 8, 8
+SAD 8, 4
+SAD 4, 8
+SAD 4, 4
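+
+; All of the above compute the same result as this C reference (a sketch;
+; parameter names are illustrative):
+;
+;   int sad = 0;
+;   for( int y = 0; y < height; y++ )
+;       for( int x = 0; x < width; x++ )
+;           sad += abs( pix1[y*stride1+x] - pix2[y*stride2+x] );
+;   return sad;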
+
+
+
+;=============================================================================
+; SAD XMM
+;=============================================================================
+
+%macro SAD_END_SSE2 0
+ movhlps xmm1, xmm0
+ paddw xmm0, xmm1
+ movd eax, xmm0
+ RET
+%endmacro
+
+%macro SAD_W16 1
+;-----------------------------------------------------------------------------
+; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_sad_16x16_%1, 4,4
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movdqu xmm2, [r2]
+ movdqu xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ psadbw xmm0, [r0]
+ psadbw xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r2]
+ paddw xmm0, xmm1
+ psadbw xmm2, [r0]
+ psadbw xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ paddw xmm2, xmm3
+ movdqu xmm6, [r2]
+ movdqu xmm7, [r2+r3]
+ lea r2, [r2+2*r3]
+ paddw xmm0, xmm2
+ psadbw xmm4, [r0]
+ psadbw xmm5, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm1, [r2]
+ paddw xmm4, xmm5
+ psadbw xmm6, [r0]
+ psadbw xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r2+r3]
+ lea r2, [r2+2*r3]
+ paddw xmm6, xmm7
+ movdqu xmm3, [r2]
+ paddw xmm0, xmm4
+ movdqu xmm4, [r2+r3]
+ lea r2, [r2+2*r3]
+ paddw xmm0, xmm6
+ psadbw xmm1, [r0]
+ psadbw xmm2, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm5, [r2]
+ paddw xmm1, xmm2
+ psadbw xmm3, [r0]
+ psadbw xmm4, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r2+r3]
+ lea r2, [r2+2*r3]
+ paddw xmm3, xmm4
+ movdqu xmm7, [r2]
+ paddw xmm0, xmm1
+ movdqu xmm1, [r2+r3]
+ paddw xmm0, xmm3
+ psadbw xmm5, [r0]
+ psadbw xmm6, [r0+r1]
+ lea r0, [r0+2*r1]
+ paddw xmm5, xmm6
+ psadbw xmm7, [r0]
+ psadbw xmm1, [r0+r1]
+ paddw xmm7, xmm1
+ paddw xmm0, xmm5
+ paddw xmm0, xmm7
+ SAD_END_SSE2
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+cglobal x264_pixel_sad_16x8_%1, 4,4
+ movdqu xmm0, [r2]
+ movdqu xmm2, [r2+r3]
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ movdqu xmm4, [r2+r3]
+ psadbw xmm0, [r0]
+ psadbw xmm2, [r0+r1]
+ lea r0, [r0+2*r1]
+ psadbw xmm3, [r0]
+ psadbw xmm4, [r0+r1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ paddw xmm0, xmm2
+ paddw xmm3, xmm4
+ paddw xmm0, xmm3
+ movdqu xmm1, [r2]
+ movdqu xmm2, [r2+r3]
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ movdqu xmm4, [r2+r3]
+ psadbw xmm1, [r0]
+ psadbw xmm2, [r0+r1]
+ lea r0, [r0+2*r1]
+ psadbw xmm3, [r0]
+ psadbw xmm4, [r0+r1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm0, xmm1
+ paddw xmm0, xmm3
+ SAD_END_SSE2
+%endmacro
+
+SAD_W16 sse2
+%ifdef HAVE_SSE3
+%define movdqu lddqu
+SAD_W16 sse3
+%undef movdqu
+%endif
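+
+; The sse3 versions are generated by textually substituting lddqu for movdqu:
+; the %define is in effect while SAD_W16 re-expands, so it emits identical
+; code except for the load, which is cheaper on cacheline splits (though, as
+; noted below, only on Pentium 4).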
+
+
+
+;=============================================================================
+; SAD x3/x4 MMX
+;=============================================================================
+
+%macro SAD_X3_START_1x8P 0
+ movq mm3, [r0]
+ movq mm0, [r1]
+ movq mm1, [r2]
+ movq mm2, [r3]
+ psadbw mm0, mm3
+ psadbw mm1, mm3
+ psadbw mm2, mm3
+%endmacro
+
+%macro SAD_X3_1x8P 2
+ movq mm3, [r0+%1]
+ movq mm4, [r1+%2]
+ movq mm5, [r2+%2]
+ movq mm6, [r3+%2]
+ psadbw mm4, mm3
+ psadbw mm5, mm3
+ psadbw mm6, mm3
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm6
+%endmacro
+
+%macro SAD_X3_START_2x4P 3
+ movd mm3, [r0]
+ movd %1, [r1]
+ movd %2, [r2]
+ movd %3, [r3]
+ punpckldq mm3, [r0+FENC_STRIDE]
+ punpckldq %1, [r1+r4]
+ punpckldq %2, [r2+r4]
+ punpckldq %3, [r3+r4]
+ psadbw %1, mm3
+ psadbw %2, mm3
+ psadbw %3, mm3
+%endmacro
+
+%macro SAD_X3_2x16P 1
+%if %1
+ SAD_X3_START_1x8P
+%else
+ SAD_X3_1x8P 0, 0
+%endif
+ SAD_X3_1x8P 8, 8
+ SAD_X3_1x8P FENC_STRIDE, r4
+ SAD_X3_1x8P FENC_STRIDE+8, r4+8
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r4]
+ lea r2, [r2+2*r4]
+ lea r3, [r3+2*r4]
+%endmacro
+
+%macro SAD_X3_2x8P 1
+%if %1
+ SAD_X3_START_1x8P
+%else
+ SAD_X3_1x8P 0, 0
+%endif
+ SAD_X3_1x8P FENC_STRIDE, r4
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r4]
+ lea r2, [r2+2*r4]
+ lea r3, [r3+2*r4]
+%endmacro
+
+%macro SAD_X3_2x4P 1
+%if %1
+ SAD_X3_START_2x4P mm0, mm1, mm2
+%else
+ SAD_X3_START_2x4P mm4, mm5, mm6
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm6
+%endif
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r4]
+ lea r2, [r2+2*r4]
+ lea r3, [r3+2*r4]
+%endmacro
+
+%macro SAD_X4_START_1x8P 0
+ movq mm7, [r0]
+ movq mm0, [r1]
+ movq mm1, [r2]
+ movq mm2, [r3]
+ movq mm3, [r4]
+ psadbw mm0, mm7
+ psadbw mm1, mm7
+ psadbw mm2, mm7
+ psadbw mm3, mm7
+%endmacro
+
+%macro SAD_X4_1x8P 2
+ movq mm7, [r0+%1]
+ movq mm4, [r1+%2]
+ movq mm5, [r2+%2]
+ movq mm6, [r3+%2]
+ psadbw mm4, mm7
+ psadbw mm5, mm7
+ psadbw mm6, mm7
+ psadbw mm7, [r4+%2]
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm6
+ paddw mm3, mm7
+%endmacro
+
+%macro SAD_X4_START_2x4P 0
+ movd mm7, [r0]
+ movd mm0, [r1]
+ movd mm1, [r2]
+ movd mm2, [r3]
+ movd mm3, [r4]
+ punpckldq mm7, [r0+FENC_STRIDE]
+ punpckldq mm0, [r1+r5]
+ punpckldq mm1, [r2+r5]
+ punpckldq mm2, [r3+r5]
+ punpckldq mm3, [r4+r5]
+ psadbw mm0, mm7
+ psadbw mm1, mm7
+ psadbw mm2, mm7
+ psadbw mm3, mm7
+%endmacro
+
+%macro SAD_X4_INC_2x4P 0
+ movd mm7, [r0]
+ movd mm4, [r1]
+ movd mm5, [r2]
+ punpckldq mm7, [r0+FENC_STRIDE]
+ punpckldq mm4, [r1+r5]
+ punpckldq mm5, [r2+r5]
+ psadbw mm4, mm7
+ psadbw mm5, mm7
+ paddw mm0, mm4
+ paddw mm1, mm5
+ movd mm4, [r3]
+ movd mm5, [r4]
+ punpckldq mm4, [r3+r5]
+ punpckldq mm5, [r4+r5]
+ psadbw mm4, mm7
+ psadbw mm5, mm7
+ paddw mm2, mm4
+ paddw mm3, mm5
+%endmacro
+
+%macro SAD_X4_2x16P 1
+%if %1
+ SAD_X4_START_1x8P
+%else
+ SAD_X4_1x8P 0, 0
+%endif
+ SAD_X4_1x8P 8, 8
+ SAD_X4_1x8P FENC_STRIDE, r5
+ SAD_X4_1x8P FENC_STRIDE+8, r5+8
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r5]
+ lea r2, [r2+2*r5]
+ lea r3, [r3+2*r5]
+ lea r4, [r4+2*r5]
+%endmacro
+
+%macro SAD_X4_2x8P 1
+%if %1
+ SAD_X4_START_1x8P
+%else
+ SAD_X4_1x8P 0, 0
+%endif
+ SAD_X4_1x8P FENC_STRIDE, r5
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r5]
+ lea r2, [r2+2*r5]
+ lea r3, [r3+2*r5]
+ lea r4, [r4+2*r5]
+%endmacro
+
+%macro SAD_X4_2x4P 1
+%if %1
+ SAD_X4_START_2x4P
+%else
+ SAD_X4_INC_2x4P
+%endif
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r5]
+ lea r2, [r2+2*r5]
+ lea r3, [r3+2*r5]
+ lea r4, [r4+2*r5]
+%endmacro
+
+%macro SAD_X3_END 0
+%ifdef ARCH_X86_64
+ movd [r5+0], mm0
+ movd [r5+4], mm1
+ movd [r5+8], mm2
+%else
+ mov r0, r5m
+ movd [r0+0], mm0
+ movd [r0+4], mm1
+ movd [r0+8], mm2
+%endif
+ RET
+%endmacro
+
+%macro SAD_X4_END 0
+ mov r0, r6m
+ movd [r0+0], mm0
+ movd [r0+4], mm1
+ movd [r0+8], mm2
+ movd [r0+12], mm3
+ RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, int i_stride, int scores[3] )
+;-----------------------------------------------------------------------------
+%macro SAD_X 3
+cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
+ SAD_X%1_2x%2P 1
+%rep %3/2-1
+ SAD_X%1_2x%2P 0
+%endrep
+ SAD_X%1_END
+%endmacro
+
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+SAD_X 3, 4, 8
+SAD_X 3, 4, 4
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+SAD_X 4, 4, 8
+SAD_X 4, 4, 4
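+
+; Semantically (a C sketch, names illustrative), sad_x3 is equivalent to
+; three calls to the corresponding sad function, with fenc always read at
+; FENC_STRIDE and all references sharing one stride:
+;
+;   for( int k = 0; k < 3; k++ )
+;       scores[k] = sad( fenc, FENC_STRIDE, pix[k], i_stride );
+;
+; Batching them amortizes the fenc loads across the three (or four) refs.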
+
+
+
+;=============================================================================
+; SAD x3/x4 XMM
+;=============================================================================
+
+%macro SAD_X3_START_1x16P_SSE2 0
+ movdqa xmm3, [r0]
+ movdqu xmm0, [r1]
+ movdqu xmm1, [r2]
+ movdqu xmm2, [r3]
+ psadbw xmm0, xmm3
+ psadbw xmm1, xmm3
+ psadbw xmm2, xmm3
+%endmacro
+
+%macro SAD_X3_1x16P_SSE2 2
+ movdqa xmm3, [r0+%1]
+ movdqu xmm4, [r1+%2]
+ movdqu xmm5, [r2+%2]
+ movdqu xmm6, [r3+%2]
+ psadbw xmm4, xmm3
+ psadbw xmm5, xmm3
+ psadbw xmm6, xmm3
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm6
+%endmacro
+
+%macro SAD_X3_2x16P_SSE2 1
+%if %1
+ SAD_X3_START_1x16P_SSE2
+%else
+ SAD_X3_1x16P_SSE2 0, 0
+%endif
+ SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r4]
+ lea r2, [r2+2*r4]
+ lea r3, [r3+2*r4]
+%endmacro
+
+%macro SAD_X4_START_1x16P_SSE2 0
+ movdqa xmm7, [r0]
+ movdqu xmm0, [r1]
+ movdqu xmm1, [r2]
+ movdqu xmm2, [r3]
+ movdqu xmm3, [r4]
+ psadbw xmm0, xmm7
+ psadbw xmm1, xmm7
+ psadbw xmm2, xmm7
+ psadbw xmm3, xmm7
+%endmacro
+
+%macro SAD_X4_1x16P_SSE2 2
+ movdqa xmm7, [r0+%1]
+ movdqu xmm4, [r1+%2]
+ movdqu xmm5, [r2+%2]
+ movdqu xmm6, [r3+%2]
+%ifdef ARCH_X86_64
+ movdqu xmm8, [r4+%2]
+ psadbw xmm4, xmm7
+ psadbw xmm5, xmm7
+ psadbw xmm6, xmm7
+ psadbw xmm8, xmm7
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm6
+ paddw xmm3, xmm8
+%else
+ psadbw xmm4, xmm7
+ psadbw xmm5, xmm7
+ paddw xmm0, xmm4
+ psadbw xmm6, xmm7
+ movdqu xmm4, [r4+%2]
+ paddw xmm1, xmm5
+ psadbw xmm4, xmm7
+ paddw xmm2, xmm6
+ paddw xmm3, xmm4
+%endif
+%endmacro
+
+%macro SAD_X4_2x16P_SSE2 1
+%if %1
+ SAD_X4_START_1x16P_SSE2
+%else
+ SAD_X4_1x16P_SSE2 0, 0
+%endif
+ SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
+ add r0, 2*FENC_STRIDE
+ lea r1, [r1+2*r5]
+ lea r2, [r2+2*r5]
+ lea r3, [r3+2*r5]
+ lea r4, [r4+2*r5]
+%endmacro
+
+%macro SAD_X3_END_SSE2 0
+ movhlps xmm4, xmm0
+ movhlps xmm5, xmm1
+ movhlps xmm6, xmm2
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm6
+%ifdef ARCH_X86_64
+ movd [r5+0], xmm0
+ movd [r5+4], xmm1
+ movd [r5+8], xmm2
+%else
+ mov r0, r5m
+ movd [r0+0], xmm0
+ movd [r0+4], xmm1
+ movd [r0+8], xmm2
+%endif
+ RET
+%endmacro
+
+%macro SAD_X4_END_SSE2 0
+ mov r0, r6m
+ psllq xmm1, 32
+ psllq xmm3, 32
+ paddw xmm0, xmm1
+ paddw xmm2, xmm3
+ movhlps xmm1, xmm0
+ movhlps xmm3, xmm2
+ paddw xmm0, xmm1
+ paddw xmm2, xmm3
+ movq [r0+0], xmm0
+ movq [r0+8], xmm2
+ RET
+%endmacro
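+
+; In SAD_X4_END_SSE2, each accumulator holds two 16-bit partial sums, one per
+; 64-bit half (the psadbw output format); a 16x16 SAD is at most 65280, so
+; they fit. psllq+paddw merges xmm1 into the high dwords of xmm0 (and xmm3
+; into xmm2), movhlps+paddw then folds the halves, leaving scores 0-1 and
+; 2-3 as packed dwords for the two movq stores.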
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, int i_stride, int scores[3] )
+;-----------------------------------------------------------------------------
+%macro SAD_X_SSE2 4
+cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
+ SAD_X%1_2x%2P_SSE2 1
+%rep %3/2-1
+ SAD_X%1_2x%2P_SSE2 0
+%endrep
+ SAD_X%1_END_SSE2
+%endmacro
+
+SAD_X_SSE2 3, 16, 16, sse2
+SAD_X_SSE2 3, 16, 8, sse2
+SAD_X_SSE2 4, 16, 16, sse2
+SAD_X_SSE2 4, 16, 8, sse2
+
+%ifdef HAVE_SSE3
+%define movdqu lddqu
+SAD_X_SSE2 3, 16, 16, sse3
+SAD_X_SSE2 3, 16, 8, sse3
+SAD_X_SSE2 4, 16, 16, sse3
+SAD_X_SSE2 4, 16, 8, sse3
+%undef movdqu
+%endif
+
+
+
+;=============================================================================
+; SAD cacheline split
+;=============================================================================
+
+; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
+; unless the unaligned data spans the border between 2 cachelines, in which
+; case it's really slow. The exact numbers may differ, but all Intel cpus
+; have a large penalty for cacheline splits.
+; (8-byte alignment exactly half way between two cachelines is ok though.)
+; LDDQU was supposed to fix this, but it only works on Pentium 4.
+; So in the split case we load aligned data and explicitly perform the
+; alignment between registers. Like on archs that have only aligned loads,
+; except complicated by the fact that PALIGNR takes only an immediate, not
+; a variable alignment.
+; It is also possible to hoist the realignment to the macroblock level (keep
+; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
+; needed for that method makes it often slower.
+
+; sad 16x16 costs on Core2:
+; good offsets: 49 cycles (50/64 of all mvs)
+; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
+; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
+; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
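+
+; A C sketch of the realignment (illustrative; a 16-byte vector treated as a
+; little-endian 128-bit integer, with a = address & 15):
+;
+;   lo  = load_aligned( p - a );           // [r2]
+;   hi  = load_aligned( p - a + 16 );      // [r2+16]
+;   val = (lo >> 8*a) | (hi << 8*(16-a));  // psrldq / pslldq / por below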
+
+; computed jump assumes this loop is exactly 80 bytes
+%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
+ALIGN 16
+sad_w16_align%1_sse2:
+ movdqa xmm1, [r2+16]
+ movdqa xmm2, [r2+r3+16]
+ movdqa xmm3, [r2]
+ movdqa xmm4, [r2+r3]
+ pslldq xmm1, 16-%1
+ pslldq xmm2, 16-%1
+ psrldq xmm3, %1
+ psrldq xmm4, %1
+ por xmm1, xmm3
+ por xmm2, xmm4
+ psadbw xmm1, [r0]
+ psadbw xmm2, [r0+r1]
+ paddw xmm0, xmm1
+ paddw xmm0, xmm2
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ dec r4
+ jg sad_w16_align%1_sse2
+ rep ret
+%endmacro
+
+; computed jump assumes this loop is exactly 64 bytes
+%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
+ALIGN 16
+sad_w16_align%1_ssse3:
+ movdqa xmm1, [r2+16]
+ movdqa xmm2, [r2+r3+16]
+ palignr xmm1, [r2], %1
+ palignr xmm2, [r2+r3], %1
+ psadbw xmm1, [r0]
+ psadbw xmm2, [r0+r1]
+ paddw xmm0, xmm1
+ paddw xmm0, xmm2
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ dec r4
+ jg sad_w16_align%1_ssse3
+ rep ret
+%endmacro
+
+%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
+cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0
+ mov eax, r2m
+ and eax, 0x37
+ cmp eax, 0x30
+ jle x264_pixel_sad_16x%2_sse2
+ PROLOGUE 4,6,0
+ mov r4d, r2d
+ and r4d, 15
+%ifidn %1, ssse3
+ shl r4d, 6 ; code size = 64
+%else
+ lea r4, [r4*5]
+ shl r4d, 4 ; code size = 80
+%endif
+%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
+%ifdef PIC64
+ lea r5, [sad_w16_addr GLOBAL]
+ add r5, r4
+%else
+ picgetgot r5
+ lea r5, [sad_w16_addr + r4 GLOBAL]
+%endif
+ and r2, ~15
+ mov r4d, %2/2
+ pxor xmm0, xmm0
+ call r5
+ movhlps xmm1, xmm0
+ paddw xmm0, xmm1
+ movd eax, xmm0
+ RET
+%endmacro
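+
+; The computed jump works because the 15 loop bodies are emitted back-to-back,
+; each exactly 80 (sse2) or 64 (ssse3) bytes, so sad_w16_align%a sits at
+; sad_w16_align1 + (a-1)*size. sad_w16_addr extrapolates one block backwards
+; (align1 - (align2 - align1)), giving sad_w16_addr + a*size for a = r2&15 in
+; 1..15; a = 0 never reaches here, as the cmp/jle above routes aligned mvs to
+; the plain sse2 version.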
+
+%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
+ mov eax, r2m
+ and eax, 0x17|%1|(%4>>1)
+ cmp eax, 0x10|%1|(%4>>1)
+ jle x264_pixel_sad_%1x%2_mmxext
+ and eax, 7
+ shl eax, 3
+%ifdef PIC32
+ ; both versions work, but picgetgot is slower than gpr->mmx, which is slower than mem->mmx
+ mov r2, 64
+ sub r2, eax
+ movd mm7, eax
+ movd mm6, r2
+%else
+ movd mm6, [sw_64 GLOBAL]
+ movd mm7, eax
+ psubw mm6, mm7
+%endif
+ PROLOGUE 4,5,0
+ and r2, ~7
+ mov r4d, %3
+ pxor mm0, mm0
+%endmacro
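+
+; On exit from SAD_CACHELINE_START_MMX2, mm7 = 8*(addr&7) and mm6 = 64-mm7:
+; the variable bit counts that psrlq/psllq use below to splice two aligned
+; qwords into one unaligned one (mmx shifts take a variable count only from
+; a register, hence the movd setup).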
+
+%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
+cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0
+ SAD_CACHELINE_START_MMX2 16, %1, %1, %2
+.loop:
+ movq mm1, [r2]
+ movq mm2, [r2+8]
+ movq mm3, [r2+16]
+ movq mm4, mm2
+ psrlq mm1, mm7
+ psllq mm2, mm6
+ psllq mm3, mm6
+ psrlq mm4, mm7
+ por mm1, mm2
+ por mm3, mm4
+ psadbw mm1, [r0]
+ psadbw mm3, [r0+8]
+ paddw mm0, mm1
+ paddw mm0, mm3
+ add r2, r3
+ add r0, r1
+ dec r4
+ jg .loop
+ movd eax, mm0
+ RET
+%endmacro
+
+%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
+cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0
+ SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
+.loop:
+ movq mm1, [r2+8]
+ movq mm2, [r2+r3+8]
+ movq mm3, [r2]
+ movq mm4, [r2+r3]
+ psllq mm1, mm6
+ psllq mm2, mm6
+ psrlq mm3, mm7
+ psrlq mm4, mm7
+ por mm1, mm3
+ por mm2, mm4
+ psadbw mm1, [r0]
+ psadbw mm2, [r0+r1]
+ paddw mm0, mm1
+ paddw mm0, mm2
+ lea r2, [r2+2*r3]
+ lea r0, [r0+2*r1]
+ dec r4
+ jg .loop
+ movd eax, mm0
+ RET
+%endmacro
+
+; sad_x3/x4_cache64: check each mv.
+; if none of them splits a cacheline, use the normal sad_x3/x4.
+; otherwise, send them individually to sad_cache64.
+%macro CHECK_SPLIT 3 ; pix, width, cacheline
+ mov eax, %1
+ and eax, 0x17|%2|(%3>>1)
+ cmp eax, 0x10|%2|(%3>>1)
+ jg .split
+%endmacro
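+
+; The mask folds "offset within the cacheline" and "does a %2-wide load
+; cross?" into one compare. E.g. width 16, cacheline 64: the mask
+; 0x17|16|32 = 0x37 drops bit 3, so (pix & 0x37) > 0x30 flags offsets 49-55
+; and 57-63 but not 56 (8-byte alignment exactly halfway, which per the note
+; above is not penalized). Equivalent C predicate (illustrative only):
+;
+;   int is_split_16_64( uintptr_t pix ) { return (pix & 0x37) > 0x30; }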
+
+%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
+cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0
+ CHECK_SPLIT r1m, %1, %3
+ CHECK_SPLIT r2m, %1, %3
+ CHECK_SPLIT r3m, %1, %3
+ jmp x264_pixel_sad_x3_%1x%2_%4
+.split:
+%ifdef ARCH_X86_64
+ push r3
+ push r2
+ mov r2, r1
+ mov r1, FENC_STRIDE
+ mov r3, r4
+ mov r10, r0
+ mov r11, r5
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [r11], eax
+ pop r2
+ mov r0, r10
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [r11+4], eax
+ pop r2
+ mov r0, r10
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [r11+8], eax
+%else
+ push edi
+ mov edi, [esp+28]
+ push dword [esp+24]
+ push dword [esp+16]
+ push dword 16
+ push dword [esp+20]
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+32]
+ mov [edi], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+36]
+ mov [edi+4], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [edi+8], eax
+ add esp, 16
+ pop edi
+%endif
+ ret
+%endmacro
+
+%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
+cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0
+ CHECK_SPLIT r1m, %1, %3
+ CHECK_SPLIT r2m, %1, %3
+ CHECK_SPLIT r3m, %1, %3
+ CHECK_SPLIT r4m, %1, %3
+ jmp x264_pixel_sad_x4_%1x%2_%4
+.split:
+%ifdef ARCH_X86_64
+ mov r11, r6m
+ push r4
+ push r3
+ push r2
+ mov r2, r1
+ mov r1, FENC_STRIDE
+ mov r3, r5
+ mov r10, r0
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [r11], eax
+ pop r2
+ mov r0, r10
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [r11+4], eax
+ pop r2
+ mov r0, r10
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [r11+8], eax
+ pop r2
+ mov r0, r10
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [r11+12], eax
+%else
+ push edi
+ mov edi, [esp+32]
+ push dword [esp+28]
+ push dword [esp+16]
+ push dword 16
+ push dword [esp+20]
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+32]
+ mov [edi], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+36]
+ mov [edi+4], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+40]
+ mov [edi+8], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [edi+12], eax
+ add esp, 16
+ pop edi
+%endif
+ ret
+%endmacro
+
+%macro SADX34_CACHELINE_FUNC 5
+ SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
+ SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
+%endmacro
+
+
+; instantiate the aligned sads
+
+%ifndef ARCH_X86_64
+SAD16_CACHELINE_FUNC_MMX2 8, 32
+SAD16_CACHELINE_FUNC_MMX2 16, 32
+SAD8_CACHELINE_FUNC_MMX2 4, 32
+SAD8_CACHELINE_FUNC_MMX2 8, 32
+SAD8_CACHELINE_FUNC_MMX2 16, 32
+SAD16_CACHELINE_FUNC_MMX2 8, 64
+SAD16_CACHELINE_FUNC_MMX2 16, 64
+%endif ; !ARCH_X86_64
+SAD8_CACHELINE_FUNC_MMX2 4, 64
+SAD8_CACHELINE_FUNC_MMX2 8, 64
+SAD8_CACHELINE_FUNC_MMX2 16, 64
+
+%ifndef ARCH_X86_64
+SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
+%endif ; !ARCH_X86_64
+SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
+
+%ifndef ARCH_X86_64
+SAD16_CACHELINE_FUNC sse2, 8
+SAD16_CACHELINE_FUNC sse2, 16
+%assign i 1
+%rep 15
+SAD16_CACHELINE_LOOP_SSE2 i
+%assign i i+1
+%endrep
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
+SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
+%endif ; !ARCH_X86_64
+
+%ifdef HAVE_SSE3
+SAD16_CACHELINE_FUNC ssse3, 8
+SAD16_CACHELINE_FUNC ssse3, 16
+%assign i 1
+%rep 15
+SAD16_CACHELINE_LOOP_SSSE3 i
+%assign i i+1
+%endrep
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
+SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3
+%endif ; HAVE_SSE3
;*****************************************************************************
-;* i386inc.asm: h264 encoder library
+;* x86inc-32.asm: h264 encoder library
;*****************************************************************************
-;* Copyright (C) 2006 x264 project
+;* Copyright (C) 2006-2008 x264 project
;*
;* Author: Sam Hocevar <sam@zoy.org>
;*
BITS 32
-;=============================================================================
-; Macros and other preprocessor constants
-;=============================================================================
-
-; Symbol prefix for C linkage
-%macro cglobal 1
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
- align 16
- %1:
-%endmacro
-
-%macro cextern 1
- %ifdef PREFIX
- extern _%1
- %define %1 _%1
- %else
- extern %1
- %endif
-%endmacro
-
; Name of the .rodata section. On OS X we cannot use .rodata because NASM
; is unable to compute address offsets outside of .text so we use the .text
; section instead until NASM is fixed.
; mov eax, [esp + 12]
;
%ifdef __PIC__
+ %define PIC32
%ifidn __OUTPUT_FORMAT__,macho
; There is no real global offset table on OS X, but we still
; need to reference our variables by offset.
%define picesp esp
%endif
-%assign FENC_STRIDE 16
-%assign FDEC_STRIDE 32
-
-; This is needed for ELF, otherwise the GNU linker assumes the stack is
-; executable by default.
-%ifidn __OUTPUT_FORMAT__,elf
-SECTION ".note.GNU-stack" noalloc noexec nowrite progbits
-%endif
-
;*****************************************************************************
-;* amd64inc.asm: h264 encoder library
+;* x86inc-64.asm: h264 encoder library
;*****************************************************************************
-;* Copyright (C) 2005 x264 project
+;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Andrew Dunstan
;*
; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
-; This is true in practice (since we never do any 64bit arithmetic on strides),
-; but is not guaranteed by the ABI.
-
-%macro cglobal 1
- %ifdef PREFIX
- global _%1:function hidden
- %define %1 _%1
- %else
- global %1:function hidden
- %endif
-%ifdef WIN64
- %define %1 pad %1
-%endif
- align 16
- %1:
-%endmacro
-
-%macro cextern 1
- %ifdef PREFIX
- extern _%1
- %define %1 _%1
- %else
- extern %1
- %endif
-%endmacro
+; This is true in practice (since we never do any 64bit arithmetic on strides,
+; and x264's strides are all positive), but is not guaranteed by the ABI.
; Name of the .rodata section. On OS X we cannot use .rodata because YASM
; is unable to compute address offsets outside of .text so we use the .text
; section instead until YASM is fixed.
%ifdef __PIC__
%define GLOBAL wrt rip
+ %define PIC64
%else
%define GLOBAL
%endif
-%assign FENC_STRIDE 16
-%assign FDEC_STRIDE 32
-
-; This is needed for ELF, otherwise the GNU linker assumes the stack is
-; executable by default.
-%ifidn __YASM_OBJFMT__,elf
-section ".note.GNU-stack" noalloc noexec nowrite progbits
-%endif
+%macro picgetgot 1
+%endmacro
--- /dev/null
+;*****************************************************************************
+;* x86inc.asm
+;*****************************************************************************
+;* Copyright (C) 2008 Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%ifdef WIN64
+%define ARCH_X86_64
+%endif
+
+%ifdef ARCH_X86_64
+%include "x86inc-64.asm"
+%else
+%include "x86inc-32.asm"
+%endif
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
+; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we'd need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons
+; which are slow when a normal ret follows a branch.
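+
+; Example (illustrative): on x86_32, "cglobal foo, 3,4" loads the three args
+; into r0-r2 (eax, ecx, edx) and, since 4 regs are used, first pushes ebx
+; (r3); the matching RET pops it. On x86_64 the same declaration costs
+; nothing: the args already arrive in r0-r2 and no callee-saved reg is used.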
+
+%macro DECLARE_REG 5
+ %define r%1q %2
+ %define r%1d %3
+ %define r%1w %4
+ ; no r%1b, because some regs don't have a byte form, and anyway x264 doesn't need it
+ %define r%1m %5
+ %define r%1 r%1q
+%endmacro
+
+%macro DECLARE_REG_SIZE 1
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+%ifndef ARCH_X86_64
+ %define r%1 e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax
+DECLARE_REG_SIZE bx
+DECLARE_REG_SIZE cx
+DECLARE_REG_SIZE dx
+DECLARE_REG_SIZE si
+DECLARE_REG_SIZE di
+DECLARE_REG_SIZE bp
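+
+; i.e. code can be written with the 64-bit names (rax, rbx, ...) and on
+; x86_32 they quietly become the 32-bit registers, so most sources assemble
+; unchanged for both targets.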
+
+%ifdef ARCH_X86_64
+ %define push_size 8
+%else
+ %define push_size 4
+%endif
+
+%macro PUSH 1
+ push %1
+ %assign stack_offset stack_offset+push_size
+%endmacro
+
+%macro POP 1
+ pop %1
+ %assign stack_offset stack_offset-push_size
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
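+
+; stack_offset keeps the rNm argument references valid: e.g. on x86_32,
+; after "PUSH ebx" stack_offset becomes 4 and r0m, defined as
+; [esp + stack_offset + 4], still addresses the first argument.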
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assert failed
+ %endif
+%endmacro
+
+%ifdef WIN64 ;================================================================
+
+DECLARE_REG 0, rcx, ecx, cx, ecx
+DECLARE_REG 1, rdx, edx, dx, edx
+DECLARE_REG 2, r8, r8d, r8w, r8d
+DECLARE_REG 3, r9, r9d, r9w, r9d
+DECLARE_REG 4, rdi, edi, di, [rsp + stack_offset + 40]
+DECLARE_REG 5, rsi, esi, si, [rsp + stack_offset + 48]
+DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 56]
+%define r7m [rsp + stack_offset + 64]
+
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [rsp + 8 + %1*8]
+ %endif
+%endmacro
+
+%macro PROLOGUE 3
+ ASSERT %2 >= %1
+ ASSERT %2 <= 7
+ %assign stack_offset 0
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+ LOAD_IF_USED 6, %1
+%endmacro
+
+%macro RET 0
+ ret
+%endmacro
+
+%macro REP_RET 0
+ rep ret
+%endmacro
+
+%elifdef ARCH_X86_64 ;========================================================
+
+DECLARE_REG 0, rdi, edi, di, edi
+DECLARE_REG 1, rsi, esi, si, esi
+DECLARE_REG 2, rdx, edx, dx, edx
+DECLARE_REG 3, rcx, ecx, cx, ecx
+DECLARE_REG 4, r8, r8d, r8w, r8d
+DECLARE_REG 5, r9, r9d, r9w, r9d
+DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 8]
+%define r7m [rsp + stack_offset + 16]
+
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [rsp - 40 + %1*8]
+ %endif
+%endmacro
+
+%macro PROLOGUE 3
+ ASSERT %2 >= %1
+ ASSERT %2 <= 7
+ %assign stack_offset 0
+ LOAD_IF_USED 6, %1
+%endmacro
+
+%macro RET 0
+ ret
+%endmacro
+
+%macro REP_RET 0
+ rep ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, eax, ax, [esp + stack_offset + 4]
+DECLARE_REG 1, ecx, ecx, cx, [esp + stack_offset + 8]
+DECLARE_REG 2, edx, edx, dx, [esp + stack_offset + 12]
+DECLARE_REG 3, ebx, ebx, bx, [esp + stack_offset + 16]
+DECLARE_REG 4, esi, esi, si, [esp + stack_offset + 20]
+DECLARE_REG 5, edi, edi, di, [esp + stack_offset + 24]
+DECLARE_REG 6, ebp, ebp, bp, [esp + stack_offset + 28]
+%define r7m [esp + stack_offset + 32]
+%define rsp esp
+
+%macro PUSH_IF_USED 1 ; reg_id
+ %if %1 < regs_used
+ push r%1
+ %assign stack_offset stack_offset+4
+ %endif
+%endmacro
+
+%macro POP_IF_USED 1 ; reg_id
+ %if %1 < regs_used
+ pop r%1
+ %endif
+%endmacro
+
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [esp + stack_offset + 4 + %1*4]
+ %endif
+%endmacro
+
+%macro PROLOGUE 3
+ ASSERT %2 >= %1
+ %assign stack_offset 0
+ %assign regs_used %2
+ %if %3
+ %assign regs_used regs_used+1
+ %endif
+ ASSERT regs_used <= 7
+ PUSH_IF_USED 3
+ PUSH_IF_USED 4
+ PUSH_IF_USED 5
+ PUSH_IF_USED 6
+ LOAD_IF_USED 0, %1
+ LOAD_IF_USED 1, %1
+ LOAD_IF_USED 2, %1
+ LOAD_IF_USED 3, %1
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+ LOAD_IF_USED 6, %1
+ %if %3
+ picgetgot r%2
+ %endif
+%endmacro
+
+%macro RET 0
+ POP_IF_USED 6
+ POP_IF_USED 5
+ POP_IF_USED 4
+ POP_IF_USED 3
+ ret
+%endmacro
+
+%macro REP_RET 0
+ %if regs_used > 3
+ RET
+ %else
+ rep ret
+ %endif
+%endmacro
+
+%endif ;======================================================================
+
+
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Symbol prefix for C linkage
+%macro cglobal 1
+ %ifidn __OUTPUT_FORMAT__,elf
+ %ifdef PREFIX
+ global _%1:function hidden
+ %define %1 _%1
+ %else
+ global %1:function hidden
+ %endif
+ %else
+ %ifdef PREFIX
+ global _%1
+ %define %1 _%1
+ %else
+ global %1
+ %endif
+ %endif
+%ifdef WIN64
+ %define %1 pad %1
+%endif
+ align function_align
+ %1:
+%endmacro
+
+%macro cglobal 3
+ cglobal %1
+ PROLOGUE %2, %3, 0
+%endmacro
+
+%macro cglobal 4
+ cglobal %1
+ PROLOGUE %2, %3, %4
+%endmacro
+
+%macro cextern 1
+ %ifdef PREFIX
+ extern _%1
+ %define %1 _%1
+ %else
+ extern %1
+ %endif
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION ".note.GNU-stack" noalloc noexec nowrite progbits
+%endif
+
+%assign FENC_STRIDE 16
+%assign FDEC_STRIDE 32
+
#include "common/common.h"
#include "common/cpu.h"
-#ifdef HAVE_MMX
-#include "common/i386/pixel.h"
-#include "common/i386/dct.h"
-#include "common/i386/mc.h"
-#endif
-#ifdef ARCH_PPC
-#include "common/ppc/pixel.h"
-#include "common/ppc/mc.h"
-#endif
/* buf1, buf2: initialised to random data; the tests shouldn't write into them */
uint8_t * buf1, * buf2;
for( j=0; j<4; j++ )
dc[j] = rand() & 0x3fff;
used_asm = 1;
- mvn_c = pixel_c.ads[i&3]( dc, sums, 32, cost_mv, mvs_c, 32, thresh );
- mvn_a = pixel_asm.ads[i&3]( dc, sums, 32, cost_mv, mvs_a, 32, thresh );
+ mvn_c = pixel_c.ads[i&3]( dc, sums, 32, cost_mv, mvs_c, 28, thresh );
+ mvn_a = pixel_asm.ads[i&3]( dc, sums, 32, cost_mv, mvs_a, 28, thresh );
if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
+ {
ok = 0;
+ printf("c%d: ", i&3);
+ for(j=0; j<mvn_c; j++)
+ printf("%d ", mvs_c[j]);
+ printf("\na%d: ", i&3);
+ for(j=0; j<mvn_a; j++)
+ printf("%d ", mvs_a[j]);
+ printf("\n\n");
+ }
}
report( "esa ads:" );
uint8_t *src = &buf1[2*32+2];
uint8_t *src2[4] = { &buf1[2*32+2], &buf1[6*32+2],
&buf1[10*32+2], &buf1[14*32+2] };
- uint8_t *dst1 = &buf3[2*32+2];
- uint8_t *dst2 = &buf4[2*32+2];
+ uint8_t *dst1 = &buf3[2*32];
+ uint8_t *dst2 = &buf4[2*32];
- int dx, dy, i, j, w;
+ int dx, dy, i, j, k, w;
int ret = 0, ok, used_asm;
x264_mc_init( 0, &mc_c );
x264_pixel_init( 0, &pixel );
#define MC_TEST_LUMA( w, h ) \
- if( mc_a.mc_luma != mc_ref.mc_luma ) \
+ if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
{ \
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- mc_c.mc_luma( dst1, 16, src2, 32, dx, dy, w, h ); \
- mc_a.mc_luma( dst2, 16, src2, 32, dx, dy, w, h ); \
+ mc_c.mc_luma( dst1, 32, src2, 16, dx, dy, w, h ); \
+ mc_a.mc_luma( dst2, 32, src2, 16, dx, dy, w, h ); \
if( memcmp( buf3, buf4, 1024 ) ) \
{ \
fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
if( mc_a.get_ref != mc_ref.get_ref ) \
{ \
uint8_t *ref = dst2; \
- int ref_stride = 16; \
+ int ref_stride = 32; \
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- mc_c.mc_luma( dst1, 16, src2, 32, dx, dy, w, h ); \
- ref = mc_a.get_ref( ref, &ref_stride, src2, 32, dx, dy, w, h ); \
- if( pixel.sad[PIXEL_##w##x##h]( dst1, 16, ref, ref_stride ) ) \
- { \
- fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
- ok = 0; \
- } \
+ mc_c.mc_luma( dst1, 32, src2, 16, dx, dy, w, h ); \
+ ref = mc_a.get_ref( ref, &ref_stride, src2, 16, dx, dy, w, h ); \
+ for( i=0; i<h; i++ ) \
+ if( memcmp( dst1+i*32, ref+i*ref_stride, w ) ) \
+ { \
+ fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
+ ok = 0; \
+ break; \
+ } \
}
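+/* Note: get_ref may return a pointer straight into the reference plane
+ * (with that plane's stride) rather than copying into dst2, so the check
+ * above compares just the used w x h region row by row. */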
#define MC_TEST_CHROMA( w, h ) \
for( dy = -8; dy < 8; dy++ )
for( dx = -8; dx < 8; dx++ )
{
+ MC_TEST_LUMA( 20, 18 );
MC_TEST_LUMA( 16, 16 );
MC_TEST_LUMA( 16, 8 );
+ MC_TEST_LUMA( 12, 10 );
MC_TEST_LUMA( 8, 16 );
MC_TEST_LUMA( 8, 8 );
MC_TEST_LUMA( 8, 4 );
MC_TEST_AVG( avg_weight, w );
report( "mc wpredb :" );
+ if( mc_a.hpel_filter != mc_ref.hpel_filter )
+ {
+ uint8_t *src = buf1+16+2*64;
+ uint8_t *dstc[3] = { buf3+16, buf3+16+16*64, buf3+16+32*64 };
+ uint8_t *dsta[3] = { buf4+16, buf4+16+16*64, buf4+16+32*64 };
+ ok = 1; used_asm = 1;
+ memset( buf3, 0, 4096 );
+ memset( buf4, 0, 4096 );
+ mc_c.hpel_filter( dstc[0], dstc[1], dstc[2], src, 64, 48, 10 );
+ mc_a.hpel_filter( dsta[0], dsta[1], dsta[2], src, 64, 48, 10 );
+ for( i=0; i<3; i++ )
+ for( j=0; j<10; j++ )
+ //FIXME ideally the first pixels would match too, but they aren't actually used
+ if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 46 ) )
+ {
+ ok = 0;
+ fprintf( stderr, "hpel filter differs at plane %c line %d\n", "hvc"[i], j );
+ for( k=0; k<48; k++ )
+ printf("%02x%s", dstc[i][j*64+k], (k+1)&3 ? "" : " ");
+ printf("\n");
+ for( k=0; k<48; k++ )
+ printf("%02x%s", dsta[i][j*64+k], (k+1)&3 ? "" : " ");
+ printf("\n");
+ break;
+ }
+ report( "hpel filter :" );
+ }
+
return ret;
}
x264_predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
-#define INTRA_TEST( name, dir, ... ) \
+#define INTRA_TEST( name, dir, w, ... ) \
if( ip_a.name[dir] != ip_ref.name[dir] )\
{ \
used_asm = 1; \
for(k=-1; k<16; k++)\
printf("%2x ", edge[16+k]);\
printf("\n");\
- for(j=0; j<8; j++){\
+ for(j=0; j<w; j++){\
printf("%2x ", edge[14-j]);\
- for(k=0; k<8; k++)\
+ for(k=0; k<w; k++)\
printf("%2x ", buf4[48+k+j*32]);\
printf("\n");\
}\
printf("\n");\
- for(j=0; j<8; j++){\
+ for(j=0; j<w; j++){\
printf(" ");\
- for(k=0; k<8; k++)\
+ for(k=0; k<w; k++)\
printf("%2x ", buf3[48+k+j*32]);\
printf("\n");\
}\
}
for( i = 0; i < 12; i++ )
- INTRA_TEST( predict_4x4, i );
+ INTRA_TEST( predict_4x4, i, 4 );
for( i = 0; i < 7; i++ )
- INTRA_TEST( predict_8x8c, i );
+ INTRA_TEST( predict_8x8c, i, 8 );
for( i = 0; i < 7; i++ )
- INTRA_TEST( predict_16x16, i );
+ INTRA_TEST( predict_16x16, i, 16 );
for( i = 0; i < 12; i++ )
- INTRA_TEST( predict_8x8, i, edge );
+ INTRA_TEST( predict_8x8, i, 8, edge );
report( "intra pred :" );
return ret;
buf1 = x264_malloc( 1024 ); /* 32 x 32 */
buf2 = x264_malloc( 1024 );
- buf3 = x264_malloc( 1024 );
- buf4 = x264_malloc( 1024 );
+ buf3 = x264_malloc( 4096 );
+ buf4 = x264_malloc( 4096 );
i = ( argc > 1 ) ? atoi(argv[1]) : x264_mdate();
fprintf( stderr, "x264: using random seed %u\n", i );