ifneq ($(AS),)
X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
- cpu-32.asm dct-32.asm
+ cpu-32.asm dct-32.asm x86util.asm
X86SRC = $(X86SRC0:%=common/x86/%)
ifeq ($(ARCH),X86)
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION_RODATA
SECTION .text
-%macro SUMSUB_BA 2
- paddw %1, %2
- paddw %2, %2
- psubw %2, %1
-%endmacro
-
%macro SBUTTERFLY 4 ; %1=unit, %2=a, %3=b, %4=tmp: a=interleave-low(a,b), b=interleave-high(a,b)
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
SWAP %3, %4
%endmacro
-%macro LOAD_DIFF_8P 4
- movh %1, %3
- movh %2, %4
- punpcklbw %1, %2
- punpcklbw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro STORE_DIFF_8P 4
- psraw %1, 6
- movh %3, %2
- punpcklbw %3, %4
- paddsw %1, %3
- packuswb %1, %1
- movh %2, %1
-%endmacro
-
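; NOTE: the LOAD_DIFF/STORE_DIFF used below come from the new shared
; x86util.asm, whose contents this patch does not show. Reconstructed from the
; per-file copies removed above (along with the other removed macros), they
; presumably look like the sketch below; the third LOAD_DIFF argument is a
; zeroed register, or "none" to use the byte-interleave trick instead:

%macro LOAD_DIFF 5 ; dst, tmp, zero or "none", [pix1], [pix2]
%ifidn %3, none
    movh      %1, %4
    movh      %2, %5
    punpcklbw %1, %2    ; words = pix1 + 256*pix2
    punpcklbw %2, %2    ; words = pix2 + 256*pix2
    psubw     %1, %2    ; words = pix1 - pix2
%else
    movh      %1, %4
    punpcklbw %1, %3    ; zero-extend pix1 bytes to words
    movh      %2, %5
    punpcklbw %2, %3    ; zero-extend pix2 bytes to words
    psubw     %1, %2
%endif
%endmacro

%macro STORE_DIFF 4 ; coefs, tmp, zero, [pix]
    psraw     %1, 6     ; descale
    movh      %2, %4
    punpcklbw %2, %3    ; zero-extend the prediction bytes to words
    paddsw    %1, %2
    packuswb  %1, %1    ; clip to 0..255
    movh      %4, %1
%endmacro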
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
INIT_MMX
ALIGN 16
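; helper: builds half of the 8x8 difference block, 4 word-sized differences per
; mmx register, into m0..m7; m0 is parked in [r0] while it serves as scratch
; for the last LOAD_DIFF, then reloaded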
load_diff_4x8_mmx:
- LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
- LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
- LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+ LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+ LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
movq [r0], m0
- LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
- LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+ LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+ LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
movq m0, [r0]
ret
cglobal x264_sub8x8_dct8_sse2, 3,3
global x264_sub8x8_dct8_sse2 %+ .skip_prologue
.skip_prologue:
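; x86_32 has only 8 xmm registers, so SPILL/UNSPILL (helper macros defined
; outside this hunk) park intermediate rows in memory around the 1-D passes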
- LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
- LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
- LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+ LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+ LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
SPILL r0, 0
- LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
- LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+ LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+ LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
UNSPILL r0, 0
DCT8_1D 0,1,2,3,4,5,6,7,r0
UNSPILL r0, 0,4
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6,7
pxor m7, m7
- STORE_DIFF_8P m0, [r0+FDEC_STRIDE*0], m6, m7
- STORE_DIFF_8P m1, [r0+FDEC_STRIDE*1], m6, m7
- STORE_DIFF_8P m2, [r0+FDEC_STRIDE*2], m6, m7
- STORE_DIFF_8P m3, [r0+FDEC_STRIDE*3], m6, m7
- STORE_DIFF_8P m4, [r0+FDEC_STRIDE*4], m6, m7
- STORE_DIFF_8P m5, [r0+FDEC_STRIDE*5], m6, m7
+ STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*0]
+ STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*1]
+ STORE_DIFF m2, m6, m7, [r0+FDEC_STRIDE*2]
+ STORE_DIFF m3, m6, m7, [r0+FDEC_STRIDE*3]
+ STORE_DIFF m4, m6, m7, [r0+FDEC_STRIDE*4]
+ STORE_DIFF m5, m6, m7, [r0+FDEC_STRIDE*5]
UNSPILL_SHUFFLE r1, 0,1, 6,7
- STORE_DIFF_8P m0, [r0+FDEC_STRIDE*6], m6, m7
- STORE_DIFF_8P m1, [r0+FDEC_STRIDE*7], m6, m7
+ STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*6]
+ STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*7]
ret
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION_RODATA
pw_32: times 8 dw 32
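; rounding bias for the final >>6 performed inside STORE_DIFF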
INIT_XMM
-%macro LOAD_DIFF_8P 5
- movq %1, %4
- punpcklbw %1, %3
- movq %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endmacro
-
-%macro SUMSUB_BA 2
- paddw %1, %2
- paddw %2, %2
- psubw %2, %1
-%endmacro
-
%macro SBUTTERFLY 4 ; %1=unit, %2=a, %3=b, %4=tmp: a=interleave-low(a,b), b=interleave-high(a,b)
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
SWAP %3, %4
%endmacro
-%macro STORE_DIFF_8P 4
- psraw %1, 6
- movq %2, %4
- punpcklbw %2, %3
- paddsw %1, %2
- packuswb %1, %1
- movq %4, %1
-%endmacro
-
SECTION .text
%macro DCT8_1D 10
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2
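; on x86_64 the extra registers xmm8/xmm9 serve as temporaries, so the whole
; 8x8 block stays in registers with no spilling to memory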
- LOAD_DIFF_8P m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF_8P m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF_8P m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF_8P m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
- LOAD_DIFF_8P m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
- LOAD_DIFF_8P m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
- LOAD_DIFF_8P m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
- LOAD_DIFF_8P m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+ LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+ LOAD_DIFF m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+ LOAD_DIFF m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+ LOAD_DIFF m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
DCT8_1D 0,1,2,3,4,5,6,7,8,9
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
IDCT8_1D 0,1,2,3,4,5,6,7,8,9
pxor m9, m9
- STORE_DIFF_8P m0, m8, m9, [r0+0*FDEC_STRIDE]
- STORE_DIFF_8P m1, m8, m9, [r0+1*FDEC_STRIDE]
- STORE_DIFF_8P m2, m8, m9, [r0+2*FDEC_STRIDE]
- STORE_DIFF_8P m3, m8, m9, [r0+3*FDEC_STRIDE]
- STORE_DIFF_8P m4, m8, m9, [r0+4*FDEC_STRIDE]
- STORE_DIFF_8P m5, m8, m9, [r0+5*FDEC_STRIDE]
- STORE_DIFF_8P m6, m8, m9, [r0+6*FDEC_STRIDE]
- STORE_DIFF_8P m7, m8, m9, [r0+7*FDEC_STRIDE]
+ STORE_DIFF m0, m8, m9, [r0+0*FDEC_STRIDE]
+ STORE_DIFF m1, m8, m9, [r0+1*FDEC_STRIDE]
+ STORE_DIFF m2, m8, m9, [r0+2*FDEC_STRIDE]
+ STORE_DIFF m3, m8, m9, [r0+3*FDEC_STRIDE]
+ STORE_DIFF m4, m8, m9, [r0+4*FDEC_STRIDE]
+ STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
+ STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
+ STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
ret
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION_RODATA
pw_1: times 8 dw 1
SECTION .text
-%macro LOAD_DIFF_4P 5
- movh %1, %4
- punpcklbw %1, %3
- movh %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endmacro
-
-%macro SUMSUB_BA 2
- paddw %1, %2
- paddw %2, %2
- psubw %2, %1
-%endmacro
-
-%macro SUMSUB_BADC 4
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
-%endmacro
-
-%macro SUMSUB2_AB 3
- mova %3, %1
- paddw %1, %1
- paddw %1, %2
- psubw %3, %2
- psubw %3, %2
-%endmacro
-
-%macro SUMSUBD2_AB 4
- mova %4, %1
- mova %3, %2
- psraw %2, 1
- psraw %4, 1
- paddw %1, %2
- psubw %4, %3
-%endmacro
-
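; the SUMSUB* butterflies removed above presumably move to x86util.asm
; unchanged: SUMSUB_BA a,b -> a+b, b-a; SUMSUB_BADC does two such pairs;
; SUMSUB2_AB a,b,t -> 2a+b in a, a-2b in t; SUMSUBD2_AB a,b,t1,t2 -> a+(b>>1)
; in a, (a>>1)-b in t2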
%macro SBUTTERFLY 4 ; %1=unit, %2=a, %3=b, %4=tmp: a=interleave-low(a,b), b=interleave-high(a,b)
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
SWAP %3, %4
%endmacro
-%macro STORE_DIFF_4P 4
- psraw %1, 6
- movh %2, %4
- punpcklbw %2, %3
- paddsw %1, %2
- packuswb %1, %1
- movh %4, %1
-%endmacro
-
%macro HADAMARD4_1D 4
SUMSUB_BADC m%2, m%1, m%4, m%3
SUMSUB_BADC m%4, m%2, m%3, m%1
cglobal x264_sub4x4_dct_mmx, 3,3
.skip_prologue:
%macro SUB_DCT4 1
- LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
DCT4_1D 0,1,2,3,4
TRANSPOSE%1 0,1,2,3,4
DCT4_1D 0,1,2,3,4
paddw m0, [pw_32 GLOBAL]
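; +32 rounds the upcoming >>6 (performed inside STORE_DIFF) to nearest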
IDCT4_1D 0,1,2,3,4,5
pxor m7, m7
- STORE_DIFF_4P m0, m4, m7, [r0+0*FDEC_STRIDE]
- STORE_DIFF_4P m1, m4, m7, [r0+1*FDEC_STRIDE]
- STORE_DIFF_4P m2, m4, m7, [r0+2*FDEC_STRIDE]
- STORE_DIFF_4P m3, m4, m7, [r0+3*FDEC_STRIDE]
+ STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
+ STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
+ STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
+ STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
%endmacro
ADD_IDCT4 4x4W
RET