;*****************************************************************************
;* dct-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Min Chen <chenm001@163.com>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86util.asm"

pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
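; (the 4x4 frame zigzag order as raster indices, kept in byte form so a
;  single pshufb can do the whole reorder; consumed by
;  x264_zigzag_sub_4x4_frame_ssse3 below)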
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
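; Two chained sum/difference passes form a 4-point Hadamard butterfly.
; In rough scalar terms (a sketch; the exact output ordering and signs
; follow from the operand swaps above):
;     s01 = a+b;  d01 = a-b;  s23 = c+d;  d23 = c-d;
;     out = { s01+s23, s01-s23, d01+d23, d01-d23 }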
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx, 1,1,1
    TRANSPOSE4x4W 0,1,2,3,4
    movq  m6, [pw_1 GLOBAL]
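; pw_1 supplies the (x+1)>>1 rounding of the final stage.  A scalar
; sketch of what the routine computes on the 4x4 block of DC
; coefficients (one butterfly per row, transpose, one per column;
; names illustrative):
;     s01 = d0+d1;  d01 = d0-d1;  s23 = d2+d3;  d23 = d2-d3;
;     out = { s01+s23, s01-s23, d01-d23, d01+d23 }
; with ((x+1)>>1) applied to the results of the second pass.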
;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx, 1,1
    TRANSPOSE4x4W 0,1,2,3,4
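; Same butterfly as dct4x4dc above (rows, transpose, columns), but the
; inverse DC transform stores its outputs without the (x+1)>>1 rounding.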
    SUMSUB_BADC m%4, m%1, m%3, m%2
    SUMSUB2_AB  m%1, m%2, m%5
    SWAP %1, %3, %4, %5, %2
    SUMSUBD2_AB m%2, m%4, m%6, m%5
    SUMSUB_BADC m%2, m%3, m%5, m%1
    SWAP %1, %2, %5, %4, %3
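; The two macro fragments above are the forward and inverse 1-D passes
; of the H.264 4x4 integer transform.  In scalar form (names
; illustrative):
;   forward:  s=d0+d3; t=d1+d2; u=d0-d3; v=d1-d2;
;             out = { s+t, 2*u+v, s-t, u-2*v }
;   inverse:  e0=d0+d2; e1=d0-d2; e2=(d1>>1)-d3; e3=d1+(d3>>1);
;             out = { e0+e3, e1+e2, e1-e2, e0-e3 }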
;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_mmx, 3,3
    LOAD_DIFF m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    LOAD_DIFF m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    TRANSPOSE%1 0,1,2,3,4
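; Rough C equivalent of the routine (a sketch, not the reference
; implementation):
;     int16_t d[4][4];
;     for( y = 0; y < 4; y++ )
;         for( x = 0; x < 4; x++ )
;             d[y][x] = pix1[y*FENC_STRIDE+x] - pix2[y*FDEC_STRIDE+x];
;     /* then: forward 1-D pass on rows, transpose, forward 1-D pass on
;        columns, result written to dct[4][4] */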
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx, 2,2,1
    TRANSPOSE%1 0,1,2,3,4
    paddw m0, [pw_32 GLOBAL]
    STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
    STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
    STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
    STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
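; The pw_32 bias added above reaches every output of the final pass, so
; each STORE_DIFF effectively does, per pixel (a scalar sketch):
;     p_dst[x] = clip_uint8( p_dst[x] + ((coef + 32) >> 6) );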
cglobal x264_sub8x8_dct_sse2, 3,3
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE

cglobal x264_add8x8_idct_sse2, 2,2,1
    add r0, 4*FDEC_STRIDE
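; Rough description: the SSE2 versions hold a full 8-coefficient row per
; xmm register, so an 8x8 block is handled as two side-by-side 4x4
; blocks per pass; after the top pair the pixel pointers advance by
; 4 rows (the adds above) and the bottom pair is processed the same way.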
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
    add  r1, (%4-%6)*FENC_STRIDE-%5-%4
    add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
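; Pointer bookkeeping for the NxN wrappers: between calls to the smaller
; transform, the pixel pointers walk the sub-blocks in the order
; top-left, top-right, bottom-left, bottom-right.  %4 is the sub-block
; width in pixels; %5/%6 appear to compensate for whatever horizontal/
; vertical advance the callee itself leaves on the pointers (0,0 for the
; 4x4 kernels, 4,4 for the mmx 8x8 wrappers).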
;-----------------------------------------------------------------------------
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r0, (%4-%6)*FDEC_STRIDE-%5-%4
    add  r0, %4-%5-%6*FDEC_STRIDE
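; Same sub-block walk as SUB_NxN_DCT, but only the destination pixel
; pointer r0 needs adjusting; the coefficient pointer presumably just
; advances linearly through dct[] between calls.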
SUB_NxN_DCT  x264_sub8x8_dct_mmx,     x264_sub4x4_dct_mmx  %+ .skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx,    x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT  x264_sub16x16_dct_mmx,   x264_sub8x8_dct_mmx  %+ .skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx,  x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4

cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx  %+ .skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0

%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2  %+ .skip_prologue, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4

cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
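; The .skip_prologue entry points let the NxN wrappers branch into the
; smaller kernels past their argument-loading prologues: the wrapper has
; already set up the registers once, so the sub-calls skip that work.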
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
    pshufw     mm0, [r1+4], 0xd2
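; imm8 0xd2 = 11 01 00 10b: destination words take source words
; {2,0,1,3}, so the four coefficients loaded from dct[2..5] come out in
; raster order {4,2,3,5}; that is the only reordering the 4x4 field scan
; needs here, the rest of the block being copied straight through by the
; remainder of the routine.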
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
    movd  xmm0, [r1+0*FENC_STRIDE]
    movd  xmm1, [r1+1*FENC_STRIDE]
    movd  xmm2, [r1+2*FENC_STRIDE]
    movd  xmm3, [r1+3*FENC_STRIDE]
    movd  xmm4, [r2+0*FDEC_STRIDE]
    movd  xmm5, [r2+1*FDEC_STRIDE]
    movd  xmm6, [r2+2*FDEC_STRIDE]
    movd  xmm7, [r2+3*FDEC_STRIDE]
    movd  [r2+0*FDEC_STRIDE], xmm0
    movd  [r2+1*FDEC_STRIDE], xmm1
    movd  [r2+2*FDEC_STRIDE], xmm2
    movd  [r2+3*FDEC_STRIDE], xmm3
    movdqa xmm7, [pb_zigzag4 GLOBAL]
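; Rough C equivalent of the routine (a sketch; zigzag[] here is the
; pb_zigzag4 table above, giving raster indices in scan order):
;     for( i = 0; i < 16; i++ )
;         level[i] = src[(zigzag[i]>>2)*FENC_STRIDE + (zigzag[i]&3)]
;                  - dst[(zigzag[i]>>2)*FDEC_STRIDE + (zigzag[i]&3)];
;     /* and, as the movd stores above show, the source 4x4 block is
;        copied over dst so the reconstruction matches the source */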