1 ;*****************************************************************************
2 ;* dct-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Min Chen <chenm001@163.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
; Zigzag scan order for a 4x4 block (frame scan): byte i is the raster
; index of the i-th coefficient in scan order. Used as a pshufb mask.
30 pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
; Transpose a 4x4 matrix of 16-bit words held in registers %1-%4,
; using %5 as scratch: two word-granularity interleaves followed by
; two dword-granularity interleaves.
; NOTE(review): the closing %endmacro (and any trailing register SWAP)
; is not visible in this view of the file.
81 %macro TRANSPOSE4x4W 5
82 SBUTTERFLY wd, %1, %2, %5
83 SBUTTERFLY wd, %3, %4, %5
84 SBUTTERFLY dq, %1, %3, %5
85 SBUTTERFLY dq, %2, %4, %5
; Transpose two 4x4 word matrices packed in the halves of four
; registers %1-%4 (%5 = scratch). Same interleave ladder as
; TRANSPOSE4x4W plus a final qword stage for the wider registers.
; NOTE(review): %endmacro is not visible in this view of the file.
89 %macro TRANSPOSE2x4x4W 5
90 SBUTTERFLY wd, %1, %2, %5
91 SBUTTERFLY wd, %3, %4, %5
92 SBUTTERFLY dq, %1, %3, %5
93 SBUTTERFLY dq, %2, %4, %5
94 SBUTTERFLY qdq, %1, %2, %5
95 SBUTTERFLY qdq, %3, %4, %5
; STORE_DIFF_4P: add a row of residual to 4 reconstructed pixels and
; store (see uses below). Only the macro header is visible here; the
; body and %endmacro fall outside this view of the file.
98 %macro STORE_DIFF_4P 4
; One-dimensional 4-point Hadamard transform on registers m%1-m%4,
; built from two SUMSUB_BADC butterfly stages.
; NOTE(review): any trailing SWAP and the %endmacro are not visible in
; this view of the file.
107 %macro HADAMARD4_1D 4
108 SUMSUB_BADC m%2, m%1, m%4, m%3
109 SUMSUB_BADC m%4, m%2, m%3, m%1
113 ;-----------------------------------------------------------------------------
114 ; void x264_dct4x4dc_mmx( int16_t d[4][4] )
115 ;-----------------------------------------------------------------------------
; In-place transform of the 4x4 DC coefficient block at r0.
; Body shown here is partial (this view of the file has gaps): rows are
; transposed between 1-D transform passes.
116 cglobal x264_dct4x4dc_mmx, 1,1,1
122 TRANSPOSE4x4W 0,1,2,3,4
; pw_1 is loaded into m6 — presumably a rounding constant for a
; following shift; the consuming instructions are not in view.
124 movq m6, [pw_1 GLOBAL]
139 ;-----------------------------------------------------------------------------
140 ; void x264_idct4x4dc_mmx( int16_t d[4][4] )
141 ;-----------------------------------------------------------------------------
; In-place inverse transform of the 4x4 DC block at r0. Body shown here
; is partial (this view of the file has gaps); the transpose sits
; between the two 1-D transform passes.
142 cglobal x264_idct4x4dc_mmx, 1,1
148 TRANSPOSE4x4W 0,1,2,3,4
; NOTE(review): the lines below are interior fragments of two butterfly
; macros — their %macro headers and %endmacro lines are not in this
; view. The first group (SUMSUB_BADC/SUMSUB2_AB/SWAP) looks like a 1-D
; forward DCT stage; the second (SUMSUBD2_AB/SUMSUB_BADC/SWAP) like the
; matching 1-D inverse stage. The SWAPs renumber registers so callers
; see results in a fixed register order — confirm against full source.
157 SUMSUB_BADC m%4, m%1, m%3, m%2
159 SUMSUB2_AB m%1, m%2, m%5
160 SWAP %1, %3, %4, %5, %2
165 SUMSUBD2_AB m%2, m%4, m%6, m%5
166 SUMSUB_BADC m%2, m%3, m%5, m%1
167 SWAP %1, %2, %5, %4, %3
170 ;-----------------------------------------------------------------------------
171 ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
172 ;-----------------------------------------------------------------------------
; Compute the 4x4 residual transform of pix1 - pix2 into dct.
; Loads one 4-pixel difference row per register (m0-m3), with m6/m7 as
; scratch; r1 walks pix1 at FENC_STRIDE, r2 walks pix2 at FDEC_STRIDE.
173 cglobal x264_sub4x4_dct_mmx, 3,3
176 LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
177 LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
178 LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
179 LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
; TRANSPOSE%1 references macro parameter %1, so this line must sit
; inside a %macro body (size-parameterized transpose) — the enclosing
; macro's header is not in this view; confirm against full source.
181 TRANSPOSE%1 0,1,2,3,4
191 ;-----------------------------------------------------------------------------
192 ; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
193 ;-----------------------------------------------------------------------------
; Inverse-transform dct and add the residual onto the 4x4 pixel block
; at p_dst (rows FDEC_STRIDE apart). Body shown here is partial.
194 cglobal x264_add4x4_idct_mmx, 2,2,1
; TRANSPOSE%1 uses macro parameter %1 — this line lives inside a
; %macro whose header is outside this view; confirm against full source.
202 TRANSPOSE%1 0,1,2,3,4
; +32 rounding bias before the final downshift of the inverse
; transform (the shift itself is not in view).
203 paddw m0, [pw_32 GLOBAL]
; Add each residual row to the reconstructed pixels and store.
206 STORE_DIFF_4P m0, m4, m7, [r0+0*FDEC_STRIDE]
207 STORE_DIFF_4P m1, m4, m7, [r0+1*FDEC_STRIDE]
208 STORE_DIFF_4P m2, m4, m7, [r0+2*FDEC_STRIDE]
209 STORE_DIFF_4P m3, m4, m7, [r0+3*FDEC_STRIDE]
; 8x8 residual DCT, SSE2 path (args as in x264_sub8x8_dct_mmx above).
; Body shown here is partial; these adds advance the pixel pointers by
; four rows to process the lower half of the block.
216 cglobal x264_sub8x8_dct_sse2, 3,3
220 add r1, 4*FENC_STRIDE
221 add r2, 4*FDEC_STRIDE
; 8x8 inverse DCT + add, SSE2 path (args as in x264_add8x8_idct_mmx
; below). Body shown here is partial; the add moves the destination
; pointer down four rows for the lower half.
230 cglobal x264_add8x8_idct_sse2, 2,2,1
234 add r0, 4*FDEC_STRIDE
247 ;-----------------------------------------------------------------------------
248 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
249 ;-----------------------------------------------------------------------------
; NOTE(review): the lines below are interior fragments of the
; SUB_NxN_DCT macro (header not in this view; instantiations appear
; further down with args: name, sub-block func, dct size, pixel size,
; and two offset params). Between calls to the smaller transform they
; step pix1/pix2 to the next sub-block: %4/%5 adjust the dct/byte
; offset, %6 scales the stride step — confirm exact roles against the
; full macro.
255 add r1, %4-%5-%6*FENC_STRIDE
256 add r2, %4-%5-%6*FDEC_STRIDE
; Move down %4-%6 rows while rewinding the horizontal offset.
259 add r1, (%4-%6)*FENC_STRIDE-%5-%4
260 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
263 add r1, %4-%5-%6*FENC_STRIDE
264 add r2, %4-%5-%6*FDEC_STRIDE
268 ;-----------------------------------------------------------------------------
269 ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
270 ;-----------------------------------------------------------------------------
; Build an NxN inverse-transform-and-add from a smaller one: %1 = new
; function name, %2 = sub-block function, %3-%6 = dct size / pixel
; size / offset params. Body shown here is partial; the adds step the
; destination pointer between sub-block calls (mirrors SUB_NxN_DCT).
271 %macro ADD_NxN_IDCT 6
275 add r0, %4-%5-%6*FDEC_STRIDE
278 add r0, (%4-%6)*FDEC_STRIDE-%5-%4
281 add r0, %4-%5-%6*FDEC_STRIDE
; Instantiate the composite transforms. `%+` pastes the .skip_prologue
; suffix onto the callee name so the wrapper jumps past the callee's
; argument-loading prologue (registers are already set up).
; 8x8 from four 4x4 MMX transforms:
287 SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
288 ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
; 16x16 from four 8x8 MMX transforms:
289 SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
290 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
; 8x8-transform variants live in another file; declare their
; .skip_prologue entry points.
292 cextern x264_sub8x8_dct8_mmx.skip_prologue
293 cextern x264_add8x8_idct8_mmx.skip_prologue
294 SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
295 ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
; SSE2: redirect the plain names to the .skip_prologue entries via
; %define instead of %+ pasting (the callees are %defined/external).
296 %define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
297 %define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
300 SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
301 ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
303 cextern x264_sub8x8_dct8_sse2
304 cextern x264_add8x8_idct8_sse2
; These names are %defined above to .skip_prologue, so no %+ here.
305 SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
306 ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
310 ;-----------------------------------------------------------------------------
311 ; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
312 ;-----------------------------------------------------------------------------
313 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
; Reorder a 4x4 coefficient block into field (interlaced) scan order.
; Body shown here is partial: the pshufw permutes words 1-4 of the dct
; (imm 0xd2 = order 2,0,3,1) as the first step of the field scan.
314 cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
315 pshufw mm0, [r1+4], 0xd2
327 ;-----------------------------------------------------------------------------
328 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
329 ;-----------------------------------------------------------------------------
; Fused subtract + zigzag: level = zigzag(src - dst), and dst is
; overwritten with src. Body shown here is partial.
330 cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
; Load the four 4-byte rows of src (FENC_STRIDE apart) ...
331 movd xmm0, [r1+0*FENC_STRIDE]
332 movd xmm1, [r1+1*FENC_STRIDE]
333 movd xmm2, [r1+2*FENC_STRIDE]
334 movd xmm3, [r1+3*FENC_STRIDE]
; ... and the four rows of dst (FDEC_STRIDE apart).
335 movd xmm4, [r2+0*FDEC_STRIDE]
336 movd xmm5, [r2+1*FDEC_STRIDE]
337 movd xmm6, [r2+2*FDEC_STRIDE]
338 movd xmm7, [r2+3*FDEC_STRIDE]
; Copy the src rows over dst (reconstruction buffer now equals src).
339 movd [r2+0*FDEC_STRIDE], xmm0
340 movd [r2+1*FDEC_STRIDE], xmm1
341 movd [r2+2*FDEC_STRIDE], xmm2
342 movd [r2+3*FDEC_STRIDE], xmm3
; Zigzag shuffle mask for the pshufb reorder (table defined above).
350 movdqa xmm7, [pb_zigzag4 GLOBAL]