1 ;*****************************************************************************
2 ;* dct-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Min Chen <chenm001@163.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
26 %include "x86util.asm"
; Byte-index table giving the H.264 4x4 frame zigzag scan order over a
; row-major block; loaded as a shuffle-control vector by
; x264_zigzag_sub_4x4_frame_ssse3 (presumably consumed by pshufb --
; the shuffle instruction itself is outside this chunk, TODO confirm).
31 pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
; Transpose a 4x4 matrix of 16-bit words held in registers %1-%4, using
; %5 as scratch: two word interleaves followed by two dword interleaves
; form the butterfly network of the transpose.
; NOTE(review): the macro tail (final register SWAP and %endmacro) is
; elided from this chunk -- do not assume the output register order.
42 %macro TRANSPOSE4x4W 5
43 SBUTTERFLY wd, %1, %2, %5
44 SBUTTERFLY wd, %3, %4, %5
45 SBUTTERFLY dq, %1, %3, %5
46 SBUTTERFLY dq, %2, %4, %5
; Transpose two independent 4x4 word matrices packed side-by-side in
; %1-%4 (one matrix per 64-bit half), %5 = scratch. Same butterfly as
; TRANSPOSE4x4W plus a final qword interleave stage to keep each 4x4
; sub-matrix within its own register half.
; NOTE(review): the macro tail (any trailing SWAP and %endmacro) is
; elided from this chunk.
50 %macro TRANSPOSE2x4x4W 5
51 SBUTTERFLY wd, %1, %2, %5
52 SBUTTERFLY wd, %3, %4, %5
53 SBUTTERFLY dq, %1, %3, %5
54 SBUTTERFLY dq, %2, %4, %5
55 SBUTTERFLY qdq, %1, %2, %5
56 SBUTTERFLY qdq, %3, %4, %5
; NOTE(review): fragment of a butterfly macro body -- its %macro header
; and %endmacro are outside this chunk (the m%N operands show these
; lines use macro parameters). Two chained SUMSUB_BADC add/sub passes
; form a 4-point Hadamard-style butterfly, presumably serving the dc
; transforms below -- confirm enclosing macro name against the full file.
60 SUMSUB_BADC m%2, m%1, m%4, m%3
61 SUMSUB_BADC m%4, m%2, m%3, m%1
65 ;-----------------------------------------------------------------------------
66 ; void x264_dct4x4dc_mmx( int16_t d[4][4] )
67 ;-----------------------------------------------------------------------------
; In-place transform of the 4x4 array of DC coefficients.
; In: r0 = d (single register arg per the cglobal declaration).
; NOTE(review): most of the body is elided from this chunk -- the loads,
; the butterfly passes, the use of the pw_1 constant, and the stores are
; not visible here.
68 cglobal x264_dct4x4dc_mmx, 1,1,1
; Swap rows and columns between the two 1-D transform passes.
74 TRANSPOSE4x4W 0,1,2,3,4
; pw_1 = all-ones word vector; presumably a rounding bias for a
; (x+1)>>1 normalization in the elided code -- TODO confirm.
76 movq m6, [pw_1 GLOBAL]
91 ;-----------------------------------------------------------------------------
92 ; void x264_idct4x4dc_mmx( int16_t d[4][4] )
93 ;-----------------------------------------------------------------------------
; In-place inverse transform of the 4x4 DC-coefficient array.
; In: r0 = d.
; NOTE(review): everything but the inter-pass transpose is elided from
; this chunk (loads, butterflies, stores not visible).
94 cglobal x264_idct4x4dc_mmx, 1,1
; Swap rows and columns between the two 1-D transform passes.
100 TRANSPOSE4x4W 0,1,2,3,4
; NOTE(review): fragments of two 1-D butterfly macros (their %macro
; headers and parts of their bodies are outside this chunk; the m%N
; operands show these lines use macro parameters). The first group is
; presumably the forward-DCT column pass, the second the inverse --
; TODO confirm macro names against the full file.
109 SUMSUB_BADC m%4, m%1, m%3, m%2
; SUMSUB2_AB: add/sub pair with a 2x weighting (defined in x86util.asm,
; not visible here).
111 SUMSUB2_AB m%1, m%2, m%5
; Rotate register numbering so the caller sees results in a fixed order.
112 SWAP %1, %3, %4, %5, %2
; SUMSUBD2_AB: add/sub pair with a halved term (defined in x86util.asm).
117 SUMSUBD2_AB m%2, m%4, m%6, m%5
118 SUMSUB_BADC m%2, m%3, m%5, m%1
119 SWAP %1, %2, %5, %4, %3
122 ;-----------------------------------------------------------------------------
123 ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
124 ;-----------------------------------------------------------------------------
; Forward 4x4 transform of the pixel difference pix1 - pix2.
; In: r0 = dct output, r1 = pix1 (stride FENC_STRIDE),
;     r2 = pix2 (stride FDEC_STRIDE).
125 cglobal x264_sub4x4_dct_mmx, 3,3
; m0..m3 <- one 4-pixel row each of (pix1 - pix2) widened to words;
; m6/m7 are scratch used by the LOAD_DIFF macro (x86util.asm).
128 LOAD_DIFF m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
129 LOAD_DIFF m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
130 LOAD_DIFF m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
131 LOAD_DIFF m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
; NOTE(review): the `%1` parameter implies this line belongs to a macro
; body whose header is elided from this chunk (the transpose variant is
; selected by a macro argument) -- this is not part of the cglobal body
; above as displayed.
133 TRANSPOSE%1 0,1,2,3,4
143 ;-----------------------------------------------------------------------------
144 ; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
145 ;-----------------------------------------------------------------------------
; Inverse 4x4 transform of dct, added into the destination pixels.
; In: r0 = p_dst (stride FDEC_STRIDE), r1 = dct.
; NOTE(review): the coefficient loads and the transform passes are
; elided from this chunk.
146 cglobal x264_add4x4_idct_mmx, 2,2,1
; NOTE(review): `%1` implies this line sits inside a macro body whose
; header is not visible here (transpose variant is a macro argument).
154 TRANSPOSE%1 0,1,2,3,4
; +32 rounding bias on the first row; presumably paired with a >>6
; final shift inside STORE_DIFF -- TODO confirm against x86util.asm.
155 paddw m0, [pw_32 GLOBAL]
; STORE_DIFF (x86util.asm, not visible here): presumably packs each
; residual row and adds it to the destination pixels with saturation.
158 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
159 STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
160 STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
161 STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
; 8x8 forward DCT, sse2. In: r0 = dct out, r1 = pix1, r2 = pix2.
; NOTE(review): nearly all of the body is elided from this chunk; only
; the pointer advance to the lower half of the block is visible.
168 cglobal x264_sub8x8_dct_sse2, 3,3
; Step both pixel pointers down 4 rows (second 8x4 half of the block).
172 add r1, 4*FENC_STRIDE
173 add r2, 4*FDEC_STRIDE
; 8x8 inverse DCT added into destination pixels, sse2.
; In: r0 = dst, r1 = dct.
; NOTE(review): nearly all of the body is elided from this chunk; only
; the pointer advance to the lower half of the block is visible.
182 cglobal x264_add8x8_idct_sse2, 2,2,1
; Step the destination pointer down 4 rows (second 8x4 half).
186 add r0, 4*FDEC_STRIDE
199 ;-----------------------------------------------------------------------------
200 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
201 ;-----------------------------------------------------------------------------
; NOTE(review): body fragment of the SUB_NxN_DCT tiling macro -- its
; %macro header, the sub-block calls interleaved between these adds,
; and %endmacro are all outside this chunk. The add lines re-aim the
; two pixel pointers between consecutive sub-block calls; %4/%5/%6 are
; presumably the dct sub-block byte size and the horizontal/vertical
; pixel offsets -- TODO confirm parameter meanings against the full
; macro definition.
207 add r1, %4-%5-%6*FENC_STRIDE
208 add r2, %4-%5-%6*FDEC_STRIDE
211 add r1, (%4-%6)*FENC_STRIDE-%5-%4
212 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
215 add r1, %4-%5-%6*FENC_STRIDE
216 add r2, %4-%5-%6*FDEC_STRIDE
220 ;-----------------------------------------------------------------------------
221 ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
222 ;-----------------------------------------------------------------------------
; Tiling macro: builds an NxN inverse transform out of four calls to a
; smaller-block kernel. Args (6): presumably the generated function
; name, the sub-block kernel, dct sub-block byte size, and pixel
; offsets -- TODO confirm against SUB_NxN_DCT in the full file.
; NOTE(review): the sub-block calls between these pointer adjustments
; and the %endmacro are elided from this chunk.
223 %macro ADD_NxN_IDCT 6
; Re-aim the destination pointer between consecutive sub-block calls.
227 add r0, %4-%5-%6*FDEC_STRIDE
230 add r0, (%4-%6)*FDEC_STRIDE-%5-%4
233 add r0, %4-%5-%6*FDEC_STRIDE
; Instantiate the tiling macros: 8x8 from 4x4, 16x16 from 8x8, for both
; mmx and sse2. The `%+ .skip_prologue` suffix targets a label past the
; callee's prologue -- presumably so the tiled sub-calls reuse the outer
; function's register/argument setup (TODO confirm against the kernels'
; .skip_prologue labels, which are outside this chunk).
239 SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
240 ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
241 SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
242 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
; The 8x8-transform (dct8) kernels live in another file; declare their
; skip_prologue entry points before tiling 16x16 versions from them.
244 cextern x264_sub8x8_dct8_mmx.skip_prologue
245 cextern x264_add8x8_idct8_mmx.skip_prologue
246 SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
247 ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
; Alias the sse2 dct8 kernel names to their skip_prologue labels so the
; instantiations below (which pass the bare names) still skip the
; prologue.
248 %define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
249 %define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
252 SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
253 ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
255 cextern x264_sub8x8_dct8_sse2
256 cextern x264_add8x8_idct8_sse2
; These expand through the %defines above to the .skip_prologue labels.
257 SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
258 ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
262 ;-----------------------------------------------------------------------------
263 ; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
264 ;-----------------------------------------------------------------------------
265 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
; Field-mode zigzag reorder of a 4x4 coefficient block.
; In: r0 = level out, r1 = dct in (r2 available as scratch).
; NOTE(review): all but the first shuffle is elided from this chunk.
266 cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
; Load dct words 2..5 ([r1+4]) and reorder them: imm 0xd2 selects
; source words 2,0,1,3 (low to high).
267 pshufw mm0, [r1+4], 0xd2
279 ;-----------------------------------------------------------------------------
280 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
281 ;-----------------------------------------------------------------------------
; Zigzag-scan the difference src - dst into level, while copying the
; source pixels over the dst rows.
; In: r0 = level out, r1 = src (FENC_STRIDE), r2 = dst (FDEC_STRIDE).
; NOTE(review): the subtraction, widening, and the pshufb zigzag
; shuffle are elided from this chunk; only the loads, the src->dst
; copy, and the table load are visible.
282 cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
; xmm0..xmm3 <- the four 4-byte source rows.
283 movd xmm0, [r1+0*FENC_STRIDE]
284 movd xmm1, [r1+1*FENC_STRIDE]
285 movd xmm2, [r1+2*FENC_STRIDE]
286 movd xmm3, [r1+3*FENC_STRIDE]
; xmm4..xmm7 <- the four 4-byte destination rows (the values to be
; subtracted, presumably in the elided code -- xmm7 is reloaded below).
287 movd xmm4, [r2+0*FDEC_STRIDE]
288 movd xmm5, [r2+1*FDEC_STRIDE]
289 movd xmm6, [r2+2*FDEC_STRIDE]
290 movd xmm7, [r2+3*FDEC_STRIDE]
; Overwrite the destination rows with the source pixels (the copy half
; of this routine's contract).
291 movd [r2+0*FDEC_STRIDE], xmm0
292 movd [r2+1*FDEC_STRIDE], xmm1
293 movd [r2+2*FDEC_STRIDE], xmm2
294 movd [r2+3*FDEC_STRIDE], xmm3
; Load the zigzag shuffle-control table (clobbers the dst row held in
; xmm7 -- the elided code between lines 294 and 302 presumably consumed
; it already).
302 movdqa xmm7, [pb_zigzag4 GLOBAL]