1 ;*****************************************************************************
2 ;* dct-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 ;* Min Chen <chenm001@163.com>
8 ;* Loren Merritt <lorenm@u.washington.edu>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 ;*****************************************************************************
; Byte table of raster indices in 4x4 frame (progressive) zigzag order;
; loaded into xmm7 by the ssse3 zigzag routine below — presumably used as a
; pshufb shuffle mask (TODO confirm: the pshufb itself is outside this excerpt).
30 pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
; Transpose a 4x4 matrix of packed 16-bit words held in registers %1-%4,
; using %5 as scratch: one interleave pass at word granularity, then one at
; dword granularity. (SBUTTERFLY is defined elsewhere in the project.)
81 %macro TRANSPOSE4x4W 5
82 SBUTTERFLY wd, %1, %2, %5
83 SBUTTERFLY wd, %3, %4, %5
84 SBUTTERFLY dq, %1, %3, %5
85 SBUTTERFLY dq, %2, %4, %5
; NOTE(review): the macro's tail (orig. lines 86-88, including %endmacro and
; any final register SWAP) is missing from this excerpt.
; STORE_DIFF_4P: used below to add a 4-word residual row to a predicted row
; and store 4 pixels (see its call sites in x264_add4x4_idct_mmx).
89 %macro STORE_DIFF_4P 5
; NOTE(review): the STORE_DIFF_4P body (orig. lines 90-99) is missing from
; this excerpt; the two SUMSUB_BADC lines below belong to a different macro
; whose header is also missing — likely a butterfly pass of the 4x4 DC
; transform. TODO confirm against the full file.
100 SUMSUB_BADC m%2, m%1, m%4, m%3
101 SUMSUB_BADC m%4, m%2, m%3, m%1
105 ;-----------------------------------------------------------------------------
106 ; void x264_dct4x4dc_mmx( int16_t d[4][4] )
107 ;-----------------------------------------------------------------------------
; In-place transform of the 4x4 DC coefficient block at r0.
; cglobal: 1 arg (r0 = d), 1 GPR used.
108 cglobal x264_dct4x4dc_mmx, 1,1,1
; NOTE(review): the loads and first butterfly pass (orig. lines 109-113)
; are missing from this excerpt.
114 TRANSPOSE4x4W 0,1,2,3,4
; pw_1 — presumably the +1 rounding bias applied before a >>1 of the result;
; TODO confirm: the shift/store tail (orig. lines 117+) is not visible here.
116 movq m6, [pw_1 GLOBAL]
131 ;-----------------------------------------------------------------------------
132 ; void x264_idct4x4dc_mmx( int16_t d[4][4] )
133 ;-----------------------------------------------------------------------------
; In-place inverse transform of the 4x4 DC block at r0 (no rounding constant
; loaded here, unlike the forward dct4x4dc above).
134 cglobal x264_idct4x4dc_mmx, 1,1
; NOTE(review): the loads and first butterfly pass (orig. lines 135-139),
; plus everything after the transpose, are missing from this excerpt.
140 TRANSPOSE4x4W 0,1,2,3,4
; NOTE(review): the two fragments below are bodies of 1-D transform macros
; (likely DCT4_1D and IDCT4_1D) whose %macro headers and %endmacro lines
; (orig. lines 148, 153-156, 160) are missing from this excerpt.
; First fragment: forward butterfly + SUMSUB2_AB, then a register rotation.
149 SUMSUB_BADC m%4, m%1, m%3, m%2
151 SUMSUB2_AB m%1, m%2, m%5
152 SWAP %1, %3, %4, %5, %2
; Second fragment: inverse pass — SUMSUBD2_AB suggests a half-sample
; (shift-by-one) butterfly; TODO confirm against the helper macro definitions.
157 SUMSUBD2_AB m%2, m%4, m%6, m%5
158 SUMSUB_BADC m%2, m%3, m%5, m%1
159 SWAP %1, %2, %5, %4, %3
162 ;-----------------------------------------------------------------------------
163 ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
164 ;-----------------------------------------------------------------------------
; Forward 4x4 DCT of the pixel difference pix1 - pix2.
; r0 = dct output, r1 = pix1 (FENC_STRIDE), r2 = pix2 (FDEC_STRIDE).
165 cglobal x264_sub4x4_dct_mmx, 3,3
; Load the four 4-pixel difference rows into m0-m3 (m6/m7 are scratch).
167 LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
168 LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
169 LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
170 LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
; NOTE(review): the 1-D transform passes around this transpose (orig. lines
; 171, 173+) are missing from this excerpt.
172 TRANSPOSE4x4W 0,1,2,3,4
180 ;-----------------------------------------------------------------------------
181 ; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
182 ;-----------------------------------------------------------------------------
; Inverse 4x4 DCT of dct[] and add the residual to the pixels at p_dst.
; r0 = p_dst (FDEC_STRIDE), r1 = dct input.
183 cglobal x264_add4x4_idct_mmx, 2,2,1
; NOTE(review): the coefficient loads and first 1-D pass (orig. lines
; 184-189), and the second pass after the transpose (191-192), are missing
; from this excerpt.
190 TRANSPOSE4x4W 0,1,2,3,4
; pw_32 — presumably the +32 rounding bias for the final >>6 normalization;
; TODO confirm against the STORE_DIFF_4P body (not visible here).
193 movq m6, [pw_32 GLOBAL]
; Add each residual row (m0-m3) to the corresponding destination row.
194 STORE_DIFF_4P m0, m4, m6, m7, [r0+0*FDEC_STRIDE]
195 STORE_DIFF_4P m1, m4, m6, m7, [r0+1*FDEC_STRIDE]
196 STORE_DIFF_4P m2, m4, m6, m7, [r0+2*FDEC_STRIDE]
197 STORE_DIFF_4P m3, m4, m6, m7, [r0+3*FDEC_STRIDE]
202 ;-----------------------------------------------------------------------------
203 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
204 ;-----------------------------------------------------------------------------
; NOTE(review): this is the interior of the SUB_NxN_DCT macro; its %macro
; header (orig. lines 205-209), the interleaved calls to the sub-block DCT,
; and %endmacro are missing from this excerpt. The visible adds step the
; source (r1, FENC_STRIDE) and prediction (r2, FDEC_STRIDE) pointers between
; sub-block calls; %4/%5/%6 are macro parameters controlling the step sizes
; (set at the instantiations below).
210 add r1, %4-%5*FENC_STRIDE
211 add r2, %4-%5*FDEC_STRIDE
214 add r1, %4*FENC_STRIDE-%6
215 add r2, %4*FDEC_STRIDE-%6
218 add r1, %4-%5*FENC_STRIDE
219 add r2, %4-%5*FDEC_STRIDE
223 ;-----------------------------------------------------------------------------
224 ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
225 ;-----------------------------------------------------------------------------
; Builds an NxN add-idct from four calls to a smaller add-idct (%2), mirroring
; SUB_NxN_DCT above. Params: %1 = name, %2 = sub-block function, %3/%4/%5/%6
; = sizes/strides used by the pointer stepping (see instantiations below).
226 %macro ADD_NxN_IDCT 6
; NOTE(review): the prologue, the calls to %2, and %endmacro (orig. lines
; 227-229, 231-232, 234-235, 237+) are missing from this excerpt; only the
; destination-pointer stepping between sub-block calls is visible.
230 add r0, %4-%5*FDEC_STRIDE
233 add r0, %4*FDEC_STRIDE-%6
236 add r0, %4-%5*FDEC_STRIDE
; Instantiate the composite transforms:
; 8x8 mmx = four 4x4 mmx calls (entering at .skip_prologue to skip per-call
; argument setup).
241 SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 4
242 ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 4
; 16x16 mmx = four 8x8 mmx calls.
244 SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 4, 4, 12
245 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 4, 4, 12
; The 8x8 sse2 transforms live in another file; 16x16 sse2 = four of them.
248 cextern x264_sub8x8_dct8_sse2
249 cextern x264_add8x8_idct8_sse2
250 SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8
251 ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
256 ;-----------------------------------------------------------------------------
257 ; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
258 ;-----------------------------------------------------------------------------
259 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
; Reorder the 4x4 coefficients into field (interlaced) zigzag order.
; r0 = level output, r1 = dct input; cglobal: 2 args, 3 GPRs.
260 cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
; Shuffle words 2..5 of the dct (imm 0xd2 = ordering 3,1,0,2) — part of the
; field-scan reordering; the rest of the body (orig. lines 262+) is missing
; from this excerpt.
261 pshufw mm0, [r1+4], 0xd2
274 ;-----------------------------------------------------------------------------
275 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
276 ;-----------------------------------------------------------------------------
; Compute src - dst as a zigzag-ordered level[] and copy src into dst.
; r0 = level output, r1 = src (FENC_STRIDE), r2 = dst (FDEC_STRIDE).
277 cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
; Load the four 4-pixel rows of src (xmm0-3) and dst (xmm4-7).
278 movd xmm0, [r1+0*FENC_STRIDE]
279 movd xmm1, [r1+1*FENC_STRIDE]
280 movd xmm2, [r1+2*FENC_STRIDE]
281 movd xmm3, [r1+3*FENC_STRIDE]
282 movd xmm4, [r2+0*FDEC_STRIDE]
283 movd xmm5, [r2+1*FDEC_STRIDE]
284 movd xmm6, [r2+2*FDEC_STRIDE]
285 movd xmm7, [r2+3*FDEC_STRIDE]
; Overwrite dst with the src rows (the dst values survive in xmm4-7).
286 movd [r2+0*FDEC_STRIDE], xmm0
287 movd [r2+1*FDEC_STRIDE], xmm1
288 movd [r2+2*FDEC_STRIDE], xmm2
289 movd [r2+3*FDEC_STRIDE], xmm3
; NOTE(review): the interleave/subtract steps (orig. lines 290-296) are
; missing from this excerpt. Reloading xmm7 here implies the saved dst row
; has already been consumed by that missing code. The zigzag table is
; presumably applied via pshufb next (not visible) — TODO confirm.
297 movdqa xmm7, [pb_zigzag4 GLOBAL]