;*****************************************************************************
;* dct-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*          Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
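; Each entry is the raster index of the next coefficient visited by the 4x4
; frame zigzag scan; equivalently, the scan walks the block in this order:
;    0  1  5  6
;    2  4  7 12
;    3  8 11 13
;    9 10 14 15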
;-----------------------------------------------------------------------------
; input ABCD output ADTC
;-----------------------------------------------------------------------------
%macro TRANSPOSE4x4W 5
    SBUTTERFLY q, wd, %1, %2, %5
    SBUTTERFLY q, wd, %3, %4, %2
    SBUTTERFLY q, dq, %1, %3, %4
    SBUTTERFLY q, dq, %5, %2, %3
%endmacro
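; Assuming the usual x264 SBUTTERFLY (copy, punpckl, punpckh): the two wd
; passes interleave rows A/B and C/D, and the two dq passes leave the
; transposed rows in %1, %4, %5, %3 -- hence "ADTC", with the scratch
; register %5 (T) holding row 2. Callers must pick the results up from that
; permuted register order.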
%macro STORE_DIFF_4P 5 ; args as used below: coefs, tmp, round, zero, dst
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
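; Reference math: this is the 4x4 Hadamard transform of the DC coefficients,
; applied along each dimension in turn --
;     {a,b,c,d} -> { (a+b)+(c+d), (a+b)-(c+d), (a-b)-(c-d), (a-b)+(c-d) }
; -- followed by a rounded halving (x+1)>>1, for which the pw_1 constant
; loaded below supplies the +1 bias.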
cglobal x264_dct4x4dc_mmx, 1,1,1
    SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
    SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
    TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
    SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
    SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
    movq  mm6, [pw_1 GLOBAL]
;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
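; The Hadamard butterfly is self-inverse up to scale, so the inverse DC
; transform repeats the same sum/difference passes; unlike the forward
; version there is no (x+1)>>1 rounding here, the remaining scale being
; folded into (de)quantization.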
cglobal x264_idct4x4dc_mmx, 1,1
    SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
    SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
    TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
    SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
    SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
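; Reference math (the standard H.264 forward 4x4 transform, matching the
; register comments below): for the four residuals p0..p3 of a line,
;     s03 = p0+p3   d03 = p0-p3   s12 = p1+p2   d12 = p1-p2
;     out0 = s03+s12        out1 = 2*d03+d12
;     out2 = s03-s12        out3 = d03-2*d12
; applied along one dimension, then (after the transpose) along the other.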
cglobal x264_sub4x4_dct_mmx, 3,3
    LOAD_DIFF_4P mm0, mm6, mm7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF_4P mm1, mm6, mm7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    LOAD_DIFF_4P mm2, mm6, mm7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF_4P mm3, mm6, mm7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
    SUMSUB_BA   mm2, mm3           ; mm2=s03+s12 mm3=s03-s12
    SUMSUB2_AB  mm0, mm1, mm4      ; mm0=2.d03+d12 mm4=d03-2.d12
    ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
    TRANSPOSE4x4W mm2, mm0, mm3, mm4, mm1
    SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12
    SUMSUB_BA   mm1, mm3           ; mm1=s03+s12 mm3=s03-s12
    SUMSUB2_AB  mm2, mm4, mm0      ; mm2=2.d03+d12 mm0=d03-2.d12
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
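; Reference math (the standard H.264 inverse 4x4 transform, matching the
; register comments below): for the four coefficients c0..c3 of a line,
;     s02 = c0+c2          d02 = c0-c2
;     s13 = c1+(c3>>1)     d13 = (c1>>1)-c3
;     out = { s02+s13, d02+d13, d02-d13, s02-s13 }
; per dimension; the result is rounded with (x+32)>>6 as it is added back
; onto the prediction, which is what the pw_32 constant is for.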
cglobal x264_add4x4_idct_mmx, 2,2,1
    movq  mm0, [r1+ 0] ; dct
    SUMSUB_BA   mm2, mm0           ; mm2=s02 mm0=d02
    SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 (s13 = c1+(c3>>1), d13 = (c1>>1)-c3)
    SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13
    ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0
    TRANSPOSE4x4W mm1, mm4, mm0, mm2, mm3
    SUMSUB_BA   mm3, mm1           ; mm3=s02 mm1=d02
    SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 (s13 = c1+(c3>>1), d13 = (c1>>1)-c3)
    SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
    movq  mm6, [pw_32 GLOBAL]
    STORE_DIFF_4P mm2, mm0, mm6, mm7, [r0+0*FDEC_STRIDE]
    STORE_DIFF_4P mm4, mm0, mm6, mm7, [r0+1*FDEC_STRIDE]
    STORE_DIFF_4P mm1, mm0, mm6, mm7, [r0+2*FDEC_STRIDE]
    STORE_DIFF_4P mm3, mm0, mm6, mm7, [r0+3*FDEC_STRIDE]
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
    add  r1, %4-%5*FENC_STRIDE
    add  r2, %4-%5*FDEC_STRIDE
    add  r1, %4*FENC_STRIDE-%6
    add  r2, %4*FDEC_STRIDE-%6
    add  r1, %4-%5*FENC_STRIDE
    add  r2, %4-%5*FDEC_STRIDE
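; These adjustments step pix1/pix2 through the four sub-blocks in Z order
; between the per-block dct calls: for the 8x8-from-4x4 instantiation below
; (%4=4, %5=0, %6=4) they evaluate to +4, +4*stride-4, +4, i.e. right, then
; down-and-left, then right again, while the dct output pointer advances %3
; bytes per call.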
;-----------------------------------------------------------------------------
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
    add  r0, %4-%5*FDEC_STRIDE
    add  r0, %4*FDEC_STRIDE-%6
    add  r0, %4-%5*FDEC_STRIDE
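; Same Z-order walk on the idct side: r0 steps through the four destination
; sub-blocks between calls while the dct input pointer advances %3 bytes per
; sub-block.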
SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 4
ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 4
SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 4, 4, 12
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 4, 4, 12

cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0, 8
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
    pshufw     mm0, [r1+4], 0xd2
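; pshufw places source word ((imm>>(2*i))&3) into destination word i, so
; 0xd2 (binary 11 01 00 10) builds {w2,w0,w1,w3} from the four words at
; r1+4, reordering coefficients 2..5 into {4,2,3,5}. Only those four
; positions change under the field scan; coefficients 0,1 and 6..15 can be
; copied through unchanged.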
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
    movd  xmm0, [r1+0*FENC_STRIDE]
    movd  xmm1, [r1+1*FENC_STRIDE]
    movd  xmm2, [r1+2*FENC_STRIDE]
    movd  xmm3, [r1+3*FENC_STRIDE]
    movd  xmm4, [r2+0*FDEC_STRIDE]
    movd  xmm5, [r2+1*FDEC_STRIDE]
    movd  xmm6, [r2+2*FDEC_STRIDE]
    movd  xmm7, [r2+3*FDEC_STRIDE]
    movd  [r2+0*FDEC_STRIDE], xmm0
    movd  [r2+1*FDEC_STRIDE], xmm1
    movd  [r2+2*FDEC_STRIDE], xmm2
    movd  [r2+3*FDEC_STRIDE], xmm3
    movdqa xmm7, [pb_zigzag4 GLOBAL]
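; pb_zigzag4 doubles as a pshufb control vector: once the 16 pixels of each
; 4x4 block are packed into a single xmm register (one byte per pixel), one
; pshufb against this table puts a whole block into zigzag order, so both
; source and prediction can be scanned before the subtraction that produces
; the int16 levels. The movd stores above copy the source block over the
; reconstruction, since lossless bypass reconstructs the input exactly.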