1 ;*****************************************************************************
2 ;* dct-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 ;* Min Chen <chenm001@163.com>
8 ;* Loren Merritt <lorenm@u.washington.edu>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 ;*****************************************************************************
79 ;-----------------------------------------------------------------------------
80 ; input ABCD output ADTC
81 ;-----------------------------------------------------------------------------
; Transpose a 4x4 matrix of 16-bit words held in MMX regs %1-%4, with %5 as
; scratch.  Two word-granularity SBUTTERFLY interleaves followed by two
; dword-granularity ones; the result rows come out in a permuted register
; order (callers annotate the exact in/out register mapping at each use).
; NOTE(review): %endmacro is elided in this excerpt — confirm against full file.
%macro TRANSPOSE4x4W 5
83 SBUTTERFLY q, wd, %1, %2, %5
84 SBUTTERFLY q, wd, %3, %4, %2
85 SBUTTERFLY q, dq, %1, %3, %4
86 SBUTTERFLY q, dq, %5, %2, %3
; STORE_DIFF_4P %1=data, %2=scratch, %3=rounding const, %4=scratch, %5=dest mem
; (argument roles inferred from the call sites in x264_add4x4_idct_mmx, which
; pass pw_32 as %3 and [r0+N*FDEC_STRIDE] as %5).  Presumably rounds the
; residual and adds it to the destination pixels with saturation —
; NOTE(review): macro body is elided in this excerpt; confirm against full file.
89 %macro STORE_DIFF_4P 5
99 ;-----------------------------------------------------------------------------
100 ; void x264_dct4x4dc_mmx( int16_t d[4][4] )
101 ;-----------------------------------------------------------------------------
; In-place 2D Hadamard transform of the 4x4 luma-DC block:
; butterflies on rows, transpose, butterflies on columns.
; NOTE(review): the initial loads of mm0-mm3 from [r0] and the final
; scaling/stores are elided in this excerpt.
102 cglobal x264_dct4x4dc_mmx, 1,1,1
; First (vertical) butterfly stage on the loaded rows.
108 SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
109 SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
111 TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
; Second (horizontal) butterfly stage after the transpose.
113 SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
114 SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
; pw_1 is loaded for the rounding of the final >>1 normalization
; (the add/shift/store sequence itself is elided in this excerpt).
116 movq mm6, [pw_1 GLOBAL]
131 ;-----------------------------------------------------------------------------
132 ; void x264_idct4x4dc_mmx( int16_t d[4][4] )
133 ;-----------------------------------------------------------------------------
; In-place inverse Hadamard of the 4x4 DC block.  Same butterfly/transpose/
; butterfly structure as x264_dct4x4dc_mmx above; the Hadamard transform is
; its own inverse up to scaling, and this variant omits the pw_1 rounding step.
; NOTE(review): loads of mm0-mm3 and the final stores are elided in this excerpt.
134 cglobal x264_idct4x4dc_mmx, 1,1
; First (vertical) butterfly stage.
140 SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
141 SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
143 TRANSPOSE4x4W mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
; Second (horizontal) butterfly stage after the transpose.
145 SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
146 SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
154 ;-----------------------------------------------------------------------------
155 ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
156 ;-----------------------------------------------------------------------------
; Forward 4x4 H.264 integer DCT of the pixel difference pix1 - pix2.
; r0 = output dct, r1 = pix1 (FENC_STRIDE), r2 = pix2 (FDEC_STRIDE).
; NOTE(review): the final stores of the four result rows to [r0] are elided
; in this excerpt.
157 cglobal x264_sub4x4_dct_mmx, 3,3
; Load the four difference rows (pix1 - pix2) as 16-bit words.
162 LOAD_DIFF_4P mm0, mm6, mm7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
163 LOAD_DIFF_4P mm1, mm6, mm7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
164 LOAD_DIFF_4P mm2, mm6, mm7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
165 LOAD_DIFF_4P mm3, mm6, mm7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
; Vertical 1D DCT: butterflies pairing outer (0,3) and inner (1,2) rows,
; with the asymmetric 2:1 weighting of the H.264 transform (SUMSUB2_AB).
167 SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
169 SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12
170 SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12
172 ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
173 TRANSPOSE4x4W mm2, mm0, mm3, mm4, mm1
; Horizontal 1D DCT: same butterfly pattern on the transposed rows.
175 SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12
177 SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
178 SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
186 ;-----------------------------------------------------------------------------
187 ; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
188 ;-----------------------------------------------------------------------------
; Inverse 4x4 H.264 integer DCT of dct[] with the result added to the
; destination pixels.  r0 = dst (FDEC_STRIDE), r1 = dct coefficients.
; NOTE(review): the loads of rows 1-3 (mm1-mm3) from [r1+8..24] are elided
; in this excerpt.
189 cglobal x264_add4x4_idct_mmx, 2,2,1
192 movq mm0, [r1+ 0] ; dct
; Vertical 1D IDCT: even rows (0,2) plain butterfly; odd rows (1,3) use the
; >>1-weighted butterfly of the inverse transform (SUMSUBD2_AB).
197 SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
198 SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
200 SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13
202 ; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0
203 TRANSPOSE4x4W mm1, mm4, mm0, mm2, mm3
; Horizontal 1D IDCT on the transposed rows.
205 SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02
206 SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
208 SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
; pw_32 provides the +32 rounding for the final >>6 before adding to dst.
211 movq mm6, [pw_32 GLOBAL]
213 STORE_DIFF_4P mm2, mm0, mm6, mm7, [r0+0*FDEC_STRIDE]
214 STORE_DIFF_4P mm4, mm0, mm6, mm7, [r0+1*FDEC_STRIDE]
215 STORE_DIFF_4P mm1, mm0, mm6, mm7, [r0+2*FDEC_STRIDE]
216 STORE_DIFF_4P mm3, mm0, mm6, mm7, [r0+3*FDEC_STRIDE]
222 ;-----------------------------------------------------------------------------
223 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
224 ;-----------------------------------------------------------------------------
; NOTE(review): these `add` lines are the pointer-stepping interior of the
; SUB_NxN_DCT macro (the %macro header, the calls to the sub-block routine,
; and %endmacro are elided in this excerpt).  Between the four sub-block
; calls, r1/r2 are advanced by %4 bytes and re-based against the respective
; FENC/FDEC strides to walk the NxN area in quadrants — mirrors ADD_NxN_IDCT
; below, which adjusts only r0.  Confirm against the full file.
230 add r1, %4-%5*FENC_STRIDE
231 add r2, %4-%5*FDEC_STRIDE
234 add r1, %4*FENC_STRIDE-%6
235 add r2, %4*FDEC_STRIDE-%6
238 add r1, %4-%5*FENC_STRIDE
239 add r2, %4-%5*FDEC_STRIDE
243 ;-----------------------------------------------------------------------------
244 ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
245 ;-----------------------------------------------------------------------------
; ADD_NxN_IDCT %1=name, %2=sub-block idct, %3=dct block size in bytes,
; %4/%5/%6 = pointer-stepping constants (roles inferred from the
; instantiations below).  Emits a function that applies the sub-block idct
; to each quadrant, stepping the destination pointer r0 between calls.
; NOTE(review): the cglobal line, the sub-block calls, and %endmacro are
; elided in this excerpt; confirm against the full file.
246 %macro ADD_NxN_IDCT 6
250 add r0, %4-%5*FDEC_STRIDE
253 add r0, %4*FDEC_STRIDE-%6
256 add r0, %4-%5*FDEC_STRIDE
; Instantiate the NxN wrappers: 8x8 from the 4x4 MMX kernels (entering at
; .skip_prologue to reuse the already-set-up registers), then 16x16 from 8x8.
261 SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 4
262 ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 4
264 SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 4, 4, 12
265 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 4, 4, 12
; 16x16 8x8-transform wrappers around the SSE2 8x8 kernels defined in
; another file (hence cextern; no .skip_prologue entry is used here).
268 cextern x264_sub8x8_dct8_sse2
269 cextern x264_add8x8_idct8_sse2
270 SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8
271 ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
275 ;-----------------------------------------------------------------------------
276 ; void x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
277 ;-----------------------------------------------------------------------------
; Field (interlaced) zigzag scan of a 4x4 coefficient block, widening the
; int16_t dct values into the int level[] array.  r0 = level, r1 = dct.
; punpck{l,h}wd interleave against [r1+16] (rows 2-3) as part of the
; reordering/widening.  NOTE(review): the function continues past the end of
; this excerpt (earlier loads into xmm2/xmm3 and all stores are not visible).
278 cglobal x264_zigzag_scan_4x4_field_sse2, 2,2
281 punpcklwd xmm2, [r1+16]
282 punpckhwd xmm3, [r1+16]