;*****************************************************************************
;* dct-64.asm: x86_64 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

%ifndef HIGH_BIT_DEPTH
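; DCT8_1D: one 1-D pass of the 8x8 transform over eight xmm rows, built
; entirely from adds, subtracts and shifts. The s*/d*/a*/b* names in the
; comments below follow the scalar reference (a sketch, after DCT8_1D in
; x264's C dct.c):
;     s07 = x0+x7   s16 = x1+x6   s25 = x2+x5   s34 = x3+x4
;     d07 = x0-x7   d16 = x1-x6   d25 = x2-x5   d34 = x3-x4
;     a0 = s07+s34  a1 = s16+s25  a2 = s07-s34  a3 = s16-s25
;     a4 = d16+d25+(d07+(d07>>1))   a5 = d07-d34-(d25+(d25>>1))
;     a6 = d07+d34-(d16+(d16>>1))   a7 = d16-d25+(d34+(d34>>1))
;     y0 = a0+a1        y4 = a0-a1
;     y1 = a4+(a7>>2)   y7 = (a4>>2)-a7
;     y2 = a2+(a3>>1)   y6 = (a2>>1)-a3
;     y3 = a5+(a6>>2)   y5 = a6-(a5>>2)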
    SUMSUB_BA w, m%5, m%4 ; %5=s34, %4=d34
    SUMSUB_BA w, m%6, m%3 ; %6=s25, %3=d25
    SUMSUB_BA w, m%7, m%2 ; %7=s16, %2=d16
    SUMSUB_BA w, m%8, m%1 ; %8=s07, %1=d07
    SUMSUB_BA w, m%6, m%7, m%10 ; %6=a1, %7=a3
    SUMSUB_BA w, m%5, m%8, m%10 ; %5=a0, %8=a2
    paddw m%9, m%3 ; %9=a4
    psubw m%10, m%3 ; %10=a7
    psubw m%1, m%3 ; %1=a5
    psubw m%4, m%2 ; %4=a6
    paddw m%2, m%9 ; %2=b1
    psubw m%9, m%10 ; %9=b7
    SUMSUB_BA w, m%6, m%5, m%10 ; %6=b0, %5=b4
    paddw m%3, m%8 ; %3=b2
    psubw m%8, m%7 ; %8=b6
    paddw m%7, m%1 ; %7=b3
    psubw m%4, m%1 ; %4=b5
    SWAP %1, %6, %4, %7, %8, %9
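; IDCT8_1D: the matching 1-D inverse pass (a sketch, after idct8 in x264's
; C dct.c): an even half built from rows 0/2/4/6 and an odd half from rows
; 1/3/5/7 with >>1 and >>2 scalings (the a* and b* stages below), then a
; final butterfly; the SUMSUB_BA chain at the end emits the eight
; reconstructed rows as c0..c7.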
    SUMSUB_BA w, m%5, m%1, m%9 ; %5=a0, %1=a2
    paddw m%9, m%6 ; %9=a7
    psubw m%3, m%7 ; %3=a4
    paddw m%7, m%10 ; %7=a6
    psubw m%10, m%2 ; %10=a5
    psubw m%2, m%4 ; %2=a3
    psubw m%6, m%8 ; %6=a1
    paddw m%4, m%6 ; %4=b1
    psubw m%9, m%6 ; %9=b7
    SUMSUB_BA w, m%7, m%5, m%6 ; %7=b0, %5=b6
    SUMSUB_BA w, m%3, m%1, m%6 ; %3=b2, %1=b4
    paddw m%8, m%2 ; %8=b3
    psubw m%2, m%10 ; %2=b5
    SUMSUB_BA w, m%9, m%7, m%6 ; %9=c0, %7=c7
    SUMSUB_BA w, m%2, m%3, m%6 ; %2=c1, %3=c6
    SUMSUB_BA w, m%8, m%1, m%6 ; %8=c2, %1=c5
    SUMSUB_BA w, m%4, m%5, m%6 ; %4=c3, %5=c4
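;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------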
cglobal sub8x8_dct_%1, 3,3,11
    add r2, 4*FDEC_STRIDE
global sub8x8_dct_%1.skip_prologue
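; The .skip_prologue entry lets the larger-block wrappers elsewhere in x264
; jump into this body without repeating the prologue. The 8x8 difference
; block is transformed as four 4x4 DCTs: a vertical DCT4_1D pass, a
; transpose two blocks at a time (TRANSPOSE2x4x4W), a horizontal pass, then
; STORE_DCT writes the four 4x4 coefficient blocks, two at r0 and two at
; r0+64.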
    LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
    DCT4_1D 0, 1, 2, 3, 8
    TRANSPOSE2x4x4W 0, 1, 2, 3, 8
    DCT4_1D 4, 5, 6, 7, 8
    TRANSPOSE2x4x4W 4, 5, 6, 7, 8
    DCT4_1D 0, 1, 2, 3, 8
    STORE_DCT 0, 1, 2, 3, r0, 0
    DCT4_1D 4, 5, 6, 7, 8
    STORE_DCT 4, 5, 6, 7, r0, 64
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8_%1, 3,3,11
    add r2, 4*FDEC_STRIDE
global sub8x8_dct8_%1.skip_prologue
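; Full 8x8 DCT as a separable 2-D transform: a vertical DCT8_1D pass over
; the eight difference rows in m0-m7, an 8x8 word transpose, then a second
; DCT8_1D pass for the horizontal direction.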
    LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
    DCT8_1D 0,1,2,3,4,5,6,7,8,9
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    DCT8_1D 0,1,2,3,4,5,6,7,8,9
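; The %defines below select the per-ISA flavour of the code above:
; LOAD_DIFF8x4 is pointed at its SSE2 or SSSE3 implementation, and on the
; SSE2 path movdqa/punpcklqdq are aliased to movaps/movlhps, which behave
; identically on these operands but have shorter encodings (no 0x66 prefix).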
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
%define movdqa movaps
%define punpcklqdq movlhps

%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal add8x8_idct8_sse2, 2,2,11
    add r0, 4*FDEC_STRIDE
global add8x8_idct8_sse2.skip_prologue
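; First 1-D IDCT pass over the coefficient rows, transpose, then a +32 bias
; on row 0 before the second pass; the inverse transform carries row 0 into
; every output with weight 1, so this effectively adds the rounding bias
; for the final >>6 to all 64 results. DIFFx2 then adds the scaled residual
; to the predicted pixels with saturation, and STORE_IDCT writes the eight
; reconstructed rows.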
    IDCT8_1D 0,1,2,3,4,5,6,7,8,10
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    paddw m0, [pw_32] ; rounding for the >>6 at the end
    IDCT8_1D 0,1,2,3,4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
cglobal add8x8_idct_sse2, 2,2,11
    add r0, 4*FDEC_STRIDE
global add8x8_idct_sse2.skip_prologue
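; The four 4x4 coefficient blocks are interleaved with SBUTTERFLY qdq so
; that two blocks share each register group, letting IDCT4_1D and
; TRANSPOSE2x4x4W run the 2-D 4x4 inverse transform on two blocks at a
; time; DIFFx2/STORE_IDCT then add the residual to the prediction exactly
; as in add8x8_idct8 above.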
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    SBUTTERFLY qdq, 4, 5, 8
    SBUTTERFLY qdq, 6, 7, 8
    IDCT4_1D w,0,1,2,3,8,10
    TRANSPOSE2x4x4W 0,1,2,3,8
    IDCT4_1D w,4,5,6,7,8,10
    TRANSPOSE2x4x4W 4,5,6,7,8
    IDCT4_1D w,0,1,2,3,8,10
    IDCT4_1D w,4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
%endif ; !HIGH_BIT_DEPTH