;*****************************************************************************
;* dct-64.asm: x86_64 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pw_32

%ifndef HIGH_BIT_DEPTH
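; DCT8_1D: one in-place 1-D 8-point DCT pass. %1-%8 name the eight word
; vectors holding the input rows (overwritten with the outputs), %9-%10 are
; scratch. The first stage forms the mirrored sums (s*) and differences (d*),
; which later stages combine into the a*/b* terms of the 8x8 integer DCT.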
%macro DCT8_1D 10
    SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
    SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
    SUMSUB_BA w, %7, %2 ; %7=s16, %2=d16
    SUMSUB_BA w, %8, %1 ; %8=s07, %1=d07

    SUMSUB_BA w, %6, %7, %10 ; %6=a1, %7=a3
    SUMSUB_BA w, %5, %8, %10 ; %5=a0, %8=a2

    paddw m%9, m%3  ; %9=a4
    psubw m%10, m%3 ; %10=a7
    psubw m%1, m%3  ; %1=a5
    psubw m%4, m%2  ; %4=a6

    paddw m%2, m%9  ; %2=b1
    psubw m%9, m%10 ; %9=b7
    SUMSUB_BA w, %6, %5, %10 ; %6=b0, %5=b4
    paddw m%3, m%8  ; %3=b2
    psubw m%8, m%7  ; %8=b6
    paddw m%7, m%1  ; %7=b3
    psubw m%4, m%1  ; %4=b5

    SWAP %1, %6, %4, %7, %8, %9
%endmacro
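; IDCT8_1D: one in-place 1-D 8-point inverse transform pass over %1-%8, with
; %9-%10 as scratch; the a*/b*/c* stages undo the DCT8_1D butterfly.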
%macro IDCT8_1D 10
    SUMSUB_BA w, %5, %1, %9 ; %5=a0, %1=a2

    paddw m%9, m%6  ; %9=a7
    psubw m%10, m%7 ; %10=a4
    paddw m%7, m%3  ; %7=a6
    psubw m%3, m%2  ; %3=a5
    psubw m%2, m%4  ; %2=a3
    psubw m%6, m%8  ; %6=a1

    paddw m%4, m%6  ; %4=b1
    psubw m%9, m%6  ; %9=b7
    SUMSUB_BA w, %7, %5, %6  ; %7=b0, %5=b6
    SUMSUB_BA w, %10, %1, %6 ; %10=b2, %1=b4
    paddw m%8, m%2  ; %8=b3
    psubw m%2, m%3  ; %2=b5

    SUMSUB_BA w, %9, %7, %6  ; %9=c0, %7=c7
    SUMSUB_BA w, %2, %10, %6 ; %2=c1, %10=c6
    SUMSUB_BA w, %8, %1, %6  ; %8=c2, %1=c5
    SUMSUB_BA w, %4, %5, %6  ; %4=c3, %5=c4
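
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------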
cglobal sub8x8_dct_%1, 3,3,11
    add r2, 4*FDEC_STRIDE
global sub8x8_dct_%1.skip_prologue
.skip_prologue:
    LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
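    ; m0-m3 hold rows 0-3 of the pixel difference (the two top 4x4 sub-blocks
    ; side by side), m4-m7 rows 4-7; a 1-D DCT, a per-block transpose, then a
    ; second 1-D DCT yields the separable 2-D transform of all four sub-blocks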
    DCT4_1D 0, 1, 2, 3, 8
    TRANSPOSE2x4x4W 0, 1, 2, 3, 8
    DCT4_1D 4, 5, 6, 7, 8
    TRANSPOSE2x4x4W 4, 5, 6, 7, 8
    DCT4_1D 0, 1, 2, 3, 8
    STORE_DCT 0, 1, 2, 3, r0, 0
    DCT4_1D 4, 5, 6, 7, 8
    STORE_DCT 4, 5, 6, 7, r0, 64
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
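; Hypothetical caller-side usage (assumes the sse2 instantiation and the x264_
; symbol prefix added by cglobal; the dct buffer must be 16-byte aligned):
;     int16_t dct8[8][8];
;     x264_sub8x8_dct8_sse2( dct8, pix1, pix2 );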
cglobal sub8x8_dct8_%1, 3,3,11
    add r2, 4*FDEC_STRIDE
global sub8x8_dct8_%1.skip_prologue
.skip_prologue:
    LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
    DCT8_1D 0,1,2,3,4,5,6,7,8,9
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    DCT8_1D 0,1,2,3,4,5,6,7,8,9
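; The %defines below alias movdqa/punpcklqdq to their float-domain equivalents
; movaps/movlhps (same effect for register-to-register moves, shorter
; encodings) for the sse2 build; the ssse3 build only swaps in a different
; LOAD_DIFF8x4 implementation.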
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
%define movdqa movaps
%define punpcklqdq movlhps

%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro ADD8x8_IDCT8 1
cglobal add8x8_idct8_%1, 2,2,11
    add r0, 4*FDEC_STRIDE
global add8x8_idct8_%1.skip_prologue
.skip_prologue:
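    ; with the dct[8][8] coefficients in m0-m7: a 1-D IDCT pass, a transpose,
    ; then a second 1-D pass completes the separable 2-D inverse transform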
    IDCT8_1D 0,1,2,3,4,5,6,7,8,10
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    paddw m0, [pw_32] ; rounding for the >>6 at the end
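    ; biasing only m0 is enough: every output of the second pass includes the
    ; row-0 input with weight 1, so all 64 samples get +32 before the final >>6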
    IDCT8_1D 0,1,2,3,4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
%endmacro ; ADD8x8_IDCT8
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD8x8_IDCT 1
cglobal add8x8_idct_%1, 2,2,11
    add r0, 4*FDEC_STRIDE
global add8x8_idct_%1.skip_prologue
.skip_prologue:
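    ; interleave the four 4x4 coefficient blocks so each register group holds
    ; two blocks side by side, then run the 1-D IDCT / transpose / 1-D IDCT
    ; sequence on each group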
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    SBUTTERFLY qdq, 4, 5, 8
    SBUTTERFLY qdq, 6, 7, 8
    IDCT4_1D w,0,1,2,3,8,10
    TRANSPOSE2x4x4W 0,1,2,3,8
    IDCT4_1D w,4,5,6,7,8,10
    TRANSPOSE2x4x4W 4,5,6,7,8
    paddw m0, [pw_32] ; rounding for the >>6, as in the 8x8 path
    IDCT4_1D w,0,1,2,3,8,10
    paddw m4, [pw_32]
    IDCT4_1D w,4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
%endif ; !HIGH_BIT_DEPTH