;*****************************************************************************
;* dct-64.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
pw_32:    times 8 dw 32    ; rounding bias for the >>6 at the end of the idct
hsub_mul: times 8 db 1, -1 ; pmaddubsw constant: interleaved (pix1,pix2) byte
                           ; pairs collapse to 16-bit differences in one op

SECTION .text
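
; 1D 8-point DCT over registers %1-%8, with %9-%10 as scratch; the
; a*/b* names in the comments are the butterfly stage intermediates.
; The macro header itself is an assumption, reconstructed from the
; 10-argument call sites below.
%macro DCT8_1D 10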
    SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34
    SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25
    SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
    SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07

    SUMSUB_BA m%6, m%7, m%10 ; %6=a1, %7=a3
    SUMSUB_BA m%5, m%8, m%10 ; %5=a0, %8=a2

    paddw m%9, m%3  ; %9=a4
    psubw m%10, m%3 ; %10=a7
    psubw m%1, m%3  ; %1=a5
    psubw m%4, m%2  ; %4=a6

    paddw m%2, m%9  ; %2=b1
    psubw m%9, m%10 ; %9=b7
    SUMSUB_BA m%6, m%5, m%10 ; %6=b0, %5=b4
    paddw m%3, m%8  ; %3=b2
    psubw m%8, m%7  ; %8=b6
    paddw m%7, m%1  ; %7=b3
    psubw m%4, m%1  ; %4=b5

    SWAP %1, %6, %4, %7, %8, %9
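%endmacro

; matching 1D 8-point inverse DCT, same register convention (%1-%8 data,
; %9-%10 scratch); the header is again assumed from the 10-argument
; call sites below
%macro IDCT8_1D 10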
    SUMSUB_BA m%5, m%1, m%9 ; %5=a0, %1=a2

    paddw m%9, m%6  ; %9=a7
    psubw m%3, m%7  ; %3=a4
    paddw m%7, m%10 ; %7=a6
    psubw m%10, m%2 ; %10=a5
    psubw m%2, m%4  ; %2=a3
    psubw m%6, m%8  ; %6=a1

    paddw m%4, m%6  ; %4=b1
    psubw m%9, m%6  ; %9=b7
    SUMSUB_BA m%7, m%5, m%6 ; %7=b0, %5=b6
    SUMSUB_BA m%3, m%1, m%6 ; %3=b2, %1=b4
    paddw m%8, m%2  ; %8=b3
    psubw m%2, m%10 ; %2=b5

    SUMSUB_BA m%9, m%7, m%6 ; %9=c0, %7=c7
    SUMSUB_BA m%2, m%3, m%6 ; %2=c1, %3=c6
    SUMSUB_BA m%8, m%1, m%6 ; %8=c2, %1=c5
    SUMSUB_BA m%4, m%5, m%6 ; %4=c3, %5=c4
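%endmacro

;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_sse2( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
; both sub8x8 dct functions are stamped out of one macro body; %1 is the
; instruction-set suffix (sse2/ssse3) bound by the instantiations below
%macro DCT_SUB8 1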
cglobal x264_sub8x8_dct_%1, 3,3,11
    add r2, 4*FDEC_STRIDE
    mova m7, [hsub_mul GLOBAL]
global x264_sub8x8_dct_%1.skip_prologue
.skip_prologue:
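    ; four 4x4 DCTs, handled as two 8x4 halves: one 1D pass, a transpose,
    ; the second 1D pass, then the coefficient blocks go to r0 and r0+64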
    LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
    DCT4_1D 0, 1, 2, 3, 8
    TRANSPOSE2x4x4W 0, 1, 2, 3, 8
    DCT4_1D 4, 5, 6, 7, 8
    TRANSPOSE2x4x4W 4, 5, 6, 7, 8
    DCT4_1D 0, 1, 2, 3, 8
    STORE_DCT 0, 1, 2, 3, r0, 0
    DCT4_1D 4, 5, 6, 7, 8
    STORE_DCT 4, 5, 6, 7, r0, 64
    ret

;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_%1, 3,3,11
    add r2, 4*FDEC_STRIDE
    mova m7, [hsub_mul GLOBAL]
global x264_sub8x8_dct8_%1.skip_prologue
.skip_prologue:
    LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
    DCT8_1D 0,1,2,3,4,5,6,7,8,9
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    DCT8_1D 0,1,2,3,4,5,6,7,8,9
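    ; epilogue sketch (assumed): the eight coefficient rows are written
    ; out contiguously as int16_t dct[8][8]
    movdqa [r0+0x00], m0
    movdqa [r0+0x10], m1
    movdqa [r0+0x20], m2
    movdqa [r0+0x30], m3
    movdqa [r0+0x40], m4
    movdqa [r0+0x50], m5
    movdqa [r0+0x60], m6
    movdqa [r0+0x70], m7
    ret
%endmacro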
INIT_XMM
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
%define movdqa movaps       ; movaps/movlhps encode one byte shorter and
%define punpcklqdq movlhps  ; move the same data, so alias the int forms
DCT_SUB8 sse2
%undef movdqa
%undef punpcklqdq
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
DCT_SUB8 ssse3

;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2,11
    add r0, 4*FDEC_STRIDE
global x264_add8x8_idct8_sse2.skip_prologue
.skip_prologue:
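    ; prologue sketch (assumed): load the eight 16-bit coefficient rows
    ; from r1 before the two idct passes below
    movdqa m0, [r1+0x00]
    movdqa m1, [r1+0x10]
    movdqa m2, [r1+0x20]
    movdqa m3, [r1+0x30]
    movdqa m4, [r1+0x40]
    movdqa m5, [r1+0x50]
    movdqa m6, [r1+0x60]
    movdqa m7, [r1+0x70]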
    IDCT8_1D 0,1,2,3,4,5,6,7,8,10
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
    IDCT8_1D 0,1,2,3,4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
    ret

;-----------------------------------------------------------------------------
; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct_sse2, 2,2,11
    add r0, 4*FDEC_STRIDE
global x264_add8x8_idct_sse2.skip_prologue
.skip_prologue:
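    ; each SBUTTERFLY qdq below pairs the qword rows of two horizontally
    ; adjacent 4x4 coefficient blocks (loaded from r1), so every register
    ; holds one full 8-wide row and both blocks transform together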
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    SBUTTERFLY qdq, 4, 5, 8
    SBUTTERFLY qdq, 6, 7, 8
    IDCT4_1D 0,1,2,3,8,10
    TRANSPOSE2x4x4W 0,1,2,3,8
    IDCT4_1D 4,5,6,7,8,10
    TRANSPOSE2x4x4W 4,5,6,7,8
    paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
    IDCT4_1D 0,1,2,3,8,10
    paddw m4, [pw_32 GLOBAL]
    IDCT4_1D 4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
    ret