;*****************************************************************************
;* dct-64.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;*          Min Chen <chenm001@163.com> (converted to nasm)
;*          Loren Merritt <lorenm@u.washington.edu> (dct8)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
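
; The definitions below are a minimal sketch of what this file uses but does
; not itself show: the amd64 helper include (assumed to be amd64inc.asm,
; providing cglobal, parm1q-parm3q, GLOBAL and the FENC_STRIDE=16 /
; FDEC_STRIDE=32 constants), the pw_32 rounding constant, and the
; SBUTTERFLY / SUMSUB_BA helpers. Exact names and placement are assumptions
; inferred from the code that follows.

%include "amd64inc.asm"

SECTION .rodata align=16
pw_32: times 8 dw 32                ; eight words of 32: rounding for the >>6

SECTION .text

; SBUTTERFLY mov, size, a, b, t:  t = interleave_high(a,b), a = interleave_low(a,b)
%macro SBUTTERFLY 5
    mov%1       %5, %3              ; t = a
    punpckl%2   %3, %4              ; a = low halves of a,b interleaved
    punpckh%2   %5, %4              ; t = high halves of a,b interleaved
%endmacro

; SUMSUB_BA a, b:  a = a+b, b = b-a, the basic butterfly, without a temporary
%macro SUMSUB_BA 2
    paddw       %1, %2
    paddw       %2, %2
    psubw       %2, %1
%endmacro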

;-----------------------------------------------------------------------------
; input ABCDEFGH, output AFHDTECB (T denotes %9, the spare temp register)
;-----------------------------------------------------------------------------
%macro TRANSPOSE8x8W 9
    ; pass 1: interleave 16-bit words of neighboring rows
    SBUTTERFLY dqa, wd, %1, %2, %9
    SBUTTERFLY dqa, wd, %3, %4, %2
    SBUTTERFLY dqa, wd, %5, %6, %4
    SBUTTERFLY dqa, wd, %7, %8, %6
    ; pass 2: interleave 32-bit dwords
    SBUTTERFLY dqa, dq, %1, %3, %8
    SBUTTERFLY dqa, dq, %9, %2, %3
    SBUTTERFLY dqa, dq, %5, %7, %2
    SBUTTERFLY dqa, dq, %4, %6, %7
    ; pass 3: interleave 64-bit qwords
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro
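
; The permuted output is exploited at the call sites below: instead of moving
; rows back into order, each caller simply names the next pass's inputs in
; AFHDTECB order. E.g. after TRANSPOSE8x8W xmm5,...,xmm0 in
; x264_sub8x8_dct8_sse2, row 0 sits in A=xmm5, row 1 in F=xmm3, row 2 in
; H=xmm8, ..., which is exactly the register list of the second DCT8_1D call.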

; STORE_DIFF_8P dct, tmp, zero, [dst]: descale one row of residual by >>6,
; add it to the 8 prediction bytes at [dst], clip and store. The body is a
; sketch inferred from the call sites; treat the exact instruction sequence
; as an assumption. The third argument must be a zeroed register.
%macro STORE_DIFF_8P 4
    psraw       %1, 6               ; >>6 (pw_32 rounding was added earlier)
    movq        %2, %4              ; load 8 prediction pixels
    punpcklbw   %2, %3              ; widen bytes to words
    paddw       %1, %2              ; add the residual
    packuswb    %1, %1              ; clip to [0,255], pack back to bytes
    movq        %4, %1              ; store the reconstructed row
%endmacro
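
; LOAD_DIFF_8P dst, tmp, zero, [pix1], [pix2] is used below but not defined
; in this file; this sketch, inferred from the call sites, loads 8 pixels
; from each source, widens them to words and leaves pix1 - pix2 in dst.
%macro LOAD_DIFF_8P 5
    movq        %1, %4              ; 8 bytes of the source block
    punpcklbw   %1, %3              ; widen, %3 must be zero
    movq        %2, %5              ; 8 bytes of the prediction
    punpcklbw   %2, %3
    psubw       %1, %2              ; dst = pix1 - pix2
%endmacro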

; DCT8_1D: one 1-D pass of the 8-point H.264 forward transform over eight
; registers of words (%9, %10 are temporaries). Only the butterfly skeleton
; is kept here; the shift-and-add steps forming the remaining a/b terms are
; elided.
%macro DCT8_1D 10
    SUMSUB_BA %8, %1 ; %8=s07, %1=d07
    SUMSUB_BA %7, %2 ; %7=s16, %2=d16
    SUMSUB_BA %6, %3 ; %6=s25, %3=d25
    SUMSUB_BA %5, %4 ; %5=s34, %4=d34

    SUMSUB_BA %5, %8 ; %5=a0, %8=a2
    SUMSUB_BA %6, %7 ; %6=a1, %7=a3
    ; ...
    psubw %10, %3 ; %10=a7
    ; ...
    SUMSUB_BA %6, %5 ; %6=b0, %5=b4
    ; ...
    psubw %9, %10 ; %9=b7
%endmacro
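
; Each DCT8_1D invocation is a single 1-D pass; the full 2-D transform in
; x264_sub8x8_dct8_sse2 below is pass / TRANSPOSE8x8W / pass, with the
; register permutation of each step threaded directly into the next call.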

;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2
    pxor xmm9, xmm9     ; zero for LOAD_DIFF_8P's unpacks (assumed, see above)
    LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
    LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
    LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
    LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
    LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE]
    LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE]
    LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE]
    LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE]

    DCT8_1D       xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9 ; vertical pass
    TRANSPOSE8x8W xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
    DCT8_1D       xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9 ; horizontal pass

    ; write the 8 coefficient rows out in order
    movdqa [parm1q+0x00], xmm4
    movdqa [parm1q+0x10], xmm3
    movdqa [parm1q+0x20], xmm8
    movdqa [parm1q+0x30], xmm2
    movdqa [parm1q+0x40], xmm0
    movdqa [parm1q+0x50], xmm6
    movdqa [parm1q+0x60], xmm1
    movdqa [parm1q+0x70], xmm7
    ret

; IDCT8_1D: one 1-D pass of the 8-point inverse transform. As with DCT8_1D,
; only the butterfly skeleton is kept; the elided steps form the remaining
; a/b terms.
%macro IDCT8_1D 10
    SUMSUB_BA %5, %1 ; %5=a0, %1=a2
    ; ...
    paddw %7, %10 ; %7=a6
    ; ...
    psubw %10, %2 ; %10=a5
    ; ...
    SUMSUB_BA %7, %5 ; %7=b0, %5=b6
    SUMSUB_BA %3, %1 ; %3=b2, %1=b4
    ; ...
    psubw %2, %10 ; %2=b5

    SUMSUB_BA %9, %7 ; %9=c0, %7=c7
    SUMSUB_BA %2, %3 ; %2=c1, %3=c6
    SUMSUB_BA %8, %1 ; %8=c2, %1=c5
    SUMSUB_BA %4, %5 ; %4=c3, %5=c4
%endmacro
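
; The c0..c7 results land in %9, %2, %8, %4, %5, %1, %3, %7; the caller's
; TRANSPOSE8x8W below lists exactly those registers, so no moves are needed
; between the two passes.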

;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2
    movdqa xmm0, [parm2q+0x00]
    movdqa xmm1, [parm2q+0x10]
    movdqa xmm2, [parm2q+0x20]
    movdqa xmm3, [parm2q+0x30]
    movdqa xmm4, [parm2q+0x40]
    movdqa xmm5, [parm2q+0x50]
    movdqa xmm6, [parm2q+0x60]
    movdqa xmm7, [parm2q+0x70]

    IDCT8_1D      xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8 ; vertical pass
    TRANSPOSE8x8W xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
    paddw         xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end
    IDCT8_1D      xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2 ; horizontal pass

    pxor xmm15, xmm15   ; zero for STORE_DIFF_8P's unpacks (assumed, see above)
    STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE]
    STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE]
    STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE]
    STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE]
    STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE]
    STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE]
    STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE]
    STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
    ret