2 * Loongson SIMD optimized idctdsp
4 * Copyright (c) 2015 Loongson Technology Corporation Limited
5 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "idctdsp_mips.h"
25 #include "constants.h"
/*
 * 14-bit fixed-point cosine constants. Following the per-line formula,
 * Ci = cos(i * pi / 16) * sqrt(2) * (1 << 14), rounded to nearest.
 * C4 is the exception: its formula subtracts 0.5 instead of adding it,
 * which yields 16383 rather than 16384.
 */
27 #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
28 #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
29 #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
30 #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
31 #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
32 #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
33 #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
34 #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* 64-bit constant handed to the inline asm of simple_idct_mmi as memory
 * operand %3. NOTE(review): the 0xFFFF0000 pattern looks like a mask that
 * keeps the upper 16 bits of each 32-bit word -- confirm at the use site. */
39 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
/* 64-bit constant handed to the inline asm as memory operand %4; it is
 * loaded and added (paddw) in the tail of DC_COND_IDCT. */
40 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
/* 8-byte aligned int16_t table addressed through asm operand %2. Each
 * row of four int16_t values is one 64-bit load; the rows at byte offsets
 * 0 and 8 are the ones passed as `rarg` (rounding term) to the row-pass
 * macros, the rows from offset 16 upwards are loaded as multiplier rows. */
41 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
42 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
43 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
/*
 * Transform the 64 int16_t coefficients of `block` in place.
 *
 * The work is done by one inline-asm statement in two passes: the
 * DC_COND_IDCT / Z_COND_IDCT macros read from block (%0) and write to the
 * scratch buffer temp (%1) using shift 11, then one of the IDCT macro
 * variants reads temp (%1) and writes the result back to block (%0) using
 * shift 20.
 *
 * block: 128 bytes (byte offsets 0..127 are accessed); read and written.
 */
58 static void simple_idct_mmi(int16_t *block)
/* 16 x int64_t = 128 bytes of 8-byte aligned scratch space for the
 * intermediate result between the two passes. */
60 DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
/* The same storage viewed as int16_t; this is what the asm receives as %1. */
61 int16_t * const temp= (int16_t*)align_tmp;
/*
 * DC_COND_IDCT: first-pass kernel, emitted as asm text.
 *
 *   src0, src4, src1, src5  memory operands of the four 64-bit input groups
 *   dst                     memory operand of the 32-byte output area
 *                           (stores go to dst, 8+dst, 16+dst and 24+dst)
 *   rounder, rarg           instruction mnemonic and memory operand of the
 *                           rounding term applied to the two even sums
 *   shift                   right-shift count applied to every result
 *
 * Main sequence: the inputs are multiplied (pmaddhw) with the coefficient
 * rows at 16(%2)..104(%2), combined into the A0..A3 / B0..B3 terms named in
 * the line comments, shifted right arithmetically by `shift`, saturated to
 * 16 bits (packsswh) and stored.
 *
 * Tail sequence (from the psllw onwards): $f0 is shifted left, the constant
 * at %4 is added, the sum is shifted right and packed, and that single
 * 64-bit value is stored to all four destination slots.
 * NOTE(review): presumably this is the DC-only shortcut the macro name
 * refers to, selected by a test on the and/or reduction held in $f8 --
 * confirm against the branch that chooses between the two sequences.
 *
 * Scratch registers: $10, $11 and the even FP registers $f0..$f18.
 */
65 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift) \
66 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
67 "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
68 "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
69 "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
71 "and $f8, $f8, $f0 \n\t"\
72 "or $f8, $f8, $f2 \n\t"\
73 "or $f8, $f8, $f4 \n\t"\
74 "or $f8, $f8, $f6 \n\t"\
75 "packsswh $f8, $f8, $f8 \n\t"\
76 "li $11, " #shift " \n\t" /* $11 = shift count */\
78 "mtc1 $11, $f18 \n\t" /* $f18 = shift count used by psraw */\
80 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
81 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
82 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
83 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
84 "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
85 "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
86 "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
87 "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
88 "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
89 "ldc1 $f16, " #rarg " \n\t" /* rounding term */\
90 "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
91 #rounder " $f8, $f8, $f16 \n\t"\
92 "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
93 "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
94 "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
95 "ldc1 $f10, 56(%2) \n\t" /* C7 C5 C7 C5 */\
96 "ldc1 $f16, " #rarg " \n\t" /* rounding term */\
97 "pmaddhw $f10, $f10, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
98 #rounder " $f0, $f0, $f16 \n\t"\
99 "paddw $f2, $f2, $f0 \n\t" /* A1 a1 */\
100 "ldc1 $f16, 64(%2) \n\t"\
101 "paddw $f0, $f0, $f0 \n\t" \
102 "psubw $f0, $f0, $f2 \n\t" /* A2 a2 */\
103 "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
104 "paddw $f14, $f14, $f10 \n\t" /* B0 b0 */\
105 "ldc1 $f10, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
106 "pmaddhw $f10, $f10, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
107 "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
108 "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
109 "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
110 "paddw $f10, $f10, $f4 \n\t" /* B1 b1 */\
111 "psraw $f14, $f14, $f18 \n\t"\
112 "psraw $f8, $f8, $f18 \n\t"\
113 "mov.d $f4, $f2 \n\t" /* A1 a1 */\
114 "paddw $f2, $f2, $f10 \n\t" /* A1+B1 a1+b1 */\
115 "psubw $f4, $f4, $f10 \n\t" /* A1-B1 a1-b1 */\
116 "psraw $f2, $f2, $f18 \n\t"\
117 "psraw $f4, $f4, $f18 \n\t"\
118 "packsswh $f14, $f14, $f2 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
119 "packsswh $f4, $f4, $f8 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
120 "sdc1 $f14, " #dst " \n\t"\
121 "ldc1 $f2, " #src1 " \n\t" /* R3 R1 r3 r1 */\
122 "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
123 "sdc1 $f4, 24+" #dst " \n\t"\
124 "pmaddhw $f8, $f8, $f2 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
125 "ldc1 $f16, 96(%2) \n\t"\
126 "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\
127 "pmaddhw $f2, $f2, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
128 "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
129 "ldc1 $f16, 104(%2) \n\t"\
130 "mov.d $f4, $f0 \n\t" /* A2 a2 */\
131 "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
132 "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\
133 "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\
134 "psubw $f0, $f0, $f8 \n\t" /* a2-B2 a2-b2 */\
135 "psraw $f4, $f4, $f18 \n\t"\
136 "psraw $f0, $f0, $f18 \n\t"\
137 "mov.d $f8, $f12 \n\t" /* A3 a3 */\
138 "paddw $f6, $f6, $f2 \n\t" /* B3 b3 */\
139 "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
140 "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\
141 "psraw $f12, $f12, $f18 \n\t"\
142 "packsswh $f4, $f4, $f12 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
143 "sdc1 $f4, 8+" #dst " \n\t"\
144 "psraw $f8, $f8, $f18 \n\t"\
145 "packsswh $f8, $f8, $f0 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
146 "sdc1 $f8, 16+" #dst " \n\t"\
150 "mtc1 $10, $f16 \n\t"\
151 "psllw $f0, $f0, $f16 \n\t"\
152 "ldc1 $f16, %4 \n\t" /* constant operand %4 */\
153 "paddw $f0, $f0, $f16 \n\t"\
155 "mtc1 $10, $f16 \n\t"\
156 "psraw $f0, $f0, $f16 \n\t"\
157 "packsswh $f0, $f0, $f0 \n\t"\
158 "sdc1 $f0, " #dst " \n\t" /* same value to all four slots */\
159 "sdc1 $f0, 8+" #dst " \n\t"\
160 "sdc1 $f0, 16+" #dst " \n\t"\
161 "sdc1 $f0, 24+" #dst " \n\t"\
/*
 * Z_COND_IDCT: first-pass kernel with an early exit for all-zero input.
 *
 *   src0, src4, src1, src5  memory operands of the four 64-bit input groups
 *   dst                     memory operand of the 32-byte output area
 *                           (stores go to dst, 8+dst, 16+dst and 24+dst)
 *   rounder, rarg           instruction mnemonic and memory operand of the
 *                           rounding term applied to the two even sums
 *   shift                   right-shift count applied to every result
 *   bt                      label jumped to when the inputs are all zero
 *
 * The four inputs are OR-ed together and packed; when the low word of the
 * packed value is zero the macro branches to `bt` (beqz) before computing
 * or storing anything. Otherwise it performs the same multiply/accumulate,
 * shift, pack and store sequence as the main path of DC_COND_IDCT.
 *
 * Scratch registers: $10 and the even FP registers $f0..$f18.
 */
165 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift, bt) \
166 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
167 "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
168 "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
169 "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
170 "mov.d $f8, $f0 \n\t"\
171 "or $f8, $f8, $f2 \n\t"\
172 "or $f8, $f8, $f4 \n\t"\
173 "or $f8, $f8, $f6 \n\t"\
174 "packsswh $f8, $f8, $f8 \n\t"\
175 "mfc1 $10, $f8 \n\t" /* $10 = packed OR of the inputs */\
176 "beqz $10, " #bt " \n\t" /* nothing set: skip to bt */\
177 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
178 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
179 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
180 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
181 "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
182 "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
183 "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
184 "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
185 "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
186 "ldc1 $f16, " #rarg " \n\t" /* rounding term */\
187 "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
188 #rounder " $f8, $f8, $f16 \n\t"\
189 "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
190 "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
191 "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
192 "ldc1 $f10, 56(%2) \n\t" /* C7 C5 C7 C5 */\
193 "ldc1 $f16, " #rarg " \n\t" /* rounding term */\
194 "pmaddhw $f10, $f10, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
195 #rounder " $f0, $f0, $f16 \n\t"\
196 "paddw $f2, $f2, $f0 \n\t" /* A1 a1 */\
197 "paddw $f0, $f0, $f0 \n\t" \
198 "ldc1 $f16, 64(%2) \n\t"\
199 "psubw $f0, $f0, $f2 \n\t" /* A2 a2 */\
200 "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
201 "paddw $f14, $f14, $f10 \n\t" /* B0 b0 */\
202 "ldc1 $f10, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
203 "pmaddhw $f10, $f10, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
204 "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
205 "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
206 "li $10, " #shift " \n\t" /* $10 = shift count */\
207 "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
208 "mtc1 $10, $f18 \n\t" /* $f18 = shift count used by psraw */\
209 "paddw $f10, $f10, $f4 \n\t" /* B1 b1 */\
210 "psraw $f14, $f14, $f18 \n\t"\
211 "psraw $f8, $f8, $f18 \n\t"\
212 "mov.d $f4, $f2 \n\t" /* A1 a1 */\
213 "paddw $f2, $f2, $f10 \n\t" /* A1+B1 a1+b1 */\
214 "psubw $f4, $f4, $f10 \n\t" /* A1-B1 a1-b1 */\
215 "psraw $f2, $f2, $f18 \n\t"\
216 "psraw $f4, $f4, $f18 \n\t"\
217 "packsswh $f14, $f14, $f2 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
218 "packsswh $f4, $f4, $f8 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
219 "sdc1 $f14, " #dst " \n\t"\
220 "ldc1 $f2, " #src1 " \n\t" /* R3 R1 r3 r1 */\
221 "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
222 "sdc1 $f4, 24+" #dst " \n\t"\
223 "pmaddhw $f8, $f8, $f2 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
224 "ldc1 $f16, 96(%2) \n\t"\
225 "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\
226 "pmaddhw $f2, $f2, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
227 "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
228 "ldc1 $f16, 104(%2) \n\t"\
229 "mov.d $f4, $f0 \n\t" /* A2 a2 */\
230 "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
231 "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\
232 "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\
233 "psubw $f0, $f0, $f8 \n\t" /* a2-B2 a2-b2 */\
234 "psraw $f4, $f4, $f18 \n\t"\
235 "psraw $f0, $f0, $f18 \n\t"\
236 "mov.d $f8, $f12 \n\t" /* A3 a3 */\
237 "paddw $f6, $f6, $f2 \n\t" /* B3 b3 */\
238 "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
239 "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\
240 "psraw $f12, $f12, $f18 \n\t"\
241 "packsswh $f4, $f4, $f12 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
242 "sdc1 $f4, 8+" #dst " \n\t"\
243 "psraw $f8, $f8, $f18 \n\t"\
244 "packsswh $f8, $f8, $f0 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
245 "sdc1 $f8, 16+" #dst " \n\t"\
/*
 * First pass, block (%0) -> temp (%1), shift 11. Each invocation consumes
 * 32 bytes of input and produces the 32 bytes at the same offset in temp.
 * The first group uses DC_COND_IDCT with the rounding row at 8(%2); the
 * other three use Z_COND_IDCT with the rounding row at (%2) and jump to
 * 4f, 2f or 1f respectively when their 32 input bytes are all zero.
 */
247 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
248 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddw,8(%2), 11)
249 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddw,(%2), 11, 4f)
250 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddw,(%2), 11, 2f)
251 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 1f)
/*
 * IDCT, variant reading all four inputs (src0, src4, src1, src5).
 *
 * Multiplies the inputs with the coefficient rows at 16(%2)..104(%2),
 * forms the sums and differences A0+/-B0 .. A3+/-B3 named in the line
 * comments, shifts each right arithmetically by `shift`, saturates to
 * 16 bits (packsswh) and writes one 32-bit word (swc1) to each of the
 * eight destinations dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 * No rounding term is added in this pass.
 *
 * Scratch registers: $10 and the even FP registers $f0..$f18.
 */
254 #define IDCT(src0, src4, src1, src5, dst, shift) \
255 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
256 "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
257 "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
258 "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
259 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
260 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
261 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
262 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
263 "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
264 "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
265 "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
266 "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
267 "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
268 "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
269 "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
270 "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
271 "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
272 "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
273 "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\
274 "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\
275 "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\
276 "ldc1 $f16, 64(%2) \n\t"\
277 "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
278 "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
279 "li $10, " #shift " \n\t"\
280 "paddw $f14, $f14, $f2 \n\t" /* B0 b0 */\
281 "ldc1 $f2, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
282 "mtc1 $10, $f18 \n\t"\
283 "pmaddhw $f2, $f2, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
284 "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
285 "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
286 "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
287 "paddw $f2, $f2, $f4 \n\t" /* B1 b1 */\
288 "psraw $f14, $f14, $f18 \n\t"\
289 "psraw $f8, $f8, $f18 \n\t"\
290 "mov.d $f4, $f0 \n\t" /* A1 a1 */\
291 "paddw $f0, $f0, $f2 \n\t" /* A1+B1 a1+b1 */\
292 "psubw $f4, $f4, $f2 \n\t" /* A1-B1 a1-b1 */\
293 "psraw $f0, $f0, $f18 \n\t"\
294 "psraw $f4, $f4, $f18 \n\t"\
295 "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\
296 "swc1 $f14, " #dst " \n\t"\
297 "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
298 "swc1 $f0, 16+" #dst " \n\t"\
299 "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\
300 "swc1 $f4, 96+" #dst " \n\t"\
301 "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
302 "swc1 $f8, 112+" #dst " \n\t"\
303 "ldc1 $f0, " #src1 " \n\t" /* R3 R1 r3 r1 */\
304 "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
305 "pmaddhw $f8, $f8, $f0 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
306 "ldc1 $f16, 96(%2) \n\t"\
307 "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\
308 "pmaddhw $f0, $f0, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
309 "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
310 "ldc1 $f16, 104(%2) \n\t"\
311 "mov.d $f4, $f10 \n\t" /* A2 a2 */\
312 "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
313 "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\
314 "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\
315 "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\
316 "psraw $f4, $f4, $f18 \n\t"\
317 "psraw $f10, $f10, $f18 \n\t"\
318 "mov.d $f8, $f12 \n\t" /* A3 a3 */\
319 "paddw $f6, $f6, $f0 \n\t" /* B3 b3 */\
320 "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
321 "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\
322 "psraw $f12, $f12, $f18 \n\t"\
323 "psraw $f8, $f8, $f18 \n\t"\
324 "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\
325 "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
326 "swc1 $f4, 32+" #dst " \n\t"\
327 "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\
328 "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
329 "swc1 $f12, 48+" #dst " \n\t"\
330 "swc1 $f8, 64+" #dst " \n\t"\
331 "swc1 $f10, 80+" #dst " \n\t"
/* Second pass, temp (%1) -> block (%0), shift 20: four invocations, each
 * covering one 8-byte input group and one 4-byte slice of every output
 * row (dst offsets 0, 4, 8, 12). */
333 //IDCT( src0, src4, src1, src5, dst, shift)
334 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
335 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
336 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
337 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/* First pass for the input groups at byte offsets 64 and 96; an all-zero
 * group jumps to 6f or 5f respectively instead of being transformed. */
342 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddw,(%2), 11, 6f)
343 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 5f)
/*
 * IDCT, variant reading src0, src4 and src5 only (src1 is never loaded,
 * and the coefficient rows at 48, 64, 80 and 96(%2) are not used).
 *
 * The remaining products are shifted right arithmetically by `shift`,
 * saturated to 16 bits and written as one 32-bit word (swc1) to each of
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 *
 * Scratch registers: $10 and the even FP registers $f0..$f18.
 */
346 #define IDCT(src0, src4, src1, src5, dst, shift) \
347 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
348 "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
349 "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
350 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
351 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
352 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
353 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
354 "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
355 "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
356 "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
357 "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
358 "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
359 "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
360 "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
361 "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
362 "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\
363 "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\
364 "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\
365 "li $10, " #shift " \n\t"\
366 "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
367 "ldc1 $f14, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
368 "mtc1 $10, $f18 \n\t"\
369 "pmaddhw $f14, $f14, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
370 "paddw $f2, $f2, $f8 \n\t" /* A0+B0 a0+b0 */\
371 "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
372 "psubw $f8, $f8, $f2 \n\t" /* A0-B0 a0-b0 */\
373 "psraw $f2, $f2, $f18 \n\t"\
374 "psraw $f8, $f8, $f18 \n\t"\
375 "mov.d $f4, $f0 \n\t" /* A1 a1 */\
376 "paddw $f0, $f0, $f14 \n\t" /* A1+B1 a1+b1 */\
377 "psubw $f4, $f4, $f14 \n\t" /* A1-B1 a1-b1 */\
378 "psraw $f0, $f0, $f18 \n\t"\
379 "psraw $f4, $f4, $f18 \n\t"\
380 "packsswh $f2, $f2, $f2 \n\t" /* A0+B0 a0+b0 */\
381 "swc1 $f2, " #dst " \n\t"\
382 "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
383 "swc1 $f0, 16+" #dst " \n\t"\
384 "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\
385 "swc1 $f4, 96+" #dst " \n\t"\
386 "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
387 "swc1 $f8, 112+" #dst " \n\t"\
388 "ldc1 $f2, 88(%2) \n\t" /* C3 C7 C3 C7 */\
389 "ldc1 $f16, 104(%2) \n\t"\
390 "pmaddhw $f2, $f2, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
391 "mov.d $f4, $f10 \n\t" /* A2 a2 */\
392 "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
393 "paddw $f4, $f4, $f2 \n\t" /* A2+B2 a2+b2 */\
394 "psubw $f10, $f10, $f2 \n\t" /* a2-B2 a2-b2 */\
395 "psraw $f4, $f4, $f18 \n\t"\
396 "psraw $f10, $f10, $f18 \n\t"\
397 "mov.d $f2, $f12 \n\t" /* A3 a3 */\
398 "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
399 "psubw $f2, $f2, $f6 \n\t" /* a3-B3 a3-b3 */\
400 "psraw $f12, $f12, $f18 \n\t"\
401 "psraw $f2, $f2, $f18 \n\t"\
402 "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\
403 "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
404 "swc1 $f4, 32+" #dst " \n\t"\
405 "packsswh $f2, $f2, $f2 \n\t" /* A3-B3 a3-b3 */\
406 "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
407 "swc1 $f12, 48+" #dst " \n\t"\
408 "swc1 $f2, 64+" #dst " \n\t"\
409 "swc1 $f10, 80+" #dst " \n\t"
/* Second pass, temp (%1) -> block (%0), shift 20, four 4-byte output
 * slices per row. The src1 argument is passed but ignored by this variant. */
411 //IDCT( src0, src4, src1, src5, dst, shift)
412 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
413 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
414 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
415 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/* First pass for the input group at byte offset 96; an all-zero group
 * jumps to 7f instead of being transformed. */
420 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 7f)
/*
 * IDCT, variant reading src0 and src5 only (src4 and src1 are never
 * loaded; only the coefficient rows at 16, 24, 56, 72, 88 and 104(%2)
 * are used).
 *
 * Results are shifted right arithmetically by `shift`, saturated to
 * 16 bits and written as one 32-bit word (swc1) to each of
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 *
 * Scratch registers: $10 and the even FP registers $f0..$f18.
 */
423 #define IDCT(src0, src4, src1, src5, dst, shift) \
424 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
425 "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
426 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
427 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
428 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
429 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
430 "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
431 "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
432 "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\
433 "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
434 "ldc1 $f14, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
435 "li $10, " #shift " \n\t"\
436 "pmaddhw $f14, $f14, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
437 "paddw $f2, $f2, $f8 \n\t" /* A0+B0 a0+b0 */\
438 "mtc1 $10, $f18 \n\t"\
439 "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
440 "psubw $f8, $f8, $f2 \n\t" /* A0-B0 a0-b0 */\
441 "psraw $f2, $f2, $f18 \n\t"\
442 "psraw $f8, $f8, $f18 \n\t"\
443 "mov.d $f4, $f0 \n\t" /* A1 a1 */\
444 "paddw $f0, $f0, $f14 \n\t" /* A1+B1 a1+b1 */\
445 "psubw $f4, $f4, $f14 \n\t" /* A1-B1 a1-b1 */\
446 "psraw $f0, $f0, $f18 \n\t"\
447 "psraw $f4, $f4, $f18 \n\t"\
448 "packsswh $f2, $f2, $f2 \n\t" /* A0+B0 a0+b0 */\
449 "swc1 $f2, " #dst " \n\t"\
450 "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
451 "swc1 $f0, 16+" #dst " \n\t"\
452 "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\
453 "swc1 $f4, 96+" #dst " \n\t"\
454 "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
455 "swc1 $f8, 112+" #dst " \n\t"\
456 "ldc1 $f2, 88(%2) \n\t" /* C3 C7 C3 C7 */\
457 "ldc1 $f16, 104(%2) \n\t"\
458 "pmaddhw $f2, $f2, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
459 "mov.d $f4, $f10 \n\t" /* A2 a2 */\
460 "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
461 "paddw $f4, $f4, $f2 \n\t" /* A2+B2 a2+b2 */\
462 "psubw $f10, $f10, $f2 \n\t" /* a2-B2 a2-b2 */\
463 "psraw $f4, $f4, $f18 \n\t"\
464 "psraw $f10, $f10, $f18 \n\t"\
465 "mov.d $f2, $f12 \n\t" /* A3 a3 */\
466 "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
467 "psubw $f2, $f2, $f6 \n\t" /* a3-B3 a3-b3 */\
468 "psraw $f12, $f12, $f18 \n\t"\
469 "psraw $f2, $f2, $f18 \n\t"\
470 "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\
471 "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
472 "swc1 $f4, 32+" #dst " \n\t"\
473 "packsswh $f2, $f2, $f2 \n\t" /* A3-B3 a3-b3 */\
474 "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
475 "swc1 $f12, 48+" #dst " \n\t"\
476 "swc1 $f2, 64+" #dst " \n\t"\
477 "swc1 $f10, 80+" #dst " \n\t"
/* Second pass, temp (%1) -> block (%0), shift 20, four 4-byte output
 * slices per row. The src4 and src1 arguments are passed but ignored. */
479 //IDCT( src0, src4, src1, src5, dst, shift)
480 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
481 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
482 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
483 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/* First pass for the input group at byte offset 96; an all-zero group
 * jumps to 3f instead of being transformed. */
488 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 3f)
/*
 * IDCT, variant reading src0, src1 and src5 (src4 is never loaded, and
 * the coefficient rows at 32 and 40(%2) are not used).
 *
 * Results are shifted right arithmetically by `shift`, saturated to
 * 16 bits and written as one 32-bit word (swc1) to each of
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 *
 * Scratch registers: $10 and the even FP registers $f0..$f18.
 */
491 #define IDCT(src0, src4, src1, src5, dst, shift) \
492 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
493 "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
494 "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
495 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
496 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
497 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
498 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
499 "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
500 "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
501 "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
502 "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
503 "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\
504 "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
505 "ldc1 $f16, 64(%2) \n\t"\
506 "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
507 "paddw $f14, $f14, $f2 \n\t" /* B0 b0 */\
508 "ldc1 $f2, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
509 "li $10, " #shift " \n\t"\
510 "pmaddhw $f2, $f2, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
511 "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
512 "mtc1 $10, $f18 \n\t"\
513 "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
514 "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
515 "paddw $f2, $f2, $f4 \n\t" /* B1 b1 */\
516 "psraw $f14, $f14, $f18 \n\t"\
517 "psraw $f8, $f8, $f18 \n\t"\
518 "mov.d $f4, $f0 \n\t" /* A1 a1 */\
519 "paddw $f0, $f0, $f2 \n\t" /* A1+B1 a1+b1 */\
520 "psubw $f4, $f4, $f2 \n\t" /* A1-B1 a1-b1 */\
521 "psraw $f0, $f0, $f18 \n\t"\
522 "psraw $f4, $f4, $f18 \n\t"\
523 "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\
524 "swc1 $f14, " #dst " \n\t"\
525 "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
526 "swc1 $f0, 16+" #dst " \n\t"\
527 "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\
528 "swc1 $f4, 96+" #dst " \n\t"\
529 "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
530 "swc1 $f8, 112+" #dst " \n\t"\
531 "ldc1 $f0, " #src1 " \n\t" /* R3 R1 r3 r1 */\
532 "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
533 "pmaddhw $f8, $f8, $f0 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
534 "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\
535 "ldc1 $f16, 96(%2) \n\t"\
536 "pmaddhw $f0, $f0, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
537 "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
538 "mov.d $f4, $f10 \n\t" /* A2 a2 */\
539 "ldc1 $f16, 104(%2) \n\t"\
540 "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
541 "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\
542 "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\
543 "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\
544 "psraw $f4, $f4, $f18 \n\t"\
545 "psraw $f10, $f10, $f18 \n\t"\
546 "mov.d $f8, $f12 \n\t" /* A3 a3 */\
547 "paddw $f6, $f6, $f0 \n\t" /* B3 b3 */\
548 "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
549 "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\
550 "psraw $f12, $f12, $f18 \n\t"\
551 "psraw $f8, $f8, $f18 \n\t"\
552 "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\
553 "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
554 "swc1 $f4, 32+" #dst " \n\t"\
555 "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\
556 "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
557 "swc1 $f12, 48+" #dst " \n\t"\
558 "swc1 $f8, 64+" #dst " \n\t"\
559 "swc1 $f10, 80+" #dst " \n\t"
/* Second pass, temp (%1) -> block (%0), shift 20, four 4-byte output
 * slices per row. The src4 argument is passed but ignored by this variant. */
561 //IDCT( src0, src4, src1, src5, dst, shift)
562 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
563 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
564 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
565 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT, variant reading src0 and src1 only (src4 and src5 are never
 * loaded; only the coefficient rows at 16, 24, 48, 64, 80 and 96(%2)
 * are used).
 *
 * Results are shifted right arithmetically by `shift`, saturated to
 * 16 bits and written as one 32-bit word (swc1) to each of
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 *
 * Scratch registers: $10 and the even FP registers $f0..$f18.
 */
572 #define IDCT(src0, src4, src1, src5, dst, shift) \
573 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
574 "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
575 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
576 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
577 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
578 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
579 "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
580 "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
581 "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
582 "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
583 "ldc1 $f6, 64(%2) \n\t"\
584 "pmaddhw $f6, $f6, $f4 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
585 "li $10, " #shift " \n\t"\
586 "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
587 "mtc1 $10, $f18 \n\t"\
588 "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
589 "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
590 "psraw $f14, $f14, $f18 \n\t"\
591 "psraw $f8, $f8, $f18 \n\t"\
592 "mov.d $f2, $f0 \n\t" /* A1 a1 */\
593 "paddw $f0, $f0, $f6 \n\t" /* A1+B1 a1+b1 */\
594 "psubw $f2, $f2, $f6 \n\t" /* A1-B1 a1-b1 */\
595 "psraw $f0, $f0, $f18 \n\t"\
596 "psraw $f2, $f2, $f18 \n\t"\
597 "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\
598 "swc1 $f14, " #dst " \n\t"\
599 "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
600 "swc1 $f0, 16+" #dst " \n\t"\
601 "packsswh $f2, $f2, $f2 \n\t" /* A1-B1 a1-b1 */\
602 "swc1 $f2, 96+" #dst " \n\t"\
603 "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
604 "swc1 $f8, 112+" #dst " \n\t"\
605 "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
606 "ldc1 $f16, 96(%2) \n\t"\
607 "pmaddhw $f8, $f8, $f4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
608 "pmaddhw $f4, $f4, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
609 "mov.d $f2, $f10 \n\t" /* A2 a2 */\
610 "paddw $f2, $f2, $f8 \n\t" /* A2+B2 a2+b2 */\
611 "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\
612 "psraw $f2, $f2, $f18 \n\t"\
613 "psraw $f10, $f10, $f18 \n\t"\
614 "mov.d $f8, $f12 \n\t" /* A3 a3 */\
615 "paddw $f12, $f12, $f4 \n\t" /* A3+B3 a3+b3 */\
616 "psubw $f8, $f8, $f4 \n\t" /* a3-B3 a3-b3 */\
617 "psraw $f12, $f12, $f18 \n\t"\
618 "psraw $f8, $f8, $f18 \n\t"\
619 "packsswh $f2, $f2, $f2 \n\t" /* A2+B2 a2+b2 */\
620 "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
621 "swc1 $f2, 32+" #dst " \n\t"\
622 "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\
623 "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
624 "swc1 $f12, 48+" #dst " \n\t"\
625 "swc1 $f8, 64+" #dst " \n\t"\
626 "swc1 $f10, 80+" #dst " \n\t"
/* Second pass, temp (%1) -> block (%0), shift 20, four 4-byte output
 * slices per row. The src4 and src5 arguments are passed but ignored. */
628 //IDCT( src0, src4, src1, src5, dst, shift)
629 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
630 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
631 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
632 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT, variant reading src0 and src4 only, two input groups per call.
 *
 * Besides src0 / src4 it also loads 8+src0 / 8+src4, so one invocation
 * covers two adjacent 8-byte groups and writes 64-bit values (sdc1)
 * instead of 32-bit ones. src1 and src5 are never loaded, so no B terms
 * are formed and each result is stored twice:
 *   dst + 0  and dst + 112 receive the same value,
 *   dst + 16 and dst + 96,
 *   dst + 32 and dst + 80,
 *   dst + 48 and dst + 64.
 * All values are shifted right arithmetically by `shift` and saturated
 * to 16 bits before the store.
 *
 * Scratch registers: $10 and the even FP registers $f0..$f18.
 */
639 #define IDCT(src0, src4, src1, src5, dst, shift) \
640 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
641 "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
642 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
643 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
644 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
645 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
646 "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
647 "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
648 "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
649 "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
650 "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
651 "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
652 "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
653 "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
654 "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\
655 "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\
656 "ldc1 $f4, 8+" #src0 " \n\t" /* R4 R0 r4 r0 */\
657 "ldc1 $f6, 8+" #src4 " \n\t" /* R6 R2 r6 r2 */\
658 "ldc1 $f2, 16(%2) \n\t" /* C4 C4 C4 C4 */\
659 "pmaddhw $f2, $f2, $f4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
660 "ldc1 $f14, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
661 "pmaddhw $f4, $f4, $f14 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
662 "ldc1 $f14, 32(%2) \n\t" /* C6 C2 C6 C2 */\
663 "ldc1 $f16, 40(%2) \n\t"\
664 "pmaddhw $f14, $f14, $f6 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
665 "pmaddhw $f6, $f6, $f16 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
666 "paddw $f14, $f14, $f2 \n\t" /* A0 a0 */\
667 "paddw $f2, $f2, $f2 \n\t" /* 2C0 2c0 */\
668 "psubw $f2, $f2, $f14 \n\t" /* A3 a3 */\
669 "li $10, " #shift " \n\t"\
670 "paddw $f6, $f6, $f4 \n\t" /* A1 a1 */\
671 "mtc1 $10, $f18 \n\t"\
672 "paddw $f4, $f4, $f4 \n\t" /* 2C1 2c1 */\
673 "psubw $f4, $f4, $f6 \n\t" /* A2 a2 */\
674 "psraw $f8, $f8, $f18 \n\t"\
675 "psraw $f14, $f14, $f18 \n\t"\
676 "psraw $f6, $f6, $f18 \n\t"\
677 "packsswh $f8, $f8, $f14 \n\t" /* A0 a0 */\
678 "sdc1 $f8, " #dst " \n\t"\
679 "psraw $f0, $f0, $f18 \n\t"\
680 "packsswh $f0, $f0, $f6 \n\t" /* A1 a1 */\
681 "sdc1 $f0, 16+" #dst " \n\t"\
682 "sdc1 $f0, 96+" #dst " \n\t"\
683 "sdc1 $f8, 112+" #dst " \n\t"\
684 "psraw $f10, $f10, $f18 \n\t"\
685 "psraw $f12, $f12, $f18 \n\t"\
686 "psraw $f4, $f4, $f18 \n\t"\
687 "packsswh $f10, $f10, $f4 \n\t" /* A2-B2 a2-b2 */\
688 "sdc1 $f10, 32+" #dst " \n\t"\
689 "psraw $f2, $f2, $f18 \n\t"\
690 "packsswh $f12, $f12, $f2 \n\t" /* A3+B3 a3+b3 */\
691 "sdc1 $f12, 48+" #dst " \n\t"\
692 "sdc1 $f12, 64+" #dst " \n\t"\
693 "sdc1 $f10, 80+" #dst " \n\t"
/* Second pass, temp (%1) -> block (%0), shift 20. Only two invocations
 * are needed: each one also handles the neighbouring 8-byte group, which
 * is why the calls for source offsets 8 and 24 are commented out. */
695 //IDCT( src0, src4, src1, src5, dst, shift)
696 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
697 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
698 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
699 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT, variant reading src0, src4 and src1 (src5 is never loaded; only
 * the coefficient rows at 16, 24, 32, 40, 48, 64, 80 and 96(%2) are
 * used).
 *
 * Results are shifted right arithmetically by `shift`, saturated to
 * 16 bits and written as one 32-bit word (swc1) to each of
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 *
 * Scratch registers: $10 and the even FP registers $f0..$f18.
 */
706 #define IDCT(src0, src4, src1, src5, dst, shift) \
707 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
708 "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
709 "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
710 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
711 "li $10, " #shift " \n\t"\
712 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
713 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
714 "mtc1 $10, $f18 \n\t"\
715 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
716 "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
717 "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
718 "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
719 "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
720 "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
721 "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
722 "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
723 "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
724 "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
725 "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
726 "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\
727 "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\
728 "ldc1 $f2, 64(%2) \n\t"\
729 "pmaddhw $f2, $f2, $f4 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
730 "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
731 "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
732 "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
733 "psraw $f14, $f14, $f18 \n\t"\
734 "psraw $f8, $f8, $f18 \n\t"\
735 "mov.d $f6, $f0 \n\t" /* A1 a1 */\
736 "paddw $f0, $f0, $f2 \n\t" /* A1+B1 a1+b1 */\
737 "psubw $f6, $f6, $f2 \n\t" /* A1-B1 a1-b1 */\
738 "psraw $f0, $f0, $f18 \n\t"\
739 "psraw $f6, $f6, $f18 \n\t"\
740 "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\
741 "swc1 $f14, " #dst " \n\t"\
742 "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
743 "swc1 $f0, 16+" #dst " \n\t"\
744 "packsswh $f6, $f6, $f6 \n\t" /* A1-B1 a1-b1 */\
745 "swc1 $f6, 96+" #dst " \n\t"\
746 "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
747 "swc1 $f8, 112+" #dst " \n\t"\
748 "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
749 "ldc1 $f16, 96(%2) \n\t"\
750 "pmaddhw $f8, $f8, $f4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
751 "pmaddhw $f4, $f4, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
752 "mov.d $f6, $f10 \n\t" /* A2 a2 */\
753 "paddw $f6, $f6, $f8 \n\t" /* A2+B2 a2+b2 */\
754 "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\
755 "psraw $f6, $f6, $f18 \n\t"\
756 "psraw $f10, $f10, $f18 \n\t"\
757 "mov.d $f8, $f12 \n\t" /* A3 a3 */\
758 "paddw $f12, $f12, $f4 \n\t" /* A3+B3 a3+b3 */\
759 "psubw $f8, $f8, $f4 \n\t" /* a3-B3 a3-b3 */\
760 "psraw $f12, $f12, $f18 \n\t"\
761 "packsswh $f6, $f6, $f6 \n\t" /* A2+B2 a2+b2 */\
762 "swc1 $f6, 32+" #dst " \n\t"\
763 "psraw $f8, $f8, $f18 \n\t"\
764 "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
765 "swc1 $f12, 48+" #dst " \n\t"\
766 "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\
767 "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
768 "swc1 $f8, 64+" #dst " \n\t"\
769 "swc1 $f10, 80+" #dst " \n\t"
/* Second pass, temp (%1) -> block (%0), shift 20, four 4-byte output
 * slices per row. The src5 argument is passed but ignored by this variant. */
771 //IDCT( src0, src4, src1, src5, dst, shift)
772 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
773 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
774 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
775 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that reads only the two doublewords at src0 and 8+src0.
 *
 * Each input doubleword is multiplied (pmaddhw) against the coefficient
 * doublewords at 16(%2) and 24(%2), shifted right arithmetically by
 * shift, and packed (packsswh) into two result doublewords:
 *   $f8 is stored at dst + 0, 48, 64 and 112
 *   $f0 is stored at dst + 16, 32, 80 and 96
 *
 * Per the operand list of the enclosing asm statement, %0 is block,
 * %1 is temp and %2 is coeffs; the call sites pass N(%1) as sources and
 * N(%0) as dst.
 *
 * The src4, src1 and src5 arguments are never referenced by this expansion;
 * presumably they are kept so the call sites match the other IDCT variants.
 * Scratch registers used: $10, $f0, $f2, $f4, $f8, $f10, $f14, $f18.
 */
782 #define IDCT(src0, src4, src1, src5, dst, shift) \
783 "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
784 "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
785 "li $10, " #shift " \n\t" /* $10 = shift count (immediate) */\
786 "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
787 "mtc1 $10, $f18 \n\t" /* $f18 = shift count used by every psraw below */\
788 "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
789 "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
790 "psraw $f8, $f8, $f18 \n\t" /* scale first-half sums */\
791 "psraw $f0, $f0, $f18 \n\t" /* scale first-half differences */\
792 "ldc1 $f4, 8+" #src0 " \n\t" /* R4 R0 r4 r0 (second doubleword of the same row) */\
793 "ldc1 $f2, 16(%2) \n\t" /* C4 C4 C4 C4 */\
794 "pmaddhw $f2, $f2, $f4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
795 "ldc1 $f14, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
796 "pmaddhw $f4, $f4, $f14 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
797 "ldc1 $f14, 32(%2) \n\t" /* C6 C2 C6 C2 (not read again within this macro) */\
798 "psraw $f2, $f2, $f18 \n\t" /* scale second-half sums */\
799 "packsswh $f8, $f8, $f2 \n\t" /* A0 a0: both halves packed to four halfwords */\
800 "sdc1 $f8, " #dst " \n\t" /* dst + 0 */\
801 "psraw $f4, $f4, $f18 \n\t" /* scale second-half differences */\
802 "packsswh $f0, $f0, $f4 \n\t" /* A1 a1: both halves packed to four halfwords */\
803 "sdc1 $f0, 16+" #dst " \n\t" /* the remaining stores replicate the two */\
804 "sdc1 $f0, 96+" #dst " \n\t" /* packed results into the other six slots, */\
805 "sdc1 $f8, 112+" #dst " \n\t" /* spaced 16 bytes apart */\
806 "sdc1 $f0, 32+" #dst " \n\t"\
807 "sdc1 $f8, 48+" #dst " \n\t"\
808 "sdc1 $f8, 64+" #dst " \n\t"\
809 "sdc1 $f0, 80+" #dst " \n\t"
811 //IDCT( src0, src4, src1, src5, dst, shift)
812 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
813 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
814 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
815 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
818 ::"r"(block),"r"(temp),"r"(coeffs),"m"(wm1010),"m"(d40000)
/*
 * Pack int16_t coefficients into unsigned 8-bit pixels (packushb) and store
 * them row by row.
 *
 * Visible in this chunk are two identical asm sequences. Each one loads
 * 64 bytes (eight doublewords) through memory operand %3, packs them pairwise
 * into four 8-byte rows and stores those rows at
 * %0, %0 + %1, %0 + 2 * %1 and %0 + %2.
 * Operands: %0 = pix, %1 = (int)line_size, %2 = (int)line_size * 3, %3 = *p.
 *
 * NOTE(review): the declarations of pix and p (presumably aliases of pixels
 * and block), the code that advances them between the two sequences, and the
 * clobber lists are not visible in this chunk -- confirm against the full
 * file. The asm has no output operands although it stores through %0.
 * NOTE(review): line_size is narrowed from ptrdiff_t to int before use, and
 * the row address is formed with dadd (the overflow-trapping add) whereas
 * ff_put_signed_pixels_clamped_mmi uses daddu -- confirm this is intended.
 */
823 void ff_put_pixels_clamped_mmi(const int16_t *block,
824 uint8_t *av_restrict pixels, ptrdiff_t line_size)
/* First asm sequence: four output rows. */
833 "ldc1 $f0, 0+%3 \r\n" /* row 0, coefficients 0-3 */
834 "ldc1 $f2, 8+%3 \r\n" /* row 0, coefficients 4-7 */
835 "ldc1 $f4, 16+%3 \r\n" /* row 1, coefficients 0-3 */
836 "ldc1 $f6, 24+%3 \r\n" /* row 1, coefficients 4-7 */
837 "ldc1 $f8, 32+%3 \r\n" /* row 2, coefficients 0-3 */
838 "ldc1 $f10, 40+%3 \r\n" /* row 2, coefficients 4-7 */
839 "ldc1 $f12, 48+%3 \r\n" /* row 3, coefficients 0-3 */
840 "ldc1 $f14, 56+%3 \r\n" /* row 3, coefficients 4-7 */
841 "dadd $10, %0, %1 \r\n" /* $10 = address of row 1 */
842 "packushb $f0, $f0, $f2 \r\n" /* row 0 as 8 unsigned-saturated bytes */
843 "packushb $f4, $f4, $f6 \r\n" /* row 1 */
844 "packushb $f8, $f8, $f10 \r\n" /* row 2 */
845 "packushb $f12, $f12, $f14 \r\n" /* row 3 */
846 "sdc1 $f0, 0(%0) \r\n" /* store row 0 at %0 */
847 "sdc1 $f4, 0($10) \r\n" /* store row 1 at %0 + %1 */
848 "gssdxc1 $f8, 0($10, %1) \r\n" /* store row 2 at %0 + 2 * %1 */
849 "gssdxc1 $f12, 0(%0, %2) \r\n" /* store row 3 at %0 + %2 */
850 ::"r"(pix),"r"((int)line_size),
851 "r"((int)line_size*3),"m"(*p)
/*
 * Second asm sequence: same instruction stream as above. It presumably runs
 * after pix and p have been advanced to the next four rows -- that code is
 * not visible in this chunk.
 */
859 "ldc1 $f0, 0+%3 \r\n"
860 "ldc1 $f2, 8+%3 \r\n"
861 "ldc1 $f4, 16+%3 \r\n"
862 "ldc1 $f6, 24+%3 \r\n"
863 "ldc1 $f8, 32+%3 \r\n"
864 "ldc1 $f10, 40+%3 \r\n"
865 "ldc1 $f12, 48+%3 \r\n"
866 "ldc1 $f14, 56+%3 \r\n"
867 "dadd $10, %0, %1 \r\n" /* $10 = %0 + %1 */
868 "packushb $f0, $f0, $f2 \r\n"
869 "packushb $f4, $f4, $f6 \r\n"
870 "packushb $f8, $f8, $f10 \r\n"
871 "packushb $f12, $f12, $f14 \r\n"
872 "sdc1 $f0, 0(%0) \r\n" /* %0 */
873 "sdc1 $f4, 0($10) \r\n" /* %0 + %1 */
874 "gssdxc1 $f8, 0($10, %1) \r\n" /* %0 + 2 * %1 */
875 "gssdxc1 $f12, 0(%0, %2) \r\n" /* %0 + %2 */
876 ::"r"(pix),"r"((int)line_size),
877 "r"((int)line_size*3),"m"(*p)
/*
 * Pack int16_t coefficients from block into bytes with signed saturation
 * (packsshb), add the byte vector held in $f0 to every row (paddb) and store
 * eight 8-byte rows starting at pixels, one row every line_skip bytes.
 *
 * Operands:
 *   %0 = pixels      read/write; advanced by 4 * line_skip after row 3
 *   %1 = line_skip3  scratch output; holds 2 * and then 3 * line_skip
 *   %2 = block
 *   %3 = line_skip
 *   %4 = ff_pb_80
 *
 * NOTE(review): the instruction that loads $f0 is not visible in this chunk;
 * presumably it loads ff_pb_80 (%4), which no visible instruction references.
 * The declaration of line_skip3 and the clobber list are also not visible.
 */
882 void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
883 uint8_t *av_restrict pixels, ptrdiff_t line_size)
885 int64_t line_skip = line_size; /* 64-bit copy of the stride for the daddu arithmetic */
890 "daddu %1, %3, %3 \n\t" /* %1 = 2 * line_skip */
891 "ldc1 $f2, 0(%2) \n\t" /* row 0, coefficients 0-3 */
892 "ldc1 $f10, 8(%2) \n\t" /* row 0, coefficients 4-7 */
893 "packsshb $f2, $f2, $f10 \n\t" /* row 0 as 8 signed-saturated bytes */
894 "ldc1 $f4, 16(%2) \n\t"
895 "ldc1 $f10, 24(%2) \n\t"
896 "packsshb $f4, $f4, $f10 \n\t" /* row 1 */
897 "ldc1 $f6, 32(%2) \n\t"
898 "ldc1 $f10, 40(%2) \n\t"
899 "packsshb $f6, $f6, $f10 \n\t" /* row 2 */
900 "ldc1 $f8, 48(%2) \n\t"
901 "ldc1 $f10, 56(%2) \n\t"
902 "packsshb $f8, $f8, $f10 \n\t" /* row 3 */
903 "paddb $f2, $f2, $f0 \n\t" /* add the bias bytes in $f0 to rows 0-3 */
904 "paddb $f4, $f4, $f0 \n\t"
905 "paddb $f6, $f6, $f0 \n\t"
906 "paddb $f8, $f8, $f0 \n\t"
907 "sdc1 $f2, 0(%0) \n\t" /* row 0 at %0 */
908 "gssdxc1 $f4, 0(%0, %3) \n\t" /* row 1 at %0 + line_skip */
909 "gssdxc1 $f6, 0(%0, %1) \n\t" /* row 2 at %0 + 2 * line_skip */
910 "daddu %1, %1, %3 \n\t" /* %1 = 3 * line_skip */
911 "gssdxc1 $f8, 0(%0, %1) \n\t" /* row 3 at %0 + 3 * line_skip */
912 "daddu $10, %1, %3 \n\t" /* $10 = 4 * line_skip */
913 "daddu %0, %0, $10 \n\t" /* advance the destination to row 4 */
914 "ldc1 $f2, 64(%2) \n\t" /* second half: coefficients start 64 bytes in */
915 "ldc1 $f10, 8+64(%2) \n\t"
916 "packsshb $f2, $f2, $f10 \n\t" /* row 4 */
917 "ldc1 $f4, 16+64(%2) \n\t"
918 "ldc1 $f10, 24+64(%2) \n\t"
919 "packsshb $f4, $f4, $f10 \n\t" /* row 5 */
920 "ldc1 $f6, 32+64(%2) \n\t"
921 "ldc1 $f10, 40+64(%2) \n\t"
922 "packsshb $f6, $f6, $f10 \n\t" /* row 6 */
923 "ldc1 $f8, 48+64(%2) \n\t"
924 "ldc1 $f10, 56+64(%2) \n\t"
925 "packsshb $f8, $f8, $f10 \n\t" /* row 7 */
926 "paddb $f2, $f2, $f0 \n\t" /* add the bias bytes in $f0 to rows 4-7 */
927 "paddb $f4, $f4, $f0 \n\t"
928 "paddb $f6, $f6, $f0 \n\t"
929 "paddb $f8, $f8, $f0 \n\t"
930 "sdc1 $f2, 0(%0) \n\t" /* row 4 at the advanced %0 */
931 "gssdxc1 $f4, 0(%0, %3) \n\t" /* row 5 at %0 + line_skip */
932 "daddu $10, %3, %3 \n\t" /* $10 = 2 * line_skip (%1 still holds 3 *) */
933 "gssdxc1 $f6, 0(%0, $10) \n\t" /* row 6 at %0 + 2 * line_skip */
934 "gssdxc1 $f8, 0(%0, %1) \n\t" /* row 7 at %0 + 3 * line_skip */
935 : "+&r"(pixels),"=&r"(line_skip3)
936 : "r"(block),"r"(line_skip),"m"(ff_pb_80)
/*
 * Add int16_t coefficients to existing 8-bit pixels with saturation.
 *
 * The visible instructions handle two pixel rows at a time: the row bytes in
 * $f8 and $f12 are widened to halfwords against the zero register $f14,
 * added to the coefficients with signed saturation (paddsh), and packed back
 * to bytes with unsigned saturation (packushb). The two rows are the memory
 * operands %0 = *pix and %1 = *(pix + line_size).
 *
 * NOTE(review): the loads of $f8 and $f12, the stores of the packed results,
 * the input operand behind %2 (presumably the coefficient block), any
 * surrounding loop and the clobber list are not visible in this chunk --
 * confirm against the full file.
 */
941 void ff_add_pixels_clamped_mmi(const int16_t *block,
942 uint8_t *av_restrict pixels, ptrdiff_t line_size)
952 "xor $f14, $f14, $f14 \r\n" /* $f14 = 0, used as the high bytes when widening */
958 "ldc1 $f0, 0+%2 \r\n" /* first row, coefficients 0-3 */
959 "ldc1 $f2, 8+%2 \r\n" /* first row, coefficients 4-7 */
960 "ldc1 $f4, 16+%2 \r\n" /* second row, coefficients 0-3 */
961 "ldc1 $f6, 24+%2 \r\n" /* second row, coefficients 4-7 */
964 "mov.d $f10, $f8 \r\n" /* keep a copy of the first pixel row for the high half */
965 "punpcklbh $f8, $f8, $f14 \r\n" /* low 4 pixels widened to halfwords */
966 "punpckhbh $f10, $f10, $f14 \r\n" /* high 4 pixels widened to halfwords */
967 "paddsh $f0, $f0, $f8 \r\n" /* coefficients + pixels, signed saturation */
968 "paddsh $f2, $f2, $f10 \r\n"
969 "mov.d $f10, $f12 \r\n" /* same steps for the second pixel row */
970 "punpcklbh $f12, $f12, $f14 \r\n"
971 "punpckhbh $f10, $f10, $f14 \r\n"
972 "paddsh $f4, $f4, $f12 \r\n"
973 "paddsh $f6, $f6, $f10 \r\n"
974 "packushb $f0, $f0, $f2 \r\n" /* first row back to 8 unsigned-saturated bytes */
975 "packushb $f4, $f4, $f6 \r\n" /* second row back to 8 unsigned-saturated bytes */
978 : "+m"(*pix),"+m"(*(pix+line_size))
/*
 * Exported wrapper around the file-local simple_idct_mmi(): transforms
 * block in place and does nothing else.
 */
988 void ff_simple_idct_mmi(int16_t *block)
990 simple_idct_mmi(block);
/*
 * Run simple_idct_mmi() on block, then write the result to dest with
 * ff_put_pixels_clamped_mmi().
 *
 * dest      destination pixels
 * line_size stride passed on to ff_put_pixels_clamped_mmi(); declared
 *           int32_t here and widened to the ptrdiff_t parameter of the callee
 * block     coefficient block; modified in place by the transform
 */
993 void ff_simple_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block)
995 simple_idct_mmi(block);
996 ff_put_pixels_clamped_mmi(block, dest, line_size);
/*
 * Run simple_idct_mmi() on block, then add the result onto dest with
 * ff_add_pixels_clamped_mmi().
 *
 * dest      destination pixels, read and updated by the add step
 * line_size stride passed on to ff_add_pixels_clamped_mmi(); declared
 *           int32_t here and widened to the ptrdiff_t parameter of the callee
 * block     coefficient block; modified in place by the transform
 */
999 void ff_simple_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block)
1001 simple_idct_mmi(block);
1002 ff_add_pixels_clamped_mmi(block, dest, line_size);