2 * Loongson SIMD optimized mpegvideo
4 * Copyright (c) 2015 Loongson Technology Corporation Limited
5 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "mpegvideo_mips.h"
26 #include "libavutil/mips/asmdefs.h"
/*
 * H.263 intra-block dequantisation written with Loongson MMI inline asm.
 *
 * From the statements visible here: the DC term block[0] is scaled by
 * s->y_dc_scale or s->c_dc_scale into `level`, qadd is (qscale - 1) | 1,
 * and the coefficient count is read from s->inter_scantable.raster_end[].
 * qmul is assigned on a line that is not part of this excerpt.
 *
 * The asm loop processes eight 16-bit coefficients per pass and maps each
 * coefficient c to:  c*qmul + qadd  (c > 0),  c*qmul - qadd  (c < 0),
 * 0 (c == 0).
 *
 * NOTE(review): %[nCoeffs] is declared as an input-only "r" operand but is
 * modified by PTR_ADDIU inside the loop; GCC expects modified operands to be
 * outputs (e.g. "+r") - confirm and fix in the complete source.
 */
28 void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
31 int64_t level, qmul, qadd, nCoeffs;
36 av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
/* DC coefficient: scaled with the luma or the chroma DC scale. */
40 level = block[0] * s->y_dc_scale;
42 level = block[0] * s->c_dc_scale;
43 qadd = (qscale-1) | 1;
52 nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
/*
 * Setup: ftmp0 = 0; the two packsswh per value are intended to replicate
 * qmul / qadd into every 16-bit lane; then ftmp0 = 0 - qadd and ftmp5 = 0.
 */
55 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
56 "packsswh %[qmul], %[qmul], %[qmul] \n\t"
57 "packsswh %[qmul], %[qmul], %[qmul] \n\t"
58 "packsswh %[qadd], %[qadd], %[qadd] \n\t"
59 "packsswh %[qadd], %[qadd], %[qadd] \n\t"
60 "psubh %[ftmp0], %[ftmp0], %[qadd] \n\t"
61 "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
/* Loop body: addr0 = block operand + (negative) byte offset in nCoeffs. */
64 PTR_ADDU "%[addr0], %[block], %[nCoeffs] \n\t"
/* Two 8-byte loads (left/right pairs) from addr0 and addr0 + 8. */
65 "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
66 "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
67 "gsldlc1 %[ftmp2], 0x0f(%[addr0]) \n\t"
68 "gsldrc1 %[ftmp2], 0x08(%[addr0]) \n\t"
/* Keep copies of the raw coefficients for the sign test below. */
69 "mov.d %[ftmp3], %[ftmp1] \n\t"
70 "mov.d %[ftmp4], %[ftmp2] \n\t"
71 "pmullh %[ftmp1], %[ftmp1], %[qmul] \n\t"
72 "pmullh %[ftmp2], %[ftmp2], %[qmul] \n\t"
/* ftmp3/ftmp4 = per-lane mask of coefficients greater than zero. */
73 "pcmpgth %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
74 "pcmpgth %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
/*
 * Complement the product where c > 0, add -qadd, then xor with the mask
 * again: the net effect is +qadd for positive and -qadd for negative lanes.
 */
75 "xor %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
76 "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
77 "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
78 "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
79 "xor %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
80 "xor %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
/* Lanes whose intermediate equals -qadd (input was zero) are cleared. */
81 "pcmpeqh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
82 "pcmpeqh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
83 "pandn %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
84 "pandn %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
/* Advance the byte offset by 16; the stores still use addr0 from loop top. */
85 PTR_ADDIU "%[nCoeffs], %[nCoeffs], 0x10 \n\t"
86 "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
87 "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
88 "gssdlc1 %[ftmp2], 0x0f(%[addr0]) \n\t"
89 "gssdrc1 %[ftmp2], 0x08(%[addr0]) \n\t"
/* Repeat while the offset is still <= 0. */
90 "blez %[nCoeffs], 1b \n\t"
91 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
92 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
93 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
/*
 * Inputs: block points at block + nCoeffs and nCoeffs is passed as the
 * negative byte offset -2 * nCoeffs, so the loop counts up towards zero.
 */
95 : [block]"r"((mips_reg)(block+nCoeffs)),
96 [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
97 [qmul]"f"(qmul), [qadd]"f"(qadd)
/*
 * H.263 inter-block dequantisation written with Loongson MMI inline asm.
 *
 * Visible setup: qadd is (qscale - 1) | 1 and the coefficient count is read
 * from s->inter_scantable.raster_end[]; no DC scaling appears in the lines
 * shown. qmul is assigned on a line that is not part of this excerpt.
 *
 * The asm loop processes eight 16-bit coefficients per pass and maps each
 * coefficient c to:  c*qmul + qadd  (c > 0),  c*qmul - qadd  (c < 0),
 * 0 (c == 0).
 *
 * NOTE(review): %[nCoeffs] is declared as an input-only "r" operand but is
 * modified by PTR_ADDIU inside the loop; GCC expects modified operands to be
 * outputs (e.g. "+r") - confirm and fix in the complete source.
 */
104 void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
107 int64_t qmul, qadd, nCoeffs;
112 qadd = (qscale - 1) | 1;
113 av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
114 nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
/*
 * Setup: the two packsswh per value are intended to replicate qmul / qadd
 * into every 16-bit lane; ftmp0 becomes 0 - qadd and ftmp5 is zeroed.
 */
117 "packsswh %[qmul], %[qmul], %[qmul] \n\t"
118 "packsswh %[qmul], %[qmul], %[qmul] \n\t"
119 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
120 "packsswh %[qadd], %[qadd], %[qadd] \n\t"
121 "packsswh %[qadd], %[qadd], %[qadd] \n\t"
122 "psubh %[ftmp0], %[ftmp0], %[qadd] \n\t"
123 "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
/* Loop body: addr0 = block operand + (negative) byte offset in nCoeffs. */
126 PTR_ADDU "%[addr0], %[block], %[nCoeffs] \n\t"
/* Two 8-byte loads (left/right pairs) from addr0 and addr0 + 8. */
127 "gsldlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
128 "gsldrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
129 "gsldlc1 %[ftmp2], 0x0f(%[addr0]) \n\t"
130 "gsldrc1 %[ftmp2], 0x08(%[addr0]) \n\t"
/* Keep copies of the raw coefficients for the sign test below. */
131 "mov.d %[ftmp3], %[ftmp1] \n\t"
132 "mov.d %[ftmp4], %[ftmp2] \n\t"
133 "pmullh %[ftmp1], %[ftmp1], %[qmul] \n\t"
134 "pmullh %[ftmp2], %[ftmp2], %[qmul] \n\t"
/* ftmp3/ftmp4 = per-lane mask of coefficients greater than zero. */
135 "pcmpgth %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
136 "pcmpgth %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
/*
 * Complement the product where c > 0, add -qadd, then xor with the mask
 * again: the net effect is +qadd for positive and -qadd for negative lanes.
 */
137 "xor %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
138 "xor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
139 "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
140 "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
141 "xor %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
142 "xor %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
/* Lanes whose intermediate equals -qadd (input was zero) are cleared. */
143 "pcmpeqh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
144 "pcmpeqh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
145 "pandn %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
146 "pandn %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
/* Advance the byte offset by 16; the stores still use addr0 from loop top. */
147 PTR_ADDIU "%[nCoeffs], %[nCoeffs], 0x10 \n\t"
148 "gssdlc1 %[ftmp1], 0x07(%[addr0]) \n\t"
149 "gssdrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
150 "gssdlc1 %[ftmp2], 0x0f(%[addr0]) \n\t"
151 "gssdrc1 %[ftmp2], 0x08(%[addr0]) \n\t"
/* Repeat while the offset is still <= 0. */
152 "blez %[nCoeffs], 1b \n\t"
153 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
154 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
155 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
156 [addr0]"=&r"(addr[0])
/*
 * Inputs: block points at block + nCoeffs and nCoeffs is passed as the
 * negative byte offset -2 * nCoeffs, so the loop counts up towards zero.
 */
157 : [block]"r"((mips_reg)(block+nCoeffs)),
158 [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
159 [qmul]"f"(qmul), [qadd]"f"(qadd)
/*
 * MPEG-1 intra-block dequantisation written with Loongson MMI inline asm.
 *
 * Visible setup: the coefficient count is
 * s->intra_scantable.raster_end[last index] + 1, the scaled DC value
 * (block[0] times the luma or chroma DC scale) is kept in block0, and the
 * quantiser matrix is s->intra_matrix. Where block0 is written back is not
 * part of this excerpt.
 *
 * Per 16-bit coefficient c with matrix entry q the loop computes, on the
 * low 16 bits of each product:
 *     v = (|c| * (q * qscale)) >> 3;  v = (v - 1) | 1;
 *     result = (c == 0) ? 0 : (c < 0 ? -v : v)
 * Eight coefficients are handled per pass.
 */
164 void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
168 const uint16_t *quant_matrix;
174 av_assert2(s->block_last_index[n]>=0);
175 nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
178 block0 = block[0] * s->y_dc_scale;
180 block0 = block[0] * s->c_dc_scale;
182 /* XXX: only mpeg1 */
183 quant_matrix = s->intra_matrix;
/*
 * Setup: ftmp0 = all ones shifted right by 15 per lane, i.e. 1 in every
 * 16-bit lane; ftmp1 = qscale, packed twice with the intent of replicating
 * it into every lane; addr0 = the negative byte offset passed as nCoeffs.
 */
186 "dli %[tmp0], 0x0f \n\t"
187 "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
188 "dmtc1 %[tmp0], %[ftmp4] \n\t"
189 "dmtc1 %[qscale], %[ftmp1] \n\t"
190 "psrlh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
191 "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
192 "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
193 "or %[addr0], %[nCoeffs], $0 \n\t"
/* Loop body: load eight coefficients and keep untouched copies. */
196 "gsldxc1 %[ftmp2], 0x00(%[addr0], %[block]) \n\t"
197 "gsldxc1 %[ftmp3], 0x08(%[addr0], %[block]) \n\t"
198 "mov.d %[ftmp4], %[ftmp2] \n\t"
199 "mov.d %[ftmp5], %[ftmp3] \n\t"
/* Matching eight quantiser-matrix entries, pre-multiplied by qscale. */
200 "gsldxc1 %[ftmp6], 0x00(%[addr0], %[quant]) \n\t"
201 "gsldxc1 %[ftmp7], 0x08(%[addr0], %[quant]) \n\t"
202 "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
203 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
/* ftmp8/ftmp9 = mask of negative lanes; xor + subtract yields |c|. */
204 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
205 "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
206 "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
207 "pcmpgth %[ftmp9], %[ftmp9], %[ftmp3] \n\t"
208 "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
209 "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
210 "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
211 "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
212 "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
213 "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
/* ftmp6/ftmp7 = mask of lanes whose original coefficient was zero. */
214 "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
215 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
216 "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
/* ftmp4 is reused as the shift count (3) once the compare has read it. */
217 "dli %[tmp0], 0x03 \n\t"
218 "pcmpeqh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
219 "dmtc1 %[tmp0], %[ftmp4] \n\t"
220 "psrah %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
221 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
/* (v - 1) | 1 */
222 "psubh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
223 "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
224 "or %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
225 "or %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
/* Re-apply the original sign, then clear lanes that were zero on input. */
226 "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
227 "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
228 "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
229 "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
230 "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
231 "pandn %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
232 "gssdxc1 %[ftmp6], 0x00(%[addr0], %[block]) \n\t"
233 "gssdxc1 %[ftmp7], 0x08(%[addr0], %[block]) \n\t"
/* Next 16 bytes; repeat while the offset is still negative. */
234 PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t"
235 "bltz %[addr0], 1b \n\t"
236 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
237 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
238 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
239 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
240 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
242 [addr0]"=&r"(addr[0])
/*
 * Inputs: block and quant point nCoeffs elements past their start, and
 * nCoeffs is passed as the negative byte offset -2 * nCoeffs.
 */
243 : [block]"r"((mips_reg)(block+nCoeffs)),
244 [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
245 [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
/*
 * MPEG-1 inter-block dequantisation written with Loongson MMI inline asm.
 *
 * Visible setup: the coefficient count is
 * s->intra_scantable.raster_end[last index] + 1 and the quantiser matrix is
 * s->inter_matrix.
 * NOTE(review): the count is taken from intra_scantable although this is the
 * inter routine - confirm this is intended.
 *
 * Per 16-bit coefficient c with matrix entry q the loop computes, on the
 * low 16 bits of each product:
 *     v = ((2 * |c| + 1) * (q * qscale)) >> 4;  v = (v - 1) | 1;
 *     result = (c == 0) ? 0 : (c < 0 ? -v : v)
 * Eight coefficients are handled per pass.
 */
253 void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
257 const uint16_t *quant_matrix;
262 av_assert2(s->block_last_index[n] >= 0);
263 nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
264 quant_matrix = s->inter_matrix;
/*
 * Setup: ftmp0 = all ones shifted right by 15 per lane, i.e. 1 in every
 * 16-bit lane; ftmp1 = qscale, packed twice with the intent of replicating
 * it into every lane; addr0 = the negative byte offset passed as nCoeffs.
 */
267 "dli %[tmp0], 0x0f \n\t"
268 "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
269 "dmtc1 %[tmp0], %[ftmp4] \n\t"
270 "dmtc1 %[qscale], %[ftmp1] \n\t"
271 "psrlh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
272 "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
273 "packsswh %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
274 "or %[addr0], %[nCoeffs], $0 \n\t"
/* Loop body: load eight coefficients and keep untouched copies. */
277 "gsldxc1 %[ftmp2], 0x00(%[addr0], %[block]) \n\t"
278 "gsldxc1 %[ftmp3], 0x08(%[addr0], %[block]) \n\t"
279 "mov.d %[ftmp4], %[ftmp2] \n\t"
280 "mov.d %[ftmp5], %[ftmp3] \n\t"
/* Matching eight quantiser-matrix entries, pre-multiplied by qscale. */
281 "gsldxc1 %[ftmp6], 0x00(%[addr0], %[quant]) \n\t"
282 "gsldxc1 %[ftmp7], 0x08(%[addr0], %[quant]) \n\t"
283 "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
284 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
/* ftmp8/ftmp9 = mask of negative lanes; xor + subtract yields |c|. */
285 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
286 "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
287 "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
288 "pcmpgth %[ftmp9], %[ftmp9], %[ftmp3] \n\t"
289 "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
290 "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
291 "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
292 "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
/* 2 * |c| + 1 */
293 "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
294 "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
295 "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
296 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
297 "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
298 "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
/* ftmp6/ftmp7 = mask of lanes whose original coefficient was zero. */
299 "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
300 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
301 "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
/* ftmp4 is reused as the shift count (4) once the compare has read it. */
302 "dli %[tmp0], 0x04 \n\t"
303 "pcmpeqh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
304 "dmtc1 %[tmp0], %[ftmp4] \n\t"
305 "psrah %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
306 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
/* (v - 1) | 1 */
307 "psubh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
308 "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
309 "or %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
310 "or %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
/* Re-apply the original sign, then clear lanes that were zero on input. */
311 "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
312 "xor %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
313 "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
314 "psubh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
315 "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
316 "pandn %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
317 "gssdxc1 %[ftmp6], 0x00(%[addr0], %[block]) \n\t"
318 "gssdxc1 %[ftmp7], 0x08(%[addr0], %[block]) \n\t"
/* Next 16 bytes; repeat while the offset is still negative. */
319 PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t"
320 "bltz %[addr0], 1b \n\t"
321 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
322 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
323 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
324 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
325 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
327 [addr0]"=&r"(addr[0])
/*
 * Inputs: block and quant point nCoeffs elements past their start, and
 * nCoeffs is passed as the negative byte offset -2 * nCoeffs.
 */
328 : [block]"r"((mips_reg)(block+nCoeffs)),
329 [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
330 [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
/*
 * MPEG-2 intra-block dequantisation written with Loongson MMI inline asm.
 *
 * Visible setup: the coefficient count is read from
 * s->intra_scantable.raster_end[] (the statements around the
 * s->alternate_scan test are only partly visible in this excerpt), the
 * scaled DC value is kept in block0, and the quantiser matrix is
 * s->intra_matrix.
 *
 * Per 16-bit coefficient c with matrix entry q the loop computes, on the
 * low 16 bits of each product:
 *     v = (|c| * (q * qscale)) >> 3;
 *     result = (c == 0) ? 0 : (c < 0 ? -v : v)
 *
 * NOTE(review): this routine uses plain assert() where the sibling routines
 * use av_assert2() - confirm which is intended.
 * NOTE(review): ftmp0 is set to 1 per lane during setup but is not referenced
 * by any loop instruction visible here.
 */
336 void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
340 const uint16_t *quant_matrix;
346 assert(s->block_last_index[n]>=0);
348 if (s->alternate_scan)
351 nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
354 block0 = block[0] * s->y_dc_scale;
356 block0 = block[0] * s->c_dc_scale;
358 quant_matrix = s->intra_matrix;
/*
 * Setup: ftmp0 = 1 per 16-bit lane, ftmp9 = qscale packed twice with the
 * intent of replicating it into every lane, addr0 = negative byte offset.
 * NOTE(review): mtc1 is used here where the MPEG-1 routines use dmtc1 -
 * confirm the upper half of the FP register does not matter.
 */
361 "dli %[tmp0], 0x0f \n\t"
362 "pcmpeqh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
363 "mtc1 %[tmp0], %[ftmp3] \n\t"
364 "mtc1 %[qscale], %[ftmp9] \n\t"
365 "psrlh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
366 "packsswh %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
367 "packsswh %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
368 "or %[addr0], %[nCoeffs], $0 \n\t"
/* Loop body: load eight coefficients and keep untouched copies. */
371 "gsldxc1 %[ftmp1], 0x00(%[addr0], %[block]) \n\t"
372 "gsldxc1 %[ftmp2], 0x08(%[addr0], %[block]) \n\t"
373 "mov.d %[ftmp3], %[ftmp1] \n\t"
374 "mov.d %[ftmp4], %[ftmp2] \n\t"
/*
 * NOTE(review): both quantiser-matrix loads use offset 0x00, so ftmp6 gets
 * the same four entries as ftmp5. The coefficient loads above and the
 * sibling routines use 0x00 and 0x08 - the second load looks like it should
 * use 0x08; confirm.
 */
375 "gsldxc1 %[ftmp5], 0x00(%[addr0], %[quant]) \n\t"
376 "gsldxc1 %[ftmp6], 0x00(%[addr0], %[quant]) \n\t"
377 "pmullh %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
378 "pmullh %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
/* ftmp7/ftmp8 = mask of negative lanes; xor + subtract yields |c|. */
379 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
380 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
381 "pcmpgth %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
382 "pcmpgth %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
383 "xor %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
384 "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
385 "psubh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
386 "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
387 "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
388 "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
/* ftmp5/ftmp6 = mask of lanes whose original coefficient was zero. */
389 "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
390 "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
391 "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
/* ftmp3 is reused as the shift count (3) once the compare has read it. */
392 "dli %[tmp0], 0x03 \n\t"
393 "pcmpeqh %[ftmp6] , %[ftmp6], %[ftmp4] \n\t"
394 "mtc1 %[tmp0], %[ftmp3] \n\t"
395 "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
396 "psrah %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
/* Re-apply the original sign, then clear lanes that were zero on input. */
397 "xor %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
398 "xor %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
399 "psubh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
400 "psubh %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
401 "pandn %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
402 "pandn %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
/*
 * NOTE(review): addr0 is advanced by 0x10 before the two stores, so the
 * results are written 16 bytes past the position they were loaded from.
 * The MPEG-1 routines store first and advance afterwards - confirm whether
 * the increment belongs after the stores.
 */
403 PTR_ADDIU "%[addr0], %[addr0], 0x10 \n\t"
404 "gssdxc1 %[ftmp5], 0x00(%[addr0], %[block]) \n\t"
405 "gssdxc1 %[ftmp6], 0x08(%[addr0], %[block]) \n\t"
/* Repeat while the offset is still <= 0. */
406 "blez %[addr0], 1b \n\t"
407 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
408 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
409 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
410 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
411 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
413 [addr0]"=&r"(addr[0])
/*
 * Inputs: block and quant point nCoeffs elements past their start, and
 * nCoeffs is passed as the negative byte offset -2 * nCoeffs.
 */
414 : [block]"r"((mips_reg)(block+nCoeffs)),
415 [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
416 [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
/*
 * DCT-domain denoising of one block using Loongson MMI inline asm.
 *
 * Selects the per-mode tables with s->mb_intra (s->dct_error_sum[intra] and
 * s->dct_offset[intra]) and increments s->dct_count[intra].
 *
 * Each loop pass handles eight 16-bit coefficients:
 *   - |c| is added (widened to 32 bits) to the matching eight entries of sum;
 *   - c is replaced in place by sign(c) * max(|c| - offset, 0), using an
 *     unsigned saturating subtract.
 * The loop runs until the block pointer reaches block + 64.
 */
424 void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block)
426 const int intra = s->mb_intra;
427 int *sum = s->dct_error_sum[intra];
428 uint16_t *offset = s->dct_offset[intra];
432 s->dct_count[intra]++;
/* ftmp0 stays zero; it is the zero-extension source for the unpacks below. */
435 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
/* Load eight coefficients and build negative-lane masks in ftmp2/ftmp4. */
437 "ldc1 %[ftmp1], 0x00(%[block]) \n\t"
438 "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
439 "ldc1 %[ftmp3], 0x08(%[block]) \n\t"
440 "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
441 "pcmpgth %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
442 "pcmpgth %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
/* |c| = (c ^ mask) - mask */
443 "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
444 "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
445 "psubh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
446 "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
/* Save |c| in ftmp5/ftmp7, then subtract the offsets with saturation. */
447 "ldc1 %[ftmp6], 0x00(%[offset]) \n\t"
448 "mov.d %[ftmp5], %[ftmp1] \n\t"
449 "psubush %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
450 "ldc1 %[ftmp6], 0x08(%[offset]) \n\t"
451 "mov.d %[ftmp7], %[ftmp3] \n\t"
452 "psubush %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
/* Re-apply the original sign and write the coefficients back. */
453 "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
454 "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
455 "psubh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
456 "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
457 "sdc1 %[ftmp1], 0x00(%[block]) \n\t"
458 "sdc1 %[ftmp3], 0x08(%[block]) \n\t"
/* Widen the saved |c| values to 32 bits by interleaving with zero. */
459 "mov.d %[ftmp1], %[ftmp5] \n\t"
460 "mov.d %[ftmp3], %[ftmp7] \n\t"
461 "punpcklhw %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
462 "punpckhhw %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
463 "punpcklhw %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
464 "punpckhhw %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
/* Accumulate into the eight 32-bit sum entries (4 x 8 bytes). */
465 "ldc1 %[ftmp2], 0x00(%[sum]) \n\t"
466 "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
467 "ldc1 %[ftmp2], 0x08(%[sum]) \n\t"
468 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
469 "ldc1 %[ftmp2], 0x10(%[sum]) \n\t"
470 "paddw %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
471 "ldc1 %[ftmp2], 0x18(%[sum]) \n\t"
472 "paddw %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
473 "sdc1 %[ftmp5], 0x00(%[sum]) \n\t"
474 "sdc1 %[ftmp1], 0x08(%[sum]) \n\t"
475 "sdc1 %[ftmp7], 0x10(%[sum]) \n\t"
476 "sdc1 %[ftmp3], 0x18(%[sum]) \n\t"
/* Advance all three pointers; loop while block has not reached block1. */
477 PTR_ADDIU "%[block], %[block], 0x10 \n\t"
478 PTR_ADDIU "%[sum], %[sum], 0x20 \n\t"
479 PTR_SUBU "%[addr0], %[block1], %[block] \n\t"
480 PTR_ADDIU "%[offset], %[offset], 0x10 \n\t"
481 "bgtz %[addr0], 1b \n\t"
482 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
483 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
484 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
485 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
486 [addr0]"=&r"(addr[0]),
/* block, sum and offset are read-write operands because the asm advances them. */
487 [block]"+&r"(block), [sum]"+&r"(sum),
488 [offset]"+&r"(offset)
/* block1 = end of the 64-coefficient block. */
489 : [block1]"r"(block+64)