2 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
4 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "libavutil/avassert.h"
24 #include "libavcodec/vc1dsp.h"
25 #include "constants.h"
26 #include "vc1dsp_mips.h"
27 #include "hpeldsp_mips.h"
28 #include "libavutil/mips/mmiutils.h"
/*
 * NOTE(review): this file has been mangled by an extraction pass -- every
 * line carries a stray leading copy of its original line number, and some
 * original lines are missing.  For this macro the parameter line declaring
 * the scratch/output names (presumably t1..t4, o1..o4 -- TODO confirm
 * against the upstream file) is absent between the two visible parameter
 * lines.  Original bytes are left untouched; only comments are added.
 *
 * Odd-phase step of the 8-point VC-1 inverse transform: from four input
 * vectors fp1..fp4 and four per-element constant vectors ff_p1..ff_p4 it
 * builds four outputs o1..o4, each a signed paddh/psubh combination of
 * pmullh (packed 16-bit multiply) products, using t1..t4 as scratch.
 */
31 #define VC1_INV_TRANCS_8_STEP1_MMI(fp1, fp2, fp3, fp4, \
34 ff_p1, ff_p2, ff_p3, ff_p4) \
35 "pmullh "#t1" , "#fp1" , "#ff_p1" \n\t" \
36 "pmullh "#t2" , "#fp2" , "#ff_p2" \n\t" \
37 "pmullh "#t3" , "#fp3" , "#ff_p3" \n\t" \
38 "pmullh "#t4" , "#fp4" , "#ff_p4" \n\t" \
39 "paddh "#o1" , "#t1" , "#t2" \n\t" \
40 "paddh "#o1" , "#o1" , "#t3" \n\t" \
41 "paddh "#o1" , "#o1" , "#t4" \n\t" \
43 "pmullh "#t1" , "#fp1" , "#ff_p2" \n\t" \
44 "pmullh "#t2" , "#fp2" , "#ff_p4" \n\t" \
45 "pmullh "#t3" , "#fp3" , "#ff_p1" \n\t" \
46 "pmullh "#t4" , "#fp4" , "#ff_p3" \n\t" \
47 "psubh "#o2" , "#t1" , "#t2" \n\t" \
48 "psubh "#o2" , "#o2" , "#t3" \n\t" \
49 "psubh "#o2" , "#o2" , "#t4" \n\t" \
51 "pmullh "#t1" , "#fp1" , "#ff_p3" \n\t" \
52 "pmullh "#t2" , "#fp2" , "#ff_p1" \n\t" \
53 "pmullh "#t3" , "#fp3" , "#ff_p4" \n\t" \
54 "pmullh "#t4" , "#fp4" , "#ff_p2" \n\t" \
55 "psubh "#o3" , "#t1" , "#t2" \n\t" \
56 "paddh "#o3" , "#o3" , "#t3" \n\t" \
57 "paddh "#o3" , "#o3" , "#t4" \n\t" \
59 "pmullh "#t1" , "#fp1" , "#ff_p4" \n\t" \
60 "pmullh "#t2" , "#fp2" , "#ff_p3" \n\t" \
61 "pmullh "#t3" , "#fp3" , "#ff_p2" \n\t" \
62 "pmullh "#t4" , "#fp4" , "#ff_p1" \n\t" \
63 "psubh "#o4" , "#t1" , "#t2" \n\t" \
64 "paddh "#o4" , "#o4" , "#t3" \n\t" \
65 "psubh "#o4" , "#o4" , "#t4" \n\t"
/*
 * Even-phase step plus final butterfly of the 8-point VC-1 inverse
 * transform.  From even inputs fp1..fp4 it forms the four even partial
 * sums (with rounding constant ff_pw folded in), then combines them with
 * the odd outputs o1..o4 produced by VC1_INV_TRANCS_8_STEP1_MMI:
 * fp5..fp8 receive even+odd, fp1..fp4 receive even-odd (mirrored order).
 * NOTE(review): extraction artifact -- the parameter line naming
 * fp5..fp8 / o1..o4 is missing from this view, and every line carries a
 * stray leading number; original bytes are untouched.
 */
68 #define VC1_INV_TRANCS_8_STEP2_MMI(fp1, fp2, fp3, fp4, \
71 ff_p1, ff_p2, ff_p3, ff_pw) \
72 "paddh "#fp5" , "#fp1" , "#fp2" \n\t" \
73 "psubh "#fp6" , "#fp1" , "#fp2" \n\t" \
74 "pmullh "#fp5" , "#fp5" , "#ff_p1" \n\t" \
75 "pmullh "#fp6" , "#fp6" , "#ff_p1" \n\t" \
76 "paddh "#fp5" , "#fp5" , "#ff_pw" \n\t" \
77 "paddh "#fp6" , "#fp6" , "#ff_pw" \n\t" \
79 "pmullh "#fp1" , "#fp3" , "#ff_p2" \n\t" \
80 "pmullh "#fp2" , "#fp4" , "#ff_p3" \n\t" \
81 "pmullh "#fp3" , "#fp3" , "#ff_p3" \n\t" \
82 "pmullh "#fp4" , "#fp4" , "#ff_p2" \n\t" \
83 "paddh "#fp7" , "#fp1" , "#fp2" \n\t" \
84 "psubh "#fp8" , "#fp3" , "#fp4" \n\t" \
86 "paddh "#fp1" , "#fp5" , "#fp7" \n\t" \
87 "paddh "#fp2" , "#fp6" , "#fp8" \n\t" \
88 "psubh "#fp3" , "#fp6" , "#fp8" \n\t" \
89 "psubh "#fp4" , "#fp5" , "#fp7" \n\t" \
91 "paddh "#fp5" , "#fp1" , "#o1" \n\t" \
92 "paddh "#fp6" , "#fp2" , "#o2" \n\t" \
93 "paddh "#fp7" , "#fp3" , "#o3" \n\t" \
94 "paddh "#fp8" , "#fp4" , "#o4" \n\t" \
96 "psubh "#fp4" , "#fp4" , "#o4" \n\t" \
97 "psubh "#fp3" , "#fp3" , "#o3" \n\t" \
98 "psubh "#fp2" , "#fp2" , "#o2" \n\t" \
99 "psubh "#fp1" , "#fp1" , "#o1" \n\t"
/*
 * Full 4-point VC-1 inverse transform step (even part, odd part, and the
 * combining butterfly) on packed 16-bit vectors.  Inputs fp1..fp4 are the
 * four coefficient vectors; ff_p1..ff_p3 are the constant multiplier
 * vectors (callers pass ff_pw_17 / ff_pw_10 / ff_pw_22) and ff_pw is the
 * rounding bias added before the caller's right shift.  Results land back
 * in fp1..fp4; fp5..fp8 are scratch.
 * NOTE(review): each line below carries a stray leading number from a
 * broken extraction pass; original bytes are untouched.
 */
102 #define VC1_INV_TRANCS_4_STEP1_MMI(fp1, fp2, fp3, fp4, \
103 fp5, fp6, fp7, fp8, \
104 ff_p1, ff_p2, ff_p3, ff_pw) \
105 "paddh "#fp5" , "#fp1" , "#fp2" \n\t" \
106 "psubh "#fp6" , "#fp1" , "#fp2" \n\t" \
107 "pmullh "#fp5" , "#fp5" , "#ff_p1" \n\t" \
108 "pmullh "#fp6" , "#fp6" , "#ff_p1" \n\t" \
109 "paddh "#fp5" , "#fp5" , "#ff_pw" \n\t" \
110 "paddh "#fp6" , "#fp6" , "#ff_pw" \n\t" \
112 "pmullh "#fp1" , "#fp3" , "#ff_p2" \n\t" \
113 "pmullh "#fp2" , "#fp4" , "#ff_p3" \n\t" \
114 "pmullh "#fp3" , "#fp3" , "#ff_p3" \n\t" \
115 "pmullh "#fp4" , "#fp4" , "#ff_p2" \n\t" \
116 "paddh "#fp7" , "#fp1" , "#fp2" \n\t" \
117 "psubh "#fp8" , "#fp3" , "#fp4" \n\t" \
119 "paddh "#fp1" , "#fp5" , "#fp7" \n\t" \
120 "psubh "#fp2" , "#fp6" , "#fp8" \n\t" \
121 "paddh "#fp3" , "#fp6" , "#fp8" \n\t" \
122 "psubh "#fp4" , "#fp5" , "#fp7" \n\t"
/*
 * Final add-to-destination step shared by the column passes: widen the
 * four destination pixel vectors fp5..fp8 from u8 to s16 (punpcklbh with
 * a zero register), add the transformed residuals fp1..fp4, and pack back
 * to unsigned bytes with saturation (packushb), leaving the results in
 * the low halves of fp1..fp4.
 * NOTE(review): stray leading numbers on every line are an extraction
 * artifact; original bytes are untouched.
 */
125 #define VC1_INV_TRANCS_4_STEP2_MMI(fp1, fp2, fp3, fp4, \
126 fp5, fp6, fp7, fp8, zero) \
127 "punpcklbh "#fp5" , "#fp5" , "#zero" \n\t" \
128 "punpcklbh "#fp6" , "#fp6" , "#zero" \n\t" \
129 "punpcklbh "#fp7" , "#fp7" , "#zero" \n\t" \
130 "punpcklbh "#fp8" , "#fp8" , "#zero" \n\t" \
132 "paddh "#fp1" , "#fp1" , "#fp5" \n\t" \
133 "paddh "#fp2" , "#fp2" , "#fp6" \n\t" \
134 "paddh "#fp3" , "#fp3" , "#fp7" \n\t" \
135 "paddh "#fp4" , "#fp4" , "#fp8" \n\t" \
137 "packushb "#fp1" , "#fp1" , "#zero" \n\t" \
138 "packushb "#fp2" , "#fp2" , "#zero" \n\t" \
139 "packushb "#fp3" , "#fp3" , "#zero" \n\t" \
140 "packushb "#fp4" , "#fp4" , "#zero" \n\t"
143 /* Do inverse transform on 8x8 block */
/*
 * DC-only inverse transform + add for an 8x8 block: the lone DC
 * coefficient is scaled ((3*dc+1)>>1 then (3*dc+16)>>5), broadcast across
 * a packed-16-bit vector (pshufh), and added with signed saturation
 * (paddsh) to all 64 destination pixels, processing 4 rows per loop
 * iteration (count = 2).
 * NOTE(review): extraction has dropped lines from this view -- the local
 * declarations (dc, ftmp[], addr[], count), the read of block[0], the
 * __asm__ volatile opener, the "1:" loop label, and the closing of the
 * asm statement/function are all missing, and every remaining line
 * carries a stray leading number.  Original bytes are untouched.
 */
144 void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
151     dc = (3 * dc +  1) >> 1;
152     dc = (3 * dc + 16) >> 5;
155         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
156         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
157         "li         %[count],   0x02                                    \n\t"
        /* Load 4 destination rows (8 bytes each). */
160         MMI_LDC1(%[ftmp1], %[dest], 0x00)
161         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
162         MMI_LDC1(%[ftmp2], %[addr0], 0x00)
163         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
164         MMI_LDC1(%[ftmp3], %[addr0], 0x00)
165         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
166         MMI_LDC1(%[ftmp4], %[addr0], 0x00)
        /* Widen u8 pixels to s16 halves, add broadcast DC with saturation. */
168         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
169         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
170         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
171         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
172         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
173         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
174         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
175         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
177         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
178         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
179         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
180         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
181         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
182         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
183         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
184         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
        /* Pack back to unsigned bytes with saturation and store 4 rows. */
186         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
187         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
188         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
189         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
191         MMI_SDC1(%[ftmp1], %[dest], 0x00)
192         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
193         MMI_SDC1(%[ftmp2], %[addr0], 0x00)
194         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
195         MMI_SDC1(%[ftmp3], %[addr0], 0x00)
196         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
197         MMI_SDC1(%[ftmp4], %[addr0], 0x00)
199         "addiu      %[count],   %[count],       -0x01                   \n\t"
200         PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
201         "bnez       %[count],   1b                                      \n\t"
202         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
203           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
204           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
205           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
206           [ftmp8]"=&f"(ftmp[8]),
207           [addr0]"=&r"(addr[0]),
208           [count]"=&r"(count),              [dest]"+&r"(dest)
209         : [linesize]"r"((mips_reg)linesize),
215 #if _MIPS_SIM != _ABIO32
/*
 * Full 8x8 VC-1 inverse transform on the coefficient block, done in two
 * asm passes over `block` via `temp`:
 *   pass 1 (rows): loads odd rows (0x10/0x30/0x50/0x70) for STEP1 and
 *   even rows (0x00/0x40/0x20/0x60) for STEP2 with the 16/15/9/4 and
 *   12/16/6/ff_pw_4 constant sets, shifts right by 3 (PSRAH, shift count
 *   3 loaded into ftmp0), transposes 4x4 tiles and stores into temp;
 *   pass 2 (columns): same butterfly with ff_pw_64 rounding, adds the
 *   ff_pw_1 bias to the mirrored half, shifts right by 7, and stores the
 *   finished coefficients back.
 * NOTE(review): extraction artifacts -- stray leading numbers on every
 * line; the __asm__ volatile openers/closers, the "1:" loop labels, some
 * macro-argument tail lines (the trailing ff_pw_* argument of the STEP
 * macros) and the inter-pass glue are missing from this view.  Original
 * bytes are untouched.
 */
216 void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
218     DECLARE_ALIGNED(16, int16_t, temp[64]);
219     int16_t *src = block;
222     uint32_t count, tmp[1];
226         "li         %[tmp0],    0x03                                    \n\t"
227         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
228         "li         %[count],   0x02                                    \n\t"
        /* Row pass: odd rows first (STEP1), then even rows (STEP2). */
231         MMI_LDC1(%[ftmp5], %[src], 0x10)
232         MMI_LDC1(%[ftmp6], %[src], 0x30)
233         MMI_LDC1(%[ftmp7], %[src], 0x50)
234         MMI_LDC1(%[ftmp8], %[src], 0x70)
236         VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
237                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
238                                    %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
239                                    %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
242         MMI_LDC1(%[ftmp1], %[src], 0x00)
243         MMI_LDC1(%[ftmp2], %[src], 0x40)
244         MMI_LDC1(%[ftmp3], %[src], 0x20)
245         MMI_LDC1(%[ftmp4], %[src], 0x60)
247         VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
248                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
249                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
250                                    %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
254         PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
255                     %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])
        /* Transpose both 4x4 halves and store them into temp. */
257         TRANSPOSE_4H(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
258                      %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
259                      %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
261         MMI_SDC1(%[ftmp5], %[dst], 0x00)
262         MMI_SDC1(%[ftmp6], %[dst], 0x10)
263         MMI_SDC1(%[ftmp7], %[dst], 0x20)
264         MMI_SDC1(%[ftmp8], %[dst], 0x30)
266         TRANSPOSE_4H(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
267                      %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
268                      %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
270         MMI_SDC1(%[ftmp4], %[dst], 0x08)
271         MMI_SDC1(%[ftmp3], %[dst], 0x18)
272         MMI_SDC1(%[ftmp2], %[dst], 0x28)
273         MMI_SDC1(%[ftmp1], %[dst], 0x38)
275         "addiu      %[count],   %[count],       -0x01                   \n\t"
276         PTR_ADDIU  "%[src],     %[src],         0x08                    \n\t"
277         PTR_ADDIU  "%[dst],     %[dst],         0x40                    \n\t"
278         "bnez       %[count],   1b                                      \n\t"
279         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
280           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
281           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
282           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
283           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
284           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
285           [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
286           [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
289           [src]"+&r"(src),                  [dst]"+&r"(dst)
290         : [ff_pw_4]"f"(ff_pw_4),            [ff_pw_6]"f"(ff_pw_6),
291           [ff_pw_9]"f"(ff_pw_9),            [ff_pw_12]"f"(ff_pw_12),
292           [ff_pw_15]"f"(ff_pw_15),          [ff_pw_16]"f"(ff_pw_16)
        /* Column pass: shift count 7, ff_pw_1 bias on the mirrored half. */
301         "li         %[tmp0],    0x07                                    \n\t"
302         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
303         "li         %[count],   0x02                                    \n\t"
306         MMI_LDC1(%[ftmp5], %[src], 0x10)
307         MMI_LDC1(%[ftmp6], %[src], 0x30)
308         MMI_LDC1(%[ftmp7], %[src], 0x50)
309         MMI_LDC1(%[ftmp8], %[src], 0x70)
311         VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
312                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
313                                    %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
314                                    %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
317         MMI_LDC1(%[ftmp1], %[src], 0x00)
318         MMI_LDC1(%[ftmp2], %[src], 0x40)
319         MMI_LDC1(%[ftmp3], %[src], 0x20)
320         MMI_LDC1(%[ftmp4], %[src], 0x60)
322         VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
323                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
324                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
325                                    %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
328         "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
329         "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
330         "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
331         "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_1]              \n\t"
333         PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
334                     %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])
336         MMI_SDC1(%[ftmp5], %[dst], 0x00)
337         MMI_SDC1(%[ftmp6], %[dst], 0x10)
338         MMI_SDC1(%[ftmp7], %[dst], 0x20)
339         MMI_SDC1(%[ftmp8], %[dst], 0x30)
341         MMI_SDC1(%[ftmp4], %[dst], 0x40)
342         MMI_SDC1(%[ftmp3], %[dst], 0x50)
343         MMI_SDC1(%[ftmp2], %[dst], 0x60)
344         MMI_SDC1(%[ftmp1], %[dst], 0x70)
346         "addiu      %[count],   %[count],       -0x01                   \n\t"
347         PTR_ADDIU  "%[src],     %[src],         0x08                    \n\t"
348         PTR_ADDIU  "%[dst],     %[dst],         0x08                    \n\t"
349         "bnez       %[count],   1b                                      \n\t"
350         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
351           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
352           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
353           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
354           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
355           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
356           [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
357           [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
360           [src]"+&r"(src),                  [dst]"+&r"(dst)
361         : [ff_pw_1]"f"(ff_pw_1),            [ff_pw_4]"f"(ff_pw_4),
362           [ff_pw_6]"f"(ff_pw_6),            [ff_pw_9]"f"(ff_pw_9),
363           [ff_pw_12]"f"(ff_pw_12),          [ff_pw_15]"f"(ff_pw_15),
364           [ff_pw_16]"f"(ff_pw_16),          [ff_pw_64]"f"(ff_pw_64)
370 /* Do inverse transform on 8x4 part of block */
/*
 * DC-only inverse transform + add for an 8x4 block: scale the DC
 * coefficient (( 3*dc+1)>>1 then (17*dc+64)>>7), broadcast it with
 * pshufh, and add it with signed saturation to the 8x4 destination
 * pixels, which are addressed through four precomputed row pointers
 * dest0..dest3.
 * NOTE(review): extraction artifacts -- stray leading numbers on every
 * line; the local declarations, the read of block[0], the __asm__
 * volatile opener, and the function's closing lines are missing from
 * this view.  Original bytes are untouched.
 */
371 void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
376     dc = ( 3 * dc +  1) >> 1;
377     dc = (17 * dc + 64) >> 7;
380         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
381         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
383         MMI_LDC1(%[ftmp1], %[dest0], 0x00)
384         MMI_LDC1(%[ftmp2], %[dest1], 0x00)
385         MMI_LDC1(%[ftmp3], %[dest2], 0x00)
386         MMI_LDC1(%[ftmp4], %[dest3], 0x00)
        /* Widen, add DC with saturation, repack, store the 4 rows. */
388         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
389         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
390         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
391         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
392         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
393         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
394         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
395         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
397         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
398         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
399         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
400         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
401         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
402         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
403         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
404         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
406         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
407         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
408         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
409         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
411         MMI_SDC1(%[ftmp1], %[dest0], 0x00)
412         MMI_SDC1(%[ftmp2], %[dest1], 0x00)
413         MMI_SDC1(%[ftmp3], %[dest2], 0x00)
414         MMI_SDC1(%[ftmp4], %[dest3], 0x00)
415         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
416           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
417           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
418           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
419           [ftmp8]"=&f"(ftmp[8])
420         : [dest0]"r"(dest+0*linesize),      [dest1]"r"(dest+1*linesize),
421           [dest2]"r"(dest+2*linesize),      [dest3]"r"(dest+3*linesize),
427 #if _MIPS_SIM != _ABIO32
/*
 * Full 8x4 VC-1 inverse transform + add:
 *   pass 1 (rows, first asm): load all 8x4 coefficients, transpose the
 *   two 4x4 halves, run the 8-point STEP1/STEP2 butterflies, shift right
 *   by 3 and transpose/store back into `block` (src == dst == block);
 *   pass 2 (columns, second asm): 4-point transform on each half-width
 *   column group (ff_pw_17/10/22 constants, ff_pw_64 rounding, >>7), then
 *   add to the destination pixels via VC1_INV_TRANCS_4_STEP2_MMI --
 *   left half at byte offset 0x00, right half at 0x04.
 * NOTE(review): extraction artifacts -- stray leading numbers on every
 * line; the __asm__ volatile openers/closers, some trailing macro
 * arguments, the ftmp/addr declarations, and the glue between the two
 * asm statements are missing from this view.  Original bytes untouched.
 */
428 void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
430     int16_t *src = block;
431     int16_t *dst = block;
439         MMI_LDC1(%[ftmp1], %[src], 0x00)
440         MMI_LDC1(%[ftmp2], %[src], 0x08)
441         MMI_LDC1(%[ftmp3], %[src], 0x10)
442         MMI_LDC1(%[ftmp4], %[src], 0x18)
443         MMI_LDC1(%[ftmp5], %[src], 0x20)
444         MMI_LDC1(%[ftmp6], %[src], 0x28)
445         MMI_LDC1(%[ftmp7], %[src], 0x30)
446         MMI_LDC1(%[ftmp8], %[src], 0x38)
        /* Transpose left and right 4x4 halves before the row butterfly. */
449         TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp5], %[ftmp7],
450                      %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
451                      %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
454         TRANSPOSE_4H(%[ftmp2], %[ftmp4], %[ftmp6], %[ftmp8],
455                      %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
456                      %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
459         VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
460                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
461                                    %[ftmp0], %[ftmp13], %[ftmp14], %[ftmp15],
462                                    %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
465         VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp5], %[ftmp6],
466                                    %[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
467                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
468                                    %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
471         "li         %[tmp0],    0x03                                    \n\t"
472         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
474         PSRAH_8_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
475                     %[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1], %[ftmp0])
477         TRANSPOSE_4H(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
478                      %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
479                      %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
481         MMI_SDC1(%[ftmp3], %[dst], 0x00)
482         MMI_SDC1(%[ftmp7], %[dst], 0x10)
483         MMI_SDC1(%[ftmp4], %[dst], 0x20)
484         MMI_SDC1(%[ftmp8], %[dst], 0x30)
486         TRANSPOSE_4H(%[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1],
487                      %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
488                      %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])
490         MMI_SDC1(%[ftmp6], %[dst], 0x08)
491         MMI_SDC1(%[ftmp5], %[dst], 0x18)
492         MMI_SDC1(%[ftmp2], %[dst], 0x28)
493         MMI_SDC1(%[ftmp1], %[dst], 0x38)
494         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
495           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
496           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
497           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
498           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
499           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
500           [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
501           [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
503         : [src]"r"(src),                    [dst]"r"(dst),
504           [ff_pw_4]"f"(ff_pw_4),            [ff_pw_6]"f"(ff_pw_6),
505           [ff_pw_9]"f"(ff_pw_9),            [ff_pw_12]"f"(ff_pw_12),
506           [ff_pw_15]"f"(ff_pw_15),          [ff_pw_16]"f"(ff_pw_16)
        /* Column pass: 4-point transform + clamped add, left then right half. */
514         "li         %[tmp0],    0x07                                    \n\t"
515         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
516         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
519         MMI_LDC1(%[ftmp1], %[src], 0x00)
520         MMI_LDC1(%[ftmp2], %[src], 0x20)
521         MMI_LDC1(%[ftmp3], %[src], 0x30)
522         MMI_LDC1(%[ftmp4], %[src], 0x10)
524         VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
525                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
526                                    %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
529         PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])
531         MMI_LWC1(%[ftmp5], %[dest], 0x00)
532         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
533         MMI_LWC1(%[ftmp6], %[addr0], 0x00)
534         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
535         MMI_LWC1(%[ftmp7], %[addr0], 0x00)
536         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
537         MMI_LWC1(%[ftmp8], %[addr0], 0x00)
539         VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
540                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
543         MMI_SWC1(%[ftmp1], %[dest], 0x00)
544         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
545         MMI_SWC1(%[ftmp2], %[addr0], 0x00)
546         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
547         MMI_SWC1(%[ftmp3], %[addr0], 0x00)
548         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
549         MMI_SWC1(%[ftmp4], %[addr0], 0x00)
        /* Right half: same transform at column offset 4 / src offset 0x08. */
552         MMI_LDC1(%[ftmp1], %[src], 0x08)
553         MMI_LDC1(%[ftmp2], %[src], 0x28)
554         MMI_LDC1(%[ftmp3], %[src], 0x38)
555         MMI_LDC1(%[ftmp4], %[src], 0x18)
557         VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
558                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
559                                    %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
562         PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])
564         MMI_LWC1(%[ftmp5], %[dest], 0x04)
565         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
566         MMI_LWC1(%[ftmp6], %[addr0], 0x04)
567         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
568         MMI_LWC1(%[ftmp7], %[addr0], 0x04)
569         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
570         MMI_LWC1(%[ftmp8], %[addr0], 0x04)
572         VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
573                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
576         MMI_SWC1(%[ftmp1], %[dest], 0x04)
577         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
578         MMI_SWC1(%[ftmp2], %[addr0], 0x04)
579         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
580         MMI_SWC1(%[ftmp3], %[addr0], 0x04)
581         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
582         MMI_SWC1(%[ftmp4], %[addr0], 0x04)
584         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
585           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
586           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
587           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
588           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
591           [addr0]"=&r"(addr[0])
592         : [src]"r"(src),                    [dest]"r"(dest),
593           [linesize]"r"((mips_reg)linesize),
594           [ff_pw_17]"f"(ff_pw_17),          [ff_pw_22]"f"(ff_pw_22),
595           [ff_pw_10]"f"(ff_pw_10),          [ff_pw_64]"f"(ff_pw_64)
601 /* Do inverse transform on 4x8 parts of block */
/*
 * DC-only inverse transform + add for a 4x8 block: scale the DC
 * coefficient ((17*dc+4)>>3 then (12*dc+64)>>7), broadcast it, and add
 * it with signed saturation to 4 pixels in each of 8 rows, addressed via
 * the precomputed pointers dest0..dest7.  Only 32-bit loads/stores
 * (MMI_LWC1/MMI_SWC1) are used since each row is 4 bytes wide.
 * NOTE(review): extraction artifacts -- stray leading numbers; local
 * declarations, the read of block[0], the __asm__ opener and the closing
 * lines are missing from this view.  Original bytes untouched.
 */
602 void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
608     dc = (17 * dc +  4) >> 3;
609     dc = (12 * dc + 64) >> 7;
612         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
613         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
615         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
616         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
617         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
618         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
619         MMI_LWC1(%[ftmp5], %[dest4], 0x00)
620         MMI_LWC1(%[ftmp6], %[dest5], 0x00)
621         MMI_LWC1(%[ftmp7], %[dest6], 0x00)
622         MMI_LWC1(%[ftmp8], %[dest7], 0x00)
624         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
625         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
626         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
627         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
628         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
629         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
630         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
631         "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
633         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
634         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
635         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
636         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
637         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
638         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
639         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
640         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
642         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
643         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
644         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
645         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
646         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
647         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
648         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
649         "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
651         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
652         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
653         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
654         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
655         MMI_SWC1(%[ftmp5], %[dest4], 0x00)
656         MMI_SWC1(%[ftmp6], %[dest5], 0x00)
657         MMI_SWC1(%[ftmp7], %[dest6], 0x00)
658         MMI_SWC1(%[ftmp8], %[dest7], 0x00)
659         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
660           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
661           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
662           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
664           [ftmp8]"=&f"(ftmp[8])
665         : [dest0]"r"(dest+0*linesize),      [dest1]"r"(dest+1*linesize),
666           [dest2]"r"(dest+2*linesize),      [dest3]"r"(dest+3*linesize),
667           [dest4]"r"(dest+4*linesize),      [dest5]"r"(dest+5*linesize),
668           [dest6]"r"(dest+6*linesize),      [dest7]"r"(dest+7*linesize),
674 #if _MIPS_SIM != _ABIO32
/*
 * Full 4x8 VC-1 inverse transform + add:
 *   pass 1 (rows, first asm, looped twice over 4-row halves): 4-point
 *   transform per row -- transpose, VC1_INV_TRANCS_4_STEP1_MMI with the
 *   17/10/22 constants and ff_pw_4 rounding (argument tail truncated
 *   from this view), >>3, transpose back, store in place;
 *   pass 2 (columns, second asm): 8-point transform down the 8 rows with
 *   ff_pw_64 rounding and the ff_pw_1 bias, >>7, then clamped add to the
 *   destination pixels in two groups of four rows.
 * NOTE(review): extraction artifacts -- stray leading numbers on every
 * line; __asm__ openers/closers, loop labels, some declarations and
 * trailing macro arguments are missing from this view.  Original bytes
 * untouched.
 */
675 void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
677     int16_t *src = block;
678     int16_t *dst = block;
680     uint32_t count, tmp[1];
686         "li         %[count],   0x02                                    \n\t"
687         "li         %[tmp0],    0x03                                    \n\t"
688         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
691         MMI_LDC1(%[ftmp1], %[src], 0x00)
692         MMI_LDC1(%[ftmp2], %[src], 0x10)
693         MMI_LDC1(%[ftmp3], %[src], 0x20)
694         MMI_LDC1(%[ftmp4], %[src], 0x30)
696         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
697                      %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
698                      %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])
701         VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
702                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
703                                    %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
706         PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])
708         TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
709                      %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
710                      %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])
712         MMI_SDC1(%[ftmp1], %[dst], 0x00)
713         MMI_SDC1(%[ftmp3], %[dst], 0x10)
714         MMI_SDC1(%[ftmp4], %[dst], 0x20)
715         MMI_SDC1(%[ftmp2], %[dst], 0x30)
717         "addiu      %[count],   %[count],       -0x01                   \n\t"
718         PTR_ADDIU  "%[src],     %[src],         0x40                    \n\t"
719         PTR_ADDIU  "%[dst],     %[dst],         0x40                    \n\t"
720         "bnez       %[count],   1b                                      \n\t"
721         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
722           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
723           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
724           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
725           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
726           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
729           [src]"+&r"(src),                  [dst]"+&r"(dst)
730         : [ff_pw_17]"f"(ff_pw_17),          [ff_pw_10]"f"(ff_pw_10),
731           [ff_pw_22]"f"(ff_pw_22),          [ff_pw_4]"f"(ff_pw_4)
        /* Column pass: 8-point transform with shift count 7. */
739         "li         %[tmp0],    0x07                                    \n\t"
740         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
742         MMI_LDC1(%[ftmp5], %[src], 0x10)
743         MMI_LDC1(%[ftmp6], %[src], 0x30)
744         MMI_LDC1(%[ftmp7], %[src], 0x50)
745         MMI_LDC1(%[ftmp8], %[src], 0x70)
747         VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
748                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
749                                    %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
750                                    %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
753         MMI_LDC1(%[ftmp1], %[src], 0x00)
754         MMI_LDC1(%[ftmp2], %[src], 0x40)
755         MMI_LDC1(%[ftmp3], %[src], 0x20)
756         MMI_LDC1(%[ftmp4], %[src], 0x60)
758         VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
759                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
760                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
761                                    %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
764         "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
765         "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
766         "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
767         "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_1]              \n\t"
769         PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
770                     %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])
772         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        /* Clamped add to destination: first 4 rows, then remaining 4. */
775         MMI_LWC1(%[ftmp9], %[dest], 0x00)
776         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
777         MMI_LWC1(%[ftmp10], %[addr0], 0x00)
778         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
779         MMI_LWC1(%[ftmp11], %[addr0], 0x00)
780         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
781         MMI_LWC1(%[ftmp12], %[addr0], 0x00)
782         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
784         VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
785                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
789         MMI_LWC1(%[ftmp9], %[addr0], 0x00)
790         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
791         MMI_LWC1(%[ftmp10], %[addr0], 0x00)
792         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
793         MMI_LWC1(%[ftmp11], %[addr0], 0x00)
794         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
795         MMI_LWC1(%[ftmp12], %[addr0], 0x00)
797         VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
798                                    %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
802         MMI_SWC1(%[ftmp5], %[dest], 0x00)
803         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
804         MMI_SWC1(%[ftmp6], %[addr0], 0x00)
805         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
806         MMI_SWC1(%[ftmp7], %[addr0], 0x00)
807         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
808         MMI_SWC1(%[ftmp8], %[addr0], 0x00)
809         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
812         MMI_SWC1(%[ftmp4], %[addr0], 0x00)
813         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
814         MMI_SWC1(%[ftmp3], %[addr0], 0x00)
815         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
816         MMI_SWC1(%[ftmp2], %[addr0], 0x00)
817         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
818         MMI_SWC1(%[ftmp1], %[addr0], 0x00)
819         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
820           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
821           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
822           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
823           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
824           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
825           [ftmp12]"=&f"(ftmp[12]),
828           [addr0]"=&r"(addr[0]),
830         : [src]"r"(src),                    [linesize]"r"(linesize),
831           [ff_pw_1]"f"(ff_pw_1),            [ff_pw_4]"f"(ff_pw_4),
832           [ff_pw_6]"f"(ff_pw_6),            [ff_pw_9]"f"(ff_pw_9),
833           [ff_pw_12]"f"(ff_pw_12),          [ff_pw_15]"f"(ff_pw_15),
834           [ff_pw_16]"f"(ff_pw_16),          [ff_pw_64]"f"(ff_pw_64)
840 /* Do inverse transform on 4x4 part of block */
/*
 * DC-only inverse transform + add for a 4x4 block: scale the DC
 * coefficient ((17*dc+4)>>3 then (17*dc+64)>>7), broadcast it, add it
 * with signed saturation to 4 pixels in each of 4 rows (dest0..dest3),
 * and store back with unsigned-saturating pack.
 * NOTE(review): extraction artifacts -- stray leading numbers; local
 * declarations, the block[0] read, the __asm__ opener and the closing
 * lines are missing from this view.  Original bytes untouched.
 */
841 void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
847     dc = (17 * dc +  4) >> 3;
848     dc = (17 * dc + 64) >> 7;
851         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
852         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
854         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
855         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
856         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
857         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
859         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
860         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
861         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
862         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
864         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
865         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
866         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
867         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
869         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
870         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
871         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
872         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
874         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
875         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
876         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
877         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
878         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
879           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
881           [ftmp4]"=&f"(ftmp[4])
882         : [dest0]"r"(dest+0*linesize),      [dest1]"r"(dest+1*linesize),
883           [dest2]"r"(dest+2*linesize),      [dest3]"r"(dest+3*linesize),
/*
 * Full 4x4 VC-1 inverse transform + add:
 *   pass 1 (rows, first asm): transpose, 4-point transform with the
 *   17/10/22 constants (trailing rounding argument truncated from this
 *   view -- presumably ff_pw_4; TODO confirm upstream), >>3, transpose
 *   back and store in place (src == dst == block);
 *   pass 2 (columns, second asm): same 4-point transform down the
 *   columns with ff_pw_64 rounding, >>7, then clamped add to the four
 *   4-byte destination rows via VC1_INV_TRANCS_4_STEP2_MMI.
 * NOTE(review): extraction artifacts -- stray leading numbers on every
 * line; __asm__ openers/closers, some declarations and trailing macro
 * arguments are missing from this view.  Original bytes untouched.
 */
889 void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
891     int16_t *src = block;
892     int16_t *dst = block;
900         "li         %[tmp0],    0x03                                    \n\t"
901         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
903         MMI_LDC1(%[ftmp1], %[src], 0x00)
904         MMI_LDC1(%[ftmp2], %[src], 0x10)
905         MMI_LDC1(%[ftmp3], %[src], 0x20)
906         MMI_LDC1(%[ftmp4], %[src], 0x30)
908         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
909                      %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
910                      %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])
913         VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
914                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
915                                    %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
918         PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])
920         TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
921                      %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
922                      %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])
924         MMI_SDC1(%[ftmp1], %[dst], 0x00)
925         MMI_SDC1(%[ftmp3], %[dst], 0x10)
926         MMI_SDC1(%[ftmp4], %[dst], 0x20)
927         MMI_SDC1(%[ftmp2], %[dst], 0x30)
928         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
929           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
930           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
931           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
932           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
933           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
935           [src]"+&r"(src),                  [dst]"+&r"(dst)
936         : [ff_pw_17]"f"(ff_pw_17),          [ff_pw_10]"f"(ff_pw_10),
937           [ff_pw_22]"f"(ff_pw_22),          [ff_pw_4]"f"(ff_pw_4)
        /* Column pass: shift count 7, add result to destination pixels. */
945         "li         %[tmp0],    0x07                                    \n\t"
946         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
949         MMI_LDC1(%[ftmp1], %[src], 0x00)
950         MMI_LDC1(%[ftmp2], %[src], 0x20)
951         MMI_LDC1(%[ftmp3], %[src], 0x30)
952         MMI_LDC1(%[ftmp4], %[src], 0x10)
954         VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
955                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
956                                    %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
959         PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp0])
961         MMI_LWC1(%[ftmp5], %[dest], 0x00)
962         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
963         MMI_LWC1(%[ftmp6], %[addr0], 0x00)
964         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
965         MMI_LWC1(%[ftmp7], %[addr0], 0x00)
966         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
967         MMI_LWC1(%[ftmp8], %[addr0], 0x00)
969         "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t"
971         VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
972                                    %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
975         MMI_SWC1(%[ftmp1], %[dest], 0x00)
976         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
977         MMI_SWC1(%[ftmp2], %[addr0], 0x00)
978         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
979         MMI_SWC1(%[ftmp3], %[addr0], 0x00)
980         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
981         MMI_SWC1(%[ftmp4], %[addr0], 0x00)
982         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
983           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
984           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
985           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
986           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
989           [addr0]"=&r"(addr[0])
990         : [src]"r"(src),                    [dest]"r"(dest),
991           [linesize]"r"((mips_reg)linesize),
992           [ff_pw_17]"f"(ff_pw_17),          [ff_pw_22]"f"(ff_pw_22),
993           [ff_pw_10]"f"(ff_pw_10),          [ff_pw_64]"f"(ff_pw_64)
998 /* Apply overlap transform to horizontal edge */
/*
 * Horizontal-edge overlap smoothing filter for 8 rows around the vertical
 * block boundary at src[-2..1].  d1/d2 are the smoothing deltas; rnd is
 * presumably a 0/1 value toggled per row -- TODO confirm, the lines
 * reading a..d (src[-2]..src[1]), declaring rnd, writing src[-2]/src[1],
 * advancing src by stride and toggling rnd are missing from this view
 * (extraction artifact; every remaining line also carries a stray
 * leading number).  Original bytes untouched.
 */
999 void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
1005     for (i = 0; i < 8; i++) {
1010         d1 = (a - d + 3 + rnd) >> 3;
1011         d2 = (a - d + b - c + 4 - rnd) >> 3;
1014         src[-1] = av_clip_uint8(b - d2);
1015         src[0]  = av_clip_uint8(c + d2);
/*
 * Horizontal-edge overlap smoothing on 16-bit samples: filters the two
 * columns on each side of the boundary between `left` (columns 6,7) and
 * `right` (columns 0,1) for 8 rows, with alternating rounding rnd1/rnd2
 * chosen from bit 1 of `flags`.  The lines reading a..d and computing
 * d1/d2 are missing from this view (extraction artifact; remaining
 * lines carry stray leading numbers).  Original bytes untouched.
 */
1022 void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
1027     int rnd1 = flags & 2 ? 3 : 4;
1028     int rnd2 = 7 - rnd1;
1029     for (i = 0; i < 8; i++) {
1037         left[6]  = ((a << 3) - d1 + rnd1) >> 3;
1038         left[7]  = ((b << 3) - d2 + rnd2) >> 3;
1039         right[0] = ((c << 3) + d2 + rnd1) >> 3;
1040         right[1] = ((d << 3) + d1 + rnd2) >> 3;
1042         right += right_stride;
1043         left  += left_stride;
1051 /* Apply overlap transform to vertical edge */
/**
 * Overlap smoothing of the two rows on either side of a horizontal block
 * edge, one column at a time for 8 columns.  The inner pair (b, c) is
 * clipped to 0..255; the outer pair is written unclipped (a - d1, d + d1),
 * as the visible stores show.
 * NOTE(review): the b/c/d loads, declarations and per-column src advance
 * are elided from this listing.
 */
1052 void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
1058 for (i = 0; i < 8; i++) {
1059 a = src[-2 * stride];
1063 d1 = (a - d + 3 + rnd) >> 3;
1064 d2 = (a - d + b - c + 4 - rnd) >> 3;
1066 src[-2 * stride] = a - d1;
1067 src[-stride] = av_clip_uint8(b - d2);
1068 src[0] = av_clip_uint8(c + d2);
1069 src[stride] = d + d1;
/**
 * Coefficient-domain (16-bit) overlap smoothing across the horizontal edge
 * between an 8x8 `top` block and the `bottom` block below it.  top[48] and
 * top[56] index the last two rows of `top` (8 coefficients per row);
 * bottom[0]/bottom[8] the first two rows of `bottom`.  rnd1/rnd2 are the
 * fixed 4/3 complementary rounder pair (sum 7).
 * NOTE(review): the a..d loads, d1/d2 computation, pointer advance and any
 * per-column rounder swap are elided from this listing.
 */
1075 void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1080 int rnd1 = 4, rnd2 = 3;
1081 for (i = 0; i < 8; i++) {
1089 top[48] = ((a << 3) - d1 + rnd1) >> 3;
1090 top[56] = ((b << 3) - d2 + rnd2) >> 3;
1091 bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1092 bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1102 * VC-1 in-loop deblocking filter for one line
1103 * @param src pointer positioned on the edge: src[-stride] and src[0] straddle it
1104 * @param stride block stride
1105 * @param pq block quantizer
1106 * @return whether other 3 pairs should be filtered or not
/**
 * Filter one line segment across an edge.  All absolute values below use
 * the branchless (x ^ sign) - sign trick with sign = x >> 31.
 * The return value (assembled in elided code) tells the caller whether
 * the neighbouring lines of the 4-line group should also be filtered.
 */
1109 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
/* a0: 2,-5,5,-2 response across the edge itself (pixels -2..+1). */
1111 int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1112 5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1113 int a0_sign = a0 >> 31; /* Store sign */
1115 a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
/* a1/a2: the same response one step to either side of the edge. */
1117 int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1118 5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1119 int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1120 5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
/* Filter only if the edge response exceeds a neighbouring response. */
1121 if (a1 < a0 || a2 < a0) {
1122 int clip = src[-1 * stride] - src[0 * stride];
1123 int clip_sign = clip >> 31;
/* clip = |across-edge difference| / 2: cap on the correction. */
1125 clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1127 int a3 = FFMIN(a1, a2);
1128 int d = 5 * (a3 - a0);
1129 int d_sign = (d >> 31);
1131 d = ((d ^ d_sign) - d_sign) >> 3;
/* NOTE(review): the branch bodies (d = 0 vs. capping d by `clip`)
 * are elided in this listing. */
1134 if (d_sign ^ clip_sign)
1138 d = (d ^ d_sign) - d_sign; /* Restore sign */
1139 src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1140 src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1150 * VC-1 in-loop deblocking filter
1151 * @param src pointer to the first edge pixel pair to filter
1152 * @param step distance between horizontally adjacent elements
1153 * @param stride distance between vertically adjacent elements
1154 * @param len edge length to filter (4 or 8 pixels)
1155 * @param pq block quantizer
/**
 * Deblock `len` lines in groups of 4: the 3rd line of each group is
 * filtered first, and (in code elided from this listing) its result
 * gates whether the remaining three lines of the group are filtered.
 */
1158 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1164 for (i = 0; i < len; i += 4) {
1165 filt3 = vc1_filter_line(src + 2 * step, stride, pq);
/* The following three calls are presumably guarded by if (filt3) —
 * the guard line is elided here. */
1167 vc1_filter_line(src + 0 * step, stride, pq);
1168 vc1_filter_line(src + 1 * step, stride, pq);
1169 vc1_filter_line(src + 3 * step, stride, pq);
/*
 * Thin entry points.  'v' variants walk along the edge with step 1 and
 * filter vertically (cross-edge distance = stride); 'h' variants walk
 * with step = stride and filter horizontally (cross-edge distance = 1).
 * 4/8/16 is the edge length in pixels.  Braces are elided in this listing.
 */
1175 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1177 vc1_loop_filter(src, 1, stride, 4, pq);
1180 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1182 vc1_loop_filter(src, stride, 1, 4, pq);
1185 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1187 vc1_loop_filter(src, 1, stride, 8, pq);
1190 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1192 vc1_loop_filter(src, stride, 1, 8, pq);
1195 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1197 vc1_loop_filter(src, 1, stride, 16, pq);
1200 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1202 vc1_loop_filter(src, stride, 1, 16, pq);
/*
 * mc00 = zero fractional-pel shift: a plain 8x8 / 16x16 block copy (put)
 * or an average with the destination (avg).  `rnd` is accepted for
 * signature compatibility but unused by these forwarders.
 */
1205 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1206 ptrdiff_t stride, int rnd)
1208 ff_put_pixels8_8_mmi(dst, src, stride, 8);
1210 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1211 ptrdiff_t stride, int rnd)
1213 ff_put_pixels16_8_mmi(dst, src, stride, 16);
1215 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1216 ptrdiff_t stride, int rnd)
1218 ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1220 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1221 ptrdiff_t stride, int rnd)
1223 ff_avg_pixels16_8_mmi(dst, src, stride, 16);
/* OP_PUT emits nothing; OP_AVG loads 8 bytes from S into $f16 and
 * byte-averages them into D with pavgb (clobbers $f16). */
1226 #define OP_PUT(S, D)
1227 #define OP_AVG(S, D) \
1228 "ldc1 $f16, "#S" \n\t" \
1229 "pavgb "#D", "#D", $f16 \n\t"
1231 /** Add rounder from $f14 to $f6 and pack result at destination */
/* Adds the splatted rounder to both accumulators $f6/$f8, then
 * arithmetic-shifts each right by SHIFT. */
1232 #define NORMALIZE_MMI(SHIFT) \
1233 "paddh $f6, $f6, $f14 \n\t" /* +bias-r */ \
1234 "paddh $f8, $f8, $f14 \n\t" /* +bias-r */ \
1235 "psrah $f6, $f6, "SHIFT" \n\t" \
1236 "psrah $f8, $f8, "SHIFT" \n\t"
/* Packs $f6/$f8 to 8 unsigned bytes and stores them at dst.
 * NOTE(review): the line applying OP (orig. 1240) is elided here. */
1238 #define TRANSFER_DO_PACK(OP) \
1239 "packushb $f6, $f6, $f8 \n\t" \
1241 "sdc1 $f6, 0x00(%[dst]) \n\t"
/* Stores both 4x16-bit halves unpacked (16 bytes), applying OP first. */
1243 #define TRANSFER_DONT_PACK(OP) \
1244 OP(0(%[dst]), $f6) \
1245 OP(8(%[dst]), $f8) \
1246 "sdc1 $f6, 0x00(%[dst]) \n\t" \
1247 "sdc1 $f8, 0x08(%[dst]) \n\t"
1249 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
/* Zero-extends the low 4 bytes of reg to halfwords ($f0 must be zero). */
1250 #define DO_UNPACK(reg) \
1251 "punpcklbh "reg", "reg", $f0 \n\t"
1252 #define DONT_UNPACK(reg)
1254 /** Compute the rounder 32-r or 8-r and unpacks it to $f14 */
/* Loads the 32-bit rounder and splats its low halfword into all four
 * halfword lanes of $f14. */
1255 #define LOAD_ROUNDER_MMI(ROUND) \
1256 "lwc1 $f14, "ROUND" \n\t" \
1257 "punpcklhw $f14, $f14, $f14 \n\t" \
1258 "punpcklwd $f14, $f14, $f14 \n\t"
/*
 * One output line of the vertical shift2 (1/2-pel) filter:
 *   R1 = ((R1 + R2) * 9 - R0 - R3 + rounder($f14)) >> %[shift]
 * R0 and R3 are (re)loaded and zero-extended against $f0; the four
 * 16-bit results are stored at dst+OFF and src advances one line.
 * $9 is used as address scratch.
 *
 * Fix: the *9 factor lives in $f12 (the caller loads ff_pw_9 there —
 * see "Sacrificing $f12 ..." below); $f2/$f4/$f6/$f8 are the rotating
 * line-data registers, so multiplying by $f6 used pixel data instead of
 * the constant.  Changed $f6 -> $f12.
 */
1261 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
1262 "paddh "#R1", "#R1", "#R2" \n\t" \
1263 PTR_ADDU "$9, %[src], %[stride1] \n\t" \
1264 MMI_ULWC1(R0, $9, 0x00) \
1265 "pmullh "#R1", "#R1", $f12 \n\t" \
1266 "punpcklbh "#R0", "#R0", $f0 \n\t" \
1267 PTR_ADDU "$9, %[src], %[stride] \n\t" \
1268 MMI_ULWC1(R3, $9, 0x00) \
1269 "psubh "#R1", "#R1", "#R0" \n\t" \
1270 "punpcklbh "#R3", "#R3", $f0 \n\t" \
1271 "paddh "#R1", "#R1", $f14 \n\t" \
1272 "psubh "#R1", "#R1", "#R3" \n\t" \
1273 "psrah "#R1", "#R1", %[shift] \n\t" \
1274 MMI_SDC1(R1, %[dst], OFF) \
1275 PTR_ADDU "%[src], %[src], %[stride] \n\t"
1277 /** Sacrificing $f12 makes it possible to pipeline loads from src */
/**
 * Vertical 1/2-pel (shift2) filter producing the 16-bit intermediate:
 * primes $f4/$f6 with the first two source lines, then runs 8 unrolled
 * SHIFT2_LINE steps rotating through $f2/$f4/$f6/$f8, storing rows at
 * dst offsets 0..168 (24 bytes apart).  $f12 holds the *9 constant,
 * $f14 the splatted rounder.
 * NOTE(review): the loop setup (counter init in $8, the "1:" label and
 * the closing bnez) is elided from this listing.
 */
1278 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1279 const uint8_t *src, mips_reg stride,
1280 int rnd, int64_t shift)
1286 "xor $f0, $f0, $f0 \n\t"
1288 LOAD_ROUNDER_MMI("%[rnd]")
1289 "ldc1 $f12, %[ff_pw_9] \n\t"
1291 MMI_ULWC1($f4, %[src], 0x00)
1292 PTR_ADDU "%[src], %[src], %[stride] \n\t"
1293 MMI_ULWC1($f6, %[src], 0x00)
1294 "punpcklbh $f4, $f4, $f0 \n\t"
1295 "punpcklbh $f6, $f6, $f0 \n\t"
1296 SHIFT2_LINE( 0, $f2, $f4, $f6, $f8)
1297 SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1298 SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1299 SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1300 SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1301 SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1302 SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1303 SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1304 PTR_SUBU "%[src], %[src], %[stride2] \n\t"
1305 PTR_ADDIU "%[dst], %[dst], 0x08 \n\t"
1306 "addiu $8, $8, -0x01 \n\t"
1308 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1309 [src]"+r"(src), [dst]"+r"(dst)
1310 : [stride]"r"(stride), [stride1]"r"(-2*stride),
1311 [shift]"f"(shift), [rnd]"m"(rnd),
1312 [stride2]"r"(9*stride-4), [ff_pw_9]"m"(ff_pw_9)
1313 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1314 "$f14", "$f16", "memory"
1319 * Data is already unpacked, so some operations can directly be made from
/*
 * Horizontal 1/2-pel (shift2) filter over the 16-bit intermediate:
 *   out[i] = clip((9*(s[i+1]+s[i+2]) - (s[i]+s[i+3]) + bias) >> 7 + 128)
 * Note the register roles differ from elsewhere in this file: here $f12
 * holds ff_pw_128 and $f10 holds the *9 constant.
 *
 * Fix: the fourth tap of the high half must be read at byte offset 0x0c
 * (halfword-aligned, mirroring 0x04 in the low half: pairs 0x02+0x04 and
 * 0x0a+0x0c); offset 0x0b read misaligned bytes straddling two
 * coefficients.  Changed 0x0b -> 0x0c.
 */
1322 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
1323 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1324 const int16_t *src, int rnd) \
1327 DECLARE_VAR_ALL64; \
1328 DECLARE_VAR_ADDRT; \
1331 rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \
1334 LOAD_ROUNDER_MMI("%[rnd]") \
1335 "ldc1 $f12, %[ff_pw_128] \n\t" \
1336 "ldc1 $f10, %[ff_pw_9] \n\t" \
1338 MMI_ULDC1($f2, %[src], 0x00) \
1339 MMI_ULDC1($f4, %[src], 0x08) \
1340 MMI_ULDC1($f6, %[src], 0x02) \
1341 MMI_ULDC1($f8, %[src], 0x0a) \
1342 MMI_ULDC1($f0, %[src], 0x06) \
1343 "paddh $f2, $f2, $f0 \n\t" \
1344 MMI_ULDC1($f0, %[src], 0x0e) \
1345 "paddh $f4, $f4, $f0 \n\t" \
1346 MMI_ULDC1($f0, %[src], 0x04) \
1347 "paddh $f6, $f6, $f0 \n\t" \
1348 MMI_ULDC1($f0, %[src], 0x0c) \
1349 "paddh $f8, $f8, $f0 \n\t" \
1350 "pmullh $f6, $f6, $f10 \n\t" \
1351 "pmullh $f8, $f8, $f10 \n\t" \
1352 "psubh $f6, $f6, $f2 \n\t" \
1353 "psubh $f8, $f8, $f4 \n\t" \
1354 "li $8, 0x07 \n\t" \
1355 "mtc1 $8, $f16 \n\t" \
1356 NORMALIZE_MMI("$f16") \
1358 "paddh $f6, $f6, $f12 \n\t" \
1359 "paddh $f8, $f8, $f12 \n\t" \
1360 TRANSFER_DO_PACK(OP) \
1361 "addiu %[h], %[h], -0x01 \n\t" \
1362 PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1363 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1364 "bnez %[h], 1b \n\t" \
1365 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1367 [src]"+r"(src), [dst]"+r"(dst) \
1368 : [stride]"r"(stride), [rnd]"m"(rnd), \
1369 [ff_pw_9]"m"(ff_pw_9), [ff_pw_128]"m"(ff_pw_128) \
1370 : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \
1375 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1376 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1379 * Purely vertical or horizontal 1/2 shift interpolation.
1380 * Sacrifice $f12 for the *9 factor.
/*
 * 8-bit-in / 8-bit-out 1/2-pel interpolation along `offset` (either 1 for
 * horizontal or stride for vertical): per output row,
 *   out = clip((9*(p0 + p1) - (p-1 + p2) + rounder) >> 4)
 * where the taps sit at -offset, 0, +offset, +2*offset from the original
 * src.  $f12 holds the *9 constant, $10 the 8-row counter, $9 scratch.
 * NOTE(review): braces, local declarations and the "1:" loop label are
 * elided from this listing.
 */
1382 #define VC1_SHIFT2(OP, OPNAME)\
1383 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \
1384 mips_reg stride, int rnd, \
1387 DECLARE_VAR_LOW32; \
1388 DECLARE_VAR_ADDRT; \
1393 "xor $f0, $f0, $f0 \n\t" \
1394 "li $10, 0x08 \n\t" \
1395 LOAD_ROUNDER_MMI("%[rnd]") \
1396 "ldc1 $f12, %[ff_pw_9] \n\t" \
1398 MMI_ULWC1($f6, %[src], 0x00) \
1399 MMI_ULWC1($f8, %[src], 0x04) \
1400 PTR_ADDU "$9, %[src], %[offset] \n\t" \
1401 MMI_ULWC1($f2, $9, 0x00) \
1402 MMI_ULWC1($f4, $9, 0x04) \
1403 PTR_ADDU "%[src], %[src], %[offset] \n\t" \
1404 "punpcklbh $f6, $f6, $f0 \n\t" \
1405 "punpcklbh $f8, $f8, $f0 \n\t" \
1406 "punpcklbh $f2, $f2, $f0 \n\t" \
1407 "punpcklbh $f4, $f4, $f0 \n\t" \
1408 "paddh $f6, $f6, $f2 \n\t" \
1409 "paddh $f8, $f8, $f4 \n\t" \
1410 PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \
1411 MMI_ULWC1($f2, $9, 0x00) \
1412 MMI_ULWC1($f4, $9, 0x04) \
1413 "pmullh $f6, $f6, $f12 \n\t" /* 0,9,9,0*/ \
1414 "pmullh $f8, $f8, $f12 \n\t" /* 0,9,9,0*/ \
1415 "punpcklbh $f2, $f2, $f0 \n\t" \
1416 "punpcklbh $f4, $f4, $f0 \n\t" \
1417 "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,0*/ \
1418 "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,0*/ \
1419 PTR_ADDU "$9, %[src], %[offset] \n\t" \
1420 MMI_ULWC1($f2, $9, 0x00) \
1421 MMI_ULWC1($f4, $9, 0x04) \
1422 "punpcklbh $f2, $f2, $f0 \n\t" \
1423 "punpcklbh $f4, $f4, $f0 \n\t" \
1424 "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,-1*/ \
1425 "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,-1*/ \
1426 "li $8, 0x04 \n\t" \
1427 "mtc1 $8, $f16 \n\t" \
1428 NORMALIZE_MMI("$f16") \
1429 "packushb $f6, $f6, $f8 \n\t" \
1431 "sdc1 $f6, 0x00(%[dst]) \n\t" \
1432 "addiu $10, $10, -0x01 \n\t" \
1433 PTR_ADDU "%[src], %[src], %[stride1] \n\t" \
1434 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1435 "bnez $10, 1b \n\t" \
1436 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1437 [src]"+r"(src), [dst]"+r"(dst) \
1438 : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \
1439 [stride]"g"(stride), [rnd]"m"(rnd), \
1440 [stride1]"g"(stride-offset), \
1441 [ff_pw_9]"m"(ff_pw_9) \
1442 : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
1443 "$f12", "$f14", "$f16", "memory" \
1447 VC1_SHIFT2(OP_PUT, put_)
1448 VC1_SHIFT2(OP_AVG, avg_)
1451 * Core of the 1/4 and 3/4 shift bicubic interpolation.
1453 * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
1454 * @param LOAD "MMI_ULWC1" or "MMI_ULDC1", if data read is already unpacked.
1455 * @param M "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1456 * @param A1 Stride address of 1st tap (beware of unpacked/packed).
1457 * @param A2 Stride address of 2nd tap
1458 * @param A3 Stride address of 3rd tap
1459 * @param A4 Stride address of 4th tap
/*
 * Accumulates the 4-tap bicubic response into $f6 (low 4 lanes) and $f8
 * (high 4 lanes): 53*A3 + 18*A2 - 3*A1 - 4*A4, before normalization by
 * the caller.  $f10 must hold the 53 constant and $f12 the 18 constant;
 * %[ff_pw_3] supplies the 3.  $9 is address scratch; the x4 factor is
 * done with a left shift by 2 via $f16.
 * NOTE(review): the UNPACK($f2/$f4/$f6/$f8) invocation lines are elided
 * from this listing.
 */
1461 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
1462 PTR_ADDU "$9, %[src], "#A1" \n\t" \
1463 LOAD($f2, $9, M*0) \
1464 LOAD($f4, $9, M*4) \
1467 "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
1468 "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
1469 PTR_ADDU "$9, %[src], "#A2" \n\t" \
1470 LOAD($f6, $9, M*0) \
1471 LOAD($f8, $9, M*4) \
1474 "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1475 "pmullh $f8, $f8, $f12 \n\t" /* *18 */ \
1476 "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \
1477 "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \
1478 PTR_ADDU "$9, %[src], "#A4" \n\t" \
1479 LOAD($f2, $9, M*0) \
1480 LOAD($f4, $9, M*4) \
1483 "li $8, 0x02 \n\t" \
1484 "mtc1 $8, $f16 \n\t" \
1485 "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1486 "psllh $f4, $f4, $f16 \n\t" /* 4* */ \
1487 "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \
1488 "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \
1489 PTR_ADDU "$9, %[src], "#A3" \n\t" \
1490 LOAD($f2, $9, M*0) \
1491 LOAD($f4, $9, M*4) \
1494 "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1495 "pmullh $f4, $f4, $f10 \n\t" /* *53 */ \
1496 "paddh $f6, $f6, $f2 \n\t" /* 4,53,18,-3 */ \
1497 "paddh $f8, $f8, $f4 \n\t" /* 4,53,18,-3 */
1500 * Macro to build the vertical 16bits version of vc1_put_shift[13].
1501 * Here, offset=src_stride. Parameters passed A1 to A4 must use
1502 * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1504 * @param NAME Either 1 or 3
1505 * @see MSPEL_FILTER13_CORE for information on A1->A4
/*
 * Builds vc1_put_ver_16b_shift{1,3}_mmi: vertical 1/4 or 3/4 bicubic
 * filter from 8-bit src to the 16-bit intermediate, 11 output values per
 * row (8 via the core + 3-4 extra bytes handled inline below), rows
 * stored 0x18 bytes apart.  $f10/$f12 carry the 53/18 constants; the
 * extra-bytes path applies the same -4,53,18,-3 taps scalar-style with a
 * fixed >> 6 (shift 0x06).
 * NOTE(review): function braces, h/counter setup, the "1:" label and the
 * punpcklbh lines after the extra-byte loads are elided in this listing.
 */
1507 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
1509 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
1510 mips_reg src_stride, \
1511 int rnd, int64_t shift) \
1514 DECLARE_VAR_LOW32; \
1515 DECLARE_VAR_ADDRT; \
1517 src -= src_stride; \
1520 "xor $f0, $f0, $f0 \n\t" \
1521 LOAD_ROUNDER_MMI("%[rnd]") \
1522 "ldc1 $f10, %[ff_pw_53] \n\t" \
1523 "ldc1 $f12, %[ff_pw_18] \n\t" \
1526 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1527 NORMALIZE_MMI("%[shift]") \
1528 TRANSFER_DONT_PACK(OP_PUT) \
1529 /* Last 3 (in fact 4) bytes on the line */ \
1530 PTR_ADDU "$9, %[src], "#A1" \n\t" \
1531 MMI_ULWC1($f2, $9, 0x08) \
1533 "mov.d $f6, $f2 \n\t" \
1534 "paddh $f2, $f2, $f2 \n\t" \
1535 "paddh $f2, $f2, $f6 \n\t" /* 3* */ \
1536 PTR_ADDU "$9, %[src], "#A2" \n\t" \
1537 MMI_ULWC1($f6, $9, 0x08) \
1539 "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1540 "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \
1541 PTR_ADDU "$9, %[src], "#A3" \n\t" \
1542 MMI_ULWC1($f2, $9, 0x08) \
1544 "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1545 "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \
1546 PTR_ADDU "$9, %[src], "#A4" \n\t" \
1547 MMI_ULWC1($f2, $9, 0x08) \
1549 "li $8, 0x02 \n\t" \
1550 "mtc1 $8, $f16 \n\t" \
1551 "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1552 "psubh $f6, $f6, $f2 \n\t" \
1553 "paddh $f6, $f6, $f14 \n\t" \
1554 "li $8, 0x06 \n\t" \
1555 "mtc1 $8, $f16 \n\t" \
1556 "psrah $f6, $f6, $f16 \n\t" \
1557 "sdc1 $f6, 0x10(%[dst]) \n\t" \
1558 "addiu %[h], %[h], -0x01 \n\t" \
1559 PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
1560 PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
1561 "bnez %[h], 1b \n\t" \
1562 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1564 [src]"+r"(src), [dst]"+r"(dst) \
1565 : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
1566 [stride_x3]"r"(3*src_stride), \
1567 [rnd]"m"(rnd), [shift]"f"(shift), \
1568 [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1569 [ff_pw_3]"f"(ff_pw_3) \
1570 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1571 "$f14", "$f16", "memory" \
1576 * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1577 * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
1579 * @param NAME Either 1 or 3
1580 * @see MSPEL_FILTER13_CORE for information on A1->A4
/*
 * Builds {put,avg}_vc1_hor_16b_shift{1,3}_mmi: horizontal 1/4 or 3/4
 * bicubic filter on the 16-bit intermediate (A1..A4 are simple halfword
 * offsets), normalized with >> 7 and re-centered by +128 before packing.
 * NOTE(review): braces, h setup, the "1:" label and asm-open lines are
 * elided in this listing.
 */
1582 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1584 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
1585 const int16_t *src, int rnd) \
1588 DECLARE_VAR_ALL64; \
1589 DECLARE_VAR_ADDRT; \
1592 rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
1595 "xor $f0, $f0, $f0 \n\t" \
1596 LOAD_ROUNDER_MMI("%[rnd]") \
1597 "ldc1 $f10, %[ff_pw_53] \n\t" \
1598 "ldc1 $f12, %[ff_pw_18] \n\t" \
1601 MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
1602 "li $8, 0x07 \n\t" \
1603 "mtc1 $8, $f16 \n\t" \
1604 NORMALIZE_MMI("$f16") \
1606 "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1607 "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1608 TRANSFER_DO_PACK(OP) \
1609 "addiu %[h], %[h], -0x01 \n\t" \
1610 PTR_ADDU "%[src], %[src], 0x18 \n\t" \
1611 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1612 "bnez %[h], 1b \n\t" \
1613 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1615 [src]"+r"(src), [dst]"+r"(dst) \
1616 : [stride]"r"(stride), [rnd]"m"(rnd), \
1617 [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1618 [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128) \
1619 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1620 "$f14", "$f16", "memory" \
1625 * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
1626 * Here, offset=src_stride. Parameters passed A1 to A4 must use
1627 * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
1629 * @param NAME Either 1 or 3
1630 * @see MSPEL_FILTER13_CORE for information on A1->A4
/*
 * Builds {put,avg}_vc1_shift{1,3}_mmi: single-pass 8-bit 1/4 or 3/4
 * bicubic filter in either direction (A1..A4 scale with `offset`),
 * normalized with >> 6 and packed straight to dst.
 * NOTE(review): braces, h setup, src pre-adjustment by -offset and the
 * "1:" label are elided in this listing.
 */
1632 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1634 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
1635 mips_reg stride, int rnd, mips_reg offset) \
1638 DECLARE_VAR_LOW32; \
1639 DECLARE_VAR_ADDRT; \
1644 __asm__ volatile ( \
1645 "xor $f0, $f0, $f0 \n\t" \
1646 LOAD_ROUNDER_MMI("%[rnd]") \
1647 "ldc1 $f10, %[ff_pw_53] \n\t" \
1648 "ldc1 $f12, %[ff_pw_18] \n\t" \
1651 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1652 "li $8, 0x06 \n\t" \
1653 "mtc1 $8, $f16 \n\t" \
1654 NORMALIZE_MMI("$f16") \
1655 TRANSFER_DO_PACK(OP) \
1656 "addiu %[h], %[h], -0x01 \n\t" \
1657 PTR_ADDU "%[src], %[src], %[stride] \n\t" \
1658 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1659 "bnez %[h], 1b \n\t" \
1660 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1662 [src]"+r"(src), [dst]"+r"(dst) \
1663 : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
1664 [offset_x3]"r"(3*offset), [stride]"g"(stride), \
1666 [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1667 [ff_pw_3]"f"(ff_pw_3) \
1668 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1669 "$f14", "$f16", "memory" \
/*
 * Instantiations.  $0 is the MIPS hardware zero register, giving a 0 tap
 * offset; note the A1..A4 order is reversed between shift1 (1/4) and
 * shift3 (3/4), mirroring the asymmetric -4,53,18,-3 tap set.
 */
1674 /** 1/4 shift bicubic interpolation */
1675 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
1676 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
1677 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
1678 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
1679 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
1681 /** 3/4 shift bicubic interpolation */
1682 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
1683 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
1684 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
1685 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
1686 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
/* Filter-stage function pointer types used by the dispatch tables in
 * VC1_MSPEL_MC below. */
/* Vertical pass: 8-bit src -> 16-bit intermediate. */
1688 typedef void (*vc1_mspel_mc_filter_ver_16bits)
1689 (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
/* Horizontal pass: 16-bit intermediate -> 8-bit dst. */
1691 typedef void (*vc1_mspel_mc_filter_hor_16bits)
1692 (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
/* Single-direction pass: 8-bit src -> 8-bit dst. */
1693 typedef void (*vc1_mspel_mc_filter_8bits)
1694 (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
1698 * Interpolate fractional pel values by applying proper vertical then
1699 * horizontal filter.
1701 * @param dst Destination buffer for interpolated pels.
1702 * @param src Source buffer.
1703 * @param stride Stride for both src and dst buffers.
1704 * @param hmode Horizontal filter (expressed in quarter pixels shift).
1705 * @param vmode Vertical filter (expressed in quarter pixels shift).
1706 * @param rnd Rounding bias.
/*
 * Dispatcher: picks the vertical and/or horizontal filter by mode
 * (0 = none, 1 = 1/4, 2 = 1/2, 3 = 3/4).  Separable case runs the
 * vertical pass into a 12x8 16-bit scratch (src-1, rounder derived from
 * the combined shift), then the horizontal pass with 64-rnd.
 * Single-direction cases use the fused 8-bit filters.  The _16 variant
 * tiles four 8x8 calls into a 16x16 block.
 * NOTE(review): braces and a couple of statement lines are elided in
 * this listing.
 */
1708 #define VC1_MSPEL_MC(OP) \
1709 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
1710 int hmode, int vmode, int rnd) \
1712 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
1713 { NULL, vc1_put_ver_16b_shift1_mmi, \
1714 vc1_put_ver_16b_shift2_mmi, \
1715 vc1_put_ver_16b_shift3_mmi }; \
1716 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
1717 { NULL, OP ## vc1_hor_16b_shift1_mmi, \
1718 OP ## vc1_hor_16b_shift2_mmi, \
1719 OP ## vc1_hor_16b_shift3_mmi }; \
1720 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
1721 { NULL, OP ## vc1_shift1_mmi, \
1722 OP ## vc1_shift2_mmi, \
1723 OP ## vc1_shift3_mmi }; \
1725 if (vmode) { /* Vertical filter to apply */ \
1726 if (hmode) { /* Horizontal filter to apply, output to tmp */ \
1727 static const int shift_value[] = { 0, 5, 1, 5 }; \
1728 int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
1730 LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
1732 r = (1<<(shift-1)) + rnd-1; \
1733 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
1735 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
1738 else { /* No horizontal filter, output 8 lines to dst */ \
1739 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
1744 /* Horizontal mode with no vertical mode */ \
1745 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
1747 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
1748 int stride, int hmode, int vmode, int rnd)\
1750 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
1751 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
1752 dst += 8*stride; src += 8*stride; \
1753 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
1754 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
1760 /** Macro to ease bicubic filter interpolation functions declarations */
/*
 * Expands to the four public entry points ff_{put,avg}_vc1_mspel_mc<a><b>
 * [_16]_mmi, forwarding to the put_/avg_ dispatchers with hmode = a
 * (horizontal quarter-pel shift) and vmode = b (vertical quarter-pel
 * shift).  mc00 (a = b = 0) is defined separately above.
 */
1761 #define DECLARE_FUNCTION(a, b) \
1762 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
1763 const uint8_t *src, \
1767 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
1769 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
1770 const uint8_t *src, \
1774 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
1776 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
1777 const uint8_t *src, \
1781 put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
1783 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
1784 const uint8_t *src, \
1788 avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
1791 DECLARE_FUNCTION(0, 1)
1792 DECLARE_FUNCTION(0, 2)
1793 DECLARE_FUNCTION(0, 3)
1795 DECLARE_FUNCTION(1, 0)
1796 DECLARE_FUNCTION(1, 1)
1797 DECLARE_FUNCTION(1, 2)
1798 DECLARE_FUNCTION(1, 3)
1800 DECLARE_FUNCTION(2, 0)
1801 DECLARE_FUNCTION(2, 1)
1802 DECLARE_FUNCTION(2, 2)
1803 DECLARE_FUNCTION(2, 3)
1805 DECLARE_FUNCTION(3, 0)
1806 DECLARE_FUNCTION(3, 1)
1807 DECLARE_FUNCTION(3, 2)
1808 DECLARE_FUNCTION(3, 3)
/*
 * 8-pixel bilinear chroma kernel body: unpacks the four loaded rows
 * (ftmp1..ftmp4) to 16-bit low/high halves (ftmp5..ftmp8), forms
 * A*p00 + B*p01 + C*p10 + D*p11 + 28 per lane, logical-shifts right by
 * the count in ftmp9 (6, set by the caller) and repacks to 8 bytes in
 * ftmp1.  ftmp0 must be zero on entry.
 */
1810 #define CHROMA_MC_8_MMI \
1811 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
1812 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
1813 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
1814 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
1815 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
1816 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
1817 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
1818 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
1820 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
1821 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
1822 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
1823 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
1824 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
1825 "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
1826 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
1827 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
1829 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
1830 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
1831 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
1832 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
1834 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
1835 "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
1836 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
1837 "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
1839 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
1840 "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
1841 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
/*
 * 4-pixel variant of the bilinear chroma kernel: low halves only, shift
 * count comes from ftmp5 (6, set by the caller); result packed into the
 * low 4 bytes of ftmp1.  ftmp0 must be zero on entry.
 */
1844 #define CHROMA_MC_4_MMI \
1845 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
1846 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
1847 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
1848 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
1850 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
1851 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
1852 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
1853 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
1855 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
1856 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
1857 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
1858 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
1860 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
1861 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
/**
 * 8-wide bilinear chroma MC, "no_rnd" variant (rounder 28, shift 6).
 * The bilinear weights A..D (summing to 64) are splat across halfword
 * lanes with pshufh; each iteration loads the 2x9-tap window rows and
 * runs CHROMA_MC_8_MMI (its invocation line is elided in this listing).
 * NOTE(review): the "1:" loop label and a few operand-list lines are
 * also elided here.
 */
1864 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1865 uint8_t *src /* align 1 */,
1866 int stride, int h, int x, int y)
1868 const int A = (8 - x) * (8 - y);
1869 const int B = (x) * (8 - y);
1870 const int C = (8 - x) * (y);
1871 const int D = (x) * (y);
1877 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1880 "li %[tmp0], 0x06 \n\t"
1881 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1882 "mtc1 %[tmp0], %[ftmp9] \n\t"
1883 "pshufh %[A], %[A], %[ftmp0] \n\t"
1884 "pshufh %[B], %[B], %[ftmp0] \n\t"
1885 "pshufh %[C], %[C], %[ftmp0] \n\t"
1886 "pshufh %[D], %[D], %[ftmp0] \n\t"
1889 MMI_ULDC1(%[ftmp1], %[src], 0x00)
1890 MMI_ULDC1(%[ftmp2], %[src], 0x01)
1891 PTR_ADDU "%[src], %[src], %[stride] \n\t"
1892 MMI_ULDC1(%[ftmp3], %[src], 0x00)
1893 MMI_ULDC1(%[ftmp4], %[src], 0x01)
1897 MMI_SDC1(%[ftmp1], %[dst], 0x00)
1898 "addiu %[h], %[h], -0x01 \n\t"
1899 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
1900 "bnez %[h], 1b \n\t"
1901 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1902 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1903 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1904 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1905 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1908 [tmp0]"=&r"(tmp[0]),
1909 [src]"+&r"(src), [dst]"+&r"(dst),
1911 : [stride]"r"((mips_reg)stride),
1912 [A]"f"(A), [B]"f"(B),
1913 [C]"f"(C), [D]"f"(D),
1914 [ff_pw_28]"f"(ff_pw_28)
/**
 * 4-wide bilinear chroma MC, "no_rnd" variant (rounder 28, shift 6 held
 * in ftmp5).  Same structure as the mc8 version but with 4-byte loads
 * and CHROMA_MC_4_MMI (its invocation line is elided in this listing).
 * NOTE(review): the "1:" loop label and some operand-list lines are
 * also elided here.
 */
1919 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
1920 uint8_t *src /* align 1 */,
1921 int stride, int h, int x, int y)
1923 const int A = (8 - x) * (8 - y);
1924 const int B = (x) * (8 - y);
1925 const int C = (8 - x) * (y);
1926 const int D = (x) * (y);
1932 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1935 "li %[tmp0], 0x06 \n\t"
1936 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1937 "mtc1 %[tmp0], %[ftmp5] \n\t"
1938 "pshufh %[A], %[A], %[ftmp0] \n\t"
1939 "pshufh %[B], %[B], %[ftmp0] \n\t"
1940 "pshufh %[C], %[C], %[ftmp0] \n\t"
1941 "pshufh %[D], %[D], %[ftmp0] \n\t"
1944 MMI_ULWC1(%[ftmp1], %[src], 0x00)
1945 MMI_ULWC1(%[ftmp2], %[src], 0x01)
1946 PTR_ADDU "%[src], %[src], %[stride] \n\t"
1947 MMI_ULWC1(%[ftmp3], %[src], 0x00)
1948 MMI_ULWC1(%[ftmp4], %[src], 0x01)
1952 MMI_SWC1(%[ftmp1], %[dst], 0x00)
1953 "addiu %[h], %[h], -0x01 \n\t"
1954 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
1955 "bnez %[h], 1b \n\t"
1956 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1957 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1958 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1959 [tmp0]"=&r"(tmp[0]),
1962 [src]"+&r"(src), [dst]"+&r"(dst),
1964 : [stride]"r"((mips_reg)stride),
1965 [A]"f"(A), [B]"f"(B),
1966 [C]"f"(C), [D]"f"(D),
1967 [ff_pw_28]"f"(ff_pw_28)
/**
 * 8-wide bilinear chroma MC with averaging: identical to the put version
 * except the interpolated row is byte-averaged (pavgb, round-up) with the
 * existing destination before the store.
 * NOTE(review): the CHROMA_MC_8_MMI invocation, the "1:" label and a few
 * operand-list lines are elided in this listing.
 */
1972 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1973 uint8_t *src /* align 1 */,
1974 int stride, int h, int x, int y)
1976 const int A = (8 - x) * (8 - y);
1977 const int B = (x) * (8 - y);
1978 const int C = (8 - x) * (y);
1979 const int D = (x) * (y);
1985 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1988 "li %[tmp0], 0x06 \n\t"
1989 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1990 "mtc1 %[tmp0], %[ftmp9] \n\t"
1991 "pshufh %[A], %[A], %[ftmp0] \n\t"
1992 "pshufh %[B], %[B], %[ftmp0] \n\t"
1993 "pshufh %[C], %[C], %[ftmp0] \n\t"
1994 "pshufh %[D], %[D], %[ftmp0] \n\t"
1997 MMI_ULDC1(%[ftmp1], %[src], 0x00)
1998 MMI_ULDC1(%[ftmp2], %[src], 0x01)
1999 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2000 MMI_ULDC1(%[ftmp3], %[src], 0x00)
2001 MMI_ULDC1(%[ftmp4], %[src], 0x01)
2005 MMI_LDC1(%[ftmp2], %[dst], 0x00)
2006 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2008 MMI_SDC1(%[ftmp1], %[dst], 0x00)
2009 "addiu %[h], %[h], -0x01 \n\t"
2010 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2011 "bnez %[h], 1b \n\t"
2012 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2013 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2014 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2015 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2016 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2017 [tmp0]"=&r"(tmp[0]),
2020 [src]"+&r"(src), [dst]"+&r"(dst),
2022 : [stride]"r"((mips_reg)stride),
2023 [A]"f"(A), [B]"f"(B),
2024 [C]"f"(C), [D]"f"(D),
2025 [ff_pw_28]"f"(ff_pw_28)
2030 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2031 uint8_t *src /* align 1 */,
2032 int stride, int h, int x, int y)
2034 const int A = (8 - x) * (8 - y);
2035 const int B = ( x) * (8 - y);
2036 const int C = (8 - x) * ( y);
2037 const int D = ( x) * ( y);
2043 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2046 "li %[tmp0], 0x06 \n\t"
2047 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2048 "mtc1 %[tmp0], %[ftmp5] \n\t"
2049 "pshufh %[A], %[A], %[ftmp0] \n\t"
2050 "pshufh %[B], %[B], %[ftmp0] \n\t"
2051 "pshufh %[C], %[C], %[ftmp0] \n\t"
2052 "pshufh %[D], %[D], %[ftmp0] \n\t"
2055 MMI_ULWC1(%[ftmp1], %[src], 0x00)
2056 MMI_ULWC1(%[ftmp2], %[src], 0x01)
2057 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2058 MMI_ULWC1(%[ftmp3], %[src], 0x00)
2059 MMI_ULWC1(%[ftmp4], %[src], 0x01)
2063 MMI_LWC1(%[ftmp2], %[dst], 0x00)
2064 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2066 MMI_SWC1(%[ftmp1], %[dst], 0x00)
2067 "addiu %[h], %[h], -0x01 \n\t"
2068 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2069 "bnez %[h], 1b \n\t"
2070 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2071 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2072 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2073 [tmp0]"=&r"(tmp[0]),
2076 [src]"+&r"(src), [dst]"+&r"(dst),
2078 : [stride]"r"((mips_reg)stride),
2079 [A]"f"(A), [B]"f"(B),
2080 [C]"f"(C), [D]"f"(D),
2081 [ff_pw_28]"f"(ff_pw_28)