2 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
4 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "libavutil/avassert.h"
24 #include "libavcodec/vc1dsp.h"
25 #include "constants.h"
26 #include "vc1dsp_mips.h"
27 #include "hpeldsp_mips.h"
28 #include "libavutil/mips/mmiutils.h"
30 #define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \
31 "li %[tmp0], "#r1" \n\t" \
32 "mtc1 %[tmp0], %[ftmp13] \n\t" \
33 "pshufh %[ftmp13], %[ftmp13], %[ftmp23] \n\t" \
34 "li %[tmp0], "#r2" \n\t" \
35 "mtc1 %[tmp0], %[ftmp14] \n\t" \
36 "pshufh %[ftmp14], %[ftmp14], %[ftmp23] \n\t" \
37 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
38 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
39 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
40 "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
41 "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
42 "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
44 "li %[tmp0], "#r3" \n\t" \
45 "mtc1 %[tmp0], %[ftmp13] \n\t" \
46 "pshufh %[ftmp13], %[ftmp13], %[ftmp23] \n\t" \
47 "li %[tmp0], "#r4" \n\t" \
48 "mtc1 %[tmp0], %[ftmp14] \n\t" \
49 "pshufh %[ftmp14], %[ftmp14], %[ftmp23] \n\t" \
50 "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
51 "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
52 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
53 "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
54 "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
55 "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
57 "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
58 "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
59 "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
60 "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
61 "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \
62 "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \
63 "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
64 "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \
65 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
66 "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
67 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
68 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
69 "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
70 "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
71 "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
72 "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
73 "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
74 "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"
/*
 * VC1_INV_TRANCS_8_TYPE2 -- same butterfly step as TYPE1 but with an
 * extra bias c1 added only to the "difference" branch before rounding.
 * Used for the second ("column") pass, where c1 = ff_pw_1 implements
 * the +1 applied to the lower four output rows of the VC-1 inverse
 * transform.
 *
 * Same implicit register contract as VC1_INV_TRANCS_8_TYPE1:
 * ftmp0 = shift, ftmp23 = pshufh splat pattern 0x44, ftmp5..ftmp12 =
 * interleaved even/odd source rows; clobbers ftmp1..ftmp4, ftmp13,
 * ftmp14 and tmp0.
 *
 * o1, o2  destinations (sum / difference branches, repacked to 16 bit)
 * r1..r4  32-bit immediates packing two 16-bit coefficients each
 * c0      packed rounder added to all four terms before the shift
 * c1      packed bias added to the difference terms only
 */
#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)              \
    /* splat coefficient pairs r1/r2 and multiply-accumulate even rows */   \
    "li         %[tmp0],    "#r1"                                   \n\t"   \
    "mtc1       %[tmp0],    %[ftmp13]                               \n\t"   \
    "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                   \n\t"   \
    "li         %[tmp0],    "#r2"                                   \n\t"   \
    "mtc1       %[tmp0],    %[ftmp14]                               \n\t"   \
    "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                   \n\t"   \
    "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                   \n\t"   \
    "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                   \n\t"   \
    "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"   \
    "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                   \n\t"   \
    "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                   \n\t"   \
    "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                    \n\t"   \
    /* splat coefficient pairs r3/r4 and multiply-accumulate odd rows */    \
    "li         %[tmp0],    "#r3"                                   \n\t"   \
    "mtc1       %[tmp0],    %[ftmp13]                               \n\t"   \
    "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                   \n\t"   \
    "li         %[tmp0],    "#r4"                                   \n\t"   \
    "mtc1       %[tmp0],    %[ftmp14]                               \n\t"   \
    "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                   \n\t"   \
    "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                   \n\t"   \
    "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                   \n\t"   \
    "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                    \n\t"   \
    "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                   \n\t"   \
    "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                   \n\t"   \
    "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                   \n\t"   \
    /* butterfly; the difference terms additionally get the c1 bias */      \
    "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                    \n\t"   \
    "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                    \n\t"   \
    "paddw      %[ftmp14],  %[ftmp14],  "#c1"                       \n\t"   \
    "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                    \n\t"   \
    "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                    \n\t"   \
    "paddw      %[ftmp3],   %[ftmp3],   "#c1"                       \n\t"   \
    /* round with c0 and shift by ftmp0 */                                  \
    "paddw      %[ftmp13],  %[ftmp13],  "#c0"                       \n\t"   \
    "paddw      %[ftmp14],  %[ftmp14],  "#c0"                       \n\t"   \
    "paddw      %[ftmp1],   %[ftmp1],   "#c0"                       \n\t"   \
    "paddw      %[ftmp3],   %[ftmp3],   "#c0"                       \n\t"   \
    "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                    \n\t"   \
    "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"   \
    "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                    \n\t"   \
    "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"   \
    /* repack the four 32-bit results of each branch into 4x16 bit */       \
    "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                    \n\t"   \
    "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                    \n\t"   \
    "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                    \n\t"   \
    "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                    \n\t"   \
    "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                    \n\t"   \
    "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                    \n\t"
/* Do inverse transform on 8x8 block */
/*
 * DC-only inverse transform for an 8x8 block: rescale the single DC
 * coefficient and add it to all 64 destination pixels with
 * unsigned-byte saturation.
 *
 * dest     destination pixel block (8 bytes per row)
 * linesize destination stride in bytes
 * block    coefficient block; only the DC term is used
 *
 * NOTE(review): this chunk does not show the whole function -- the
 * opening brace, the local declarations (dc, ftmp[], addr[], count)
 * and the asm-statement open/close with its clobber list fall in gaps
 * of this view. Comments below describe only the visible code.
 */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
    /* VC-1 DC rescale for the 8x8 case: two-stage rounding divide */
    dc = (3 * dc + 1) >> 1;
    dc = (3 * dc + 16) >> 5;
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                \n\t"
        /* replicate dc into all four 16-bit lanes */
        "pshufh     %[dc],      %[dc],      %[ftmp0]                \n\t"
        /* two loop iterations x 4 rows = all 8 rows */
        "li         %[count],   0x02                                \n\t"

        /* load 4 rows of 8 pixels (loop label "1:" is in a gap above) */
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]             \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]             \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]             \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        /* widen each row from 8x8-bit to 8x16-bit (zero-extend) */
        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"

        /* add the splatted dc with signed 16-bit saturation */
        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],   %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],   %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],   %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],   %[dc]                   \n\t"

        /* narrow back to bytes with unsigned saturation (clamp 0..255) */
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp8]                \n\t"

        /* store the 4 processed rows back */
        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]             \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]             \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]             \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        /* advance dest past the 4 rows just written and loop */
        "addiu      %[count],   %[count],   -0x01                   \n\t"
        PTR_ADDU   "%[dest],    %[addr0],   %[linesize]             \n\t"
        "bnez       %[count],   1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count),              [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
196 #if _MIPS_SIM != _ABIO32
/*
 * Full 8x8 VC-1 inverse transform, in place on the coefficient block.
 * Two passes, each handling the block as two 4-column halves:
 *   pass 1 (rows):    TYPE1 butterflies, rounder ff_pw_4, shift 3,
 *                     results transposed and written to the temp buffer;
 *   pass 2 (columns): TYPE2 butterflies, rounder ff_pw_64 + ff_pw_1
 *                     bias on the difference branch, shift 7.
 *
 * NOTE(review): parts of this function fall in gaps of this view --
 * notably the declaration of dst (presumably int16_t *dst = temp;
 * confirm against the full file), the ftmp/tmp locals, the asm
 * open/close boilerplate and the clobber lists, and any src/dst swap
 * between the two passes. Comments describe only visible code.
 */
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    /* packed 2x32-bit rounders, passed to the asm as "f" operands */
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    int16_t *src = block;

        /* pass 1 setup: shift = 3, pshufh splat pattern 0x44 in ftmp23 */
        "li         %[tmp0],    0x03                                \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                            \n\t"
        "li         %[tmp0],    0x44                                \n\t"
        "mtc1       %[tmp0],    %[ftmp23]                           \n\t"

        /* left half: even rows 0,2,4,6 -> ftmp5..ftmp8 (interleaved) */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                \n\t"

        /* left half: odd rows 1,3,5,7 -> ftmp9..ftmp12 (interleaved) */
        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        /* transpose the 4x4 result tiles before storing to temp */
        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        MMI_SDC1(%[ftmp15], %[dst], 0x00)
        MMI_SDC1(%[ftmp16], %[dst], 0x10)
        MMI_SDC1(%[ftmp17], %[dst], 0x20)
        MMI_SDC1(%[ftmp18], %[dst], 0x30)

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        MMI_SDC1(%[ftmp19], %[dst], 0x08)
        MMI_SDC1(%[ftmp20], %[dst], 0x18)
        MMI_SDC1(%[ftmp21], %[dst], 0x28)
        MMI_SDC1(%[ftmp22], %[dst], 0x38)

        /* right half: even rows, columns 4..7 (offset +0x08) */
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x28)
        MMI_LDC1(%[ftmp3], %[src], 0x48)
        MMI_LDC1(%[ftmp4], %[src], 0x68)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                \n\t"

        /* right half: odd rows */
        MMI_LDC1(%[ftmp1], %[src], 0x18)
        MMI_LDC1(%[ftmp2], %[src], 0x38)
        MMI_LDC1(%[ftmp3], %[src], 0x58)
        MMI_LDC1(%[ftmp4], %[src], 0x78)
        "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        MMI_SDC1(%[ftmp15], %[dst], 0x40)
        MMI_SDC1(%[ftmp16], %[dst], 0x50)
        MMI_SDC1(%[ftmp17], %[dst], 0x60)
        MMI_SDC1(%[ftmp18], %[dst], 0x70)

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        MMI_SDC1(%[ftmp19], %[dst], 0x48)
        MMI_SDC1(%[ftmp20], %[dst], 0x58)
        MMI_SDC1(%[ftmp21], %[dst], 0x68)
        MMI_SDC1(%[ftmp22], %[dst], 0x78)

        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]),          [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]),          [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]),          [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),          [ftmp23]"=&f"(ftmp[23]),
        : [ff_pw_4]"f"(ff_pw_4_local),      [src]"r"(src), [dst]"r"(dst)

        /* pass 2 setup: shift = 7, same pshufh splat pattern 0x44 */
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                            \n\t"
        "li         %[tmp0],    0x44                                \n\t"
        "mtc1       %[tmp0],    %[ftmp23]                           \n\t"

        /* left half (columns now in rows after the pass-1 transpose) */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        /* no transpose this time: store final rows directly */
        MMI_SDC1(%[ftmp15], %[dst], 0x00)
        MMI_SDC1(%[ftmp16], %[dst], 0x10)
        MMI_SDC1(%[ftmp17], %[dst], 0x20)
        MMI_SDC1(%[ftmp18], %[dst], 0x30)
        MMI_SDC1(%[ftmp19], %[dst], 0x40)
        MMI_SDC1(%[ftmp20], %[dst], 0x50)
        MMI_SDC1(%[ftmp21], %[dst], 0x60)
        MMI_SDC1(%[ftmp22], %[dst], 0x70)

        /* right half (offset +0x08) */
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x28)
        MMI_LDC1(%[ftmp3], %[src], 0x48)
        MMI_LDC1(%[ftmp4], %[src], 0x68)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x18)
        MMI_LDC1(%[ftmp2], %[src], 0x38)
        MMI_LDC1(%[ftmp3], %[src], 0x58)
        MMI_LDC1(%[ftmp4], %[src], 0x78)
        "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[dst], 0x08)
        MMI_SDC1(%[ftmp16], %[dst], 0x18)
        MMI_SDC1(%[ftmp17], %[dst], 0x28)
        MMI_SDC1(%[ftmp18], %[dst], 0x38)
        MMI_SDC1(%[ftmp19], %[dst], 0x48)
        MMI_SDC1(%[ftmp20], %[dst], 0x58)
        MMI_SDC1(%[ftmp21], %[dst], 0x68)
        MMI_SDC1(%[ftmp22], %[dst], 0x78)

        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]),          [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]),          [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]),          [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),          [ftmp23]"=&f"(ftmp[23]),
        : [ff_pw_1]"f"(ff_pw_1_local),      [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src),                    [dst]"r"(dst)
/* Do inverse transform on 8x4 part of block */
/*
 * DC-only inverse transform for an 8x4 block: rescale the DC
 * coefficient and add it to 4 rows of 8 destination pixels with
 * unsigned-byte saturation.
 *
 * NOTE(review): the opening brace, local declarations (dc, ftmp[]) and
 * the asm open/close with its clobber list are in gaps of this view.
 */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
    /* VC-1 DC rescale for the 8x4 case */
    dc = ( 3 * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                \n\t"
        /* replicate dc into all four 16-bit lanes */
        "pshufh     %[dc],      %[dc],      %[ftmp0]                \n\t"

        /* load the 4 rows; dest0..dest3 are precomputed row pointers */
        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        /* widen each row from 8x8-bit to 8x16-bit (zero-extend) */
        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"

        /* add the splatted dc with signed 16-bit saturation */
        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],   %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],   %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],   %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],   %[dc]                   \n\t"

        /* narrow back to bytes with unsigned saturation (clamp 0..255) */
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),      [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),      [dest3]"r"(dest+3*linesize),
509 #if _MIPS_SIM != _ABIO32
510 void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
512 int16_t *src = block;
513 int16_t *dst = block;
517 DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
518 DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
519 int16_t coeff[64] = {12, 16, 16, 15, 12, 9, 6, 4,
520 12, 15, 6, -4, -12, -16, -16, -9,
521 12, 9, -6, -16, -12, 4, 16, 15,
522 12, 4, -16, -9, 12, 15, -6, -16,
523 12, -4, -16, 9, 12, -15, -6, 16,
524 12, -9, -6, 16, -12, -4, 16, -15,
525 12, -15, 6, 4, -12, 16, -16, 9,
526 12, -16, 16, -15, 12, -9, 6, -4};
530 "li %[tmp0], 0x03 \n\t"
531 "mtc1 %[tmp0], %[ftmp0] \n\t"
534 MMI_LDC1(%[ftmp1], %[src], 0x00)
535 MMI_LDC1(%[ftmp2], %[src], 0x08)
537 /* ftmp11: dst1,dst0 */
538 MMI_LDC1(%[ftmp3], %[coeff], 0x00)
539 MMI_LDC1(%[ftmp4], %[coeff], 0x08)
540 MMI_LDC1(%[ftmp5], %[coeff], 0x10)
541 MMI_LDC1(%[ftmp6], %[coeff], 0x18)
542 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
543 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
544 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
545 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
546 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
547 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
548 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
549 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
550 "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t"
551 "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t"
553 /* ftmp12: dst3,dst2 */
554 MMI_LDC1(%[ftmp3], %[coeff], 0x20)
555 MMI_LDC1(%[ftmp4], %[coeff], 0x28)
556 MMI_LDC1(%[ftmp5], %[coeff], 0x30)
557 MMI_LDC1(%[ftmp6], %[coeff], 0x38)
558 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
559 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
560 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
561 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
562 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
563 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
564 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
565 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
566 "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t"
567 "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t"
569 /* ftmp13: dst5,dst4 */
570 MMI_LDC1(%[ftmp3], %[coeff], 0x40)
571 MMI_LDC1(%[ftmp4], %[coeff], 0x48)
572 MMI_LDC1(%[ftmp5], %[coeff], 0x50)
573 MMI_LDC1(%[ftmp6], %[coeff], 0x58)
574 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
575 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
576 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
577 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
578 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
579 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
580 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
581 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
582 "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t"
583 "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t"
585 /* ftmp14: dst7,dst6 */
586 MMI_LDC1(%[ftmp3], %[coeff], 0x60)
587 MMI_LDC1(%[ftmp4], %[coeff], 0x68)
588 MMI_LDC1(%[ftmp5], %[coeff], 0x70)
589 MMI_LDC1(%[ftmp6], %[coeff], 0x78)
590 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
591 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
592 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
593 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
594 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
595 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
596 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
597 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
598 "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t"
599 "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t"
601 /* ftmp9: dst3,dst2,dst1,dst0 ftmp10: dst7,dst6,dst5,dst4 */
602 "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
603 "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
604 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t"
605 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
606 "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t"
607 "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t"
608 "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
609 "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t"
610 "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t"
611 "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
612 MMI_SDC1(%[ftmp9], %[dst], 0x00)
613 MMI_SDC1(%[ftmp10], %[dst], 0x08)
615 PTR_ADDIU "%[src], %[src], 0x10 \n\t"
616 PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
617 "addiu %[count], %[count], -0x01 \n\t"
618 "bnez %[count], 1b \n\t"
619 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
620 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
621 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
622 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
623 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
624 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
625 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
626 [ftmp14]"=&f"(ftmp[14]), [tmp0]"=&r"(tmp[0]),
627 [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
628 : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
636 "li %[tmp0], 0x44 \n\t"
637 "mtc1 %[tmp0], %[ftmp15] \n\t"
640 "li %[tmp0], 0x07 \n\t"
641 "mtc1 %[tmp0], %[ftmp0] \n\t"
642 MMI_LDC1(%[ftmp1], %[src], 0x00)
643 MMI_LDC1(%[ftmp2], %[src], 0x10)
644 MMI_LDC1(%[ftmp3], %[src], 0x20)
645 MMI_LDC1(%[ftmp4], %[src], 0x30)
646 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
647 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
648 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
649 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
651 /* ftmp11: dst03,dst02,dst01,dst00 */
652 "li %[tmp0], 0x00160011 \n\t"
653 "mtc1 %[tmp0], %[ftmp3] \n\t"
654 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
655 "li %[tmp0], 0x000a0011 \n\t"
656 "mtc1 %[tmp0], %[ftmp4] \n\t"
657 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
658 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
659 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
660 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
661 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
662 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
663 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
664 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
665 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
666 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
667 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
668 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
669 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
670 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"
672 /* ftmp12: dst13,dst12,dst11,dst10 */
673 "li %[tmp0], 0x000a0011 \n\t"
674 "mtc1 %[tmp0], %[ftmp3] \n\t"
675 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
676 "li %[tmp0], 0xffeaffef \n\t"
677 "mtc1 %[tmp0], %[ftmp4] \n\t"
678 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
679 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
680 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
681 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
682 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
683 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
684 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
685 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
686 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
687 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
688 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
689 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
690 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
691 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
693 /* ftmp13: dst23,dst22,dst21,dst20 */
694 "li %[tmp0], 0xfff60011 \n\t"
695 "mtc1 %[tmp0], %[ftmp3] \n\t"
696 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
697 "li %[tmp0], 0x0016ffef \n\t"
698 "mtc1 %[tmp0], %[ftmp4] \n\t"
699 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
700 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
701 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
702 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
703 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
704 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
705 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
706 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
707 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
708 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
709 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
710 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
711 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
712 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"
714 /* ftmp14: dst33,dst32,dst31,dst30 */
715 "li %[tmp0], 0xffea0011 \n\t"
716 "mtc1 %[tmp0], %[ftmp3] \n\t"
717 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
718 "li %[tmp0], 0xfff60011 \n\t"
719 "mtc1 %[tmp0], %[ftmp4] \n\t"
720 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
721 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
722 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
723 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
724 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
725 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
726 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
727 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
728 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
729 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
730 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
731 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
732 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
733 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"
735 MMI_LWC1(%[ftmp1], %[dest], 0x00)
736 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
737 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
738 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
739 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
740 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
741 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
742 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
743 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
744 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
745 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
746 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
747 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
748 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
749 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
750 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
751 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
752 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
753 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
754 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
755 MMI_SWC1(%[ftmp1], %[dest], 0x00)
756 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
757 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
758 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
759 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
760 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
761 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
764 "li %[tmp0], 0x07 \n\t"
765 "mtc1 %[tmp0], %[ftmp0] \n\t"
766 MMI_LDC1(%[ftmp1], %[src], 0x08)
767 MMI_LDC1(%[ftmp2], %[src], 0x18)
768 MMI_LDC1(%[ftmp3], %[src], 0x28)
769 MMI_LDC1(%[ftmp4], %[src], 0x38)
770 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
771 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
772 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
773 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
775 /* ftmp11: dst03,dst02,dst01,dst00 */
776 "li %[tmp0], 0x00160011 \n\t"
777 "mtc1 %[tmp0], %[ftmp3] \n\t"
778 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
779 "li %[tmp0], 0x000a0011 \n\t"
780 "mtc1 %[tmp0], %[ftmp4] \n\t"
781 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
782 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
783 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
784 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
785 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
786 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
787 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
788 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
789 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
790 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
791 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
792 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
793 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
794 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"
796 /* ftmp12: dst13,dst12,dst11,dst10 */
797 "li %[tmp0], 0x000a0011 \n\t"
798 "mtc1 %[tmp0], %[ftmp3] \n\t"
799 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
800 "li %[tmp0], 0xffeaffef \n\t"
801 "mtc1 %[tmp0], %[ftmp4] \n\t"
802 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
803 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
804 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
805 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
806 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
807 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
808 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
809 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
810 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
811 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
812 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
813 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
814 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
815 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
817 /* ftmp13: dst23,dst22,dst21,dst20 */
818 "li %[tmp0], 0xfff60011 \n\t"
819 "mtc1 %[tmp0], %[ftmp3] \n\t"
820 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
821 "li %[tmp0], 0x0016ffef \n\t"
822 "mtc1 %[tmp0], %[ftmp4] \n\t"
823 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
824 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
825 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
826 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
827 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
828 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
829 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
830 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
831 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
832 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
833 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
834 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
835 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
836 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"
838 /* ftmp14: dst33,dst32,dst31,dst30 */
839 "li %[tmp0], 0xffea0011 \n\t"
840 "mtc1 %[tmp0], %[ftmp3] \n\t"
841 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
842 "li %[tmp0], 0xfff60011 \n\t"
843 "mtc1 %[tmp0], %[ftmp4] \n\t"
844 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
845 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
846 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
847 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
848 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
849 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
850 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
851 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
852 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
853 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
854 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
855 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
856 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
857 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"
859 MMI_LWC1(%[ftmp1], %[dest], 0x04)
860 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
861 MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
862 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
863 MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
864 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
865 MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
866 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
867 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
868 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
869 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
870 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
871 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
872 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
873 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
874 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
875 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
876 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
877 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
878 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
879 MMI_SWC1(%[ftmp1], %[dest], 0x04)
880 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
881 MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
882 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
883 MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
884 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
885 MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
887 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
888 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
889 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
890 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
891 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
892 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
893 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
894 [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
896 : [ff_pw_64]"f"(ff_pw_64_local),
897 [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
903 /* Do inverse transform on 4x8 parts of block */
/* Add the DC-only inverse transform of a 4x8 block to dest.
 * Only the DC coefficient contributes: the 4-pt row transform scales it
 * by 17 (>>3) and the 8-pt column transform by 12 (>>7), done in scalar
 * code before the SIMD add/clip below. */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
    dc = (17 * dc + 4) >> 3;
    dc = (12 * dc + 64) >> 7;
        /* replicate dc into all four halfwords of %[dc] */
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]        \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]        \n\t"

        /* load one 4-pixel row from each of the 8 destination lines */
        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        /* widen the pixels from u8 to s16 */
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]        \n\t"

        /* add the broadcast dc (saturating 16-bit add) */
        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]           \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]           \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]           \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]           \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],   %[dc]           \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],   %[dc]           \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],   %[dc]           \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],   %[dc]           \n\t"

        /* clip back to u8 */
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"
        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]        \n\t"
        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]        \n\t"
        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]        \n\t"
        "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]        \n\t"

        /* store the 8 rows back */
        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),      [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),      [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize),      [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize),      [dest7]"r"(dest+7*linesize),
976 #if _MIPS_SIM != _ABIO32
/* Full inverse transform of a 4x8 block (4 columns, 8 rows), result added
 * to dest.  Pass 1 does the horizontal 4-pt transform in-place on the 8
 * rows; pass 2 does the vertical 8-pt transform and adds to the picture. */
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
    int16_t *src = block;
    int16_t *dst = block;
    uint32_t count = 8, tmp[1];
    /* rows of the 4-pt transform matrix (17,22,17,10 and permutations) */
    int16_t coeff[16] = {17, 22, 17, 10,
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};

        /* --- pass 1: 4-pt row transform, rounding (x + 4) >> 3 --- */
        "li         %[tmp0],    0x03                        \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                    \n\t"

        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]        \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]        \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]        \n\t"
        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]        \n\t"
        /* horizontal add of the pmaddhw partial sums */
        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]        \n\t"
        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]        \n\t"
        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]        \n\t"
        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]        \n\t"
        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]       \n\t"
        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]        \n\t"
        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]      \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]      \n\t"
        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]        \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]        \n\t"
        /* repack four 32-bit results into four 16-bit dst values */
        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]        \n\t"
        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]        \n\t"
        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]        \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU  "%[src],     %[src],     0x10            \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x10            \n\t"
        "addiu      %[count],   %[count],   -0x01           \n\t"
        "bnez       %[count],   1b                          \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),              [count]"+&r"(count),
          [src]"+&r"(src),                  [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local),      [coeff]"r"(coeff)

        /* --- pass 2: 8-pt column transform, rounding (x + 64) >> 7 --- */
        "li         %[tmp0],    0x07                        \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                    \n\t"
        "li         %[tmp0],    0x44                        \n\t"
        "mtc1       %[tmp0],    %[ftmp23]                   \n\t"

        /* interleave rows 0,2,4,6 ... */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]        \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]        \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]        \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]        \n\t"

        /* ... and rows 1,3,5,7 for the pmaddhw pair products */
        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]        \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]        \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]        \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]        \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        /* load the 8 destination rows, walking down with tmp0 */
        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]        \n\t"

        /* add transform results to the widened pixels */
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]       \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]       \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]       \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]       \n\t"
        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]       \n\t"
        "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]       \n\t"
        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]       \n\t"
        "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]       \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"
        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]        \n\t"
        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]        \n\t"
        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]        \n\t"
        "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]        \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]),          [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]),          [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]),          [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),          [ftmp23]"=&f"(ftmp[23]),
        : [ff_pw_1]"f"(ff_pw_1_local),      [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src),                    [dest]"r"(dest),            [linesize]"r"(linesize)
1161 /* Do inverse transform on 4x4 part of block */
/* Add the DC-only inverse transform of a 4x4 block to dest.
 * Only the DC coefficient contributes; both 4-pt transforms scale it by
 * 17, giving the two scalar scalings below before the SIMD add/clip. */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
    dc = (17 * dc + 4) >> 3;
    dc = (17 * dc + 64) >> 7;
        /* replicate dc into all four halfwords of %[dc] */
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]        \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]        \n\t"

        /* load one 4-pixel row from each of the 4 destination lines */
        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        /* widen u8 -> s16 */
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"

        /* add dc (saturating), then clip back to u8 */
        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]           \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]           \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]           \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]           \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize),      [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),      [dest3]"r"(dest+3*linesize),
/* Full inverse transform of a 4x4 block, result added to dest.
 * Pass 1: horizontal 4-pt transform on the 4 rows, in-place in block,
 * rounding (x + 4) >> 3.  Pass 2: vertical 4-pt transform via pmaddhw on
 * interleaved columns, rounding (x + 64) >> 7, then add to the picture.
 * Column coefficient pairs are packed as (hi<<16)|lo halfword constants,
 * e.g. 0x00160011 = (22,17), 0xffeaffef = (-22,-17). */
void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
    int16_t *src = block;
    int16_t *dst = block;
    uint32_t count = 4, tmp[1];
    /* rows of the 4-pt transform matrix (17,22,17,10 and permutations) */
    int16_t coeff[16] = {17, 22, 17, 10,
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};

        /* --- pass 1: 4-pt row transform --- */
        "li         %[tmp0],    0x03                        \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                    \n\t"
        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]        \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]        \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]        \n\t"
        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]        \n\t"
        /* horizontal add of partial sums, then round and repack to s16 */
        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]        \n\t"
        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]        \n\t"
        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]        \n\t"
        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]        \n\t"
        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]       \n\t"
        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]        \n\t"
        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]      \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]      \n\t"
        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]        \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]        \n\t"
        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]        \n\t"
        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]        \n\t"
        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]        \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU  "%[src],     %[src],     0x10            \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x10            \n\t"
        "addiu      %[count],   %[count],   -0x01           \n\t"
        "bnez       %[count],   1b                          \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),              [count]"+&r"(count),
          [src]"+&r"(src),                  [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local),      [coeff]"r"(coeff)

        /* --- pass 2: 4-pt column transform --- */
        "li         %[tmp0],    0x07                        \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                    \n\t"
        "li         %[tmp0],    0x44                        \n\t"
        "mtc1       %[tmp0],    %[ftmp15]                   \n\t"

        /* interleave the 4 rows into (row0,row1) and (row2,row3) pairs */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]        \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]        \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]        \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]        \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 -- coeffs 17,22,17,10 */
        "li         %[tmp0],    0x00160011                  \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                    \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]       \n\t"
        "li         %[tmp0],    0x000a0011                  \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                    \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]       \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]        \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]        \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]        \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]        \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]        \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]        \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]     \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]     \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]        \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]        \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]       \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]       \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]        \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 -- coeffs 17,10,-17,-22 */
        "li         %[tmp0],    0x000a0011                  \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                    \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]       \n\t"
        "li         %[tmp0],    0xffeaffef                  \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                    \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]       \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]        \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]        \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]        \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]        \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]        \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]        \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]     \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]     \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]        \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]        \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]       \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]       \n\t"
        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]        \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 -- coeffs 17,-10,-17,22 */
        "li         %[tmp0],    0xfff60011                  \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                    \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]       \n\t"
        "li         %[tmp0],    0x0016ffef                  \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                    \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]       \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]        \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]        \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]        \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]        \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]        \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]        \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]     \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]     \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]        \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]        \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]       \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]       \n\t"
        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]        \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 -- coeffs 17,-22,17,-10 */
        "li         %[tmp0],    0xffea0011                  \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                    \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]       \n\t"
        "li         %[tmp0],    0xfff60011                  \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                    \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]       \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]        \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]        \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]        \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]        \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]        \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]        \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]     \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]     \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]        \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]        \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]       \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]       \n\t"
        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]        \n\t"

        /* add the results to the 4 destination rows and clip */
        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]       \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]       \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]       \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]       \n\t"
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]        \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]        \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]        \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]        \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]     \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src),                    [dest]"r"(dest),            [linesize]"r"(linesize)
1415 /* Apply overlap transform to horizontal edge */
/* Smooth the vertical boundary between two horizontally adjacent 8x8
 * blocks: for each of the 8 rows, filter the 4 pixels straddling the
 * edge (src[-2..1]).  rnd alternates per row for proper rounding. */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
    for (i = 0; i < 8; i++) {
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;
        /* only the two pixels nearest the edge need clipping */
        src[-1] = av_clip_uint8(b - d2);
        src[0] = av_clip_uint8(c + d2);
/* 16-bit (pre-clip) variant of the horizontal overlap smoother, operating
 * on the last two columns of `left` and first two of `right`.  Values are
 * kept scaled by 8 (<< 3) with alternating rounders rnd1/rnd2. */
void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
    int rnd1 = flags & 2 ? 3 : 4;   /* rounding pair selected by caller flags */
    int rnd2 = 7 - rnd1;
    for (i = 0; i < 8; i++) {
        left[6] = ((a << 3) - d1 + rnd1) >> 3;
        left[7] = ((b << 3) - d2 + rnd2) >> 3;
        right[0] = ((c << 3) + d2 + rnd1) >> 3;
        right[1] = ((d << 3) + d1 + rnd2) >> 3;
        right += right_stride;
        left += left_stride;
1468 /* Apply overlap transform to vertical edge */
/* Smooth the horizontal boundary between two vertically adjacent 8x8
 * blocks: for each of the 8 columns, filter the 4 pixels straddling the
 * edge (rows -2..1).  rnd alternates per column for proper rounding. */
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
    for (i = 0; i < 8; i++) {
        a = src[-2 * stride];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;
        /* outer pixels stay in range by construction; inner pair is clipped */
        src[-2 * stride] = a - d1;
        src[-stride] = av_clip_uint8(b - d2);
        src[0] = av_clip_uint8(c + d2);
        src[stride] = d + d1;
/* 16-bit (pre-clip) variant of the vertical overlap smoother; rows are 8
 * samples apart, so top[48]/top[56] are the last two rows of `top` and
 * bottom[0]/bottom[8] the first two rows of `bottom`. */
void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
    int rnd1 = 4, rnd2 = 3;   /* alternating rounders, swapped each column */
    for (i = 0; i < 8; i++) {
        top[48] = ((a << 3) - d1 + rnd1) >> 3;
        top[56] = ((b << 3) - d2 + rnd2) >> 3;
        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1519 * VC-1 in-loop deblocking filter for one line
 * @param src pointer to the first pixel past the edge being filtered
1521 * @param stride block stride
1522 * @param pq block quantizer
1523 * @return whether other 3 pairs should be filtered or not
static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
    /* a0: filtered gradient across the edge (pixels -2..1) */
    int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
              5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
    int a0_sign = a0 >> 31;        /* Store sign */
    a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
    /* a1/a2: same gradient measured on the two neighbouring pixel groups */
    int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
                    5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
    int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
                    5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
    if (a1 < a0 || a2 < a0) {
        /* edge gradient dominates the neighbourhood: candidate for filtering */
        int clip = src[-1 * stride] - src[0 * stride];
        int clip_sign = clip >> 31;
        clip = ((clip ^ clip_sign) - clip_sign) >> 1;   /* |clip| / 2 */
        int a3 = FFMIN(a1, a2);
        int d = 5 * (a3 - a0);
        int d_sign = (d >> 31);
        d = ((d ^ d_sign) - d_sign) >> 3;   /* |d| * 5 / 8 */
        /* filter only if correction direction matches the edge step */
        if (d_sign ^ clip_sign)
        d = (d ^ d_sign) - d_sign; /* Restore sign */
        src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
        src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1567 * VC-1 in-loop deblocking filter
 * @param src pointer to the first pixel past the edge being filtered
1569 * @param step distance between horizontally adjacent elements
1570 * @param stride distance between vertically adjacent elements
1571 * @param len edge length to filter (4 or 8 pixels)
1572 * @param pq block quantizer
static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
    for (i = 0; i < len; i += 4) {
        /* filter the 3rd line of each group first: its result decides
         * whether the remaining 3 lines of the group are filtered */
        filt3 = vc1_filter_line(src + 2 * step, stride, pq);
            vc1_filter_line(src + 0 * step, stride, pq);
            vc1_filter_line(src + 1 * step, stride, pq);
            vc1_filter_line(src + 3 * step, stride, pq);
/* Deblock a horizontal edge, 4 pixels wide (step 1 along the row). */
void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
    vc1_loop_filter(src, 1, stride, 4, pq);
/* Deblock a vertical edge, 4 pixels tall (step = stride, stride 1). */
void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
    vc1_loop_filter(src, stride, 1, 4, pq);
/* Deblock a horizontal edge, 8 pixels wide. */
void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
    vc1_loop_filter(src, 1, stride, 8, pq);
/* Deblock a vertical edge, 8 pixels tall. */
void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
    vc1_loop_filter(src, stride, 1, 8, pq);
/* Deblock a horizontal edge, 16 pixels wide. */
void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
    vc1_loop_filter(src, 1, stride, 16, pq);
/* Deblock a vertical edge, 16 pixels tall. */
void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
    vc1_loop_filter(src, stride, 1, 16, pq);
/* mc00 (no subpel shift): plain 8x8 copy; rnd is unused for this case. */
void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
    ff_put_pixels8_8_mmi(dst, src, stride, 8);
/* mc00, 16x16 variant: plain copy; rnd is unused for this case. */
void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
    ff_put_pixels16_8_mmi(dst, src, stride, 16);
/* mc00, averaging variant: 8x8 average with existing dst pixels. */
void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
    ff_avg_pixels8_8_mmi(dst, src, stride, 8);
/* mc00, 16x16 averaging variant. */
void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
    ff_avg_pixels16_8_mmi(dst, src, stride, 16);
/* Store-merge hooks for the mspel kernels: OP_PUT writes D as-is (no-op
 * merge); OP_AVG averages D with the bytes already at S, using $f16 as
 * scratch. */
#define OP_PUT(S, D)
#define OP_AVG(S, D) \
    "ldc1       $f16,   "#S"                \n\t" \
    "pavgb      "#D",   "#D",   $f16        \n\t"
1648 /** Add rounder from $f14 to $f6 and pack result at destination */
#define NORMALIZE_MMI(SHIFT) \
    /* add the pre-broadcast rounder in $f14, then arithmetic shift */ \
    "paddh      $f6,    $f6,    $f14        \n\t" /* +bias-r */ \
    "paddh      $f8,    $f8,    $f14        \n\t" /* +bias-r */ \
    "psrah      $f6,    $f6,    "SHIFT"     \n\t" \
    "psrah      $f8,    $f8,    "SHIFT"     \n\t"
/* Pack the two s16 halves $f6/$f8 into 8 unsigned bytes, merge with the
 * destination via OP, and store one 8-byte row. */
#define TRANSFER_DO_PACK(OP) \
    "packushb   $f6,    $f6,    $f8         \n\t" \
    "sdc1       $f6,    0x00(%[dst])        \n\t"
/* Store $f6/$f8 as two 8-byte halves without packing to bytes (data stays
 * 16-bit); OP merges with the destination first. */
#define TRANSFER_DONT_PACK(OP) \
    OP(0(%[dst]), $f6) \
    OP(8(%[dst]), $f8) \
    "sdc1       $f6,    0x00(%[dst])        \n\t" \
    "sdc1       $f8,    0x08(%[dst])        \n\t"
1666 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
/* Widen packed u8 pixels in reg to s16 using $f0 (assumed zero); the
 * DONT variant is used when the input is already 16-bit. */
#define DO_UNPACK(reg) \
    "punpcklbh  "reg",  "reg",  $f0         \n\t"
#define DONT_UNPACK(reg)
1671 /** Compute the rounder 32-r or 8-r and unpacks it to $f14 */
#define LOAD_ROUNDER_MMI(ROUND) \
    "lwc1       $f14,   "ROUND"             \n\t" /* load 32-bit rounder */ \
    "punpcklhw  $f14,   $f14,   $f14        \n\t" \
    "punpcklwd  $f14,   $f14,   $f14        \n\t" /* broadcast to 4 halfwords */
/* One output line of the vertical (-1,9,9,-1)/2 filter:
 * dst = (9*(R1 + R2) - R0 - R3 + rounder) >> shift, with R0 reloaded from
 * src - 2*stride and R3 from src + stride so the four registers rotate
 * between calls.  The ×9 constant lives in $f12 (loaded from ff_pw_9 by
 * the caller); the rounder is in $f14 and zero in $f0. */
#define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
    "paddh      "#R1",      "#R1",      "#R2"       \n\t" \
    PTR_ADDU   "$9,         %[src],     %[stride1]  \n\t" \
    MMI_ULWC1(R0, $9, 0x00) \
    /* multiply by the constant 9 in $f12, NOT by data register $f6 */ \
    "pmullh     "#R1",      "#R1",      $f12        \n\t" \
    "punpcklbh  "#R0",      "#R0",      $f0         \n\t" \
    PTR_ADDU   "$9,         %[src],     %[stride]   \n\t" \
    MMI_ULWC1(R3, $9, 0x00) \
    "psubh      "#R1",      "#R1",      "#R0"       \n\t" \
    "punpcklbh  "#R3",      "#R3",      $f0         \n\t" \
    "paddh      "#R1",      "#R1",      $f14        \n\t" \
    "psubh      "#R1",      "#R1",      "#R3"       \n\t" \
    "psrah      "#R1",      "#R1",      %[shift]    \n\t" \
    MMI_SDC1(R1, %[dst], OFF) \
    PTR_ADDU   "%[src],     %[src],     %[stride]   \n\t"
1694 /** Sacrificing $f12 makes it possible to pipeline loads from src */
/* Vertical half-pel (-1,9,9,-1) filter producing unclipped 16-bit rows in
 * dst (4 columns per outer iteration); $f12 holds the ×9 constant. */
static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
                                       const uint8_t *src, mips_reg stride,
                                       int rnd, int64_t shift)
        "xor        $f0,    $f0,    $f0             \n\t"
        LOAD_ROUNDER_MMI("%[rnd]")
        "ldc1       $f12,   %[ff_pw_9]              \n\t"
        /* prime the 2-line window, then run 8 rotated filter lines */
        MMI_ULWC1($f4, %[src], 0x00)
        PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
        MMI_ULWC1($f6, %[src], 0x00)
        "punpcklbh  $f4,    $f4,    $f0             \n\t"
        "punpcklbh  $f6,    $f6,    $f0             \n\t"
        SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
        SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
        SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
        SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
        SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
        SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
        SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
        SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
        /* rewind 9 lines and step 4 pixels right for the next column group */
        PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
        PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
        "addiu      $8,     $8,     -0x01           \n\t"
        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
          [src]"+r"(src),               [dst]"+r"(dst)
        : [stride]"r"(stride),          [stride1]"r"(-2*stride),
          [shift]"f"(shift),            [rnd]"m"(rnd),
          [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
          "$f14", "$f16", "memory"
1736 * Data is already unpacked, so some operations can directly be made from
/* Horizontal half-pel (-1,9,9,-1) filter over the 16-bit intermediate rows
 * produced by the vertical pass (24 halfwords per row).  For each output i,
 * the taps are src[i-1], src[i], src[i+1], src[i+2]; the 8 outputs are done
 * as two pmullh/psubh halves with halfword-pair loads at byte offsets
 * 0x00/0x08 (i-1), 0x02/0x0a (i), 0x04/0x0c (i+2... see below), 0x06/0x0e.
 * The -1024 bias keeps the intermediate sums positive for psrah. */
#define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
                                             const int16_t *src, int rnd) \
    DECLARE_VAR_ALL64; \
    DECLARE_VAR_ADDRT; \
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \
        LOAD_ROUNDER_MMI("%[rnd]") \
        "ldc1       $f12,   %[ff_pw_128]            \n\t" \
        "ldc1       $f10,   %[ff_pw_9]              \n\t" \
        /* src[i-1] taps, both halves */ \
        MMI_ULDC1($f2, %[src], 0x00) \
        MMI_ULDC1($f4, %[src], 0x08) \
        /* src[i] taps, both halves */ \
        MMI_ULDC1($f6, %[src], 0x02) \
        MMI_ULDC1($f8, %[src], 0x0a) \
        /* accumulate src[i+2] onto the -1 taps and src[i+1] onto the 9 taps */ \
        MMI_ULDC1($f0, %[src], 0x06) \
        "paddh      $f2,    $f2,    $f0             \n\t" \
        MMI_ULDC1($f0, %[src], 0x0e) \
        "paddh      $f4,    $f4,    $f0             \n\t" \
        MMI_ULDC1($f0, %[src], 0x04) \
        "paddh      $f6,    $f6,    $f0             \n\t" \
        /* second half of src[i+1]: halfwords 6..9 start at byte 0x0c */ \
        MMI_ULDC1($f0, %[src], 0x0c) \
        "paddh      $f8,    $f8,    $f0             \n\t" \
        "pmullh     $f6,    $f6,    $f10            \n\t" \
        "pmullh     $f8,    $f8,    $f10            \n\t" \
        "psubh      $f6,    $f6,    $f2             \n\t" \
        "psubh      $f8,    $f8,    $f4             \n\t" \
        "li         $8,     0x07                    \n\t" \
        "mtc1       $8,     $f16                    \n\t" \
        NORMALIZE_MMI("$f16") \
        /* remove the -128 bias introduced above */ \
        "paddh      $f6,    $f6,    $f12            \n\t" \
        "paddh      $f8,    $f8,    $f12            \n\t" \
        TRANSFER_DO_PACK(OP) \
        "addiu      %[h],   %[h],   -0x01           \n\t" \
        PTR_ADDIU  "%[src], %[src], 0x18            \n\t" \
        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t" \
        "bnez       %[h],   1b                      \n\t" \
        : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT \
          [src]"+r"(src),               [dst]"+r"(dst) \
        : [stride]"r"(stride),          [rnd]"m"(rnd), \
          [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128) \
        : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \
/* Instantiate put_ and avg_ variants of the horizontal 16-bit filter. */
VC1_HOR_16B_SHIFT2(OP_PUT, put_)
VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1796 * Purely vertical or horizontal 1/2 shift interpolation.
 * Sacrifice $f12 to hold the *9 factor.
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,  \
                                     mips_reg stride, int rnd,          \
    DECLARE_VAR_LOW32;                                                  \
    DECLARE_VAR_ADDRT;                                                  \
                                                                        \
        "xor        $f0,    $f0,    $f0             \n\t"               \
        "li         $10,    0x08                    \n\t"               \
        LOAD_ROUNDER_MMI("%[rnd]")                                      \
        "ldc1       $f12,   %[ff_pw_9]              \n\t"               \
        /* taps at 0 and +offset (the two x9 taps) */                   \
        MMI_ULWC1($f6, %[src], 0x00)                                    \
        MMI_ULWC1($f8, %[src], 0x04)                                    \
        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"               \
        MMI_ULWC1($f2, $9, 0x00)                                        \
        MMI_ULWC1($f4, $9, 0x04)                                        \
        PTR_ADDU   "%[src], %[src], %[offset]       \n\t"               \
        "punpcklbh  $f6,    $f6,    $f0             \n\t"               \
        "punpcklbh  $f8,    $f8,    $f0             \n\t"               \
        "punpcklbh  $f2,    $f2,    $f0             \n\t"               \
        "punpcklbh  $f4,    $f4,    $f0             \n\t"               \
        "paddh      $f6,    $f6,    $f2             \n\t"               \
        "paddh      $f8,    $f8,    $f4             \n\t"               \
        /* tap at -offset (src already advanced by +offset) */          \
        PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"               \
        MMI_ULWC1($f2, $9, 0x00)                                        \
        MMI_ULWC1($f4, $9, 0x04)                                        \
        "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/  \
        "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/  \
        "punpcklbh  $f2,    $f2,    $f0             \n\t"               \
        "punpcklbh  $f4,    $f4,    $f0             \n\t"               \
        "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/  \
        "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/  \
        /* tap at +2*offset */                                          \
        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"               \
        MMI_ULWC1($f2, $9, 0x00)                                        \
        MMI_ULWC1($f4, $9, 0x04)                                        \
        "punpcklbh  $f2,    $f2,    $f0             \n\t"               \
        "punpcklbh  $f4,    $f4,    $f0             \n\t"               \
        "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/ \
        "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/ \
        "li         $8,     0x04                    \n\t"               \
        "mtc1       $8,     $f16                    \n\t"               \
        NORMALIZE_MMI("$f16")                                           \
        "packushb   $f6,    $f6,    $f8             \n\t"               \
        "sdc1       $f6,    0x00(%[dst])            \n\t"               \
        "addiu      $10,    $10,    -0x01           \n\t"               \
        PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"               \
        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"               \
        "bnez       $10,    1b                      \n\t"               \
        : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT              \
          [src]"+r"(src),               [dst]"+r"(dst)                  \
        : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),     \
          [stride]"r"(stride),          [rnd]"m"(rnd),                  \
          [stride1]"r"(stride-offset),                                  \
          [ff_pw_9]"m"(ff_pw_9)                                         \
        : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
          "$f12", "$f14", "$f16", "memory"                              \
/* Instantiate the single-pass 1/2-pel filter for put and avg stores. */
VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)
1868 * Core of the 1/4 and 3/4 shift bicubic interpolation.
1870 * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
1871 * @param LOAD "MMI_ULWC1" or "MMI_ULDC1", if data read is already unpacked.
1872 * @param M "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1873 * @param A1 Stride address of 1st tap (beware of unpacked/packed).
1874 * @param A2 Stride address of 2nd tap
1875 * @param A3 Stride address of 3rd tap
1876 * @param A4 Stride address of 4th tap
/*
 * Shared core of the bicubic (-4, 53, 18, -3) filter: accumulates the four
 * taps (A1..A4 are byte offsets from %[src]) into $f6/$f8 as 2x4 16-bit
 * values.  Expects $f10 = 53 and $f12 = 18 preloaded by the caller.
 */
#define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
    PTR_ADDU "$9, %[src], "#A1" \n\t" \
    LOAD($f2, $9, M*0) \
    LOAD($f4, $9, M*4) \
    "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
    "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
    PTR_ADDU "$9, %[src], "#A2" \n\t" \
    LOAD($f6, $9, M*0) \
    LOAD($f8, $9, M*4) \
    "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
    "pmullh $f8, $f8, $f12 \n\t" /* *18 */ \
    "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \
    "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \
    PTR_ADDU "$9, %[src], "#A4" \n\t" \
    LOAD($f2, $9, M*0) \
    LOAD($f4, $9, M*4) \
    /* *4 via shift, then subtract -> effective tap -4. */ \
    "li $8, 0x02 \n\t" \
    "mtc1 $8, $f16 \n\t" \
    "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
    "psllh $f4, $f4, $f16 \n\t" /* 4* */ \
    "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \
    "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \
    PTR_ADDU "$9, %[src], "#A3" \n\t" \
    LOAD($f2, $9, M*0) \
    LOAD($f4, $9, M*4) \
    "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
    "pmullh $f4, $f4, $f10 \n\t" /* *53 */ \
    "paddh $f6, $f6, $f2 \n\t" /* -4,53,18,-3 */ \
    "paddh $f8, $f8, $f4 \n\t" /* -4,53,18,-3 */
1917 * Macro to build the vertical 16bits version of vc1_put_shift[13].
1918 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and
 * %[stride_x3] (3*src_stride).
1921 * @param NAME Either 1 or 3
1922 * @see MSPEL_FILTER13_CORE for information on A1->A4
/*
 * Vertical bicubic pass: reads packed bytes, writes 16-bit intermediates in
 * rows of 24 bytes (two 8-byte chunks via TRANSFER_DONT_PACK plus one extra
 * 8-byte chunk at dst+0x10 computed below for the 3 spare columns).
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
                                 mips_reg src_stride, \
                                 int rnd, int64_t shift) \
    DECLARE_VAR_LOW32; \
    DECLARE_VAR_ADDRT; \
    src -= src_stride; \
    "xor $f0, $f0, $f0 \n\t" \
    LOAD_ROUNDER_MMI("%[rnd]") \
    "ldc1 $f10, %[ff_pw_53] \n\t" \
    "ldc1 $f12, %[ff_pw_18] \n\t" \
    MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
    NORMALIZE_MMI("%[shift]") \
    TRANSFER_DONT_PACK(OP_PUT) \
    /* Last 3 (in fact 4) bytes on the line */ \
    PTR_ADDU "$9, %[src], "#A1" \n\t" \
    MMI_ULWC1($f2, $9, 0x08) \
    /* x + 2x = 3x: build the -3 tap without a multiply. */ \
    "mov.d $f6, $f2 \n\t" \
    "paddh $f2, $f2, $f2 \n\t" \
    "paddh $f2, $f2, $f6 \n\t" /* 3* */ \
    PTR_ADDU "$9, %[src], "#A2" \n\t" \
    MMI_ULWC1($f6, $9, 0x08) \
    "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
    "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \
    PTR_ADDU "$9, %[src], "#A3" \n\t" \
    MMI_ULWC1($f2, $9, 0x08) \
    "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
    "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \
    PTR_ADDU "$9, %[src], "#A4" \n\t" \
    MMI_ULWC1($f2, $9, 0x08) \
    "li $8, 0x02 \n\t" \
    "mtc1 $8, $f16 \n\t" \
    "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
    "psubh $f6, $f6, $f2 \n\t" \
    /* Round ($f14 holds the rounder) and shift; spare columns use >> 6. */ \
    "paddh $f6, $f6, $f14 \n\t" \
    "li $8, 0x06 \n\t" \
    "mtc1 $8, $f16 \n\t" \
    "psrah $f6, $f6, $f16 \n\t" \
    "sdc1 $f6, 0x10(%[dst]) \n\t" \
    "addiu %[h], %[h], -0x01 \n\t" \
    PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
    PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
    "bnez %[h], 1b \n\t" \
    : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
      [src]"+r"(src), [dst]"+r"(dst) \
    : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
      [stride_x3]"r"(3*src_stride), \
      [rnd]"m"(rnd), [shift]"f"(shift), \
      [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
      [ff_pw_3]"f"(ff_pw_3) \
    : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
      "$f14", "$f16", "memory" \
1993 * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1994 * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
1996 * @param NAME Either 1 or 3
1997 * @see MSPEL_FILTER13_CORE for information on A1->A4
/*
 * Horizontal bicubic pass over 16-bit intermediates (rows are 24 bytes
 * apart, hence the src step of 0x18), packing results back to 8-bit dst.
 * A1..A4 are byte offsets (0/2/4/6) into the 16-bit row.
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
                                       const int16_t *src, int rnd) \
    DECLARE_VAR_ALL64; \
    DECLARE_VAR_ADDRT; \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
    "xor $f0, $f0, $f0 \n\t" \
    LOAD_ROUNDER_MMI("%[rnd]") \
    "ldc1 $f10, %[ff_pw_53] \n\t" \
    "ldc1 $f12, %[ff_pw_18] \n\t" \
    MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
    "li $8, 0x07 \n\t" \
    "mtc1 $8, $f16 \n\t" \
    NORMALIZE_MMI("$f16") \
    "paddh $f6, $f6, %[ff_pw_128] \n\t" \
    "paddh $f8, $f8, %[ff_pw_128] \n\t" \
    TRANSFER_DO_PACK(OP) \
    "addiu %[h], %[h], -0x01 \n\t" \
    /* Immediate add: use PTR_ADDIU (PTR_ADDU is the register-register \
     * form), matching the other immediate pointer steps in this file. */ \
    PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
    PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
    "bnez %[h], 1b \n\t" \
    : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
      [src]"+r"(src), [dst]"+r"(dst) \
    : [stride]"r"(stride), [rnd]"m"(rnd), \
      [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
      [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128) \
    : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
      "$f14", "$f16", "memory" \
2042 * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
2043 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
2046 * @param NAME Either 1 or 3
2047 * @see MSPEL_FILTER13_CORE for information on A1->A4
/*
 * Single-pass bicubic filter on 8-bit data, usable in either direction
 * (offset = 1 for horizontal, offset = stride for vertical).
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
                              mips_reg stride, int rnd, mips_reg offset) \
    DECLARE_VAR_LOW32; \
    DECLARE_VAR_ADDRT; \
    __asm__ volatile ( \
    "xor $f0, $f0, $f0 \n\t" \
    LOAD_ROUNDER_MMI("%[rnd]") \
    "ldc1 $f10, %[ff_pw_53] \n\t" \
    "ldc1 $f12, %[ff_pw_18] \n\t" \
    MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
    /* Filter gain is 64 -> round and shift right by 6, then pack. */ \
    "li $8, 0x06 \n\t" \
    "mtc1 $8, $f16 \n\t" \
    NORMALIZE_MMI("$f16") \
    TRANSFER_DO_PACK(OP) \
    "addiu %[h], %[h], -0x01 \n\t" \
    PTR_ADDU "%[src], %[src], %[stride] \n\t" \
    PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
    "bnez %[h], 1b \n\t" \
    : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
      [src]"+r"(src), [dst]"+r"(dst) \
    : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
      [offset_x3]"r"(3*offset), [stride]"r"(stride), \
      [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
      [ff_pw_3]"f"(ff_pw_3) \
    : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
      "$f14", "$f16", "memory" \
/** 1/4 shift bicubic interpolation */
/* Tap order A1..A4 for shift3 is the mirror image of shift1. */
MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
/* Vertical first pass: bytes in, 16-bit intermediates out. */
typedef void (*vc1_mspel_mc_filter_ver_16bits)
(int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
/* Horizontal second pass: 16-bit intermediates in, bytes out. */
typedef void (*vc1_mspel_mc_filter_hor_16bits)
(uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
/* Single-pass filter used when only one direction needs interpolation. */
typedef void (*vc1_mspel_mc_filter_8bits)
(uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2115 * Interpolate fractional pel values by applying proper vertical then
2116 * horizontal filter.
2118 * @param dst Destination buffer for interpolated pels.
2119 * @param src Source buffer.
2120 * @param stride Stride for both src and dst buffers.
2121 * @param hmode Horizontal filter (expressed in quarter pixels shift).
 * @param vmode   Vertical filter.
2123 * @param rnd Rounding bias.
/* Dispatch an 8x8 quarter-pel MC: pick per-axis filters by shift index
 * (1 = 1/4, 2 = 1/2, 3 = 3/4), running ver then hor passes when both apply. */
#define VC1_MSPEL_MC(OP) \
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd) \
    /* Index 0 is NULL: hmode/vmode of 0 means "no filter on this axis". */ \
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmi, \
                 vc1_put_ver_16b_shift2_mmi, \
                 vc1_put_ver_16b_shift3_mmi }; \
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmi, \
                 OP ## vc1_hor_16b_shift2_mmi, \
                 OP ## vc1_hor_16b_shift3_mmi }; \
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
         { NULL, OP ## vc1_shift1_mmi, \
                 OP ## vc1_shift2_mmi, \
                 OP ## vc1_shift3_mmi }; \
    if (vmode) { /* Vertical filter to apply */ \
        if (hmode) { /* Horizontal filter to apply, output to tmp */ \
            static const int shift_value[] = { 0, 5, 1, 5 }; \
            int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
            /* 11 input rows x 8 columns of 16-bit intermediates (24B rows). */ \
            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
            r = (1<<(shift-1)) + rnd-1; \
            /* src-1: the bicubic filter needs one column/row of context. */ \
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
        else { /* No horizontal filter, output 8 lines to dst */ \
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
    /* Horizontal mode with no vertical mode */ \
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
/* 16x16 variant: four independent 8x8 quadrants. */ \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
                                  int stride, int hmode, int vmode, int rnd)\
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
    dst += 8*stride; src += 8*stride; \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
/** Macro to ease bicubic filter interpolation functions declarations */
/* Emits the four public entry points (put/avg x 8x8/16x16) for quarter-pel
 * position (a, b) = (hmode, vmode); these are the names exported via
 * vc1dsp_mips.h. */
#define DECLARE_FUNCTION(a, b) \
void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
                                           const uint8_t *src, \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
                                           const uint8_t *src, \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
                                              const uint8_t *src, \
    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
                                              const uint8_t *src, \
    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
/* All quarter-pel positions except (0,0), which is plain copy/avg and is
 * provided elsewhere. */
DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)
DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)
DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)
DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
/*
 * 8-wide bilinear chroma filter body: computes
 * (A*src[0] + B*src[1] + C*src[stride] + D*src[stride+1] + 28) >> 6
 * for 8 pixels.  Inputs: ftmp1..ftmp4 hold the four packed source rows,
 * ftmp0 = 0, ftmp9 = 6 (shift count).  Result packed in ftmp1.
 */
#define CHROMA_MC_8_MMI \
    /* Split each 8-byte row into low/high 16-bit halves. */ \
    "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
    "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
    "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
    "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
    "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
    "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
    "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
    "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
    /* Low half: sum taps, add rounder 28. */ \
    "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
    "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
    /* High half: same. */ \
    "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
    "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
    "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
    /* >> 6 and repack to 8 unsigned bytes. */ \
    "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
    "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
    "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
/*
 * 4-wide variant of CHROMA_MC_8_MMI: only the low 4 bytes of each row are
 * processed (ftmp5 holds the shift count 6 here).  Result in low half of
 * ftmp1.
 */
#define CHROMA_MC_4_MMI \
    "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
    "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
    /* Sum taps, add rounder 28, >> 6, pack low 4 results. */ \
    "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
    "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
    "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
    "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
/* 8x{h} bilinear chroma MC, "no rounding" variant (rounder 28, >> 6);
 * (x, y) is the 1/8-pel fractional position. */
void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
                                      uint8_t *src /* align 1 */,
                                      int stride, int h, int x, int y)
    /* Bilinear weights; A+B+C+D == 64. */
    const int A = (8 - x) * (8 - y);
    const int B = (x) * (8 - y);
    const int C = (8 - x) * (y);
    const int D = (x) * (y);
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
        /* Broadcast each weight across 4 halfwords; ftmp9 = shift count 6. */
        "li %[tmp0], 0x06 \n\t"
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "mtc1 %[tmp0], %[ftmp9] \n\t"
        "pshufh %[A], %[A], %[ftmp0] \n\t"
        "pshufh %[B], %[B], %[ftmp0] \n\t"
        "pshufh %[C], %[C], %[ftmp0] \n\t"
        "pshufh %[D], %[D], %[ftmp0] \n\t"
        /* Per row: load the 2x2 neighborhood rows (src advances once per
         * iteration, so row 2 of this pass is row 1 of the next). */
        MMI_ULDC1(%[ftmp1], %[src], 0x00)
        MMI_ULDC1(%[ftmp2], %[src], 0x01)
        PTR_ADDU "%[src], %[src], %[stride] \n\t"
        MMI_ULDC1(%[ftmp3], %[src], 0x00)
        MMI_ULDC1(%[ftmp4], %[src], 0x01)
        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst),
        : [stride]"r"((mips_reg)stride),
          [A]"f"(A), [B]"f"(B),
          [C]"f"(C), [D]"f"(D),
          [ff_pw_28]"f"(ff_pw_28)
/* 4x{h} bilinear chroma MC, "no rounding" variant; see the 8-wide version
 * for the filter details. */
void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
                                      uint8_t *src /* align 1 */,
                                      int stride, int h, int x, int y)
    /* Bilinear weights; A+B+C+D == 64. */
    const int A = (8 - x) * (8 - y);
    const int B = (x) * (8 - y);
    const int C = (8 - x) * (y);
    const int D = (x) * (y);
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
        /* Broadcast weights; ftmp5 = shift count 6 (4-wide core uses it). */
        "li %[tmp0], 0x06 \n\t"
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "mtc1 %[tmp0], %[ftmp5] \n\t"
        "pshufh %[A], %[A], %[ftmp0] \n\t"
        "pshufh %[B], %[B], %[ftmp0] \n\t"
        "pshufh %[C], %[C], %[ftmp0] \n\t"
        "pshufh %[D], %[D], %[ftmp0] \n\t"
        /* 32-bit loads: only 4 pixels per row are needed. */
        MMI_ULWC1(%[ftmp1], %[src], 0x00)
        MMI_ULWC1(%[ftmp2], %[src], 0x01)
        PTR_ADDU "%[src], %[src], %[stride] \n\t"
        MMI_ULWC1(%[ftmp3], %[src], 0x00)
        MMI_ULWC1(%[ftmp4], %[src], 0x01)
        MMI_SWC1(%[ftmp1], %[dst], 0x00)
        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst),
        : [stride]"r"((mips_reg)stride),
          [A]"f"(A), [B]"f"(B),
          [C]"f"(C), [D]"f"(D),
          [ff_pw_28]"f"(ff_pw_28)
/* 8x{h} bilinear chroma MC, averaging variant: the filtered row is
 * byte-averaged (pavgb) with the existing dst contents before the store. */
void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
                                      uint8_t *src /* align 1 */,
                                      int stride, int h, int x, int y)
    /* Bilinear weights; A+B+C+D == 64. */
    const int A = (8 - x) * (8 - y);
    const int B = (x) * (8 - y);
    const int C = (8 - x) * (y);
    const int D = (x) * (y);
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
        /* Broadcast weights; ftmp9 = shift count 6. */
        "li %[tmp0], 0x06 \n\t"
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "mtc1 %[tmp0], %[ftmp9] \n\t"
        "pshufh %[A], %[A], %[ftmp0] \n\t"
        "pshufh %[B], %[B], %[ftmp0] \n\t"
        "pshufh %[C], %[C], %[ftmp0] \n\t"
        "pshufh %[D], %[D], %[ftmp0] \n\t"
        MMI_ULDC1(%[ftmp1], %[src], 0x00)
        MMI_ULDC1(%[ftmp2], %[src], 0x01)
        PTR_ADDU "%[src], %[src], %[stride] \n\t"
        MMI_ULDC1(%[ftmp3], %[src], 0x00)
        MMI_ULDC1(%[ftmp4], %[src], 0x01)
        /* Average with current dst (ftmp2 is free after the core filter). */
        MMI_LDC1(%[ftmp2], %[dst], 0x00)
        "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst),
        : [stride]"r"((mips_reg)stride),
          [A]"f"(A), [B]"f"(B),
          [C]"f"(C), [D]"f"(D),
          [ff_pw_28]"f"(ff_pw_28)
/* 4x{h} bilinear chroma MC, averaging variant (pavgb with dst before the
 * 32-bit store). */
void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
                                      uint8_t *src /* align 1 */,
                                      int stride, int h, int x, int y)
    /* Bilinear weights; A+B+C+D == 64. */
    const int A = (8 - x) * (8 - y);
    const int B = ( x) * (8 - y);
    const int C = (8 - x) * ( y);
    const int D = ( x) * ( y);
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
        /* Broadcast weights; ftmp5 = shift count 6. */
        "li %[tmp0], 0x06 \n\t"
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "mtc1 %[tmp0], %[ftmp5] \n\t"
        "pshufh %[A], %[A], %[ftmp0] \n\t"
        "pshufh %[B], %[B], %[ftmp0] \n\t"
        "pshufh %[C], %[C], %[ftmp0] \n\t"
        "pshufh %[D], %[D], %[ftmp0] \n\t"
        MMI_ULWC1(%[ftmp1], %[src], 0x00)
        MMI_ULWC1(%[ftmp2], %[src], 0x01)
        PTR_ADDU "%[src], %[src], %[stride] \n\t"
        MMI_ULWC1(%[ftmp3], %[src], 0x00)
        MMI_ULWC1(%[ftmp4], %[src], 0x01)
        /* Average with current dst. */
        MMI_LWC1(%[ftmp2], %[dst], 0x00)
        "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
        MMI_SWC1(%[ftmp1], %[dst], 0x00)
        "addiu %[h], %[h], -0x01 \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst),
        : [stride]"r"((mips_reg)stride),
          [A]"f"(A), [B]"f"(B),
          [C]"f"(C), [D]"f"(D),
          [ff_pw_28]"f"(ff_pw_28)