]> git.sesse.net Git - ffmpeg/blob - libavcodec/mips/vc1dsp_mmi.c
Merge commit '76eef04f30a768fa80366d679f3de7e9447b67d5'
[ffmpeg] / libavcodec / mips / vc1dsp_mmi.c
1 /*
2  * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
3  *
4  * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 #include "libavutil/avassert.h"
24 #include "libavcodec/vc1dsp.h"
25 #include "constants.h"
26 #include "vc1dsp_mips.h"
27 #include "hpeldsp_mips.h"
28 #include "libavutil/mips/mmiutils.h"
29
/*
 * VC1_INV_TRANCS_8_TYPE1 - inline-asm fragment computing one sum/difference
 * pair of output rows for the 8-point transform.
 *
 * The macro expands to a sequence of string literals and must be pasted
 * inside an __asm__ statement that defines the named operands tmp0,
 * ftmp0..ftmp14 and ftmp23.
 *
 * Inputs expected in registers (set up by the caller):
 *   ftmp5/ftmp6    multiplied (pmaddhw) by the r1 constant
 *   ftmp7/ftmp8    multiplied by the r2 constant
 *   ftmp9/ftmp10   multiplied by the r3 constant
 *   ftmp11/ftmp12  multiplied by the r4 constant
 *   ftmp0          shift count used by psraw
 *   ftmp23         selector used by pshufh to spread each immediate over the
 *                  register (callers in this file load 0x44; presumably this
 *                  repeats the low 32-bit coefficient pair - confirm against
 *                  the MMI reference)
 *
 * Parameters:
 *   o1      operand receiving the narrowed result of (a + b + c0) >> ftmp0
 *   o2      operand receiving the narrowed result of (a - b + c0) >> ftmp0
 *   r1, r2  32-bit immediates forming term a (r1 with ftmp5/6, r2 with ftmp7/8)
 *   r3, r4  32-bit immediates forming term b (r3 with ftmp9/10, r4 with
 *           ftmp11/12)
 *   c0      operand added to every 32-bit sum before the shift
 *
 * Clobbers tmp0, ftmp1..ftmp4, ftmp13 and ftmp14.
 */
30 #define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)                  \
31         "li         %[tmp0],    "#r1"                                 \n\t" \
32         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
33         "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                 \n\t" \
34         "li         %[tmp0],    "#r2"                                 \n\t" \
35         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
36         "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                 \n\t" \
37         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
38         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
39         "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
40         "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
41         "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
42         "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
43                                                                             \
44         "li         %[tmp0],    "#r3"                                 \n\t" \
45         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
46         "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                 \n\t" \
47         "li         %[tmp0],    "#r4"                                 \n\t" \
48         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
49         "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                 \n\t" \
50         "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
51         "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
52         "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
53         "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
54         "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
55         "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
56                                                                             \
57         "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
58         "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
59         "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
60         "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
61         "paddw      %[ftmp13],  %[ftmp13],  "#c0"                     \n\t" \
62         "paddw      %[ftmp14],  %[ftmp14],  "#c0"                     \n\t" \
63         "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
64         "paddw      %[ftmp3],   %[ftmp3],   "#c0"                     \n\t" \
65         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
66         "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
67         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
68         "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
69         "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
70         "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
71         "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
72         "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
73         "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
74         "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
75
/*
 * VC1_INV_TRANCS_8_TYPE2 - same instruction sequence as
 * VC1_INV_TRANCS_8_TYPE1 with one extra bias on the difference outputs.
 *
 * The macro expands to a sequence of string literals and must be pasted
 * inside an __asm__ statement that defines the named operands tmp0,
 * ftmp0..ftmp14 and ftmp23. Register inputs are identical to TYPE1:
 * ftmp5..ftmp8 are multiplied by r1/r2 (term a), ftmp9..ftmp12 by r3/r4
 * (term b), ftmp0 is the psraw shift count and ftmp23 the pshufh selector.
 *
 * Parameters:
 *   o1      operand receiving the narrowed result of (a + b + c0) >> ftmp0
 *   o2      operand receiving the narrowed result of
 *           (a - b + c1 + c0) >> ftmp0
 *   r1..r4  32-bit coefficient immediates, used as in TYPE1
 *   c0      operand added to all four 32-bit results before the shift
 *   c1      operand added only to the two difference results (ftmp14, ftmp3)
 *
 * Clobbers tmp0, ftmp1..ftmp4, ftmp13 and ftmp14.
 */
76 #define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)              \
77         "li         %[tmp0],    "#r1"                                 \n\t" \
78         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
79         "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                 \n\t" \
80         "li         %[tmp0],    "#r2"                                 \n\t" \
81         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
82         "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                 \n\t" \
83         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
84         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
85         "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
86         "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
87         "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
88         "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
89                                                                             \
90         "li         %[tmp0],    "#r3"                                 \n\t" \
91         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
92         "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                 \n\t" \
93         "li         %[tmp0],    "#r4"                                 \n\t" \
94         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
95         "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                 \n\t" \
96         "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
97         "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
98         "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
99         "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
100         "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
101         "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
102                                                                             \
103         "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
104         "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
105         "paddw      %[ftmp14],  %[ftmp14],  "#c1"                     \n\t" \
106         "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
107         "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
108         "paddw      %[ftmp3],   %[ftmp3],   "#c1"                     \n\t" \
109         "paddw      %[ftmp13],  %[ftmp13],  "#c0"                     \n\t" \
110         "paddw      %[ftmp14],  %[ftmp14],  "#c0"                     \n\t" \
111         "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
112         "paddw      %[ftmp3],   %[ftmp3],   "#c0"                     \n\t" \
113         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
114         "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
115         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
116         "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
117         "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
118         "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
119         "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
120         "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
121         "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
122         "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
123
124 /* Do inverse transform on 8x8 block (DC-only case).
 *
 * Only block[0] is read. It is scaled in two steps, (3 * dc + 1) >> 1 and
 * then (3 * dc + 16) >> 5, and the resulting value is added to each pixel
 * of the 8x8 area at dest. The pixels are modified in place.
 *
 * dest      top-left byte of the 8x8 pixel area; read and overwritten
 * linesize  byte distance between consecutive rows of dest
 * block     coefficient block; only element 0 is used
 */
125 void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
126 {
127     int dc = block[0];
128     double ftmp[9];
129     mips_reg addr[1];
130     int count;
131
132     dc = (3 * dc +  1) >> 1;
133     dc = (3 * dc + 16) >> 5;
134
135     __asm__ volatile(
        /* ftmp0 = 0: used both as the pshufh selector and as the zero
         * operand for the byte -> halfword unpacking below. */
136         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        /* Spread dc across the halfword lanes of its register (selector 0;
         * presumably a broadcast of lane 0 - confirm against the MMI
         * reference).
         * NOTE(review): this writes %[dc], which is listed below as an
         * input-only operand; GCC's extended-asm rules say inputs must not
         * be modified - confirm whether this should be an in/out operand. */
137         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
        /* Two iterations of four rows each cover the eight rows. */
138         "li         %[count],   0x02                                    \n\t"
139
140         "1:                                                             \n\t"
        /* Load four consecutive 8-byte rows starting at dest. */
141         MMI_LDC1(%[ftmp1], %[dest], 0x00)
142         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
143         MMI_LDC1(%[ftmp2], %[addr0], 0x00)
144         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
145         MMI_LDC1(%[ftmp3], %[addr0], 0x00)
146         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
147         MMI_LDC1(%[ftmp4], %[addr0], 0x00)
148
        /* Split each row into high (ftmp5..8) and low (ftmp1..4) halves,
         * interleaving the bytes with zero. */
149         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
150         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
151         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
152         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
153         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
154         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
155         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
156         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
157
        /* Add the DC value to every halfword lane. */
158         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
159         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
160         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
161         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
162         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
163         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
164         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
165         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
166
        /* Recombine low/high halves into 8-byte rows (packushb is expected
         * to clamp to the unsigned byte range - confirm against the MMI
         * reference). */
167         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
168         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
169         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
170         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
171
        /* Store the four rows back to where they were loaded from. */
172         MMI_SDC1(%[ftmp1], %[dest], 0x00)
173         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
174         MMI_SDC1(%[ftmp2], %[addr0], 0x00)
175         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
176         MMI_SDC1(%[ftmp3], %[addr0], 0x00)
177         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
178         MMI_SDC1(%[ftmp4], %[addr0], 0x00)
179
        /* Advance dest by four rows (addr0 is at row 3) and loop. */
180         "addiu      %[count],   %[count],       -0x01                   \n\t"
181         PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
182         "bnez       %[count],   1b                                      \n\t"
183         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
184           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
185           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
186           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
187           [ftmp8]"=&f"(ftmp[8]),
188           [addr0]"=&r"(addr[0]),
189           [count]"=&r"(count),          [dest]"+&r"(dest)
190         : [linesize]"r"((mips_reg)linesize),
191           [dc]"f"(dc)
192         : "memory"
193     );
194 }
195
196 #if _MIPS_SIM != _ABIO32
/*
 * Full 8x8 inverse transform, performed in place on block.
 *
 * The work is done in two asm passes:
 *   pass 1 ("1st loop") reads block and writes the local buffer temp, using
 *          VC1_INV_TRANCS_8_TYPE1 with shift count 3 and the constant 4
 *          (ff_pw_4_local) added before the shift; results go through
 *          TRANSPOSE_4H before being stored;
 *   pass 2 ("2nd loop") reads temp and writes block, using
 *          VC1_INV_TRANCS_8_TYPE2 with shift count 7, the constant 64
 *          (ff_pw_64_local) added to every result and an extra 1
 *          (ff_pw_1_local) added to the difference outputs; results are
 *          stored without a transpose.
 *
 * Each pass handles the data in two halves: "1st part" loads from byte
 * offsets 0x00, 0x10, ... 0x70 and "2nd part" from 0x08, 0x18, ... 0x78,
 * i.e. the first and second 8 bytes of each 16-byte row of the 64-element
 * int16_t array.
 *
 * block  64 coefficients, read as input and overwritten with the result
 *
 * NOTE(review): this definition sits inside "#if _MIPS_SIM != _ABIO32";
 * the reason is not visible here - confirm before relying on it elsewhere.
 * NOTE(review): the rounding constants are uint64_t objects passed through
 * "f" constraints, so the asm relies on their bit patterns reaching the FP
 * registers unchanged - confirm for the toolchains in use.
 */
197 void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
198 {
199     DECLARE_ALIGNED(16, int16_t, temp[64]);
    /* Each constant holds the same value in both 32-bit halves. */
200     DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
201     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
202     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
203     int16_t *src = block;
204     int16_t *dst = temp;
205     double ftmp[24];
206     uint64_t tmp[1];
207
208     // 1st loop
209     __asm__ volatile (
        /* ftmp0 = shift count 3, ftmp23 = pshufh selector 0x44; both are
         * consumed by the VC1_INV_TRANCS_8_TYPE1 expansions below. */
210         "li         %[tmp0],    0x03                                    \n\t"
211         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
212         "li         %[tmp0],    0x44                                    \n\t"
213         "mtc1       %[tmp0],    %[ftmp23]                               \n\t"
214
215        // 1st part
        /* Rows at 0x00/0x20/0x40/0x60 are interleaved pairwise into
         * ftmp5..ftmp8 (the r1/r2 inputs of the macro). */
216         MMI_LDC1(%[ftmp1], %[src], 0x00)
217         MMI_LDC1(%[ftmp2], %[src], 0x20)
218         MMI_LDC1(%[ftmp3], %[src], 0x40)
219         MMI_LDC1(%[ftmp4], %[src], 0x60)
220         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
221         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
222         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
223         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
224
        /* Rows at 0x10/0x30/0x50/0x70 are interleaved pairwise into
         * ftmp9..ftmp12 (the r3/r4 inputs of the macro). */
225         MMI_LDC1(%[ftmp1], %[src], 0x10)
226         MMI_LDC1(%[ftmp2], %[src], 0x30)
227         MMI_LDC1(%[ftmp3], %[src], 0x50)
228         MMI_LDC1(%[ftmp4], %[src], 0x70)
229         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
230         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
231         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
232         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
233
234         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
235         VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
236                                0x000f0010, 0x00040009, %[ff_pw_4])
237
238         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
239         VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
240                                0xfffc000f, 0xfff7fff0, %[ff_pw_4])
241
242         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
243         VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
244                                0xfff00009, 0x000f0004, %[ff_pw_4])
245
246         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
247         VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
248                                0xfff70004, 0xfff0000f, %[ff_pw_4])
249
        /* Transpose the 4x4 group dst0..dst3 and store it in the first
         * 8 bytes of temp rows 0..3. */
250         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
251                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
252
253         MMI_SDC1(%[ftmp15], %[dst], 0x00)
254         MMI_SDC1(%[ftmp16], %[dst], 0x10)
255         MMI_SDC1(%[ftmp17], %[dst], 0x20)
256         MMI_SDC1(%[ftmp18], %[dst], 0x30)
257
        /* Transpose the 4x4 group dst4..dst7 and store it in the second
         * 8 bytes of temp rows 0..3. */
258         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
259                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
260
261         MMI_SDC1(%[ftmp19], %[dst], 0x08)
262         MMI_SDC1(%[ftmp20], %[dst], 0x18)
263         MMI_SDC1(%[ftmp21], %[dst], 0x28)
264         MMI_SDC1(%[ftmp22], %[dst], 0x38)
265
266        // 2nd part
        /* Same sequence for the second 8 bytes of every source row; the
         * results land in temp rows 4..7 (offsets 0x40..0x78). */
267         MMI_LDC1(%[ftmp1], %[src], 0x08)
268         MMI_LDC1(%[ftmp2], %[src], 0x28)
269         MMI_LDC1(%[ftmp3], %[src], 0x48)
270         MMI_LDC1(%[ftmp4], %[src], 0x68)
271         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
272         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
273         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
274         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
275
276         MMI_LDC1(%[ftmp1], %[src], 0x18)
277         MMI_LDC1(%[ftmp2], %[src], 0x38)
278         MMI_LDC1(%[ftmp3], %[src], 0x58)
279         MMI_LDC1(%[ftmp4], %[src], 0x78)
280         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
281         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
282         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
283         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
284
285         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
286         VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
287                                0x000f0010, 0x00040009, %[ff_pw_4])
288
289         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
290         VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
291                                0xfffc000f, 0xfff7fff0, %[ff_pw_4])
292
293         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
294         VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
295                                0xfff00009, 0x000f0004, %[ff_pw_4])
296
297         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
298         VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
299                                0xfff70004, 0xfff0000f, %[ff_pw_4])
300
301         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
302                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
303
304         MMI_SDC1(%[ftmp15], %[dst], 0x40)
305         MMI_SDC1(%[ftmp16], %[dst], 0x50)
306         MMI_SDC1(%[ftmp17], %[dst], 0x60)
307         MMI_SDC1(%[ftmp18], %[dst], 0x70)
308
309         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
310                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
311
312         MMI_SDC1(%[ftmp19], %[dst], 0x48)
313         MMI_SDC1(%[ftmp20], %[dst], 0x58)
314         MMI_SDC1(%[ftmp21], %[dst], 0x68)
315         MMI_SDC1(%[ftmp22], %[dst], 0x78)
316
317         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
318           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
319           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
320           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
321           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
322           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
323           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
324           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
325           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
326           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
327           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
328           [ftmp22]"=&f"(ftmp[22]),      [ftmp23]"=&f"(ftmp[23]),
329           [tmp0]"=&r"(tmp[0])
330         : [ff_pw_4]"f"(ff_pw_4_local), [src]"r"(src), [dst]"r"(dst)
331         : "memory"
332     );
333
    /* Second pass runs from the intermediate buffer back into block. */
334     src = temp;
335     dst = block;
336
337     // 2nd loop
338     __asm__ volatile (
        /* ftmp0 = shift count 7, ftmp23 = pshufh selector 0x44. */
339         "li         %[tmp0],    0x07                                    \n\t"
340         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
341         "li         %[tmp0],    0x44                                    \n\t"
342         "mtc1       %[tmp0],    %[ftmp23]                               \n\t"
343
344         // 1st part
345         MMI_LDC1(%[ftmp1], %[src], 0x00)
346         MMI_LDC1(%[ftmp2], %[src], 0x20)
347         MMI_LDC1(%[ftmp3], %[src], 0x40)
348         MMI_LDC1(%[ftmp4], %[src], 0x60)
349         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
350         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
351         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
352         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
353
354         MMI_LDC1(%[ftmp1], %[src], 0x10)
355         MMI_LDC1(%[ftmp2], %[src], 0x30)
356         MMI_LDC1(%[ftmp3], %[src], 0x50)
357         MMI_LDC1(%[ftmp4], %[src], 0x70)
358         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
359         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
360         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
361         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
362
363         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
364         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
365                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
366
367         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
368         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
369                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
370
371         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
372         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
373                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
374
375         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
376         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
377                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
378
        /* No transpose in this pass: ftmp15..ftmp22 go straight to the
         * first 8 bytes of output rows 0..7. */
379         MMI_SDC1(%[ftmp15], %[dst], 0x00)
380         MMI_SDC1(%[ftmp16], %[dst], 0x10)
381         MMI_SDC1(%[ftmp17], %[dst], 0x20)
382         MMI_SDC1(%[ftmp18], %[dst], 0x30)
383         MMI_SDC1(%[ftmp19], %[dst], 0x40)
384         MMI_SDC1(%[ftmp20], %[dst], 0x50)
385         MMI_SDC1(%[ftmp21], %[dst], 0x60)
386         MMI_SDC1(%[ftmp22], %[dst], 0x70)
387
388        // 2nd part
389         MMI_LDC1(%[ftmp1], %[src], 0x08)
390         MMI_LDC1(%[ftmp2], %[src], 0x28)
391         MMI_LDC1(%[ftmp3], %[src], 0x48)
392         MMI_LDC1(%[ftmp4], %[src], 0x68)
393         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
394         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
395         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
396         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
397
398         MMI_LDC1(%[ftmp1], %[src], 0x18)
399         MMI_LDC1(%[ftmp2], %[src], 0x38)
400         MMI_LDC1(%[ftmp3], %[src], 0x58)
401         MMI_LDC1(%[ftmp4], %[src], 0x78)
402         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
403         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
404         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
405         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
406
407         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
408         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
409                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
410
411         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
412         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
413                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
414
415         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
416         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
417                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
418
419         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
420         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
421                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
422
        /* Second 8 bytes of output rows 0..7. */
423         MMI_SDC1(%[ftmp15], %[dst], 0x08)
424         MMI_SDC1(%[ftmp16], %[dst], 0x18)
425         MMI_SDC1(%[ftmp17], %[dst], 0x28)
426         MMI_SDC1(%[ftmp18], %[dst], 0x38)
427         MMI_SDC1(%[ftmp19], %[dst], 0x48)
428         MMI_SDC1(%[ftmp20], %[dst], 0x58)
429         MMI_SDC1(%[ftmp21], %[dst], 0x68)
430         MMI_SDC1(%[ftmp22], %[dst], 0x78)
431
432         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
433           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
434           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
435           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
436           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
437           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
438           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
439           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
440           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
441           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
442           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
443           [ftmp22]"=&f"(ftmp[22]),      [ftmp23]"=&f"(ftmp[23]),
444           [tmp0]"=&r"(tmp[0])
445         : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
446           [src]"r"(src), [dst]"r"(dst)
447         : "memory"
448     );
449 }
450 #endif
451
452 /* Do inverse transform on 8x4 part of block (DC-only case).
 *
 * Only block[0] is read. It is scaled in two steps, (3 * dc + 1) >> 1 and
 * then (17 * dc + 64) >> 7, and the resulting value is added to each pixel
 * of the 8-wide, 4-row area at dest. The pixels are modified in place.
 *
 * dest      top-left byte of the pixel area; read and overwritten
 * linesize  byte distance between consecutive rows of dest
 * block     coefficient block; only element 0 is used
 */
453 void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
454 {
455     int dc = block[0];
456     double ftmp[9];
457
458     dc = ( 3 * dc +  1) >> 1;
459     dc = (17 * dc + 64) >> 7;
460
461     __asm__ volatile(
        /* ftmp0 = 0: pshufh selector and zero operand for unpacking. */
462         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        /* Spread dc across the halfword lanes of its register (selector 0;
         * presumably a broadcast of lane 0 - confirm against the MMI
         * reference).
         * NOTE(review): this writes %[dc], which is listed below as an
         * input-only operand; GCC's extended-asm rules say inputs must not
         * be modified - confirm whether this should be an in/out operand. */
463         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
464
        /* Load the four 8-byte rows; row addresses are precomputed in C and
         * passed as dest0..dest3. */
465         MMI_LDC1(%[ftmp1], %[dest0], 0x00)
466         MMI_LDC1(%[ftmp2], %[dest1], 0x00)
467         MMI_LDC1(%[ftmp3], %[dest2], 0x00)
468         MMI_LDC1(%[ftmp4], %[dest3], 0x00)
469
        /* Split each row into high (ftmp5..8) and low (ftmp1..4) halves,
         * interleaving the bytes with zero. */
470         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
471         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
472         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
473         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
474         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
475         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
476         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
477         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
478
        /* Add the DC value to every halfword lane. */
479         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
480         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
481         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
482         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
483         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
484         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
485         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
486         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
487
        /* Recombine low/high halves into 8-byte rows (packushb is expected
         * to clamp to the unsigned byte range - confirm against the MMI
         * reference). */
488         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
489         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
490         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
491         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
492
        /* Store the four rows back to where they were loaded from. */
493         MMI_SDC1(%[ftmp1], %[dest0], 0x00)
494         MMI_SDC1(%[ftmp2], %[dest1], 0x00)
495         MMI_SDC1(%[ftmp3], %[dest2], 0x00)
496         MMI_SDC1(%[ftmp4], %[dest3], 0x00)
497         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
498           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
499           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
500           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
501           [ftmp8]"=&f"(ftmp[8])
502         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
503           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
504           [dc]"f"(dc)
505         : "memory"
506     );
507 }
508
509 #if _MIPS_SIM != _ABIO32
510 void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
511 {
512     int16_t *src = block;
513     int16_t *dst = block;
514     double ftmp[16];
515     uint32_t tmp[1];
516     int16_t count = 4;
517     DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
518     DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
519     int16_t coeff[64] = {12, 16,  16,  15,  12,   9,   6,   4,
520                          12, 15,   6,  -4, -12, -16, -16,  -9,
521                          12,  9,  -6, -16, -12,   4,  16,  15,
522                          12,  4, -16,  -9,  12,  15,  -6, -16,
523                          12, -4, -16,   9,  12, -15,  -6,  16,
524                          12, -9,  -6,  16, -12,  -4,  16, -15,
525                          12, -15,  6,   4, -12,  16, -16,   9,
526                          12, -16, 16, -15,  12,  -9,   6,  -4};
527
528     // 1st loop
529     __asm__ volatile (
530         "li         %[tmp0],    0x03                                    \n\t"
531         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
532
533         "1:                                                             \n\t"
534         MMI_LDC1(%[ftmp1], %[src], 0x00)
535         MMI_LDC1(%[ftmp2], %[src], 0x08)
536
537         /* ftmp11: dst1,dst0 */
538         MMI_LDC1(%[ftmp3], %[coeff], 0x00)
539         MMI_LDC1(%[ftmp4], %[coeff], 0x08)
540         MMI_LDC1(%[ftmp5], %[coeff], 0x10)
541         MMI_LDC1(%[ftmp6], %[coeff], 0x18)
542         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
543         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
544         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
545         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
546         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
547         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
548         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
549         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
550         "paddw      %[ftmp11],  %[ftmp7],   %[ftmp8]                    \n\t"
551         "paddw      %[ftmp11],  %[ftmp11],  %[ff_pw_4]                  \n\t"
552
553         /* ftmp12: dst3,dst2 */
554         MMI_LDC1(%[ftmp3], %[coeff], 0x20)
555         MMI_LDC1(%[ftmp4], %[coeff], 0x28)
556         MMI_LDC1(%[ftmp5], %[coeff], 0x30)
557         MMI_LDC1(%[ftmp6], %[coeff], 0x38)
558         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
559         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
560         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
561         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
562         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
563         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
564         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
565         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
566         "paddw      %[ftmp12],  %[ftmp7],   %[ftmp8]                    \n\t"
567         "paddw      %[ftmp12],  %[ftmp12],  %[ff_pw_4]                  \n\t"
568
569         /* ftmp13: dst5,dst4 */
570         MMI_LDC1(%[ftmp3], %[coeff], 0x40)
571         MMI_LDC1(%[ftmp4], %[coeff], 0x48)
572         MMI_LDC1(%[ftmp5], %[coeff], 0x50)
573         MMI_LDC1(%[ftmp6], %[coeff], 0x58)
574         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
575         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
576         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
577         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
578         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
579         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
580         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
581         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
582         "paddw      %[ftmp13],  %[ftmp7],   %[ftmp8]                    \n\t"
583         "paddw      %[ftmp13],  %[ftmp13],  %[ff_pw_4]                  \n\t"
584
585         /* ftmp14: dst7,dst6 */
586         MMI_LDC1(%[ftmp3], %[coeff], 0x60)
587         MMI_LDC1(%[ftmp4], %[coeff], 0x68)
588         MMI_LDC1(%[ftmp5], %[coeff], 0x70)
589         MMI_LDC1(%[ftmp6], %[coeff], 0x78)
590         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
591         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
592         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
593         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
594         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
595         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
596         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
597         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
598         "paddw      %[ftmp14],  %[ftmp7],   %[ftmp8]                    \n\t"
599         "paddw      %[ftmp14],  %[ftmp14],  %[ff_pw_4]                  \n\t"
600
601         /* ftmp9: dst3,dst2,dst1,dst0    ftmp10: dst7,dst6,dst5,dst4 */
602         "psraw      %[ftmp11],  %[ftmp11],  %[ftmp0]                    \n\t"
603         "psraw      %[ftmp12],  %[ftmp12],  %[ftmp0]                    \n\t"
604         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                    \n\t"
605         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                    \n\t"
606         "punpcklhw  %[ftmp7],   %[ftmp11],  %[ftmp12]                   \n\t"
607         "punpckhhw  %[ftmp8],   %[ftmp11],  %[ftmp12]                   \n\t"
608         "punpcklhw  %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
609         "punpcklhw  %[ftmp7],   %[ftmp13],  %[ftmp14]                   \n\t"
610         "punpckhhw  %[ftmp8],   %[ftmp13],  %[ftmp14]                   \n\t"
611         "punpcklhw  %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
612         MMI_SDC1(%[ftmp9], %[dst], 0x00)
613         MMI_SDC1(%[ftmp10], %[dst], 0x08)
614
615         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
616         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
617         "addiu      %[count],   %[count],   -0x01                       \n\t"
618         "bnez       %[count],   1b                                      \n\t"
619         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
620           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
621           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
622           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
623           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
624           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
625           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
626           [ftmp14]"=&f"(ftmp[14]),      [tmp0]"=&r"(tmp[0]),
627           [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
628         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
629         : "memory"
630     );
631
632     src = block;
633
634     // 2nd loop
635     __asm__ volatile (
636         "li         %[tmp0],    0x44                                    \n\t"
637         "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
638
639         // 1st part
640         "li         %[tmp0],    0x07                                    \n\t"
641         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
642         MMI_LDC1(%[ftmp1], %[src], 0x00)
643         MMI_LDC1(%[ftmp2], %[src], 0x10)
644         MMI_LDC1(%[ftmp3], %[src], 0x20)
645         MMI_LDC1(%[ftmp4], %[src], 0x30)
646         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
647         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
648         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
649         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
650
651         /* ftmp11: dst03,dst02,dst01,dst00 */
652         "li         %[tmp0],    0x00160011                              \n\t"
653         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
654         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
655         "li         %[tmp0],    0x000a0011                              \n\t"
656         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
657         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
658         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
659         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
660         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
661         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
662         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
663         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
664         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
665         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
666         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
667         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
668         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
669         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
670         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
671
672         /* ftmp12: dst13,dst12,dst11,dst10 */
673         "li         %[tmp0],    0x000a0011                              \n\t"
674         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
675         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
676         "li         %[tmp0],    0xffeaffef                              \n\t"
677         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
678         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
679         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
680         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
681         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
682         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
683         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
684         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
685         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
686         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
687         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
688         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
689         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
690         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
691         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
692
693         /* ftmp13: dst23,dst22,dst21,dst20 */
694         "li         %[tmp0],    0xfff60011                              \n\t"
695         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
696         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
697         "li         %[tmp0],    0x0016ffef                              \n\t"
698         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
699         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
700         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
701         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
702         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
703         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
704         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
705         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
706         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
707         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
708         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
709         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
710         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
711         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
712         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
713
714         /* ftmp14: dst33,dst32,dst31,dst30 */
715         "li         %[tmp0],    0xffea0011                              \n\t"
716         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
717         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
718         "li         %[tmp0],    0xfff60011                              \n\t"
719         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
720         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
721         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
722         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
723         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
724         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
725         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
726         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
727         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
728         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
729         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
730         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
731         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
732         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
733         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
734
735         MMI_LWC1(%[ftmp1], %[dest], 0x00)
736         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
737         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
738         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
739         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
740         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
741         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
742         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
743         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
744         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
745         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
746         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
747         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
748         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
749         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
750         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
751         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
752         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
753         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
754         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
755         MMI_SWC1(%[ftmp1], %[dest], 0x00)
756         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
757         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
758         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
759         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
760         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
761         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
762
763         // 2nd part
764         "li         %[tmp0],    0x07                                    \n\t"
765         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
766         MMI_LDC1(%[ftmp1], %[src], 0x08)
767         MMI_LDC1(%[ftmp2], %[src], 0x18)
768         MMI_LDC1(%[ftmp3], %[src], 0x28)
769         MMI_LDC1(%[ftmp4], %[src], 0x38)
770         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
771         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
772         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
773         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
774
775         /* ftmp11: dst03,dst02,dst01,dst00 */
776         "li         %[tmp0],    0x00160011                              \n\t"
777         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
778         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
779         "li         %[tmp0],    0x000a0011                              \n\t"
780         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
781         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
782         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
783         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
784         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
785         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
786         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
787         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
788         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
789         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
790         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
791         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
792         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
793         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
794         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
795
796         /* ftmp12: dst13,dst12,dst11,dst10 */
797         "li         %[tmp0],    0x000a0011                              \n\t"
798         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
799         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
800         "li         %[tmp0],    0xffeaffef                              \n\t"
801         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
802         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
803         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
804         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
805         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
806         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
807         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
808         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
809         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
810         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
811         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
812         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
813         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
814         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
815         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
816
817         /* ftmp13: dst23,dst22,dst21,dst20 */
818         "li         %[tmp0],    0xfff60011                              \n\t"
819         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
820         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
821         "li         %[tmp0],    0x0016ffef                              \n\t"
822         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
823         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
824         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
825         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
826         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
827         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
828         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
829         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
830         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
831         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
832         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
833         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
834         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
835         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
836         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
837
838         /* ftmp14: dst33,dst32,dst31,dst30 */
839         "li         %[tmp0],    0xffea0011                              \n\t"
840         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
841         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
842         "li         %[tmp0],    0xfff60011                              \n\t"
843         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
844         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
845         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
846         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
847         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
848         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
849         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
850         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
851         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
852         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
853         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
854         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
855         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
856         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
857         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
858
859         MMI_LWC1(%[ftmp1], %[dest], 0x04)
860         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
861         MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
862         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
863         MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
864         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
865         MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
866         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
867         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
868         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
869         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
870         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
871         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
872         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
873         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
874         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
875         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
876         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
877         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
878         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
879         MMI_SWC1(%[ftmp1], %[dest], 0x04)
880         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
881         MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
882         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
883         MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
884         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
885         MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
886
887         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
888           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
889           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
890           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
891           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
892           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
893           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
894           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
895           [tmp0]"=&r"(tmp[0])
896         : [ff_pw_64]"f"(ff_pw_64_local),
897           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
898         :"memory"
899     );
900 }
901 #endif
902
903 /* Do inverse transform on 4x8 parts of block */
904 void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
905 {
906     int dc = block[0];
907     double ftmp[9];
908     DECLARE_VAR_LOW32;
909
910     dc = (17 * dc +  4) >> 3;
911     dc = (12 * dc + 64) >> 7;
912
913     __asm__ volatile(
914         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
915         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
916
917         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
918         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
919         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
920         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
921         MMI_LWC1(%[ftmp5], %[dest4], 0x00)
922         MMI_LWC1(%[ftmp6], %[dest5], 0x00)
923         MMI_LWC1(%[ftmp7], %[dest6], 0x00)
924         MMI_LWC1(%[ftmp8], %[dest7], 0x00)
925
926         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
927         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
928         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
929         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
930         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
931         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
932         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
933         "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
934
935         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
936         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
937         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
938         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
939         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
940         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
941         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
942         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
943
944         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
945         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
946         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
947         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
948         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
949         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
950         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
951         "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
952
953         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
954         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
955         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
956         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
957         MMI_SWC1(%[ftmp5], %[dest4], 0x00)
958         MMI_SWC1(%[ftmp6], %[dest5], 0x00)
959         MMI_SWC1(%[ftmp7], %[dest6], 0x00)
960         MMI_SWC1(%[ftmp8], %[dest7], 0x00)
961         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
962           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
963           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
964           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
965           RESTRICT_ASM_LOW32
966           [ftmp8]"=&f"(ftmp[8])
967         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
968           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
969           [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
970           [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
971           [dc]"f"(dc)
972         : "memory"
973     );
974 }
975
976 #if _MIPS_SIM != _ABIO32
977 void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
978 {
979     int16_t *src = block;
980     int16_t *dst = block;
981     double ftmp[24];
982     uint32_t count = 8, tmp[1];
983     int16_t coeff[16] = {17, 22, 17, 10,
984                          17, 10,-17,-22,
985                          17,-10,-17, 22,
986                          17,-22, 17,-10};
987     DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
988     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
989     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
990
991     // 1st loop
992     __asm__ volatile (
993
994         "li         %[tmp0],    0x03                                    \n\t"
995         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
996
997         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
998         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
999         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1000         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1001         "1:                                                             \n\t"
1002         /* ftmp8: dst3,dst2,dst1,dst0 */
1003         MMI_LDC1(%[ftmp1], %[src], 0x00)
1004         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
1005         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
1006         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
1007         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
1008         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
1009         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
1010         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1011         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1012         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
1013         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
1014         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
1015         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
1016         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1017         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1018         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1019         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1020         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
1021         MMI_SDC1(%[ftmp8], %[dst], 0x00)
1022
1023         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
1024         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
1025         "addiu      %[count],   %[count],   -0x01                       \n\t"
1026         "bnez       %[count],   1b                                      \n\t"
1027         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1028           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1029           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1030           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1031           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1032           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1033           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
1034           [src]"+&r"(src),              [dst]"+&r"(dst)
1035         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
1036         : "memory"
1037     );
1038
1039     src = block;
1040
1041     // 2nd loop
1042     __asm__ volatile (
1043         "li         %[tmp0],    0x07                                    \n\t"
1044         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1045         "li         %[tmp0],    0x44                                    \n\t"
1046         "mtc1       %[tmp0],    %[ftmp23]                               \n\t"
1047
1048         MMI_LDC1(%[ftmp1], %[src], 0x00)
1049         MMI_LDC1(%[ftmp2], %[src], 0x20)
1050         MMI_LDC1(%[ftmp3], %[src], 0x40)
1051         MMI_LDC1(%[ftmp4], %[src], 0x60)
1052         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1053         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1054         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1055         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1056
1057         MMI_LDC1(%[ftmp1], %[src], 0x10)
1058         MMI_LDC1(%[ftmp2], %[src], 0x30)
1059         MMI_LDC1(%[ftmp3], %[src], 0x50)
1060         MMI_LDC1(%[ftmp4], %[src], 0x70)
1061         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1062         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1063         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
1064         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
1065
1066         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
1067         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
1068                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
1069
1070         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
1071         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
1072                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
1073
1074         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
1075         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
1076                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
1077
1078         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
1079         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
1080                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
1081
1082         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1083         PTR_ADDU  "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1084         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1085         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1086         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1087         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1088         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1089         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1090         MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
1091         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1092         MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
1093         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1094         MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
1095         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1096         MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
1097         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1098         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1099         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1100         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1101         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1102         "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1103         "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1104         "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1105         "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1106
1107         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]                   \n\t"
1108         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]                   \n\t"
1109         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]                   \n\t"
1110         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]                   \n\t"
1111         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]                   \n\t"
1112         "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]                   \n\t"
1113         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]                   \n\t"
1114         "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]                   \n\t"
1115
1116         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1117         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1118         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1119         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1120         "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1121         "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1122         "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1123         "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1124
1125         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1126         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1127         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1128         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1129         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1130         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1131         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1132         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1133         MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
1134         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1135         MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
1136         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1137         MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
1138         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1139         MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
1140
1141         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1142           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1143           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1144           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1145           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1146           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1147           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1148           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1149           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
1150           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
1151           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
1152           [ftmp22]"=&f"(ftmp[22]),      [ftmp23]"=&f"(ftmp[23]),
1153           [tmp0]"=&r"(tmp[0])
1154         : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
1155           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1156         : "memory"
1157     );
1158 }
1159 #endif
1160
1161 /**
 * DC-only inverse transform on the 4x4 part of a block.
 *
 * block[0] is scaled in two steps, (17 * dc + 4) >> 3 followed by
 * (17 * dc + 64) >> 7, and the result is added to four rows of pixels at
 * dest, dest + linesize, dest + 2 * linesize and dest + 3 * linesize.
 * Each row is accessed through MMI_LWC1 / MMI_SWC1 (word-sized load/store).
 *
 * @param dest     first pixel of the area that is updated in place
 * @param linesize distance in bytes between consecutive rows of dest
 * @param block    coefficient block; only block[0] is read
 */
1162 void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1163 {
1164     int dc = block[0];
1165     double ftmp[5];
1166     DECLARE_VAR_LOW32;
1167
    /* Two-stage scaling of the DC coefficient (factor 17, rounding 4 then 64). */
1168     dc = (17 * dc +  4) >> 3;
1169     dc = (17 * dc + 64) >> 7;
1170
1171     __asm__ volatile(
        /* ftmp0 = 0: used below as shuffle selector and as the second
         * operand of every unpack/pack instruction. */
1172         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        /* Presumably replicates the low halfword of dc into all four
         * halfword lanes (selector 0) -- confirm against the MMI manual.
         * NOTE(review): %[dc] is written here although it is declared as an
         * input operand in the constraint list below. */
1173         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
1174
        /* Load one word from each of the four destination rows. */
1175         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
1176         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
1177         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
1178         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
1179
        /* Interleave the pixel bytes with the zero register (byte -> halfword). */
1180         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1181         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1182         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1183         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1184
        /* Add the DC value to every row (paddsh: signed-saturating halfword add). */
1185         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
1186         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
1187         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
1188         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
1189
        /* Pack the halfwords back to bytes (packushb: unsigned-saturating pack). */
1190         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1191         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1192         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1193         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1194
        /* Write the four rows back to the destination. */
1195         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
1196         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
1197         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
1198         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
1199         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1200           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1201           RESTRICT_ASM_LOW32
1202           [ftmp4]"=&f"(ftmp[4])
1203         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
1204           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
1205           [dc]"f"(dc)
1206         : "memory"
1207     );
1208 }
1209
/**
 * Inverse transform on the 4x4 part of a block; the result is added to dest.
 *
 * First asm block ("1st loop"): runs 4 iterations, advancing src and dst by
 * 0x10 bytes each time. Every iteration loads four int16_t values from src,
 * forms four multiply-add sums against the four rows of coeff[], adds 4,
 * shifts right by 3 and stores the four results to dst. src and dst both
 * start at block, so the inputs are overwritten in place.
 *
 * Second asm block ("2nd loop"): loads the four rows written above (offsets
 * 0x00, 0x10, 0x20, 0x30), interleaves them, multiplies by constant pairs
 * built from 17, 22, 10 and their negations, adds 64, shifts right by 7,
 * adds the four result rows to four rows of pixels at dest (rows linesize
 * bytes apart) and stores them back after packushb.
 *
 * @param dest     first pixel of the area that is updated in place
 * @param linesize distance in bytes between consecutive rows of dest
 * @param block    coefficient block; modified by the first pass
 */
1210 void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1211 {
1212     int16_t *src = block;
1213     int16_t *dst = block;
1214     double ftmp[16];
1215     uint32_t count = 4, tmp[1];
    /* Row-pass coefficients, one group of four per output value.
     * NOTE(review): read below with MMI_LDC1 (doubleword loads, presumably);
     * the array carries no explicit alignment -- confirm that is acceptable. */
1216     int16_t coeff[16] = {17, 22, 17, 10,
1217                          17, 10,-17,-22,
1218                          17,-10,-17, 22,
1219                          17,-22, 17,-10};
    /* Rounding constants: the value 4 (resp. 64) in both 32-bit halves. */
1220     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
1221     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
1222     // 1st loop
1223     __asm__ volatile (
1224
        /* ftmp0 = 3: shift count for the psraw instructions of this pass. */
1225         "li         %[tmp0],    0x03                                    \n\t"
1226         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        /* ftmp2..ftmp5 = the four coefficient rows, loaded once before the loop. */
1227         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
1228         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
1229         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1230         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1231         "1:                                                             \n\t"
1232         /* ftmp8: dst3,dst2,dst1,dst0 */
1233         MMI_LDC1(%[ftmp1], %[src], 0x00)
1234         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
1235         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
1236         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
1237         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
1238         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
1239         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
1240         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1241         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1242         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
1243         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
        /* Round: add 4, then shift right by ftmp0 (3). */
1244         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
1245         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
1246         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1247         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1248         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1249         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1250         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
1251         MMI_SDC1(%[ftmp8], %[dst], 0x00)
1252
        /* Advance both pointers by 0x10 bytes and loop while count != 0. */
1253         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
1254         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
1255         "addiu      %[count],   %[count],   -0x01                       \n\t"
1256         "bnez       %[count],   1b                                      \n\t"
1257         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1258           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1259           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1260           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1261           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1262           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1263           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
1264           [src]"+&r"(src),              [dst]"+&r"(dst)
1265         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
1266         : "memory"
1267     );
1268
    /* The first asm block advanced src; rewind it for the second pass. */
1269     src = block;
1270
1271     // 2nd loop
1272     __asm__ volatile (
        /* ftmp0 = 7: shift count for this pass; ftmp15 = 0x44: selector used
         * by every pshufh below. */
1273         "li         %[tmp0],    0x07                                    \n\t"
1274         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1275         "li         %[tmp0],    0x44                                    \n\t"
1276         "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
1277
        /* Load the four rows produced by the first pass and interleave them
         * pairwise into ftmp5..ftmp8. */
1278         MMI_LDC1(%[ftmp1], %[src], 0x00)
1279         MMI_LDC1(%[ftmp2], %[src], 0x10)
1280         MMI_LDC1(%[ftmp3], %[src], 0x20)
1281         MMI_LDC1(%[ftmp4], %[src], 0x30)
1282         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1283         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1284         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1285         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1286
1287         /* ftmp11: dst03,dst02,dst01,dst00 */
        /* Constant halfword pairs: (17, 22) and (17, 10). */
1288         "li         %[tmp0],    0x00160011                              \n\t"
1289         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1290         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1291         "li         %[tmp0],    0x000a0011                              \n\t"
1292         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1293         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1294         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1295         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1296         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1297         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1298         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1299         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1300         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1301         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1302         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1303         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1304         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1305         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1306         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
1307
1308         /* ftmp12: dst13,dst12,dst11,dst10 */
        /* Constant halfword pairs: (17, 10) and (-17, -22). */
1309         "li         %[tmp0],    0x000a0011                              \n\t"
1310         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1311         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1312         "li         %[tmp0],    0xffeaffef                              \n\t"
1313         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1314         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1315         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1316         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1317         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1318         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1319         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1320         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1321         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1322         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1323         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1324         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1325         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1326         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1327         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
1328
1329         /* ftmp13: dst23,dst22,dst21,dst20 */
        /* Constant halfword pairs: (17, -10) and (-17, 22). */
1330         "li         %[tmp0],    0xfff60011                              \n\t"
1331         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1332         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1333         "li         %[tmp0],    0x0016ffef                              \n\t"
1334         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1335         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1336         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1337         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1338         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1339         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1340         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1341         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1342         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1343         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1344         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1345         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1346         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1347         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1348         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
1349
1350         /* ftmp14: dst33,dst32,dst31,dst30 */
        /* Constant halfword pairs: (17, -22) and (17, -10). */
1351         "li         %[tmp0],    0xffea0011                              \n\t"
1352         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1353         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1354         "li         %[tmp0],    0xfff60011                              \n\t"
1355         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1356         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1357         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1358         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1359         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1360         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1361         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1362         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1363         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1364         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1365         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1366         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1367         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1368         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1369         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
1370
        /* Load four destination rows; tmp0 walks down by linesize per row. */
1371         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1372         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1373         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1374         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1375         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1376         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1377         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        /* ftmp0 is no longer needed as shift count; it becomes the zero
         * register for the unpack/pack steps. */
1378         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1379         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1380         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1381         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1382         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
        /* Add the transformed rows to the pixels, then pack back to bytes. */
1383         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
1384         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
1385         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
1386         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
1387         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1388         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1389         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1390         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1391
        /* Store the four rows back to dest. */
1392         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1393         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1394         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1395         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1396         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1397         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1398         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1399
1400         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1401           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1402           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1403           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1404           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1405           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1406           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1407           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1408           [tmp0]"=&r"(tmp[0])
1409         : [ff_pw_64]"f"(ff_pw_64_local),
1410           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1411         :"memory"
1412     );
1413 }
1414
/**
 * Apply overlap transform to horizontal edge.
 *
 * Eight rows are processed. In each row the four horizontally adjacent
 * pixels src[-2], src[-1], src[0] and src[1] are updated in place. The
 * rounding term is 1 for the first row and flips between 1 and 0 afterwards.
 *
 * @param src    pixel at index 0 of the first row; the two pixels before it
 *               and the one after it are accessed as well
 * @param stride offset from one row to the next
 */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
{
    int row;
    int rnd = 1;

    for (row = 0; row < 8; row++) {
        const int p0 = src[-2];
        const int p1 = src[-1];
        const int p2 = src[0];
        const int p3 = src[1];
        /* Correction for the outer pair and for the inner pair. */
        const int outer = (p0 - p3 + 3 + rnd) >> 3;
        const int inner = (p0 - p3 + p1 - p2 + 4 - rnd) >> 3;

        src[-2] = p0 - outer;
        src[-1] = av_clip_uint8(p1 - inner);
        src[0]  = av_clip_uint8(p2 + inner);
        src[1]  = p3 + outer;

        src += stride;
        rnd ^= 1;
    }
}
1438
/**
 * Apply the overlap transform across two int16_t blocks, horizontally.
 *
 * Eight rows are processed. In each row left[6], left[7], right[0] and
 * right[1] are updated in place.
 *
 * @param left         first row of the left block
 * @param right        first row of the right block
 * @param left_stride  distance, in int16_t elements, between rows of left
 * @param right_stride distance, in int16_t elements, between rows of right
 * @param flags        bit value 2 selects the initial rounding pair
 *                     (rnd1 = 3, rnd2 = 4 instead of rnd1 = 4, rnd2 = 3);
 *                     bit value 1 makes both rounding terms flip after
 *                     every row
 */
void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = flags & 2 ? 3 : 4;
    int rnd2 = 7 - rnd1;

    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        /* Scale by multiplying rather than with "<< 3": the inputs are
         * signed and left-shifting a negative value is undefined in C. */
        left[6]  = (a * 8 - d1 + rnd1) >> 3;
        left[7]  = (b * 8 - d2 + rnd2) >> 3;
        right[0] = (c * 8 + d2 + rnd1) >> 3;
        right[1] = (d * 8 + d1 + rnd2) >> 3;

        right += right_stride;
        left  += left_stride;
        if (flags & 1) {
            rnd2   = 7 - rnd2;
            rnd1   = 7 - rnd1;
        }
    }
}
1467
/**
 * Apply overlap transform to vertical edge.
 *
 * Eight columns are processed. In each column the four vertically adjacent
 * pixels at offsets -2 * stride, -stride, 0 and stride from src are updated
 * in place. The rounding term is 1 for the first column and flips between
 * 1 and 0 afterwards.
 *
 * @param src    pixel at offset 0 of the first column
 * @param stride offset from one row to the next
 */
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
{
    int col;
    int rnd = 1;

    for (col = 0; col < 8; col++, src++) {
        const int p0 = src[-2 * stride];
        const int p1 = src[-stride];
        const int p2 = src[0];
        const int p3 = src[stride];
        /* Correction for the outer pair and for the inner pair. */
        const int outer = (p0 - p3 + 3 + rnd) >> 3;
        const int inner = (p0 - p3 + p1 - p2 + 4 - rnd) >> 3;

        src[-2 * stride] = p0 - outer;
        src[-stride]     = av_clip_uint8(p1 - inner);
        src[0]           = av_clip_uint8(p2 + inner);
        src[stride]      = p3 + outer;

        rnd ^= 1;
    }
}
1491
/**
 * Apply the overlap transform across two int16_t blocks, vertically.
 *
 * Eight columns are processed. In each column top[48], top[56], bottom[0]
 * and bottom[8] are updated in place. The rounding terms start at
 * rnd1 = 4, rnd2 = 3 and flip after every column.
 *
 * @param top    upper block; elements 48..63 are accessed
 * @param bottom lower block; elements 0..15 are accessed
 */
void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;

    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        /* Scale by multiplying rather than with "<< 3": the inputs are
         * signed and left-shifting a negative value is undefined in C. */
        top[48]   = (a * 8 - d1 + rnd1) >> 3;
        top[56]   = (b * 8 - d2 + rnd2) >> 3;
        bottom[0] = (c * 8 + d2 + rnd1) >> 3;
        bottom[8] = (d * 8 + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}
1517
1518 /**
1519  * VC-1 in-loop deblocking filter for one line
1520  * @param src source block type
1521  * @param stride block stride
1522  * @param pq block quantizer
1523  * @return whether other 3 pairs should be filtered or not
1524  * @see 8.6
1525  */
1526 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1527 {
1528     int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1529               5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1530     int a0_sign = a0 >> 31;        /* Store sign */
1531
1532     a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1533     if (a0 < pq) {
1534         int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1535                         5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1536         int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1537                         5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1538         if (a1 < a0 || a2 < a0) {
1539             int clip      = src[-1 * stride] - src[0 * stride];
1540             int clip_sign = clip >> 31;
1541
1542             clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1543             if (clip) {
1544                 int a3     = FFMIN(a1, a2);
1545                 int d      = 5 * (a3 - a0);
1546                 int d_sign = (d >> 31);
1547
1548                 d       = ((d ^ d_sign) - d_sign) >> 3;
1549                 d_sign ^= a0_sign;
1550
1551                 if (d_sign ^ clip_sign)
1552                     d = 0;
1553                 else {
1554                     d = FFMIN(d, clip);
1555                     d = (d ^ d_sign) - d_sign; /* Restore sign */
1556                     src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1557                     src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1558                 }
1559                 return 1;
1560             }
1561         }
1562     }
1563     return 0;
1564 }
1565
/**
 * VC-1 in-loop deblocking filter.
 *
 * Lines are handled in groups of four. The third line of a group is filtered
 * first; the remaining three are filtered only if vc1_filter_line() returned
 * non-zero for it.
 *
 * @param src    first line of the edge
 * @param step   distance between consecutive lines
 * @param stride distance between the samples of one line
 * @param len    edge length to filter (4 or 8 pixels)
 * @param pq     block quantizer
 * @see 8.6
 */
static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
                                   int len, int pq)
{
    int remaining;

    for (remaining = len; remaining > 0; remaining -= 4, src += 4 * step) {
        if (!vc1_filter_line(src + 2 * step, stride, pq))
            continue;

        vc1_filter_line(src + 0 * step, stride, pq);
        vc1_filter_line(src + 1 * step, stride, pq);
        vc1_filter_line(src + 3 * step, stride, pq);
    }
}
1591
/**
 * Loop filter over 4 lines: consecutive lines are 1 byte apart and the
 * samples of each line are stride bytes apart.
 */
void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
{
    const int step = 1;
    const int len  = 4;

    vc1_loop_filter(src, step, stride, len, pq);
}
1596
/**
 * Loop filter over 4 lines: consecutive lines are stride bytes apart and
 * the samples of each line are 1 byte apart.
 */
void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
{
    const int sample_dist = 1;
    const int len         = 4;

    vc1_loop_filter(src, stride, sample_dist, len, pq);
}
1601
/**
 * Loop filter over 8 lines: consecutive lines are 1 byte apart and the
 * samples of each line are stride bytes apart.
 */
void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
{
    const int step = 1;
    const int len  = 8;

    vc1_loop_filter(src, step, stride, len, pq);
}
1606
/**
 * Loop filter over 8 lines: consecutive lines are stride bytes apart and
 * the samples of each line are 1 byte apart.
 */
void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
{
    const int sample_dist = 1;
    const int len         = 8;

    vc1_loop_filter(src, stride, sample_dist, len, pq);
}
1611
/**
 * Loop filter over 16 lines: consecutive lines are 1 byte apart and the
 * samples of each line are stride bytes apart.
 */
void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
{
    const int step = 1;
    const int len  = 16;

    vc1_loop_filter(src, step, stride, len, pq);
}
1616
/**
 * Loop filter over 16 lines: consecutive lines are stride bytes apart and
 * the samples of each line are 1 byte apart.
 */
void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
{
    const int sample_dist = 1;
    const int len         = 16;

    vc1_loop_filter(src, stride, sample_dist, len, pq);
}
1621
/**
 * mc00 "put" case: forwards to ff_put_pixels8_8_mmi() with a final
 * argument of 8. The rnd parameter is not used.
 */
void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    const int n = 8;

    ff_put_pixels8_8_mmi(dst, src, stride, n);
}
/**
 * mc00 "put" case, 16 variant: forwards to ff_put_pixels16_8_mmi() with a
 * final argument of 16. The rnd parameter is not used.
 */
void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    const int n = 16;

    ff_put_pixels16_8_mmi(dst, src, stride, n);
}
/**
 * mc00 "avg" case: forwards to ff_avg_pixels8_8_mmi() with a final
 * argument of 8. The rnd parameter is not used.
 */
void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    const int n = 8;

    ff_avg_pixels8_8_mmi(dst, src, stride, n);
}
/**
 * mc00 "avg" case, 16 variant: forwards to ff_avg_pixels16_8_mmi() with a
 * final argument of 16. The rnd parameter is not used.
 */
void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    const int n = 16;

    ff_avg_pixels16_8_mmi(dst, src, stride, n);
}
1642
1643 #define OP_PUT(S, D)
1644 #define OP_AVG(S, D)                                                        \
1645     "ldc1       $f16,   "#S"                        \n\t"                   \
1646     "pavgb      "#D",   "#D",   $f16                \n\t"
1647
1648 /** Add rounder from $f14 to $f6 and pack result at destination */
1649 #define NORMALIZE_MMI(SHIFT)                                                \
1650     "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
1651     "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
1652     "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
1653     "psrah      $f8,    $f8,    "SHIFT"             \n\t"
1654
1655 #define TRANSFER_DO_PACK(OP)                                                \
1656     "packushb   $f6,    $f6,    $f8                 \n\t"                   \
1657     OP((%[dst]), $f6)                                                       \
1658     "sdc1       $f6,    0x00(%[dst])                \n\t"
1659
1660 #define TRANSFER_DONT_PACK(OP)                                              \
1661      OP(0(%[dst]), $f6)                                                     \
1662      OP(8(%[dst]), $f8)                                                     \
1663      "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
1664      "sdc1      $f8,    0x08(%[dst])                \n\t"
1665
1666 /**
      * UNPACK arguments for MSPEL_FILTER13_CORE.
      * DO_UNPACK interleaves the low bytes of reg with $f0 (punpcklbh); the
      * users in this file clear $f0 first, so this widens the bytes to
      * 16 bits. DONT_UNPACK expands to nothing, for data that is already
      * held as 16-bit values.
      *
      * @see MSPEL_FILTER13_CORE for use as UNPACK macro
      */
1667 #define DO_UNPACK(reg)                                                      \
1668     "punpcklbh  "reg",  "reg",  $f0                 \n\t"
1669 #define DONT_UNPACK(reg)
1670
1671 /**
      * Load the 32-bit rounder from memory operand ROUND into $f14 and
      * replicate its low 16 bits into all four halfword lanes. The rounder
      * value itself (e.g. 32-r or 8-r) is computed by the caller beforehand.
      */
1672 #define LOAD_ROUNDER_MMI(ROUND)                                             \
1673     "lwc1       $f14,   "ROUND"                     \n\t"                   \
1674     "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
1675     "punpcklwd  $f14,   $f14,   $f14                \n\t"
1676
1677
     /**
      * Emit one output row of the vertical 1/2-shift filter (-1, 9, 9, -1):
      *     R1 = (9 * (R1 + R2) - R0 - R3 + rounder) >> %[shift]
      * and store the resulting 16-bit lanes at byte offset OFF from %[dst].
      *
      * On entry R1 and R2 hold the two centre rows, already widened to
      * 16 bits. R0 and R3 are loaded with MMI_ULWC1 from %[src] + %[stride1]
      * and %[src] + %[stride] and widened against $f0, which must be zero.
      * $f12 must hold the *9 multiplier (ff_pw_9) and $f14 the rounder; it
      * must not be one of R0..R3, which all carry pixel data. $9 is used as
      * scratch. %[src] is advanced by %[stride] at the end, so the next
      * invocation can rotate the register arguments and reuse R2/R3 as its
      * centre rows.
      */
1678 #define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
1679     "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
1680     PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
1681     MMI_ULWC1(R0, $9, 0x00)                                                 \
1682     "pmullh     "#R1",      "#R1",  $f12            \n\t" /* *9 */          \
1683     "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
1684     PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
1685     MMI_ULWC1(R3, $9, 0x00)                                                 \
1686     "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
1687     "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
1688     "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
1689     "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
1690     "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
1691     MMI_SDC1(R1, %[dst], OFF)                                               \
1692     PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
1693
1694 /**
      * Vertical 1/2-shift filter writing 16-bit intermediate results; it is
      * the vmode==2 entry of the vertical 16-bit filter table used by the
      * two-pass (vertical then horizontal) interpolation.
      *
      * The loop runs 3 times. Each pass loads two rows of 4 source bytes,
      * then issues eight SHIFT2_LINE invocations whose stores are 24 bytes
      * apart in dst. Afterwards dst moves on by 8 bytes and src moves back
      * by 9*stride-4 bytes for the next pass.
      *
      * Sacrificing $f12 makes it possible to pipeline loads from src
      * ($f12 is loaded with ff_pw_9 before the loop).
      *
      * @param dst     Destination for the 16-bit results.
      * @param src     Source pixels.
      * @param stride  Source stride in bytes.
      * @param rnd     Rounder, broadcast into $f14 by LOAD_ROUNDER_MMI.
      * @param shift   Right shift applied to each result (%[shift]).
      */
1695 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1696                                        const uint8_t *src, mips_reg stride,
1697                                        int rnd, int64_t shift)
1698 {
1699     DECLARE_VAR_LOW32;
1700     DECLARE_VAR_ADDRT;
1701
1702     __asm__ volatile(
1703         "xor        $f0,    $f0,    $f0             \n\t"
1704         "li         $8,     0x03                    \n\t"
1705         LOAD_ROUNDER_MMI("%[rnd]")
1706         "ldc1       $f12,   %[ff_pw_9]              \n\t"
1707         "1:                                         \n\t"
1708         MMI_ULWC1($f4, %[src], 0x00)
1709         PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1710         MMI_ULWC1($f6, %[src], 0x00)
1711         "punpcklbh  $f4,    $f4,    $f0             \n\t"
1712         "punpcklbh  $f6,    $f6,    $f0             \n\t"
             /* The four data registers rotate by one position per row. */
1713         SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
1714         SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1715         SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1716         SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1717         SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1718         SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1719         SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1720         SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1721         PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
1722         PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
1723         "addiu      $8,     $8,    -0x01            \n\t"
1724         "bnez       $8,     1b                      \n\t"
1725         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1726           [src]"+r"(src),               [dst]"+r"(dst)
1727         : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1728           [shift]"f"(shift),            [rnd]"m"(rnd),
1729           [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
1730         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1731           "$f14", "$f16", "memory"
1732     );
1733 }
1734
1735 /**
1736  * Data is already unpacked, so some operations can directly be made from
1737  * memory.
1738  */
1739 #define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1740 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1741                                              const int16_t *src, int rnd)   \
1742 {                                                                           \
1743     int h = 8;                                                              \
1744     DECLARE_VAR_ALL64;                                                      \
1745     DECLARE_VAR_ADDRT;                                                      \
1746                                                                             \
1747     src -= 1;                                                               \
1748     rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1749                                                                             \
1750     __asm__ volatile(                                                       \
1751         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1752         "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
1753         "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
1754         "1:                                         \n\t"                   \
1755         MMI_ULDC1($f2, %[src], 0x00)                                        \
1756         MMI_ULDC1($f4, %[src], 0x08)                                        \
1757         MMI_ULDC1($f6, %[src], 0x02)                                        \
1758         MMI_ULDC1($f8, %[src], 0x0a)                                        \
1759         MMI_ULDC1($f0, %[src], 0x06)                                        \
1760         "paddh      $f2,    $f2,    $f0             \n\t"                   \
1761         MMI_ULDC1($f0, %[src], 0x0e)                                        \
1762         "paddh      $f4,    $f4,    $f0             \n\t"                   \
1763         MMI_ULDC1($f0, %[src], 0x04)                                        \
1764         "paddh      $f6,    $f6,    $f0             \n\t"                   \
1765         MMI_ULDC1($f0, %[src], 0x0b)                                        \
1766         "paddh      $f8,    $f8,    $f0             \n\t"                   \
1767         "pmullh     $f6,    $f6,    $f10            \n\t"                   \
1768         "pmullh     $f8,    $f8,    $f10            \n\t"                   \
1769         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1770         "psubh      $f8,    $f8,    $f4             \n\t"                   \
1771         "li         $8,     0x07                    \n\t"                   \
1772         "mtc1       $8,     $f16                    \n\t"                   \
1773         NORMALIZE_MMI("$f16")                                               \
1774         /* Remove bias */                                                   \
1775         "paddh      $f6,    $f6,    $f12            \n\t"                   \
1776         "paddh      $f8,    $f8,    $f12            \n\t"                   \
1777         TRANSFER_DO_PACK(OP)                                                \
1778         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1779         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1780         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1781         "bnez       %[h],   1b                      \n\t"                   \
1782         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1783           [h]"+r"(h),                                                       \
1784           [src]"+r"(src),               [dst]"+r"(dst)                      \
1785         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1786           [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
1787         : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
1788           "$f16", "memory"                                                  \
1789     );                                                                      \
1790 }
1791
1792 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1793 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1794
1795 /**
1796  * Purely vertical or horizontal 1/2 shift interpolation.
1797  * Sacrifices $f12 for the *9 factor.
      *
      * Generates OPNAME##vc1_shift2_mmi(), which filters 8 rows of 8 pixels
      * with taps (-1, 9, 9, -1) taken at src-offset, src, src+offset and
      * src+2*offset. The rounder becomes 8-rnd and the result is shifted
      * right by 4 before being packed and stored through OP.
      * offset is the byte distance between taps (presumably the stride for
      * the vertical case and 1 for the horizontal one; confirm in caller).
      *
      * @param OP      OP_PUT or OP_AVG, applied when storing to dst.
      * @param OPNAME  Prefix of the generated function name (put_ or avg_).
1798  */
1799 #define VC1_SHIFT2(OP, OPNAME)\
1800 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1801                                      mips_reg stride, int rnd,              \
1802                                      mips_reg offset)                       \
1803 {                                                                           \
1804     DECLARE_VAR_LOW32;                                                      \
1805     DECLARE_VAR_ADDRT;                                                      \
1806                                                                             \
1807     rnd = 8 - rnd;                                                          \
1808                                                                             \
1809     __asm__ volatile(                                                       \
1810         "xor        $f0,    $f0,    $f0             \n\t"                   \
1811         "li         $10,    0x08                    \n\t"                   \
1812         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1813         "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
1814         "1:                                         \n\t"                   \
1815         MMI_ULWC1($f6, %[src], 0x00)                                        \
1816         MMI_ULWC1($f8, %[src], 0x04)                                        \
1817         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1818         MMI_ULWC1($f2, $9, 0x00)                                            \
1819         MMI_ULWC1($f4, $9, 0x04)                                            \
             /* src now points one tap further; later taps are relative to it */ \
1820         PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
1821         "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1822         "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1823         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1824         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1825         "paddh      $f6,    $f6,    $f2             \n\t"                   \
1826         "paddh      $f8,    $f8,    $f4             \n\t"                   \
1827         PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
1828         MMI_ULWC1($f2, $9, 0x00)                                            \
1829         MMI_ULWC1($f4, $9, 0x04)                                            \
1830         "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
1831         "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
1832         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1833         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1834         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1835         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1836         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1837         MMI_ULWC1($f2, $9, 0x00)                                            \
1838         MMI_ULWC1($f4, $9, 0x04)                                            \
1839         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1840         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1841         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1842         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1843         "li         $8,     0x04                    \n\t"                   \
1844         "mtc1       $8,     $f16                    \n\t"                   \
1845         NORMALIZE_MMI("$f16")                                               \
1846         "packushb   $f6,    $f6,    $f8             \n\t"                   \
1847         OP((%[dst]), $f6)                                                   \
1848         "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1849         "addiu      $10,    $10,   -0x01            \n\t"                   \
             /* stride1 = stride-offset: net advance of src per row is stride */ \
1850         PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
1851         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1852         "bnez       $10,    1b                      \n\t"                   \
1853         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1854           [src]"+r"(src),               [dst]"+r"(dst)                      \
1855         : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1856           [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1857           [stride1]"r"(stride-offset),                                      \
1858           [ff_pw_9]"m"(ff_pw_9)                                             \
1859         : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1860           "$f12", "$f14", "$f16", "memory"                                  \
1861     );                                                                      \
1862 }
1863
1864 VC1_SHIFT2(OP_PUT, put_)
1865 VC1_SHIFT2(OP_AVG, avg_)
1866
1867 /**
1868  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1869  *
      * Computes, for eight lanes, -3*[A1] + 18*[A2] + 53*[A3] - 4*[A4] and
      * leaves the 16-bit results in $f6 (first four lanes) and $f8 (next
      * four). Rounding and shifting are not done here.
      *
      * Expects $f10 to hold the *53 factor, $f12 the *18 factor and the asm
      * operand %[ff_pw_3] the *3 factor; when UNPACK is DO_UNPACK, $f0 is
      * used as the interleave partner. Uses $8, $9, $f2, $f4 and $f16 as
      * scratch.
      *
1870  * @param UNPACK  Macro unpacking arguments from 8 to 16bits (can be empty).
1871  * @param LOAD    "MMI_ULWC1" for packed 8-bit data, or "MMI_ULDC1" if the
      *                data read is already unpacked.
1872  * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1873  * @param A1      Stride address of 1st tap (beware of unpacked/packed).
1874  * @param A2      Stride address of 2nd tap
1875  * @param A3      Stride address of 3rd tap
1876  * @param A4      Stride address of 4th tap
1877  */
1878 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1879     PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1880     LOAD($f2, $9, M*0)                                                      \
1881     LOAD($f4, $9, M*4)                                                      \
1882     UNPACK("$f2")                                                           \
1883     UNPACK("$f4")                                                           \
1884     "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
1885     "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
1886     PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1887     LOAD($f6, $9, M*0)                                                      \
1888     LOAD($f8, $9, M*4)                                                      \
1889     UNPACK("$f6")                                                           \
1890     UNPACK("$f8")                                                           \
1891     "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
1892     "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
1893     "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1894     "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1895     PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1896     LOAD($f2, $9, M*0)                                                      \
1897     LOAD($f4, $9, M*4)                                                      \
1898     UNPACK("$f2")                                                           \
1899     UNPACK("$f4")                                                           \
1900     "li         $8,     0x02                    \n\t"                       \
1901     "mtc1       $8,     $f16                    \n\t"                       \
1902     "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1903     "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1904     "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1905     "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1906     PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1907     LOAD($f2, $9, M*0)                                                      \
1908     LOAD($f4, $9, M*4)                                                      \
1909     UNPACK("$f2")                                                           \
1910     UNPACK("$f4")                                                           \
1911     "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
1912     "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
1913     "paddh      $f6,    $f6,    $f2             \n\t" /* -4,53,18,-3 */     \
1914     "paddh      $f8,    $f8,    $f4             \n\t" /* -4,53,18,-3 */
1915
1916 /**
1917  * Macro to build the vertical 16bits version of vc1_put_shift[13].
1918  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1919  * %3 (src_stride), %4 (2*src_stride) and %5 (3*src_stride).
1920  *
1921  * @param  NAME   Either 1 or 3
1922  * @see MSPEL_FILTER13_CORE for information on A1->A4
1923  */
1924 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
1925 static void                                                                 \
1926 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
1927                                  mips_reg src_stride,                       \
1928                                  int rnd, int64_t shift)                    \
1929 {                                                                           \
1930     int h = 8;                                                              \
1931     DECLARE_VAR_LOW32;                                                      \
1932     DECLARE_VAR_ADDRT;                                                      \
1933                                                                             \
1934     src -= src_stride;                                                      \
1935                                                                             \
1936     __asm__ volatile(                                                       \
1937         "xor        $f0,    $f0,    $f0             \n\t"                   \
1938         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1939         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1940         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1941         ".p2align 3                                 \n\t"                   \
1942         "1:                                         \n\t"                   \
1943         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1944         NORMALIZE_MMI("%[shift]")                                           \
1945         TRANSFER_DONT_PACK(OP_PUT)                                          \
1946         /* Last 3 (in fact 4) bytes on the line */                          \
1947         PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
1948         MMI_ULWC1($f2, $9, 0x08)                                            \
1949         DO_UNPACK("$f2")                                                    \
1950         "mov.d      $f6,    $f2                     \n\t"                   \
1951         "paddh      $f2,    $f2,    $f2             \n\t"                   \
1952         "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
1953         PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
1954         MMI_ULWC1($f6, $9, 0x08)                                            \
1955         DO_UNPACK("$f6")                                                    \
1956         "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
1957         "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
1958         PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
1959         MMI_ULWC1($f2, $9, 0x08)                                            \
1960         DO_UNPACK("$f2")                                                    \
1961         "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
1962         "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
1963         PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
1964         MMI_ULWC1($f2, $9, 0x08)                                            \
1965         DO_UNPACK("$f2")                                                    \
1966         "li         $8,     0x02                    \n\t"                   \
1967         "mtc1       $8,     $f16                    \n\t"                   \
1968         "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
1969         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1970         "paddh      $f6,    $f6,    $f14            \n\t"                   \
1971         "li         $8,     0x06                    \n\t"                   \
1972         "mtc1       $8,     $f16                    \n\t"                   \
1973         "psrah      $f6,    $f6,    $f16            \n\t"                   \
1974         "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
1975         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1976         PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
1977         PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
1978         "bnez       %[h],   1b                      \n\t"                   \
1979         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1980           [h]"+r"(h),                                                       \
1981           [src]"+r"(src),               [dst]"+r"(dst)                      \
1982         : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
1983           [stride_x3]"r"(3*src_stride),                                     \
1984           [rnd]"m"(rnd),                [shift]"f"(shift),                  \
1985           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1986           [ff_pw_3]"f"(ff_pw_3)                                             \
1987         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1988           "$f14", "$f16", "memory"                                          \
1989     );                                                                      \
1990 }
1991
1992 /**
1993  * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1994  * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
1995  *
1996  * @param  NAME   Either 1 or 3
1997  * @see MSPEL_FILTER13_CORE for information on A1->A4
1998  */
1999 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
2000 static void                                                                 \
2001 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
2002                                        const int16_t *src, int rnd)         \
2003 {                                                                           \
2004     int h = 8;                                                              \
2005     DECLARE_VAR_ALL64;                                                      \
2006     DECLARE_VAR_ADDRT;                                                      \
2007                                                                             \
2008     src -= 1;                                                               \
2009     rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
2010                                                                             \
2011     __asm__ volatile(                                                       \
2012         "xor        $f0,    $f0,    $f0             \n\t"                   \
2013         LOAD_ROUNDER_MMI("%[rnd]")                                          \
2014         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
2015         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
2016         ".p2align 3                                 \n\t"                   \
2017         "1:                                         \n\t"                   \
2018         MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
2019         "li         $8,     0x07                    \n\t"                   \
2020         "mtc1       $8,     $f16                    \n\t"                   \
2021         NORMALIZE_MMI("$f16")                                               \
2022         /* Remove bias */                                                   \
2023         "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
2024         "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
2025         TRANSFER_DO_PACK(OP)                                                \
2026         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
2027         PTR_ADDU   "%[src], %[src], 0x18            \n\t"                   \
2028         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
2029         "bnez       %[h],   1b                      \n\t"                   \
2030         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
2031           [h]"+r"(h),                                                       \
2032           [src]"+r"(src),               [dst]"+r"(dst)                      \
2033         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
2034           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
2035           [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
2036         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
2037           "$f14", "$f16", "memory"                                          \
2038     );                                                                      \
2039 }
2040
2041 /**
2042  * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
2043  * Here, offset is the byte distance between taps. Parameters passed A1 to
2044  * A4 must use $0 (no offset), %[offset_x1] (offset), %[offset_x2]
      * (2*offset) and %[offset_x3] (3*offset).
      *
      * The generated function steps src back by offset, uses 32-rnd as the
      * rounder, shifts the core result right by 6 and stores 8 rows of 8
      * packed bytes through OP; src and dst both advance by stride per row.
2045  *
2046  * @param  NAME   Either shift1 or shift3
      * @param  OP     OP_PUT or OP_AVG, applied when storing to dst
      * @param  OPNAME Prefix of the generated function name (put_ or avg_)
2047  * @see MSPEL_FILTER13_CORE for information on A1->A4
2048  */
2049 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
2050 static void                                                                 \
2051 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
2052                               mips_reg stride, int rnd, mips_reg offset)    \
2053 {                                                                           \
2054     int h = 8;                                                              \
2055     DECLARE_VAR_LOW32;                                                      \
2056     DECLARE_VAR_ADDRT;                                                      \
2057                                                                             \
2058     src -= offset;                                                          \
2059     rnd = 32-rnd;                                                           \
2060                                                                             \
2061     __asm__ volatile (                                                      \
2062         "xor        $f0,    $f0,    $f0             \n\t"                   \
2063         LOAD_ROUNDER_MMI("%[rnd]")                                          \
2064         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
2065         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
2066         ".p2align 3                                 \n\t"                   \
2067         "1:                                         \n\t"                   \
2068         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
2069         "li         $8,     0x06                    \n\t"                   \
2070         "mtc1       $8,     $f16                    \n\t"                   \
2071         NORMALIZE_MMI("$f16")                                               \
2072         TRANSFER_DO_PACK(OP)                                                \
2073         "addiu      %[h],   %[h],      -0x01        \n\t"                   \
2074         PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
2075         PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
2076         "bnez       %[h],   1b                      \n\t"                   \
2077         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
2078           [h]"+r"(h),                                                       \
2079           [src]"+r"(src),               [dst]"+r"(dst)                      \
2080         : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
2081           [offset_x3]"r"(3*offset),     [stride]"r"(stride),                \
2082           [rnd]"m"(rnd),                                                    \
2083           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
2084           [ff_pw_3]"f"(ff_pw_3)                                             \
2085         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
2086           "$f14", "$f16", "memory"                                          \
2087     );                                                                      \
2088 }
2089
2090
2091 /**
      * 1/4 shift bicubic interpolation.
      * A1..A4 are passed in descending order, so the core weights
      * (-3, 18, 53, -4) land on the taps at 3, 2, 1 and 0 offsets from the
      * adjusted src (byte offsets 6, 4, 2, 0 for the 16-bit horizontal pass).
      */
2092 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2093 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2094 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2095 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2096 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2097
2098 /**
      * 3/4 shift bicubic interpolation.
      * Same instantiations with A1..A4 in ascending order, so the core
      * weights (-3, 18, 53, -4) land on the taps at 0, 1, 2 and 3 offsets.
      */
2099 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2100 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2101 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2102 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2103 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2104
     /**
      * Function-pointer types for the three filter families defined above:
      * - ver_16bits: vertical pass, 8-bit source to 16-bit intermediate, with
      *   an explicit rounder and shift;
      * - hor_16bits: horizontal pass, 16-bit intermediate to 8-bit dst;
      * - 8bits: single-direction filter working directly on 8-bit pixels,
      *   with offset giving the byte distance between taps.
      */
2105 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2106              (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2107               int64_t shift);
2108 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2109              (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2110 typedef void (*vc1_mspel_mc_filter_8bits)
2111              (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2112               mips_reg offset);
2113
/**
 * Interpolate fractional pel values by applying proper vertical then
 * horizontal filter.
 *
 * Expands to two static functions:
 *  - OP ## vc1_mspel_mc     handles one 8-pixel-wide, 8-line block;
 *  - OP ## vc1_mspel_mc_16  handles a 16x16 block as four 8x8 blocks.
 *
 * @param  dst     Destination buffer for interpolated pels.
 * @param  src     Source buffer.
 * @param  stride  Stride for both src and dst buffers.
 * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
 * @param  vmode   Vertical filter (same encoding as hmode).
 * @param  rnd     Rounding bias.
 *
 * hmode and vmode index four-entry tables whose entry 0 is NULL, so both
 * must lie in 0..3 and at least one of them must be non-zero.
 */
#define VC1_MSPEL_MC(OP)                                                    \
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src,            \
                               ptrdiff_t stride, int hmode, int vmode,      \
                               int rnd)                                     \
{                                                                           \
    /* The vertical pass only produces intermediates: always "put". */      \
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmi,                                \
                 vc1_put_ver_16b_shift2_mmi,                                \
                 vc1_put_ver_16b_shift3_mmi };                              \
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
                 OP ## vc1_hor_16b_shift2_mmi,                              \
                 OP ## vc1_hor_16b_shift3_mmi };                            \
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
         { NULL, OP ## vc1_shift1_mmi,                                      \
                 OP ## vc1_shift2_mmi,                                      \
                 OP ## vc1_shift3_mmi };                                    \
                                                                            \
    /* Index 0 of every table is NULL; (0, 0) would call a null pointer. */ \
    av_assert2(hmode >= 0 && hmode <= 3 &&                                  \
               vmode >= 0 && vmode <= 3 && (hmode || vmode));               \
                                                                            \
    if (vmode) { /* Vertical filter to apply */                             \
        if (hmode) { /* Horizontal filter to apply, output to tmp */        \
            static const int shift_value[] = { 0, 5, 1, 5 };                \
            int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
            int    r;                                                       \
            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
                                                                            \
            r = (1<<(shift-1)) + rnd-1;                                     \
            /* Start one column to the left; the horizontal pass then   */  \
            /* reads the intermediates starting at tmp+1.               */  \
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
                                                                            \
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
            return;                                                         \
        }                                                                   \
        else { /* No horizontal filter, output 8 lines to dst */            \
            /* Taps are one line apart: offset = stride. */                 \
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
            return;                                                         \
        }                                                                   \
    }                                                                       \
                                                                            \
    /* Horizontal mode with no vertical mode: taps are one byte apart */    \
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
}                                                                           \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
                                  ptrdiff_t stride, int hmode, int vmode,   \
                                  int rnd)                                  \
{                                                                           \
    /* Top-left and top-right 8x8 quadrants. */                             \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
    /* Move down eight lines for the bottom quadrants. */                   \
    dst += 8*stride; src += 8*stride;                                       \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
}
2173
/*
 * Instantiate the static put_vc1_mspel_mc / put_vc1_mspel_mc_16 and
 * avg_vc1_mspel_mc / avg_vc1_mspel_mc_16 dispatchers.
 */
2174 VC1_MSPEL_MC(put_)
2175 VC1_MSPEL_MC(avg_)
2176
/**
 * Define one exported motion-compensation entry point.
 *
 * @param name   Full name of the exported function.
 * @param impl   Static dispatcher to forward to.
 * @param hmode  Horizontal filter index passed to impl.
 * @param vmode  Vertical filter index passed to impl.
 */
#define VC1_MSPEL_MC_WRAPPER(name, impl, hmode, vmode)                      \
void name(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)      \
{                                                                           \
    impl(dst, src, stride, hmode, vmode, rnd);                              \
}

/**
 * Macro to ease bicubic filter interpolation functions declarations.
 *
 * For a given (a, b) pair this defines the four exported functions
 * ff_{put,avg}_vc1_mspel_mc<a><b>_mmi and
 * ff_{put,avg}_vc1_mspel_mc<a><b>_16_mmi, each forwarding to the matching
 * static dispatcher with a as the horizontal and b as the vertical mode.
 */
#define DECLARE_FUNCTION(a, b)                                              \
VC1_MSPEL_MC_WRAPPER(ff_put_vc1_mspel_mc ## a ## b ## _mmi,                 \
                     put_vc1_mspel_mc, a, b)                                \
VC1_MSPEL_MC_WRAPPER(ff_avg_vc1_mspel_mc ## a ## b ## _mmi,                 \
                     avg_vc1_mspel_mc, a, b)                                \
VC1_MSPEL_MC_WRAPPER(ff_put_vc1_mspel_mc ## a ## b ## _16_mmi,              \
                     put_vc1_mspel_mc_16, a, b)                             \
VC1_MSPEL_MC_WRAPPER(ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi,              \
                     avg_vc1_mspel_mc_16, a, b)
2207
/*
 * Emit the exported wrappers for every (horizontal, vertical) quarter-pel
 * mode pair except (0, 0), which has no filter to apply.
 */
2208 DECLARE_FUNCTION(0, 1)
2209 DECLARE_FUNCTION(0, 2)
2210 DECLARE_FUNCTION(0, 3)
2211
2212 DECLARE_FUNCTION(1, 0)
2213 DECLARE_FUNCTION(1, 1)
2214 DECLARE_FUNCTION(1, 2)
2215 DECLARE_FUNCTION(1, 3)
2216
2217 DECLARE_FUNCTION(2, 0)
2218 DECLARE_FUNCTION(2, 1)
2219 DECLARE_FUNCTION(2, 2)
2220 DECLARE_FUNCTION(2, 3)
2221
2222 DECLARE_FUNCTION(3, 0)
2223 DECLARE_FUNCTION(3, 1)
2224 DECLARE_FUNCTION(3, 2)
2225 DECLARE_FUNCTION(3, 3)
2226
/*
 * Asm fragment: weighted sum of four 8-byte pixel vectors.
 *
 * Expected on entry (set up by the enclosing asm statement):
 *   ftmp1..ftmp4  the four 8-byte source vectors
 *   ftmp0         all zero (used to widen bytes to halfwords)
 *   A, B, C, D    the weight for ftmp1..ftmp4, replicated in every halfword
 *   ftmp9         shift count for the final psrlh
 *   ff_pw_28      bias added before the shift (presumably 28 in every
 *                 halfword -- defined outside this file section)
 *
 * The low and high four bytes of each vector are widened separately
 * (low halves stay in ftmp1..ftmp4, high halves go to ftmp5..ftmp8),
 * multiplied by their weight, summed, biased and shifted.  The two
 * halfword results are then packed back to eight bytes with unsigned
 * saturation.
 *
 * Result is left in ftmp1; ftmp2..ftmp8 are clobbered.
 */
2227 #define CHROMA_MC_8_MMI                                                     \
2228         "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
2229         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2230         "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
2231         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2232         "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
2233         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2234         "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
2235         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2236                                                                             \
2237         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2238         "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
2239         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2240         "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
2241         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2242         "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
2243         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2244         "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
2245                                                                             \
2246         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2247         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2248         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2249         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2250                                                                             \
2251         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
2252         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
2253         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
2254         "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
2255                                                                             \
2256         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
2257         "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
2258         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
2259
2260
/*
 * Asm fragment: weighted sum of four 4-byte pixel vectors.
 *
 * Expected on entry (set up by the enclosing asm statement):
 *   ftmp1..ftmp4  the four source vectors (low four bytes are used)
 *   ftmp0         all zero (used to widen bytes to halfwords and to fill
 *                 the upper half of the packed result)
 *   A, B, C, D    the weight for ftmp1..ftmp4, replicated in every halfword
 *   ftmp5         shift count for the final psrlh
 *   ff_pw_28      bias added before the shift (presumably 28 in every
 *                 halfword -- defined outside this file section)
 *
 * Only the low four bytes of each vector are widened, weighted, summed,
 * biased and shifted; the result is packed to bytes with unsigned
 * saturation and left in the low four bytes of ftmp1.
 * ftmp2..ftmp4 are clobbered.
 */
2261 #define CHROMA_MC_4_MMI                                                     \
2262         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2263         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2264         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2265         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2266                                                                             \
2267         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2268         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2269         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2270         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2271                                                                             \
2272         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2273         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2274         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2275         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2276                                                                             \
2277         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
2278         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
2279
2280
2281 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2282                                       uint8_t *src /* align 1 */,
2283                                       int stride, int h, int x, int y)
2284 {
2285     const int A = (8 - x) * (8 - y);
2286     const int B =     (x) * (8 - y);
2287     const int C = (8 - x) *     (y);
2288     const int D =     (x) *     (y);
2289     double ftmp[10];
2290     uint32_t tmp[1];
2291     DECLARE_VAR_ALL64;
2292     DECLARE_VAR_ADDRT;
2293
2294     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2295
2296     __asm__ volatile(
2297         "li         %[tmp0],    0x06                                    \n\t"
2298         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2299         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
2300         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2301         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2302         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2303         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2304
2305         "1:                                                             \n\t"
2306         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2307         MMI_ULDC1(%[ftmp2], %[src], 0x01)
2308         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2309         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2310         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2311
2312         CHROMA_MC_8_MMI
2313
2314         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2315         "addiu      %[h],       %[h],      -0x01                        \n\t"
2316         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2317         "bnez       %[h],       1b                                      \n\t"
2318         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2319           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2320           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2321           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2322           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2323           RESTRICT_ASM_ALL64
2324           RESTRICT_ASM_ADDRT
2325           [tmp0]"=&r"(tmp[0]),
2326           [src]"+&r"(src),              [dst]"+&r"(dst),
2327           [h]"+&r"(h)
2328         : [stride]"r"((mips_reg)stride),
2329           [A]"f"(A),                    [B]"f"(B),
2330           [C]"f"(C),                    [D]"f"(D),
2331           [ff_pw_28]"f"(ff_pw_28)
2332         : "memory"
2333     );
2334 }
2335
2336 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2337                                       uint8_t *src /* align 1 */,
2338                                       int stride, int h, int x, int y)
2339 {
2340     const int A = (8 - x) * (8 - y);
2341     const int B =     (x) * (8 - y);
2342     const int C = (8 - x) *     (y);
2343     const int D =     (x) *     (y);
2344     double ftmp[6];
2345     uint32_t tmp[1];
2346     DECLARE_VAR_LOW32;
2347     DECLARE_VAR_ADDRT;
2348
2349     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2350
2351     __asm__ volatile(
2352         "li         %[tmp0],    0x06                                    \n\t"
2353         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2354         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2355         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2356         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2357         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2358         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2359
2360         "1:                                                             \n\t"
2361         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2362         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2363         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2364         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2365         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2366
2367         CHROMA_MC_4_MMI
2368
2369         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2370         "addiu      %[h],       %[h],      -0x01                        \n\t"
2371         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2372         "bnez       %[h],       1b                                      \n\t"
2373         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2374           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2375           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2376           [tmp0]"=&r"(tmp[0]),
2377           RESTRICT_ASM_LOW32
2378           RESTRICT_ASM_ADDRT
2379           [src]"+&r"(src),              [dst]"+&r"(dst),
2380           [h]"+&r"(h)
2381         : [stride]"r"((mips_reg)stride),
2382           [A]"f"(A),                    [B]"f"(B),
2383           [C]"f"(C),                    [D]"f"(D),
2384           [ff_pw_28]"f"(ff_pw_28)
2385         : "memory"
2386     );
2387 }
2388
2389 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2390                                       uint8_t *src /* align 1 */,
2391                                       int stride, int h, int x, int y)
2392 {
2393     const int A = (8 - x) * (8 - y);
2394     const int B =     (x) * (8 - y);
2395     const int C = (8 - x) *     (y);
2396     const int D =     (x) *     (y);
2397     double ftmp[10];
2398     uint32_t tmp[1];
2399     DECLARE_VAR_ALL64;
2400     DECLARE_VAR_ADDRT;
2401
2402     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2403
2404     __asm__ volatile(
2405         "li         %[tmp0],    0x06                                    \n\t"
2406         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2407         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
2408         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2409         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2410         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2411         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2412
2413         "1:                                                             \n\t"
2414         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2415         MMI_ULDC1(%[ftmp2], %[src], 0x01)
2416         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2417         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2418         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2419
2420         CHROMA_MC_8_MMI
2421
2422         MMI_LDC1(%[ftmp2], %[dst], 0x00)
2423         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2424
2425         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2426         "addiu      %[h],       %[h],      -0x01                        \n\t"
2427         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2428         "bnez       %[h],       1b                                      \n\t"
2429         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2430           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2431           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2432           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2433           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2434           [tmp0]"=&r"(tmp[0]),
2435           RESTRICT_ASM_ALL64
2436           RESTRICT_ASM_ADDRT
2437           [src]"+&r"(src),              [dst]"+&r"(dst),
2438           [h]"+&r"(h)
2439         : [stride]"r"((mips_reg)stride),
2440           [A]"f"(A),                    [B]"f"(B),
2441           [C]"f"(C),                    [D]"f"(D),
2442           [ff_pw_28]"f"(ff_pw_28)
2443         : "memory"
2444     );
2445 }
2446
2447 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2448                                       uint8_t *src /* align 1 */,
2449                                       int stride, int h, int x, int y)
2450 {
2451     const int A = (8 - x) * (8 - y);
2452     const int B = (    x) * (8 - y);
2453     const int C = (8 - x) * (    y);
2454     const int D = (    x) * (    y);
2455     double ftmp[6];
2456     uint32_t tmp[1];
2457     DECLARE_VAR_LOW32;
2458     DECLARE_VAR_ADDRT;
2459
2460     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2461
2462     __asm__ volatile(
2463         "li         %[tmp0],    0x06                                    \n\t"
2464         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2465         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2466         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2467         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2468         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2469         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2470
2471         "1:                                                             \n\t"
2472         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2473         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2474         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2475         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2476         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2477
2478         CHROMA_MC_4_MMI
2479
2480         MMI_LWC1(%[ftmp2], %[dst], 0x00)
2481         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2482
2483         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2484         "addiu      %[h],       %[h],      -0x01                        \n\t"
2485         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2486         "bnez       %[h],       1b                                      \n\t"
2487         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2488           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2489           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2490           [tmp0]"=&r"(tmp[0]),
2491           RESTRICT_ASM_LOW32
2492           RESTRICT_ASM_ADDRT
2493           [src]"+&r"(src),              [dst]"+&r"(dst),
2494           [h]"+&r"(h)
2495         : [stride]"r"((mips_reg)stride),
2496           [A]"f"(A),                    [B]"f"(B),
2497           [C]"f"(C),                    [D]"f"(D),
2498           [ff_pw_28]"f"(ff_pw_28)
2499         : "memory"
2500     );
2501 }