/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"


#define VC1_INV_TRANCS_8_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   o1,    o2,    o3,    o4,                 \
                                   t1,    t2,    t3,    t4,                 \
                                   ff_p1, ff_p2, ff_p3, ff_p4)              \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p4"                \n\t"   \
        "paddh      "#o1"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o1"   ,   "#o1"   ,   "#t3"                   \n\t"   \
        "paddh      "#o1"   ,   "#o1"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "psubh      "#o2"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "psubh      "#o2"   ,   "#o2"   ,   "#t3"                   \n\t"   \
        "psubh      "#o2"   ,   "#o2"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "psubh      "#o3"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o3"   ,   "#o3"   ,   "#t3"                   \n\t"   \
        "paddh      "#o3"   ,   "#o3"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p1"                \n\t"   \
        "psubh      "#o4"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o4"   ,   "#o4"   ,   "#t3"                   \n\t"   \
        "psubh      "#o4"   ,   "#o4"   ,   "#t4"                   \n\t"
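
/*
 * For reference, a scalar sketch of what STEP1 computes in each 16-bit
 * lane, assuming the coefficient order the callers below pass
 * (ff_p1..ff_p4 = 16, 15, 9, 4) and fp1..fp4 holding rows 1, 3, 5, 7:
 *
 *     o1 = p1*fp1 + p2*fp2 + p3*fp3 + p4*fp4;
 *     o2 = p2*fp1 - p4*fp2 - p1*fp3 - p3*fp4;
 *     o3 = p3*fp1 - p1*fp2 + p4*fp3 + p2*fp4;
 *     o4 = p4*fp1 - p3*fp2 + p2*fp3 - p1*fp4;
 *
 * i.e. the odd half (t1..t4) of the 8-point inverse transform as written
 * in the generic vc1_inv_trans_8x8_c().
 */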


#define VC1_INV_TRANCS_8_STEP2_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   fp5,   fp6,   fp7,   fp8,                \
                                   o1,    o2,    o3,    o4,                 \
                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
                                                                            \
        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
        "paddh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
                                                                            \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#o1"                   \n\t"   \
        "paddh      "#fp6"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
        "paddh      "#fp7"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
        "paddh      "#fp8"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
                                                                            \
        "psubh      "#fp4"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
        "psubh      "#fp3"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
        "psubh      "#fp2"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
        "psubh      "#fp1"  ,   "#fp1"  ,   "#o1"                   \n\t"
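
/*
 * Scalar sketch of STEP2, under the same assumptions: fp1..fp4 hold the
 * even rows (0, 4, 2, 6), o1..o4 the odd-half sums from STEP1, and
 * (ff_p1, ff_p2, ff_p3, ff_pw) = (12, 16, 6, rounder):
 *
 *     t1 = p1*(fp1 + fp2) + pw;    t2 = p1*(fp1 - fp2) + pw;
 *     t3 = p2*fp3 + p3*fp4;        t4 = p3*fp3 - p2*fp4;
 *
 *     fp5..fp8 = (t1+t3)+o1, (t2+t4)+o2, (t2-t4)+o3, (t1-t3)+o4;
 *     fp1..fp4 = (t1+t3)-o1, (t2+t4)-o2, (t2-t4)-o3, (t1-t3)-o4;
 *
 * which, after the arithmetic shift the callers apply with PSRAH_8_MMI,
 * gives the eight outputs of one 8-point inverse transform pass.
 */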


#define VC1_INV_TRANCS_4_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   fp5,   fp6,   fp7,   fp8,                \
                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
                                                                            \
        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
        "psubh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "paddh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"
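
/*
 * Scalar sketch of the 4-point pass, assuming (ff_p1, ff_p2, ff_p3,
 * ff_pw) = (17, 10, 22, rounder) as passed by the callers and fp1..fp4
 * holding the inputs in the order s0, s2, s3, s1:
 *
 *     t1 = 17*(fp1 + fp2) + pw;    t2 = 17*(fp1 - fp2) + pw;
 *     t3 = 10*fp3 + 22*fp4;        t4 = 22*fp3 - 10*fp4;
 *
 *     fp1..fp4 = t1+t3, t2-t4, t2+t4, t1-t3;
 *
 * i.e. t1..t4 of the generic vc1_inv_trans_4x4_c() (22*s1 + 10*s3 etc.),
 * with the final shift left to PSRAH_4_MMI.
 */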


#define VC1_INV_TRANCS_4_STEP2_MMI(fp1, fp2, fp3, fp4,                      \
                                   fp5, fp6, fp7, fp8, zero)                \
        "punpcklbh  "#fp5"  ,   "#fp5"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp6"  ,   "#fp6"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp7"  ,   "#fp7"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp8"  ,   "#fp8"  ,   "#zero"                 \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp1"  ,   "#fp5"                  \n\t"   \
        "paddh      "#fp2"  ,   "#fp2"  ,   "#fp6"                  \n\t"   \
        "paddh      "#fp3"  ,   "#fp3"  ,   "#fp7"                  \n\t"   \
        "paddh      "#fp4"  ,   "#fp4"  ,   "#fp8"                  \n\t"   \
                                                                            \
        "packushb   "#fp1"  ,   "#fp1"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp2"  ,   "#fp2"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp3"  ,   "#fp3"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp4"  ,   "#fp4"  ,   "#zero"                 \n\t"
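
/*
 * STEP2 of the 4-point transform is the store-side half: fp5..fp8 arrive
 * as four destination rows of packed bytes, are widened to 16 bits
 * against the zero register, the transformed residual in fp1..fp4 is
 * added, and the sums are packed back to bytes with unsigned saturation.
 */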


/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

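    /* Fold the row and column DC scaling of the 8-point transform into dc:
     * (3 * dc + 1) >> 1 == (12 * dc + 4) >> 3 (row pass) and
     * (3 * dc + 16) >> 5 == (12 * dc + 64) >> 7 (column pass + rounding). */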
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu      %[count],   %[count],       -0x01                   \n\t"
        PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count),          [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    int16_t *src = block;
    int16_t *dst = temp;
    double ftmp[16];
    uint32_t count, tmp[1];

    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_4])


        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        TRANSPOSE_4H(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp5], %[dst], 0x00)
        MMI_SDC1(%[ftmp6], %[dst], 0x10)
        MMI_SDC1(%[ftmp7], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        TRANSPOSE_4H(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDC1(%[ftmp3], %[dst], 0x18)
        MMI_SDC1(%[ftmp2], %[dst], 0x28)
        MMI_SDC1(%[ftmp1], %[dst], 0x38)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
        : "memory"
    );

    src = temp;
    dst = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_64])

        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        MMI_SDC1(%[ftmp5], %[dst], 0x00)
        MMI_SDC1(%[ftmp6], %[dst], 0x10)
        MMI_SDC1(%[ftmp7], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        MMI_SDC1(%[ftmp4], %[dst], 0x40)
        MMI_SDC1(%[ftmp3], %[dst], 0x50)
        MMI_SDC1(%[ftmp2], %[dst], 0x60)
        MMI_SDC1(%[ftmp1], %[dst], 0x70)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x08                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

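    /* 8-point row scale (12, reduced to 3 with a matching shift), then the
     * 17x column scale of the 4-point transform. */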
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

    // 1st loop
    __asm__ volatile (
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)
        MMI_LDC1(%[ftmp3], %[src], 0x10)
        MMI_LDC1(%[ftmp4], %[src], 0x18)
        MMI_LDC1(%[ftmp5], %[src], 0x20)
        MMI_LDC1(%[ftmp6], %[src], 0x28)
        MMI_LDC1(%[ftmp7], %[src], 0x30)
        MMI_LDC1(%[ftmp8], %[src], 0x38)

        //             a1        b1        a3        b2
        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp5], %[ftmp7],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        //             a2        b3        a4        b4
        TRANSPOSE_4H(%[ftmp2], %[ftmp4], %[ftmp6], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        // input b1 b2 b3 b4
        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0], %[ftmp13], %[ftmp14], %[ftmp15],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])
        // input a1 a2 a3 a4
        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp5], %[ftmp6],
                                   %[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_4])

        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        PSRAH_8_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                    %[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1], %[ftmp0])

        TRANSPOSE_4H(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp3], %[dst], 0x00)
        MMI_SDC1(%[ftmp7], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        TRANSPOSE_4H(%[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp6], %[dst], 0x08)
        MMI_SDC1(%[ftmp5], %[dst], 0x18)
        MMI_SDC1(%[ftmp2], %[dst], 0x28)
        MMI_SDC1(%[ftmp1], %[dst], 0x38)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [src]"r"(src),                [dst]"r"(dst),
          [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"

        // dest low 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x30)
        MMI_LDC1(%[ftmp4], %[src], 0x10)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])

        MMI_LWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x00)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp0])

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)

        // dest high 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x28)
        MMI_LDC1(%[ftmp3], %[src], 0x38)
        MMI_LDC1(%[ftmp4], %[src], 0x18)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])

        MMI_LWC1(%[ftmp5], %[dest], 0x04)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x04)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp0])

        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0])
        : [src]"r"(src),                [dest]"r"(dest),
          [linesize]"r"((mips_reg)linesize),
          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x8 part of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

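    /* 17x row scale of the 4-point transform, then the 8-point column
     * scale (12 * dc + 64) >> 7. */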
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
        "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count, tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

    // 1st loop
    __asm__ volatile (
        "li         %[count],   0x02                                    \n\t"
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        //                              t1        t2        t3        t4
        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_4])

        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])

        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        MMI_SDC1(%[ftmp3], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp2], %[dst], 0x30)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x40                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_64])

        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"

        // dest low
        MMI_LWC1(%[ftmp9], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp12], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0])

        // dest high
        MMI_LWC1(%[ftmp9], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp12], %[addr0], 0x00)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0])

        // dest low
        MMI_SWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp8], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"

        // dest high
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp1], %[addr0], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),
          [dest]"+&r"(dest)
        : [src]"r"(src),                [linesize]"r"(linesize),
          [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    DECLARE_VAR_LOW32;

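    /* Both passes are 4-point here, so the 17x scale is applied twice. */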
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[12];
    uint32_t tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        //                              t1        t2        t3        t4
        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_4])

        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])

        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        MMI_SDC1(%[ftmp3], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp2], %[dst], 0x30)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        // dest low 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x30)
        MMI_LDC1(%[ftmp4], %[src], 0x10)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp0])

        MMI_LWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x00)

        "xor        %[ftmp9],   %[ftmp9],  %[ftmp9]                     \n\t"

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9])

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0])
        : [src]"r"(src),                [dest]"r"(dest),
          [linesize]"r"((mips_reg)linesize),
          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}

/* Apply overlap transform to horizontal edge */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2];
        b  = src[-1];
        c  = src[0];
        d  = src[1];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2] = a - d1;
        src[-1] = av_clip_uint8(b - d2);
        src[0]  = av_clip_uint8(c + d2);
        src[1]  = d + d1;
        src    += stride;
        rnd     = !rnd;
    }
}

void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = flags & 2 ? 3 : 4;
    int rnd2 = 7 - rnd1;
    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
        right[0] = ((c << 3) + d2 + rnd1) >> 3;
        right[1] = ((d << 3) + d1 + rnd2) >> 3;

        right += right_stride;
        left  += left_stride;
        if (flags & 1) {
            rnd2   = 7 - rnd2;
            rnd1   = 7 - rnd1;
        }
    }
}

/* Apply overlap transform to vertical edge */
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2 * stride];
        b  = src[-stride];
        c  = src[0];
        d  = src[stride];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2 * stride] = a - d1;
        src[-stride]     = av_clip_uint8(b - d2);
        src[0]           = av_clip_uint8(c + d2);
        src[stride]      = d + d1;
        src++;
        rnd = !rnd;
    }
}

void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        top[48]   = ((a << 3) - d1 + rnd1) >> 3;
        top[56]   = ((b << 3) - d2 + rnd2) >> 3;
        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}

/**
 * VC-1 in-loop deblocking filter for one line
 * @param src pointer to the line to filter
 * @param stride block stride
 * @param pq block quantizer
 * @return whether the other 3 pairs should be filtered or not
 * @see 8.6
 */
static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
{
    int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
              5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
    int a0_sign = a0 >> 31;        /* Store sign */

    a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
    if (a0 < pq) {
        int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
                        5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
        int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
                        5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
        if (a1 < a0 || a2 < a0) {
            int clip      = src[-1 * stride] - src[0 * stride];
            int clip_sign = clip >> 31;

            clip = ((clip ^ clip_sign) - clip_sign) >> 1;
            if (clip) {
                int a3     = FFMIN(a1, a2);
                int d      = 5 * (a3 - a0);
                int d_sign = (d >> 31);

                d       = ((d ^ d_sign) - d_sign) >> 3;
                d_sign ^= a0_sign;

                if (d_sign ^ clip_sign)
                    d = 0;
                else {
                    d = FFMIN(d, clip);
                    d = (d ^ d_sign) - d_sign; /* Restore sign */
                    src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
                    src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
                }
                return 1;
            }
        }
    }
    return 0;
}

1149 /**
1150  * VC-1 in-loop deblocking filter
1151  * @param src pointer to the pixels at the edge being filtered
1152  * @param step distance between horizontally adjacent elements
1153  * @param stride distance between vertically adjacent elements
1154  * @param len edge length to filter (4, 8 or 16 pixels)
1155  * @param pq block quantizer
1156  * @see 8.6
1157  */
1157  */
1158 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1159                                    int len, int pq)
1160 {
1161     int i;
1162     int filt3;
1163
1164     for (i = 0; i < len; i += 4) {
1165         filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1166         if (filt3) {
1167             vc1_filter_line(src + 0 * step, stride, pq);
1168             vc1_filter_line(src + 1 * step, stride, pq);
1169             vc1_filter_line(src + 3 * step, stride, pq);
1170         }
1171         src += step * 4;
1172     }
1173 }
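
/*
 * Per 8.6, the filter decision is taken on the 3rd pixel pair of each
 * 4-pixel segment; the remaining three pairs are filtered only when that
 * pair was.
 */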
1174
1175 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1176 {
1177     vc1_loop_filter(src, 1, stride, 4, pq);
1178 }
1179
1180 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1181 {
1182     vc1_loop_filter(src, stride, 1, 4, pq);
1183 }
1184
1185 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1186 {
1187     vc1_loop_filter(src, 1, stride, 8, pq);
1188 }
1189
1190 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1191 {
1192     vc1_loop_filter(src, stride, 1, 8, pq);
1193 }
1194
1195 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1196 {
1197     vc1_loop_filter(src, 1, stride, 16, pq);
1198 }
1199
1200 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1201 {
1202     vc1_loop_filter(src, stride, 1, 16, pq);
1203 }
1204
1205 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1206                                ptrdiff_t stride, int rnd)
1207 {
1208     ff_put_pixels8_8_mmi(dst, src, stride, 8);
1209 }
1210 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1211                                   ptrdiff_t stride, int rnd)
1212 {
1213     ff_put_pixels16_8_mmi(dst, src, stride, 16);
1214 }
1215 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1216                                ptrdiff_t stride, int rnd)
1217 {
1218     ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1219 }
1220 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1221                                   ptrdiff_t stride, int rnd)
1222 {
1223     ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1224 }
1225
1226 #define OP_PUT(S, D)
1227 #define OP_AVG(S, D)                                                        \
1228     "ldc1       $f16,   "#S"                        \n\t"                   \
1229     "pavgb      "#D",   "#D",   $f16                \n\t"
1230
1231 /** Add the rounder in $f14 to $f6 and $f8, then shift both right by SHIFT */
1232 #define NORMALIZE_MMI(SHIFT)                                                \
1233     "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
1234     "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
1235     "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
1236     "psrah      $f8,    $f8,    "SHIFT"             \n\t"
1237
1238 #define TRANSFER_DO_PACK(OP)                                                \
1239     "packushb   $f6,    $f6,    $f8                 \n\t"                   \
1240     OP((%[dst]), $f6)                                                       \
1241     "sdc1       $f6,    0x00(%[dst])                \n\t"
1242
1243 #define TRANSFER_DONT_PACK(OP)                                              \
1244      OP(0(%[dst]), $f6)                                                     \
1245      OP(8(%[dst]), $f8)                                                     \
1246      "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
1247      "sdc1      $f8,    0x08(%[dst])                \n\t"
1248
1249 /** Unpack helpers passed as the UNPACK argument of MSPEL_FILTER13_CORE */
1250 #define DO_UNPACK(reg)                                                      \
1251     "punpcklbh  "reg",  "reg",  $f0                 \n\t"
1252 #define DONT_UNPACK(reg)
1253
1254 /** Compute the rounder 32-r or 8-r and unpack it to $f14 */
1255 #define LOAD_ROUNDER_MMI(ROUND)                                             \
1256     "lwc1       $f14,   "ROUND"                     \n\t"                   \
1257     "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
1258     "punpcklwd  $f14,   $f14,   $f14                \n\t"
1259
1260
1261 #define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
1262     "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
1263     PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
1264     MMI_ULWC1(R0, $9, 0x00)                                                 \
1265     "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
1266     "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
1267     PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
1268     MMI_ULWC1(R3, $9, 0x00)                                                 \
1269     "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
1270     "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
1271     "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
1272     "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
1273     "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
1274     MMI_SDC1(R1, %[dst], OFF)                                               \
1275     PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
1276
1277 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1278 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1279                                        const uint8_t *src, mips_reg stride,
1280                                        int rnd, int64_t shift)
1281 {
1282     DECLARE_VAR_LOW32;
1283     DECLARE_VAR_ADDRT;
1284
1285     __asm__ volatile(
1286         "xor        $f0,    $f0,    $f0             \n\t"
1287         "li         $8,     0x03                    \n\t"
1288         LOAD_ROUNDER_MMI("%[rnd]")
1289         "ldc1       $f12,   %[ff_pw_9]              \n\t"
1290         "1:                                         \n\t"
1291         MMI_ULWC1($f4, %[src], 0x00)
1292         PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1293         MMI_ULWC1($f6, %[src], 0x00)
1294         "punpcklbh  $f4,    $f4,    $f0             \n\t"
1295         "punpcklbh  $f6,    $f6,    $f0             \n\t"
1296         SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
1297         SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1298         SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1299         SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1300         SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1301         SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1302         SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1303         SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1304         PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
1305         PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
1306         "addiu      $8,     $8,    -0x01            \n\t"
1307         "bnez       $8,     1b                      \n\t"
1308         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1309           [src]"+r"(src),               [dst]"+r"(dst)
1310         : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1311           [shift]"f"(shift),            [rnd]"m"(rnd),
1312           [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
1313         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1314           "$f14", "$f16", "memory"
1315     );
1316 }
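
/*
 * A scalar sketch (hypothetical helper, not compiled) of one 16-bit
 * intermediate produced by the vertical shift2 pass above:
 */
#if 0
static int16_t ver_16b_shift2_sketch(const uint8_t *src, int stride,
                                     int rnd, int shift)
{
    /* -1 9 9 -1 half-pel kernel along the vertical axis */
    int v = -src[-stride] + 9 * (src[0] + src[stride]) - src[2 * stride];
    return (v + rnd) >> shift;
}
#endif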
1317
1318 /**
1319  * The data is already unpacked to 16 bits, so some operations can be
1320  * performed directly from memory.
1321  */
1322 #define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1323 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1324                                              const int16_t *src, int rnd)   \
1325 {                                                                           \
1326     int h = 8;                                                              \
1327     DECLARE_VAR_ALL64;                                                      \
1328     DECLARE_VAR_ADDRT;                                                      \
1329                                                                             \
1330     src -= 1;                                                               \
1331     rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1332                                                                             \
1333     __asm__ volatile(                                                       \
1334         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1335         "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
1336         "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
1337         "1:                                         \n\t"                   \
1338         MMI_ULDC1($f2, %[src], 0x00)                                        \
1339         MMI_ULDC1($f4, %[src], 0x08)                                        \
1340         MMI_ULDC1($f6, %[src], 0x02)                                        \
1341         MMI_ULDC1($f8, %[src], 0x0a)                                        \
1342         MMI_ULDC1($f0, %[src], 0x06)                                        \
1343         "paddh      $f2,    $f2,    $f0             \n\t"                   \
1344         MMI_ULDC1($f0, %[src], 0x0e)                                        \
1345         "paddh      $f4,    $f4,    $f0             \n\t"                   \
1346         MMI_ULDC1($f0, %[src], 0x04)                                        \
1347         "paddh      $f6,    $f6,    $f0             \n\t"                   \
1348         MMI_ULDC1($f0, %[src], 0x0c)                                        \
1349         "paddh      $f8,    $f8,    $f0             \n\t"                   \
1350         "pmullh     $f6,    $f6,    $f10            \n\t"                   \
1351         "pmullh     $f8,    $f8,    $f10            \n\t"                   \
1352         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1353         "psubh      $f8,    $f8,    $f4             \n\t"                   \
1354         "li         $8,     0x07                    \n\t"                   \
1355         "mtc1       $8,     $f16                    \n\t"                   \
1356         NORMALIZE_MMI("$f16")                                               \
1357         /* Remove bias */                                                   \
1358         "paddh      $f6,    $f6,    $f12            \n\t"                   \
1359         "paddh      $f8,    $f8,    $f12            \n\t"                   \
1360         TRANSFER_DO_PACK(OP)                                                \
1361         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1362         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1363         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1364         "bnez       %[h],   1b                      \n\t"                   \
1365         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1366           [h]"+r"(h),                                                       \
1367           [src]"+r"(src),               [dst]"+r"(dst)                      \
1368         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1369           [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
1370         : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
1371           "$f16", "memory"                                                  \
1372     );                                                                      \
1373 }
1374
1375 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1376 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
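
/*
 * For reference, one output pixel of the horizontal shift2 stage written as
 * scalar C (hypothetical helper, not compiled); rnd is the value after the
 * bias subtraction performed at the top of the macro:
 */
#if 0
static uint8_t hor_16b_shift2_pixel_sketch(const int16_t *src, int rnd)
{
    /* -1 9 9 -1 kernel, 7-bit normalization, then the 128 bias removal */
    int v = -src[0] + 9 * (src[1] + src[2]) - src[3];
    return av_clip_uint8(((v + rnd) >> 7) + 128);
}
#endif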
1377
1378 /**
1379  * Purely vertical or horizontal 1/2 shift interpolation.
1380  * Sacrifice $f12 for the *9 factor.
1381  */
1382 #define VC1_SHIFT2(OP, OPNAME)\
1383 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1384                                      mips_reg stride, int rnd,              \
1385                                      mips_reg offset)                       \
1386 {                                                                           \
1387     DECLARE_VAR_LOW32;                                                      \
1388     DECLARE_VAR_ADDRT;                                                      \
1389                                                                             \
1390     rnd = 8 - rnd;                                                          \
1391                                                                             \
1392     __asm__ volatile(                                                       \
1393         "xor        $f0,    $f0,    $f0             \n\t"                   \
1394         "li         $10,    0x08                    \n\t"                   \
1395         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1396         "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
1397         "1:                                         \n\t"                   \
1398         MMI_ULWC1($f6, %[src], 0x00)                                        \
1399         MMI_ULWC1($f8, %[src], 0x04)                                        \
1400         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1401         MMI_ULWC1($f2, $9, 0x00)                                            \
1402         MMI_ULWC1($f4, $9, 0x04)                                            \
1403         PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
1404         "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1405         "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1406         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1407         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1408         "paddh      $f6,    $f6,    $f2             \n\t"                   \
1409         "paddh      $f8,    $f8,    $f4             \n\t"                   \
1410         PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
1411         MMI_ULWC1($f2, $9, 0x00)                                            \
1412         MMI_ULWC1($f4, $9, 0x04)                                            \
1413         "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
1414         "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
1415         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1416         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1417         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1418         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1419         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1420         MMI_ULWC1($f2, $9, 0x00)                                            \
1421         MMI_ULWC1($f4, $9, 0x04)                                            \
1422         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1423         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1424         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1425         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1426         "li         $8,     0x04                    \n\t"                   \
1427         "mtc1       $8,     $f16                    \n\t"                   \
1428         NORMALIZE_MMI("$f16")                                               \
1429         "packushb   $f6,    $f6,    $f8             \n\t"                   \
1430         OP((%[dst]), $f6)                                                   \
1431         "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1432         "addiu      $10,    $10,   -0x01            \n\t"                   \
1433         PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
1434         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1435         "bnez       $10,    1b                      \n\t"                   \
1436         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1437           [src]"+r"(src),               [dst]"+r"(dst)                      \
1438         : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1439           [stride]"g"(stride),          [rnd]"m"(rnd),                      \
1440           [stride1]"g"(stride-offset),                                      \
1441           [ff_pw_9]"m"(ff_pw_9)                                             \
1442         : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1443           "$f12", "$f14", "$f16", "memory"                                  \
1444     );                                                                      \
1445 }
1446
1447 VC1_SHIFT2(OP_PUT, put_)
1448 VC1_SHIFT2(OP_AVG, avg_)
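
/*
 * Scalar view of vc1_shift2 for one output pixel (hypothetical helper, not
 * compiled); offset selects the filter axis, and rnd is the caller's value
 * before the rnd = 8 - rnd flip above:
 */
#if 0
static uint8_t shift2_pixel_sketch(const uint8_t *src, int offset, int rnd)
{
    int v = -src[-offset] + 9 * (src[0] + src[offset]) - src[2 * offset];
    return av_clip_uint8((v + 8 - rnd) >> 4);
}
#endif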
1449
1450 /**
1451  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1452  *
1453  * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
1454  * @param LOAD    "MMI_ULWC1", or "MMI_ULDC1" when the data read is already unpacked.
1455  * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1456  * @param A1      Stride address of 1st tap (beware of unpacked/packed).
1457  * @param A2      Stride address of 2nd tap
1458  * @param A3      Stride address of 3rd tap
1459  * @param A4      Stride address of 4th tap
1460  */
1461 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1462     PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1463     LOAD($f2, $9, M*0)                                                      \
1464     LOAD($f4, $9, M*4)                                                      \
1465     UNPACK("$f2")                                                           \
1466     UNPACK("$f4")                                                           \
1467     "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
1468     "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
1469     PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1470     LOAD($f6, $9, M*0)                                                      \
1471     LOAD($f8, $9, M*4)                                                      \
1472     UNPACK("$f6")                                                           \
1473     UNPACK("$f8")                                                           \
1474     "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
1475     "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
1476     "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1477     "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1478     PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1479     LOAD($f2, $9, M*0)                                                      \
1480     LOAD($f4, $9, M*4)                                                      \
1481     UNPACK("$f2")                                                           \
1482     UNPACK("$f4")                                                           \
1483     "li         $8,     0x02                    \n\t"                       \
1484     "mtc1       $8,     $f16                    \n\t"                       \
1485     "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1486     "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1487     "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1488     "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1489     PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1490     LOAD($f2, $9, M*0)                                                      \
1491     LOAD($f4, $9, M*4)                                                      \
1492     UNPACK("$f2")                                                           \
1493     UNPACK("$f4")                                                           \
1494     "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
1495     "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
1496     "paddh      $f6,    $f6,    $f2             \n\t" /* 4,53,18,-3 */      \
1497     "paddh      $f8,    $f8,    $f4             \n\t" /* 4,53,18,-3 */
1498
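/*
 * Per 16-bit lane, the core above accumulates the VC-1 bicubic taps on the
 * four loads bound to A1..A4 (a scalar restatement, not compiled):
 */
#if 0
static int mspel_filter13_taps_sketch(int a1, int a2, int a3, int a4)
{
    return -3 * a1 + 18 * a2 + 53 * a3 - 4 * a4;
}
#endif
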
1499 /**
1500  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
1501  * Here, offset=src_stride. Parameters passed as A1 to A4 must use
1502  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride), %[stride_x3] (3*src_stride) or $0.
1503  *
1504  * @param  NAME   Either shift1 or shift3
1505  * @see MSPEL_FILTER13_CORE for information on A1->A4
1506  */
1507 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
1508 static void                                                                 \
1509 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
1510                                  mips_reg src_stride,                       \
1511                                  int rnd, int64_t shift)                    \
1512 {                                                                           \
1513     int h = 8;                                                              \
1514     DECLARE_VAR_LOW32;                                                      \
1515     DECLARE_VAR_ADDRT;                                                      \
1516                                                                             \
1517     src -= src_stride;                                                      \
1518                                                                             \
1519     __asm__ volatile(                                                       \
1520         "xor        $f0,    $f0,    $f0             \n\t"                   \
1521         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1522         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1523         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1524         ".p2align 3                                 \n\t"                   \
1525         "1:                                         \n\t"                   \
1526         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1527         NORMALIZE_MMI("%[shift]")                                           \
1528         TRANSFER_DONT_PACK(OP_PUT)                                          \
1529         /* Last 3 (in fact 4) bytes on the line */                          \
1530         PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
1531         MMI_ULWC1($f2, $9, 0x08)                                            \
1532         DO_UNPACK("$f2")                                                    \
1533         "mov.d      $f6,    $f2                     \n\t"                   \
1534         "paddh      $f2,    $f2,    $f2             \n\t"                   \
1535         "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
1536         PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
1537         MMI_ULWC1($f6, $9, 0x08)                                            \
1538         DO_UNPACK("$f6")                                                    \
1539         "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
1540         "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
1541         PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
1542         MMI_ULWC1($f2, $9, 0x08)                                            \
1543         DO_UNPACK("$f2")                                                    \
1544         "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
1545         "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
1546         PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
1547         MMI_ULWC1($f2, $9, 0x08)                                            \
1548         DO_UNPACK("$f2")                                                    \
1549         "li         $8,     0x02                    \n\t"                   \
1550         "mtc1       $8,     $f16                    \n\t"                   \
1551         "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
1552         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1553         "paddh      $f6,    $f6,    $f14            \n\t"                   \
1554         "li         $8,     0x06                    \n\t"                   \
1555         "mtc1       $8,     $f16                    \n\t"                   \
1556         "psrah      $f6,    $f6,    $f16            \n\t"                   \
1557         "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
1558         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1559         PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
1560         PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
1561         "bnez       %[h],   1b                      \n\t"                   \
1562         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1563           [h]"+r"(h),                                                       \
1564           [src]"+r"(src),               [dst]"+r"(dst)                      \
1565         : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
1566           [stride_x3]"r"(3*src_stride),                                     \
1567           [rnd]"m"(rnd),                [shift]"f"(shift),                  \
1568           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1569           [ff_pw_3]"f"(ff_pw_3)                                             \
1570         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1571           "$f14", "$f16", "memory"                                          \
1572     );                                                                      \
1573 }
1574
1575 /**
1576  * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
1577  * Here the data is 16 bits wide, so A1 to A4 are plain byte offsets.
1578  *
1579  * @param  NAME   Either shift1 or shift3
1580  * @see MSPEL_FILTER13_CORE for information on A1->A4
1581  */
1582 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
1583 static void                                                                 \
1584 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
1585                                        const int16_t *src, int rnd)         \
1586 {                                                                           \
1587     int h = 8;                                                              \
1588     DECLARE_VAR_ALL64;                                                      \
1589     DECLARE_VAR_ADDRT;                                                      \
1590                                                                             \
1591     src -= 1;                                                               \
1592     rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
1593                                                                             \
1594     __asm__ volatile(                                                       \
1595         "xor        $f0,    $f0,    $f0             \n\t"                   \
1596         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1597         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1598         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1599         ".p2align 3                                 \n\t"                   \
1600         "1:                                         \n\t"                   \
1601         MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
1602         "li         $8,     0x07                    \n\t"                   \
1603         "mtc1       $8,     $f16                    \n\t"                   \
1604         NORMALIZE_MMI("$f16")                                               \
1605         /* Remove bias */                                                   \
1606         "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
1607         "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
1608         TRANSFER_DO_PACK(OP)                                                \
1609         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1610         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1611         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1612         "bnez       %[h],   1b                      \n\t"                   \
1613         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1614           [h]"+r"(h),                                                       \
1615           [src]"+r"(src),               [dst]"+r"(dst)                      \
1616         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1617           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1618           [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
1619         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1620           "$f14", "$f16", "memory"                                          \
1621     );                                                                      \
1622 }
1623
1624 /**
1625  * Macro to build the 8-bit, any direction, version of vc1_put_shift[13].
1626  * Here, offset=src_stride. Parameters passed as A1 to A4 must use
1627  * %[offset_x1] (offset), %[offset_x2] (2*offset), %[offset_x3] (3*offset) or $0.
1628  *
1629  * @param  NAME   Either shift1 or shift3
1630  * @see MSPEL_FILTER13_CORE for information on A1->A4
1631  */
1632 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
1633 static void                                                                 \
1634 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
1635                               mips_reg stride, int rnd, mips_reg offset)    \
1636 {                                                                           \
1637     int h = 8;                                                              \
1638     DECLARE_VAR_LOW32;                                                      \
1639     DECLARE_VAR_ADDRT;                                                      \
1640                                                                             \
1641     src -= offset;                                                          \
1642     rnd = 32-rnd;                                                           \
1643                                                                             \
1644     __asm__ volatile (                                                      \
1645         "xor        $f0,    $f0,    $f0             \n\t"                   \
1646         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1647         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1648         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1649         ".p2align 3                                 \n\t"                   \
1650         "1:                                         \n\t"                   \
1651         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1652         "li         $8,     0x06                    \n\t"                   \
1653         "mtc1       $8,     $f16                    \n\t"                   \
1654         NORMALIZE_MMI("$f16")                                               \
1655         TRANSFER_DO_PACK(OP)                                                \
1656         "addiu      %[h],   %[h],      -0x01        \n\t"                   \
1657         PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
1658         PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
1659         "bnez       %[h],   1b                      \n\t"                   \
1660         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1661           [h]"+r"(h),                                                       \
1662           [src]"+r"(src),               [dst]"+r"(dst)                      \
1663         : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
1664           [offset_x3]"r"(3*offset),     [stride]"g"(stride),                \
1665           [rnd]"m"(rnd),                                                    \
1666           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1667           [ff_pw_3]"f"(ff_pw_3)                                             \
1668         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1669           "$f14", "$f16", "memory"                                          \
1670     );                                                                      \
1671 }
1672
1673
1674 /** 1/4 shift bicubic interpolation */
1675 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
1676 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
1677 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
1678 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
1679 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
1680
1681 /** 3/4 shift bicubic interpolation */
1682 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
1683 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
1684 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
1685 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
1686 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
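
/*
 * With the bindings above, the 8-bit shift1 variant computes, per pixel,
 * clip((-4*s[-1] + 53*s[0] + 18*s[1] - 3*s[2] + 32 - r) >> 6), and shift3
 * the mirrored kernel, i.e. the VC-1 1/4- and 3/4-pel bicubic
 * interpolators.
 */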
1687
1688 typedef void (*vc1_mspel_mc_filter_ver_16bits)
1689              (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
1690               int64_t shift);
1691 typedef void (*vc1_mspel_mc_filter_hor_16bits)
1692              (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
1693 typedef void (*vc1_mspel_mc_filter_8bits)
1694              (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
1695               mips_reg offset);
1696
1697 /**
1698  * Interpolate fractional pel values by applying the appropriate vertical
1699  * then horizontal filter.
1700  *
1701  * @param  dst     Destination buffer for interpolated pels.
1702  * @param  src     Source buffer.
1703  * @param  stride  Stride for both src and dst buffers.
1704  * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
1705  * @param  vmode   Vertical filter (expressed in quarter pixels shift).
1706  * @param  rnd     Rounding bias.
1707  */
1708 #define VC1_MSPEL_MC(OP)                                                    \
1709 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
1710                                int hmode, int vmode, int rnd)               \
1711 {                                                                           \
1712     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
1713          { NULL, vc1_put_ver_16b_shift1_mmi,                                \
1714                  vc1_put_ver_16b_shift2_mmi,                                \
1715                  vc1_put_ver_16b_shift3_mmi };                              \
1716     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
1717          { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
1718                  OP ## vc1_hor_16b_shift2_mmi,                              \
1719                  OP ## vc1_hor_16b_shift3_mmi };                            \
1720     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
1721          { NULL, OP ## vc1_shift1_mmi,                                      \
1722                  OP ## vc1_shift2_mmi,                                      \
1723                  OP ## vc1_shift3_mmi };                                    \
1724                                                                             \
1725     if (vmode) { /* Vertical filter to apply */                             \
1726         if (hmode) { /* Horizontal filter to apply, output to tmp */        \
1727             static const int shift_value[] = { 0, 5, 1, 5 };                \
1728             int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
1729             int    r;                                                       \
1730             LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
1731                                                                             \
1732             r = (1<<(shift-1)) + rnd-1;                                     \
1733             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
1734                                                                             \
1735             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
1736             return;                                                         \
1737         }                                                                   \
1738         else { /* No horizontal filter, output 8 lines to dst */            \
1739             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
1740             return;                                                         \
1741         }                                                                   \
1742     }                                                                       \
1743                                                                             \
1744     /* Horizontal mode with no vertical mode */                             \
1745     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
1746 }                                                                           \
1747 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
1748                                   int stride, int hmode, int vmode, int rnd)\
1749 {                                                                           \
1750     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
1751     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
1752     dst += 8*stride; src += 8*stride;                                       \
1753     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
1754     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
1755 }
1756
1757 VC1_MSPEL_MC(put_)
1758 VC1_MSPEL_MC(avg_)
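
/*
 * Usage sketch: a motion vector with a 3/4-pel horizontal and 1/2-pel
 * vertical fraction dispatches as put_vc1_mspel_mc(dst, src, stride, 3, 2,
 * rnd), running the vertical shift2 filter into the 16-bit tmp buffer and
 * then the horizontal shift3 filter from tmp to dst.
 */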
1759
1760 /** Macro to ease declaring the bicubic filter interpolation functions */
1761 #define DECLARE_FUNCTION(a, b)                                              \
1762 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
1763                                            const uint8_t *src,              \
1764                                            ptrdiff_t stride,                \
1765                                            int rnd)                         \
1766 {                                                                           \
1767      put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
1768 }                                                                           \
1769 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
1770                                            const uint8_t *src,              \
1771                                            ptrdiff_t stride,                \
1772                                            int rnd)                         \
1773 {                                                                           \
1774      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
1775 }                                                                           \
1776 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
1777                                               const uint8_t *src,           \
1778                                               ptrdiff_t stride,             \
1779                                               int rnd)                      \
1780 {                                                                           \
1781      put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
1782 }                                                                           \
1783 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
1784                                               const uint8_t *src,           \
1785                                               ptrdiff_t stride,             \
1786                                               int rnd)                      \
1787 {                                                                           \
1788      avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
1789 }
1790
1791 DECLARE_FUNCTION(0, 1)
1792 DECLARE_FUNCTION(0, 2)
1793 DECLARE_FUNCTION(0, 3)
1794
1795 DECLARE_FUNCTION(1, 0)
1796 DECLARE_FUNCTION(1, 1)
1797 DECLARE_FUNCTION(1, 2)
1798 DECLARE_FUNCTION(1, 3)
1799
1800 DECLARE_FUNCTION(2, 0)
1801 DECLARE_FUNCTION(2, 1)
1802 DECLARE_FUNCTION(2, 2)
1803 DECLARE_FUNCTION(2, 3)
1804
1805 DECLARE_FUNCTION(3, 0)
1806 DECLARE_FUNCTION(3, 1)
1807 DECLARE_FUNCTION(3, 2)
1808 DECLARE_FUNCTION(3, 3)
1809
1810 #define CHROMA_MC_8_MMI                                                     \
1811         "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
1812         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
1813         "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
1814         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
1815         "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
1816         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
1817         "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
1818         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
1819                                                                             \
1820         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
1821         "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
1822         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
1823         "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
1824         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
1825         "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
1826         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
1827         "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
1828                                                                             \
1829         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
1830         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
1831         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
1832         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
1833                                                                             \
1834         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
1835         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
1836         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
1837         "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
1838                                                                             \
1839         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
1840         "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
1841         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
1842
1843
1844 #define CHROMA_MC_4_MMI                                                     \
1845         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
1846         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
1847         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
1848         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
1849                                                                             \
1850         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
1851         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
1852         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
1853         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
1854                                                                             \
1855         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
1856         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
1857         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
1858         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
1859                                                                             \
1860         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
1861         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
1862
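/*
 * Both chroma macros above evaluate, per pixel, the bilinear expression
 * below (a scalar sketch with a hypothetical helper, not compiled); 28
 * comes from the ff_pw_28 rounder used by the no_rnd variants:
 */
#if 0
static uint8_t chroma_mc_pixel_sketch(const uint8_t *src, int stride,
                                      int A, int B, int C, int D)
{
    return (A * src[0]      + B * src[1] +
            C * src[stride] + D * src[stride + 1] + 28) >> 6;
}
#endif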
1863
1864 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1865                                       uint8_t *src /* align 1 */,
1866                                       int stride, int h, int x, int y)
1867 {
1868     const int A = (8 - x) * (8 - y);
1869     const int B =     (x) * (8 - y);
1870     const int C = (8 - x) *     (y);
1871     const int D =     (x) *     (y);
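    /* bilinear weights: A + B + C + D == 64, matching the >> 6 normalization */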
1872     double ftmp[10];
1873     uint32_t tmp[1];
1874     DECLARE_VAR_ALL64;
1875     DECLARE_VAR_ADDRT;
1876
1877     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1878
1879     __asm__ volatile(
1880         "li         %[tmp0],    0x06                                    \n\t"
1881         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1882         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
1883         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1884         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1885         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1886         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1887
1888         "1:                                                             \n\t"
1889         MMI_ULDC1(%[ftmp1], %[src], 0x00)
1890         MMI_ULDC1(%[ftmp2], %[src], 0x01)
1891         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1892         MMI_ULDC1(%[ftmp3], %[src], 0x00)
1893         MMI_ULDC1(%[ftmp4], %[src], 0x01)
1894
1895         CHROMA_MC_8_MMI
1896
1897         MMI_SDC1(%[ftmp1], %[dst], 0x00)
1898         "addiu      %[h],       %[h],      -0x01                        \n\t"
1899         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
1900         "bnez       %[h],       1b                                      \n\t"
1901         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1902           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1903           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1904           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1905           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1906           RESTRICT_ASM_ALL64
1907           RESTRICT_ASM_ADDRT
1908           [tmp0]"=&r"(tmp[0]),
1909           [src]"+&r"(src),              [dst]"+&r"(dst),
1910           [h]"+&r"(h)
1911         : [stride]"r"((mips_reg)stride),
1912           [A]"f"(A),                    [B]"f"(B),
1913           [C]"f"(C),                    [D]"f"(D),
1914           [ff_pw_28]"f"(ff_pw_28)
1915         : "memory"
1916     );
1917 }
1918
1919 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
1920                                       uint8_t *src /* align 1 */,
1921                                       int stride, int h, int x, int y)
1922 {
1923     const int A = (8 - x) * (8 - y);
1924     const int B =     (x) * (8 - y);
1925     const int C = (8 - x) *     (y);
1926     const int D =     (x) *     (y);
1927     double ftmp[6];
1928     uint32_t tmp[1];
1929     DECLARE_VAR_LOW32;
1930     DECLARE_VAR_ADDRT;
1931
1932     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1933
1934     __asm__ volatile(
1935         "li         %[tmp0],    0x06                                    \n\t"
1936         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1937         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
1938         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1939         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1940         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1941         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1942
1943         "1:                                                             \n\t"
1944         MMI_ULWC1(%[ftmp1], %[src], 0x00)
1945         MMI_ULWC1(%[ftmp2], %[src], 0x01)
1946         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1947         MMI_ULWC1(%[ftmp3], %[src], 0x00)
1948         MMI_ULWC1(%[ftmp4], %[src], 0x01)
1949
1950         CHROMA_MC_4_MMI
1951
1952         MMI_SWC1(%[ftmp1], %[dst], 0x00)
1953         "addiu      %[h],       %[h],      -0x01                        \n\t"
1954         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
1955         "bnez       %[h],       1b                                      \n\t"
1956         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1957           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1958           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1959           [tmp0]"=&r"(tmp[0]),
1960           RESTRICT_ASM_LOW32
1961           RESTRICT_ASM_ADDRT
1962           [src]"+&r"(src),              [dst]"+&r"(dst),
1963           [h]"+&r"(h)
1964         : [stride]"r"((mips_reg)stride),
1965           [A]"f"(A),                    [B]"f"(B),
1966           [C]"f"(C),                    [D]"f"(D),
1967           [ff_pw_28]"f"(ff_pw_28)
1968         : "memory"
1969     );
1970 }
1971
1972 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1973                                       uint8_t *src /* align 1 */,
1974                                       int stride, int h, int x, int y)
1975 {
1976     const int A = (8 - x) * (8 - y);
1977     const int B =     (x) * (8 - y);
1978     const int C = (8 - x) *     (y);
1979     const int D =     (x) *     (y);
1980     double ftmp[10];
1981     uint32_t tmp[1];
1982     DECLARE_VAR_ALL64;
1983     DECLARE_VAR_ADDRT;
1984
1985     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1986
1987     __asm__ volatile(
1988         "li         %[tmp0],    0x06                                    \n\t"
1989         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1990         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
1991         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1992         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1993         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1994         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1995
1996         "1:                                                             \n\t"
1997         MMI_ULDC1(%[ftmp1], %[src], 0x00)
1998         MMI_ULDC1(%[ftmp2], %[src], 0x01)
1999         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2000         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2001         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2002
2003         CHROMA_MC_8_MMI
2004
2005         MMI_LDC1(%[ftmp2], %[dst], 0x00)
2006         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2007
2008         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2009         "addiu      %[h],       %[h],      -0x01                        \n\t"
2010         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2011         "bnez       %[h],       1b                                      \n\t"
2012         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2013           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2014           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2015           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2016           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2017           [tmp0]"=&r"(tmp[0]),
2018           RESTRICT_ASM_ALL64
2019           RESTRICT_ASM_ADDRT
2020           [src]"+&r"(src),              [dst]"+&r"(dst),
2021           [h]"+&r"(h)
2022         : [stride]"r"((mips_reg)stride),
2023           [A]"f"(A),                    [B]"f"(B),
2024           [C]"f"(C),                    [D]"f"(D),
2025           [ff_pw_28]"f"(ff_pw_28)
2026         : "memory"
2027     );
2028 }
2029
2030 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2031                                       uint8_t *src /* align 1 */,
2032                                       int stride, int h, int x, int y)
2033 {
2034     const int A = (8 - x) * (8 - y);
2035     const int B =     (x) * (8 - y);
2036     const int C = (8 - x) *     (y);
2037     const int D =     (x) *     (y);
2038     double ftmp[6];
2039     uint32_t tmp[1];
2040     DECLARE_VAR_LOW32;
2041     DECLARE_VAR_ADDRT;
2042
2043     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2044
2045     __asm__ volatile(
2046         "li         %[tmp0],    0x06                                    \n\t"
2047         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2048         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2049         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2050         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2051         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2052         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2053
2054         "1:                                                             \n\t"
2055         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2056         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2057         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2058         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2059         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2060
2061         CHROMA_MC_4_MMI
2062
2063         MMI_LWC1(%[ftmp2], %[dst], 0x00)
2064         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2065
2066         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2067         "addiu      %[h],       %[h],      -0x01                        \n\t"
2068         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2069         "bnez       %[h],       1b                                      \n\t"
2070         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2071           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2072           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2073           [tmp0]"=&r"(tmp[0]),
2074           RESTRICT_ASM_LOW32
2075           RESTRICT_ASM_ADDRT
2076           [src]"+&r"(src),              [dst]"+&r"(dst),
2077           [h]"+&r"(h)
2078         : [stride]"r"((mips_reg)stride),
2079           [A]"f"(A),                    [B]"f"(B),
2080           [C]"f"(C),                    [D]"f"(D),
2081           [ff_pw_28]"f"(ff_pw_28)
2082         : "memory"
2083     );
2084 }