/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"


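/*
 * Odd half of the 8-point VC-1 inverse transform, on four 16-bit lanes at
 * a time.  With the coefficients the call sites pass (ff_p1..ff_p4 = 16,
 * 15, 9, 4) this matches the scalar reference in libavcodec/vc1dsp.c:
 *     o1 = 16*fp1 + 15*fp2 +  9*fp3 +  4*fp4
 *     o2 = 15*fp1 -  4*fp2 - 16*fp3 -  9*fp4
 *     o3 =  9*fp1 - 16*fp2 +  4*fp3 + 15*fp4
 *     o4 =  4*fp1 -  9*fp2 + 15*fp3 - 16*fp4
 * t1..t4 are scratch registers.
 */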
#define VC1_INV_TRANCS_8_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   o1,    o2,    o3,    o4,                 \
                                   t1,    t2,    t3,    t4,                 \
                                   ff_p1, ff_p2, ff_p3, ff_p4)              \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p4"                \n\t"   \
        "paddh      "#o1"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o1"   ,   "#o1"   ,   "#t3"                   \n\t"   \
        "paddh      "#o1"   ,   "#o1"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "psubh      "#o2"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "psubh      "#o2"   ,   "#o2"   ,   "#t3"                   \n\t"   \
        "psubh      "#o2"   ,   "#o2"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "psubh      "#o3"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o3"   ,   "#o3"   ,   "#t3"                   \n\t"   \
        "paddh      "#o3"   ,   "#o3"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p1"                \n\t"   \
        "psubh      "#o4"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o4"   ,   "#o4"   ,   "#t3"                   \n\t"   \
        "psubh      "#o4"   ,   "#o4"   ,   "#t4"                   \n\t"


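/*
 * Even half of the 8-point transform plus the final butterfly.  fp1..fp4
 * hold the even-indexed inputs, o1..o4 the odd-half sums from STEP1.
 * With ff_p1..ff_p3 = 12, 16, 6 and ff_pw the rounder r, the even part is
 *     t5/t8 = 12*(fp1 + fp2) + r  +/-  (16*fp3 + 6*fp4)
 *     t6/t7 = 12*(fp1 - fp2) + r  +/-  ( 6*fp3 - 16*fp4)
 * and the butterfly leaves t + o in fp5..fp8 and t - o in fp1..fp4.
 */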
#define VC1_INV_TRANCS_8_STEP2_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   fp5,   fp6,   fp7,   fp8,                \
                                   o1,    o2,    o3,    o4,                 \
                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
                                                                            \
        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
        "paddh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
                                                                            \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#o1"                   \n\t"   \
        "paddh      "#fp6"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
        "paddh      "#fp7"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
        "paddh      "#fp8"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
                                                                            \
        "psubh      "#fp4"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
        "psubh      "#fp3"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
        "psubh      "#fp2"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
        "psubh      "#fp1"  ,   "#fp1"  ,   "#o1"                   \n\t"


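/*
 * 4-point VC-1 inverse transform on four 16-bit lanes.  With ff_p1..ff_p3
 * = 17, 10, 22 and the inputs pre-permuted at the call sites, this is the
 * scalar
 *     t1/t2 = 17*(fp1 +/- fp2) + r
 *     t3    = 10*fp3 + 22*fp4
 *     t4    = 22*fp3 - 10*fp4
 * returning t1+t3, t2-t4, t2+t4, t1-t3 in fp1..fp4 (fp5..fp8 are scratch).
 */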
#define VC1_INV_TRANCS_4_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   fp5,   fp6,   fp7,   fp8,                \
                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
                                                                            \
        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
        "psubh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "paddh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"


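/*
 * Add the transformed residual in fp1..fp4 to destination pixels held in
 * the low halves of fp5..fp8: widen the bytes to 16 bits, add, and pack
 * back with unsigned saturation.
 */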
#define VC1_INV_TRANCS_4_STEP2_MMI(fp1, fp2, fp3, fp4,                      \
                                   fp5, fp6, fp7, fp8, zero)                \
        "punpcklbh  "#fp5"  ,   "#fp5"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp6"  ,   "#fp6"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp7"  ,   "#fp7"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp8"  ,   "#fp8"  ,   "#zero"                 \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp1"  ,   "#fp5"                  \n\t"   \
        "paddh      "#fp2"  ,   "#fp2"  ,   "#fp6"                  \n\t"   \
        "paddh      "#fp3"  ,   "#fp3"  ,   "#fp7"                  \n\t"   \
        "paddh      "#fp4"  ,   "#fp4"  ,   "#fp8"                  \n\t"   \
                                                                            \
        "packushb   "#fp1"  ,   "#fp1"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp2"  ,   "#fp2"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp3"  ,   "#fp3"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp4"  ,   "#fp4"  ,   "#zero"                 \n\t"


/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

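    /* Fold the DC gain of both 8-point passes into the DC coefficient:
     * (3 * dc +  1) >> 1 == (12 * dc +  4) >> 3 (first pass),
     * (3 * dc + 16) >> 5 == (12 * dc + 64) >> 7 (second pass). */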
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu      %[count],   %[count],       -0x01                   \n\t"
        PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count),          [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    int16_t *src = block;
    int16_t *dst = temp;
    double ftmp[16];
    uint32_t count, tmp[1];

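    /* Pass 1: 8-point transform down the columns, four columns per
     * iteration, with the two result halves transposed into temp so the
     * second pass can run the same column code on the other axis. */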
    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_4])

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        TRANSPOSE_4H(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp5], %[dst], 0x00)
        MMI_SDC1(%[ftmp6], %[dst], 0x10)
        MMI_SDC1(%[ftmp7], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        TRANSPOSE_4H(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDC1(%[ftmp3], %[dst], 0x18)
        MMI_SDC1(%[ftmp2], %[dst], 0x28)
        MMI_SDC1(%[ftmp1], %[dst], 0x38)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
        : "memory"
    );

    src = temp;
    dst = block;

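    /* Pass 2: the same 8-point transform on the transposed data, now with
     * rounder 64 and shift 7; ff_pw_1 is added to the difference outputs
     * first, matching the "+ 1" of the scalar reference's second pass. */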
    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_64])

        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        MMI_SDC1(%[ftmp5], %[dst], 0x00)
        MMI_SDC1(%[ftmp6], %[dst], 0x10)
        MMI_SDC1(%[ftmp7], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        MMI_SDC1(%[ftmp4], %[dst], 0x40)
        MMI_SDC1(%[ftmp3], %[dst], 0x50)
        MMI_SDC1(%[ftmp2], %[dst], 0x60)
        MMI_SDC1(%[ftmp1], %[dst], 0x70)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x08                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

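    /* DC gain of the 8-point row pass, then of the 4-point column pass:
     * ( 3 * dc +  1) >> 1 == (12 * dc +  4) >> 3,
     * (17 * dc + 64) >> 7. */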
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

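    /* Pass 1: 8-point transform along the rows; the 4x4 tiles are
     * transposed first so the eight coefficients of a row sit in eight
     * registers, four rows being processed in parallel. */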
    // 1st loop
    __asm__ volatile (
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)
        MMI_LDC1(%[ftmp3], %[src], 0x10)
        MMI_LDC1(%[ftmp4], %[src], 0x18)
        MMI_LDC1(%[ftmp5], %[src], 0x20)
        MMI_LDC1(%[ftmp6], %[src], 0x28)
        MMI_LDC1(%[ftmp7], %[src], 0x30)
        MMI_LDC1(%[ftmp8], %[src], 0x38)

        //             a1        b1        a3        b2
        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp5], %[ftmp7],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        //             a2        b3        a4        b4
        TRANSPOSE_4H(%[ftmp2], %[ftmp4], %[ftmp6], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        // input b1 b2 b3 b4
        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0], %[ftmp13], %[ftmp14], %[ftmp15],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])
        // input a1 a2 a3 a4
        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp5], %[ftmp6],
                                   %[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_4])

        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        PSRAH_8_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                    %[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1], %[ftmp0])

        TRANSPOSE_4H(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp3], %[dst], 0x00)
        MMI_SDC1(%[ftmp7], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        TRANSPOSE_4H(%[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp6], %[dst], 0x08)
        MMI_SDC1(%[ftmp5], %[dst], 0x18)
        MMI_SDC1(%[ftmp2], %[dst], 0x28)
        MMI_SDC1(%[ftmp1], %[dst], 0x38)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [src]"r"(src),                [dst]"r"(dst),
          [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"

        // dest low 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x30)
        MMI_LDC1(%[ftmp4], %[src], 0x10)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])

        MMI_LWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x00)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp0])

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)

        // dest high 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x28)
        MMI_LDC1(%[ftmp3], %[src], 0x38)
        MMI_LDC1(%[ftmp4], %[src], 0x18)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])

        MMI_LWC1(%[ftmp5], %[dest], 0x04)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x04)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp0])

        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0])
        : [src]"r"(src),                [dest]"r"(dest),
          [linesize]"r"((mips_reg)linesize),
          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x8 part of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

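    /* DC gain of the 4-point row pass ((17 * dc + 4) >> 3), then of the
     * 8-point column pass ((12 * dc + 64) >> 7). */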
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
        "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count, tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

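    /* Pass 1: 4-point transform along the rows (transposed before and
     * after so the four row coefficients sit in four registers), two
     * groups of four rows. */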
    // 1st loop
    __asm__ volatile (
        "li         %[count],   0x02                                    \n\t"
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        //                              t1        t2        t3        t4
        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_4])

        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])

        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        MMI_SDC1(%[ftmp3], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp2], %[dst], 0x30)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x40                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_64])

        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"

        // dest low
        MMI_LWC1(%[ftmp9], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp12], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0])

        // dest high
        MMI_LWC1(%[ftmp9], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp12], %[addr0], 0x00)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0])

        // dest low
        MMI_SWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp8], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"

        // dest high
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp1], %[addr0], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),
          [dest]"+&r"(dest)
        : [src]"r"(src),                [linesize]"r"((mips_reg)linesize),
          [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    DECLARE_VAR_LOW32;

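    /* DC gain of the 4-point row and column passes. */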
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[12];
    uint32_t tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

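    /* Pass 1: 4-point transform along the rows, via a transpose before
     * and after; pass 2 below runs down the columns and adds to dest. */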
    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        //                              t1        t2        t3        t4
        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_4])

        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])

        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        MMI_SDC1(%[ftmp3], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp2], %[dst], 0x30)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        // dest low 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x30)
        MMI_LDC1(%[ftmp4], %[src], 0x10)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp0])

        MMI_LWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x00)

        "xor        %[ftmp9],   %[ftmp9],  %[ftmp9]                     \n\t"

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9])

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0])
        : [src]"r"(src),                [dest]"r"(dest),
          [linesize]"r"((mips_reg)linesize),
          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}

/* Apply overlap transform to horizontal edge */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2];
        b  = src[-1];
        c  = src[0];
        d  = src[1];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2] = a - d1;
        src[-1] = av_clip_uint8(b - d2);
        src[0]  = av_clip_uint8(c + d2);
        src[1]  = d + d1;
        src    += stride;
        rnd     = !rnd;
    }
}

void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
        right[0] = ((c << 3) + d2 + rnd1) >> 3;
        right[1] = ((d << 3) + d1 + rnd2) >> 3;

        right += 8;
        left  += 8;
        rnd2   = 7 - rnd2;
        rnd1   = 7 - rnd1;
    }
}

/* Apply overlap transform to vertical edge */
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2 * stride];
        b  = src[-stride];
        c  = src[0];
        d  = src[stride];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2 * stride] = a - d1;
        src[-stride]     = av_clip_uint8(b - d2);
        src[0]           = av_clip_uint8(c + d2);
        src[stride]      = d + d1;
        src++;
        rnd = !rnd;
    }
}

void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        top[48]   = ((a << 3) - d1 + rnd1) >> 3;
        top[56]   = ((b << 3) - d2 + rnd2) >> 3;
        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}

/**
 * VC-1 in-loop deblocking filter for one line
 * @param src pointer to the pixels to filter
1101  * @param stride block stride
1102  * @param pq block quantizer
1103  * @return whether other 3 pairs should be filtered or not
1104  * @see 8.6
1105  */
1106 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1107 {
1108     int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1109               5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1110     int a0_sign = a0 >> 31;        /* Store sign */
1111
1112     a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1113     if (a0 < pq) {
1114         int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1115                         5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1116         int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1117                         5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1118         if (a1 < a0 || a2 < a0) {
1119             int clip      = src[-1 * stride] - src[0 * stride];
1120             int clip_sign = clip >> 31;
1121
1122             clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1123             if (clip) {
1124                 int a3     = FFMIN(a1, a2);
1125                 int d      = 5 * (a3 - a0);
1126                 int d_sign = (d >> 31);
1127
1128                 d       = ((d ^ d_sign) - d_sign) >> 3;
1129                 d_sign ^= a0_sign;
1130
1131                 if (d_sign ^ clip_sign)
1132                     d = 0;
1133                 else {
1134                     d = FFMIN(d, clip);
1135                     d = (d ^ d_sign) - d_sign; /* Restore sign */
1136                     src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1137                     src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1138                 }
1139                 return 1;
1140             }
1141         }
1142     }
1143     return 0;
1144 }
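/* Sign-mask arithmetic used above: for two's-complement x, x >> 31 is 0
 * when x >= 0 and -1 (all ones) otherwise, so (x ^ sign) - sign equals
 * FFABS(x). Example: a0 = -5 gives a0_sign = -1 and
 * (-5 ^ -1) - (-1) = 4 + 1 = 5. */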
1145
1146 /**
1147  * VC-1 in-loop deblocking filter
1148  * @param src source block
1149  * @param step distance between horizontally adjacent elements
1150  * @param stride distance between vertically adjacent elements
1151  * @param len edge length to filter (4, 8 or 16 pixels)
1152  * @param pq block quantizer
1153  * @see 8.6
1154  */
1155 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1156                                    int len, int pq)
1157 {
1158     int i;
1159     int filt3;
1160
1161     for (i = 0; i < len; i += 4) {
1162         filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1163         if (filt3) {
1164             vc1_filter_line(src + 0 * step, stride, pq);
1165             vc1_filter_line(src + 1 * step, stride, pq);
1166             vc1_filter_line(src + 3 * step, stride, pq);
1167         }
1168         src += step * 4;
1169     }
1170 }
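/* The edge is processed in groups of 4 lines; the third line of each
 * group (src + 2 * step) is filtered first and, per the @return note on
 * vc1_filter_line(), decides whether the other three are filtered. */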
1171
1172 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1173 {
1174     vc1_loop_filter(src, 1, stride, 4, pq);
1175 }
1176
1177 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1178 {
1179     vc1_loop_filter(src, stride, 1, 4, pq);
1180 }
1181
1182 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1183 {
1184     vc1_loop_filter(src, 1, stride, 8, pq);
1185 }
1186
1187 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1188 {
1189     vc1_loop_filter(src, stride, 1, 8, pq);
1190 }
1191
1192 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1193 {
1194     vc1_loop_filter(src, 1, stride, 16, pq);
1195 }
1196
1197 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1198 {
1199     vc1_loop_filter(src, stride, 1, 16, pq);
1200 }
1201
1202 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1203                                ptrdiff_t stride, int rnd)
1204 {
1205     ff_put_pixels8_8_mmi(dst, src, stride, 8);
1206 }
1207 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1208                                   ptrdiff_t stride, int rnd)
1209 {
1210     ff_put_pixels16_8_mmi(dst, src, stride, 16);
1211 }
1212 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1213                                ptrdiff_t stride, int rnd)
1214 {
1215     ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1216 }
1217 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1218                                   ptrdiff_t stride, int rnd)
1219 {
1220     ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1221 }
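/* mc00 is the integer-pel position: no filtering is needed, so these
 * wrappers reduce to the plain hpeldsp copy/average primitives. */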
1222
1223 #define OP_PUT(S, D)
1224 #define OP_AVG(S, D)                                                        \
1225     "ldc1       $f16,   "#S"                        \n\t"                   \
1226     "pavgb      "#D",   "#D",   $f16                \n\t"
1227
1228 /** Add the rounder in $f14 to $f6 and $f8, then shift both right by SHIFT */
1229 #define NORMALIZE_MMI(SHIFT)                                                \
1230     "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
1231     "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
1232     "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
1233     "psrah      $f8,    $f8,    "SHIFT"             \n\t"
1234
1235 #define TRANSFER_DO_PACK(OP)                                                \
1236     "packushb   $f6,    $f6,    $f8                 \n\t"                   \
1237     OP((%[dst]), $f6)                                                       \
1238     "sdc1       $f6,    0x00(%[dst])                \n\t"
1239
1240 #define TRANSFER_DONT_PACK(OP)                                              \
1241      OP(0(%[dst]), $f6)                                                     \
1242      OP(8(%[dst]), $f8)                                                     \
1243      "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
1244      "sdc1      $f8,    0x08(%[dst])                \n\t"
1245
1246 /** Unpack helpers passed to MSPEL_FILTER13_CORE as its UNPACK macro */
1247 #define DO_UNPACK(reg)                                                      \
1248     "punpcklbh  "reg",  "reg",  $f0                 \n\t"
1249 #define DONT_UNPACK(reg)
1250
1251 /** Load the rounder 32-r or 8-r and unpack it to all four halfwords of $f14 */
1252 #define LOAD_ROUNDER_MMI(ROUND)                                             \
1253     "lwc1       $f14,   "ROUND"                     \n\t"                   \
1254     "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
1255     "punpcklwd  $f14,   $f14,   $f14                \n\t"
1256
1257
1258 #define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
1259     "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
1260     PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
1261     MMI_ULWC1(R0, $9, 0x00)                                                 \
1262     "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
1263     "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
1264     PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
1265     MMI_ULWC1(R3, $9, 0x00)                                                 \
1266     "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
1267     "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
1268     "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
1269     "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
1270     "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
1271     MMI_SDC1(R1, %[dst], OFF)                                               \
1272     PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
1273
1274 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1275 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1276                                        const uint8_t *src, mips_reg stride,
1277                                        int rnd, int64_t shift)
1278 {
1279     DECLARE_VAR_LOW32;
1280     DECLARE_VAR_ADDRT;
1281
1282     __asm__ volatile(
1283         "xor        $f0,    $f0,    $f0             \n\t"
1284         "li         $8,     0x03                    \n\t"
1285         LOAD_ROUNDER_MMI("%[rnd]")
1286         "ldc1       $f12,   %[ff_pw_9]              \n\t"
1287         "1:                                         \n\t"
1288         MMI_ULWC1($f4, %[src], 0x00)
1289         PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1290         MMI_ULWC1($f6, %[src], 0x00)
1291         "punpcklbh  $f4,    $f4,    $f0             \n\t"
1292         "punpcklbh  $f6,    $f6,    $f0             \n\t"
1293         SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
1294         SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1295         SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1296         SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1297         SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1298         SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1299         SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1300         SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1301         PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
1302         PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
1303         "addiu      $8,     $8,    -0x01            \n\t"
1304         "bnez       $8,     1b                      \n\t"
1305         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1306           [src]"+r"(src),               [dst]"+r"(dst)
1307         : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1308           [shift]"f"(shift),            [rnd]"m"(rnd),
1309           [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
1310         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1311           "$f14", "$f16", "memory"
1312     );
1313 }
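/* Loop shape: $8 counts 3 groups of 4 columns (each 24-byte tmp row
 * holds 12 int16 values); every group runs 8 SHIFT2_LINEs down the
 * rows, after which subtracting [stride2] = 9*stride-4 moves src back
 * up 9 rows and right by the 4 columns just completed. */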
1314
1315 /**
1316  * Data is already unpacked, so some operations can be performed directly
1317  * from memory.
1318  */
1319 #define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1320 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1321                                              const int16_t *src, int rnd)   \
1322 {                                                                           \
1323     int h = 8;                                                              \
1324     DECLARE_VAR_ALL64;                                                      \
1325     DECLARE_VAR_ADDRT;                                                      \
1326                                                                             \
1327     src -= 1;                                                               \
1328     rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1329                                                                             \
1330     __asm__ volatile(                                                       \
1331         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1332         "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
1333         "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
1334         "1:                                         \n\t"                   \
1335         MMI_ULDC1($f2, %[src], 0x00)                                        \
1336         MMI_ULDC1($f4, %[src], 0x08)                                        \
1337         MMI_ULDC1($f6, %[src], 0x02)                                        \
1338         MMI_ULDC1($f8, %[src], 0x0a)                                        \
1339         MMI_ULDC1($f0, %[src], 0x06)                                        \
1340         "paddh      $f2,    $f2,    $f0             \n\t"                   \
1341         MMI_ULDC1($f0, %[src], 0x0e)                                        \
1342         "paddh      $f4,    $f4,    $f0             \n\t"                   \
1343         MMI_ULDC1($f0, %[src], 0x04)                                        \
1344         "paddh      $f6,    $f6,    $f0             \n\t"                   \
1345         MMI_ULDC1($f0, %[src], 0x0c)                                        \
1346         "paddh      $f8,    $f8,    $f0             \n\t"                   \
1347         "pmullh     $f6,    $f6,    $f10            \n\t"                   \
1348         "pmullh     $f8,    $f8,    $f10            \n\t"                   \
1349         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1350         "psubh      $f8,    $f8,    $f4             \n\t"                   \
1351         "li         $8,     0x07                    \n\t"                   \
1352         "mtc1       $8,     $f16                    \n\t"                   \
1353         NORMALIZE_MMI("$f16")                                               \
1354         /* Remove bias */                                                   \
1355         "paddh      $f6,    $f6,    $f12            \n\t"                   \
1356         "paddh      $f8,    $f8,    $f12            \n\t"                   \
1357         TRANSFER_DO_PACK(OP)                                                \
1358         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1359         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1360         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1361         "bnez       %[h],   1b                      \n\t"                   \
1362         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1363           [h]"+r"(h),                                                       \
1364           [src]"+r"(src),               [dst]"+r"(dst)                      \
1365         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1366           [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
1367         : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
1368           "$f16", "memory"                                                  \
1369     );                                                                      \
1370 }
1371
1372 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1373 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
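/* Bias bookkeeping in the macro above: the rounder is pre-biased by
 * -(-1+9+9-1)*1024 = -16384, which after the >>7 shift comes out as
 * -128 per lane; the two paddh with ff_pw_128 cancel it again. The
 * detour keeps the intermediate 16-bit sums inside the signed range. */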
1374
1375 /**
1376  * Purely vertical or horizontal 1/2 shift interpolation.
1377  * Sacrifice $f12 for the *9 factor.
1378  */
1379 #define VC1_SHIFT2(OP, OPNAME)\
1380 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1381                                      mips_reg stride, int rnd,              \
1382                                      mips_reg offset)                       \
1383 {                                                                           \
1384     DECLARE_VAR_LOW32;                                                      \
1385     DECLARE_VAR_ADDRT;                                                      \
1386                                                                             \
1387     rnd = 8 - rnd;                                                          \
1388                                                                             \
1389     __asm__ volatile(                                                       \
1390         "xor        $f0,    $f0,    $f0             \n\t"                   \
1391         "li         $10,    0x08                    \n\t"                   \
1392         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1393         "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
1394         "1:                                         \n\t"                   \
1395         MMI_ULWC1($f6, %[src], 0x00)                                        \
1396         MMI_ULWC1($f8, %[src], 0x04)                                        \
1397         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1398         MMI_ULWC1($f2, $9, 0x00)                                            \
1399         MMI_ULWC1($f4, $9, 0x04)                                            \
1400         PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
1401         "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1402         "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1403         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1404         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1405         "paddh      $f6,    $f6,    $f2             \n\t"                   \
1406         "paddh      $f8,    $f8,    $f4             \n\t"                   \
1407         PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
1408         MMI_ULWC1($f2, $9, 0x00)                                            \
1409         MMI_ULWC1($f4, $9, 0x04)                                            \
1410         "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
1411         "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
1412         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1413         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1414         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1415         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1416         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1417         MMI_ULWC1($f2, $9, 0x00)                                            \
1418         MMI_ULWC1($f4, $9, 0x04)                                            \
1419         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1420         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1421         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1422         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1423         "li         $8,     0x04                    \n\t"                   \
1424         "mtc1       $8,     $f16                    \n\t"                   \
1425         NORMALIZE_MMI("$f16")                                               \
1426         "packushb   $f6,    $f6,    $f8             \n\t"                   \
1427         OP((%[dst]), $f6)                                                   \
1428         "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1429         "addiu      $10,    $10,   -0x01            \n\t"                   \
1430         PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
1431         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1432         "bnez       $10,    1b                      \n\t"                   \
1433         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1434           [src]"+r"(src),               [dst]"+r"(dst)                      \
1435         : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1436           [stride]"g"(stride),          [rnd]"m"(rnd),                      \
1437           [stride1]"g"(stride-offset),                                      \
1438           [ff_pw_9]"m"(ff_pw_9)                                             \
1439         : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1440           "$f12", "$f14", "$f16", "memory"                                  \
1441     );                                                                      \
1442 }
1443
1444 VC1_SHIFT2(OP_PUT, put_)
1445 VC1_SHIFT2(OP_AVG, avg_)
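/* These implement the half-pel (-1,9,9,-1)/16 filter (mode 2 of the
 * scalar vc1_mspel_filter() in vc1dsp.c): offset = stride selects the
 * vertical variant and offset = 1 the horizontal one, with rounder
 * 8 - rnd and the final >>4 loaded into $f16 above. */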
1446
1447 /**
1448  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1449  *
1450  * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
1451  * @param LOAD    "MMI_ULWC1", or "MMI_ULDC1" if the data read is already unpacked.
1452  * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1453  * @param A1      Stride address of 1st tap (beware of unpacked/packed).
1454  * @param A2      Stride address of 2nd tap
1455  * @param A3      Stride address of 3rd tap
1456  * @param A4      Stride address of 4th tap
1457  */
1458 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1459     PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1460     LOAD($f2, $9, M*0)                                                      \
1461     LOAD($f4, $9, M*4)                                                      \
1462     UNPACK("$f2")                                                           \
1463     UNPACK("$f4")                                                           \
1464     "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
1465     "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
1466     PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1467     LOAD($f6, $9, M*0)                                                      \
1468     LOAD($f8, $9, M*4)                                                      \
1469     UNPACK("$f6")                                                           \
1470     UNPACK("$f8")                                                           \
1471     "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
1472     "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
1473     "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1474     "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1475     PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1476     LOAD($f2, $9, M*0)                                                      \
1477     LOAD($f4, $9, M*4)                                                      \
1478     UNPACK("$f2")                                                           \
1479     UNPACK("$f4")                                                           \
1480     "li         $8,     0x02                    \n\t"                       \
1481     "mtc1       $8,     $f16                    \n\t"                       \
1482     "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1483     "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1484     "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1485     "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1486     PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1487     LOAD($f2, $9, M*0)                                                      \
1488     LOAD($f4, $9, M*4)                                                      \
1489     UNPACK("$f2")                                                           \
1490     UNPACK("$f4")                                                           \
1491     "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
1492     "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
1493     "paddh      $f6,    $f6,    $f2             \n\t" /* 4,53,18,-3 */      \
1494     "paddh      $f8,    $f8,    $f4             \n\t" /* 4,53,18,-3 */
1495
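/* Net effect on $f6/$f8 (the low/high halves of the row):
 * -3*p(A1) + 18*p(A2) + 53*p(A3) - 4*p(A4). The taps sum to 64, hence
 * the >>6 (or the bias-adjusted >>7) applied by the callers below. */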
1496 /**
1497  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
1498  * Here, offset=src_stride. Parameters passed A1 to A4 must use %[stride_x1]
1499  * (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1500  *
1501  * @param  NAME   Either 1 or 3
1502  * @see MSPEL_FILTER13_CORE for information on A1->A4
1503  */
1504 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
1505 static void                                                                 \
1506 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
1507                                  mips_reg src_stride,                       \
1508                                  int rnd, int64_t shift)                    \
1509 {                                                                           \
1510     int h = 8;                                                              \
1511     DECLARE_VAR_LOW32;                                                      \
1512     DECLARE_VAR_ADDRT;                                                      \
1513                                                                             \
1514     src -= src_stride;                                                      \
1515                                                                             \
1516     __asm__ volatile(                                                       \
1517         "xor        $f0,    $f0,    $f0             \n\t"                   \
1518         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1519         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1520         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1521         ".p2align 3                                 \n\t"                   \
1522         "1:                                         \n\t"                   \
1523         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1524         NORMALIZE_MMI("%[shift]")                                           \
1525         TRANSFER_DONT_PACK(OP_PUT)                                          \
1526         /* Last 3 (in fact 4) bytes on the line */                          \
1527         PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
1528         MMI_ULWC1($f2, $9, 0x08)                                            \
1529         DO_UNPACK("$f2")                                                    \
1530         "mov.d      $f6,    $f2                     \n\t"                   \
1531         "paddh      $f2,    $f2,    $f2             \n\t"                   \
1532         "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
1533         PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
1534         MMI_ULWC1($f6, $9, 0x08)                                            \
1535         DO_UNPACK("$f6")                                                    \
1536         "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
1537         "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
1538         PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
1539         MMI_ULWC1($f2, $9, 0x08)                                            \
1540         DO_UNPACK("$f2")                                                    \
1541         "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
1542         "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
1543         PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
1544         MMI_ULWC1($f2, $9, 0x08)                                            \
1545         DO_UNPACK("$f2")                                                    \
1546         "li         $8,     0x02                    \n\t"                   \
1547         "mtc1       $8,     $f16                    \n\t"                   \
1548         "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
1549         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1550         "paddh      $f6,    $f6,    $f14            \n\t"                   \
1551         "li         $8,     0x06                    \n\t"                   \
1552         "mtc1       $8,     $f16                    \n\t"                   \
1553         "psrah      $f6,    $f6,    $f16            \n\t"                   \
1554         "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
1555         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1556         PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
1557         PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
1558         "bnez       %[h],   1b                      \n\t"                   \
1559         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1560           [h]"+r"(h),                                                       \
1561           [src]"+r"(src),               [dst]"+r"(dst)                      \
1562         : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
1563           [stride_x3]"r"(3*src_stride),                                     \
1564           [rnd]"m"(rnd),                [shift]"f"(shift),                  \
1565           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1566           [ff_pw_3]"f"(ff_pw_3)                                             \
1567         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1568           "$f14", "$f16", "memory"                                          \
1569     );                                                                      \
1570 }
1571
1572 /**
1573  * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
1574  * Here the data is 16 bits wide, so parameters A1 to A4 are plain byte offsets.
1575  *
1576  * @param  NAME   Either 1 or 3
1577  * @see MSPEL_FILTER13_CORE for information on A1->A4
1578  */
1579 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
1580 static void                                                                 \
1581 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
1582                                        const int16_t *src, int rnd)         \
1583 {                                                                           \
1584     int h = 8;                                                              \
1585     DECLARE_VAR_ALL64;                                                      \
1586     DECLARE_VAR_ADDRT;                                                      \
1587                                                                             \
1588     src -= 1;                                                               \
1589     rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
1590                                                                             \
1591     __asm__ volatile(                                                       \
1592         "xor        $f0,    $f0,    $f0             \n\t"                   \
1593         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1594         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1595         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1596         ".p2align 3                                 \n\t"                   \
1597         "1:                                         \n\t"                   \
1598         MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
1599         "li         $8,     0x07                    \n\t"                   \
1600         "mtc1       $8,     $f16                    \n\t"                   \
1601         NORMALIZE_MMI("$f16")                                               \
1602         /* Remove bias */                                                   \
1603         "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
1604         "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
1605         TRANSFER_DO_PACK(OP)                                                \
1606         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1607         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1608         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1609         "bnez       %[h],   1b                      \n\t"                   \
1610         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1611           [h]"+r"(h),                                                       \
1612           [src]"+r"(src),               [dst]"+r"(dst)                      \
1613         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1614           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1615           [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
1616         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1617           "$f14", "$f16", "memory"                                          \
1618     );                                                                      \
1619 }
1620
1621 /**
1622  * Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
1623  * Here, offset is the tap spacing. Parameters passed A1 to A4 must use
1624  * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
1625  *
1626  * @param  NAME   Either 1 or 3
1627  * @see MSPEL_FILTER13_CORE for information on A1->A4
1628  */
1629 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
1630 static void                                                                 \
1631 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
1632                               mips_reg stride, int rnd, mips_reg offset)    \
1633 {                                                                           \
1634     int h = 8;                                                              \
1635     DECLARE_VAR_LOW32;                                                      \
1636     DECLARE_VAR_ADDRT;                                                      \
1637                                                                             \
1638     src -= offset;                                                          \
1639     rnd = 32-rnd;                                                           \
1640                                                                             \
1641     __asm__ volatile (                                                      \
1642         "xor        $f0,    $f0,    $f0             \n\t"                   \
1643         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1644         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1645         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1646         ".p2align 3                                 \n\t"                   \
1647         "1:                                         \n\t"                   \
1648         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1649         "li         $8,     0x06                    \n\t"                   \
1650         "mtc1       $8,     $f16                    \n\t"                   \
1651         NORMALIZE_MMI("$f16")                                               \
1652         TRANSFER_DO_PACK(OP)                                                \
1653         "addiu      %[h],   %[h],      -0x01        \n\t"                   \
1654         PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
1655         PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
1656         "bnez       %[h],   1b                      \n\t"                   \
1657         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1658           [h]"+r"(h),                                                       \
1659           [src]"+r"(src),               [dst]"+r"(dst)                      \
1660         : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
1661           [offset_x3]"r"(3*offset),     [stride]"g"(stride),                \
1662           [rnd]"m"(rnd),                                                    \
1663           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1664           [ff_pw_3]"f"(ff_pw_3)                                             \
1665         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1666           "$f14", "$f16", "memory"                                          \
1667     );                                                                      \
1668 }
1669
1670
1671 /** 1/4 shift bicubic interpolation */
1672 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
1673 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
1674 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
1675 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
1676 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
1677
1678 /** 3/4 shift bicubic interpolation */
1679 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
1680 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
1681 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
1682 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
1683 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
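/* For reference, a scalar sketch of the filters the shift1/shift3
 * kernels above implement, mirroring the C fallback vc1_mspel_filter()
 * in libavcodec/vc1dsp.c (illustration only, excluded from the build;
 * the helper name is ours): */
#if 0
static inline int mspel_filter_sketch(const uint8_t *p, int step,
                                      int mode, int rnd)
{
    switch (mode) {
    case 1: /* 1/4 shift */
        return (-4 * p[-step] + 53 * p[0] +
                18 * p[step] - 3 * p[2 * step] + 32 - rnd) >> 6;
    case 3: /* 3/4 shift: the same taps, mirrored */
        return (-3 * p[-step] + 18 * p[0] +
                53 * p[step] - 4 * p[2 * step] + 32 - rnd) >> 6;
    }
    return p[0]; /* integer position */
}
#endif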
1684
1685 typedef void (*vc1_mspel_mc_filter_ver_16bits)
1686              (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
1687               int64_t shift);
1688 typedef void (*vc1_mspel_mc_filter_hor_16bits)
1689              (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
1690 typedef void (*vc1_mspel_mc_filter_8bits)
1691              (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
1692               mips_reg offset);
1693
1694 /**
1695  * Interpolate fractional pel values by applying proper vertical then
1696  * horizontal filter.
1697  *
1698  * @param  dst     Destination buffer for interpolated pels.
1699  * @param  src     Source buffer.
1700  * @param  stride  Stride for both src and dst buffers.
1701  * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
1702  * @param  vmode   Vertical filter (expressed in quarter pixels shift).
1703  * @param  rnd     Rounding bias.
1704  */
1705 #define VC1_MSPEL_MC(OP)                                                    \
1706 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
1707                                int hmode, int vmode, int rnd)               \
1708 {                                                                           \
1709     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
1710          { NULL, vc1_put_ver_16b_shift1_mmi,                                \
1711                  vc1_put_ver_16b_shift2_mmi,                                \
1712                  vc1_put_ver_16b_shift3_mmi };                              \
1713     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
1714          { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
1715                  OP ## vc1_hor_16b_shift2_mmi,                              \
1716                  OP ## vc1_hor_16b_shift3_mmi };                            \
1717     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
1718          { NULL, OP ## vc1_shift1_mmi,                                      \
1719                  OP ## vc1_shift2_mmi,                                      \
1720                  OP ## vc1_shift3_mmi };                                    \
1721                                                                             \
1722     if (vmode) { /* Vertical filter to apply */                             \
1723         if (hmode) { /* Horizontal filter to apply, output to tmp */        \
1724             static const int shift_value[] = { 0, 5, 1, 5 };                \
1725             int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
1726             int    r;                                                       \
1727             LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
1728                                                                             \
1729             r = (1<<(shift-1)) + rnd-1;                                     \
1730             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
1731                                                                             \
1732             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
1733             return;                                                         \
1734         }                                                                   \
1735         else { /* No horizontal filter, output 8 lines to dst */            \
1736             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
1737             return;                                                         \
1738         }                                                                   \
1739     }                                                                       \
1740                                                                             \
1741     /* Horizontal mode with no vertical mode */                             \
1742     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
1743 }                                                                           \
1744 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
1745                                   int stride, int hmode, int vmode, int rnd)\
1746 {                                                                           \
1747     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
1748     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
1749     dst += 8*stride; src += 8*stride;                                       \
1750     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
1751     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
1752 }
1753
1754 VC1_MSPEL_MC(put_)
1755 VC1_MSPEL_MC(avg_)
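/* Dispatch example: hmode=1, vmode=2 first runs vc1_put_ver_16b_shift2_mmi
 * into the 12x8 int16 tmp buffer, then put_/avg_vc1_hor_16b_shift1_mmi
 * from tmp+1 into dst; when only one of hmode/vmode is set, a single
 * 8-bit vc1_shift[123] kernel writes dst directly. */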
1756
1757 /** Macro to ease the declaration of the bicubic filter interpolation functions */
1758 #define DECLARE_FUNCTION(a, b)                                              \
1759 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
1760                                            const uint8_t *src,              \
1761                                            ptrdiff_t stride,                \
1762                                            int rnd)                         \
1763 {                                                                           \
1764      put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
1765 }                                                                           \
1766 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
1767                                            const uint8_t *src,              \
1768                                            ptrdiff_t stride,                \
1769                                            int rnd)                         \
1770 {                                                                           \
1771      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
1772 }                                                                           \
1773 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
1774                                               const uint8_t *src,           \
1775                                               ptrdiff_t stride,             \
1776                                               int rnd)                      \
1777 {                                                                           \
1778      put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
1779 }                                                                           \
1780 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
1781                                               const uint8_t *src,           \
1782                                               ptrdiff_t stride,             \
1783                                               int rnd)                      \
1784 {                                                                           \
1785      avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
1786 }
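/* Instantiate all 15 fractional (hmode, vmode) positions; (0, 0) is the
 * integer-pel case served by the mc00 wrappers above. */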
1787
1788 DECLARE_FUNCTION(0, 1)
1789 DECLARE_FUNCTION(0, 2)
1790 DECLARE_FUNCTION(0, 3)
1791
1792 DECLARE_FUNCTION(1, 0)
1793 DECLARE_FUNCTION(1, 1)
1794 DECLARE_FUNCTION(1, 2)
1795 DECLARE_FUNCTION(1, 3)
1796
1797 DECLARE_FUNCTION(2, 0)
1798 DECLARE_FUNCTION(2, 1)
1799 DECLARE_FUNCTION(2, 2)
1800 DECLARE_FUNCTION(2, 3)
1801
1802 DECLARE_FUNCTION(3, 0)
1803 DECLARE_FUNCTION(3, 1)
1804 DECLARE_FUNCTION(3, 2)
1805 DECLARE_FUNCTION(3, 3)
1806
1807 #define CHROMA_MC_8_MMI                                                     \
1808         "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
1809         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
1810         "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
1811         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
1812         "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
1813         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
1814         "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
1815         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
1816                                                                             \
1817         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
1818         "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
1819         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
1820         "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
1821         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
1822         "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
1823         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
1824         "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
1825                                                                             \
1826         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
1827         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
1828         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
1829         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
1830                                                                             \
1831         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
1832         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
1833         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
1834         "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
1835                                                                             \
1836         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
1837         "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
1838         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
1839
1840
1841 #define CHROMA_MC_4_MMI                                                     \
1842         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
1843         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
1844         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
1845         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
1846                                                                             \
1847         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
1848         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
1849         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
1850         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
1851                                                                             \
1852         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
1853         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
1854         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
1855         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
1856                                                                             \
1857         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
1858         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
1859
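/* Per output pixel, the CHROMA_MC_* kernels compute the no_rnd bilinear
 * blend (A*p00 + B*p01 + C*p10 + D*p11 + 28) >> 6, with ff_pw_28 as the
 * 32-4 rounder of the C reference. A scalar sketch (illustration only,
 * excluded from the build; the helper name is ours):
 */
#if 0
static void chroma_mc_sketch(uint8_t *dst, const uint8_t *src,
                             int stride, int h, int w, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++)
            dst[j] = (A * src[j] + B * src[j + 1] + C * src[j + stride] +
                      D * src[j + stride + 1] + 28) >> 6;
        dst += stride;
        src += stride;
    }
}
#endif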
1860
1861 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1862                                       uint8_t *src /* align 1 */,
1863                                       int stride, int h, int x, int y)
1864 {
1865     const int A = (8 - x) * (8 - y);
1866     const int B =     (x) * (8 - y);
1867     const int C = (8 - x) *     (y);
1868     const int D =     (x) *     (y);
1869     double ftmp[10];
1870     uint32_t tmp[1];
1871     DECLARE_VAR_ALL64;
1872     DECLARE_VAR_ADDRT;
1873
1874     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1875
1876     __asm__ volatile(
1877         "li         %[tmp0],    0x06                                    \n\t"
1878         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1879         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
1880         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1881         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1882         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1883         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1884
1885         "1:                                                             \n\t"
1886         MMI_ULDC1(%[ftmp1], %[src], 0x00)
1887         MMI_ULDC1(%[ftmp2], %[src], 0x01)
1888         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1889         MMI_ULDC1(%[ftmp3], %[src], 0x00)
1890         MMI_ULDC1(%[ftmp4], %[src], 0x01)
1891
1892         CHROMA_MC_8_MMI
1893
1894         MMI_SDC1(%[ftmp1], %[dst], 0x00)
1895         "addiu      %[h],       %[h],      -0x01                        \n\t"
1896         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
1897         "bnez       %[h],       1b                                      \n\t"
1898         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1899           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1900           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1901           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1902           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1903           RESTRICT_ASM_ALL64
1904           RESTRICT_ASM_ADDRT
1905           [tmp0]"=&r"(tmp[0]),
1906           [src]"+&r"(src),              [dst]"+&r"(dst),
1907           [h]"+&r"(h)
1908         : [stride]"r"((mips_reg)stride),
1909           [A]"f"(A),                    [B]"f"(B),
1910           [C]"f"(C),                    [D]"f"(D),
1911           [ff_pw_28]"f"(ff_pw_28)
1912         : "memory"
1913     );
1914 }
1915
1916 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
1917                                       uint8_t *src /* align 1 */,
1918                                       int stride, int h, int x, int y)
1919 {
1920     const int A = (8 - x) * (8 - y);
1921     const int B =     (x) * (8 - y);
1922     const int C = (8 - x) *     (y);
1923     const int D =     (x) *     (y);
1924     double ftmp[6];
1925     uint32_t tmp[1];
1926     DECLARE_VAR_LOW32;
1927     DECLARE_VAR_ADDRT;
1928
1929     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1930
1931     __asm__ volatile(
1932         "li         %[tmp0],    0x06                                    \n\t"
1933         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1934         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
1935         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1936         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1937         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1938         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1939
1940         "1:                                                             \n\t"
1941         MMI_ULWC1(%[ftmp1], %[src], 0x00)
1942         MMI_ULWC1(%[ftmp2], %[src], 0x01)
1943         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1944         MMI_ULWC1(%[ftmp3], %[src], 0x00)
1945         MMI_ULWC1(%[ftmp4], %[src], 0x01)
1946
1947         CHROMA_MC_4_MMI
1948
1949         MMI_SWC1(%[ftmp1], %[dst], 0x00)
1950         "addiu      %[h],       %[h],      -0x01                        \n\t"
1951         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
1952         "bnez       %[h],       1b                                      \n\t"
1953         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1954           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1955           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1956           [tmp0]"=&r"(tmp[0]),
1957           RESTRICT_ASM_LOW32
1958           RESTRICT_ASM_ADDRT
1959           [src]"+&r"(src),              [dst]"+&r"(dst),
1960           [h]"+&r"(h)
1961         : [stride]"r"((mips_reg)stride),
1962           [A]"f"(A),                    [B]"f"(B),
1963           [C]"f"(C),                    [D]"f"(D),
1964           [ff_pw_28]"f"(ff_pw_28)
1965         : "memory"
1966     );
1967 }
1968
1969 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1970                                       uint8_t *src /* align 1 */,
1971                                       int stride, int h, int x, int y)
1972 {
1973     const int A = (8 - x) * (8 - y);
1974     const int B =     (x) * (8 - y);
1975     const int C = (8 - x) *     (y);
1976     const int D =     (x) *     (y);
1977     double ftmp[10];
1978     uint32_t tmp[1];
1979     DECLARE_VAR_ALL64;
1980     DECLARE_VAR_ADDRT;
1981
1982     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1983
1984     __asm__ volatile(
1985         "li         %[tmp0],    0x06                                    \n\t"
1986         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1987         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
1988         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1989         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1990         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1991         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1992
1993         "1:                                                             \n\t"
1994         MMI_ULDC1(%[ftmp1], %[src], 0x00)
1995         MMI_ULDC1(%[ftmp2], %[src], 0x01)
1996         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1997         MMI_ULDC1(%[ftmp3], %[src], 0x00)
1998         MMI_ULDC1(%[ftmp4], %[src], 0x01)
1999
2000         CHROMA_MC_8_MMI
2001
2002         MMI_LDC1(%[ftmp2], %[dst], 0x00)
2003         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2004
2005         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2006         "addiu      %[h],       %[h],      -0x01                        \n\t"
2007         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2008         "bnez       %[h],       1b                                      \n\t"
2009         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2010           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2011           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2012           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2013           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2014           [tmp0]"=&r"(tmp[0]),
2015           RESTRICT_ASM_ALL64
2016           RESTRICT_ASM_ADDRT
2017           [src]"+&r"(src),              [dst]"+&r"(dst),
2018           [h]"+&r"(h)
2019         : [stride]"r"((mips_reg)stride),
2020           [A]"f"(A),                    [B]"f"(B),
2021           [C]"f"(C),                    [D]"f"(D),
2022           [ff_pw_28]"f"(ff_pw_28)
2023         : "memory"
2024     );
2025 }
2026
2027 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2028                                       uint8_t *src /* align 1 */,
2029                                       int stride, int h, int x, int y)
2030 {
2031     const int A = (8 - x) * (8 - y);
2032     const int B = (    x) * (8 - y);
2033     const int C = (8 - x) * (    y);
2034     const int D = (    x) * (    y);
2035     double ftmp[6];
2036     uint32_t tmp[1];
2037     DECLARE_VAR_LOW32;
2038     DECLARE_VAR_ADDRT;
2039
2040     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2041
2042     __asm__ volatile(
2043         "li         %[tmp0],    0x06                                    \n\t"
2044         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2045         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2046         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2047         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2048         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2049         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2050
2051         "1:                                                             \n\t"
2052         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2053         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2054         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2055         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2056         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2057
2058         CHROMA_MC_4_MMI
2059
2060         MMI_LWC1(%[ftmp2], %[dst], 0x00)
2061         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2062
2063         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2064         "addiu      %[h],       %[h],      -0x01                        \n\t"
2065         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2066         "bnez       %[h],       1b                                      \n\t"
2067         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2068           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2069           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2070           [tmp0]"=&r"(tmp[0]),
2071           RESTRICT_ASM_LOW32
2072           RESTRICT_ASM_ADDRT
2073           [src]"+&r"(src),              [dst]"+&r"(dst),
2074           [h]"+&r"(h)
2075         : [stride]"r"((mips_reg)stride),
2076           [A]"f"(A),                    [B]"f"(B),
2077           [C]"f"(C),                    [D]"f"(D),
2078           [ff_pw_28]"f"(ff_pw_28)
2079         : "memory"
2080     );
2081 }