git.sesse.net Git - ffmpeg/blob - libavcodec/mips/h264dsp_mmi.c
Merge commit 'e93aa2c9e7b3599aee6a5820760fc1a2c629dea0'
[ffmpeg] / libavcodec / mips / h264dsp_mmi.c
1 /*
2  * Loongson SIMD optimized h264dsp
3  *
4  * Copyright (c) 2015 Loongson Technology Corporation Limited
5  * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
7  *                    Heiher <r@hev.cc>
8  *
9  * This file is part of FFmpeg.
10  *
11  * FFmpeg is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * FFmpeg is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with FFmpeg; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24  */
25
26 #include "libavcodec/bit_depth_template.c"
27 #include "h264dsp_mips.h"
28
29 void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
30 {
31     __asm__ volatile (
32         "xor $f0, $f0, $f0              \r\n"
33         "ldc1 $f2, 0(%[src])            \r\n"
34         "ldc1 $f4, 8(%[src])            \r\n"
35         "ldc1 $f6, 16(%[src])           \r\n"
36         "ldc1 $f8, 24(%[src])           \r\n"
37         "lwc1 $f10, 0(%[dst0])          \r\n"
38         "lwc1 $f12, 0(%[dst1])          \r\n"
39         "lwc1 $f14, 0(%[dst2])          \r\n"
40         "lwc1 $f16, 0(%[dst3])          \r\n"
41         "punpcklbh $f10, $f10, $f0      \r\n"
42         "punpcklbh $f12, $f12, $f0      \r\n"
43         "punpcklbh $f14, $f14, $f0      \r\n"
44         "punpcklbh $f16, $f16, $f0      \r\n"
45         "paddh $f2, $f2, $f10           \r\n"
46         "paddh $f4, $f4, $f12           \r\n"
47         "paddh $f6, $f6, $f14           \r\n"
48         "paddh $f8, $f8, $f16           \r\n"
49         "packushb $f2, $f2, $f0         \r\n"
50         "packushb $f4, $f4, $f0         \r\n"
51         "packushb $f6, $f6, $f0         \r\n"
52         "packushb $f8, $f8, $f0         \r\n"
53         "swc1 $f2, 0(%[dst0])           \r\n"
54         "swc1 $f4, 0(%[dst1])           \r\n"
55         "swc1 $f6, 0(%[dst2])           \r\n"
56         "swc1 $f8, 0(%[dst3])           \r\n"
57         ::[dst0]"r"(dst),[dst1]"r"(dst+stride),[dst2]"r"(dst+2*stride),
58           [dst3]"r"(dst+3*stride),[src]"r"(src)
59         : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16"
60     );
61
62     memset(src, 0, 32);
63 }
64
65 void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
66 {
67     __asm__ volatile (
68         "dli $8, 1                              \r\n"
69         "ldc1 $f0, 0(%[block])                  \r\n"
70         "dmtc1 $8, $f16                         \r\n"
71         "ldc1 $f2, 8(%[block])                  \r\n"
72         "dli $8, 6                              \r\n"
73         "ldc1 $f4, 16(%[block])                 \r\n"
74         "dmtc1 $8, $f18                         \r\n"
75         "psrah $f8, $f2, $f16                   \r\n"
76         "ldc1 $f6, 24(%[block])                 \r\n"
77         "psrah $f10, $f6, $f16                  \r\n"
78         "psubh $f8, $f8, $f6                    \r\n"
79         "paddh $f10, $f10, $f2                  \r\n"
80         "paddh $f20, $f4, $f0                   \r\n"
81         "psubh $f0, $f0, $f4                    \r\n"
82         "paddh $f22, $f10, $f20                 \r\n"
83         "psubh $f4, $f20, $f10                  \r\n"
84         "paddh $f20, $f8, $f0                   \r\n"
85         "psubh $f0, $f0, $f8                    \r\n"
86         "punpckhhw $f2, $f22, $f20              \r\n"
87         "punpcklhw $f10, $f22, $f20             \r\n"
88         "punpckhhw $f8, $f0, $f4                \r\n"
89         "punpcklhw $f0, $f0, $f4                \r\n"
90         "punpckhwd $f4, $f10, $f0               \r\n"
91         "punpcklwd $f10, $f10, $f0              \r\n"
92         "punpcklwd $f20, $f2, $f8               \r\n"
93         "punpckhwd $f0, $f2, $f8                \r\n"
94         "paddh $f10, $f10, %[ff_pw_32]          \r\n"
95         "psrah $f8, $f4, $f16                   \r\n"
96         "psrah $f6, $f0, $f16                   \r\n"
97         "psubh $f8, $f8, $f0                    \r\n"
98         "paddh $f6, $f6, $f4                    \r\n"
99         "paddh $f2, $f20, $f10                  \r\n"
100         "psubh $f10, $f10, $f20                 \r\n"
101         "paddh $f20, $f6, $f2                   \r\n"
102         "psubh $f2, $f2, $f6                    \r\n"
103         "paddh $f22, $f8, $f10                  \r\n"
104         "xor $f14, $f14, $f14                   \r\n"
105         "psubh $f10, $f10, $f8                  \r\n"
106         "sdc1 $f14, 0(%[block])                 \r\n"
107         "sdc1 $f14, 8(%[block])                 \r\n"
108         "sdc1 $f14, 16(%[block])                \r\n"
109         "sdc1 $f14, 24(%[block])                \r\n"
110         "lwc1 $f4, 0(%[dst])                    \r\n"
111         "psrah $f6, $f20, $f18                  \r\n"
112         "gslwxc1 $f0, 0(%[dst], %[stride])      \r\n"
113         "psrah $f8, $f22, $f18                  \r\n"
114         "punpcklbh $f4, $f4, $f14               \r\n"
115         "punpcklbh $f0, $f0, $f14               \r\n"
116         "paddh $f4, $f4, $f6                    \r\n"
117         "paddh $f0, $f0, $f8                    \r\n"
118         "packushb $f4, $f4, $f14                \r\n"
119         "packushb $f0, $f0, $f14                \r\n"
120         "swc1 $f4, 0(%[dst])                    \r\n"
121         "gsswxc1 $f0, 0(%[dst], %[stride])      \r\n"
122         "daddu %[dst], %[dst], %[stride]        \r\n"
123         "daddu %[dst], %[dst], %[stride]        \r\n"
124         "lwc1 $f4, 0(%[dst])                    \r\n"
125         "psrah $f10, $f10, $f18                 \r\n"
126         "gslwxc1 $f0, 0(%[dst], %[stride])      \r\n"
127         "psrah $f2, $f2, $f18                   \r\n"
128         "punpcklbh $f4, $f4, $f14               \r\n"
129         "punpcklbh $f0, $f0, $f14               \r\n"
130         "paddh $f4, $f4, $f10                   \r\n"
131         "paddh $f0, $f0, $f2                    \r\n"
132         "packushb $f4, $f4, $f14                \r\n"
133         "swc1 $f4, 0(%[dst])                    \r\n"
134         "packushb $f0, $f0, $f14                \r\n"
135         "gsswxc1 $f0, 0(%[dst], %[stride])      \r\n"
136         ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride),
137           [ff_pw_32]"f"(ff_pw_32)
138         : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
139           "$f18","$f20","$f22"
140     );
141
142     memset(block, 0, 32);
143 }
144
145 void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
146 {
147     __asm__ volatile (
148         "lhu $10, 0x0(%[block])                     \r\n"
149         "daddiu $29, $29, -0x20                     \r\n"
150         "daddiu $10, $10, 0x20                      \r\n"
151         "ldc1 $f2, 0x10(%[block])                   \r\n"
152         "sh $10, 0x0(%[block])                      \r\n"
153         "ldc1 $f4, 0x20(%[block])                   \r\n"
154         "dli $10, 0x1                               \r\n"
155         "ldc1 $f6, 0x30(%[block])                   \r\n"
156         "dmtc1 $10, $f16                            \r\n"
157         "ldc1 $f10, 0x50(%[block])                  \r\n"
158         "ldc1 $f12, 0x60(%[block])                  \r\n"
159         "ldc1 $f14, 0x70(%[block])                  \r\n"
160         "mov.d $f0, $f2                             \r\n"
161         "psrah $f2, $f2, $f16                       \r\n"
162         "psrah $f8, $f10, $f16                      \r\n"
163         "paddh $f2, $f2, $f0                        \r\n"
164         "paddh $f8, $f8, $f10                       \r\n"
165         "paddh $f2, $f2, $f10                       \r\n"
166         "paddh $f8, $f8, $f14                       \r\n"
167         "paddh $f2, $f2, $f6                        \r\n"
168         "psubh $f8, $f8, $f0                        \r\n"
169         "psubh $f0, $f0, $f6                        \r\n"
170         "psubh $f10, $f10, $f6                      \r\n"
171         "psrah $f6, $f6, $f16                       \r\n"
172         "paddh $f0, $f0, $f14                       \r\n"
173         "psubh $f10, $f10, $f14                     \r\n"
174         "psrah $f14, $f14, $f16                     \r\n"
175         "psubh $f0, $f0, $f6                        \r\n"
176         "dli $10, 0x2                               \r\n"
177         "psubh $f10, $f10, $f14                     \r\n"
178         "dmtc1 $10, $f18                            \r\n"
179         "mov.d $f14, $f2                            \r\n"
180         "psrah $f2, $f2, $f18                       \r\n"
181         "psrah $f6, $f8, $f18                       \r\n"
182         "paddh $f6, $f6, $f0                        \r\n"
183         "psrah $f0, $f0, $f18                       \r\n"
184         "paddh $f2, $f2, $f10                       \r\n"
185         "psrah $f10, $f10, $f18                     \r\n"
186         "psubh $f0, $f0, $f8                        \r\n"
187         "psubh $f14, $f14, $f10                     \r\n"
188         "mov.d $f10, $f12                           \r\n"
189         "psrah $f12, $f12, $f16                     \r\n"
190         "psrah $f8, $f4, $f16                       \r\n"
191         "paddh $f12, $f12, $f4                      \r\n"
192         "psubh $f8, $f8, $f10                       \r\n"
193         "ldc1 $f4, 0x0(%[block])                    \r\n"
194         "ldc1 $f10, 0x40(%[block])                  \r\n"
195         "paddh $f10, $f10, $f4                      \r\n"
196         "paddh $f4, $f4, $f4                        \r\n"
197         "paddh $f12, $f12, $f10                     \r\n"
198         "psubh $f4, $f4, $f10                       \r\n"
199         "paddh $f10, $f10, $f10                     \r\n"
200         "paddh $f8, $f8, $f4                        \r\n"
201         "psubh $f10, $f10, $f12                     \r\n"
202         "paddh $f4, $f4, $f4                        \r\n"
203         "paddh $f14, $f14, $f12                     \r\n"
204         "psubh $f4, $f4, $f8                        \r\n"
205         "paddh $f12, $f12, $f12                     \r\n"
206         "paddh $f0, $f0, $f8                        \r\n"
207         "psubh $f12, $f12, $f14                     \r\n"
208         "paddh $f8, $f8, $f8                        \r\n"
209         "paddh $f6, $f6, $f4                        \r\n"
210         "psubh $f8, $f8, $f0                        \r\n"
211         "paddh $f4, $f4, $f4                        \r\n"
212         "paddh $f2, $f2, $f10                       \r\n"
213         "psubh $f4, $f4, $f6                        \r\n"
214         "paddh $f10, $f10, $f10                     \r\n"
215         "sdc1 $f12, 0x0(%[block])                   \r\n"
216         "psubh $f10, $f10, $f2                      \r\n"
217         "punpckhhw $f12, $f14, $f0                  \r\n"
218         "punpcklhw $f14, $f14, $f0                  \r\n"
219         "punpckhhw $f0, $f6, $f2                    \r\n"
220         "punpcklhw $f6, $f6, $f2                    \r\n"
221         "punpckhwd $f2, $f14, $f6                   \r\n"
222         "punpcklwd $f14, $f14, $f6                  \r\n"
223         "punpckhwd $f6, $f12, $f0                   \r\n"
224         "punpcklwd $f12, $f12, $f0                  \r\n"
225         "ldc1 $f0, 0x0(%[block])                    \r\n"
226         "sdc1 $f14, 0x0($29)                        \r\n"
227         "sdc1 $f2, 0x10($29)                        \r\n"
228         "dmfc1 $8, $f12                             \r\n"
229         "dmfc1 $11, $f6                             \r\n"
230         "punpckhhw $f6, $f10, $f4                   \r\n"
231         "punpcklhw $f10, $f10, $f4                  \r\n"
232         "punpckhhw $f4, $f8, $f0                    \r\n"
233         "punpcklhw $f8, $f8, $f0                    \r\n"
234         "punpckhwd $f0, $f10, $f8                   \r\n"
235         "punpcklwd $f10, $f10, $f8                  \r\n"
236         "punpckhwd $f8, $f6, $f4                    \r\n"
237         "punpcklwd $f6, $f6, $f4                    \r\n"
238         "sdc1 $f10, 0x8($29)                        \r\n"
239         "sdc1 $f0, 0x18($29)                        \r\n"
240         "dmfc1 $9, $f6                              \r\n"
241         "dmfc1 $12, $f8                             \r\n"
242         "ldc1 $f2, 0x18(%[block])                   \r\n"
243         "ldc1 $f12, 0x28(%[block])                  \r\n"
244         "ldc1 $f4, 0x38(%[block])                   \r\n"
245         "ldc1 $f0, 0x58(%[block])                   \r\n"
246         "ldc1 $f6, 0x68(%[block])                   \r\n"
247         "ldc1 $f8, 0x78(%[block])                   \r\n"
248         "mov.d $f14, $f2                            \r\n"
249         "psrah $f10, $f0, $f16                      \r\n"
250         "psrah $f2, $f2, $f16                       \r\n"
251         "paddh $f10, $f10, $f0                      \r\n"
252         "paddh $f2, $f2, $f14                       \r\n"
253         "paddh $f10, $f10, $f8                      \r\n"
254         "paddh $f2, $f2, $f0                        \r\n"
255         "psubh $f10, $f10, $f14                     \r\n"
256         "paddh $f2, $f2, $f4                        \r\n"
257         "psubh $f14, $f14, $f4                      \r\n"
258         "psubh $f0, $f0, $f4                        \r\n"
259         "psrah $f4, $f4, $f16                       \r\n"
260         "paddh $f14, $f14, $f8                      \r\n"
261         "psubh $f0, $f0, $f8                        \r\n"
262         "psrah $f8, $f8, $f16                       \r\n"
263         "psubh $f14, $f14, $f4                      \r\n"
264         "psubh $f0, $f0, $f8                        \r\n"
265         "mov.d $f8, $f2                             \r\n"
266         "psrah $f4, $f10, $f18                      \r\n"
267         "psrah $f2, $f2, $f18                       \r\n"
268         "paddh $f4, $f4, $f14                       \r\n"
269         "psrah $f14, $f14, $f18                     \r\n"
270         "paddh $f2, $f2, $f0                        \r\n"
271         "psrah $f0, $f0, $f18                       \r\n"
272         "psubh $f14, $f14, $f10                     \r\n"
273         "psubh $f8, $f8, $f0                        \r\n"
274         "mov.d $f0, $f6                             \r\n"
275         "psrah $f6, $f6, $f16                       \r\n"
276         "psrah $f10, $f12, $f16                     \r\n"
277         "paddh $f6, $f6, $f12                       \r\n"
278         "psubh $f10, $f10, $f0                      \r\n"
279         "ldc1 $f12, 0x8(%[block])                   \r\n"
280         "ldc1 $f0, 0x48(%[block])                   \r\n"
281         "paddh $f0, $f0, $f12                       \r\n"
282         "paddh $f12, $f12, $f12                     \r\n"
283         "paddh $f6, $f6, $f0                        \r\n"
284         "psubh $f12, $f12, $f0                      \r\n"
285         "paddh $f0, $f0, $f0                        \r\n"
286         "paddh $f10, $f10, $f12                     \r\n"
287         "psubh $f0, $f0, $f6                        \r\n"
288         "paddh $f12, $f12, $f12                     \r\n"
289         "paddh $f8, $f8, $f6                        \r\n"
290         "psubh $f12, $f12, $f10                     \r\n"
291         "paddh $f6, $f6, $f6                        \r\n"
292         "paddh $f14, $f14, $f10                     \r\n"
293         "psubh $f6, $f6, $f8                        \r\n"
294         "paddh $f10, $f10, $f10                     \r\n"
295         "paddh $f4, $f4, $f12                       \r\n"
296         "psubh $f10, $f10, $f14                     \r\n"
297         "paddh $f12, $f12, $f12                     \r\n"
298         "paddh $f2, $f2, $f0                        \r\n"
299         "psubh $f12, $f12, $f4                      \r\n"
300         "paddh $f0, $f0, $f0                        \r\n"
301         "sdc1 $f6, 0x8(%[block])                    \r\n"
302         "psubh $f0, $f0, $f2                        \r\n"
303         "punpckhhw $f6, $f8, $f14                   \r\n"
304         "punpcklhw $f8, $f8, $f14                   \r\n"
305         "punpckhhw $f14, $f4, $f2                   \r\n"
306         "punpcklhw $f4, $f4, $f2                    \r\n"
307         "punpckhwd $f2, $f8, $f4                    \r\n"
308         "punpcklwd $f8, $f8, $f4                    \r\n"
309         "punpckhwd $f4, $f6, $f14                   \r\n"
310         "punpcklwd $f6, $f6, $f14                   \r\n"
311         "ldc1 $f14, 0x8(%[block])                   \r\n"
312         "dmfc1 $13, $f8                             \r\n"
313         "dmfc1 $15, $f2                             \r\n"
314         "mov.d $f24, $f6                            \r\n"
315         "mov.d $f28, $f4                            \r\n"
316         "punpckhhw $f4, $f0, $f12                   \r\n"
317         "punpcklhw $f0, $f0, $f12                   \r\n"
318         "punpckhhw $f12, $f10, $f14                 \r\n"
319         "punpcklhw $f10, $f10, $f14                 \r\n"
320         "punpckhwd $f14, $f0, $f10                  \r\n"
321         "punpcklwd $f0, $f0, $f10                   \r\n"
322         "punpckhwd $f10, $f4, $f12                  \r\n"
323         "punpcklwd $f4, $f4, $f12                   \r\n"
324         "dmfc1 $14, $f0                             \r\n"
325         "mov.d $f22, $f14                           \r\n"
326         "mov.d $f26, $f4                            \r\n"
327         "mov.d $f30, $f10                           \r\n"
328         "daddiu $10, %[dst], 0x4                    \r\n"
329         "dmtc1 $15, $f14                            \r\n"
330         "dmtc1 $11, $f12                            \r\n"
331         "ldc1 $f2, 0x10($29)                        \r\n"
332         "dmtc1 $8, $f6                              \r\n"
333         "mov.d $f8, $f2                             \r\n"
334         "psrah $f2, $f2, $f16                       \r\n"
335         "psrah $f0, $f14, $f16                      \r\n"
336         "paddh $f2, $f2, $f8                        \r\n"
337         "paddh $f0, $f0, $f14                       \r\n"
338         "paddh $f2, $f2, $f14                       \r\n"
339         "paddh $f0, $f0, $f28                       \r\n"
340         "paddh $f2, $f2, $f12                       \r\n"
341         "psubh $f0, $f0, $f8                        \r\n"
342         "psubh $f8, $f8, $f12                       \r\n"
343         "psubh $f14, $f14, $f12                     \r\n"
344         "psrah $f12, $f12, $f16                     \r\n"
345         "paddh $f8, $f8, $f28                       \r\n"
346         "psubh $f14, $f14, $f28                     \r\n"
347         "psrah $f10, $f28, $f16                     \r\n"
348         "psubh $f8, $f8, $f12                       \r\n"
349         "psubh $f14, $f14, $f10                     \r\n"
350         "mov.d $f10, $f2                            \r\n"
351         "psrah $f2, $f2, $f18                       \r\n"
352         "psrah $f12, $f0, $f18                      \r\n"
353         "paddh $f2, $f2, $f14                       \r\n"
354         "paddh $f12, $f12, $f8                      \r\n"
355         "psrah $f8, $f8, $f18                       \r\n"
356         "psrah $f14, $f14, $f18                     \r\n"
357         "psubh $f8, $f8, $f0                        \r\n"
358         "psubh $f10, $f10, $f14                     \r\n"
359         "mov.d $f14, $f24                           \r\n"
360         "psrah $f4, $f24, $f16                      \r\n"
361         "psrah $f0, $f6, $f16                       \r\n"
362         "paddh $f4, $f4, $f6                        \r\n"
363         "psubh $f0, $f0, $f14                       \r\n"
364         "ldc1 $f6, 0x0($29)                         \r\n"
365         "dmtc1 $13, $f14                            \r\n"
366         "paddh $f14, $f14, $f6                      \r\n"
367         "paddh $f6, $f6, $f6                        \r\n"
368         "paddh $f4, $f4, $f14                       \r\n"
369         "psubh $f6, $f6, $f14                       \r\n"
370         "paddh $f14, $f14, $f14                     \r\n"
371         "paddh $f0, $f0, $f6                        \r\n"
372         "psubh $f14, $f14, $f4                      \r\n"
373         "paddh $f6, $f6, $f6                        \r\n"
374         "paddh $f10, $f10, $f4                      \r\n"
375         "psubh $f6, $f6, $f0                        \r\n"
376         "paddh $f4, $f4, $f4                        \r\n"
377         "paddh $f8, $f8, $f0                        \r\n"
378         "psubh $f4, $f4, $f10                       \r\n"
379         "paddh $f0, $f0, $f0                        \r\n"
380         "paddh $f12, $f12, $f6                      \r\n"
381         "psubh $f0, $f0, $f8                        \r\n"
382         "paddh $f6, $f6, $f6                        \r\n"
383         "paddh $f2, $f2, $f14                       \r\n"
384         "psubh $f6, $f6, $f12                       \r\n"
385         "paddh $f14, $f14, $f14                     \r\n"
386         "sdc1 $f6, 0x0($29)                         \r\n"
387         "psubh $f14, $f14, $f2                      \r\n"
388         "sdc1 $f0, 0x10($29)                        \r\n"
389         "dmfc1 $8, $f4                              \r\n"
390         "xor $f4, $f4, $f4                          \r\n"
391         "sdc1 $f4, 0x0(%[block])                    \r\n"
392         "sdc1 $f4, 0x8(%[block])                    \r\n"
393         "sdc1 $f4, 0x10(%[block])                   \r\n"
394         "sdc1 $f4, 0x18(%[block])                   \r\n"
395         "sdc1 $f4, 0x20(%[block])                   \r\n"
396         "sdc1 $f4, 0x28(%[block])                   \r\n"
397         "sdc1 $f4, 0x30(%[block])                   \r\n"
398         "sdc1 $f4, 0x38(%[block])                   \r\n"
399         "sdc1 $f4, 0x40(%[block])                   \r\n"
400         "sdc1 $f4, 0x48(%[block])                   \r\n"
401         "sdc1 $f4, 0x50(%[block])                   \r\n"
402         "sdc1 $f4, 0x58(%[block])                   \r\n"
403         "sdc1 $f4, 0x60(%[block])                   \r\n"
404         "sdc1 $f4, 0x68(%[block])                   \r\n"
405         "sdc1 $f4, 0x70(%[block])                   \r\n"
406         "sdc1 $f4, 0x78(%[block])                   \r\n"
407         "dli $11, 0x6                               \r\n"
408         "lwc1 $f6, 0x0(%[dst])                      \r\n"
409         "dmtc1 $11, $f20                            \r\n"
410         "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
411         "psrah $f10, $f10, $f20                     \r\n"
412         "psrah $f8, $f8, $f20                       \r\n"
413         "punpcklbh $f6, $f6, $f4                    \r\n"
414         "punpcklbh $f0, $f0, $f4                    \r\n"
415         "paddh $f6, $f6, $f10                       \r\n"
416         "paddh $f0, $f0, $f8                        \r\n"
417         "packushb $f6, $f6, $f4                     \r\n"
418         "packushb $f0, $f0, $f4                     \r\n"
419         "swc1 $f6, 0x0(%[dst])                      \r\n"
420         "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
421         "daddu %[dst], %[dst], %[stride]            \r\n"
422         "daddu %[dst], %[dst], %[stride]            \r\n"
423         "lwc1 $f6, 0x0(%[dst])                      \r\n"
424         "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
425         "psrah $f12, $f12, $f20                     \r\n"
426         "psrah $f2, $f2, $f20                       \r\n"
427         "punpcklbh $f6, $f6, $f4                    \r\n"
428         "punpcklbh $f0, $f0, $f4                    \r\n"
429         "paddh $f6, $f6, $f12                       \r\n"
430         "paddh $f0, $f0, $f2                        \r\n"
431         "packushb $f6, $f6, $f4                     \r\n"
432         "packushb $f0, $f0, $f4                     \r\n"
433         "swc1 $f6, 0x0(%[dst])                      \r\n"
434         "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
435         "ldc1 $f10, 0x0($29)                        \r\n"
436         "ldc1 $f8, 0x10($29)                        \r\n"
437         "dmtc1 $8, $f12                             \r\n"
438         "daddu %[dst], %[dst], %[stride]            \r\n"
439         "daddu %[dst], %[dst], %[stride]            \r\n"
440         "lwc1 $f6, 0x0(%[dst])                      \r\n"
441         "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
442         "psrah $f14, $f14, $f20                     \r\n"
443         "psrah $f10, $f10, $f20                     \r\n"
444         "punpcklbh $f6, $f6, $f4                    \r\n"
445         "punpcklbh $f0, $f0, $f4                    \r\n"
446         "paddh $f6, $f6, $f14                       \r\n"
447         "paddh $f0, $f0, $f10                       \r\n"
448         "packushb $f6, $f6, $f4                     \r\n"
449         "packushb $f0, $f0, $f4                     \r\n"
450         "swc1 $f6, 0x0(%[dst])                      \r\n"
451         "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
452         "daddu %[dst], %[dst], %[stride]            \r\n"
453         "daddu %[dst], %[dst], %[stride]            \r\n"
454         "lwc1 $f6, 0x0(%[dst])                      \r\n"
455         "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
456         "psrah $f8, $f8, $f20                       \r\n"
457         "psrah $f12, $f12, $f20                     \r\n"
458         "punpcklbh $f6, $f6, $f4                    \r\n"
459         "punpcklbh $f0, $f0, $f4                    \r\n"
460         "paddh $f6, $f6, $f8                        \r\n"
461         "paddh $f0, $f0, $f12                       \r\n"
462         "packushb $f6, $f6, $f4                     \r\n"
463         "packushb $f0, $f0, $f4                     \r\n"
464         "swc1 $f6, 0x0(%[dst])                      \r\n"
465         "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
466         "dmtc1 $12, $f2                             \r\n"
467         "dmtc1 $9, $f12                             \r\n"
468         "ldc1 $f8, 0x18($29)                        \r\n"
469         "mov.d $f10, $f8                            \r\n"
470         "psrah $f8, $f8, $f16                       \r\n"
471         "psrah $f14, $f22, $f16                     \r\n"
472         "paddh $f14, $f14, $f22                     \r\n"
473         "paddh $f8, $f8, $f10                       \r\n"
474         "paddh $f14, $f14, $f30                     \r\n"
475         "paddh $f8, $f8, $f22                       \r\n"
476         "psubh $f14, $f14, $f10                     \r\n"
477         "paddh $f8, $f8, $f2                        \r\n"
478         "psubh $f10, $f10, $f2                      \r\n"
479         "psubh $f6, $f22, $f2                       \r\n"
480         "psrah $f2, $f2, $f16                       \r\n"
481         "paddh $f10, $f10, $f30                     \r\n"
482         "psubh $f6, $f6, $f30                       \r\n"
483         "psrah $f4, $f30, $f16                      \r\n"
484         "psubh $f10, $f10, $f2                      \r\n"
485         "psubh $f6, $f6, $f4                        \r\n"
486         "mov.d $f4, $f8                             \r\n"
487         "psrah $f8, $f8, $f18                       \r\n"
488         "psrah $f2, $f14, $f18                      \r\n"
489         "paddh $f8, $f8, $f6                        \r\n"
490         "paddh $f2, $f2, $f10                       \r\n"
491         "psrah $f10, $f10, $f18                     \r\n"
492         "psrah $f6, $f6, $f18                       \r\n"
493         "psubh $f10, $f10, $f14                     \r\n"
494         "psubh $f4, $f4, $f6                        \r\n"
495         "mov.d $f6, $f26                            \r\n"
496         "psrah $f0, $f26, $f16                      \r\n"
497         "psrah $f14, $f12, $f16                     \r\n"
498         "paddh $f0, $f0, $f12                       \r\n"
499         "psubh $f14, $f14, $f6                      \r\n"
500         "ldc1 $f12, 0x8($29)                        \r\n"
501         "dmtc1 $14, $f6                             \r\n"
502         "paddh $f6, $f6, $f12                       \r\n"
503         "paddh $f12, $f12, $f12                     \r\n"
504         "paddh $f0, $f0, $f6                        \r\n"
505         "psubh $f12, $f12, $f6                      \r\n"
506         "paddh $f6, $f6, $f6                        \r\n"
507         "paddh $f14, $f14, $f12                     \r\n"
508         "psubh $f6, $f6, $f0                        \r\n"
509         "paddh $f12, $f12, $f12                     \r\n"
510         "paddh $f4, $f4, $f0                        \r\n"
511         "psubh $f12, $f12, $f14                     \r\n"
512         "paddh $f0, $f0, $f0                        \r\n"
513         "paddh $f10, $f10, $f14                     \r\n"
514         "psubh $f0, $f0, $f4                        \r\n"
515         "paddh $f14, $f14, $f14                     \r\n"
516         "paddh $f2, $f2, $f12                       \r\n"
517         "psubh $f14, $f14, $f10                     \r\n"
518         "paddh $f12, $f12, $f12                     \r\n"
519         "paddh $f8, $f8, $f6                        \r\n"
520         "psubh $f12, $f12, $f2                      \r\n"
521         "paddh $f6, $f6, $f6                        \r\n"
522         "sdc1 $f12, 0x8($29)                        \r\n"
523         "psubh $f6, $f6, $f8                        \r\n"
524         "sdc1 $f14, 0x18($29)                       \r\n"
525         "dmfc1 $9, $f0                              \r\n"
526         "xor $f0, $f0, $f0                          \r\n"
527         "lwc1 $f12, 0x0($10)                        \r\n"
528         "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
529         "psrah $f4, $f4, $f20                       \r\n"
530         "psrah $f10, $f10, $f20                     \r\n"
531         "punpcklbh $f12, $f12, $f0                  \r\n"
532         "punpcklbh $f14, $f14, $f0                  \r\n"
533         "paddh $f12, $f12, $f4                      \r\n"
534         "paddh $f14, $f14, $f10                     \r\n"
535         "packushb $f12, $f12, $f0                   \r\n"
536         "packushb $f14, $f14, $f0                   \r\n"
537         "swc1 $f12, 0x0($10)                        \r\n"
538         "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
539         "daddu $10, $10, %[stride]                  \r\n"
540         "daddu $10, $10, %[stride]                  \r\n"
541         "lwc1 $f12, 0x0($10)                        \r\n"
542         "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
543         "psrah $f2, $f2, $f20                       \r\n"
544         "psrah $f8, $f8, $f20                       \r\n"
545         "punpcklbh $f12, $f12, $f0                  \r\n"
546         "punpcklbh $f14, $f14, $f0                  \r\n"
547         "paddh $f12, $f12, $f2                      \r\n"
548         "paddh $f14, $f14, $f8                      \r\n"
549         "packushb $f12, $f12, $f0                   \r\n"
550         "packushb $f14, $f14, $f0                   \r\n"
551         "swc1 $f12, 0x0($10)                        \r\n"
552         "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
553         "ldc1 $f4, 0x8($29)                         \r\n"
554         "ldc1 $f10, 0x18($29)                       \r\n"
555         "daddu $10, $10, %[stride]                  \r\n"
556         "dmtc1 $9, $f2                              \r\n"
557         "daddu $10, $10, %[stride]                  \r\n"
558         "lwc1 $f12, 0x0($10)                        \r\n"
559         "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
560         "psrah $f6, $f6, $f20                       \r\n"
561         "psrah $f4, $f4, $f20                       \r\n"
562         "punpcklbh $f12, $f12, $f0                  \r\n"
563         "punpcklbh $f14, $f14, $f0                  \r\n"
564         "paddh $f12, $f12, $f6                      \r\n"
565         "paddh $f14, $f14, $f4                      \r\n"
566         "packushb $f12, $f12, $f0                   \r\n"
567         "packushb $f14, $f14, $f0                   \r\n"
568         "swc1 $f12, 0x0($10)                        \r\n"
569         "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
570         "daddu $10, $10, %[stride]                  \r\n"
571         "daddu $10, $10, %[stride]                  \r\n"
572         "lwc1 $f12, 0x0($10)                        \r\n"
573         "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
574         "psrah $f10, $f10, $f20                     \r\n"
575         "psrah $f2, $f2, $f20                       \r\n"
576         "punpcklbh $f12, $f12, $f0                  \r\n"
577         "punpcklbh $f14, $f14, $f0                  \r\n"
578         "paddh $f12, $f12, $f10                     \r\n"
579         "paddh $f14, $f14, $f2                      \r\n"
580         "packushb $f12, $f12, $f0                   \r\n"
581         "packushb $f14, $f14, $f0                   \r\n"
582         "swc1 $f12, 0x0($10)                        \r\n"
583         "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
584         "daddiu $29, $29, 0x20                      \r\n"
585         ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride)
586         :"$8","$9","$10","$11","$12","$13","$14","$15","$29","$f0","$f2","$f4",
587          "$f8","$f10","$f12","$f14","$f16","$f18","$f20","$f22","$f24","$f26",
588          "$f28","$f30"
589     );
590
591     memset(block, 0, 128);
592 }
593
594 void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
595 {
596     __asm__ volatile (
597         "lh $8, 0x0(%[block])                       \r\n"
598         "sd $0, 0x0(%[block])                       \r\n"
599         "daddiu $8, $8, 0x20                        \r\n"
600         "daddu $10, %[stride], %[stride]            \r\n"
601         "dsra $8, $8, 0x6                           \r\n"
602         "xor $f2, $f2, $f2                          \r\n"
603         "mtc1 $8, $f0                               \r\n"
604         "pshufh $f0, $f0, $f2                       \r\n"
605         "daddu $8, $10, %[stride]                   \r\n"
606         "psubh $f2, $f2, $f0                        \r\n"
607         "packushb $f0, $f0, $f0                     \r\n"
608         "packushb $f2, $f2, $f2                     \r\n"
609         "lwc1 $f4, 0x0(%[dst])                      \r\n"
610         "gslwxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
611         "gslwxc1 $f8, 0x0(%[dst], $10)              \r\n"
612         "gslwxc1 $f10, 0x0(%[dst], $8)              \r\n"
613         "paddusb $f4, $f4, $f0                      \r\n"
614         "paddusb $f6, $f6, $f0                      \r\n"
615         "paddusb $f8, $f8, $f0                      \r\n"
616         "paddusb $f10, $f10, $f0                    \r\n"
617         "psubusb $f4, $f4, $f2                      \r\n"
618         "psubusb $f6, $f6, $f2                      \r\n"
619         "psubusb $f8, $f8, $f2                      \r\n"
620         "psubusb $f10, $f10, $f2                    \r\n"
621         "swc1 $f4, 0x0(%[dst])                      \r\n"
622         "gsswxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
623         "gsswxc1 $f8, 0x0(%[dst], $10)              \r\n"
624         "gsswxc1 $f10, 0x0(%[dst], $8)              \r\n"
625         ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride)
626         : "$8","$10","$f0","$f2","$f4","$f6","$f8","$f10"
627     );
628 }
629
630 void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
631 {
632     __asm__ volatile (
633         "lh $8, 0x0(%[block])                       \r\n"
634         "sd $0, 0x0(%[block])                       \r\n"
635         "daddiu $8, $8, 0x20                        \r\n"
636         "daddu $10, %[stride], %[stride]            \r\n"
637         "dsra $8, $8, 0x6                           \r\n"
638         "xor $f2, $f2, $f2                          \r\n"
639         "mtc1 $8, $f0                               \r\n"
640         "pshufh $f0, $f0, $f2                       \r\n"
641         "daddu $8, $10, %[stride]                   \r\n"
642         "psubh $f2, $f2, $f0                        \r\n"
643         "packushb $f0, $f0, $f0                     \r\n"
644         "packushb $f2, $f2, $f2                     \r\n"
645         "ldc1 $f4, 0x0(%[dst])                      \r\n"
646         "gsldxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
647         "gsldxc1 $f8, 0x0(%[dst], $10)              \r\n"
648         "gsldxc1 $f10, 0x0(%[dst], $8)              \r\n"
649         "paddusb $f4, $f4, $f0                      \r\n"
650         "paddusb $f6, $f6, $f0                      \r\n"
651         "paddusb $f8, $f8, $f0                      \r\n"
652         "paddusb $f10, $f10, $f0                    \r\n"
653         "psubusb $f4, $f4, $f2                      \r\n"
654         "psubusb $f6, $f6, $f2                      \r\n"
655         "psubusb $f8, $f8, $f2                      \r\n"
656         "psubusb $f10, $f10, $f2                    \r\n"
657         "sdc1 $f4, 0x0(%[dst])                      \r\n"
658         "gssdxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
659         "gssdxc1 $f8, 0x0(%[dst], $10)              \r\n"
660         "daddu $9, $10, $10                         \r\n"
661         "gssdxc1 $f10, 0x0(%[dst], $8)              \r\n"
662         "daddu %[dst], %[dst], $9                   \r\n"
663         "ldc1 $f4, 0x0(%[dst])                      \r\n"
664         "gsldxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
665         "gsldxc1 $f8, 0x0(%[dst], $10)              \r\n"
666         "gsldxc1 $f10, 0x0(%[dst], $8)              \r\n"
667         "paddusb $f4, $f4, $f0                      \r\n"
668         "paddusb $f6, $f6, $f0                      \r\n"
669         "paddusb $f8, $f8, $f0                      \r\n"
670         "paddusb $f10, $f10, $f0                    \r\n"
671         "psubusb $f4, $f4, $f2                      \r\n"
672         "psubusb $f6, $f6, $f2                      \r\n"
673         "psubusb $f8, $f8, $f2                      \r\n"
674         "psubusb $f10, $f10, $f2                    \r\n"
675         "sdc1 $f4, 0x0(%[dst])                      \r\n"
676         "gssdxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
677         "gssdxc1 $f8, 0x0(%[dst], $10)              \r\n"
678         "gssdxc1 $f10, 0x0(%[dst], $8)              \r\n"
679         ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride)
680         : "$8","$9","$10","$f0","$f2","$f4","$f6","$f8","$f10"
681     );
682 }
683
684 void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
685         int16_t *block, int stride, const uint8_t nnzc[15*8])
686 {
687     int i;
688     for(i=0; i<16; i++){
689         int nnz = nnzc[ scan8[i] ];
690         if(nnz){
691             if(nnz==1 && ((int16_t*)block)[i*16])
692                 ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
693                         stride);
694             else
695                 ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16,
696                         stride);
697         }
698     }
699 }
700
701 void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
702         int16_t *block, int stride, const uint8_t nnzc[15*8])
703 {
704     int i;
705     for(i=0; i<16; i++){
706         if(nnzc[ scan8[i] ])
707             ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16, stride);
708         else if(((int16_t*)block)[i*16])
709             ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
710                     stride);
711     }
712 }
713
714 void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
715         int16_t *block, int stride, const uint8_t nnzc[15*8])
716 {
717     int i;
718     for(i=0; i<16; i+=4){
719         int nnz = nnzc[ scan8[i] ];
720         if(nnz){
721             if(nnz==1 && ((int16_t*)block)[i*16])
722                 ff_h264_idct8_dc_add_8_mmi(dst + block_offset[i],
723                         block + i*16, stride);
724             else
725                 ff_h264_idct8_add_8_mmi(dst + block_offset[i], block + i*16,
726                         stride);
727         }
728     }
729 }
730
731 void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
732         int16_t *block, int stride, const uint8_t nnzc[15*8])
733 {
734     int i, j;
735     for(j=1; j<3; j++){
736         for(i=j*16; i<j*16+4; i++){
737             if(nnzc[ scan8[i] ])
738                 ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
739                         block + i*16, stride);
740             else if(((int16_t*)block)[i*16])
741                 ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
742                         block + i*16, stride);
743         }
744     }
745 }
746
747 void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
748         int16_t *block, int stride, const uint8_t nnzc[15*8])
749 {
750     int i, j;
751
752     for(j=1; j<3; j++){
753         for(i=j*16; i<j*16+4; i++){
754             if(nnzc[ scan8[i] ])
755                 ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
756                         block + i*16, stride);
757             else if(((int16_t*)block)[i*16])
758                 ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
759                         block + i*16, stride);
760         }
761     }
762
763     for(j=1; j<3; j++){
764         for(i=j*16+4; i<j*16+8; i++){
765             if(nnzc[ scan8[i+4] ])
766                 ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i+4],
767                         block + i*16, stride);
768             else if(((int16_t*)block)[i*16])
769                 ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i+4],
770                         block + i*16, stride);
771         }
772     }
773 }
774
775 void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
776         int qmul)
777 {
778     __asm__ volatile (
779         ".set noreorder                                 \r\n"
780         "dli $10, 0x8                                   \r\n"
781         "ldc1 $f6, 0x18(%[input])                       \r\n"
782         "dmtc1 $10, $f16                                \r\n"
783         "ldc1 $f4, 0x10(%[input])                       \r\n"
784         "dli $10, 0x20                                  \r\n"
785         "ldc1 $f2, 0x8(%[input])                        \r\n"
786         "dmtc1 $10, $f18                                \r\n"
787         "ldc1 $f0, 0x0(%[input])                        \r\n"
788         "mov.d $f8, $f6                                 \r\n"
789         "paddh $f6, $f6, $f4                            \r\n"
790         "psubh $f4, $f4, $f8                            \r\n"
791         "mov.d $f8, $f2                                 \r\n"
792         "paddh $f2, $f2, $f0                            \r\n"
793         "psubh $f0, $f0, $f8                            \r\n"
794         "mov.d $f8, $f6                                 \r\n"
795         "paddh $f6, $f6, $f2                            \r\n"
796         "psubh $f2, $f2, $f8                            \r\n"
797         "mov.d $f8, $f4                                 \r\n"
798         "paddh $f4, $f4, $f0                            \r\n"
799         "psubh $f0, $f0, $f8                            \r\n"
800         "mov.d $f8, $f6                                 \r\n"
801         "punpcklhw $f6, $f6, $f2                        \r\n"
802         "punpckhhw $f8, $f8, $f2                        \r\n"
803         "punpckhhw $f2, $f0, $f4                        \r\n"
804         "punpcklhw $f0, $f0, $f4                        \r\n"
805         "punpckhwd $f4, $f6, $f0                        \r\n"
806         "punpcklwd $f6, $f6, $f0                        \r\n"
807         "mov.d $f0, $f8                                 \r\n"
808         "punpcklwd $f8, $f8, $f2                        \r\n"
809         "punpckhwd $f0, $f0, $f2                        \r\n"
810         "mov.d $f2, $f0                                 \r\n"
811         "paddh $f0, $f0, $f8                            \r\n"
812         "psubh $f8, $f8, $f2                            \r\n"
813         "mov.d $f2, $f4                                 \r\n"
814         "paddh $f4, $f4, $f6                            \r\n"
815         "psubh $f6, $f6, $f2                            \r\n"
816         "mov.d $f2, $f0                                 \r\n"
817         "paddh $f0, $f0, $f4                            \r\n"
818         "psubh $f4, $f4, $f2                            \r\n"
819         "mov.d $f2, $f8                                 \r\n"
820         "daddiu $10, %[qmul], -0x7fff                   \r\n"
821         "paddh $f8, $f8, $f6                            \r\n"
822         "bgtz $10, 1f                                   \r\n"
823         "psubh $f6, $f6, $f2                            \r\n"
824         "ori $10, $0, 0x80                              \r\n"
825         "dsll $10, $10, 0x10                            \r\n"
826         "punpckhhw $f2, $f0, %[ff_pw_1]                 \r\n"
827         "daddu %[qmul], %[qmul], $10                    \r\n"
828         "punpcklhw $f0, $f0, %[ff_pw_1]                 \r\n"
829         "punpckhhw $f10, $f4, %[ff_pw_1]                \r\n"
830         "punpcklhw $f4, $f4, %[ff_pw_1]                 \r\n"
831         "mtc1 %[qmul], $f14                             \r\n"
832         "punpcklwd $f14, $f14, $f14                     \r\n"
833         "pmaddhw $f0, $f0, $f14                         \r\n"
834         "pmaddhw $f4, $f4, $f14                         \r\n"
835         "pmaddhw $f2, $f2, $f14                         \r\n"
836         "pmaddhw $f10, $f10, $f14                       \r\n"
837         "psraw $f0, $f0, $f16                           \r\n"
838         "psraw $f4, $f4, $f16                           \r\n"
839         "psraw $f2, $f2, $f16                           \r\n"
840         "psraw $f10, $f10, $f16                         \r\n"
841         "packsswh $f0, $f0, $f2                         \r\n"
842         "packsswh $f4, $f4, $f10                        \r\n"
843         "mfc1 $9, $f0                                   \r\n"
844         "dsrl $f0, $f0, $f18                            \r\n"
845         "mfc1 %[input], $f0                             \r\n"
846         "sh $9, 0x0(%[output])                          \r\n"
847         "sh %[input], 0x80(%[output])                   \r\n"
848         "dsrl $9, $9, 0x10                              \r\n"
849         "dsrl %[input], %[input], 0x10                  \r\n"
850         "sh $9, 0x20(%[output])                         \r\n"
851         "sh %[input], 0xa0(%[output])                   \r\n"
852         "mfc1 $9, $f4                                   \r\n"
853         "dsrl $f4, $f4, $f18                            \r\n"
854         "mfc1 %[input], $f4                             \r\n"
855         "sh $9, 0x40(%[output])                         \r\n"
856         "sh %[input], 0xc0(%[output])                   \r\n"
857         "dsrl $9, $9, 0x10                              \r\n"
858         "dsrl %[input], %[input], 0x10                  \r\n"
859         "sh $9, 0x60(%[output])                         \r\n"
860         "sh %[input], 0xe0(%[output])                   \r\n"
861         "punpckhhw $f2, $f6, %[ff_pw_1]                 \r\n"
862         "punpcklhw $f6, $f6, %[ff_pw_1]                 \r\n"
863         "punpckhhw $f10, $f8, %[ff_pw_1]                \r\n"
864         "punpcklhw $f8, $f8, %[ff_pw_1]                 \r\n"
865         "mtc1 %[qmul], $f14                             \r\n"
866         "punpcklwd $f14, $f14, $f14                     \r\n"
867         "pmaddhw $f6, $f6, $f14                         \r\n"
868         "pmaddhw $f8, $f8, $f14                         \r\n"
869         "pmaddhw $f2, $f2, $f14                         \r\n"
870         "pmaddhw $f10, $f10, $f14                       \r\n"
871         "psraw $f6, $f6, $f16                           \r\n"
872         "psraw $f8, $f8, $f16                           \r\n"
873         "psraw $f2, $f2, $f16                           \r\n"
874         "psraw $f10, $f10, $f16                         \r\n"
875         "packsswh $f6, $f6, $f2                         \r\n"
876         "packsswh $f8, $f8, $f10                        \r\n"
877         "mfc1 $9, $f6                                   \r\n"
878         "dsrl $f6, $f6, $f18                            \r\n"
879         "mfc1 %[input], $f6                             \r\n"
880         "sh $9, 0x100(%[output])                        \r\n"
881         "sh %[input], 0x180(%[output])                  \r\n"
882         "dsrl $9, $9, 0x10                              \r\n"
883         "dsrl %[input], %[input], 0x10                  \r\n"
884         "sh $9, 0x120(%[output])                        \r\n"
885         "sh %[input], 0x1a0(%[output])                  \r\n"
886         "mfc1 $9, $f8                                   \r\n"
887         "dsrl $f8, $f8, $f18                            \r\n"
888         "mfc1 %[input], $f8                             \r\n"
889         "sh $9, 0x140(%[output])                        \r\n"
890         "sh %[input], 0x1c0(%[output])                  \r\n"
891         "dsrl $9, $9, 0x10                              \r\n"
892         "dsrl %[input], %[input], 0x10                  \r\n"
893         "sh $9, 0x160(%[output])                        \r\n"
894         "jr $31                                         \r\n"
895         "sh %[input], 0x1e0(%[output])                  \r\n"
896         "1:                                             \r\n"
897         "ori $10, $0, 0x1f                              \r\n"
898         "clz $9, %[qmul]                                \r\n"
899         "ori %[input], $0, 0x7                          \r\n"
900         "dsubu $9, $10, $9                              \r\n"
901         "ori $10, $0, 0x80                              \r\n"
902         "dsll $10, $10, 0x10                            \r\n"
903         "daddu %[qmul], %[qmul], $10                    \r\n"
904         "dsubu $10, $9, %[input]                        \r\n"
905         "movn $9, %[input], $10                         \r\n"
906         "daddiu %[input], %[input], 0x1                 \r\n"
907         "andi $10, $9, 0xff                             \r\n"
908         "dsrlv %[qmul], %[qmul], $10                    \r\n"
909         "dsubu %[input], %[input], $9                   \r\n"
910         "mtc1 %[input], $f12                            \r\n"
911         "punpckhhw $f2, $f0, %[ff_pw_1]                 \r\n"
912         "punpcklhw $f0, $f0, %[ff_pw_1]                 \r\n"
913         "punpckhhw $f10, $f4, %[ff_pw_1]                \r\n"
914         "punpcklhw $f4, $f4, %[ff_pw_1]                 \r\n"
915         "mtc1 %[qmul], $f14                             \r\n"
916         "punpcklwd $f14, $f14, $f14                     \r\n"
917         "pmaddhw $f0, $f0, $f14                         \r\n"
918         "pmaddhw $f4, $f4, $f14                         \r\n"
919         "pmaddhw $f2, $f2, $f14                         \r\n"
920         "pmaddhw $f10, $f10, $f14                       \r\n"
921         "psraw $f0, $f0, $f12                           \r\n"
922         "psraw $f4, $f4, $f12                           \r\n"
923         "psraw $f2, $f2, $f12                           \r\n"
924         "psraw $f10, $f10, $f12                         \r\n"
925         "packsswh $f0, $f0, $f2                         \r\n"
926         "packsswh $f4, $f4, $f10                        \r\n"
927         "mfc1 $9, $f0                                   \r\n"
928         "dsrl $f0, $f0, $f18                            \r\n"
929         "sh $9, 0x0(%[output])                          \r\n"
930         "mfc1 %[input], $f0                             \r\n"
931         "dsrl $9, $9, 0x10                              \r\n"
932         "sh %[input], 0x80(%[output])                   \r\n"
933         "sh $9, 0x20(%[output])                         \r\n"
934         "dsrl %[input], %[input], 0x10                  \r\n"
935         "mfc1 $9, $f4                                   \r\n"
936         "sh %[input], 0xa0(%[output])                   \r\n"
937         "dsrl $f4, $f4, $f18                            \r\n"
938         "sh $9, 0x40(%[output])                         \r\n"
939         "mfc1 %[input], $f4                             \r\n"
940         "dsrl $9, $9, 0x10                              \r\n"
941         "sh %[input], 0xc0(%[output])                   \r\n"
942         "sh $9, 0x60(%[output])                         \r\n"
943         "dsrl %[input], %[input], 0x10                  \r\n"
944         "sh %[input], 0xe0(%[output])                   \r\n"
945         "punpckhhw $f2, $f6, %[ff_pw_1]                 \r\n"
946         "punpcklhw $f6, $f6, %[ff_pw_1]                 \r\n"
947         "punpckhhw $f10, $f8, %[ff_pw_1]                \r\n"
948         "punpcklhw $f8, $f8, %[ff_pw_1]                 \r\n"
949         "mtc1 %[qmul], $f14                             \r\n"
950         "punpcklwd $f14, $f14, $f14                     \r\n"
951         "pmaddhw $f6, $f6, $f14                         \r\n"
952         "pmaddhw $f8, $f8, $f14                         \r\n"
953         "pmaddhw $f2, $f2, $f14                         \r\n"
954         "pmaddhw $f10, $f10, $f14                       \r\n"
955         "psraw $f6, $f6, $f12                           \r\n"
956         "psraw $f8, $f8, $f12                           \r\n"
957         "psraw $f2, $f2, $f12                           \r\n"
958         "psraw $f10, $f10, $f12                         \r\n"
959         "packsswh $f6, $f6, $f2                         \r\n"
960         "packsswh $f8, $f8, $f10                        \r\n"
961         "mfc1 $9, $f6                                   \r\n"
962         "dsrl $f6, $f6, $f18                            \r\n"
963         "mfc1 %[input], $f6                             \r\n"
964         "sh $9, 0x100(%[output])                        \r\n"
965         "sh %[input], 0x180(%[output])                  \r\n"
966         "dsrl $9, $9, 0x10                              \r\n"
967         "dsrl %[input], %[input], 0x10                  \r\n"
968         "sh $9, 0x120(%[output])                        \r\n"
969         "sh %[input], 0x1a0(%[output])                  \r\n"
970         "mfc1 $9, $f8                                   \r\n"
971         "dsrl $f8, $f8, $f18                            \r\n"
972         "mfc1 %[input], $f8                             \r\n"
973         "sh $9, 0x140(%[output])                        \r\n"
974         "sh %[input], 0x1c0(%[output])                  \r\n"
975         "dsrl $9, $9, 0x10                              \r\n"
976         "dsrl %[input], %[input], 0x10                  \r\n"
977         "sh $9, 0x160(%[output])                        \r\n"
978         "sh %[input], 0x1e0(%[output])                  \r\n"
979         ".set reorder                                   \r\n"
980         ::[output]"r"(output),[input]"r"(input),[qmul]"r"((uint64_t)qmul),
981           [ff_pw_1]"f"(ff_pw_1)
982         : "$9","$10","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
983           "$f18"
984     );
985 }
986
/**
 * Transform and dequantize eight DC coefficients stored at
 * block[0], block[16], ..., block[112] (in place, plain C).
 *
 * Each adjacent pair (block[32k], block[32k + 16]) is reduced to a sum and
 * a difference; the four sums and the four differences are then combined
 * with the same +/- pattern. Every result is scaled as
 * (value * qmul + 128) >> 8: sum-derived results are written back to
 * block[32k], difference-derived results to block[32k + 16].
 *
 * @param block coefficient array; only the eight positions above are touched
 * @param qmul  dequantization multiplier
 */
void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
{
    int sum[4], diff[4];
    int even[4], odd[4];
    int k;

    /* pairwise sums and differences; all inputs are read before any write */
    for (k = 0; k < 4; k++) {
        const int lo = block[32 * k];
        const int hi = block[32 * k + 16];

        sum[k]  = lo + hi;
        diff[k] = lo - hi;
    }

    even[0] = sum[0] + sum[2] + sum[1] + sum[3];
    even[1] = sum[0] - sum[2] + sum[1] - sum[3];
    even[2] = sum[0] - sum[2] - sum[1] + sum[3];
    even[3] = sum[0] + sum[2] - sum[1] - sum[3];

    odd[0] = diff[0] + diff[2] + diff[1] + diff[3];
    odd[1] = diff[0] - diff[2] + diff[1] - diff[3];
    odd[2] = diff[0] - diff[2] - diff[1] + diff[3];
    odd[3] = diff[0] + diff[2] - diff[1] - diff[3];

    /* scale with rounding and store back */
    for (k = 0; k < 4; k++) {
        block[32 * k]      = (even[k] * qmul + 128) >> 8;
        block[32 * k + 16] = (odd[k]  * qmul + 128) >> 8;
    }
}
1019
/**
 * Transform and dequantize four DC coefficients stored at
 * block[0], block[16], block[32] and block[48] (in place, plain C).
 *
 * The pairs (block[0], block[16]) and (block[32], block[48]) are each
 * reduced to a sum and a difference; those are combined by a second
 * add/subtract step, multiplied by qmul and shifted right by 7.
 *
 * @param block coefficient array; only the four positions above are touched
 * @param qmul  dequantization multiplier
 */
void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
{
    const int sum01  = block[0]  + block[16];
    const int diff01 = block[0]  - block[16];
    const int sum23  = block[32] + block[48];
    const int diff23 = block[32] - block[48];

    block[0]  = ((sum01  + sum23)  * qmul) >> 7;
    block[16] = ((diff01 + diff23) * qmul) >> 7;
    block[32] = ((sum01  - sum23)  * qmul) >> 7;
    block[48] = ((diff01 - diff23) * qmul) >> 7;
}
1033
1034 void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride,
1035         int height, int log2_denom, int weight, int offset)
1036 {
1037     int y;
1038
1039     offset <<= log2_denom;
1040
1041     if (log2_denom)
1042         offset += 1 << (log2_denom - 1);
1043
1044     for (y=0; y<height; y++, block+=stride) {
1045         __asm__ volatile (
1046             "ldc1 $f2, %0                   \r\n"
1047             "ldc1 $f4, %1                   \r\n"
1048             "dmtc1 $0, $f20                 \r\n"
1049             "mtc1 %2, $f6                   \r\n"
1050             "mtc1 %3, $f8                   \r\n"
1051             "mtc1 %4, $f10                  \r\n"
1052             "pshufh $f6, $f6, $f20          \r\n"
1053             "pshufh $f8, $f8, $f20          \r\n"
1054             "punpckhbh $f14, $f2, $f20      \r\n"
1055             "punpckhbh $f16, $f4, $f20      \r\n"
1056             "punpcklbh $f2, $f2, $f20       \r\n"
1057             "punpcklbh $f4, $f4, $f20       \r\n"
1058             "pmullh $f14, $f14, $f6         \r\n"
1059             "pmullh $f16, $f16, $f6         \r\n"
1060             "pmullh $f2, $f2, $f6           \r\n"
1061             "pmullh $f4, $f4, $f6           \r\n"
1062             "paddsh $f14, $f14, $f8         \r\n"
1063             "paddsh $f16, $f16, $f8         \r\n"
1064             "paddsh $f2, $f2, $f8           \r\n"
1065             "paddsh $f4, $f4, $f8           \r\n"
1066             "psrah $f14, $f14, $f10         \r\n"
1067             "psrah $f16, $f16, $f10         \r\n"
1068             "psrah $f2, $f2, $f10           \r\n"
1069             "psrah $f4, $f4, $f10           \r\n"
1070             "packushb $f2, $f2, $f14        \r\n"
1071             "packushb $f4, $f4, $f16        \r\n"
1072             "sdc1 $f2, %0                   \r\n"
1073             "sdc1 $f4, %1                   \r\n"
1074             : "=m"(*block),"=m"(*(block + 8))
1075             : "r"(weight),"r"(offset),"r"(log2_denom)
1076         );
1077     }
1078 }
1079
1080 void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
1081         int stride, int height, int log2_denom, int weightd, int weights,
1082         int offset)
1083 {
1084     int y;
1085
1086     offset = ((offset + 1) | 1) << log2_denom;
1087
1088     for (y=0; y<height; y++, dst+=stride, src+=stride) {
1089         __asm__ volatile (
1090             "ldc1 $f2, %2                   \r\n"
1091             "ldc1 $f4, %3                   \r\n"
1092             "dmtc1 $0, $f20                 \r\n"
1093             "mtc1 %6, $f6                   \r\n"
1094             "mtc1 %7, $f8                   \r\n"
1095             "mtc1 %8, $f10                  \r\n"
1096             "mtc1 %9, $f12                  \r\n"
1097             "pshufh $f6, $f6, $f20          \r\n"
1098             "pshufh $f8, $f8, $f20          \r\n"
1099             "pshufh $f10, $f10, $f20        \r\n"
1100             "punpckhbh $f14, $f2, $f20      \r\n"
1101             "punpckhbh $f16, $f4, $f20      \r\n"
1102             "punpcklbh $f2, $f2, $f20       \r\n"
1103             "punpcklbh $f4, $f4, $f20       \r\n"
1104             "pmullh $f14, $f14, $f6         \r\n"
1105             "pmullh $f16, $f16, $f8         \r\n"
1106             "pmullh $f2, $f2, $f6           \r\n"
1107             "pmullh $f4, $f4, $f8           \r\n"
1108             "paddsh $f14, $f14, $f10        \r\n"
1109             "paddsh $f2, $f2, $f10          \r\n"
1110             "paddsh $f14, $f14, $f16        \r\n"
1111             "paddsh $f2, $f2, $f4           \r\n"
1112             "psrah $f14, $f14, $f12         \r\n"
1113             "psrah $f2, $f2, $f12           \r\n"
1114             "packushb $f2, $f2, $f14        \r\n"
1115             "sdc1 $f2, %0                   \r\n"
1116             "ldc1 $f2, %4                   \r\n"
1117             "ldc1 $f4, %5                   \r\n"
1118             "punpckhbh $f14, $f2, $f20      \r\n"
1119             "punpckhbh $f16, $f4, $f20      \r\n"
1120             "punpcklbh $f2, $f2, $f20       \r\n"
1121             "punpcklbh $f4, $f4, $f20       \r\n"
1122             "pmullh $f14, $f14, $f6         \r\n"
1123             "pmullh $f16, $f16, $f8         \r\n"
1124             "pmullh $f2, $f2, $f6           \r\n"
1125             "pmullh $f4, $f4, $f8           \r\n"
1126             "paddsh $f14, $f14, $f10        \r\n"
1127             "paddsh $f2, $f2, $f10          \r\n"
1128             "paddsh $f14, $f14, $f16        \r\n"
1129             "paddsh $f2, $f2, $f4           \r\n"
1130             "psrah $f14, $f14, $f12         \r\n"
1131             "psrah $f2, $f2, $f12           \r\n"
1132             "packushb $f2, $f2, $f14        \r\n"
1133             "sdc1 $f2, %1                   \r\n"
1134             : "=m"(*dst),"=m"(*(dst+8))
1135             : "m"(*src),"m"(*dst),"m"(*(src+8)),"m"(*(dst+8)),
1136               "r"(weights),"r"(weightd),"r"(offset),"r"(log2_denom+1)
1137         );
1138     }
1139 }
1140
1141 void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
1142         int log2_denom, int weight, int offset)
1143 {
1144     int y;
1145
1146     offset <<= log2_denom;
1147
1148     if (log2_denom)
1149         offset += 1 << (log2_denom - 1);
1150
1151     for (y=0; y<height; y++, block+=stride) {
1152         __asm__ volatile (
1153             "ldc1 $f2, %0                   \r\n"
1154             "mtc1 %1, $f6                   \r\n"
1155             "mtc1 %2, $f8                   \r\n"
1156             "mtc1 %3, $f10                  \r\n"
1157             "dmtc1 $0, $f20                 \r\n"
1158             "pshufh $f6, $f6, $f20          \r\n"
1159             "pshufh $f8, $f8, $f20          \r\n"
1160             "punpckhbh $f14, $f2, $f20      \r\n"
1161             "punpcklbh $f2, $f2, $f20       \r\n"
1162             "pmullh $f14, $f14, $f6         \r\n"
1163             "pmullh $f2, $f2, $f6           \r\n"
1164             "paddsh $f14, $f14, $f8         \r\n"
1165             "paddsh $f2, $f2, $f8           \r\n"
1166             "psrah $f14, $f14, $f10         \r\n"
1167             "psrah $f2, $f2, $f10           \r\n"
1168             "packushb $f2, $f2, $f14        \r\n"
1169             "sdc1 $f2, %0                   \r\n"
1170             : "=m"(*block)
1171             : "r"(weight),"r"(offset),"r"(log2_denom)
1172         );
1173     }
1174 }
1175
1176 void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
1177         int stride, int height, int log2_denom, int weightd, int weights,
1178         int offset)
1179 {
1180     int y;
1181
1182     offset = ((offset + 1) | 1) << log2_denom;
1183
1184     for (y=0; y<height; y++, dst+=stride, src+=stride) {
1185         __asm__ volatile (
1186             "ldc1 $f2, %1                   \r\n"
1187             "ldc1 $f4, %2                   \r\n"
1188             "dmtc1 $0, $f20                 \r\n"
1189             "mtc1 %3, $f6                   \r\n"
1190             "mtc1 %4, $f8                   \r\n"
1191             "mtc1 %5, $f10                  \r\n"
1192             "mtc1 %6, $f12                  \r\n"
1193             "pshufh $f6, $f6, $f20          \r\n"
1194             "pshufh $f8, $f8, $f20          \r\n"
1195             "pshufh $f10, $f10, $f20        \r\n"
1196             "punpckhbh $f14, $f2, $f20      \r\n"
1197             "punpckhbh $f16, $f4, $f20      \r\n"
1198             "punpcklbh $f2, $f2, $f20       \r\n"
1199             "punpcklbh $f4, $f4, $f20       \r\n"
1200             "pmullh $f14, $f14, $f6         \r\n"
1201             "pmullh $f16, $f16, $f8         \r\n"
1202             "pmullh $f2, $f2, $f6           \r\n"
1203             "pmullh $f4, $f4, $f8           \r\n"
1204             "paddsh $f14, $f14, $f10        \r\n"
1205             "paddsh $f2, $f2, $f10          \r\n"
1206             "paddsh $f14, $f14, $f16        \r\n"
1207             "paddsh $f2, $f2, $f4           \r\n"
1208             "psrah $f14, $f14, $f12         \r\n"
1209             "psrah $f2, $f2, $f12           \r\n"
1210             "packushb $f2, $f2, $f14        \r\n"
1211             "sdc1 $f2, %0                   \r\n"
1212             : "=m"(*dst)
1213             : "m"(*src),"m"(*dst),"r"(weights),
1214               "r"(weightd),"r"(offset),"r"(log2_denom+1)
1215         );
1216     }
1217 }
1218
1219 void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
1220         int log2_denom, int weight, int offset)
1221 {
1222     int y;
1223
1224     offset <<= log2_denom;
1225
1226     if (log2_denom)
1227         offset += 1 << (log2_denom - 1);
1228
1229     for (y=0; y<height; y++, block+=stride) {
1230         __asm__ volatile (
1231             "lwc1 $f2, %0                   \r\n"
1232             "mtc1 %1, $f6                   \r\n"
1233             "mtc1 %2, $f8                   \r\n"
1234             "mtc1 %3, $f10                  \r\n"
1235             "dmtc1 $0, $f20                 \r\n"
1236             "pshufh $f6, $f6, $f20          \r\n"
1237             "pshufh $f8, $f8, $f20          \r\n"
1238             "punpcklbh $f2, $f2, $f20       \r\n"
1239             "pmullh $f2, $f2, $f6           \r\n"
1240             "paddsh $f2, $f2, $f8           \r\n"
1241             "psrah $f2, $f2, $f10           \r\n"
1242             "packushb $f2, $f2, $f20        \r\n"
1243             "swc1 $f2, %0                   \r\n"
1244             : "=m"(*block)
1245             : "r"(weight),"r"(offset),"r"(log2_denom)
1246         );
1247     }
1248 }
1249
1250 void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
1251         int stride, int height, int log2_denom, int weightd, int weights,
1252         int offset)
1253 {
1254     int y;
1255
1256     offset = ((offset + 1) | 1) << log2_denom;
1257
1258     for (y=0; y<height; y++, dst+=stride, src+=stride) {
1259         __asm__ volatile (
1260             "lwc1 $f2, %1                   \r\n"
1261             "lwc1 $f4, %2                   \r\n"
1262             "dmtc1 $0, $f20                 \r\n"
1263             "mtc1 %3, $f6                   \r\n"
1264             "mtc1 %4, $f8                   \r\n"
1265             "mtc1 %5, $f10                  \r\n"
1266             "mtc1 %6, $f12                  \r\n"
1267             "pshufh $f6, $f6, $f20          \r\n"
1268             "pshufh $f8, $f8, $f20          \r\n"
1269             "pshufh $f10, $f10, $f20        \r\n"
1270             "punpcklbh $f2, $f2, $f20       \r\n"
1271             "punpcklbh $f4, $f4, $f20       \r\n"
1272             "pmullh $f2, $f2, $f6           \r\n"
1273             "pmullh $f4, $f4, $f8           \r\n"
1274             "paddsh $f2, $f2, $f10          \r\n"
1275             "paddsh $f2, $f2, $f4           \r\n"
1276             "psrah $f2, $f2, $f12           \r\n"
1277             "packushb $f2, $f2, $f20        \r\n"
1278             "swc1 $f2, %0                   \r\n"
1279             : "=m"(*dst)
1280             : "m"(*src),"m"(*dst),"r"(weights),
1281               "r"(weightd),"r"(offset),"r"(log2_denom+1)
1282         );
1283     }
1284 }
1285
/*
 * Register-level core of the chroma (non-intra) deblocking filter.
 *
 * This block performs no pixel loads or stores.  It consumes the four rows
 * that the caller's preceding asm statement left in $f0, $f2, $f4 and $f6
 * (ff_deblock_v_chroma_8_mmi loads them from pix-2*stride, pix-stride, pix
 * and pix+stride) and leaves the filtered middle rows in $f2 and $f4.
 * Storing them is presumably done by the caller's following asm statement --
 * TODO confirm, that code is outside this excerpt.
 *
 * alpha and beta are broadcast to all eight bytes and used as thresholds on
 * |$f2-$f4| (alpha) and |$f0-$f2|, |$f6-$f4| (beta).  The visible caller
 * decrements both by one before calling.  Four bytes are read from tc0 and
 * each byte is duplicated to form the per-pixel clipping limit.
 *
 * The pix and stride operands are declared but never referenced by the
 * template.
 *
 * NOTE(review): $f0/$f2/$f4/$f6 are live across separate asm statements and
 * are simultaneously listed as clobbers here; the compiler gives no guarantee
 * that register contents survive between asm statements -- confirm/fix
 * together with the callers.
 */
1286 static void inline chroma_inter_body_mmi(uint8_t *pix, int stride,
1287         int alpha, int beta, int8_t *tc0)
1288 {
1289     __asm__ volatile (
         /* broadcast alpha -> $f8 and beta -> $f10 as unsigned bytes */
1290         "xor $f16, $f16, $f16                           \r\n"
1291         "mtc1 %[alpha], $f8                             \r\n"
1292         "mtc1 %[beta], $f10                             \r\n"
1293         "pshufh $f8, $f8, $f16                          \r\n"
1294         "pshufh $f10, $f10, $f16                        \r\n"
1295         "packushb $f8, $f8, $f8                         \r\n"
1296         "packushb $f10, $f10, $f10                      \r\n"
         /* $f14 accumulates the saturated excess of each absolute
          * difference over its threshold; zero bytes pass all three tests */
1297         "psubusb $f12, $f4, $f2                         \r\n"
1298         "psubusb $f14, $f2, $f4                         \r\n"
1299         "or $f14, $f14, $f12                            \r\n"
1300         "psubusb $f14, $f14, $f8                        \r\n"
1301         "psubusb $f12, $f2, $f0                         \r\n"
1302         "psubusb $f8, $f0, $f2                          \r\n"
1303         "or $f8, $f8, $f12                              \r\n"
1304         "psubusb $f8, $f8, $f10                         \r\n"
1305         "or $f14, $f14, $f8                             \r\n"
1306         "psubusb $f12, $f4, $f6                         \r\n"
1307         "psubusb $f8, $f6, $f4                          \r\n"
1308         "or $f8, $f8, $f12                              \r\n"
1309         "psubusb $f8, $f8, $f10                         \r\n"
1310         "or $f14, $f14, $f8                             \r\n"
1311         "xor $f12, $f12, $f12                           \r\n"
1312         "pcmpeqb $f14, $f14, $f12                       \r\n"
         /* $f14 = filter mask ANDed with the byte-duplicated tc0 values */
1313         "lwc1 $f12, 0x0(%[tc0])                         \r\n"
1314         "punpcklbh $f12, $f12, $f12                     \r\n"
1315         "and $f14, $f14, $f12                           \r\n"
         /* compute the delta and apply it, limited by $f14, to $f2/$f4 */
1316         "pcmpeqb $f8, $f8, $f8                          \r\n"
1317         "xor $f10, $f2, $f4                             \r\n"
1318         "xor $f6, $f6, $f8                              \r\n"
1319         "and $f10, $f10, %[ff_pb_1]                     \r\n"
1320         "pavgb $f6, $f6, $f0                            \r\n"
1321         "xor $f8, $f8, $f2                              \r\n"
1322         "pavgb $f6, $f6, %[ff_pb_3]                     \r\n"
1323         "pavgb $f8, $f8, $f4                            \r\n"
1324         "pavgb $f6, $f6, $f10                           \r\n"
1325         "paddusb $f6, $f6, $f8                          \r\n"
1326         "psubusb $f12, %[ff_pb_A1], $f6                 \r\n"
1327         "psubusb $f6, $f6, %[ff_pb_A1]                  \r\n"
1328         "pminub $f12, $f12, $f14                        \r\n"
1329         "pminub $f6, $f6, $f14                          \r\n"
1330         "psubusb $f2, $f2, $f12                         \r\n"
1331         "psubusb $f4, $f4, $f6                          \r\n"
1332         "paddusb $f2, $f2, $f6                          \r\n"
1333         "paddusb $f4, $f4, $f12                         \r\n"
1334         ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
1335           [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),[tc0]"r"(tc0),
1336           [ff_pb_1]"f"(ff_pb_1),[ff_pb_3]"f"(ff_pb_3),[ff_pb_A1]"f"(ff_pb_A1)
1337         : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16"
1338     );
1339 }
1340
/*
 * Register-level core of the chroma intra deblocking filter.
 *
 * This block performs no pixel loads or stores.  It reads four rows from
 * $f0, $f2, $f4 and $f6 and leaves the filtered middle rows in $f2 and $f4.
 * Presumably the caller's surrounding asm statements load and store these
 * registers, as for chroma_inter_body_mmi -- TODO confirm, the callers are
 * outside this excerpt.
 *
 * alpha and beta are broadcast to all eight bytes and used as thresholds on
 * |$f2-$f4| (alpha) and |$f0-$f2|, |$f6-$f4| (beta).  Where all three tests
 * pass, $f2 and $f4 are replaced by averages built with pavgb and corrected
 * with ff_pb_1; elsewhere they keep their original values (saved in $f10 and
 * $f12).
 *
 * The pix and stride operands are declared but never referenced by the
 * template.
 *
 * NOTE(review): the input/output FP registers are live across separate asm
 * statements; the compiler gives no guarantee that register contents survive
 * between asm statements -- confirm/fix together with the callers.
 */
1341 static void inline chroma_intra_body_mmi(uint8_t *pix, int stride,
1342         int alpha, int beta)
1343 {
1344     __asm__ volatile (
         /* broadcast alpha -> $f8 and beta -> $f10 as unsigned bytes */
1345         "xor $f16, $f16, $f16                           \r\n"
1346         "mtc1 %[alpha], $f8                             \r\n"
1347         "mtc1 %[beta], $f10                             \r\n"
1348         "pshufh $f8, $f8, $f16                          \r\n"
1349         "pshufh $f10, $f10, $f16                        \r\n"
1350         "packushb $f8, $f8, $f8                         \r\n"
1351         "packushb $f10, $f10, $f10                      \r\n"
         /* $f14 becomes the per-byte mask of positions passing all tests */
1352         "psubusb $f12, $f4, $f2                         \r\n"
1353         "psubusb $f14, $f2, $f4                         \r\n"
1354         "or $f14, $f14, $f12                            \r\n"
1355         "psubusb $f14, $f14, $f8                        \r\n"
1356         "psubusb $f12, $f2, $f0                         \r\n"
1357         "psubusb $f8, $f0, $f2                          \r\n"
1358         "or $f8, $f8, $f12                              \r\n"
1359         "psubusb $f8, $f8, $f10                         \r\n"
1360         "or $f14, $f14, $f8                             \r\n"
1361         "psubusb $f12, $f4, $f6                         \r\n"
1362         "psubusb $f8, $f6, $f4                          \r\n"
1363         "or $f8, $f8, $f12                              \r\n"
1364         "psubusb $f8, $f8, $f10                         \r\n"
1365         "or $f14, $f14, $f8                             \r\n"
1366         "xor $f12, $f12, $f12                           \r\n"
1367         "pcmpeqb $f14, $f14, $f12                       \r\n"
         /* keep the unfiltered rows so masked-out bytes can be restored */
1368         "mov.d $f10, $f2                                \r\n"
1369         "mov.d $f12, $f4                                \r\n"
1370         "xor $f8, $f2, $f6                              \r\n"
1371         "and $f8, $f8, %[ff_pb_1]                       \r\n"
1372         "pavgb $f2, $f2, $f6                            \r\n"
1373         "psubusb $f2, $f2, $f8                          \r\n"
1374         "pavgb $f2, $f2, $f0                            \r\n"
1375         "xor $f8, $f4, $f0                              \r\n"
1376         "and $f8, $f8, %[ff_pb_1]                       \r\n"
1377         "pavgb $f4, $f4, $f0                            \r\n"
1378         "psubusb $f4, $f4, $f8                          \r\n"
1379         "pavgb $f4, $f4, $f6                            \r\n"
         /* result = original + ((filtered - original) & mask) */
1380         "psubb $f2, $f2, $f10                           \r\n"
1381         "psubb $f4, $f4, $f12                           \r\n"
1382         "and $f2, $f2, $f14                             \r\n"
1383         "and $f4, $f4, $f14                             \r\n"
1384         "paddb $f2, $f2, $f10                           \r\n"
1385         "paddb $f4, $f4, $f12                           \r\n"
1386         ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
1387           [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),
1388           [ff_pb_1]"f"(ff_pb_1)
1389         : "$f0","$f2","$f4","$f8","$f10","$f12","$f14","$f16"
1390     );
1391 }
1392
1393 void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
1394         int8_t *tc0)
1395 {
1396     __asm__ volatile (
1397         "daddu $8, %[stride], %[stride]                 \r\n"
1398         "xor $f16, $f16, $f16                           \r\n"
1399         "daddu $9, %[stride], $8                        \r\n"
1400         "daddiu %[alpha], %[alpha], -0x1                \r\n"
1401         "dsubu $9, $0, $9                               \r\n"
1402         "daddiu %[beta], %[beta], -0x1                  \r\n"
1403         "daddu $9, $9, %[pix]                           \r\n"
1404         "ldc1 $f4, 0x0(%[pix])                          \r\n"
1405         "gsldxc1 $f0, 0x0($9, %[stride])                \r\n"
1406         "gsldxc1 $f2, 0x0($9, $8)                       \r\n"
1407         "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
1408         "mtc1 %[alpha], $f8                             \r\n"
1409         "mtc1 %[beta], $f10                             \r\n"
1410         "pshufh $f8, $f8, $f16                          \r\n"
1411         "pshufh $f10, $f10, $f16                        \r\n"
1412         "packushb $f8, $f8, $f8                         \r\n"
1413         "packushb $f10, $f10, $f10                      \r\n"
1414         "psubusb $f12, $f4, $f2                         \r\n"
1415         "psubusb $f14, $f2, $f4                         \r\n"
1416         "or $f14, $f14, $f12                            \r\n"
1417         "psubusb $f12, $f2, $f0                         \r\n"
1418         "psubusb $f14, $f14, $f8                        \r\n"
1419         "psubusb $f8, $f0, $f2                          \r\n"
1420         "or $f8, $f8, $f12                              \r\n"
1421         "psubusb $f12, $f4, $f6                         \r\n"
1422         "psubusb $f8, $f8, $f10                         \r\n"
1423         "or $f14, $f14, $f8                             \r\n"
1424         "psubusb $f8, $f6, $f4                          \r\n"
1425         "or $f8, $f8, $f12                              \r\n"
1426         "psubusb $f8, $f8, $f10                         \r\n"
1427         "or $f14, $f14, $f8                             \r\n"
1428         "pcmpeqb $f14, $f14, $f16                       \r\n"
1429         "pcmpeqb $f6, $f6, $f6                          \r\n"
1430         "gslwlc1 $f8, 0x3(%[tc0])                       \r\n"
1431         "gslwrc1 $f8, 0x0(%[tc0])                       \r\n"
1432         "punpcklbh $f8, $f8, $f8                        \r\n"
1433         "punpcklbh $f18, $f8, $f8                       \r\n"
1434         "pcmpgtb $f8, $f18, $f6                         \r\n"
1435         "ldc1 $f6, 0x0($9)                              \r\n"
1436         "and $f20, $f8, $f14                            \r\n"
1437         "psubusb $f14, $f6, $f2                         \r\n"
1438         "psubusb $f12, $f2, $f6                         \r\n"
1439         "psubusb $f14, $f14, $f10                       \r\n"
1440         "psubusb $f12, $f12, $f10                       \r\n"
1441         "pcmpeqb $f12, $f12, $f14                       \r\n"
1442         "and $f12, $f12, $f20                           \r\n"
1443         "and $f8, $f20, $f18                            \r\n"
1444         "psubb $f14, $f8, $f12                          \r\n"
1445         "and $f12, $f12, $f8                            \r\n"
1446         "pavgb $f8, $f2, $f4                            \r\n"
1447         "ldc1 $f22, 0x0($9)                             \r\n"
1448         "pavgb $f6, $f6, $f8                            \r\n"
1449         "xor $f8, $f8, $f22                             \r\n"
1450         "and $f8, $f8, %[ff_pb_1]                       \r\n"
1451         "psubusb $f6, $f6, $f8                          \r\n"
1452         "psubusb $f8, $f0, $f12                         \r\n"
1453         "paddusb $f12, $f12, $f0                        \r\n"
1454         "pmaxub $f6, $f6, $f8                           \r\n"
1455         "pminub $f6, $f6, $f12                          \r\n"
1456         "gssdxc1 $f6, 0x0($9, %[stride])                \r\n"
1457         "gsldxc1 $f8, 0x0(%[pix], $8)                   \r\n"
1458         "psubusb $f6, $f8, $f4                          \r\n"
1459         "psubusb $f12, $f4, $f8                         \r\n"
1460         "psubusb $f6, $f6, $f10                         \r\n"
1461         "psubusb $f12, $f12, $f10                       \r\n"
1462         "pcmpeqb $f12, $f12, $f6                        \r\n"
1463         "and $f12, $f12, $f20                           \r\n"
1464         "psubb $f14, $f14, $f12                         \r\n"
1465         "and $f10, $f18, $f12                           \r\n"
1466         "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
1467         "pavgb $f12, $f2, $f4                           \r\n"
1468         "gsldxc1 $f22, 0x0(%[pix], $8)                  \r\n"
1469         "pavgb $f8, $f8, $f12                           \r\n"
1470         "xor $f12, $f12, $f22                           \r\n"
1471         "and $f12, $f12, %[ff_pb_1]                     \r\n"
1472         "psubusb $f8, $f8, $f12                         \r\n"
1473         "psubusb $f12, $f6, $f10                        \r\n"
1474         "paddusb $f10, $f10, $f6                        \r\n"
1475         "pmaxub $f8, $f8, $f12                          \r\n"
1476         "pminub $f8, $f8, $f10                          \r\n"
1477         "gssdxc1 $f8, 0x0(%[pix], %[stride])            \r\n"
1478         "xor $f10, $f2, $f4                             \r\n"
1479         "pcmpeqb $f8, $f8, $f8                          \r\n"
1480         "and $f10, $f10, %[ff_pb_1]                     \r\n"
1481         "xor $f6, $f6, $f8                              \r\n"
1482         "xor $f8, $f8, $f2                              \r\n"
1483         "pavgb $f6, $f6, $f0                            \r\n"
1484         "pavgb $f6, $f6, %[ff_pb_3]                     \r\n"
1485         "pavgb $f8, $f8, $f4                            \r\n"
1486         "pavgb $f6, $f6, $f10                           \r\n"
1487         "paddusb $f6, $f6, $f8                          \r\n"
1488         "psubusb $f12, %[ff_pb_A1], $f6                 \r\n"
1489         "psubusb $f6, $f6, %[ff_pb_A1]                  \r\n"
1490         "pminub $f12, $f12, $f14                        \r\n"
1491         "pminub $f6, $f6, $f14                          \r\n"
1492         "psubusb $f2, $f2, $f12                         \r\n"
1493         "psubusb $f4, $f4, $f6                          \r\n"
1494         "paddusb $f2, $f2, $f6                          \r\n"
1495         "paddusb $f4, $f4, $f12                         \r\n"
1496         "gssdxc1 $f2, 0x0($9, $8)                       \r\n"
1497         "sdc1 $f4, 0x0(%[pix])                          \r\n"
1498         ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
1499           [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),[tc0]"r"(tc0),
1500           [ff_pb_1]"f"(ff_pb_1),[ff_pb_3]"f"(ff_pb_3),[ff_pb_A1]"f"(ff_pb_A1)
1501         : "$8","$9","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
1502           "$f18","$f20","$f22"
1503     );
1504 }
1505
1506 void ff_deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
1507         int beta)
1508 {
1509     uint64_t stack[0xa];
1510
1511     __asm__ volatile (
1512         "ori $8, $0, 0x1                                \r\n"
1513         "xor $f30, $f30, $f30                           \r\n"
1514         "dmtc1 $8, $f16                                 \r\n"
1515         "dsll $8, %[stride], 2                          \r\n"
1516         "daddu $10, %[stride], %[stride]                \r\n"
1517         "daddiu %[alpha], %[alpha], -0x1                \r\n"
1518         "dsll $f20, $f16, $f16                          \r\n"
1519         "bltz %[alpha], 1f                              \r\n"
1520         "daddu $9, $10, %[stride]                       \r\n"
1521         "daddiu %[beta], %[beta], -0x1                  \r\n"
1522         "bltz %[beta], 1f                               \r\n"
1523         "dsubu $8, $0, $8                               \r\n"
1524         "daddu $8, $8, %[pix]                           \r\n"
1525         "ldc1 $f4, 0x0(%[pix])                          \r\n"
1526         "gsldxc1 $f0, 0x0($8, $10)                      \r\n"
1527         "gsldxc1 $f2, 0x0($8, $9)                       \r\n"
1528         "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
1529         "mtc1 %[alpha], $f8                             \r\n"
1530         "mtc1 %[beta], $f10                             \r\n"
1531         "pshufh $f8, $f8, $f30                          \r\n"
1532         "pshufh $f10, $f10, $f30                        \r\n"
1533         "packushb $f8, $f8, $f8                         \r\n"
1534         "psubusb $f12, $f4, $f2                         \r\n"
1535         "psubusb $f14, $f2, $f4                         \r\n"
1536         "packushb $f10, $f10, $f10                      \r\n"
1537         "or $f14, $f14, $f12                            \r\n"
1538         "sdc1 $f8, 0x10+%[stack]                        \r\n"
1539         "psubusb $f14, $f14, $f8                        \r\n"
1540         "psubusb $f12, $f2, $f0                         \r\n"
1541         "psubusb $f8, $f0, $f2                          \r\n"
1542         "or $f8, $f8, $f12                              \r\n"
1543         "psubusb $f8, $f8, $f10                         \r\n"
1544         "or $f14, $f14, $f8                             \r\n"
1545         "psubusb $f12, $f4, $f6                         \r\n"
1546         "psubusb $f8, $f6, $f4                          \r\n"
1547         "or $f8, $f8, $f12                              \r\n"
1548         "psubusb $f8, $f8, $f10                         \r\n"
1549         "or $f14, $f14, $f8                             \r\n"
1550         "xor $f12, $f12, $f12                           \r\n"
1551         "ldc1 $f8, 0x10+%[stack]                        \r\n"
1552         "pcmpeqb $f14, $f14, $f12                       \r\n"
1553         "sdc1 $f14, 0x20+%[stack]                       \r\n"
1554         "pavgb $f8, $f8, $f30                           \r\n"
1555         "psubusb $f14, $f4, $f2                         \r\n"
1556         "pavgb $f8, $f8, %[ff_pb_1]                     \r\n"
1557         "psubusb $f12, $f2, $f4                         \r\n"
1558         "psubusb $f14, $f14, $f8                        \r\n"
1559         "psubusb $f12, $f12, $f8                        \r\n"
1560         "ldc1 $f28, 0x20+%[stack]                       \r\n"
1561         "pcmpeqb $f12, $f12, $f14                       \r\n"
1562         "and $f12, $f12, $f28                           \r\n"
1563         "gsldxc1 $f28, 0x0($8, %[stride])               \r\n"
1564         "psubusb $f14, $f28, $f2                        \r\n"
1565         "psubusb $f8, $f2, $f28                         \r\n"
1566         "psubusb $f14, $f14, $f10                       \r\n"
1567         "psubusb $f8, $f8, $f10                         \r\n"
1568         "pcmpeqb $f8, $f8, $f14                         \r\n"
1569         "and $f8, $f8, $f12                             \r\n"
1570         "gsldxc1 $f26, 0x0(%[pix], $10)                 \r\n"
1571         "sdc1 $f8, 0x30+%[stack]                        \r\n"
1572         "psubusb $f14, $f26, $f4                        \r\n"
1573         "psubusb $f8, $f4, $f26                         \r\n"
1574         "psubusb $f14, $f14, $f10                       \r\n"
1575         "psubusb $f8, $f8, $f10                         \r\n"
1576         "pcmpeqb $f8, $f8, $f14                         \r\n"
1577         "and $f8, $f8, $f12                             \r\n"
1578         "sdc1 $f8, 0x40+%[stack]                        \r\n"
1579         "pavgb $f8, $f28, $f0                           \r\n"
1580         "pavgb $f10, $f2, $f4                           \r\n"
1581         "pavgb $f8, $f8, $f10                           \r\n"
1582         "sdc1 $f10, 0x10+%[stack]                       \r\n"
1583         "paddb $f12, $f28, $f0                          \r\n"
1584         "paddb $f14, $f2, $f4                           \r\n"
1585         "paddb $f12, $f12, $f14                         \r\n"
1586         "mov.d $f14, $f12                               \r\n"
1587         "sdc1 $f12, 0x0+%[stack]                        \r\n"
1588         "psrlh $f12, $f12, $f16                         \r\n"
1589         "pavgb $f12, $f12, $f30                         \r\n"
1590         "xor $f12, $f12, $f8                            \r\n"
1591         "and $f12, $f12, %[ff_pb_1]                     \r\n"
1592         "psubb $f8, $f8, $f12                           \r\n"
1593         "pavgb $f10, $f28, $f6                          \r\n"
1594         "psubb $f12, $f28, $f6                          \r\n"
1595         "paddb $f14, $f14, $f14                         \r\n"
1596         "psubb $f14, $f14, $f12                         \r\n"
1597         "and $f12, $f12, %[ff_pb_1]                     \r\n"
1598         "psubb $f10, $f10, $f12                         \r\n"
1599         "ldc1 $f24, 0x10+%[stack]                       \r\n"
1600         "pavgb $f10, $f10, $f0                          \r\n"
1601         "psrlh $f14, $f14, $f20                         \r\n"
1602         "pavgb $f10, $f10, $f24                         \r\n"
1603         "pavgb $f14, $f14, $f30                         \r\n"
1604         "xor $f14, $f14, $f10                           \r\n"
1605         "and $f14, $f14, %[ff_pb_1]                     \r\n"
1606         "psubb $f10, $f10, $f14                         \r\n"
1607         "xor $f14, $f2, $f6                             \r\n"
1608         "pavgb $f12, $f2, $f6                           \r\n"
1609         "and $f14, $f14, %[ff_pb_1]                     \r\n"
1610         "psubb $f12, $f12, $f14                         \r\n"
1611         "ldc1 $f24, 0x30+%[stack]                       \r\n"
1612         "pavgb $f12, $f12, $f0                          \r\n"
1613         "ldc1 $f22, 0x20+%[stack]                       \r\n"
1614         "xor $f10, $f10, $f12                           \r\n"
1615         "xor $f12, $f12, $f2                            \r\n"
1616         "and $f10, $f10, $f24                           \r\n"
1617         "and $f12, $f12, $f22                           \r\n"
1618         "xor $f10, $f10, $f12                           \r\n"
1619         "xor $f10, $f10, $f2                            \r\n"
1620         "gssdxc1 $f10, 0x0($8, $9)                      \r\n"
1621         "ldc1 $f10, 0x0($8)                             \r\n"
1622         "paddb $f12, $f28, $f10                         \r\n"
1623         "pavgb $f10, $f10, $f28                         \r\n"
1624         "ldc1 $f22, 0x0+%[stack]                        \r\n"
1625         "pavgb $f10, $f10, $f8                          \r\n"
1626         "paddb $f12, $f12, $f12                         \r\n"
1627         "paddb $f12, $f12, $f22                         \r\n"
1628         "psrlh $f12, $f12, $f20                         \r\n"
1629         "pavgb $f12, $f12, $f30                         \r\n"
1630         "xor $f12, $f12, $f10                           \r\n"
1631         "and $f12, $f12, %[ff_pb_1]                     \r\n"
1632         "ldc1 $f22, 0x30+%[stack]                       \r\n"
1633         "psubb $f10, $f10, $f12                         \r\n"
1634         "xor $f8, $f8, $f0                              \r\n"
1635         "xor $f10, $f10, $f28                           \r\n"
1636         "and $f8, $f8, $f22                             \r\n"
1637         "and $f10, $f10, $f22                           \r\n"
1638         "xor $f8, $f8, $f0                              \r\n"
1639         "xor $f10, $f10, $f28                           \r\n"
1640         "gssdxc1 $f8, 0x0($8, $10)                      \r\n"
1641         "gssdxc1 $f10, 0x0($8, %[stride])               \r\n"
1642         "pavgb $f8, $f26, $f6                           \r\n"
1643         "pavgb $f10, $f4, $f2                           \r\n"
1644         "pavgb $f8, $f8, $f10                           \r\n"
1645         "sdc1 $f10, 0x10+%[stack]                       \r\n"
1646         "paddb $f12, $f26, $f6                          \r\n"
1647         "paddb $f14, $f4, $f2                           \r\n"
1648         "paddb $f12, $f12, $f14                         \r\n"
1649         "mov.d $f14, $f12                               \r\n"
1650         "sdc1 $f12, 0x0+%[stack]                        \r\n"
1651         "psrlh $f12, $f12, $f16                         \r\n"
1652         "pavgb $f12, $f12, $f30                         \r\n"
1653         "xor $f12, $f12, $f8                            \r\n"
1654         "and $f12, $f12, %[ff_pb_1]                     \r\n"
1655         "psubb $f8, $f8, $f12                           \r\n"
1656         "pavgb $f10, $f26, $f0                          \r\n"
1657         "paddb $f14, $f14, $f14                         \r\n"
1658         "psubb $f12, $f26, $f0                          \r\n"
1659         "psubb $f14, $f14, $f12                         \r\n"
1660         "and $f12, $f12, %[ff_pb_1]                     \r\n"
1661         "psubb $f10, $f10, $f12                         \r\n"
1662         "ldc1 $f22, 0x10+%[stack]                       \r\n"
1663         "pavgb $f10, $f10, $f6                          \r\n"
1664         "pavgb $f10, $f10, $f22                         \r\n"
1665         "psrlh $f14, $f14, $f20                         \r\n"
1666         "pavgb $f14, $f14, $f30                         \r\n"
1667         "xor $f14, $f14, $f10                           \r\n"
1668         "and $f14, $f14, %[ff_pb_1]                     \r\n"
1669         "psubb $f10, $f10, $f14                         \r\n"
1670         "xor $f14, $f4, $f0                             \r\n"
1671         "pavgb $f12, $f4, $f0                           \r\n"
1672         "and $f14, $f14, %[ff_pb_1]                     \r\n"
1673         "ldc1 $f22, 0x40+%[stack]                       \r\n"
1674         "psubb $f12, $f12, $f14                         \r\n"
1675         "ldc1 $f24, 0x20+%[stack]                       \r\n"
1676         "pavgb $f12, $f12, $f6                          \r\n"
1677         "xor $f10, $f10, $f12                           \r\n"
1678         "xor $f12, $f12, $f4                            \r\n"
1679         "and $f10, $f10, $f22                           \r\n"
1680         "and $f12, $f12, $f24                           \r\n"
1681         "xor $f10, $f10, $f12                           \r\n"
1682         "xor $f10, $f10, $f4                            \r\n"
1683         "sdc1 $f10, 0x0(%[pix])                         \r\n"
1684         "gsldxc1 $f10, 0x0(%[pix], $9)                  \r\n"
1685         "paddb $f12, $f26, $f10                         \r\n"
1686         "pavgb $f10, $f10, $f26                         \r\n"
1687         "ldc1 $f22, 0x0+%[stack]                        \r\n"
1688         "pavgb $f10, $f10, $f8                          \r\n"
1689         "paddb $f12, $f12, $f12                         \r\n"
1690         "paddb $f12, $f12, $f22                         \r\n"
1691         "psrlh $f12, $f12, $f20                         \r\n"
1692         "pavgb $f12, $f12, $f30                         \r\n"
1693         "xor $f12, $f12, $f10                           \r\n"
1694         "and $f12, $f12, %[ff_pb_1]                     \r\n"
1695         "ldc1 $f22, 0x40+%[stack]                       \r\n"
1696         "psubb $f10, $f10, $f12                         \r\n"
1697         "xor $f8, $f8, $f6                              \r\n"
1698         "xor $f10, $f10, $f26                           \r\n"
1699         "and $f8, $f8, $f22                             \r\n"
1700         "and $f10, $f10, $f22                           \r\n"
1701         "xor $f8, $f8, $f6                              \r\n"
1702         "xor $f10, $f10, $f26                           \r\n"
1703         "gssdxc1 $f8, 0x0(%[pix], %[stride])            \r\n"
1704         "gssdxc1 $f10, 0x0(%[pix], $10)                 \r\n"
1705         "1:                                             \r\n"
1706         ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
1707           [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),
1708           [stack]"m"(stack[0]),[ff_pb_1]"f"(ff_pb_1)
1709         : "$8","$9","$10","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14",
1710           "$f16","$f18","$f20","$f22","$f24","$f26","$f28","$f30"
1711     );
1712 }
1713
/**
 * Vertical-direction chroma deblocking entry point (inter variant, uses tc0).
 *
 * Loads four 8-byte rows around pix into FPU registers, calls
 * chroma_inter_body_mmi() (presumably operating on those registers), and
 * writes the two middle rows back:
 *   $f0 = row at pix - 2 * stride   (read only)
 *   $f2 = row at pix - stride       (read, written back)
 *   $f4 = row at pix                (read, written back)
 *   $f6 = row at pix + stride       (read only)
 *
 * @param pix    row that is loaded into $f4; the rows from pix - 2 * stride
 *               to pix + stride are accessed
 * @param stride byte distance between consecutive rows
 * @param alpha  handed to the helper after being decremented by one
 * @param beta   handed to the helper after being decremented by one
 * @param tc0    handed to the helper unchanged
 *
 * NOTE(review): $16 and $f0-$f6 are expected to keep their values from one
 * asm statement to the next and across the helper call. The constraints do
 * not express that dependency, so it only holds as long as the compiler does
 * not allocate those registers in between.
 */
void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
        int8_t *tc0)
{
    __asm__ volatile (
        /* alpha - 1 and beta - 1 are what the helper receives */
        "daddiu %[alpha], %[alpha], -0x1                \r\n"
        "daddiu %[beta], %[beta], -0x1                  \r\n"
        /* $16 = pix - 2 * stride */
        "or $16, $0, %[pix]                             \r\n"
        "dsubu $16, $16, %[stride]                      \r\n"
        "dsubu $16, $16, %[stride]                      \r\n"
        /* load the four rows into $f0, $f2, $f4, $f6 */
        "ldc1 $f0, 0x0($16)                             \r\n"
        "gsldxc1 $f2, 0x0($16, %[stride])               \r\n"
        "ldc1 $f4, 0x0(%[pix])                          \r\n"
        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
        : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
          [beta]"+r"(beta)
        : [tc0]"r"(tc0)
        /*
         * Every register written above must be declared: $f0 and $f6 were
         * missing here although the intra variant lists them for the same
         * load sequence. "memory" covers the loads done through pix.
         */
        : "$16","$f0","$f2","$f4","$f6","memory"
    );

    chroma_inter_body_mmi(pix, stride, alpha, beta, tc0);

    __asm__ volatile (
        /* write back the rows held in $f2 (at $16 + stride) and $f4 (at pix) */
        "gssdxc1 $f2, 0x0($16, %[stride])               \r\n"
        "sdc1 $f4, 0x0(%[pix])                          \r\n"
        ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
        /* "memory": the two stores go through pix, not through an operand */
        : "$16","$f2","$f4","memory"
    );
}
1742
/*
 * Vertical-direction chroma deblocking entry point (intra variant, no tc0).
 *
 * Loads four 8-byte rows around pix into FPU registers, calls
 * chroma_intra_body_mmi(), and writes the two middle rows back:
 *   $f0 = row at pix - 2 * stride   (read only)
 *   $f2 = row at pix - stride       (read, written back)
 *   $f4 = row at pix                (read, written back)
 *   $f6 = row at pix + stride       (read only)
 *
 * alpha and beta are each decremented by one before being handed to the
 * helper; pix and stride are passed on unchanged.
 *
 * NOTE(review): $16 and $f0-$f6 are expected to keep their values from one
 * asm statement to the next and across the helper call; the constraints do
 * not express that dependency. Neither statement has a memory clobber even
 * though both access memory through pix - confirm this is acceptable for the
 * supported compilers.
 */
1743 void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
1744         int beta)
1745 {
1746     __asm__ volatile (
             /* alpha - 1 and beta - 1 are what the helper receives */
1747         "daddiu %[alpha], %[alpha], -0x1                \r\n"
1748         "daddiu %[beta], %[beta], -0x1                  \r\n"
             /* $16 = pix - 2 * stride */
1749         "or $16, $0, %[pix]                             \r\n"
1750         "dsubu $16, $16, %[stride]                      \r\n"
1751         "dsubu $16, $16, %[stride]                      \r\n"
             /* load the four rows into $f0, $f2, $f4, $f6 */
1752         "ldc1 $f0, 0x0($16)                             \r\n"
1753         "gsldxc1 $f2, 0x0($16, %[stride])               \r\n"
1754         "ldc1 $f4, 0x0(%[pix])                          \r\n"
1755         "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
1756         : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
1757           [beta]"+r"(beta)
1758         ::"$16","$f0","$f2","$f4","$f6"
1759     );
1760
1761     chroma_intra_body_mmi(pix, stride, alpha, beta);
1762
1763     __asm__ volatile (
             /* write back the rows held in $f2 (at $16 + stride) and $f4 (at pix) */
1764         "gssdxc1 $f2, 0x0($16, %[stride])               \r\n"
1765         "sdc1 $f4, 0x0(%[pix])                          \r\n"
1766         ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
1767         : "$16","$f2","$f4"
1768     );
1769 }
1770
/*
 * Horizontal-direction chroma deblocking entry point (inter variant, uses
 * tc0).
 *
 * Gathers a 4-byte-wide, 8-row strip that starts at pix - 2, rearranges it
 * into one register per byte column, calls chroma_inter_body_mmi(), then
 * rearranges the registers back into rows and stores 4 bytes per row.
 *
 * Address registers set up by the first asm statement and reused by the last:
 *   $16 = 2 * stride, $17 = 3 * stride, $19 = 4 * stride
 *   $18 = pix - 2                 (row 0)
 *   pix = pix - 2 + 3 * stride    (row 3; the C variable itself is updated
 *                                  through its read-write operand)
 *
 * After the unpack sequence, assuming MMX-like semantics for the punpck*
 * instructions, $f0, $f2, $f4 and $f6 each hold one byte column of the strip
 * (at pix - 2, pix - 1, pix and pix + 1 of the original pix) for rows 0-7.
 * $f0 and $f6 are copied to $f20 and $f22, and the write-back takes the two
 * outer columns from those copies.
 *
 * alpha and beta are each decremented by one before being handed to the
 * helper; tc0 is passed on unchanged.
 *
 * NOTE(review): the address registers and $f2, $f4, $f20, $f22 are expected
 * to keep their values from one asm statement to the next and across the
 * helper call; the constraints do not express that dependency. Neither
 * statement has a memory clobber even though both access memory through
 * pix - confirm this is acceptable for the supported compilers.
 */
1771 void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
1772         int8_t *tc0)
1773 {
1774     __asm__ volatile (
             /* alpha - 1 and beta - 1 are what the helper receives */
1775         "daddiu %[alpha], %[alpha], -0x1                \r\n"
1776         "daddiu %[beta], %[beta], -0x1                  \r\n"
             /* row offsets and base pointers: $18 = row 0, pix = row 3 */
1777         "daddu $16, %[stride], %[stride]                \r\n"
1778         "daddiu %[pix], %[pix], -0x2                    \r\n"
1779         "daddu $17, $16, %[stride]                      \r\n"
1780         "daddu $19, $16, $16                            \r\n"
1781         "or $18, $0, %[pix]                             \r\n"
1782         "daddu %[pix], %[pix], $17                      \r\n"
             /* rows 0-3: unaligned 4-byte loads (gslwlc1 + gslwrc1 pairs) */
1783         "gslwlc1 $f0, 0x3($18)                          \r\n"
1784         "daddu $12, $18, %[stride]                      \r\n"
1785         "gslwrc1 $f0, 0x0($18)                          \r\n"
1786         "gslwlc1 $f4, 0x3($12)                          \r\n"
1787         "daddu $13, $18, $16                            \r\n"
1788         "gslwrc1 $f4, 0x0($12)                          \r\n"
1789         "gslwlc1 $f2, 0x3($13)                          \r\n"
1790         "gslwrc1 $f2, 0x0($13)                          \r\n"
1791         "gslwlc1 $f6, 0x3(%[pix])                       \r\n"
1792         "gslwrc1 $f6, 0x0(%[pix])                       \r\n"
             /* interleave rows 0-3 by byte, then by halfword */
1793         "punpcklbh $f0, $f0, $f4                        \r\n"
1794         "punpcklbh $f2, $f2, $f6                        \r\n"
1795         "daddu $12, %[pix], %[stride]                   \r\n"
1796         "punpckhhw $f4, $f0, $f2                        \r\n"
1797         "punpcklhw $f0, $f0, $f2                        \r\n"
             /* rows 4-7, addressed relative to the updated pix (row 3) */
1798         "gslwlc1 $f8, 0x3($12)                          \r\n"
1799         "daddu $13, %[pix], $16                         \r\n"
1800         "gslwrc1 $f8, 0x0($12)                          \r\n"
1801         "gslwlc1 $f12, 0x3($13)                         \r\n"
1802         "daddu $12, %[pix], $17                         \r\n"
1803         "gslwrc1 $f12, 0x0($13)                         \r\n"
1804         "gslwlc1 $f10, 0x3($12)                         \r\n"
1805         "daddu $13, %[pix], $19                         \r\n"
1806         "gslwrc1 $f10, 0x0($12)                         \r\n"
1807         "gslwlc1 $f14, 0x3($13)                         \r\n"
1808         "gslwrc1 $f14, 0x0($13)                         \r\n"
             /* interleave rows 4-7 the same way */
1809         "punpcklbh $f8, $f8, $f12                       \r\n"
1810         "punpcklbh $f10, $f10, $f14                     \r\n"
1811         "mov.d $f12, $f8                                \r\n"
1812         "punpcklhw $f8, $f8, $f10                       \r\n"
1813         "punpckhhw $f12, $f12, $f10                     \r\n"
             /* combine both halves by word: one byte column per register */
1814         "punpckhwd $f2, $f0, $f8                        \r\n"
1815         "punpckhwd $f6, $f4, $f12                       \r\n"
1816         "punpcklwd $f0, $f0, $f8                        \r\n"
1817         "punpcklwd $f4, $f4, $f12                       \r\n"
             /* keep copies of the outer columns for the write-back */
1818         "mov.d $f20, $f0                                \r\n"
1819         "mov.d $f22, $f6                                \r\n"
1820         : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
1821           [beta]"+r"(beta)
1822         ::"$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
1823           "$f10","$f12","$f14","$f20","$f22"
1824     );
1825
1826     chroma_inter_body_mmi(pix, stride, alpha, beta, tc0);
1827
1828     __asm__ volatile (
             /* set aside the upper words of the first three columns */
1829         "punpckhwd $f8, $f20, $f20                      \r\n"
1830         "punpckhwd $f10, $f2, $f2                       \r\n"
1831         "punpckhwd $f12, $f4, $f4                       \r\n"
             /* rebuild rows 0-3 from $f20, $f2, $f4, $f22 and store them */
1832         "punpcklbh $f0, $f20, $f2                       \r\n"
1833         "punpcklbh $f4, $f4, $f22                       \r\n"
1834         "punpcklhw $f2, $f0, $f4                        \r\n"
1835         "punpckhhw $f0, $f0, $f4                        \r\n"
1836         "gsswlc1 $f2, 0x3($18)                          \r\n"
1837         "gsswrc1 $f2, 0x0($18)                          \r\n"
1838         "daddu $12, $18, %[stride]                      \r\n"
1839         "punpckhwd $f2, $f2, $f2                        \r\n"
1840         "gsswlc1 $f2, 0x3($12)                          \r\n"
1841         "daddu $13, $18, $16                            \r\n"
1842         "gsswrc1 $f2, 0x0($12)                          \r\n"
1843         "gsswlc1 $f0, 0x3($13)                          \r\n"
1844         "gsswrc1 $f0, 0x0($13)                          \r\n"
1845         "punpckhwd $f0, $f0, $f0                        \r\n"
1846         "punpckhwd $f6, $f22, $f22                      \r\n"
1847         "gsswlc1 $f0, 0x3(%[pix])                       \r\n"
1848         "gsswrc1 $f0, 0x0(%[pix])                       \r\n"
             /* rebuild rows 4-7 from the set-aside upper words and store them */
1849         "punpcklbh $f8, $f8, $f10                       \r\n"
1850         "punpcklbh $f12, $f12, $f6                      \r\n"
1851         "daddu $12, %[pix], %[stride]                   \r\n"
1852         "punpcklhw $f10, $f8, $f12                      \r\n"
1853         "punpckhhw $f8, $f8, $f12                       \r\n"
1854         "gsswlc1 $f10, 0x3($12)                         \r\n"
1855         "gsswrc1 $f10, 0x0($12)                         \r\n"
1856         "punpckhwd $f10, $f10, $f10                     \r\n"
1857         "daddu $12, %[pix], $16                         \r\n"
1858         "daddu $13, %[pix], $17                         \r\n"
1859         "gsswlc1 $f10, 0x3($12)                         \r\n"
1860         "gsswrc1 $f10, 0x0($12)                         \r\n"
1861         "gsswlc1 $f8, 0x3($13)                          \r\n"
1862         "daddu $12, %[pix], $19                         \r\n"
1863         "punpckhwd $f20, $f8, $f8                       \r\n"
1864         "gsswrc1 $f8, 0x0($13)                          \r\n"
1865         "gsswlc1 $f20, 0x3($12)                         \r\n"
1866         "gsswrc1 $f20, 0x0($12)                         \r\n"
1867         ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
1868         : "$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
1869           "$f10","$f12","$f20"
1870     );
1871 }
1872
/*
 * Horizontal-direction chroma deblocking entry point (intra variant, no tc0).
 *
 * Gathers a 4-byte-wide, 8-row strip that starts at pix - 2, rearranges it
 * into one register per byte column, calls chroma_intra_body_mmi(), then
 * rearranges the registers back into rows and stores 4 bytes per row.
 *
 * Address registers set up by the first asm statement and reused by the last:
 *   $16 = 2 * stride, $17 = 3 * stride, $19 = 4 * stride
 *   $18 = pix - 2                 (row 0)
 *   pix = pix - 2 + 3 * stride    (row 3; the C variable itself is updated
 *                                  through its read-write operand)
 *
 * After the unpack sequence, assuming MMX-like semantics for the punpck*
 * instructions, $f0, $f2, $f4 and $f6 each hold one byte column of the strip
 * (at pix - 2, pix - 1, pix and pix + 1 of the original pix) for rows 0-7.
 * Unlike the inter variant no copies are made: the write-back reads all four
 * columns straight from $f0, $f2, $f4 and $f6.
 *
 * alpha and beta are each decremented by one before being handed to the
 * helper.
 *
 * NOTE(review): the address registers and $f0-$f6 are expected to keep their
 * values from one asm statement to the next and across the helper call; the
 * constraints do not express that dependency. The first clobber list names
 * $f20 and $f22 although that statement does not write them. Neither
 * statement has a memory clobber even though both access memory through
 * pix - confirm this is acceptable for the supported compilers.
 */
1873 void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
1874         int beta)
1875 {
1876     __asm__ volatile (
             /* alpha - 1 and beta - 1 are what the helper receives */
1877         "daddiu %[alpha], %[alpha], -0x1                \r\n"
1878         "daddiu %[beta], %[beta], -0x1                  \r\n"
             /* row offsets and base pointers: $18 = row 0, pix = row 3 */
1879         "daddu $16, %[stride], %[stride]                \r\n"
1880         "daddiu %[pix], %[pix], -0x2                    \r\n"
1881         "daddu $17, $16, %[stride]                      \r\n"
1882         "daddu $19, $16, $16                            \r\n"
1883         "or $18, $0, %[pix]                             \r\n"
1884         "daddu %[pix], %[pix], $17                      \r\n"
             /* rows 0-3: unaligned 4-byte loads (gslwlc1 + gslwrc1 pairs) */
1885         "gslwlc1 $f0, 0x3($18)                          \r\n"
1886         "daddu $12, $18, %[stride]                      \r\n"
1887         "gslwrc1 $f0, 0x0($18)                          \r\n"
1888         "gslwlc1 $f4, 0x3($12)                          \r\n"
1889         "daddu $13, $18, $16                            \r\n"
1890         "gslwrc1 $f4, 0x0($12)                          \r\n"
1891         "gslwlc1 $f2, 0x3($13)                          \r\n"
1892         "gslwrc1 $f2, 0x0($13)                          \r\n"
1893         "gslwlc1 $f6, 0x3(%[pix])                       \r\n"
1894         "gslwrc1 $f6, 0x0(%[pix])                       \r\n"
             /* interleave rows 0-3 by byte, then by halfword */
1895         "punpcklbh $f0, $f0, $f4                        \r\n"
1896         "punpcklbh $f2, $f2, $f6                        \r\n"
1897         "daddu $12, %[pix], %[stride]                   \r\n"
1898         "punpckhhw $f4, $f0, $f2                        \r\n"
1899         "punpcklhw $f0, $f0, $f2                        \r\n"
             /* rows 4-7, addressed relative to the updated pix (row 3) */
1900         "gslwlc1 $f8, 0x3($12)                          \r\n"
1901         "daddu $13, %[pix], $16                         \r\n"
1902         "gslwrc1 $f8, 0x0($12)                          \r\n"
1903         "gslwlc1 $f12, 0x3($13)                         \r\n"
1904         "daddu $12, %[pix], $17                         \r\n"
1905         "gslwrc1 $f12, 0x0($13)                         \r\n"
1906         "gslwlc1 $f10, 0x3($12)                         \r\n"
1907         "daddu $13, %[pix], $19                         \r\n"
1908         "gslwrc1 $f10, 0x0($12)                         \r\n"
1909         "gslwlc1 $f14, 0x3($13)                         \r\n"
1910         "gslwrc1 $f14, 0x0($13)                         \r\n"
             /* interleave rows 4-7 the same way */
1911         "punpcklbh $f8, $f8, $f12                       \r\n"
1912         "punpcklbh $f10, $f10, $f14                     \r\n"
1913         "mov.d $f12, $f8                                \r\n"
1914         "punpcklhw $f8, $f8, $f10                       \r\n"
1915         "punpckhhw $f12, $f12, $f10                     \r\n"
             /* combine both halves by word: one byte column per register */
1916         "punpckhwd $f2, $f0, $f8                        \r\n"
1917         "punpckhwd $f6, $f4, $f12                       \r\n"
1918         "punpcklwd $f0, $f0, $f8                        \r\n"
1919         "punpcklwd $f4, $f4, $f12                       \r\n"
1920         : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
1921           [beta]"+r"(beta)
1922         ::"$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
1923           "$f10","$f12","$f14","$f20","$f22"
1924     );
1925
1926     chroma_intra_body_mmi(pix, stride, alpha, beta);
1927
1928     __asm__ volatile (
             /* set aside the upper words of the first three columns */
1929         "punpckhwd $f8, $f0, $f0                        \r\n"
1930         "punpckhwd $f10, $f2, $f2                       \r\n"
1931         "punpckhwd $f12, $f4, $f4                       \r\n"
             /* rebuild rows 0-3 from $f0, $f2, $f4, $f6 and store them */
1932         "punpcklbh $f0, $f0, $f2                        \r\n"
1933         "punpcklbh $f4, $f4, $f6                        \r\n"
1934         "punpcklhw $f2, $f0, $f4                        \r\n"
1935         "punpckhhw $f0, $f0, $f4                        \r\n"
1936         "gsswlc1 $f2, 0x3($18)                          \r\n"
1937         "gsswrc1 $f2, 0x0($18)                          \r\n"
1938         "daddu $12, $18, %[stride]                      \r\n"
1939         "punpckhwd $f2, $f2, $f2                        \r\n"
1940         "gsswlc1 $f2, 0x3($12)                          \r\n"
1941         "daddu $13, $18, $16                            \r\n"
1942         "gsswrc1 $f2, 0x0($12)                          \r\n"
1943         "gsswlc1 $f0, 0x3($13)                          \r\n"
1944         "gsswrc1 $f0, 0x0($13)                          \r\n"
1945         "punpckhwd $f0, $f0, $f0                        \r\n"
1946         "punpckhwd $f6, $f6, $f6                        \r\n"
1947         "gsswlc1 $f0, 0x3(%[pix])                       \r\n"
1948         "gsswrc1 $f0, 0x0(%[pix])                       \r\n"
             /* rebuild rows 4-7 from the set-aside upper words and store them */
1949         "punpcklbh $f8, $f8, $f10                       \r\n"
1950         "punpcklbh $f12, $f12, $f6                      \r\n"
1951         "daddu $12, %[pix], %[stride]                   \r\n"
1952         "punpcklhw $f10, $f8, $f12                      \r\n"
1953         "punpckhhw $f8, $f8, $f12                       \r\n"
1954         "gsswlc1 $f10, 0x3($12)                         \r\n"
1955         "gsswrc1 $f10, 0x0($12)                         \r\n"
1956         "punpckhwd $f10, $f10, $f10                     \r\n"
1957         "daddu $12, %[pix], $16                         \r\n"
1958         "daddu $13, %[pix], $17                         \r\n"
1959         "gsswlc1 $f10, 0x3($12)                         \r\n"
1960         "gsswrc1 $f10, 0x0($12)                         \r\n"
1961         "gsswlc1 $f8, 0x3($13)                          \r\n"
1962         "daddu $12, %[pix], $19                         \r\n"
1963         "punpckhwd $f20, $f8, $f8                       \r\n"
1964         "gsswrc1 $f8, 0x0($13)                          \r\n"
1965         "gsswlc1 $f20, 0x3($12)                         \r\n"
1966         "gsswrc1 $f20, 0x0($12)                         \r\n"
1967         ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
1968         : "$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
1969           "$f10","$f12","$f20"
1970     );
1971 }
1972
/**
 * Luma deblocking wrapper: runs ff_deblock_v8_luma_8_mmi() on the half at
 * byte offset 0 and on the half at byte offset 8 of pix.
 *
 * Each half owns two consecutive tc0 entries (tc0[0..1] for the first,
 * tc0[2..3] for the second). A half is skipped when both of its entries are
 * negative.
 *
 * @param pix    start of the first half
 * @param stride forwarded unchanged
 * @param alpha  forwarded unchanged
 * @param beta   forwarded unchanged
 * @param tc0    four entries; the second half receives tc0 + 2
 */
void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
        int8_t *tc0)
{
    int half;

    for (half = 0; half < 2; half++) {
        int8_t *tc = tc0 + 2 * half;

        /* The AND of two signed values is negative only if both are. */
        if ((tc[0] & tc[1]) < 0)
            continue;

        ff_deblock_v8_luma_8_mmi(pix + 8 * half, stride, alpha, beta, tc);
    }
}
1981
/**
 * Intra luma deblocking wrapper: runs ff_deblock_v8_luma_intra_8_mmi() first
 * at byte offset 0 and then at byte offset 8 of pix, forwarding stride, alpha
 * and beta unchanged to both calls.
 */
void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
        int beta)
{
    int offset;

    for (offset = 0; offset < 16; offset += 8)
        ff_deblock_v8_luma_intra_8_mmi(pix + offset, stride, alpha, beta);
}
1988
/*
 * Horizontal-direction luma deblocking entry point (inter variant, uses tc0).
 *
 * The work is done by rearranging the pixels into a scratch buffer and
 * reusing the vertical filter:
 *   1. For 16 rows starting at pix - 4, 8 bytes per row are loaded and
 *      interleaved, and six 16-byte buffer rows (byte offsets 0x00-0x5f of
 *      stack) are written. Assuming MMX-like semantics for the punpck*
 *      instructions these are the byte columns pix - 3 ... pix + 2.
 *   2. ff_deblock_v_luma_8_mmi() is called on the buffer with &stack[6]
 *      (byte offset 0x30) as pix and 0x10 as stride; alpha, beta and tc0 are
 *      forwarded unchanged.
 *   3. The buffer rows at offsets 0x10, 0x20, 0x30 and 0x40 are interleaved
 *      back and stored as 4 bytes per row at pix - 2 for the same 16 rows.
 *      The buffer rows at 0x00 and 0x50 are not written back.
 *
 * NOTE(review): stack is given to both asm statements as an input-only m
 * operand for stack[0], yet the first statement stores to offsets up to 0x58
 * from it, and neither statement has a memory clobber although both access
 * memory through pix. Confirm the compiler cannot reorder or drop accesses
 * around these statements.
 */
1989 void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
1990         int8_t *tc0)
1991 {
1992     uint64_t stack[0xd];
1993
1994     __asm__ volatile (
             /* $15 = 2 * stride, $9 = 3 * stride, $8 = pix - 4 (row 0), $10 = row 3 */
             /* rows 0-6: unaligned 8-byte loads (gsldlc1 + gsldrc1 pairs) */
1995         "daddu $15, %[stride], %[stride]                \r\n"
1996         "daddiu $8, %[pix], -0x4                        \r\n"
1997         "daddu $9, %[stride], $15                       \r\n"
1998         "gsldlc1 $f0, 0x7($8)                           \r\n"
1999         "gsldrc1 $f0, 0x0($8)                           \r\n"
2000         "daddu $12, $8, %[stride]                       \r\n"
2001         "daddu $10, $8, $9                              \r\n"
2002         "gsldlc1 $f2, 0x7($12)                          \r\n"
2003         "daddu $11, $8, $15                             \r\n"
2004         "gsldrc1 $f2, 0x0($12)                          \r\n"
2005         "gsldlc1 $f4, 0x7($11)                          \r\n"
2006         "gsldrc1 $f4, 0x0($11)                          \r\n"
2007         "gsldlc1 $f6, 0x7($10)                          \r\n"
2008         "daddu $12, $10, %[stride]                      \r\n"
2009         "gsldrc1 $f6, 0x0($10)                          \r\n"
2010         "gsldlc1 $f8, 0x7($12)                          \r\n"
2011         "daddu $11, $10, $15                            \r\n"
2012         "gsldrc1 $f8, 0x0($12)                          \r\n"
2013         "gsldlc1 $f10, 0x7($11)                         \r\n"
2014         "daddu $12, $10, $9                             \r\n"
2015         "gsldrc1 $f10, 0x0($11)                         \r\n"
2016         "gsldlc1 $f12, 0x7($12)                         \r\n"
2017         "gsldrc1 $f12, 0x0($12)                         \r\n"
             /* $14 = 4 * stride; interleave row pairs by byte */
2018         "daddu $14, $15, $15                            \r\n"
2019         "punpckhbh $f14, $f0, $f2                       \r\n"
2020         "punpcklbh $f0, $f0, $f2                        \r\n"
2021         "punpckhbh $f2, $f4, $f6                        \r\n"
2022         "punpcklbh $f4, $f4, $f6                        \r\n"
2023         "punpckhbh $f6, $f8, $f10                       \r\n"
2024         "punpcklbh $f8, $f8, $f10                       \r\n"
             /* $f2 is parked at stack + 0x10 and reloaded into $f16 further down */
2025         "daddu $12, $10, $14                            \r\n"
2026         "sdc1 $f2, 0x10+%[stack]                        \r\n"
             /* row 7 is loaded into $f16; $13 = 8 * stride */
2027         "gsldlc1 $f16, 0x7($12)                         \r\n"
2028         "gsldrc1 $f16, 0x0($12)                         \r\n"
2029         "daddu $13, $14, $14                            \r\n"
2030         "punpckhbh $f10, $f12, $f16                     \r\n"
2031         "punpcklbh $f12, $f12, $f16                     \r\n"
             /* halfword and word interleaves; first result goes to stack + 0x00 */
2032         "punpckhhw $f2, $f0, $f4                        \r\n"
2033         "punpcklhw $f0, $f0, $f4                        \r\n"
2034         "punpckhhw $f4, $f8, $f12                       \r\n"
2035         "punpcklhw $f8, $f8, $f12                       \r\n"
2036         "ldc1 $f16, 0x10+%[stack]                       \r\n"
2037         "punpckhwd $f0, $f0, $f8                        \r\n"
2038         "sdc1 $f0, 0x0+%[stack]                         \r\n"
2039         "punpckhhw $f12, $f14, $f16                     \r\n"
2040         "punpcklhw $f14, $f14, $f16                     \r\n"
2041         "punpckhhw $f0, $f6, $f10                       \r\n"
2042         "punpcklhw $f6, $f6, $f10                       \r\n"
2043         "punpcklwd $f12, $f12, $f0                      \r\n"
2044         "punpckhwd $f10, $f14, $f6                      \r\n"
2045         "punpcklwd $f14, $f14, $f6                      \r\n"
2046         "punpckhwd $f6, $f2, $f4                        \r\n"
2047         "punpcklwd $f2, $f2, $f4                        \r\n"
             /* remaining five results for rows 0-7 go to stack + 0x10 ... 0x50 */
2048         "sdc1 $f2, 0x10+%[stack]                        \r\n"
2049         "sdc1 $f6, 0x20+%[stack]                        \r\n"
2050         "sdc1 $f14, 0x30+%[stack]                       \r\n"
2051         "sdc1 $f10, 0x40+%[stack]                       \r\n"
2052         "sdc1 $f12, 0x50+%[stack]                       \r\n"
             /* advance both row pointers by 8 * stride and repeat for rows 8-15 */
2053         "daddu $8, $8, $13                              \r\n"
2054         "daddu $10, $10, $13                            \r\n"
2055         "gsldlc1 $f0, 0x7($8)                           \r\n"
2056         "daddu $12, $8, %[stride]                       \r\n"
2057         "gsldrc1 $f0, 0x0($8)                           \r\n"
2058         "gsldlc1 $f2, 0x7($12)                          \r\n"
2059         "daddu $11, $8, $15                             \r\n"
2060         "gsldrc1 $f2, 0x0($12)                          \r\n"
2061         "gsldlc1 $f4, 0x7($11)                          \r\n"
2062         "gsldrc1 $f4, 0x0($11)                          \r\n"
2063         "gsldlc1 $f6, 0x7($10)                          \r\n"
2064         "daddu $12, $10, %[stride]                      \r\n"
2065         "gsldrc1 $f6, 0x0($10)                          \r\n"
2066         "gsldlc1 $f8, 0x7($12)                          \r\n"
2067         "daddu $11, $10, $15                            \r\n"
2068         "gsldrc1 $f8, 0x0($12)                          \r\n"
2069         "gsldlc1 $f10, 0x7($11)                         \r\n"
2070         "daddu $12, $10, $9                             \r\n"
2071         "gsldrc1 $f10, 0x0($11)                         \r\n"
2072         "gsldlc1 $f12, 0x7($12)                         \r\n"
2073         "gsldrc1 $f12, 0x0($12)                         \r\n"
2074         "punpckhbh $f14, $f0, $f2                       \r\n"
2075         "punpcklbh $f0, $f0, $f2                        \r\n"
2076         "punpckhbh $f2, $f4, $f6                        \r\n"
2077         "punpcklbh $f4, $f4, $f6                        \r\n"
2078         "punpckhbh $f6, $f8, $f10                       \r\n"
2079         "punpcklbh $f8, $f8, $f10                       \r\n"
             /* this time $f2 is parked at stack + 0x18 */
2080         "daddu $12, $10, $14                            \r\n"
2081         "sdc1 $f2, 0x18+%[stack]                        \r\n"
2082         "gsldlc1 $f16, 0x7($12)                         \r\n"
2083         "gsldrc1 $f16, 0x0($12)                         \r\n"
2084         "punpckhhw $f2, $f0, $f4                        \r\n"
2085         "punpckhbh $f10, $f12, $f16                     \r\n"
2086         "punpcklbh $f12, $f12, $f16                     \r\n"
2087         "punpcklhw $f0, $f0, $f4                        \r\n"
2088         "punpckhhw $f4, $f8, $f12                       \r\n"
2089         "punpcklhw $f8, $f8, $f12                       \r\n"
2090         "punpckhwd $f0, $f0, $f8                        \r\n"
2091         "ldc1 $f16, 0x18+%[stack]                       \r\n"
2092         "sdc1 $f0, 0x8+%[stack]                         \r\n"
2093         "punpckhhw $f12, $f14, $f16                     \r\n"
2094         "punpcklhw $f14, $f14, $f16                     \r\n"
2095         "punpckhhw $f0, $f6, $f10                       \r\n"
2096         "punpcklhw $f6, $f6, $f10                       \r\n"
2097         "punpckhwd $f10, $f14, $f6                      \r\n"
2098         "punpcklwd $f14, $f14, $f6                      \r\n"
2099         "punpckhwd $f6, $f2, $f4                        \r\n"
2100         "punpcklwd $f2, $f2, $f4                        \r\n"
2101         "punpcklwd $f12, $f12, $f0                      \r\n"
             /* results for rows 8-15 fill the second half of each buffer row */
2102         "sdc1 $f2, 0x18+%[stack]                        \r\n"
2103         "sdc1 $f6, 0x28+%[stack]                        \r\n"
2104         "sdc1 $f14, 0x38+%[stack]                       \r\n"
2105         "sdc1 $f10, 0x48+%[stack]                       \r\n"
2106         "sdc1 $f12, 0x58+%[stack]                       \r\n"
2107         ::[pix]"r"(pix),[stride]"r"((int64_t)stride),[stack]"m"(stack[0])
2108         : "$8","$9","$10","$11","$12","$13","$14","$15","$f0","$f2","$f4",
2109           "$f6","$f8","$f10","$f12","$f14","$f16"
2110     );
2111
         /* filter the buffer in place: &stack[6] is byte offset 0x30, rows are 0x10 bytes apart */
2112     ff_deblock_v_luma_8_mmi((uint8_t *) &stack[6], 0x10, alpha, beta, tc0);
2113
2114     __asm__ volatile (
             /* $15, $9, $14, $13 = 2, 3, 4, 8 * stride; $8 = pix - 2 (row 0), $10 = row 3 */
2115         "daddu $15, %[stride], %[stride]                \r\n"
2116         "daddiu $8, %[pix], -0x2                        \r\n"
2117         "daddu $14, $15, $15                            \r\n"
2118         "daddu $9, $15, %[stride]                       \r\n"
2119         "daddu $13, $14, $14                            \r\n"
2120         "daddu $10, $8, $9                              \r\n"
             /* rows 0-7: reload buffer rows 0x10-0x40 and interleave them back */
2121         "ldc1 $f0, 0x10+%[stack]                        \r\n"
2122         "ldc1 $f2, 0x20+%[stack]                        \r\n"
2123         "ldc1 $f4, 0x30+%[stack]                        \r\n"
2124         "ldc1 $f6, 0x40+%[stack]                        \r\n"
2125         "punpckhwd $f8, $f0, $f0                        \r\n"
2126         "punpckhwd $f10, $f2, $f2                       \r\n"
2127         "punpckhwd $f12, $f4, $f4                       \r\n"
2128         "punpcklbh $f0, $f0, $f2                        \r\n"
2129         "punpcklbh $f4, $f4, $f6                        \r\n"
2130         "punpcklhw $f2, $f0, $f4                        \r\n"
2131         "punpckhhw $f0, $f0, $f4                        \r\n"
             /* unaligned 4-byte stores (gsswlc1 + gsswrc1 pairs), one per row */
2132         "gsswlc1 $f2, 0x3($8)                           \r\n"
2133         "gsswrc1 $f2, 0x0($8)                           \r\n"
2134         "daddu $12, $8, %[stride]                       \r\n"
2135         "punpckhwd $f2, $f2, $f2                        \r\n"
2136         "daddu $11, $8, $15                             \r\n"
2137         "gsswlc1 $f2, 0x3($12)                          \r\n"
2138         "gsswrc1 $f2, 0x0($12)                          \r\n"
2139         "gsswlc1 $f0, 0x3($11)                          \r\n"
2140         "gsswrc1 $f0, 0x0($11)                          \r\n"
2141         "punpckhwd $f0, $f0, $f0                        \r\n"
2142         "punpckhwd $f6, $f6, $f6                        \r\n"
2143         "gsswlc1 $f0, 0x3($10)                          \r\n"
2144         "gsswrc1 $f0, 0x0($10)                          \r\n"
2145         "punpcklbh $f8, $f8, $f10                       \r\n"
2146         "punpcklbh $f12, $f12, $f6                      \r\n"
2147         "punpcklhw $f10, $f8, $f12                      \r\n"
2148         "daddu $12, $10, %[stride]                      \r\n"
2149         "punpckhhw $f8, $f8, $f12                       \r\n"
2150         "gsswlc1 $f10, 0x3($12)                         \r\n"
2151         "gsswrc1 $f10, 0x0($12)                         \r\n"
2152         "daddu $12, $10, $15                            \r\n"
2153         "punpckhwd $f10, $f10, $f10                     \r\n"
2154         "daddu $11, $10, $9                             \r\n"
2155         "gsswlc1 $f10, 0x3($12)                         \r\n"
2156         "gsswrc1 $f10, 0x0($12)                         \r\n"
2157         "gsswlc1 $f8, 0x3($11)                          \r\n"
2158         "gsswrc1 $f8, 0x0($11)                          \r\n"
2159         "daddu $12, $10, $14                            \r\n"
2160         "punpckhwd $f8, $f8, $f8                        \r\n"
             /* advance both row pointers by 8 * stride for rows 8-15 */
2161         "daddu $8, $8, $13                              \r\n"
2162         "gsswlc1 $f8, 0x3($12)                          \r\n"
2163         "gsswrc1 $f8, 0x0($12)                          \r\n"
2164         "daddu $10, $10, $13                            \r\n"
             /* rows 8-15: same again from buffer offsets 0x18-0x48 */
2165         "ldc1 $f0, 0x18+%[stack]                        \r\n"
2166         "ldc1 $f2, 0x28+%[stack]                        \r\n"
2167         "ldc1 $f4, 0x38+%[stack]                        \r\n"
2168         "ldc1 $f6, 0x48+%[stack]                        \r\n"
2169         "daddu $15, %[stride], %[stride]                \r\n"
2170         "punpckhwd $f8, $f0, $f0                        \r\n"
2171         "daddu $14, $15, $15                            \r\n"
2172         "punpckhwd $f10, $f2, $f2                       \r\n"
2173         "punpckhwd $f12, $f4, $f4                       \r\n"
2174         "punpcklbh $f0, $f0, $f2                        \r\n"
2175         "punpcklbh $f4, $f4, $f6                        \r\n"
2176         "daddu $12, $8, %[stride]                       \r\n"
2177         "punpcklhw $f2, $f0, $f4                        \r\n"
2178         "punpckhhw $f0, $f0, $f4                        \r\n"
2179         "gsswlc1 $f2, 0x3($8)                           \r\n"
2180         "gsswrc1 $f2, 0x0($8)                           \r\n"
2181         "punpckhwd $f2, $f2, $f2                        \r\n"
2182         "daddu $11, $8, $15                             \r\n"
2183         "gsswlc1 $f2, 0x3($12)                          \r\n"
2184         "gsswrc1 $f2, 0x0($12)                          \r\n"
2185         "gsswlc1 $f0, 0x3($11)                          \r\n"
2186         "gsswrc1 $f0, 0x0($11)                          \r\n"
2187         "punpckhwd $f0, $f0, $f0                        \r\n"
2188         "punpckhwd $f6, $f6, $f6                        \r\n"
2189         "gsswlc1 $f0, 0x3($10)                          \r\n"
2190         "gsswrc1 $f0, 0x0($10)                          \r\n"
2191         "punpcklbh $f8, $f8, $f10                       \r\n"
2192         "punpcklbh $f12, $f12, $f6                      \r\n"
2193         "daddu $12, $10, %[stride]                      \r\n"
2194         "punpcklhw $f10, $f8, $f12                      \r\n"
2195         "punpckhhw $f8, $f8, $f12                       \r\n"
2196         "gsswlc1 $f10, 0x3($12)                         \r\n"
2197         "gsswrc1 $f10, 0x0($12)                         \r\n"
2198         "daddu $12, $10, $15                            \r\n"
2199         "punpckhwd $f10, $f10, $f10                     \r\n"
2200         "daddu $11, $10, $9                             \r\n"
2201         "gsswlc1 $f10, 0x3($12)                         \r\n"
2202         "gsswrc1 $f10, 0x0($12)                         \r\n"
2203         "gsswlc1 $f8, 0x3($11)                          \r\n"
2204         "gsswrc1 $f8, 0x0($11)                          \r\n"
2205         "daddu $12, $10, $14                            \r\n"
2206         "punpckhwd $f8, $f8, $f8                        \r\n"
2207         "gsswlc1 $f8, 0x3($12)                          \r\n"
2208         "gsswrc1 $f8, 0x0($12)                          \r\n"
2209         ::[pix]"r"(pix),[stride]"r"((int64_t)stride),[stack]"m"(stack[0])
2210         : "$8","$9","$10","$11","$12","$13","$14","$15","$f0","$f2","$f4",
2211           "$f6","$f8","$f10","$f12","$f14","$f16"
2212     );
2213 }
2214
/**
 * Horizontal-direction luma intra deblocking, implemented on top of
 * ff_deblock_v_luma_intra_8_mmi().
 *
 * The routine works in three steps:
 *  1. Load 16 rows of 8 bytes starting at pix - 4 (rows are `stride` bytes
 *     apart), transpose them as two 8x8 byte tiles and store the result in
 *     the scratch buffer ptmp, laid out as 8 rows of 16 bytes (row pitch
 *     0x10).
 *  2. Run ff_deblock_v_luma_intra_8_mmi() on the scratch buffer, pointing it
 *     at row 4 (&ptmp[8]) with a stride of 0x10.
 *  3. Transpose the scratch buffer back and store it over the same 16 rows
 *     of 8 bytes it was loaded from.
 *
 * Both asm statements write memory that is not described by an output
 * operand (ptmp, pdat and the picture rows), so they carry a "memory"
 * clobber to keep the compiler from caching or reordering accesses to that
 * memory across them.
 *
 * @param pix    the code accesses bytes pix - 4 .. pix + 3 on 16 consecutive
 *               rows
 * @param stride distance in bytes between two consecutive rows
 * @param alpha  passed unchanged to ff_deblock_v_luma_intra_8_mmi()
 * @param beta   passed unchanged to ff_deblock_v_luma_intra_8_mmi()
 */
void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
        int beta)
{
    /* Transposed pixels: 8 rows of 16 bytes, addressed as 0x10 * row. */
    uint64_t ptmp[0x11];
    /* Saves $10..$13 (row pointer and stride multiples) across the call. */
    uint64_t pdat[4];

    __asm__ volatile (
        /* $12 = 2*stride, $10 = pix - 4, $11 = 3*stride, $13 = 4*stride,
         * $9 = $10 + 3*stride; $8 and $14 are scratch row pointers. */
        "daddu $12, %[stride], %[stride]                \r\n"
        "daddiu $10, %[pix], -0x4                       \r\n"
        "daddu $11, $12, %[stride]                      \r\n"
        "daddu $13, $12, $12                            \r\n"
        "daddu $9, $10, $11                             \r\n"
        "daddu $8, $10, %[stride]                       \r\n"
        /* Unaligned 8-byte loads of rows 0..7. */
        "gsldlc1 $f0, 0x7($10)                          \r\n"
        "gsldrc1 $f0, 0x0($10)                          \r\n"
        "daddu $14, $10, $12                            \r\n"
        "gsldlc1 $f2, 0x7($8)                           \r\n"
        "gsldrc1 $f2, 0x0($8)                           \r\n"
        "gsldlc1 $f4, 0x7($14)                          \r\n"
        "gsldrc1 $f4, 0x0($14)                          \r\n"
        "daddu $8, $9, %[stride]                        \r\n"
        "gsldlc1 $f6, 0x7($9)                           \r\n"
        "gsldrc1 $f6, 0x0($9)                           \r\n"
        "daddu $14, $9, $12                             \r\n"
        "gsldlc1 $f8, 0x7($8)                           \r\n"
        "gsldrc1 $f8, 0x0($8)                           \r\n"
        "daddu $8, $9, $11                              \r\n"
        "gsldlc1 $f10, 0x7($14)                         \r\n"
        "gsldrc1 $f10, 0x0($14)                         \r\n"
        "gsldlc1 $f12, 0x7($8)                          \r\n"
        "gsldrc1 $f12, 0x0($8)                          \r\n"
        "daddu $8, $9, $13                              \r\n"
        /* Byte / halfword / word interleaves transposing the tile;
         * 0x0+ptmp and 0x20+ptmp double as spill slots on the way. */
        "punpckhbh $f14, $f0, $f2                       \r\n"
        "punpcklbh $f0, $f0, $f2                        \r\n"
        "punpckhbh $f2, $f4, $f6                        \r\n"
        "punpcklbh $f4, $f4, $f6                        \r\n"
        "punpckhbh $f6, $f8, $f10                       \r\n"
        "punpcklbh $f8, $f8, $f10                       \r\n"
        "gsldlc1 $f16, 0x7($8)                          \r\n"
        "gsldrc1 $f16, 0x0($8)                          \r\n"
        "punpckhbh $f10, $f12, $f16                     \r\n"
        "punpcklbh $f12, $f12, $f16                     \r\n"
        "sdc1 $f6, 0x0+%[ptmp]                          \r\n"
        "punpckhhw $f6, $f0, $f4                        \r\n"
        "punpcklhw $f0, $f0, $f4                        \r\n"
        "punpckhhw $f4, $f8, $f12                       \r\n"
        "punpcklhw $f8, $f8, $f12                       \r\n"
        "punpckhhw $f12, $f14, $f2                      \r\n"
        "punpcklhw $f14, $f14, $f2                      \r\n"
        "sdc1 $f4, 0x20+%[ptmp]                         \r\n"
        "ldc1 $f4, 0x0+%[ptmp]                          \r\n"
        "punpckhhw $f2, $f4, $f10                       \r\n"
        "punpcklhw $f4, $f4, $f10                       \r\n"
        "punpckhwd $f10, $f0, $f8                       \r\n"
        "punpcklwd $f0, $f0, $f8                        \r\n"
        "punpckhwd $f8, $f14, $f4                       \r\n"
        "punpcklwd $f14, $f14, $f4                      \r\n"
        /* First tile goes to the low 8 bytes of each 16-byte ptmp row. */
        "sdc1 $f0, 0x0+%[ptmp]                          \r\n"
        "sdc1 $f10, 0x10+%[ptmp]                        \r\n"
        "sdc1 $f14, 0x40+%[ptmp]                        \r\n"
        "sdc1 $f8, 0x50+%[ptmp]                         \r\n"
        "ldc1 $f16, 0x20+%[ptmp]                        \r\n"
        "punpckhwd $f0, $f6, $f16                       \r\n"
        "punpcklwd $f6, $f6, $f16                       \r\n"
        "punpckhwd $f10, $f12, $f2                      \r\n"
        "punpcklwd $f12, $f12, $f2                      \r\n"
        "daddu $8, $13, $13                             \r\n"
        "sdc1 $f6, 0x20+%[ptmp]                         \r\n"
        "sdc1 $f0, 0x30+%[ptmp]                         \r\n"
        "sdc1 $f12, 0x60+%[ptmp]                        \r\n"
        "sdc1 $f10, 0x70+%[ptmp]                        \r\n"
        /* Advance both row pointers by 8*stride and repeat for rows
         * 8..15, storing to the high 8 bytes of each ptmp row. */
        "daddu $10, $10, $8                             \r\n"
        "daddu $9, $9, $8                               \r\n"
        "daddu $8, $10, %[stride]                       \r\n"
        "gsldlc1 $f0, 0x7($10)                          \r\n"
        "gsldrc1 $f0, 0x0($10)                          \r\n"
        "daddu $14, $10, $12                            \r\n"
        "gsldlc1 $f2, 0x7($8)                           \r\n"
        "gsldrc1 $f2, 0x0($8)                           \r\n"
        "gsldlc1 $f4, 0x7($14)                          \r\n"
        "gsldrc1 $f4, 0x0($14)                          \r\n"
        "daddu $8, $9, %[stride]                        \r\n"
        "gsldlc1 $f6, 0x7($9)                           \r\n"
        "gsldrc1 $f6, 0x0($9)                           \r\n"
        "daddu $14, $9, $12                             \r\n"
        "gsldlc1 $f8, 0x7($8)                           \r\n"
        "gsldrc1 $f8, 0x0($8)                           \r\n"
        "daddu $8, $9, $11                              \r\n"
        "gsldlc1 $f10, 0x7($14)                         \r\n"
        "gsldrc1 $f10, 0x0($14)                         \r\n"
        "gsldlc1 $f12, 0x7($8)                          \r\n"
        "gsldrc1 $f12, 0x0($8)                          \r\n"
        "daddu $8, $9, $13                              \r\n"
        "punpckhbh $f14, $f0, $f2                       \r\n"
        "punpcklbh $f0, $f0, $f2                        \r\n"
        "punpckhbh $f2, $f4, $f6                        \r\n"
        "punpcklbh $f4, $f4, $f6                        \r\n"
        "punpckhbh $f6, $f8, $f10                       \r\n"
        "punpcklbh $f8, $f8, $f10                       \r\n"
        "gsldlc1 $f16, 0x7($8)                          \r\n"
        "gsldrc1 $f16, 0x0($8)                          \r\n"
        "punpckhbh $f10, $f12, $f16                     \r\n"
        "punpcklbh $f12, $f12, $f16                     \r\n"
        "sdc1 $f6, 0x8+%[ptmp]                          \r\n"
        "punpckhhw $f6, $f0, $f4                        \r\n"
        "punpcklhw $f0, $f0, $f4                        \r\n"
        "punpckhhw $f4, $f8, $f12                       \r\n"
        "punpcklhw $f8, $f8, $f12                       \r\n"
        "punpckhhw $f12, $f14, $f2                      \r\n"
        "punpcklhw $f14, $f14, $f2                      \r\n"
        "sdc1 $f4, 0x28+%[ptmp]                         \r\n"
        "ldc1 $f4, 0x8+%[ptmp]                          \r\n"
        "punpckhhw $f2, $f4, $f10                       \r\n"
        "punpcklhw $f4, $f4, $f10                       \r\n"
        "punpckhwd $f10, $f0, $f8                       \r\n"
        "punpcklwd $f0, $f0, $f8                        \r\n"
        "punpckhwd $f8, $f14, $f4                       \r\n"
        "punpcklwd $f14, $f14, $f4                      \r\n"
        "sdc1 $f0, 0x8+%[ptmp]                          \r\n"
        "sdc1 $f10, 0x18+%[ptmp]                        \r\n"
        "sdc1 $f14, 0x48+%[ptmp]                        \r\n"
        "sdc1 $f8, 0x58+%[ptmp]                         \r\n"
        "ldc1 $f16, 0x28+%[ptmp]                        \r\n"
        "punpckhwd $f0, $f6, $f16                       \r\n"
        "punpcklwd $f6, $f6, $f16                       \r\n"
        "punpckhwd $f10, $f12, $f2                      \r\n"
        "punpcklwd $f12, $f12, $f2                      \r\n"
        "sdc1 $f6, 0x28+%[ptmp]                         \r\n"
        "sdc1 $f0, 0x38+%[ptmp]                         \r\n"
        "sdc1 $f12, 0x68+%[ptmp]                        \r\n"
        "sdc1 $f10, 0x78+%[ptmp]                        \r\n"
        /* Save $10 (pix - 4 + 8*stride), 3*stride, 2*stride and 4*stride
         * for the second asm statement. */
        "sd $10, 0x00+%[pdat]                           \r\n"
        "sd $11, 0x08+%[pdat]                           \r\n"
        "sd $12, 0x10+%[pdat]                           \r\n"
        "sd $13, 0x18+%[pdat]                           \r\n"
        ::[pix]"r"(pix),[stride]"r"((uint64_t)stride),[ptmp]"m"(ptmp[0]),
          [pdat]"m"(pdat[0])
        : "$8","$9","$10","$11","$12","$13","$14","$f0","$f2","$f4","$f6",
          "$f8","$f10","$f12","$f14","$f16","memory"
    );

    /* Filter the transposed copy: row 4 of the scratch buffer, pitch 0x10. */
    ff_deblock_v_luma_intra_8_mmi((uint8_t *) &ptmp[8], 0x10, alpha, beta);

    __asm__ volatile (
        /* Restore the row pointer and stride multiples saved above;
         * $9 = $10 + 3*stride. */
        "ld $10, 0x00+%[pdat]                           \r\n"
        "ld $11, 0x08+%[pdat]                           \r\n"
        "ld $12, 0x10+%[pdat]                           \r\n"
        "ld $13, 0x18+%[pdat]                           \r\n"
        "daddu $9, $10, $11                             \r\n"
        /* High 8 bytes of each ptmp row -> picture rows 8..15. */
        "ldc1 $f0, 0x8+%[ptmp]                          \r\n"
        "ldc1 $f2, 0x18+%[ptmp]                         \r\n"
        "ldc1 $f4, 0x28+%[ptmp]                         \r\n"
        "ldc1 $f6, 0x38+%[ptmp]                         \r\n"
        "ldc1 $f8, 0x48+%[ptmp]                         \r\n"
        "ldc1 $f10, 0x58+%[ptmp]                        \r\n"
        "ldc1 $f12, 0x68+%[ptmp]                        \r\n"
        "punpckhbh $f14, $f0, $f2                       \r\n"
        "punpcklbh $f0, $f0, $f2                        \r\n"
        "punpckhbh $f2, $f4, $f6                        \r\n"
        "punpcklbh $f4, $f4, $f6                        \r\n"
        "punpckhbh $f6, $f8, $f10                       \r\n"
        "punpcklbh $f8, $f8, $f10                       \r\n"
        "ldc1 $f16, 0x78+%[ptmp]                        \r\n"
        "punpckhbh $f10, $f12, $f16                     \r\n"
        "punpcklbh $f12, $f12, $f16                     \r\n"
        /* The destination rows at $10 and $10 + 2*stride are used as
         * spill slots here; both are overwritten with final data below. */
        "gssdlc1 $f6, 0x7($10)                          \r\n"
        "gssdrc1 $f6, 0x0($10)                          \r\n"
        "daddu $8, $10, $12                             \r\n"
        "punpckhhw $f6, $f0, $f4                        \r\n"
        "punpcklhw $f0, $f0, $f4                        \r\n"
        "punpckhhw $f4, $f8, $f12                       \r\n"
        "punpcklhw $f8, $f8, $f12                       \r\n"
        "punpckhhw $f12, $f14, $f2                      \r\n"
        "punpcklhw $f14, $f14, $f2                      \r\n"
        "gssdlc1 $f4, 0x7($8)                           \r\n"
        "gssdrc1 $f4, 0x0($8)                           \r\n"
        "gsldlc1 $f4, 0x7($10)                          \r\n"
        "gsldrc1 $f4, 0x0($10)                          \r\n"
        "punpckhhw $f2, $f4, $f10                       \r\n"
        "punpcklhw $f4, $f4, $f10                       \r\n"
        "punpckhwd $f10, $f0, $f8                       \r\n"
        "punpcklwd $f0, $f0, $f8                        \r\n"
        "punpckhwd $f8, $f14, $f4                       \r\n"
        "punpcklwd $f14, $f14, $f4                      \r\n"
        "daddu $8, $10, %[stride]                       \r\n"
        "gssdlc1 $f0, 0x7($10)                          \r\n"
        "gssdrc1 $f0, 0x0($10)                          \r\n"
        "daddu $14, $9, %[stride]                       \r\n"
        "gssdlc1 $f10, 0x7($8)                          \r\n"
        "gssdrc1 $f10, 0x0($8)                          \r\n"
        "daddu $8, $9, $12                              \r\n"
        "gssdlc1 $f14, 0x7($14)                         \r\n"
        "gssdrc1 $f14, 0x0($14)                         \r\n"
        "daddu $14, $10, $12                            \r\n"
        "gssdlc1 $f8, 0x7($8)                           \r\n"
        "gssdrc1 $f8, 0x0($8)                           \r\n"
        "gsldlc1 $f16, 0x7($14)                         \r\n"
        "gsldrc1 $f16, 0x0($14)                         \r\n"
        "daddu $8, $10, $12                             \r\n"
        "punpckhwd $f0, $f6, $f16                       \r\n"
        "punpcklwd $f6, $f6, $f16                       \r\n"
        "punpckhwd $f10, $f12, $f2                      \r\n"
        "punpcklwd $f12, $f12, $f2                      \r\n"
        "gssdlc1 $f6, 0x7($8)                           \r\n"
        "gssdrc1 $f6, 0x0($8)                           \r\n"
        "daddu $8, $9, $11                              \r\n"
        "gssdlc1 $f0, 0x7($9)                           \r\n"
        "gssdrc1 $f0, 0x0($9)                           \r\n"
        "daddu $14, $9, $13                             \r\n"
        "gssdlc1 $f12, 0x7($8)                          \r\n"
        "gssdrc1 $f12, 0x0($8)                          \r\n"
        "daddu $8, $13, $13                             \r\n"
        "gssdlc1 $f10, 0x7($14)                         \r\n"
        "gssdrc1 $f10, 0x0($14)                         \r\n"
        /* Step both row pointers back by 8*stride, then write the low
         * 8 bytes of each ptmp row to picture rows 0..7. */
        "dsubu $10, $10, $8                             \r\n"
        "dsubu $9, $9, $8                               \r\n"
        "ldc1 $f0, 0x0+%[ptmp]                          \r\n"
        "ldc1 $f2, 0x10+%[ptmp]                         \r\n"
        "ldc1 $f4, 0x20+%[ptmp]                         \r\n"
        "ldc1 $f6, 0x30+%[ptmp]                         \r\n"
        "ldc1 $f8, 0x40+%[ptmp]                         \r\n"
        "ldc1 $f10, 0x50+%[ptmp]                        \r\n"
        "ldc1 $f12, 0x60+%[ptmp]                        \r\n"
        "punpckhbh $f14, $f0, $f2                       \r\n"
        "punpcklbh $f0, $f0, $f2                        \r\n"
        "punpckhbh $f2, $f4, $f6                        \r\n"
        "punpcklbh $f4, $f4, $f6                        \r\n"
        "punpckhbh $f6, $f8, $f10                       \r\n"
        "punpcklbh $f8, $f8, $f10                       \r\n"
        "ldc1 $f16, 0x70+%[ptmp]                        \r\n"
        "punpckhbh $f10, $f12, $f16                     \r\n"
        "punpcklbh $f12, $f12, $f16                     \r\n"
        "gssdlc1 $f6, 0x7($10)                          \r\n"
        "gssdrc1 $f6, 0x0($10)                          \r\n"
        "daddu $8, $10, $12                             \r\n"
        "punpckhhw $f6, $f0, $f4                        \r\n"
        "punpcklhw $f0, $f0, $f4                        \r\n"
        "punpckhhw $f4, $f8, $f12                       \r\n"
        "punpcklhw $f8, $f8, $f12                       \r\n"
        "punpckhhw $f12, $f14, $f2                      \r\n"
        "punpcklhw $f14, $f14, $f2                      \r\n"
        "gssdlc1 $f4, 0x7($8)                           \r\n"
        "gssdrc1 $f4, 0x0($8)                           \r\n"
        "gsldlc1 $f4, 0x7($10)                          \r\n"
        "gsldrc1 $f4, 0x0($10)                          \r\n"
        "punpckhhw $f2, $f4, $f10                       \r\n"
        "punpcklhw $f4, $f4, $f10                       \r\n"
        "punpckhwd $f10, $f0, $f8                       \r\n"
        "punpcklwd $f0, $f0, $f8                        \r\n"
        "punpckhwd $f8, $f14, $f4                       \r\n"
        "punpcklwd $f14, $f14, $f4                      \r\n"
        "daddu $8, $10, %[stride]                       \r\n"
        "gssdlc1 $f0, 0x7($10)                          \r\n"
        "gssdrc1 $f0, 0x0($10)                          \r\n"
        "daddu $14, $9, %[stride]                       \r\n"
        "gssdlc1 $f10, 0x7($8)                          \r\n"
        "gssdrc1 $f10, 0x0($8)                          \r\n"
        "daddu $8, $9, $12                              \r\n"
        "gssdlc1 $f14, 0x7($14)                         \r\n"
        "gssdrc1 $f14, 0x0($14)                         \r\n"
        "daddu $14, $10, $12                            \r\n"
        "gssdlc1 $f8, 0x7($8)                           \r\n"
        "gssdrc1 $f8, 0x0($8)                           \r\n"
        "gsldlc1 $f16, 0x7($14)                         \r\n"
        "gsldrc1 $f16, 0x0($14)                         \r\n"
        "daddu $8, $10, $12                             \r\n"
        "punpckhwd $f0, $f6, $f16                       \r\n"
        "punpcklwd $f6, $f6, $f16                       \r\n"
        "punpckhwd $f10, $f12, $f2                      \r\n"
        "punpcklwd $f12, $f12, $f2                      \r\n"
        "gssdlc1 $f6, 0x7($8)                           \r\n"
        "gssdrc1 $f6, 0x0($8)                           \r\n"
        "daddu $8, $9, $11                              \r\n"
        "gssdlc1 $f0, 0x7($9)                           \r\n"
        "gssdrc1 $f0, 0x0($9)                           \r\n"
        "daddu $14, $9, $13                             \r\n"
        "gssdlc1 $f12, 0x7($8)                          \r\n"
        "gssdrc1 $f12, 0x0($8)                          \r\n"
        "gssdlc1 $f10, 0x7($14)                         \r\n"
        "gssdrc1 $f10, 0x0($14)                         \r\n"
        ::[pix]"r"(pix),[stride]"r"((uint64_t)stride),[ptmp]"m"(ptmp[0]),
          [pdat]"m"(pdat[0])
        : "$8","$9","$10","$11","$12","$13","$14","$f0","$f2","$f4","$f6",
          "$f8","$f10","$f12","$f14","$f16","memory"
    );
}