]> git.sesse.net Git - ffmpeg/blob - libavcodec/mips/h264dsp_mmi.c
Merge commit '5e2203448ab4cc8ea1d933b87f1b39b009201044'
[ffmpeg] / libavcodec / mips / h264dsp_mmi.c
1 /*
2  * Loongson SIMD optimized h264dsp
3  *
4  * Copyright (c) 2015 Loongson Technology Corporation Limited
5  * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
7  *                    Heiher <r@hev.cc>
8  *
9  * This file is part of FFmpeg.
10  *
11  * FFmpeg is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * FFmpeg is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with FFmpeg; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24  */
25
26 #include "libavcodec/bit_depth_template.c"
27 #include "h264dsp_mips.h"
28 #include "libavutil/mips/asmdefs.h"
29
/**
 * Add a 4x4 block of 16-bit residuals to 8-bit pixels and clear the residuals.
 *
 * Each of the four destination rows is widened to 16 bits, summed with the
 * matching row of src, packed back to bytes with unsigned saturation
 * (packushb) and stored. Afterwards the 32 bytes of src are set to zero.
 *
 * @param dst    top-left pixel of the destination block; rows are accessed
 *               with unaligned loads/stores, so no alignment is required
 * @param src    16 residuals, four per row, read with ldc1 at offsets
 *               0x00/0x08/0x10/0x18 (ldc1 expects 8-byte alignment);
 *               zeroed on return
 * @param stride distance in bytes between consecutive rows of dst
 */
void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
{
    double ftmp[9];
    /* Scratch GPR for one 4-byte pixel row on its way to the FPU. */
    int32_t low32;

    __asm__ volatile (
        /* ftmp0 = 0: zero source for unpacking and for the upper pack half. */
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        /* ftmp1..ftmp4 = the four residual rows (4 x int16 each). */
        "ldc1       %[ftmp1],   0x00(%[src])                            \n\t"
        "ldc1       %[ftmp2],   0x08(%[src])                            \n\t"
        "ldc1       %[ftmp3],   0x10(%[src])                            \n\t"
        "ldc1       %[ftmp4],   0x18(%[src])                            \n\t"
        /*
         * Load exactly four pixels per row. Only the low 32 bits are moved
         * to the FPU by mtc1, so a 32-bit unaligned load (ulw) is enough; an
         * 8-byte load here would read 4 bytes past the end of each row.
         */
        "ulw        %[low32],   0x00(%[dst0])                           \n\t"
        "mtc1       %[low32],   %[ftmp5]                                \n\t"
        "ulw        %[low32],   0x00(%[dst1])                           \n\t"
        "mtc1       %[low32],   %[ftmp6]                                \n\t"
        "ulw        %[low32],   0x00(%[dst2])                           \n\t"
        "mtc1       %[low32],   %[ftmp7]                                \n\t"
        "ulw        %[low32],   0x00(%[dst3])                           \n\t"
        "mtc1       %[low32],   %[ftmp8]                                \n\t"
        /* Widen the pixels from 8 to 16 bits by interleaving with zero. */
        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
        /* residual + pixel, per 16-bit lane. */
        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
        /* Pack back to bytes with unsigned saturation. */
        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        /* Unaligned 4-byte stores of the four result rows. */
        "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t"
        "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
        "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t"
        "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
        "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t"
        "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
        "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t"
        "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [low32]"=&r"(low32)
        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
          [src]"r"(src)
        : "memory"
    );

    /* Clear all 16 residuals (16 * sizeof(int16_t) = 32 bytes). */
    memset(src, 0, 32);
}
83
84 void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
85 {
86     double ftmp[12];
87     uint64_t tmp[1];
88     uint64_t low32;
89
90     __asm__ volatile (
91         "dli        %[tmp0],    0x01                                    \n\t"
92         "ldc1       %[ftmp0],   0x00(%[block])                          \n\t"
93         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
94         "ldc1       %[ftmp1],   0x08(%[block])                          \n\t"
95         "dli        %[tmp0],    0x06                                    \n\t"
96         "ldc1       %[ftmp2],   0x10(%[block])                          \n\t"
97         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
98         "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
99         "ldc1       %[ftmp3],   0x18(%[block])                          \n\t"
100         "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
101         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
102         "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
103         "paddh      %[ftmp10],  %[ftmp2],       %[ftmp0]                \n\t"
104         "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
105         "paddh      %[ftmp11],  %[ftmp5],       %[ftmp10]               \n\t"
106         "psubh      %[ftmp2],   %[ftmp10],      %[ftmp5]                \n\t"
107         "paddh      %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
108         "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
109         "punpckhhw  %[ftmp1],   %[ftmp11],      %[ftmp10]               \n\t"
110         "punpcklhw  %[ftmp5],   %[ftmp11],      %[ftmp10]               \n\t"
111         "punpckhhw  %[ftmp4],   %[ftmp0],       %[ftmp2]                \n\t"
112         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
113         "punpckhwd  %[ftmp2],   %[ftmp5],       %[ftmp0]                \n\t"
114         "punpcklwd  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
115         "punpcklwd  %[ftmp10],  %[ftmp1],       %[ftmp4]                \n\t"
116         "punpckhwd  %[ftmp0],   %[ftmp1],       %[ftmp4]                \n\t"
117         "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_32]             \n\t"
118         "psrah      %[ftmp4],   %[ftmp2],       %[ftmp8]                \n\t"
119         "psrah      %[ftmp3],   %[ftmp0],       %[ftmp8]                \n\t"
120         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
121         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
122         "paddh      %[ftmp1],   %[ftmp10],      %[ftmp5]                \n\t"
123         "psubh      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
124         "paddh      %[ftmp10],  %[ftmp3],       %[ftmp1]                \n\t"
125         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
126         "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
127         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
128         "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
129         "sdc1       %[ftmp7],   0x00(%[block])                          \n\t"
130         "sdc1       %[ftmp7],   0x08(%[block])                          \n\t"
131         "sdc1       %[ftmp7],   0x10(%[block])                          \n\t"
132         "sdc1       %[ftmp7],   0x18(%[block])                          \n\t"
133         "uld        %[low32],   0x00(%[dst])                            \n\t"
134         "mtc1       %[low32],   %[ftmp2]                                \n\t"
135         "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
136         "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
137         "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
138         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
139         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
140         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
141         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
142         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
143         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
144         "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
145         "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
146         "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
147         PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
148         PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
149         "uld        %[low32],   0x00(%[dst])                            \n\t"
150         "mtc1       %[low32],   %[ftmp2]                                \n\t"
151         "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
152         "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
153         "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
154         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
155         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
156         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
157         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
158         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
159         "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
160         "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
161         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
162         "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
163         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
164           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
165           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
166           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
167           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
168           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
169           [tmp0]"=&r"(tmp[0]),
170           [low32]"=&r"(low32)
171         : [dst]"r"(dst),                    [block]"r"(block),
172           [stride]"r"((mips_reg)stride),    [ff_pw_32]"f"(ff_pw_32)
173         : "memory"
174     );
175
176     memset(block, 0, 32);
177 }
178
179 void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
180 {
181     double ftmp[16];
182     uint64_t tmp[8];
183     mips_reg addr[1];
184     uint64_t low32;
185
186     __asm__ volatile (
187         "lhu       %[tmp0],     0x00(%[block])                          \n\t"
188         PTR_ADDI  "$29,         $29,            -0x20                   \n\t"
189         PTR_ADDIU "%[tmp0],     %[tmp0],        0x20                    \n\t"
190         "ldc1      %[ftmp1],    0x10(%[block])                          \n\t"
191         "sh        %[tmp0],     0x00(%[block])                          \n\t"
192         "ldc1      %[ftmp2],    0x20(%[block])                          \n\t"
193         "dli       %[tmp0],     0x01                                    \n\t"
194         "ldc1      %[ftmp3],    0x30(%[block])                          \n\t"
195         "mtc1      %[tmp0],     %[ftmp8]                                \n\t"
196         "ldc1      %[ftmp5],    0x50(%[block])                          \n\t"
197         "ldc1      %[ftmp6],    0x60(%[block])                          \n\t"
198         "ldc1      %[ftmp7],    0x70(%[block])                          \n\t"
199         "mov.d     %[ftmp0],    %[ftmp1]                                \n\t"
200         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
201         "psrah     %[ftmp4],    %[ftmp5],       %[ftmp8]                \n\t"
202         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
203         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
204         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
205         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
206         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp3]                \n\t"
207         "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
208         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
209         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp3]                \n\t"
210         "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
211         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
212         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
213         "psrah     %[ftmp7],    %[ftmp7],       %[ftmp8]                \n\t"
214         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
215         "dli       %[tmp0],     0x02                                    \n\t"
216         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
217         "mtc1      %[tmp0],     %[ftmp9]                                \n\t"
218         "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
219         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
220         "psrah     %[ftmp3],    %[ftmp4],       %[ftmp9]                \n\t"
221         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
222         "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
223         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
224         "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
225         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
226         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
227         "mov.d     %[ftmp5],    %[ftmp6]                                \n\t"
228         "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
229         "psrah     %[ftmp4],    %[ftmp2],       %[ftmp8]                \n\t"
230         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
231         "psubh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
232         "ldc1      %[ftmp2],    0x00(%[block])                          \n\t"
233         "ldc1      %[ftmp5],    0x40(%[block])                          \n\t"
234         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
235         "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
236         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
237         "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
238         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
239         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
240         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
241         "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
242         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
243         "psubh     %[ftmp2],    %[ftmp2],       %[ftmp4]                \n\t"
244         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
245         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
246         "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
247         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp4]                \n\t"
248         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
249         "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
250         "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
251         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
252         "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
253         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
254         "sdc1      %[ftmp6],    0x00(%[block])                          \n\t"
255         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
256         "punpckhhw %[ftmp6],    %[ftmp7],       %[ftmp0]                \n\t"
257         "punpcklhw %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
258         "punpckhhw %[ftmp0],    %[ftmp3],       %[ftmp1]                \n\t"
259         "punpcklhw %[ftmp3],    %[ftmp3],       %[ftmp1]                \n\t"
260         "punpckhwd %[ftmp1],    %[ftmp7],       %[ftmp3]                \n\t"
261         "punpcklwd %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
262         "punpckhwd %[ftmp3],    %[ftmp6],       %[ftmp0]                \n\t"
263         "punpcklwd %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
264         "ldc1      %[ftmp0],    0x00(%[block])                          \n\t"
265         "sdc1      %[ftmp7],    0x00($29)                               \n\t"
266         "sdc1      %[ftmp1],    0x10($29)                               \n\t"
267         "dmfc1     %[tmp1],     %[ftmp6]                                \n\t"
268         "dmfc1     %[tmp3],     %[ftmp3]                                \n\t"
269         "punpckhhw %[ftmp3],    %[ftmp5],       %[ftmp2]                \n\t"
270         "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
271         "punpckhhw %[ftmp2],    %[ftmp4],       %[ftmp0]                \n\t"
272         "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
273         "punpckhwd %[ftmp0],    %[ftmp5],       %[ftmp4]                \n\t"
274         "punpcklwd %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
275         "punpckhwd %[ftmp4],    %[ftmp3],       %[ftmp2]                \n\t"
276         "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
277         "sdc1      %[ftmp5],    0x08($29)                               \n\t"
278         "sdc1      %[ftmp0],    0x18($29)                               \n\t"
279         "dmfc1     %[tmp2],     %[ftmp3]                                \n\t"
280         "dmfc1     %[tmp4],     %[ftmp4]                                \n\t"
281         "ldc1      %[ftmp1],    0x18(%[block])                          \n\t"
282         "ldc1      %[ftmp6],    0x28(%[block])                          \n\t"
283         "ldc1      %[ftmp2],    0x38(%[block])                          \n\t"
284         "ldc1      %[ftmp0],    0x58(%[block])                          \n\t"
285         "ldc1      %[ftmp3],    0x68(%[block])                          \n\t"
286         "ldc1      %[ftmp4],    0x78(%[block])                          \n\t"
287         "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
288         "psrah     %[ftmp5],    %[ftmp0],       %[ftmp8]                \n\t"
289         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
290         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
291         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
292         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
293         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
294         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
295         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp2]                \n\t"
296         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
297         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
298         "psrah     %[ftmp2],    %[ftmp2],       %[ftmp8]                \n\t"
299         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
300         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
301         "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
302         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
303         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
304         "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
305         "psrah     %[ftmp2],    %[ftmp5],       %[ftmp9]                \n\t"
306         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
307         "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
308         "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
309         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
310         "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
311         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
312         "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
313         "mov.d     %[ftmp0],    %[ftmp3]                                \n\t"
314         "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
315         "psrah     %[ftmp5],    %[ftmp6],       %[ftmp8]                \n\t"
316         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
317         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
318         "ldc1      %[ftmp6],    0x08(%[block])                          \n\t"
319         "ldc1      %[ftmp0],    0x48(%[block])                          \n\t"
320         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
321         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
322         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
323         "psubh     %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
324         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
325         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
326         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
327         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
328         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
329         "psubh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
330         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
331         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
332         "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
333         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
334         "paddh     %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
335         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
336         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
337         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
338         "psubh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
339         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
340         "sdc1      %[ftmp3],    0x08(%[block])                          \n\t"
341         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
342         "punpckhhw %[ftmp3],    %[ftmp4],       %[ftmp7]                \n\t"
343         "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
344         "punpckhhw %[ftmp7],    %[ftmp2],       %[ftmp1]                \n\t"
345         "punpcklhw %[ftmp2],    %[ftmp2],       %[ftmp1]                \n\t"
346         "punpckhwd %[ftmp1],    %[ftmp4],       %[ftmp2]                \n\t"
347         "punpcklwd %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
348         "punpckhwd %[ftmp2],    %[ftmp3],       %[ftmp7]                \n\t"
349         "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
350         "ldc1      %[ftmp7],    0x08(%[block])                          \n\t"
351         "dmfc1     %[tmp5],     %[ftmp4]                                \n\t"
352         "dmfc1     %[tmp7],     %[ftmp1]                                \n\t"
353         "mov.d     %[ftmp12],   %[ftmp3]                                \n\t"
354         "mov.d     %[ftmp14],   %[ftmp2]                                \n\t"
355         "punpckhhw %[ftmp2],    %[ftmp0],       %[ftmp6]                \n\t"
356         "punpcklhw %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
357         "punpckhhw %[ftmp6],    %[ftmp5],       %[ftmp7]                \n\t"
358         "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
359         "punpckhwd %[ftmp7],    %[ftmp0],       %[ftmp5]                \n\t"
360         "punpcklwd %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
361         "punpckhwd %[ftmp5],    %[ftmp2],       %[ftmp6]                \n\t"
362         "punpcklwd %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
363         "dmfc1     %[tmp6],     %[ftmp0]                                \n\t"
364         "mov.d     %[ftmp11],   %[ftmp7]                                \n\t"
365         "mov.d     %[ftmp13],   %[ftmp2]                                \n\t"
366         "mov.d     %[ftmp15],   %[ftmp5]                                \n\t"
367         PTR_ADDIU "%[addr0],    %[dst],         0x04                    \n\t"
368         "dmtc1     %[tmp7],     %[ftmp7]                                \n\t"
369         "dmtc1     %[tmp3],     %[ftmp6]                                \n\t"
370         "ldc1      %[ftmp1],    0x10($29)                               \n\t"
371         "dmtc1     %[tmp1],     %[ftmp3]                                \n\t"
372         "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
373         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
374         "psrah     %[ftmp0],    %[ftmp7],       %[ftmp8]                \n\t"
375         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp4]                \n\t"
376         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
377         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
378         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp14]               \n\t"
379         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
380         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
381         "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
382         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
383         "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
384         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp14]               \n\t"
385         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp14]               \n\t"
386         "psrah     %[ftmp5],    %[ftmp14],      %[ftmp8]                \n\t"
387         "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
388         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
389         "mov.d     %[ftmp5],    %[ftmp1]                                \n\t"
390         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
391         "psrah     %[ftmp6],    %[ftmp0],       %[ftmp9]                \n\t"
392         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
393         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp4]                \n\t"
394         "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
395         "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
396         "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
397         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
398         "mov.d     %[ftmp7],    %[ftmp12]                               \n\t"
399         "psrah     %[ftmp2],    %[ftmp12],      %[ftmp8]                \n\t"
400         "psrah     %[ftmp0],    %[ftmp3],       %[ftmp8]                \n\t"
401         "paddh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
402         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
403         "ldc1      %[ftmp3],    0x00($29)                               \n\t"
404         "dmtc1     %[tmp5],     %[ftmp7]                                \n\t"
405         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
406         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
407         "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
408         "psubh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
409         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
410         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
411         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
412         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
413         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
414         "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
415         "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
416         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
417         "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
418         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
419         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
420         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
421         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
422         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
423         "psubh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
424         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
425         "sdc1      %[ftmp3],    0x00($29)                               \n\t"
426         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
427         "sdc1      %[ftmp0],    0x10($29)                               \n\t"
428         "dmfc1     %[tmp1],     %[ftmp2]                                \n\t"
429         "xor       %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
430         "sdc1      %[ftmp2],    0x00(%[block])                          \n\t"
431         "sdc1      %[ftmp2],    0x08(%[block])                          \n\t"
432         "sdc1      %[ftmp2],    0x10(%[block])                          \n\t"
433         "sdc1      %[ftmp2],    0x18(%[block])                          \n\t"
434         "sdc1      %[ftmp2],    0x20(%[block])                          \n\t"
435         "sdc1      %[ftmp2],    0x28(%[block])                          \n\t"
436         "sdc1      %[ftmp2],    0x30(%[block])                          \n\t"
437         "sdc1      %[ftmp2],    0x38(%[block])                          \n\t"
438         "sdc1      %[ftmp2],    0x40(%[block])                          \n\t"
439         "sdc1      %[ftmp2],    0x48(%[block])                          \n\t"
440         "sdc1      %[ftmp2],    0x50(%[block])                          \n\t"
441         "sdc1      %[ftmp2],    0x58(%[block])                          \n\t"
442         "sdc1      %[ftmp2],    0x60(%[block])                          \n\t"
443         "sdc1      %[ftmp2],    0x68(%[block])                          \n\t"
444         "sdc1      %[ftmp2],    0x70(%[block])                          \n\t"
445         "sdc1      %[ftmp2],    0x78(%[block])                          \n\t"
446         "dli       %[tmp3],     0x06                                    \n\t"
447         "uld       %[low32],    0x00(%[dst])                            \n\t"
448         "mtc1      %[low32],    %[ftmp3]                                \n\t"
449         "mtc1      %[tmp3],     %[ftmp10]                               \n\t"
450         "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
451         "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
452         "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
453         "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
454         "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
455         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp5]                \n\t"
456         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
457         "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
458         "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
459         "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
460         "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
461         "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
462         PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
463         PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
464         "uld       %[low32],    0x00(%[dst])                            \n\t"
465         "mtc1      %[low32],    %[ftmp3]                                \n\t"
466         "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
467         "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
468         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
469         "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
470         "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
471         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
472         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
473         "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
474         "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
475         "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
476         "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
477         "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
478         "ldc1      %[ftmp5],    0x00($29)                               \n\t"
479         "ldc1      %[ftmp4],    0x10($29)                               \n\t"
480         "dmtc1     %[tmp1],     %[ftmp6]                                \n\t"
481         PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
482         PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
483         "uld       %[low32],    0x00(%[dst])                            \n\t"
484         "mtc1      %[low32],    %[ftmp3]                                \n\t"
485         "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
486         "psrah     %[ftmp7],    %[ftmp7],       %[ftmp10]               \n\t"
487         "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
488         "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
489         "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
490         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
491         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
492         "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
493         "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
494         "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
495         "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
496         "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
497         PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
498         PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
499         "uld       %[low32],    0x00(%[dst])                            \n\t"
500         "mtc1      %[low32],    %[ftmp3]                                \n\t"
501         "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
502         "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
503         "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
504         "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
505         "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
506         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
507         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
508         "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
509         "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
510         "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
511         "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
512         "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
513         "dmtc1     %[tmp4],     %[ftmp1]                                \n\t"
514         "dmtc1     %[tmp2],     %[ftmp6]                                \n\t"
515         "ldc1      %[ftmp4],    0x18($29)                               \n\t"
516         "mov.d     %[ftmp5],    %[ftmp4]                                \n\t"
517         "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
518         "psrah     %[ftmp7],    %[ftmp11],      %[ftmp8]                \n\t"
519         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp11]               \n\t"
520         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
521         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp15]               \n\t"
522         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp11]               \n\t"
523         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
524         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp1]                \n\t"
525         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
526         "psubh     %[ftmp3],    %[ftmp11],      %[ftmp1]                \n\t"
527         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
528         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp15]               \n\t"
529         "psubh     %[ftmp3],    %[ftmp3],       %[ftmp15]               \n\t"
530         "psrah     %[ftmp2],    %[ftmp15],      %[ftmp8]                \n\t"
531         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
532         "psubh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
533         "mov.d     %[ftmp2],    %[ftmp4]                                \n\t"
534         "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
535         "psrah     %[ftmp1],    %[ftmp7],       %[ftmp9]                \n\t"
536         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
537         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
538         "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
539         "psrah     %[ftmp3],    %[ftmp3],       %[ftmp9]                \n\t"
540         "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
541         "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
542         "mov.d     %[ftmp3],    %[ftmp13]                               \n\t"
543         "psrah     %[ftmp0],    %[ftmp13],      %[ftmp8]                \n\t"
544         "psrah     %[ftmp7],    %[ftmp6],       %[ftmp8]                \n\t"
545         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
546         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
547         "ldc1      %[ftmp6],    0x08($29)                               \n\t"
548         "dmtc1     %[tmp6],     %[ftmp3]                                \n\t"
549         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
550         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
551         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
552         "psubh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
553         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
554         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
555         "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
556         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
557         "paddh     %[ftmp2],    %[ftmp2],       %[ftmp0]                \n\t"
558         "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
559         "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
560         "paddh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
561         "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
562         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
563         "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
564         "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
565         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
566         "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
567         "psubh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
568         "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
569         "sdc1      %[ftmp6],    0x08($29)                               \n\t"
570         "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
571         "sdc1      %[ftmp7],    0x18($29)                               \n\t"
572         "dmfc1     %[tmp2],     %[ftmp0]                                \n\t"
573         "xor       %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
574         "uld       %[low32],    0x00(%[addr0])                          \n\t"
575         "mtc1      %[low32],    %[ftmp6]                                \n\t"
576         "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
577         "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
578         "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
579         "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
580         "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
581         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
582         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
583         "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
584         "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
585         "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
586         "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
587         "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
588         PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
589         PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
590         "uld       %[low32],    0x00(%[addr0])                          \n\t"
591         "mtc1      %[low32],    %[ftmp6]                                \n\t"
592         "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
593         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
594         "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
595         "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
596         "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
597         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
598         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
599         "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
600         "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
601         "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
602         "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
603         "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
604         "ldc1      %[ftmp2],    0x08($29)                               \n\t"
605         "ldc1      %[ftmp5],    0x18($29)                               \n\t"
606         PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
607         "dmtc1     %[tmp2],     %[ftmp1]                                \n\t"
608         PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
609         "uld       %[low32],    0x00(%[addr0])                          \n\t"
610         "mtc1      %[low32],    %[ftmp6]                                \n\t"
611         "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
612         "psrah     %[ftmp3],    %[ftmp3],       %[ftmp10]               \n\t"
613         "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
614         "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
615         "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
616         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
617         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
618         "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
619         "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
620         "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
621         "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
622         "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
623         PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
624         PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
625         "uld       %[low32],    0x00(%[addr0])                          \n\t"
626         "mtc1      %[low32],    %[ftmp6]                                \n\t"
627         "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
628         "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
629         "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
630         "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
631         "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
632         "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
633         "paddh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
634         "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
635         "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
636         "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
637         "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
638         "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
639         PTR_ADDIU "$29,         $29,            0x20                    \n\t"
640         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
641           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
642           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
643           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
644           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
645           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
646           [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
647           [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
648           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
649           [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
650           [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
651           [tmp6]"=&r"(tmp[6]),              [tmp7]"=&r"(tmp[7]),
652           [addr0]"=&r"(addr[0]),
653           [low32]"=&r"(low32)
654         : [dst]"r"(dst),                    [block]"r"(block),
655           [stride]"r"((mips_reg)stride)
656         : "$29","memory"
657     );
658
659     memset(block, 0, 128);
660 }
661
662 void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
663 {
664     int dc = (block[0] + 32) >> 6;
665     double ftmp[6];
666     uint64_t low32;
667
668     block[0] = 0;
669
670     __asm__ volatile (
671         "mtc1       %[dc],      %[ftmp5]                                \n\t"
672         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
673         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
674         "uld        %[low32],   0x00(%[dst0])                           \n\t"
675         "mtc1       %[low32],   %[ftmp1]                                \n\t"
676         "uld        %[low32],   0x00(%[dst1])                           \n\t"
677         "mtc1       %[low32],   %[ftmp2]                                \n\t"
678         "uld        %[low32],   0x00(%[dst2])                           \n\t"
679         "mtc1       %[low32],   %[ftmp3]                                \n\t"
680         "uld        %[low32],   0x00(%[dst3])                           \n\t"
681         "mtc1       %[low32],   %[ftmp4]                                \n\t"
682         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
683         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
684         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
685         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
686         "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
687         "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
688         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
689         "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
690         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
691         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
692         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
693         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
694         "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t"
695         "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
696         "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t"
697         "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
698         "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t"
699         "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
700         "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t"
701         "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
702         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
703           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
704           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
705           [low32]"=&r"(low32)
706         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
707           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
708           [dc]"r"(dc)
709         : "memory"
710     );
711 }
712
/**
 * Add the rounded DC term of a coefficient block to an 8x8 area of 8-bit
 * pixels.
 *
 * dc = (block[0] + 32) >> 6 is added to the eight pixels of each of the eight
 * rows dst + k * stride (k = 0..7); block[0] is cleared. The rows are handled
 * in two identical groups of four.
 *
 * NOTE(review): rows are accessed with ldc1/sdc1 (64-bit FPU load/store), so
 * each row address presumably has to be 8-byte aligned -- confirm against the
 * callers.
 *
 * @param dst    address of the first pixel of the first row
 * @param block  coefficient block; only block[0] is read, and it is zeroed
 * @param stride distance in bytes between consecutive rows
 */
713 void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
714 {
715     int dc = (block[0] + 32) >> 6;
716     double ftmp[10];
717
718     block[0] = 0;
719
720     __asm__ volatile (
            /* ftmp5 = dc in every halfword (pshufh with zero selector), ftmp0 = 0 */
721         "mtc1       %[dc],      %[ftmp5]                                \n\t"
722         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
723         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
            /* rows 0..3: load 8 pixels each */
724         "ldc1       %[ftmp1],   0x00(%[dst0])                           \n\t"
725         "ldc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
726         "ldc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
727         "ldc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
            /* widen bytes to halfwords: high half -> ftmp6..9, low half in place */
728         "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
729         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
730         "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
731         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
732         "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
733         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
734         "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
735         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
            /* add dc to all 16-bit pixel values */
736         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
737         "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
738         "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
739         "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
740         "paddsh     %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
741         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
742         "paddsh     %[ftmp9],   %[ftmp9],       %[ftmp5]                \n\t"
743         "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
            /* recombine low/high halves into 8 saturated unsigned bytes per row */
744         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
745         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
746         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
747         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
            /* write rows 0..3 back */
748         "sdc1       %[ftmp1],   0x00(%[dst0])                           \n\t"
749         "sdc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
750         "sdc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
751         "sdc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
752
            /* rows 4..7: same load / widen / add / pack / store sequence */
753         "ldc1       %[ftmp1],   0x00(%[dst4])                           \n\t"
754         "ldc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
755         "ldc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
756         "ldc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
757         "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
758         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
759         "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
760         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
761         "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
762         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
763         "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
764         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
765         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
766         "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
767         "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
768         "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
769         "paddsh     %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
770         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
771         "paddsh     %[ftmp9],   %[ftmp9],       %[ftmp5]                \n\t"
772         "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
773         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
774         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
775         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
776         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
777         "sdc1       %[ftmp1],   0x00(%[dst4])                           \n\t"
778         "sdc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
779         "sdc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
780         "sdc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
            /* outputs: ten scratch FP registers, all early-clobber */
781         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
782           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
783           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
784           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
785           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9])
            /* inputs: the eight row addresses and the DC value */
786         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
787           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
788           [dst4]"r"(dst+4*stride),          [dst5]"r"(dst+5*stride),
789           [dst6]"r"(dst+6*stride),          [dst7]"r"(dst+7*stride),
790           [dc]"r"(dc)
            /* pixel rows are read and written through the pointers above */
791         : "memory"
792     );
793 }
794
795 void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
796         int16_t *block, int stride, const uint8_t nnzc[15*8])
797 {
798     int i;
799     for(i=0; i<16; i++){
800         int nnz = nnzc[ scan8[i] ];
801         if(nnz){
802             if(nnz==1 && ((int16_t*)block)[i*16])
803                 ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
804                         stride);
805             else
806                 ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16,
807                         stride);
808         }
809     }
810 }
811
812 void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
813         int16_t *block, int stride, const uint8_t nnzc[15*8])
814 {
815     int i;
816     for(i=0; i<16; i++){
817         if(nnzc[ scan8[i] ])
818             ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16, stride);
819         else if(((int16_t*)block)[i*16])
820             ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
821                     stride);
822     }
823 }
824
825 void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
826         int16_t *block, int stride, const uint8_t nnzc[15*8])
827 {
828     int i;
829     for(i=0; i<16; i+=4){
830         int nnz = nnzc[ scan8[i] ];
831         if(nnz){
832             if(nnz==1 && ((int16_t*)block)[i*16])
833                 ff_h264_idct8_dc_add_8_mmi(dst + block_offset[i],
834                         block + i*16, stride);
835             else
836                 ff_h264_idct8_add_8_mmi(dst + block_offset[i], block + i*16,
837                         stride);
838         }
839     }
840 }
841
842 void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
843         int16_t *block, int stride, const uint8_t nnzc[15*8])
844 {
845     int i, j;
846     for(j=1; j<3; j++){
847         for(i=j*16; i<j*16+4; i++){
848             if(nnzc[ scan8[i] ])
849                 ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
850                         block + i*16, stride);
851             else if(((int16_t*)block)[i*16])
852                 ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
853                         block + i*16, stride);
854         }
855     }
856 }
857
858 void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
859         int16_t *block, int stride, const uint8_t nnzc[15*8])
860 {
861     int i, j;
862
863     for(j=1; j<3; j++){
864         for(i=j*16; i<j*16+4; i++){
865             if(nnzc[ scan8[i] ])
866                 ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
867                         block + i*16, stride);
868             else if(((int16_t*)block)[i*16])
869                 ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
870                         block + i*16, stride);
871         }
872     }
873
874     for(j=1; j<3; j++){
875         for(i=j*16+4; i<j*16+8; i++){
876             if(nnzc[ scan8[i+4] ])
877                 ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i+4],
878                         block + i*16, stride);
879             else if(((int16_t*)block)[i*16])
880                 ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i+4],
881                         block + i*16, stride);
882         }
883     }
884 }
885
/**
 * Loongson MMI implementation of the luma DC dequant + inverse transform.
 *
 * Loads 16 int16 coefficients with four 64-bit reads from @p input
 * (offsets 0x00..0x18), runs two paddh/psubh butterfly passes with a
 * punpck* reordering step in between, scales the results with @p qmul and
 * stores 16 halfwords to @p output at byte offsets 0x00, 0x20, ... 0x1e0.
 *
 * @param output destination; one halfword is written every 0x20 bytes
 * @param input  source coefficients (first 32 bytes are read)
 * @param qmul   multiplier; values above 0x7fff are handled by the separate
 *               code path at label 1
 */
886 void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
887         int qmul)
888 {
    /* Storage backing the floating-point and integer scratch operands. */
889     double ftmp[10];
890     uint64_t tmp[2];
891
892     __asm__ volatile (
893         ".set       noreorder                                           \n\t"
        /* ftmp8 = 8 and ftmp9 = 32 are shift counts used further down; their
         * setup is interleaved with the four coefficient loads. */
894         "dli        %[tmp0],    0x08                                    \n\t"
895         "ldc1       %[ftmp3],   0x18(%[input])                          \n\t"
896         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
897         "ldc1       %[ftmp2],   0x10(%[input])                          \n\t"
898         "dli        %[tmp0],    0x20                                    \n\t"
899         "ldc1       %[ftmp1],   0x08(%[input])                          \n\t"
900         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
901         "ldc1       %[ftmp0],   0x00(%[input])                          \n\t"
        /* First pass of add/subtract butterflies (ftmp4 is the spare copy). */
902         "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
903         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
904         "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
905         "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
906         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
907         "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
908         "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
909         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
910         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
911         "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
912         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
913         "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        /* Halfword and word interleaves that reorder the data between the two
         * passes (looks like a 4x4 transpose -- NOTE(review): confirm). */
914         "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
915         "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
916         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
917         "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
918         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
919         "punpckhwd  %[ftmp2],   %[ftmp3],       %[ftmp0]                \n\t"
920         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
921         "mov.d      %[ftmp0],   %[ftmp4]                                \n\t"
922         "punpcklwd  %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
923         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
        /* Second pass of add/subtract butterflies (ftmp1 is the spare copy). */
924         "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
925         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
926         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
927         "mov.d      %[ftmp1],   %[ftmp2]                                \n\t"
928         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
929         "psubh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
930         "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
931         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
932         "psubh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
933         "mov.d      %[ftmp1],   %[ftmp4]                                \n\t"
        /* tmp0 = qmul - 0x7fff; bgtz jumps to 1: when that is positive.
         * With .set noreorder in effect the psubh right after the branch is
         * in its delay slot, so it is executed on both paths. */
934         "daddi      %[tmp0],    %[qmul],        -0x7fff                 \n\t"
935         "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
936         "bgtz       %[tmp0],    1f                                      \n\t"
937         "psubh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
        /* Small-qmul path: qmul += 0x80 << 16, then every value is paired
         * with a halfword from ff_pw_1 and fed to pmaddhw, followed by an
         * arithmetic shift by ftmp8 (8). Presumably this evaluates
         * (x * qmul + 128) >> 8 -- ff_pw_1 is defined outside this block,
         * TODO confirm it holds halfword ones. */
938         "ori        %[tmp0],    $0,             0x80                    \n\t"
939         "dsll       %[tmp0],    %[tmp0],        0x10                    \n\t"
940         "punpckhhw  %[ftmp1],   %[ftmp0],       %[ff_pw_1]              \n\t"
941         "daddu      %[qmul],    %[qmul],        %[tmp0]                 \n\t"
942         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ff_pw_1]              \n\t"
943         "punpckhhw  %[ftmp5],   %[ftmp2],       %[ff_pw_1]              \n\t"
944         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
945         "mtc1       %[qmul],    %[ftmp7]                                \n\t"
946         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
947         "pmaddhw    %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
948         "pmaddhw    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
949         "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
950         "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
951         "psraw      %[ftmp0],   %[ftmp0],       %[ftmp8]                \n\t"
952         "psraw      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
953         "psraw      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
954         "psraw      %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
955         "packsswh   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
956         "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
        /* From here on %[input] is no longer used as a pointer; it is reused
         * as an integer scratch register while scattering halfwords to
         * output. */
957         "dmfc1      %[tmp1],    %[ftmp0]                                \n\t"
958         "dsrl       %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
959         "mfc1       %[input],   %[ftmp0]                                \n\t"
960         "sh         %[tmp1],    0x00(%[output])                         \n\t"
961         "sh         %[input],   0x80(%[output])                         \n\t"
962         "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
963         PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
964         "sh         %[tmp1],    0x20(%[output])                         \n\t"
965         "sh         %[input],   0xa0(%[output])                         \n\t"
966         "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
967         "dsrl       %[ftmp2],   %[ftmp2],       %[ftmp9]                \n\t"
968         "mfc1       %[input],   %[ftmp2]                                \n\t"
969         "sh         %[tmp1],    0x40(%[output])                         \n\t"
970         "sh         %[input],   0xc0(%[output])                         \n\t"
971         "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
972         PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
973         "sh         %[tmp1],    0x60(%[output])                         \n\t"
974         "sh         %[input],   0xe0(%[output])                         \n\t"
        /* Same scale/pack/store sequence for ftmp3 and ftmp4, written to the
         * offsets 0x100..0x1e0. */
975         "punpckhhw  %[ftmp1],   %[ftmp3],       %[ff_pw_1]              \n\t"
976         "punpcklhw  %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
977         "punpckhhw  %[ftmp5],   %[ftmp4],       %[ff_pw_1]              \n\t"
978         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
979         "mtc1       %[qmul],    %[ftmp7]                                \n\t"
980         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
981         "pmaddhw    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
982         "pmaddhw    %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
983         "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
984         "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
985         "psraw      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
986         "psraw      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
987         "psraw      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
988         "psraw      %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
989         "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
990         "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
991         "dmfc1      %[tmp1],    %[ftmp3]                                \n\t"
992         "dsrl       %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
993         "mfc1       %[input],   %[ftmp3]                                \n\t"
994         "sh         %[tmp1],    0x100(%[output])                        \n\t"
995         "sh         %[input],   0x180(%[output])                        \n\t"
996         "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
997         PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
998         "sh         %[tmp1],    0x120(%[output])                        \n\t"
999         "sh         %[input],   0x1a0(%[output])                        \n\t"
1000         "dmfc1      %[tmp1],    %[ftmp4]                                \n\t"
1001         "dsrl       %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
1002         "mfc1       %[input],   %[ftmp4]                                \n\t"
1003         "sh         %[tmp1],    0x140(%[output])                        \n\t"
1004         "sh         %[input],   0x1c0(%[output])                        \n\t"
1005         "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
1006         PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
1007         "sh         %[tmp1],    0x160(%[output])                        \n\t"
        /* Skip the large-qmul path; the final store sits in the jump's delay
         * slot. */
1008         "j          2f                                                  \n\t"
1009         "sh         %[input],   0x1e0(%[output])                        \n\t"
        /* Large-qmul path (qmul > 0x7fff): a shift amount is derived from
         * clz(qmul), qmul is shifted right by it (srlv) and the final psraw
         * uses the count placed in ftmp6 instead of the fixed ftmp8.
         * NOTE(review): presumably this keeps the multiplier within a signed
         * halfword for pmaddhw -- confirm. */
1010         "1:                                                             \n\t"
1011         "ori        %[tmp0],    $0,             0x1f                    \n\t"
1012         "clz        %[tmp1],    %[qmul]                                 \n\t"
1013         "ori        %[input],   $0,             0x07                    \n\t"
1014         "dsubu      %[tmp1],    %[tmp0],        %[tmp1]                 \n\t"
1015         "ori        %[tmp0],    $0,             0x80                    \n\t"
1016         "dsll       %[tmp0],    %[tmp0],        0x10                    \n\t"
1017         "daddu      %[qmul],    %[qmul],        %[tmp0]                 \n\t"
1018         "dsubu      %[tmp0],    %[tmp1],        %[input]                \n\t"
1019         "movn       %[tmp1],    %[input],       %[tmp0]                 \n\t"
1020         PTR_ADDIU  "%[input],   %[input],       0x01                    \n\t"
1021         "andi       %[tmp0],    %[tmp1],        0xff                    \n\t"
1022         "srlv       %[qmul],    %[qmul],        %[tmp0]                 \n\t"
1023         PTR_SUBU   "%[input],   %[input],       %[tmp1]                 \n\t"
1024         "mtc1       %[input],   %[ftmp6]                                \n\t"
1025         "punpckhhw  %[ftmp1],   %[ftmp0],       %[ff_pw_1]              \n\t"
1026         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ff_pw_1]              \n\t"
1027         "punpckhhw  %[ftmp5],   %[ftmp2],       %[ff_pw_1]              \n\t"
1028         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
1029         "mtc1       %[qmul],    %[ftmp7]                                \n\t"
1030         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
1031         "pmaddhw    %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
1032         "pmaddhw    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
1033         "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
1034         "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1035         "psraw      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
1036         "psraw      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
1037         "psraw      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
1038         "psraw      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1039         "packsswh   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
1040         "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
1041         "dmfc1      %[tmp1],    %[ftmp0]                                \n\t"
1042         "dsrl       %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
1043         "sh         %[tmp1],    0x00(%[output])                         \n\t"
1044         "mfc1       %[input],   %[ftmp0]                                \n\t"
1045         "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
1046         "sh         %[input],   0x80(%[output])                         \n\t"
1047         "sh         %[tmp1],    0x20(%[output])                         \n\t"
1048         PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
1049         "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
1050         "sh         %[input],   0xa0(%[output])                         \n\t"
1051         "dsrl       %[ftmp2],   %[ftmp2],       %[ftmp9]                \n\t"
1052         "sh         %[tmp1],    0x40(%[output])                         \n\t"
1053         "mfc1       %[input],   %[ftmp2]                                \n\t"
1054         "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
1055         "sh         %[input],   0xc0(%[output])                         \n\t"
1056         "sh         %[tmp1],    0x60(%[output])                         \n\t"
1057         PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
1058         "sh         %[input],   0xe0(%[output])                         \n\t"
1059         "punpckhhw  %[ftmp1],   %[ftmp3],       %[ff_pw_1]              \n\t"
1060         "punpcklhw  %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
1061         "punpckhhw  %[ftmp5],   %[ftmp4],       %[ff_pw_1]              \n\t"
1062         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
1063         "mtc1       %[qmul],    %[ftmp7]                                \n\t"
1064         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
1065         "pmaddhw    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
1066         "pmaddhw    %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
1067         "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
1068         "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1069         "psraw      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
1070         "psraw      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
1071         "psraw      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
1072         "psraw      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1073         "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
1074         "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1075         "dmfc1      %[tmp1],    %[ftmp3]                                \n\t"
1076         "dsrl       %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
1077         "mfc1       %[input],   %[ftmp3]                                \n\t"
1078         "sh         %[tmp1],    0x100(%[output])                        \n\t"
1079         "sh         %[input],   0x180(%[output])                        \n\t"
1080         "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
1081         PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
1082         "sh         %[tmp1],    0x120(%[output])                        \n\t"
1083         "sh         %[input],   0x1a0(%[output])                        \n\t"
1084         "dmfc1      %[tmp1],    %[ftmp4]                                \n\t"
1085         "dsrl       %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
1086         "mfc1       %[input],   %[ftmp4]                                \n\t"
1087         "sh         %[tmp1],    0x140(%[output])                        \n\t"
1088         "sh         %[input],   0x1c0(%[output])                        \n\t"
1089         "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
1090         PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
1091         "sh         %[tmp1],    0x160(%[output])                        \n\t"
1092         "sh         %[input],   0x1e0(%[output])                        \n\t"
1093         "2:                                                             \n\t"
1094         ".set       reorder                                             \n\t"
        /* input and qmul are read-write operands because the asm overwrites
         * both of them; every temporary is early-clobber. */
1095         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1096           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1097           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1098           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1099           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1100           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
1101           [output]"+&r"(output),            [input]"+&r"(input),
1102           [qmul]"+&r"(qmul)
1103         : [ff_pw_1]"f"(ff_pw_1)
1104         : "memory"
1105     );
1106 }
1107
/**
 * In-place DC transform and dequantisation of the eight chroma DC
 * coefficients used for 4:2:2 content.
 *
 * The coefficients live at block[0], block[16], ..., block[112]. Each
 * output is (t * qmul + 128) >> 8, where t is a signed sum/difference of all
 * eight inputs.
 *
 * All intermediate arithmetic is carried out on unsigned values so that a
 * large qmul cannot trigger signed-integer overflow (undefined behaviour);
 * the value is converted back to int only for the arithmetic right shift.
 *
 * @param block coefficient buffer, modified in place (indices 0..112 used)
 * @param qmul  dequantisation multiplier
 */
void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
{
    unsigned temp[8];
    unsigned t[8];
    int i;

    /* Sum and difference of each horizontal pair (k*32, k*32 + 16). */
    for (i = 0; i < 4; i++) {
        const int16_t *pair = block + 32 * i;

        temp[2 * i]     = pair[0] + (unsigned)pair[16];
        temp[2 * i + 1] = pair[0] - (unsigned)pair[16];
    }

    /* Four-point combination of the sums ... */
    t[0] = temp[0] + temp[4] + temp[2] + temp[6];
    t[1] = temp[0] - temp[4] + temp[2] - temp[6];
    t[2] = temp[0] - temp[4] - temp[2] + temp[6];
    t[3] = temp[0] + temp[4] - temp[2] - temp[6];
    /* ... and of the differences. */
    t[4] = temp[1] + temp[5] + temp[3] + temp[7];
    t[5] = temp[1] - temp[5] + temp[3] - temp[7];
    t[6] = temp[1] - temp[5] - temp[3] + temp[7];
    t[7] = temp[1] + temp[5] - temp[3] - temp[7];

    /* Scale with rounding; multiply/add wrap in unsigned, shift in int. */
    block[  0] = (int)(t[0] * qmul + 128) >> 8;
    block[ 32] = (int)(t[1] * qmul + 128) >> 8;
    block[ 64] = (int)(t[2] * qmul + 128) >> 8;
    block[ 96] = (int)(t[3] * qmul + 128) >> 8;
    block[ 16] = (int)(t[4] * qmul + 128) >> 8;
    block[ 48] = (int)(t[5] * qmul + 128) >> 8;
    block[ 80] = (int)(t[6] * qmul + 128) >> 8;
    block[112] = (int)(t[7] * qmul + 128) >> 8;
}
1140
/**
 * In-place 2x2 DC transform and dequantisation of the four chroma DC
 * coefficients stored at block[0], block[16], block[32] and block[48].
 *
 * Each output is (s * qmul) >> 7, where s is a signed sum/difference of the
 * four inputs. The sums and the multiplication are done on unsigned values
 * so a large qmul cannot cause signed-integer overflow (undefined
 * behaviour); the product is converted back to int for the arithmetic
 * right shift.
 *
 * @param block coefficient buffer, modified in place (indices 0..48 used)
 * @param qmul  dequantisation multiplier
 */
void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
{
    unsigned a, b, c, d;

    /* Read all four inputs before any of them is overwritten. */
    d = block[0]  - (unsigned)block[16];
    a = block[0]  + (unsigned)block[16];
    b = block[32] - (unsigned)block[48];
    c = block[32] + (unsigned)block[48];

    block[0]  = (int)((a + c) * qmul) >> 7;
    block[16] = (int)((d + b) * qmul) >> 7;
    block[32] = (int)((a - c) * qmul) >> 7;
    block[48] = (int)((d - b) * qmul) >> 7;
}
1154
/**
 * Weighted prediction on a 16-pixel-wide block of 8-bit samples, in place.
 *
 * For every row the two 8-byte halves are widened to halfwords, multiplied
 * by @p weight (pmullh), offset with signed saturation (paddsh), shifted
 * right arithmetically by @p log2_denom (psrah) and packed back to bytes
 * with unsigned saturation (packushb).
 *
 * @param block      first pixel of the block; rewritten in place
 * @param stride     distance in bytes between consecutive rows
 * @param height     number of rows to process
 * @param log2_denom right-shift applied after weighting
 * @param weight     multiplicative weight
 * @param offset     additive offset, scaled here by 1 << log2_denom and
 *                   combined with the rounding term
 */
void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride, int height,
        int log2_denom, int weight, int offset)
{
    int y;
    double ftmp[8];

    /* Shift on unsigned: a negative offset would make a signed left shift
     * undefined behaviour. The bit pattern handed to the asm is unchanged. */
    offset = (int)((unsigned)offset << log2_denom);

    /* Rounding term: half of the divisor, only when there is a divisor. */
    if (log2_denom)
        offset += 1 << (log2_denom - 1);

    for (y=0; y<height; y++, block+=stride) {
        __asm__ volatile (
            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
            "ldc1       %[ftmp1],   0x00(%[block0])                     \n\t"
            "ldc1       %[ftmp2],   0x00(%[block1])                     \n\t"
            "mtc1       %[weight],  %[ftmp3]                            \n\t"
            "mtc1       %[offset],  %[ftmp4]                            \n\t"
            "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
            /* Replicate weight and offset across the four halfword lanes. */
            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t"
            "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
            "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
            "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp4]            \n\t"
            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
            "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
            "sdc1       %[ftmp1],   0x00(%[block0])                     \n\t"
            "sdc1       %[ftmp2],   0x00(%[block1])                     \n\t"
            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7])
            : [block0]"r"(block),           [block1]"r"(block+8),
              [weight]"r"(weight),          [offset]"r"(offset),
              [log2_denom]"r"(log2_denom)
            : "memory"
        );
    }
}
1207
/**
 * Bidirectional weighted prediction on a 16-pixel-wide block of 8-bit
 * samples; the result replaces @p dst.
 *
 * Per row and per 8-byte half: src and dst bytes are widened to halfwords,
 * multiplied by @p weights and @p weightd respectively (pmullh), summed
 * together with the prepared offset using saturating adds (paddsh), shifted
 * right arithmetically by log2_denom + 1 (psrah) and packed to bytes with
 * unsigned saturation (packushb).
 *
 * @param dst        first prediction and destination, rewritten in place
 * @param src        second prediction, read only
 * @param stride     distance in bytes between rows (used for dst and src)
 * @param height     number of rows to process
 * @param log2_denom weighting denominator exponent; the asm shifts by
 *                   log2_denom + 1
 * @param weightd    weight applied to dst samples
 * @param weights    weight applied to src samples
 * @param offset     additive offset, converted here to
 *                   ((offset + 1) | 1) << log2_denom
 */
void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, int stride,
        int height, int log2_denom, int weightd, int weights, int offset)
{
    int y;
    double ftmp[9];

    /* Shift on unsigned: (offset + 1) | 1 is negative for negative offsets
     * and left-shifting a negative int is undefined behaviour. */
    offset = (int)((unsigned)((offset + 1) | 1) << log2_denom);

    for (y=0; y<height; y++, dst+=stride, src+=stride) {
        __asm__ volatile (
            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
            "ldc1       %[ftmp1],   0x00(%[src0])                       \n\t"
            "ldc1       %[ftmp2],   0x00(%[dst0])                       \n\t"
            "mtc1       %[weights], %[ftmp3]                            \n\t"
            "mtc1       %[weightd], %[ftmp4]                            \n\t"
            "mtc1       %[offset],  %[ftmp5]                            \n\t"
            "mtc1       %[log2_denom],              %[ftmp6]            \n\t"
            /* Replicate both weights and the offset across all lanes. */
            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
            /* First 8 pixels of the row. */
            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
            "sdc1       %[ftmp1],   0x00(%[dst0])                       \n\t"
            /* Second 8 pixels of the row; weights/offset are still live. */
            "ldc1       %[ftmp1],   0x00(%[src1])                       \n\t"
            "ldc1       %[ftmp2],   0x00(%[dst1])                       \n\t"
            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
            "sdc1       %[ftmp1],   0x00(%[dst1])                       \n\t"
            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
              [ftmp8]"=&f"(ftmp[8])
            : [dst0]"r"(dst),               [dst1]"r"(dst+8),
              [src0]"r"(src),               [src1]"r"(src+8),
              [weights]"r"(weights),        [weightd]"r"(weightd),
              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
            : "memory"
        );
    }
}
1275
/**
 * Weighted prediction on an 8-pixel-wide block of 8-bit samples, in place.
 *
 * Each row's 8 bytes are widened to halfwords, multiplied by @p weight
 * (pmullh), offset with signed saturation (paddsh), shifted right
 * arithmetically by @p log2_denom (psrah) and packed back to bytes with
 * unsigned saturation (packushb).
 *
 * @param block      first pixel of the block; rewritten in place
 * @param stride     distance in bytes between consecutive rows
 * @param height     number of rows to process
 * @param log2_denom right-shift applied after weighting
 * @param weight     multiplicative weight
 * @param offset     additive offset, scaled here by 1 << log2_denom and
 *                   combined with the rounding term
 */
void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
        int log2_denom, int weight, int offset)
{
    int y;
    double ftmp[6];

    /* Shift on unsigned: a negative offset would make a signed left shift
     * undefined behaviour. The bit pattern handed to the asm is unchanged. */
    offset = (int)((unsigned)offset << log2_denom);

    /* Rounding term: half of the divisor, only when there is a divisor. */
    if (log2_denom)
        offset += 1 << (log2_denom - 1);

    for (y=0; y<height; y++, block+=stride) {
        __asm__ volatile (
            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
            "ldc1       %[ftmp1],   0x00(%[block])                      \n\t"
            "mtc1       %[weight],  %[ftmp2]                            \n\t"
            "mtc1       %[offset],  %[ftmp3]                            \n\t"
            "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
            /* Replicate weight and offset across the four halfword lanes. */
            "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
            "pmullh     %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"
            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
            "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
            "psrah      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
            "sdc1       %[ftmp1],   0x00(%[block])                      \n\t"
            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5])
            : [block]"r"(block),            [weight]"r"(weight),
              [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
            : "memory"
        );
    }
}
1315
/**
 * Bidirectional weighted prediction on an 8-pixel-wide block of 8-bit
 * samples; the result replaces @p dst.
 *
 * Per row: src and dst bytes are widened to halfwords, multiplied by
 * @p weights and @p weightd respectively (pmullh), summed together with the
 * prepared offset using saturating adds (paddsh), shifted right
 * arithmetically by log2_denom + 1 (psrah) and packed to bytes with unsigned
 * saturation (packushb).
 *
 * @param dst        first prediction and destination, rewritten in place
 * @param src        second prediction, read only
 * @param stride     distance in bytes between rows (used for dst and src)
 * @param height     number of rows to process
 * @param log2_denom weighting denominator exponent; the asm shifts by
 *                   log2_denom + 1
 * @param weightd    weight applied to dst samples
 * @param weights    weight applied to src samples
 * @param offset     additive offset, converted here to
 *                   ((offset + 1) | 1) << log2_denom
 */
void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, int stride,
        int height, int log2_denom, int weightd, int weights, int offset)
{
    int y;
    double ftmp[9];

    /* Shift on unsigned: (offset + 1) | 1 is negative for negative offsets
     * and left-shifting a negative int is undefined behaviour. */
    offset = (int)((unsigned)((offset + 1) | 1) << log2_denom);

    for (y=0; y<height; y++, dst+=stride, src+=stride) {
        __asm__ volatile (
            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
            "ldc1       %[ftmp1],   0x00(%[src])                        \n\t"
            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
            "mtc1       %[weights], %[ftmp3]                            \n\t"
            "mtc1       %[weightd], %[ftmp4]                            \n\t"
            "mtc1       %[offset],  %[ftmp5]                            \n\t"
            "mtc1       %[log2_denom],              %[ftmp6]            \n\t"
            /* Replicate both weights and the offset across all lanes. */
            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
              [ftmp8]"=&f"(ftmp[8])
            : [dst]"r"(dst),                [src]"r"(src),
              [weights]"r"(weights),        [weightd]"r"(weightd),
              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
            : "memory"
        );
    }
}
1364
/**
 * Weighted prediction on a 4-pixel-wide block of 8-bit samples, in place.
 *
 * Each row's pixels are widened to halfwords, multiplied by @p weight
 * (pmullh), offset with signed saturation (paddsh), shifted right
 * arithmetically by @p log2_denom (psrah), packed back to bytes with
 * unsigned saturation (packushb) and stored with the gsswlc1/gsswrc1 pair.
 *
 * @param block      first pixel of the block; rewritten in place
 * @param stride     distance in bytes between consecutive rows
 * @param height     number of rows to process
 * @param log2_denom right-shift applied after weighting
 * @param weight     multiplicative weight
 * @param offset     additive offset, scaled here by 1 << log2_denom and
 *                   combined with the rounding term
 */
void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
        int log2_denom, int weight, int offset)
{
    int y;
    double ftmp[5];
    uint64_t low32;

    /* Shift on unsigned: a negative offset would make a signed left shift
     * undefined behaviour. The bit pattern handed to the asm is unchanged. */
    offset = (int)((unsigned)offset << log2_denom);

    /* Rounding term: half of the divisor, only when there is a divisor. */
    if (log2_denom)
        offset += 1 << (log2_denom - 1);

    for (y=0; y<height; y++, block+=stride) {
        __asm__ volatile (
            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
            /* NOTE(review): uld appears to fetch 8 bytes although only the
             * low word is moved to ftmp1 and 4 pixels are written back --
             * confirm the extra 4 bytes are always readable. */
            "uld        %[low32],   0x00(%[block])                      \n\t"
            "mtc1       %[low32],   %[ftmp1]                            \n\t"
            "mtc1       %[weight],  %[ftmp2]                            \n\t"
            "mtc1       %[offset],  %[ftmp3]                            \n\t"
            "mtc1       %[log2_denom],              %[ftmp4]            \n\t"
            /* Replicate weight and offset across the four halfword lanes. */
            "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
            "gsswlc1    %[ftmp1],   0x03(%[block])                      \n\t"
            "gsswrc1    %[ftmp1],   0x00(%[block])                      \n\t"
            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
              [ftmp4]"=&f"(ftmp[4]),
              [low32]"=&r"(low32)
            : [block]"r"(block),            [weight]"r"(weight),
              [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
            : "memory"
        );
    }
}
1404
1405 void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, int stride,
1406         int height, int log2_denom, int weightd, int weights, int offset)
1407 {
1408     int y;
1409     double ftmp[7];
1410     uint64_t low32;
1411
1412     offset = ((offset + 1) | 1) << log2_denom;
1413
1414     for (y=0; y<height; y++, dst+=stride, src+=stride) {
1415         __asm__ volatile (
1416             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1417             "uld        %[low32],   0x00(%[src])                        \n\t"
1418             "mtc1       %[low32],   %[ftmp1]                            \n\t"
1419             "uld        %[low32],   0x00(%[dst])                        \n\t"
1420             "mtc1       %[low32],   %[ftmp2]                            \n\t"
1421             "mtc1       %[weight],  %[ftmp3]                            \n\t"
1422             "mtc1       %[weightd], %[ftmp4]                            \n\t"
1423             "mtc1       %[offset],  %[ftmp5]                            \n\t"
1424             "mtc1       %[log2_denom],              %[ftmp6]            \n\t"
1425             "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1426             "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
1427             "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
1428             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1429             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1430             "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
1431             "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
1432             "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
1433             "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
1434             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
1435             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1436             "gsswlc1    %[ftmp1],   0x03(%[dst])                        \n\t"
1437             "gsswrc1    %[ftmp1],   0x00(%[dst])                        \n\t"
1438             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1439               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1440               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1441               [ftmp6]"=&f"(ftmp[6]),
1442               [low32]"=&r"(low32)
1443             : [dst]"r"(dst),                [src]"r"(src),
1444               [weight]"r"(weights),         [weightd]"r"(weightd),
1445               [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
1446             : "memory"
1447         );
1448     }
1449 }
1450
1451 void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
1452         int8_t *tc0)
1453 {
1454     double ftmp[12];
1455     mips_reg addr[2];
1456     uint64_t low32;
1457
1458     __asm__ volatile (
1459         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
1460         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1461         PTR_ADDU   "%[addr1],   %[stride],      %[addr0]                \n\t"
1462         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
1463         PTR_SUBU   "%[addr1],   $0,             %[addr1]                \n\t"
1464         "addi       %[beta],    %[beta],        -0x01                   \n\t"
1465         PTR_ADDU   "%[addr1],   %[addr1],       %[pix]                  \n\t"
1466         "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
1467         "gsldxc1    %[ftmp1],   0x00(%[addr1],  %[stride])              \n\t"
1468         "gsldxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t"
1469         "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
1470         "mtc1       %[alpha],   %[ftmp5]                                \n\t"
1471         "mtc1       %[beta],    %[ftmp6]                                \n\t"
1472         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1473         "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
1474         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
1475         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
1476         "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
1477         "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
1478         "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1479         "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
1480         "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1481         "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
1482         "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1483         "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
1484         "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1485         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1486         "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
1487         "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1488         "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1489         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1490         "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
1491         "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
1492         "uld        %[low32],   0x00(%[tc0])                            \n\t"
1493         "mtc1       %[low32],   %[ftmp5]                                \n\t"
1494         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
1495         "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp5]                \n\t"
1496         "pcmpgtb    %[ftmp5],   %[ftmp9],       %[ftmp4]                \n\t"
1497         "ldc1       %[ftmp4],   0x00(%[addr1])                          \n\t"
1498         "and        %[ftmp10],  %[ftmp5],       %[ftmp8]                \n\t"
1499         "psubusb    %[ftmp8],   %[ftmp4],       %[ftmp2]                \n\t"
1500         "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
1501         "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
1502         "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
1503         "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1504         "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
1505         "and        %[ftmp5],   %[ftmp10],      %[ftmp9]                \n\t"
1506         "psubb      %[ftmp8],   %[ftmp5],       %[ftmp7]                \n\t"
1507         "and        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
1508         "pavgb      %[ftmp5],   %[ftmp2],       %[ftmp3]                \n\t"
1509         "ldc1       %[ftmp11],  0x00(%[addr1])                          \n\t"
1510         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1511         "xor        %[ftmp5],   %[ftmp5],       %[ftmp11]               \n\t"
1512         "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
1513         "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1514         "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp7]                \n\t"
1515         "paddusb    %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
1516         "pmaxub     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1517         "pminub     %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
1518         "gssdxc1    %[ftmp4],   0x00(%[addr1],  %[stride])              \n\t"
1519         "gsldxc1    %[ftmp5],   0x00(%[pix],    %[addr0])               \n\t"
1520         "psubusb    %[ftmp4],   %[ftmp5],       %[ftmp3]                \n\t"
1521         "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp5]                \n\t"
1522         "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
1523         "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
1524         "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
1525         "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
1526         "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1527         "and        %[ftmp6],   %[ftmp9],       %[ftmp7]                \n\t"
1528         "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
1529         "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
1530         "gsldxc1    %[ftmp11],  0x00(%[pix],    %[addr0])               \n\t"
1531         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1532         "xor        %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
1533         "and        %[ftmp7],   %[ftmp7],       %[ff_pb_1]              \n\t"
1534         "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1535         "psubusb    %[ftmp7],   %[ftmp4],       %[ftmp6]                \n\t"
1536         "paddusb    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
1537         "pmaxub     %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1538         "pminub     %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1539         "gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t"
1540         "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
1541         "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
1542         "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
1543         "xor        %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1544         "xor        %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
1545         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
1546         "pavgb      %[ftmp4],   %[ftmp4],       %[ff_pb_3]              \n\t"
1547         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
1548         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
1549         "paddusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1550         "psubusb    %[ftmp7],   %[ff_pb_A1],    %[ftmp4]                \n\t"
1551         "psubusb    %[ftmp4],   %[ftmp4],       %[ff_pb_A1]             \n\t"
1552         "pminub     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1553         "pminub     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
1554         "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
1555         "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
1556         "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
1557         "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
1558         "gssdxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t"
1559         "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
1560         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1561           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1562           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1563           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1564           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1565           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
1566           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1567           [low32]"=&r"(low32)
1568         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
1569           [alpha]"r"((mips_reg)alpha),      [beta]"r"((mips_reg)beta),
1570           [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
1571           [ff_pb_3]"f"(ff_pb_3),            [ff_pb_A1]"f"(ff_pb_A1)
1572         : "memory"
1573     );
1574 }
1575
1576 static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
1577         int beta)
1578 {
1579     DECLARE_ALIGNED(8, const uint64_t, stack[0x0a]);
1580     double ftmp[16];
1581     uint64_t tmp[1];
1582     mips_reg addr[3];
1583
1584 __asm__ volatile (
1585 "ori        %[tmp0],    $0,             0x01                    \n\t"
1586 "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1587 "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
1588 PTR_SLL    "%[addr0],   %[stride],      0x02                    \n\t"
1589 PTR_ADDU   "%[addr2],   %[stride],      %[stride]               \n\t"
1590 PTR_ADDIU  "%[alpha],   %[alpha],       -0x01                   \n\t"
1591 PTR_SLL    "%[ftmp11],  %[ftmp9],       %[ftmp9]                \n\t"
1592 "bltz       %[alpha],   1f                                      \n\t"
1593 PTR_ADDU   "%[addr1],   %[addr2],       %[stride]               \n\t"
1594 PTR_ADDIU  "%[beta],    %[beta],        -0x01                   \n\t"
1595 "bltz       %[beta],    1f                                      \n\t"
1596 PTR_SUBU   "%[addr0],   $0,             %[addr0]                \n\t"
1597 PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t"
1598 "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
1599 "gsldxc1    %[ftmp1],   0x00(%[addr0],  %[addr2])               \n\t"
1600 "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[addr1])               \n\t"
1601 "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
1602 "mtc1       %[alpha],   %[ftmp5]                                \n\t"
1603 "mtc1       %[beta],    %[ftmp6]                                \n\t"
1604 "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1605 "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
1606 "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
1607 "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
1608 "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
1609 "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
1610 "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1611 "sdc1       %[ftmp5],   0x10+%[stack]                           \n\t"
1612 "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1613 "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
1614 "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
1615 "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1616 "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1617 "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1618 "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
1619 "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
1620 "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1621 "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1622 "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1623 "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
1624 "ldc1       %[ftmp5],   0x10+%[stack]                           \n\t"
1625 "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1626 "ldc1       %[ftmp10],  %[ff_pb_1]                              \n\t"
1627 "sdc1       %[ftmp8],   0x20+%[stack]                           \n\t"
1628 "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1629 "psubusb    %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
1630 "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
1631 "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
1632 "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1633 "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
1634 "ldc1       %[ftmp15],  0x20+%[stack]                           \n\t"
1635 "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1636 "and        %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
1637 "gsldxc1    %[ftmp15],  0x00(%[addr0],  %[stride])              \n\t"
1638 "psubusb    %[ftmp8],   %[ftmp15],      %[ftmp2]                \n\t"
1639 "psubusb    %[ftmp5],   %[ftmp2],       %[ftmp15]               \n\t"
1640 "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
1641 "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1642 "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
1643 "and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1644 "gsldxc1    %[ftmp14],  0x00(%[pix],    %[addr2])               \n\t"
1645 "sdc1       %[ftmp5],   0x30+%[stack]                           \n\t"
1646 "psubusb    %[ftmp8],   %[ftmp14],      %[ftmp3]                \n\t"
1647 "psubusb    %[ftmp5],   %[ftmp3],       %[ftmp14]               \n\t"
1648 "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
1649 "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1650 "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
1651 "and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1652 "sdc1       %[ftmp5],   0x40+%[stack]                           \n\t"
1653 "pavgb      %[ftmp5],   %[ftmp15],      %[ftmp1]                \n\t"
1654 "pavgb      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
1655 "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1656 "sdc1       %[ftmp6],   0x10+%[stack]                           \n\t"
1657 "paddb      %[ftmp7],   %[ftmp15],      %[ftmp1]                \n\t"
1658 "paddb      %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
1659 "paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1660 "mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
1661 "sdc1       %[ftmp7],   0x00+%[stack]                           \n\t"
1662 "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
1663 "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
1664 "xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
1665 "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
1666 "psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1667 "pavgb      %[ftmp6],   %[ftmp15],      %[ftmp4]                \n\t"
1668 "psubb      %[ftmp7],   %[ftmp15],      %[ftmp4]                \n\t"
1669 "paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
1670 "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1671 "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
1672 "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
1673 "ldc1       %[ftmp13],  0x10+%[stack]                           \n\t"
1674 "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
1675 "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
1676 "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
1677 "pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
1678 "xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
1679 "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
1680 "psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
1681 "xor        %[ftmp8],   %[ftmp2],       %[ftmp4]                \n\t"
1682 "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
1683 "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
1684 "psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1685 "ldc1       %[ftmp13],  0x30+%[stack]                           \n\t"
1686 "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
1687 "ldc1       %[ftmp12],  0x20+%[stack]                           \n\t"
1688 "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
1689 "xor        %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
1690 "and        %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
1691 "and        %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
1692 "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
1693 "xor        %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
1694 "gssdxc1    %[ftmp6],   0x00(%[addr0],  %[addr1])               \n\t"
1695 "ldc1       %[ftmp6],   0x00(%[addr0])                          \n\t"
1696 "paddb      %[ftmp7],   %[ftmp15],      %[ftmp6]                \n\t"
1697 "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
1698 "ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
1699 "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
1700 "paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
1701 "paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
1702 "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
1703 "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
1704 "xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
1705 "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
1706 "ldc1       %[ftmp12],  0x30+%[stack]                           \n\t"
1707 "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
1708 "xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
1709 "xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
1710 "and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
1711 "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
1712 "xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
1713 "xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
1714 "gssdxc1    %[ftmp5],   0x00(%[addr0],  %[addr2])               \n\t"
1715 "gssdxc1    %[ftmp6],   0x00(%[addr0],  %[stride])              \n\t"
1716 "pavgb      %[ftmp5],   %[ftmp14],      %[ftmp4]                \n\t"
1717 "pavgb      %[ftmp6],   %[ftmp3],       %[ftmp2]                \n\t"
1718 "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1719 "sdc1       %[ftmp6],   0x10+%[stack]                           \n\t"
1720 "paddb      %[ftmp7],   %[ftmp14],      %[ftmp4]                \n\t"
1721 "paddb      %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
1722 "paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1723 "mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
1724 "sdc1       %[ftmp7],   0x00+%[stack]                           \n\t"
1725 "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
1726 "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
1727 "xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
1728 "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
1729 "psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1730 "pavgb      %[ftmp6],   %[ftmp14],      %[ftmp1]                \n\t"
1731 "paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
1732 "psubb      %[ftmp7],   %[ftmp14],      %[ftmp1]                \n\t"
1733 "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1734 "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
1735 "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
1736 "ldc1       %[ftmp12],  0x10+%[stack]                           \n\t"
1737 "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
1738 "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
1739 "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
1740 "pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
1741 "xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
1742 "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
1743 "psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
1744 "xor        %[ftmp8],   %[ftmp3],       %[ftmp1]                \n\t"
1745 "pavgb      %[ftmp7],   %[ftmp3],       %[ftmp1]                \n\t"
1746 "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
1747 "ldc1       %[ftmp12],  0x40+%[stack]                           \n\t"
1748 "psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1749 "ldc1       %[ftmp13],  0x20+%[stack]                           \n\t"
1750 "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
1751 "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
1752 "xor        %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
1753 "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
1754 "and        %[ftmp7],   %[ftmp7],       %[ftmp13]               \n\t"
1755 "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
1756 "xor        %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
1757 "sdc1       %[ftmp6],   0x00(%[pix])                            \n\t"
1758 "gsldxc1    %[ftmp6],   0x00(%[pix],    %[addr1])               \n\t"
1759 "paddb      %[ftmp7],   %[ftmp14],      %[ftmp6]                \n\t"
1760 "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
1761 "ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
1762 "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
1763 "paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
1764 "paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
1765 "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
1766 "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
1767 "xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
1768 "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
1769 "ldc1       %[ftmp12],  0x40+%[stack]                           \n\t"
1770 "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
1771 "xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
1772 "xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
1773 "and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
1774 "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
1775 "xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
1776 "xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
1777 "gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t"
1778 "gssdxc1    %[ftmp6],   0x00(%[pix],    %[addr2])               \n\t"
1779 "1:                                                             \n\t"
1780         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1781           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1782           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1783           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1784           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1785           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
1786           [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
1787           [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
1788   [tmp0]"=&r"(tmp[0]),
1789   [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1790   [addr2]"=&r"(addr[2]),
1791   [alpha]"+&r"(alpha),              [beta]"+&r"(beta)
1792         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
1793   [stack]"m"(stack[0]),             [ff_pb_1]"m"(ff_pb_1)
1794 : "memory"
1795 );
1796 }
1797
1798 void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
1799         int8_t *tc0)
1800 {
1801     double ftmp[9];
1802     mips_reg addr[1];
1803     uint64_t low32;
1804
1805     __asm__ volatile (
1806         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
1807         "addi       %[beta],    %[beta],        -0x01                   \n\t"
1808         "or         %[addr0],   $0,             %[pix]                  \n\t"
1809         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
1810         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
1811         "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
1812         "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
1813         "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
1814         "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
1815
1816         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1817         "mtc1       %[alpha],   %[ftmp5]                                \n\t"
1818         "mtc1       %[beta],    %[ftmp6]                                \n\t"
1819         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1820         "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
1821         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
1822         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
1823         "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
1824         "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
1825         "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1826         "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1827         "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
1828         "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
1829         "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1830         "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1831         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1832         "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
1833         "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
1834         "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1835         "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1836         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1837         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
1838         "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1839         "uld        %[low32],   0x00(%[tc0])                            \n\t"
1840         "mtc1       %[low32],   %[ftmp7]                                \n\t"
1841         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
1842         "and        %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1843         "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
1844         "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
1845         "xor        %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1846         "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
1847         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
1848         "xor        %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
1849         "pavgb      %[ftmp4],   %[ftmp4],       %[ff_pb_3]              \n\t"
1850         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
1851         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
1852         "paddusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1853         "psubusb    %[ftmp7],   %[ff_pb_A1],    %[ftmp4]                \n\t"
1854         "psubusb    %[ftmp4],   %[ftmp4],       %[ff_pb_A1]             \n\t"
1855         "pminub     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1856         "pminub     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
1857         "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
1858         "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
1859         "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
1860         "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
1861
1862         "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
1863         "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
1864         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1865           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1866           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1867           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1868           [ftmp8]"=&f"(ftmp[8]),
1869           [addr0]"=&r"(addr[0]),
1870           [low32]"=&r"(low32)
1871         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
1872           [alpha]"r"(alpha),                [beta]"r"(beta),
1873           [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
1874           [ff_pb_3]"f"(ff_pb_3),            [ff_pb_A1]"f"(ff_pb_A1)
1875         : "memory"
1876     );
1877 }
1878
1879 void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
1880         int beta)
1881 {
1882     double ftmp[9];
1883     mips_reg addr[1];
1884
1885     __asm__ volatile (
1886         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
1887         "addi       %[beta],    %[beta],        -0x01                   \n\t"
1888         "or         %[addr0],   $0,             %[pix]                  \n\t"
1889         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
1890         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
1891         "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
1892         "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
1893         "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
1894         "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
1895
1896         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1897         "mtc1       %[alpha],   %[ftmp5]                                \n\t"
1898         "mtc1       %[beta],    %[ftmp6]                                \n\t"
1899         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1900         "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
1901         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
1902         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
1903         "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
1904         "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
1905         "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1906         "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1907         "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
1908         "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
1909         "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1910         "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1911         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1912         "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
1913         "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
1914         "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1915         "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1916         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
1917         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
1918         "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
1919         "mov.d      %[ftmp6],   %[ftmp2]                                \n\t"
1920         "mov.d      %[ftmp7],   %[ftmp3]                                \n\t"
1921         "xor        %[ftmp5],   %[ftmp2],       %[ftmp4]                \n\t"
1922         "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
1923         "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
1924         "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
1925         "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
1926         "xor        %[ftmp5],   %[ftmp3],       %[ftmp1]                \n\t"
1927         "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
1928         "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
1929         "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
1930         "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
1931         "psubb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
1932         "psubb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
1933         "and        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
1934         "and        %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
1935         "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
1936         "paddb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
1937
1938         "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
1939         "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
1940         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1941           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1942           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1943           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1944           [ftmp8]"=&f"(ftmp[8]),
1945           [addr0]"=&r"(addr[0])
1946         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
1947           [alpha]"r"(alpha),                [beta]"r"(beta),
1948           [ff_pb_1]"f"(ff_pb_1)
1949         : "memory"
1950     );
1951 }
1952
1953 void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
1954         int8_t *tc0)
1955 {
1956     double ftmp[11];
1957     mips_reg addr[6];
1958     uint64_t low32;
1959
1960     __asm__ volatile (
1961         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
1962         "addi       %[beta],    %[beta],        -0x01                   \n\t"
1963         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
1964         PTR_ADDI   "%[pix],     %[pix],         -0x02                   \n\t"
1965         PTR_ADDU   "%[addr1],   %[addr0],       %[stride]               \n\t"
1966         PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
1967         "or         %[addr5],   $0,             %[pix]                  \n\t"
1968         PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
1969         "uld        %[low32],   0x00(%[addr5])                          \n\t"
1970         "mtc1       %[low32],   %[ftmp0]                                \n\t"
1971         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
1972         "uld        %[low32],   0x00(%[addr3])                          \n\t"
1973         "mtc1       %[low32],   %[ftmp2]                                \n\t"
1974         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
1975         "uld        %[low32],   0x00(%[addr4])                          \n\t"
1976         "mtc1       %[low32],   %[ftmp1]                                \n\t"
1977         "uld        %[low32],   0x00(%[pix])                            \n\t"
1978         "mtc1       %[low32],   %[ftmp3]                                \n\t"
1979         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
1980         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
1981         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
1982         "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
1983         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
1984         "uld        %[low32],   0x00(%[addr3])                          \n\t"
1985         "mtc1       %[low32],   %[ftmp4]                                \n\t"
1986         PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
1987         "uld        %[low32],   0x00(%[addr4])                          \n\t"
1988         "mtc1       %[low32],   %[ftmp6]                                \n\t"
1989         PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
1990         "uld        %[low32],   0x00(%[addr3])                          \n\t"
1991         "mtc1       %[low32],   %[ftmp5]                                \n\t"
1992         PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
1993         "uld        %[low32],   0x00(%[addr4])                          \n\t"
1994         "mtc1       %[low32],   %[ftmp7]                                \n\t"
1995         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
1996         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1997         "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
1998         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1999         "punpckhhw  %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
2000         "punpckhwd  %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
2001         "punpckhwd  %[ftmp3],   %[ftmp2],       %[ftmp6]                \n\t"
2002         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
2003         "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
2004         "mov.d      %[ftmp9],   %[ftmp0]                                \n\t"
2005         "mov.d      %[ftmp10],  %[ftmp3]                                \n\t"
2006
2007         "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
2008         "mtc1       %[alpha],   %[ftmp4]                                \n\t"
2009         "mtc1       %[beta],    %[ftmp5]                                \n\t"
2010         "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
2011         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
2012         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
2013         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
2014         "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp1]                \n\t"
2015         "psubusb    %[ftmp7],   %[ftmp1],       %[ftmp2]                \n\t"
2016         "or         %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
2017         "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
2018         "psubusb    %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
2019         "psubusb    %[ftmp4],   %[ftmp0],       %[ftmp1]                \n\t"
2020         "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2021         "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2022         "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
2023         "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
2024         "psubusb    %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
2025         "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2026         "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2027         "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
2028         "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
2029         "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
2030         "uld        %[low32],   0x00(%[tc0])                            \n\t"
2031         "mtc1       %[low32],   %[ftmp6]                                \n\t"
2032         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
2033         "and        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
2034         "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
2035         "xor        %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
2036         "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
2037         "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
2038         "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
2039         "xor        %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
2040         "pavgb      %[ftmp3],   %[ftmp3],       %[ff_pb_3]              \n\t"
2041         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
2042         "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
2043         "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
2044         "psubusb    %[ftmp6],   %[ff_pb_A1],    %[ftmp3]                \n\t"
2045         "psubusb    %[ftmp3],   %[ftmp3],       %[ff_pb_A1]             \n\t"
2046         "pminub     %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
2047         "pminub     %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
2048         "psubusb    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
2049         "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2050         "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
2051         "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
2052
2053         "punpckhwd  %[ftmp4],   %[ftmp9],       %[ftmp9]                \n\t"
2054         "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
2055         "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
2056         "punpcklbh  %[ftmp0],   %[ftmp9],       %[ftmp1]                \n\t"
2057         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
2058         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
2059         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
2060         "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t"
2061         "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
2062         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
2063         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
2064         "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
2065         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
2066         "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
2067         "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
2068         "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
2069         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
2070         "punpckhwd  %[ftmp3],   %[ftmp10],      %[ftmp10]               \n\t"
2071         "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t"
2072         "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
2073         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2074         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
2075         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
2076         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
2077         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2078         "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
2079         "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
2080         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
2081         PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
2082         PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
2083         "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
2084         "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
2085         "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t"
2086         PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
2087         "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
2088         "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
2089         "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t"
2090         "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
2091         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2092           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2093           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2094           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2095           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
2096           [ftmp10]"=&f"(ftmp[10]),
2097           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
2098           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
2099           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
2100           [pix]"+&r"(pix),
2101           [low32]"=&r"(low32)
2102         : [alpha]"r"(alpha),                [beta]"r"(beta),
2103           [stride]"r"((mips_reg)stride),    [tc0]"r"(tc0),
2104           [ff_pb_1]"f"(ff_pb_1),            [ff_pb_3]"f"(ff_pb_3),
2105           [ff_pb_A1]"f"(ff_pb_A1)
2106         : "memory"
2107     );
2108 }
2109
2110 void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
2111         int beta)
2112 {
2113     double ftmp[11];
2114     mips_reg addr[6];
2115     uint64_t low32;
2116
2117     __asm__ volatile (
2118         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
2119         "addi       %[beta],    %[beta],        -0x01                   \n\t"
2120         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
2121         PTR_ADDI   "%[pix],     %[pix],         -0x02                   \n\t"
2122         PTR_ADDU   "%[addr1],   %[addr0],       %[stride]               \n\t"
2123         PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
2124         "or         %[addr5],   $0,             %[pix]                  \n\t"
2125         PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
2126         "uld        %[low32],   0x00(%[addr5])                          \n\t"
2127         "mtc1       %[low32],   %[ftmp0]                                \n\t"
2128         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
2129         "uld        %[low32],   0x00(%[addr3])                          \n\t"
2130         "mtc1       %[low32],   %[ftmp2]                                \n\t"
2131         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
2132         "uld        %[low32],   0x00(%[addr4])                          \n\t"
2133         "mtc1       %[low32],   %[ftmp1]                                \n\t"
2134         "uld        %[low32],   0x00(%[pix])                            \n\t"
2135         "mtc1       %[low32],   %[ftmp3]                                \n\t"
2136         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
2137         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
2138         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
2139         "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
2140         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2141         "uld        %[low32],   0x00(%[addr3])                          \n\t"
2142         "mtc1       %[low32],   %[ftmp4]                                \n\t"
2143         PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
2144         "uld        %[low32],   0x00(%[addr4])                          \n\t"
2145         "mtc1       %[low32],   %[ftmp6]                                \n\t"
2146         PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
2147         "uld        %[low32],   0x00(%[addr3])                          \n\t"
2148         "mtc1       %[low32],   %[ftmp5]                                \n\t"
2149         PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
2150         "uld        %[low32],   0x00(%[addr4])                          \n\t"
2151         "mtc1       %[low32],   %[ftmp7]                                \n\t"
2152         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2153         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
2154         "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
2155         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2156         "punpckhhw  %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
2157         "punpckhwd  %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
2158         "punpckhwd  %[ftmp3],   %[ftmp2],       %[ftmp6]                \n\t"
2159         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
2160         "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
2161
2162         "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
2163         "mtc1       %[alpha],   %[ftmp4]                                \n\t"
2164         "mtc1       %[beta],    %[ftmp5]                                \n\t"
2165         "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
2166         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
2167         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
2168         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
2169         "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp1]                \n\t"
2170         "psubusb    %[ftmp7],   %[ftmp1],       %[ftmp2]                \n\t"
2171         "or         %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
2172         "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
2173         "psubusb    %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
2174         "psubusb    %[ftmp4],   %[ftmp0],       %[ftmp1]                \n\t"
2175         "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2176         "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2177         "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
2178         "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
2179         "psubusb    %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
2180         "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2181         "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2182         "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
2183         "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
2184         "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
2185         "mov.d      %[ftmp5],   %[ftmp1]                                \n\t"
2186         "mov.d      %[ftmp6],   %[ftmp2]                                \n\t"
2187         "xor        %[ftmp4],   %[ftmp1],       %[ftmp3]                \n\t"
2188         "and        %[ftmp4],   %[ftmp4],       %[ff_pb_1]              \n\t"
2189         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
2190         "psubusb    %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
2191         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
2192         "xor        %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
2193         "and        %[ftmp4],   %[ftmp4],       %[ff_pb_1]              \n\t"
2194         "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
2195         "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
2196         "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2197         "psubb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
2198         "psubb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
2199         "and        %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
2200         "and        %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
2201         "paddb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
2202         "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
2203
2204         "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
2205         "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
2206         "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
2207         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2208         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2209         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
2210         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
2211         "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t"
2212         "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
2213         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
2214         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
2215         "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
2216         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
2217         "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
2218         "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
2219         "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
2220         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
2221         "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
2222         "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t"
2223         "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
2224         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2225         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
2226         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
2227         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
2228         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2229         "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
2230         "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
2231         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
2232         PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
2233         PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
2234         "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
2235         "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
2236         "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t"
2237         PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
2238         "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
2239         "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
2240         "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t"
2241         "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
2242         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2243           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2244           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2245           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2246           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
2247           [ftmp10]"=&f"(ftmp[10]),
2248           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
2249           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
2250           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
2251           [pix]"+&r"(pix),
2252           [low32]"=&r"(low32)
2253         : [alpha]"r"(alpha),                [beta]"r"(beta),
2254           [stride]"r"((mips_reg)stride),    [ff_pb_1]"f"(ff_pb_1)
2255         : "memory"
2256     );
2257 }
2258
/*
 * Normal (tc0-driven) luma deblocking, split into two calls to
 * ff_deblock_v8_luma_8_mmi() whose pix arguments are 8 bytes apart.
 *
 * Each call is given its own pair of tc0 entries and is skipped only when
 * both entries of that pair are negative.
 */
void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
        int8_t *tc0)
{
    int half;

    for (half = 0; half < 2; half++) {
        int8_t *tc = tc0 + 2 * half;

        /* At least one non-negative entry means this half needs filtering. */
        if (tc[0] >= 0 || tc[1] >= 0)
            ff_deblock_v8_luma_8_mmi(pix + 8 * half, stride, alpha, beta, tc);
    }
}
2267
/*
 * Intra luma deblocking, split into two calls to
 * deblock_v8_luma_intra_8_mmi(): one at pix and one at pix + 8, both with
 * the same stride, alpha and beta.
 */
void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
        int beta)
{
    int offset;

    for (offset = 0; offset < 16; offset += 8)
        deblock_v8_luma_intra_8_mmi(pix + offset, stride, alpha, beta);
}
2274
/*
 * Luma deblocking wrapper that works by transposition:
 *
 *   1. The first asm block reads an 8-byte window starting at pix - 4 from
 *      each of 16 consecutive rows (row pitch = stride), transposes it with
 *      byte/halfword/word interleaves and stores six of the resulting
 *      columns into the local buffer `stack`, one column per 16-byte row.
 *   2. ff_deblock_v_luma_8_mmi() is run on that buffer, starting at
 *      &stack[6] (byte offset 0x30) with a stride of 0x10.
 *   3. The second asm block reads the buffer rows at 0x10..0x4f, transposes
 *      them back and writes 4 bytes per row to pix - 2 for the same 16 rows.
 *
 * alpha, beta and tc0 are not interpreted here; they are forwarded unchanged
 * to ff_deblock_v_luma_8_mmi().
 *
 * NOTE(review): the column numbering in the comments below assumes the
 * little-endian lane order of the punpck* instructions - confirm against the
 * Loongson MMI manual.
 */
2275 void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
2276         int8_t *tc0)
2277 {
         /* Transposed copy of the pixels around the edge: 16-byte rows, the
          * asm below writes byte offsets 0x00..0x5f. */
2278     uint64_t stack[0xd];
         /* Backing storage for the FP/MMI and address scratch operands. */
2279     double ftmp[9];
2280     mips_reg addr[8];
2281
2282     __asm__ volatile (
             /* addr0 = 2 * stride, addr1 = pix - 4, addr2 = 3 * stride */
2283         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
2284         PTR_ADDI   "%[addr1],   %[pix],         -0x4                    \n\t"
2285         PTR_ADDU   "%[addr2],   %[stride],      %[addr0]                \n\t"
             /* Rows 0..6: each gsldlc1/gsldrc1 pair is one 8-byte load that
              * does not require alignment; ftmpN receives row N.
              * addr4 = address of row 3 and is reused as base for rows 4..7. */
2286         "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
2287         "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
2288         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
2289         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
2290         "gsldlc1    %[ftmp1],   0x07(%[addr3])                          \n\t"
2291         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
2292         "gsldrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
2293         "gsldlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
2294         "gsldrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
2295         "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
2296         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
2297         "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
2298         "gsldlc1    %[ftmp4],   0x07(%[addr3])                          \n\t"
2299         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
2300         "gsldrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
2301         "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
2302         PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
2303         "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
2304         "gsldlc1    %[ftmp6],   0x07(%[addr3])                          \n\t"
2305         "gsldrc1    %[ftmp6],   0x00(%[addr3])                          \n\t"
             /* addr6 = 4 * stride */
2306         PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
             /* Byte interleave of row pairs (0,1), (2,3), (4,5). */
2307         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
2308         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2309         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
2310         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2311         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
2312         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
             /* addr3 = address of row 7.  ftmp1 is spilled to stack + 0x10 so
              * the register can be reused; it is reloaded into ftmp8 below. */
2313         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
2314         "sdc1       %[ftmp1],   0x10(%[stack])                          \n\t"
2315         "gsldlc1    %[ftmp8],   0x07(%[addr3])                          \n\t"
2316         "gsldrc1    %[ftmp8],   0x00(%[addr3])                          \n\t"
             /* addr7 = 8 * stride (distance to the second group of rows) */
2317         PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
             /* Byte interleave of rows (6,7), then halfword and word
              * interleaves complete the transpose. */
2318         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
2319         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
2320         "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
2321         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
2322         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
2323         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2324         "ldc1       %[ftmp8],   0x10(%[stack])                          \n\t"
             /* Only the high word pair is kept here: the column at pix - 3.
              * The low half (column at pix - 4) is not stored. */
2325         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
2326         "sdc1       %[ftmp0],   0x00(%[stack])                          \n\t"
2327         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
2328         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
2329         "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
2330         "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
             /* Only the low word pair of ftmp6/ftmp0 is kept (column at
              * pix + 2); the column at pix + 3 is not stored. */
2331         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
2332         "punpckhwd  %[ftmp5],   %[ftmp7],       %[ftmp3]                \n\t"
2333         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
2334         "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
2335         "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
             /* Low 8 bytes of stack rows 0x10..0x50: columns pix - 2, pix - 1,
              * pix, pix + 1, pix + 2 for image rows 0..7. */
2336         "sdc1       %[ftmp1],   0x10(%[stack])                          \n\t"
2337         "sdc1       %[ftmp3],   0x20(%[stack])                          \n\t"
2338         "sdc1       %[ftmp7],   0x30(%[stack])                          \n\t"
2339         "sdc1       %[ftmp5],   0x40(%[stack])                          \n\t"
2340         "sdc1       %[ftmp6],   0x50(%[stack])                          \n\t"
             /* Advance both row bases by 8 * stride and repeat the same
              * load/transpose sequence for image rows 8..15. */
2341         PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
2342         PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
2343         "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
2344         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
2345         "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
2346         "gsldlc1    %[ftmp1],   0x07(%[addr3])                          \n\t"
2347         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
2348         "gsldrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
2349         "gsldlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
2350         "gsldrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
2351         "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
2352         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
2353         "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
2354         "gsldlc1    %[ftmp4],   0x07(%[addr3])                          \n\t"
2355         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
2356         "gsldrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
2357         "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
2358         PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
2359         "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
2360         "gsldlc1    %[ftmp6],   0x07(%[addr3])                          \n\t"
2361         "gsldrc1    %[ftmp6],   0x00(%[addr3])                          \n\t"
2362         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
2363         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2364         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
2365         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2366         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
2367         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
             /* Same spill trick as above, this time through stack + 0x18. */
2368         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
2369         "sdc1       %[ftmp1],   0x18(%[stack])                          \n\t"
2370         "gsldlc1    %[ftmp8],   0x07(%[addr3])                          \n\t"
2371         "gsldrc1    %[ftmp8],   0x00(%[addr3])                          \n\t"
2372         "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
2373         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
2374         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
2375         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
2376         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
2377         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2378         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
2379         "ldc1       %[ftmp8],   0x18(%[stack])                          \n\t"
2380         "sdc1       %[ftmp0],   0x08(%[stack])                          \n\t"
2381         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
2382         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
2383         "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
2384         "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
2385         "punpckhwd  %[ftmp5],   %[ftmp7],       %[ftmp3]                \n\t"
2386         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
2387         "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
2388         "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
2389         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
             /* High 8 bytes of each stack row: same columns, image rows 8..15. */
2390         "sdc1       %[ftmp1],   0x18(%[stack])                          \n\t"
2391         "sdc1       %[ftmp3],   0x28(%[stack])                          \n\t"
2392         "sdc1       %[ftmp7],   0x38(%[stack])                          \n\t"
2393         "sdc1       %[ftmp5],   0x48(%[stack])                          \n\t"
2394         "sdc1       %[ftmp6],   0x58(%[stack])                          \n\t"
2395         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2396           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2397           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2398           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2399           [ftmp8]"=&f"(ftmp[8]),
2400           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
2401           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
2402           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
2403           [addr6]"=&r"(addr[6]),            [addr7]"=&r"(addr[7])
2404         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
2405           [stack]"r"(stack)
             /* stack is written through a plain "r" pointer, so the "memory"
              * clobber is what tells the compiler the buffer changed. */
2406         : "memory"
2407     );
2408
         /* &stack[6] is byte offset 0x30, i.e. the buffer row holding the
          * column at pix; buffer rows are 0x10 bytes apart. */
2409     ff_deblock_v_luma_8_mmi((uint8_t *) &stack[6], 0x10, alpha, beta, tc0);
2410
         /* Write-back: buffer rows 0x10, 0x20, 0x30 and 0x40 (columns pix - 2
          * .. pix + 1) are transposed back into 4-byte groups, one per image
          * row.  ftmp7 and ftmp8 are listed as outputs below but are not
          * referenced by this asm block. */
2411     __asm__ volatile (
             /* addr0 = 2 * stride, addr1 = pix - 2, addr6 = 4 * stride,
              * addr2 = 3 * stride, addr7 = 8 * stride, addr4 = row 3 address */
2412         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
2413         PTR_ADDI   "%[addr1],   %[pix],          -0x02                  \n\t"
2414         PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
2415         PTR_ADDU   "%[addr2],   %[addr0],       %[stride]               \n\t"
2416         PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
2417         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
             /* Low halves of the four buffer rows = data for image rows 0..7. */
2418         "ldc1       %[ftmp0],   0x10(%[stack])                          \n\t"
2419         "ldc1       %[ftmp1],   0x20(%[stack])                          \n\t"
2420         "ldc1       %[ftmp2],   0x30(%[stack])                          \n\t"
2421         "ldc1       %[ftmp3],   0x40(%[stack])                          \n\t"
             /* Keep the upper words (image rows 4..7) in ftmp4..ftmp6 for the
              * second group of four stores. */
2422         "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
2423         "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
2424         "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
2425         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2426         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2427         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
2428         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
             /* Each gsswlc1/gsswrc1 pair is one 4-byte store that does not
              * require alignment.  Rows 0..3 follow; punpckhwd moves the
              * upper word down for the next row. */
2429         "gsswlc1    %[ftmp1],   0x03(%[addr1])                          \n\t"
2430         "gsswrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
2431         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
2432         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
2433         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
2434         "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
2435         "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
2436         "gsswlc1    %[ftmp0],   0x03(%[addr5])                          \n\t"
2437         "gsswrc1    %[ftmp0],   0x00(%[addr5])                          \n\t"
2438         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
2439         "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
2440         "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
2441         "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
             /* Rows 4..7, addressed relative to addr4 (row 3). */
2442         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2443         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
2444         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
2445         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
2446         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2447         "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
2448         "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
2449         PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
2450         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
2451         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
2452         "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
2453         "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
2454         "gsswlc1    %[ftmp4],   0x03(%[addr5])                          \n\t"
2455         "gsswrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
2456         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
2457         "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
             /* addr1 is moved to row 8 here, addr4 to row 11 after the row 7
              * store has been issued. */
2458         PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
2459         "gsswlc1    %[ftmp4],   0x03(%[addr3])                          \n\t"
2460         "gsswrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
2461         PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
             /* High halves of the buffer rows = data for image rows 8..15. */
2462         "ldc1       %[ftmp0],   0x18(%[stack])                          \n\t"
2463         "ldc1       %[ftmp1],   0x28(%[stack])                          \n\t"
2464         "ldc1       %[ftmp2],   0x38(%[stack])                          \n\t"
2465         "ldc1       %[ftmp3],   0x48(%[stack])                          \n\t"
             /* addr0 and addr6 are recomputed here; neither was modified
              * since the top of this block, so the values are the same. */
2466         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
2467         "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
2468         PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
2469         "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
2470         "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
2471         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2472         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2473         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
2474         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
2475         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
             /* Rows 8..11. */
2476         "gsswlc1    %[ftmp1],   0x03(%[addr1])                          \n\t"
2477         "gsswrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
2478         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
2479         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
2480         "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
2481         "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
2482         "gsswlc1    %[ftmp0],   0x03(%[addr5])                          \n\t"
2483         "gsswrc1    %[ftmp0],   0x00(%[addr5])                          \n\t"
2484         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
2485         "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
2486         "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
2487         "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
             /* Rows 12..15. */
2488         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2489         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
2490         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
2491         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
2492         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2493         "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
2494         "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
2495         PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
2496         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
2497         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
2498         "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
2499         "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
2500         "gsswlc1    %[ftmp4],   0x03(%[addr5])                          \n\t"
2501         "gsswrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
2502         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
2503         "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
2504         "gsswlc1    %[ftmp4],   0x03(%[addr3])                          \n\t"
2505         "gsswrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
2506         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2507           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2508           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2509           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2510           [ftmp8]"=&f"(ftmp[8]),
2511           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
2512           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
2513           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
2514           [addr6]"=&r"(addr[6]),            [addr7]"=&r"(addr[7])
2515         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
2516           [stack]"r"(stack)
2517         : "memory"
2518     );
2519 }
2520
2521 void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
2522         int beta)
2523 {
2524     uint64_t ptmp[0x11];
2525     uint64_t pdat[4];
2526     double ftmp[9];
2527     mips_reg addr[7];
2528
2529     __asm__ volatile (
2530         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
2531         PTR_ADDI   "%[addr1],   %[pix],         -0x04                   \n\t"
2532         PTR_ADDU   "%[addr2],   %[addr0],       %[stride]               \n\t"
2533         PTR_ADDU   "%[addr3],   %[addr0],       %[addr0]                \n\t"
2534         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
2535         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
2536         "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
2537         "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
2538         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
2539         "gsldlc1    %[ftmp1],   0x07(%[addr5])                          \n\t"
2540         "gsldrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
2541         "gsldlc1    %[ftmp2],   0x07(%[addr6])                          \n\t"
2542         "gsldrc1    %[ftmp2],   0x00(%[addr6])                          \n\t"
2543         PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
2544         "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
2545         "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
2546         PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
2547         "gsldlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
2548         "gsldrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
2549         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
2550         "gsldlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
2551         "gsldrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
2552         "gsldlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
2553         "gsldrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
2554         PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t"
2555         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
2556         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2557         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
2558         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2559         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
2560         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2561         "gsldlc1    %[ftmp8],   0x07(%[addr5])                          \n\t"
2562         "gsldrc1    %[ftmp8],   0x00(%[addr5])                          \n\t"
2563         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
2564         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
2565         "sdc1       %[ftmp3],   0x00(%[ptmp])                           \n\t"
2566         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
2567         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
2568         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
2569         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2570         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
2571         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
2572         "sdc1       %[ftmp2],   0x20(%[ptmp])                           \n\t"
2573         "ldc1       %[ftmp2],   0x00(%[ptmp])                           \n\t"
2574         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
2575         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
2576         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
2577         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
2578         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
2579         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
2580         "sdc1       %[ftmp0],   0x00(%[ptmp])                           \n\t"
2581         "sdc1       %[ftmp5],   0x10(%[ptmp])                           \n\t"
2582         "sdc1       %[ftmp7],   0x40(%[ptmp])                           \n\t"
2583         "sdc1       %[ftmp4],   0x50(%[ptmp])                           \n\t"
2584         "ldc1       %[ftmp8],   0x20(%[ptmp])                           \n\t"
2585         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
2586         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
2587         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
2588         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
2589         PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t"
2590         "sdc1       %[ftmp3],   0x20(%[ptmp])                           \n\t"
2591         "sdc1       %[ftmp0],   0x30(%[ptmp])                           \n\t"
2592         "sdc1       %[ftmp6],   0x60(%[ptmp])                           \n\t"
2593         "sdc1       %[ftmp5],   0x70(%[ptmp])                           \n\t"
2594         PTR_ADDU   "%[addr1],   %[addr1],       %[addr5]                \n\t"
2595         PTR_ADDU   "%[addr4],   %[addr4],       %[addr5]                \n\t"
2596         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
2597         "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
2598         "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
2599         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
2600         "gsldlc1    %[ftmp1],   0x07(%[addr5])                          \n\t"
2601         "gsldrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
2602         "gsldlc1    %[ftmp2],   0x07(%[addr6])                          \n\t"
2603         "gsldrc1    %[ftmp2],   0x00(%[addr6])                          \n\t"
2604         PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
2605         "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
2606         "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
2607         PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
2608         "gsldlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
2609         "gsldrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
2610         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
2611         "gsldlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
2612         "gsldrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
2613         "gsldlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
2614         "gsldrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
2615         PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t"
2616         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
2617         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2618         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
2619         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2620         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
2621         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2622         "gsldlc1    %[ftmp8],   0x07(%[addr5])                          \n\t"
2623         "gsldrc1    %[ftmp8],   0x00(%[addr5])                          \n\t"
2624         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
2625         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
2626         "sdc1       %[ftmp3],   0x08(%[ptmp])                           \n\t"
2627         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
2628         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
2629         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
2630         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2631         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
2632         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
2633         "sdc1       %[ftmp2],   0x28(%[ptmp])                           \n\t"
2634         "ldc1       %[ftmp2],   0x08(%[ptmp])                           \n\t"
2635         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
2636         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
2637         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
2638         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
2639         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
2640         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
2641         "sdc1       %[ftmp0],   0x08(%[ptmp])                           \n\t"
2642         "sdc1       %[ftmp5],   0x18(%[ptmp])                           \n\t"
2643         "sdc1       %[ftmp7],   0x48(%[ptmp])                           \n\t"
2644         "sdc1       %[ftmp4],   0x58(%[ptmp])                           \n\t"
2645         "ldc1       %[ftmp8],   0x28(%[ptmp])                           \n\t"
2646         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
2647         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
2648         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
2649         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
2650         "sdc1       %[ftmp3],   0x28(%[ptmp])                           \n\t"
2651         "sdc1       %[ftmp0],   0x38(%[ptmp])                           \n\t"
2652         "sdc1       %[ftmp6],   0x68(%[ptmp])                           \n\t"
2653         "sdc1       %[ftmp5],   0x78(%[ptmp])                           \n\t"
2654         PTR_S      "%[addr1],   0x00(%[pdat])                           \n\t"
2655         PTR_S      "%[addr2],   0x08(%[pdat])                           \n\t"
2656         PTR_S      "%[addr0],   0x10(%[pdat])                           \n\t"
2657         PTR_S      "%[addr3],   0x18(%[pdat])                           \n\t"
2658         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2659           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2660           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2661           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2662           [ftmp8]"=&f"(ftmp[8]),
2663           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
2664           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
2665           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
2666           [addr6]"=&r"(addr[6])
2667         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
2668           [ptmp]"r"(ptmp),                  [pdat]"r"(pdat)
2669         : "memory"
2670     );
2671
2672     ff_deblock_v_luma_intra_8_mmi((uint8_t *) &ptmp[8], 0x10, alpha, beta);
2673
2674     __asm__ volatile (
2675         PTR_L      "%[addr1],   0x00(%[pdat])                           \n\t"
2676         PTR_L      "%[addr2],   0x08(%[pdat])                           \n\t"
2677         PTR_L      "%[addr0],   0x10(%[pdat])                           \n\t"
2678         PTR_L      "%[addr3],   0x18(%[pdat])                           \n\t"
2679         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
2680         "ldc1       %[ftmp0],   0x08(%[ptmp])                           \n\t"
2681         "ldc1       %[ftmp1],   0x18(%[ptmp])                           \n\t"
2682         "ldc1       %[ftmp2],   0x28(%[ptmp])                           \n\t"
2683         "ldc1       %[ftmp3],   0x38(%[ptmp])                           \n\t"
2684         "ldc1       %[ftmp4],   0x48(%[ptmp])                           \n\t"
2685         "ldc1       %[ftmp5],   0x58(%[ptmp])                           \n\t"
2686         "ldc1       %[ftmp6],   0x68(%[ptmp])                           \n\t"
2687         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
2688         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2689         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
2690         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2691         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
2692         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2693         "ldc1       %[ftmp8],   0x78(%[ptmp])                           \n\t"
2694         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
2695         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
2696         "gssdlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
2697         "gssdrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
2698         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
2699         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
2700         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
2701         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
2702         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2703         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
2704         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
2705         "gssdlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
2706         "gssdrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
2707         "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
2708         "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
2709         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
2710         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
2711         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
2712         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
2713         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
2714         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
2715         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
2716         "gssdlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
2717         "gssdrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
2718         PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
2719         "gssdlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
2720         "gssdrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
2721         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
2722         "gssdlc1    %[ftmp7],   0x07(%[addr6])                          \n\t"
2723         "gssdrc1    %[ftmp7],   0x00(%[addr6])                          \n\t"
2724         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
2725         "gssdlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
2726         "gssdrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
2727         "gsldlc1    %[ftmp8],   0x07(%[addr6])                          \n\t"
2728         "gsldrc1    %[ftmp8],   0x00(%[addr6])                          \n\t"
2729         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
2730         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
2731         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
2732         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
2733         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
2734         "gssdlc1    %[ftmp3],   0x07(%[addr5])                          \n\t"
2735         "gssdrc1    %[ftmp3],   0x00(%[addr5])                          \n\t"
2736         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
2737         "gssdlc1    %[ftmp0],   0x07(%[addr4])                          \n\t"
2738         "gssdrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
2739         PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
2740         "gssdlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
2741         "gssdrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
2742         PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t"
2743         "gssdlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
2744         "gssdrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
2745         PTR_SUBU   "%[addr1],   %[addr1],       %[addr5]                \n\t"
2746         PTR_SUBU   "%[addr4],   %[addr4],       %[addr5]                \n\t"
2747         "ldc1       %[ftmp0],   0x00(%[ptmp])                           \n\t"
2748         "ldc1       %[ftmp1],   0x10(%[ptmp])                           \n\t"
2749         "ldc1       %[ftmp2],   0x20(%[ptmp])                           \n\t"
2750         "ldc1       %[ftmp3],   0x30(%[ptmp])                           \n\t"
2751         "ldc1       %[ftmp4],   0x40(%[ptmp])                           \n\t"
2752         "ldc1       %[ftmp5],   0x50(%[ptmp])                           \n\t"
2753         "ldc1       %[ftmp6],   0x60(%[ptmp])                           \n\t"
2754         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
2755         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
2756         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
2757         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
2758         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
2759         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
2760         "ldc1       %[ftmp8],   0x70(%[ptmp])                           \n\t"
2761         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
2762         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
2763         "gssdlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
2764         "gssdrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
2765         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
2766         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
2767         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
2768         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
2769         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
2770         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
2771         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
2772         "gssdlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
2773         "gssdrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
2774         "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
2775         "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
2776         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
2777         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
2778         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
2779         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
2780         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
2781         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
2782         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
2783         "gssdlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
2784         "gssdrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
2785         PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
2786         "gssdlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
2787         "gssdrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
2788         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
2789         "gssdlc1    %[ftmp7],   0x07(%[addr6])                          \n\t"
2790         "gssdrc1    %[ftmp7],   0x00(%[addr6])                          \n\t"
2791         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
2792         "gssdlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
2793         "gssdrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
2794         "gsldlc1    %[ftmp8],   0x07(%[addr6])                          \n\t"
2795         "gsldrc1    %[ftmp8],   0x00(%[addr6])                          \n\t"
2796         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
2797         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
2798         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
2799         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
2800         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
2801         "gssdlc1    %[ftmp3],   0x07(%[addr5])                          \n\t"
2802         "gssdrc1    %[ftmp3],   0x00(%[addr5])                          \n\t"
2803         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
2804         "gssdlc1    %[ftmp0],   0x07(%[addr4])                          \n\t"
2805         "gssdrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
2806         PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
2807         "gssdlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
2808         "gssdrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
2809         "gssdlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
2810         "gssdrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
2811         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2812           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2813           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2814           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2815           [ftmp8]"=&f"(ftmp[8]),
2816           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
2817           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
2818           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
2819           [addr6]"=&r"(addr[6])
2820         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
2821           [ptmp]"r"(ptmp),                  [pdat]"r"(pdat)
2822         : "memory"
2823     );
2824 }