]> git.sesse.net Git - ffmpeg/blob - libavcodec/mips/h264qpel_mmi.c
Merge commit 'b77fffa127663028169c5ed543956af4b9496c29'
[ffmpeg] / libavcodec / mips / h264qpel_mmi.c
1 /*
2  * Loongson SIMD optimized h264qpel
3  *
4  * Copyright (c) 2015 Loongson Technology Corporation Limited
5  * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23
24 #include "h264dsp_mips.h"
25 #include "hpeldsp_mips.h"
26 #include "libavcodec/bit_depth_template.c"
27 #include "libavutil/mips/asmdefs.h"
28
/*
 * Copy a block that is 4 bytes wide and h rows tall from src to dst.
 *
 * dst/src:             first byte of the destination/source block
 * dstStride/srcStride: byte distance between consecutive rows
 * h:                   number of rows; the loop body runs before h is
 *                      tested, so callers must pass h >= 1
 *
 * NOTE(review): "uld" is a 64-bit unaligned load, so each row appears to
 * read 8 bytes from src although only the low 32 bits are forwarded to the
 * FPU register and stored - confirm that the over-read is acceptable.
 */
29 static inline void copy_block4_mmi(uint8_t *dst, const uint8_t *src,
30         int dstStride, int srcStride, int h)
31 {
32     double ftmp[1];
33     uint64_t low32;
34
35     __asm__ volatile (
36         "1:                                                             \n\t"
        /* fetch the row into a GPR, then move it to the FPU register */
37         "uld        %[low32],   0x00(%[src])                            \n\t"
38         "mtc1       %[low32],   %[ftmp0]                                \n\t"
        /* left/right store pair covering dst[0..3] (presumably alignment-agnostic) */
39         "gsswlc1    %[ftmp0],   0x03(%[dst])                            \n\t"
40         "gsswrc1    %[ftmp0],   0x00(%[dst])                            \n\t"
        /* one row done: decrement the counter and step both pointers */
41         "addi       %[h],       %[h],           -0x01                   \n\t"
42         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
43         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
44         "bnez       %[h],       1b                                      \n\t"
45         : [ftmp0]"=&f"(ftmp[0]),
46           [dst]"+&r"(dst),                  [src]"+&r"(src),
47           [h]"+&r"(h),
48           [low32]"=&r"(low32)
49         : [dstStride]"r"((mips_reg)dstStride),
50           [srcStride]"r"((mips_reg)srcStride)
51         : "memory"
52     );
53 }
54
/*
 * Copy a block that is 8 bytes wide and h rows tall from src to dst.
 *
 * dst/src:             first byte of the destination/source block
 * dstStride/srcStride: byte distance between consecutive rows
 * h:                   number of rows; the loop body runs before h is
 *                      tested, so callers must pass h >= 1
 *
 * Both the load and the store use left/right instruction pairs at offsets
 * 0x07 and 0x00, i.e. they cover bytes 0..7 of the row (presumably without
 * any alignment requirement on src or dst).
 */
55 static inline void copy_block8_mmi(uint8_t *dst, const uint8_t *src,
56         int dstStride, int srcStride, int h)
57 {
58     double ftmp[1];
59
60     __asm__ volatile (
61         "1:                                                             \n\t"
        /* load src[0..7] into the FPU register */
62         "gsldlc1    %[ftmp0],   0x07(%[src])                            \n\t"
63         "gsldrc1    %[ftmp0],   0x00(%[src])                            \n\t"
        /* store it to dst[0..7] */
64         "gssdlc1    %[ftmp0],   0x07(%[dst])                            \n\t"
65         "gssdrc1    %[ftmp0],   0x00(%[dst])                            \n\t"
        /* one row done: decrement the counter and step both pointers */
66         "addi       %[h],       %[h],           -0x01                   \n\t"
67         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
68         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
69         "bnez       %[h],       1b                                      \n\t"
70         : [ftmp0]"=&f"(ftmp[0]),
71           [dst]"+&r"(dst),                  [src]"+&r"(src),
72           [h]"+&r"(h)
73         : [dstStride]"r"((mips_reg)dstStride),
74           [srcStride]"r"((mips_reg)srcStride)
75         : "memory"
76     );
77 }
78
/*
 * Copy a block that is 16 bytes wide and h rows tall from src to dst.
 *
 * dst/src:             first byte of the destination/source block
 * dstStride/srcStride: byte distance between consecutive rows
 * h:                   number of rows; the loop body runs before h is
 *                      tested, so callers must pass h >= 1
 *
 * Each row is moved as two 8-byte halves: bytes 0..7 travel through an FPU
 * register, bytes 8..15 through a general-purpose register.
 */
79 static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src,
80         int dstStride, int srcStride, int h)
81 {
82     double ftmp[1];
83     uint64_t tmp[1];
84
85     __asm__ volatile (
86         "1:                                                             \n\t"
        /* bytes 0..7 -> FPU register */
87         "gsldlc1    %[ftmp0],   0x07(%[src])                            \n\t"
88         "gsldrc1    %[ftmp0],   0x00(%[src])                            \n\t"
        /* bytes 8..15 -> general-purpose register */
89         "ldl        %[tmp0],    0x0f(%[src])                            \n\t"
90         "ldr        %[tmp0],    0x08(%[src])                            \n\t"
        /* write both halves back out to the destination row */
91         "gssdlc1    %[ftmp0],   0x07(%[dst])                            \n\t"
92         "gssdrc1    %[ftmp0],   0x00(%[dst])                            \n\t"
93         "sdl        %[tmp0],    0x0f(%[dst])                            \n\t"
94         "sdr        %[tmp0],    0x08(%[dst])                            \n\t"
        /* one row done: decrement the counter and step both pointers */
95         "addi       %[h],       %[h],           -0x01                   \n\t"
96         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
97         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
98         "bnez       %[h],       1b                                      \n\t"
99         : [ftmp0]"=&f"(ftmp[0]),
100           [tmp0]"=&r"(tmp[0]),
101           [dst]"+&r"(dst),                  [src]"+&r"(src),
102           [h]"+&r"(h)
103         : [dstStride]"r"((mips_reg)dstStride),
104           [srcStride]"r"((mips_reg)srcStride)
105         : "memory"
106     );
107 }
108
/*
 * Final-stage store helpers for a value b that still carries a scale factor
 * of 1024: b is rounded (+512), shifted right by 10 and passed through CLIP
 * (defined outside this excerpt).
 *   op2_put: a = that result
 *   op2_avg: a = rounded-up average of the previous a and that result
 * Both expand to an assignment and evaluate their arguments textually, so
 * "a" must be an lvalue without side effects.
 */
109 #define op2_avg(a, b)  a = (((a)+CLIP(((b) + 512)>>10)+1)>>1)
110 #define op2_put(a, b)  a = CLIP(((b) + 512)>>10)
/*
 * Horizontal 6-tap lowpass filter over a 4x4 block, result written to dst.
 *
 * For every output pixel x of a row the asm evaluates
 *   (20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + (s[x-2] + s[x+3]) + 16) >> 5
 * on 16-bit lanes and packs the result to bytes with unsigned saturation.
 * ff_pw_20, ff_pw_5 and ff_pw_16 are defined elsewhere; presumably they hold
 * the constants 20, 5 and 16 replicated in every 16-bit lane (ff_pw_5 also
 * serves as the shift count).
 *
 * dst/src:             first byte of the destination/source block; the
 *                      filter reads from src-2 onwards on each row
 * dstStride/srcStride: byte distance between consecutive rows
 *
 * The row count (4) is hard-coded in tmp0.
 *
 * NOTE(review): every tap is fetched with "uld", a 64-bit unaligned load,
 * while only the low 32 bits are used; the last tap therefore appears to
 * read up to src+10 - confirm that the over-read is acceptable.
 */
111 static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
112         int dstStride, int srcStride)
113 {
114     double ftmp[10];
115     uint64_t tmp[1];
116     uint64_t low32;
117
118     __asm__ volatile (
        /* ftmp0 = 0 (used for byte->halfword widening), tmp0 = row counter */
119         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
120         "dli        %[tmp0],    0x04                                    \n\t"
121         "1:                                                             \n\t"
        /* ftmp1..ftmp6 = the six taps, starting at src-2 .. src+3 */
122         "uld        %[low32],   -0x02(%[src])                           \n\t"
123         "mtc1       %[low32],   %[ftmp1]                                \n\t"
124         "uld        %[low32],   -0x01(%[src])                           \n\t"
125         "mtc1       %[low32],   %[ftmp2]                                \n\t"
126         "uld        %[low32],   0x00(%[src])                            \n\t"
127         "mtc1       %[low32],   %[ftmp3]                                \n\t"
128         "uld        %[low32],   0x01(%[src])                            \n\t"
129         "mtc1       %[low32],   %[ftmp4]                                \n\t"
130         "uld        %[low32],   0x02(%[src])                            \n\t"
131         "mtc1       %[low32],   %[ftmp5]                                \n\t"
132         "uld        %[low32],   0x03(%[src])                            \n\t"
133         "mtc1       %[low32],   %[ftmp6]                                \n\t"
        /* interleave with zero: widen the low four bytes to 16-bit lanes */
134         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
135         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
136         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
137         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
138         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
139         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        /* pair up the symmetric taps: centre, inner and outer sums */
140         "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
141         "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
142         "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
        /* centre*ff_pw_20 - inner*ff_pw_5 + outer + ff_pw_16, then shift */
143         "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
144         "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
145         "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
146         "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
147         "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t"
148         "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t"
        /* saturate to bytes and store the four result pixels to dst[0..3] */
149         "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t"
150         "gsswlc1    %[ftmp9],   0x03(%[dst])                            \n\t"
151         "gsswrc1    %[ftmp9],   0x00(%[dst])                            \n\t"
        /* next row */
152         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
153         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
154         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
155         "bnez       %[tmp0],    1b                                      \n\t"
156         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
157           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
158           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
159           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
160           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
161           [tmp0]"=&r"(tmp[0]),
162           [dst]"+&r"(dst),                  [src]"+&r"(src),
163           [low32]"=&r"(low32)
164         : [dstStride]"r"((mips_reg)dstStride),
165           [srcStride]"r"((mips_reg)srcStride),
166           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
167           [ff_pw_16]"f"(ff_pw_16)
168         : "memory"
169     );
170 }
171
/*
 * Horizontal 6-tap lowpass filter over an 8x8 block, result written to dst.
 *
 * For every output pixel x of a row the asm evaluates
 *   (20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + (s[x-2] + s[x+3]) + 16) >> 5
 * on 16-bit lanes (low and high four pixels handled in separate registers)
 * and packs the eight results to bytes with unsigned saturation.
 * ff_pw_20, ff_pw_5 and ff_pw_16 are defined elsewhere; presumably they hold
 * the constants 20, 5 and 16 replicated in every 16-bit lane (ff_pw_5 also
 * serves as the shift count).
 *
 * dst/src:             first byte of the destination/source block; each row
 *                      reads bytes src-2 .. src+10
 * dstStride/srcStride: byte distance between consecutive rows
 *
 * The row count (8) is hard-coded in tmp0.
 */
172 static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
173         int dstStride, int srcStride)
174 {
175     double ftmp[11];
176     uint64_t tmp[1];
177
178     __asm__ volatile (
        /* ftmp0 = 0 (used for byte->halfword widening), tmp0 = row counter */
179         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
180         "dli        %[tmp0],    0x08                                    \n\t"
181         "1:                                                             \n\t"
        /* ftmp1..ftmp6 = 8 bytes each, starting at src-2 .. src+3 */
182         "gsldlc1    %[ftmp1],   0x05(%[src])                            \n\t"
183         "gsldrc1    %[ftmp1],   -0x02(%[src])                           \n\t"
184         "gsldlc1    %[ftmp2],   0x06(%[src])                            \n\t"
185         "gsldrc1    %[ftmp2],   -0x01(%[src])                           \n\t"
186         "gsldlc1    %[ftmp3],   0x07(%[src])                            \n\t"
187         "gsldrc1    %[ftmp3],   0x00(%[src])                            \n\t"
188         "gsldlc1    %[ftmp4],   0x08(%[src])                            \n\t"
189         "gsldrc1    %[ftmp4],   0x01(%[src])                            \n\t"
190         "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
191         "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
192         "gsldlc1    %[ftmp6],   0x0a(%[src])                            \n\t"
193         "gsldrc1    %[ftmp6],   0x03(%[src])                            \n\t"
        /* centre taps: (s[x] + s[x+1]) * ff_pw_20 -> ftmp3 (low), ftmp4 (high) */
194         "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
195         "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
196         "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
197         "punpckhbh  %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
198         "paddsh     %[ftmp3],   %[ftmp7],       %[ftmp9]                \n\t"
199         "paddsh     %[ftmp4],   %[ftmp8],       %[ftmp10]               \n\t"
200         "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_20]             \n\t"
201         "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_20]             \n\t"
        /* inner taps: (s[x-1] + s[x+2]) * ff_pw_5 -> ftmp2 (low), ftmp5 (high) */
202         "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
203         "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]                \n\t"
204         "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
205         "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
206         "paddsh     %[ftmp2],   %[ftmp7],       %[ftmp9]                \n\t"
207         "paddsh     %[ftmp5],   %[ftmp8],       %[ftmp10]               \n\t"
208         "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
209         "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_5]              \n\t"
        /* outer taps: s[x-2] + s[x+3] -> ftmp1 (low), ftmp6 (high) */
210         "punpcklbh  %[ftmp7],   %[ftmp1],       %[ftmp0]                \n\t"
211         "punpckhbh  %[ftmp8],   %[ftmp1],       %[ftmp0]                \n\t"
212         "punpcklbh  %[ftmp9],   %[ftmp6],       %[ftmp0]                \n\t"
213         "punpckhbh  %[ftmp10],  %[ftmp6],       %[ftmp0]                \n\t"
214         "paddsh     %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t"
215         "paddsh     %[ftmp6],   %[ftmp8],       %[ftmp10]               \n\t"
        /* combine: centre - inner + outer + ff_pw_16, then shift right */
216         "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
217         "psubsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
218         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
219         "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
220         "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
221         "paddsh     %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
222         "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
223         "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
        /* saturate both halves to bytes and store the row to dst[0..7] */
224         "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t"
225         "gssdlc1    %[ftmp9],   0x07(%[dst])                            \n\t"
226         "gssdrc1    %[ftmp9],   0x00(%[dst])                            \n\t"
        /* next row */
227         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
228         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
229         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
230         "bnez       %[tmp0],    1b                                      \n\t"
231         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
232           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
233           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
234           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
235           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
236           [ftmp10]"=&f"(ftmp[10]),
237           [tmp0]"=&r"(tmp[0]),
238           [dst]"+&r"(dst),                  [src]"+&r"(src)
239         : [dstStride]"r"((mips_reg)dstStride),
240           [srcStride]"r"((mips_reg)srcStride),
241           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
242           [ff_pw_16]"f"(ff_pw_16)
243         : "memory"
244     );
245 }
246
/*
 * Horizontal lowpass filter over a 16x16 block, result written to dst.
 *
 * The block is split into four 8x8 quadrants, each handled by
 * put_h264_qpel8_h_lowpass_mmi(): top-left, top-right, bottom-left and
 * bottom-right, in that order.
 *
 * dst/src:             first byte of the destination/source block
 * dstStride/srcStride: byte distance between consecutive rows
 */
static void put_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    /* start of the lower half: eight rows below the block origin */
    const uint8_t *src_lower = src + 8*srcStride;
    uint8_t *dst_lower = dst + 8*dstStride;

    /* upper half, left then right quadrant */
    put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
    put_h264_qpel8_h_lowpass_mmi(dst + 8, src + 8, dstStride, srcStride);

    /* lower half, left then right quadrant */
    put_h264_qpel8_h_lowpass_mmi(dst_lower, src_lower, dstStride, srcStride);
    put_h264_qpel8_h_lowpass_mmi(dst_lower + 8, src_lower + 8, dstStride,
                                 srcStride);
}
257
/*
 * Horizontal 6-tap lowpass filter over a 4x4 block, averaged into dst.
 *
 * For every output pixel x of a row the asm evaluates
 *   (20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + (s[x-2] + s[x+3]) + 16) >> 5
 * on 16-bit lanes, packs the result to bytes with unsigned saturation and
 * then combines it with the bytes already present in dst using pavgb.
 * ff_pw_20, ff_pw_5 and ff_pw_16 are defined elsewhere; presumably they hold
 * the constants 20, 5 and 16 replicated in every 16-bit lane (ff_pw_5 also
 * serves as the shift count).
 *
 * dst/src:             first byte of the destination/source block; the
 *                      filter reads from src-2 onwards on each row
 * dstStride/srcStride: byte distance between consecutive rows
 *
 * The row count (4) is hard-coded in tmp0.
 *
 * NOTE(review): dst is read with plain "lwc1" but written with the
 * gsswlc1/gsswrc1 pair; the read presumably requires dst to be 4-byte
 * aligned - confirm against the callers.
 * NOTE(review): every tap is fetched with "uld", a 64-bit unaligned load,
 * while only the low 32 bits are used - confirm the over-read is acceptable.
 */
258 static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
259         int dstStride, int srcStride)
260 {
261     double ftmp[11];
262     uint64_t tmp[1];
263     uint64_t low32;
264
265     __asm__ volatile (
        /* ftmp0 = 0 (used for byte->halfword widening), tmp0 = row counter */
266         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
267         "dli        %[tmp0],    0x04                                    \n\t"
268         "1:                                                             \n\t"
        /* ftmp1..ftmp6 = the six taps, starting at src-2 .. src+3 */
269         "uld        %[low32],   -0x02(%[src])                           \n\t"
270         "mtc1       %[low32],   %[ftmp1]                                \n\t"
271         "uld        %[low32],   -0x01(%[src])                           \n\t"
272         "mtc1       %[low32],   %[ftmp2]                                \n\t"
273         "uld        %[low32],   0x00(%[src])                            \n\t"
274         "mtc1       %[low32],   %[ftmp3]                                \n\t"
275         "uld        %[low32],   0x01(%[src])                            \n\t"
276         "mtc1       %[low32],   %[ftmp4]                                \n\t"
277         "uld        %[low32],   0x02(%[src])                            \n\t"
278         "mtc1       %[low32],   %[ftmp5]                                \n\t"
279         "uld        %[low32],   0x03(%[src])                            \n\t"
280         "mtc1       %[low32],   %[ftmp6]                                \n\t"
        /* interleave with zero: widen the low four bytes to 16-bit lanes */
281         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
282         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
283         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
284         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
285         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
286         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        /* pair up the symmetric taps: centre, inner and outer sums */
287         "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
288         "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
289         "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
        /* centre*ff_pw_20 - inner*ff_pw_5 + outer + ff_pw_16, then shift */
290         "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
291         "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
292         "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
293         "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
294         "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t"
295         "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t"
296         "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t"
        /* average with the existing destination pixels, then store */
297         "lwc1       %[ftmp10],  0x00(%[dst])                            \n\t"
298         "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t"
299         "gsswlc1    %[ftmp9],   0x03(%[dst])                            \n\t"
300         "gsswrc1    %[ftmp9],   0x00(%[dst])                            \n\t"
        /* next row */
301         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
302         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
303         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
304         "bnez       %[tmp0],    1b                                      \n\t"
305         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
306           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
307           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
308           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
309           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
310           [ftmp10]"=&f"(ftmp[10]),
311           [tmp0]"=&r"(tmp[0]),
312           [dst]"+&r"(dst),                  [src]"+&r"(src),
313           [low32]"=&r"(low32)
314         : [dstStride]"r"((mips_reg)dstStride),
315           [srcStride]"r"((mips_reg)srcStride),
316           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
317           [ff_pw_16]"f"(ff_pw_16)
318         : "memory"
319     );
320 }
321
/*
 * Horizontal 6-tap lowpass filter over an 8x8 block, averaged into dst.
 *
 * For every output pixel x of a row the asm evaluates
 *   (20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + (s[x-2] + s[x+3]) + 16) >> 5
 * on 16-bit lanes (low and high four pixels handled in separate registers),
 * packs the eight results to bytes with unsigned saturation and then
 * combines them with the bytes already present in dst using pavgb.
 * ff_pw_20, ff_pw_5 and ff_pw_16 are defined elsewhere; presumably they hold
 * the constants 20, 5 and 16 replicated in every 16-bit lane (ff_pw_5 also
 * serves as the shift count).
 *
 * dst/src:             first byte of the destination/source block; each row
 *                      reads bytes src-2 .. src+10
 * dstStride/srcStride: byte distance between consecutive rows
 *
 * The row count (8) is hard-coded in tmp0.
 *
 * NOTE(review): dst is accessed with plain "ldc1"/"sdc1" here, unlike the
 * put variant which stores with the gssdlc1/gssdrc1 pair; this presumably
 * requires dst to be 8-byte aligned - confirm against the callers.
 */
322 static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
323         int dstStride, int srcStride)
324 {
325     double ftmp[11];
326     uint64_t tmp[1];
327
328     __asm__ volatile (
        /* ftmp0 = 0 (used for byte->halfword widening), tmp0 = row counter */
329         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
330         "dli        %[tmp0],    0x08                                    \n\t"
331         "1:                                                             \n\t"
        /* ftmp1..ftmp6 = 8 bytes each, starting at src-2 .. src+3 */
332         "gsldlc1    %[ftmp1],   0x05(%[src])                            \n\t"
333         "gsldrc1    %[ftmp1],   -0x02(%[src])                           \n\t"
334         "gsldlc1    %[ftmp2],   0x06(%[src])                            \n\t"
335         "gsldrc1    %[ftmp2],   -0x01(%[src])                           \n\t"
336         "gsldlc1    %[ftmp3],   0x07(%[src])                            \n\t"
337         "gsldrc1    %[ftmp3],   0x00(%[src])                            \n\t"
338         "gsldlc1    %[ftmp4],   0x08(%[src])                            \n\t"
339         "gsldrc1    %[ftmp4],   0x01(%[src])                            \n\t"
340         "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
341         "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
342         "gsldlc1    %[ftmp6],   0x0a(%[src])                            \n\t"
343         "gsldrc1    %[ftmp6],   0x03(%[src])                            \n\t"
        /* centre taps: (s[x] + s[x+1]) * ff_pw_20 -> ftmp3 (low), ftmp4 (high) */
344         "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
345         "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
346         "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
347         "punpckhbh  %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
348         "paddsh     %[ftmp3],   %[ftmp7],       %[ftmp9]                \n\t"
349         "paddsh     %[ftmp4],   %[ftmp8],       %[ftmp10]               \n\t"
350         "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_20]             \n\t"
351         "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_20]             \n\t"
        /* inner taps: (s[x-1] + s[x+2]) * ff_pw_5 -> ftmp2 (low), ftmp5 (high) */
352         "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
353         "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]                \n\t"
354         "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
355         "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
356         "paddsh     %[ftmp2],   %[ftmp7],       %[ftmp9]                \n\t"
357         "paddsh     %[ftmp5],   %[ftmp8],       %[ftmp10]               \n\t"
358         "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
359         "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_5]              \n\t"
        /* outer taps: s[x-2] + s[x+3] -> ftmp1 (low), ftmp6 (high) */
360         "punpcklbh  %[ftmp7],   %[ftmp1],       %[ftmp0]                \n\t"
361         "punpckhbh  %[ftmp8],   %[ftmp1],       %[ftmp0]                \n\t"
362         "punpcklbh  %[ftmp9],   %[ftmp6],       %[ftmp0]                \n\t"
363         "punpckhbh  %[ftmp10],  %[ftmp6],       %[ftmp0]                \n\t"
364         "paddsh     %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t"
365         "paddsh     %[ftmp6],   %[ftmp8],       %[ftmp10]               \n\t"
        /* combine: centre - inner + outer + ff_pw_16, then shift right */
366         "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
367         "psubsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
368         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
369         "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
370         "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
371         "paddsh     %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
372         "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
373         "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
374         "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t"
        /* average with the existing destination pixels, then store */
375         "ldc1       %[ftmp10],  0x00(%[dst])                            \n\t"
376         "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t"
377         "sdc1       %[ftmp9],   0x00(%[dst])                            \n\t"
        /* next row */
378         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
379         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
380         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
381         "bnez       %[tmp0],    1b                                      \n\t"
382         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
383           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
384           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
385           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
386           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
387           [ftmp10]"=&f"(ftmp[10]),
388           [tmp0]"=&r"(tmp[0]),
389           [dst]"+&r"(dst),                  [src]"+&r"(src)
390         : [dstStride]"r"((mips_reg)dstStride),
391           [srcStride]"r"((mips_reg)srcStride),
392           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
393           [ff_pw_16]"f"(ff_pw_16)
394         : "memory"
395     );
396 }
397
/*
 * Horizontal lowpass filter over a 16x16 block, averaged into dst.
 *
 * The block is split into four 8x8 quadrants, each handled by
 * avg_h264_qpel8_h_lowpass_mmi(): top-left, top-right, bottom-left and
 * bottom-right, in that order.
 *
 * dst/src:             first byte of the destination/source block
 * dstStride/srcStride: byte distance between consecutive rows
 */
static void avg_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    /* start of the lower half: eight rows below the block origin */
    const uint8_t *src_lower = src + 8*srcStride;
    uint8_t *dst_lower = dst + 8*dstStride;

    /* upper half, left then right quadrant */
    avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_mmi(dst + 8, src + 8, dstStride, srcStride);

    /* lower half, left then right quadrant */
    avg_h264_qpel8_h_lowpass_mmi(dst_lower, src_lower, dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_mmi(dst_lower + 8, src_lower + 8, dstStride,
                                 srcStride);
}
408
/*
 * Vertical 6-tap lowpass filter over a 4x4 block, result written to dst.
 *
 * With rows a, b, c, d, e, f being six consecutive source rows (a is two
 * rows above the output row), every output row is computed on 16-bit lanes as
 *   ((((c + d) << 2) - b - e) * ff_pw_5 + a + f + ff_pw_16) >> 5
 * and packed to bytes with unsigned saturation. The shift counts 2 and 5 are
 * loaded into ftmp10 and ftmp11; ff_pw_5 and ff_pw_16 are defined elsewhere
 * and presumably hold 5 and 16 in every 16-bit lane, which makes the
 * expression equal to 20*(c+d) - 5*(b+e) + (a+f) + 16.
 *
 * The four output rows are fully unrolled. Only one new source row is loaded
 * per output row; the registers ftmp1..ftmp6 rotate roles between steps, and
 * the register holding the oldest row is reused as the accumulator for
 * "a + 16 + f" before it is reloaded.
 *
 * dst/src:             first byte of the destination/source block; src is
 *                      moved up by two rows before filtering, and nine rows
 *                      are read in total
 * dstStride/srcStride: byte distance between consecutive rows
 *
 * NOTE(review): ftmp8 and ftmp9 are declared as outputs but never referenced
 * in the asm template.
 * NOTE(review): rows are fetched with "uld", a 64-bit unaligned load, while
 * only the low 32 bits are used; results are stored with plain "swc1", which
 * presumably requires dst to be 4-byte aligned - confirm both.
 */
409 static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
410         int dstStride, int srcStride)
411 {
412     double ftmp[12];
413     uint64_t tmp[1];
414     uint64_t low32;
415
    /* the filter window starts two rows above the first output row */
416     src -= 2 * srcStride;
417
418     __asm__ volatile (
419         ".set       push                                                \n\t"
420         ".set       noreorder                                           \n\t"
        /* ftmp0 = 0; ftmp10 = shift count 2; ftmp11 = shift count 5 */
        /* prologue: load the first five rows into ftmp1..ftmp5 */
421         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
422         "dli        %[tmp0],    0x02                                    \n\t"
423         "uld        %[low32],   0x00(%[src])                            \n\t"
424         "mtc1       %[low32],   %[ftmp1]                                \n\t"
425         "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
426         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
427         "dli        %[tmp0],    0x05                                    \n\t"
428         "uld        %[low32],   0x00(%[src])                            \n\t"
429         "mtc1       %[low32],   %[ftmp2]                                \n\t"
430         "mtc1       %[tmp0],    %[ftmp11]                               \n\t"
431         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
432         "uld        %[low32],   0x00(%[src])                            \n\t"
433         "mtc1       %[low32],   %[ftmp3]                                \n\t"
434         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
435         "uld        %[low32],   0x00(%[src])                            \n\t"
436         "mtc1       %[low32],   %[ftmp4]                                \n\t"
437         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
438         "uld        %[low32],   0x00(%[src])                            \n\t"
439         "mtc1       %[low32],   %[ftmp5]                                \n\t"
440         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
        /* widen the five rows from bytes to 16-bit lanes */
441         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
442         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
443         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
444         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
445         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        /* output row 0: a..f = ftmp1..ftmp6 (ftmp6 loaded here) */
446         "uld        %[low32],   0x00(%[src])                            \n\t"
447         "mtc1       %[low32],   %[ftmp6]                                \n\t"
448         "paddh      %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
449         "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
450         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
451         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
452         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
453         "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
454         "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]             \n\t"
455         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
456         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
457         "paddh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
458         "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
459         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
460         "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
461         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
        /* output row 1: a..f = ftmp2..ftmp6, ftmp1 (ftmp1 reloaded here) */
462         "uld        %[low32],   0x00(%[src])                            \n\t"
463         "mtc1       %[low32],   %[ftmp1]                                \n\t"
464         "paddh      %[ftmp7],   %[ftmp4],       %[ftmp5]                \n\t"
465         "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
466         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
467         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
468         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
469         "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
470         "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
471         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
472         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
473         "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
474         "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
475         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
476         "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
477         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
        /* output row 2: a..f = ftmp3..ftmp6, ftmp1, ftmp2 (ftmp2 reloaded) */
478         "uld        %[low32],   0x00(%[src])                            \n\t"
479         "mtc1       %[low32],   %[ftmp2]                                \n\t"
480         "paddh      %[ftmp7],   %[ftmp5],       %[ftmp6]                \n\t"
481         "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
482         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
483         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
484         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
485         "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
486         "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
487         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
488         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
489         "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
490         "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
491         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
492         "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
493         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
        /* output row 3: a..f = ftmp4..ftmp6, ftmp1..ftmp3 (ftmp3 reloaded) */
494         "uld        %[low32],   0x00(%[src])                            \n\t"
495         "mtc1       %[low32],   %[ftmp3]                                \n\t"
496         "paddh      %[ftmp7],   %[ftmp6],       %[ftmp1]                \n\t"
497         "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
498         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
499         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
500         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
501         "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
502         "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
503         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
504         "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
505         "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
506         "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
507         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
508         "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
509         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
510         ".set       pop                                                 \n\t"
511         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
512           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
513           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
514           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
515           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
516           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
517           [tmp0]"=&r"(tmp[0]),
518           [dst]"+&r"(dst),                  [src]"+&r"(src),
519           [low32]"=&r"(low32)
520         : [dstStride]"r"((mips_reg)dstStride),
521           [srcStride]"r"((mips_reg)srcStride),
522           [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
523         : "memory"
524     );
525 }
526
/*
 * Vertical 6-tap lowpass for an 8x8 block, "put" variant (results overwrite
 * dst).
 *
 * The block is processed as two 4-pixel-wide columns (w = 2).  For every
 * output row the asm computes, per 16-bit lane,
 *     ((((r2 + r3) << 2) - r1 - r4) * ff_pw_5 + r0 + r5 + ff_pw_16) >> 5
 * where r0..r5 are six vertically consecutive source rows starting two rows
 * above the output row.  The result is packed to bytes with packushb and
 * 4 bytes are stored with swc1.  ff_pw_5 / ff_pw_16 are defined elsewhere --
 * presumably 5 and 16 replicated in every halfword, which would make this
 * the (1, -5, 20, 20, -5, 1) filter with rounding; confirm against their
 * definitions.
 *
 * dst:       destination; 8 rows of 2 x 4 bytes are written
 * src:       source; per column, the 13 rows from src - 2 * srcStride to
 *            src + 10 * srcStride are loaded
 * dstStride: byte distance between destination rows
 * srcStride: byte distance between source rows
 *
 * NOTE(review): every row is fetched with "uld" (8-byte unaligned load) but
 * only the low 32 bits are moved into the FPU register by mtc1, so the
 * remaining bytes are read and discarded -- confirm that callers' buffers
 * tolerate that over-read.
 */
527 static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
528         int dstStride, int srcStride)
529 {
530     int w = 2;
531     int h = 8;
532     double ftmp[10];
533     uint64_t tmp[1];
534     uint64_t low32;
535
    /* The filter needs two rows above the first output row. */
536     src -= 2 * srcStride;
537
    /* One iteration per 4-pixel-wide column. */
538     while (w--) {
539         __asm__ volatile (
540             ".set       push                                            \n\t"
541             ".set       noreorder                                       \n\t"
            /*
             * Setup: ftmp8 = 2 (left-shift count), ftmp9 = 5 (final
             * right-shift count), ftmp7 = 0.  The first five source rows
             * are loaded into ftmp0..ftmp4 in between.
             */
542             "dli        %[tmp0],    0x02                                \n\t"
543             "uld        %[low32],   0x00(%[src])                        \n\t"
544             "mtc1       %[low32],   %[ftmp0]                            \n\t"
545             "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
546             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
547             "dli        %[tmp0],    0x05                                \n\t"
548             "uld        %[low32],   0x00(%[src])                        \n\t"
549             "mtc1       %[low32],   %[ftmp1]                            \n\t"
550             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
551             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
552             "uld        %[low32],   0x00(%[src])                        \n\t"
553             "mtc1       %[low32],   %[ftmp2]                            \n\t"
554             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
555             "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
556             "uld        %[low32],   0x00(%[src])                        \n\t"
557             "mtc1       %[low32],   %[ftmp3]                            \n\t"
558             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
559             "uld        %[low32],   0x00(%[src])                        \n\t"
560             "mtc1       %[low32],   %[ftmp4]                            \n\t"
561             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
            /*
             * Output row 0.  punpcklbh with the zero register ftmp7 widens
             * the loaded bytes to halfwords; the sixth row goes to ftmp5.
             */
562             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
563             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
564             "uld        %[low32],   0x00(%[src])                        \n\t"
565             "mtc1       %[low32],   %[ftmp5]                            \n\t"
566             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
567             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
568             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
569             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
570             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
571             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
572             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
573             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
574             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
575             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
576             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
577             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
578             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
579             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
580             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
581             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
582             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
            /*
             * Every following output row repeats the same pattern with the
             * register roles rotated: the register holding the oldest row
             * accumulates "oldest + ff_pw_16 + newest" and is afterwards
             * reloaded with the next source row; ftmp6 holds the result.
             */
583             "uld        %[low32],   0x00(%[src])                        \n\t"
584             "mtc1       %[low32],   %[ftmp0]                            \n\t"
585             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
586             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
587             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
588             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
589             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
590             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
591             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
592             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
593             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
594             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
595             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
596             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
597             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
598             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
599             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
600             "uld        %[low32],   0x00(%[src])                        \n\t"
601             "mtc1       %[low32],   %[ftmp1]                            \n\t"
602             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
603             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
604             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
605             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
606             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
607             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
608             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
609             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
610             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
611             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
612             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
613             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
614             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
615             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
616             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
617             "uld        %[low32],   0x00(%[src])                        \n\t"
618             "mtc1       %[low32],   %[ftmp2]                            \n\t"
619             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
620             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
621             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
622             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
623             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
624             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
625             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
626             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
627             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
628             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
629             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
630             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
631             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
632             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
633             "uld        %[low32],   0x00(%[src])                        \n\t"
634             "mtc1       %[low32],   %[ftmp3]                            \n\t"
635             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
636             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
637             "punpcklbh  %[ftmp3] ,  %[ftmp3],       %[ftmp7]            \n\t"
638             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
639             "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
640             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
641             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
642             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
643             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
644             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
645             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
646             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
647             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
648             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
649             "uld        %[low32],   0x00(%[src])                        \n\t"
650             "mtc1       %[low32],   %[ftmp4]                            \n\t"
651             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
652             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
653             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
654             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
655             "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
656             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
657             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
658             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
659             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
660             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
661             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
662             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
663             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
664             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
665             "uld        %[low32],   0x00(%[src])                        \n\t"
666             "mtc1       %[low32],   %[ftmp5]                            \n\t"
667             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
668             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
669             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
670             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
671             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
672             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
673             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
674             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
675             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
676             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
677             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
678             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
679             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
680             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
681             "uld        %[low32],   0x00(%[src])                        \n\t"
682             "mtc1       %[low32],   %[ftmp0]                            \n\t"
683             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
684             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
685             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
686             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
687             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
688             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
689             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
690             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
691             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
692             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
693             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
            /*
             * Eight rows are done.  h is 8 in this function and is never
             * written by the asm, so this branch is always taken and the
             * code for rows 8..15 below is skipped.  With .set noreorder
             * the PTR_ADDU that follows is presumably the branch delay
             * slot; the dst rewind after the asm relies on that eighth dst
             * advance having been executed.
             */
694             "bne        %[h],       0x10,           2f                  \n\t"
695             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
            /* Rows 8..15: only reached when h == 0x10. */
696             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
697             "uld        %[low32],   0x00(%[src])                        \n\t"
698             "mtc1       %[low32],   %[ftmp1]                            \n\t"
699             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
700             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
701             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
702             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
703             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
704             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
705             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
706             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
707             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
708             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
709             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
710             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
711             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
712             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
713             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
714             "uld        %[low32],   0x00(%[src])                        \n\t"
715             "mtc1       %[low32],   %[ftmp2]                            \n\t"
716             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
717             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
718             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
719             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
720             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
721             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
722             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
723             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
724             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
725             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
726             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
727             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
728             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
729             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
730             "uld        %[low32],   0x00(%[src])                        \n\t"
731             "mtc1       %[low32],   %[ftmp3]                            \n\t"
732             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
733             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
734             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
735             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
736             "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
737             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
738             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
739             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
740             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
741             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
742             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
743             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
744             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
745             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
746             "uld        %[low32],   0x00(%[src])                        \n\t"
747             "mtc1       %[low32],   %[ftmp4]                            \n\t"
748             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
749             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
750             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
751             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
752             "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
753             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
754             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
755             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
756             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
757             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
758             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
759             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
760             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
761             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
762             "uld        %[low32],   0x00(%[src])                        \n\t"
763             "mtc1       %[low32],   %[ftmp5]                            \n\t"
764             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
765             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
766             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
767             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
768             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
769             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
770             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
771             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
772             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
773             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
774             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
775             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
776             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
777             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
778             "uld        %[low32],   0x00(%[src])                        \n\t"
779             "mtc1       %[low32],   %[ftmp0]                            \n\t"
780             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
781             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
782             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
783             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
784             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
785             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
786             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
787             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
788             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
789             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
790             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
791             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
792             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
793             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
794             "uld        %[low32],   0x00(%[src])                        \n\t"
795             "mtc1       %[low32],   %[ftmp1]                            \n\t"
796             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
797             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
798             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
799             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
800             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
801             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
802             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
803             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
804             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
805             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
806             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
807             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
808             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
809             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
810             "uld        %[low32],   0x00(%[src])                        \n\t"
811             "mtc1       %[low32],   %[ftmp2]                            \n\t"
812             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
813             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
814             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
815             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
816             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
817             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
818             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
819             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
820             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
821             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
822             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
823             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
            /* Common exit for the 8-row and 16-row paths. */
824             "2:                                                         \n\t"
825             ".set       pop                                             \n\t"
826             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
827               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
828               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
829               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
830               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
831               [tmp0]"=&r"(tmp[0]),
832               [src]"+&r"(src),              [dst]"+&r"(dst),
833               [h]"+&r"(h),
834               [low32]"=&r"(low32)
835             : [dstStride]"r"((mips_reg)dstStride),
836               [srcStride]"r"((mips_reg)srcStride),
837               [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
838             : "memory"
839         );
840
        /*
         * On the h == 8 path the asm advanced src by h + 5 rows and dst by
         * h rows; go back to the top row and step 4 bytes to the right for
         * the next column.
         */
841         src += 4 - (h + 5) * srcStride;
842         dst += 4 - h * dstStride;
843     }
844 }
845
/*
 * Vertical lowpass ("put") for a 16x16 block, built from four 8x8 calls.
 *
 * The quadrants are filtered in the order top-left, top-right, bottom-left,
 * bottom-right; the right-hand quadrants start 8 bytes further along the
 * row and the bottom ones 8 rows further down.
 *
 * dst/src:              top-left corner of the destination / source block
 * dstStride/srcStride:  byte distance between rows of dst / src
 */
static void put_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    /* Start of the lower half: 8 rows below the top of each plane. */
    uint8_t *dst_lower       = dst + 8 * dstStride;
    const uint8_t *src_lower = src + 8 * srcStride;

    /* Upper half: left and right 8x8 quadrants. */
    put_h264_qpel8_v_lowpass_mmi(dst,     src,     dstStride, srcStride);
    put_h264_qpel8_v_lowpass_mmi(dst + 8, src + 8, dstStride, srcStride);

    /* Lower half: left and right 8x8 quadrants. */
    put_h264_qpel8_v_lowpass_mmi(dst_lower,     src_lower,
                                 dstStride, srcStride);
    put_h264_qpel8_v_lowpass_mmi(dst_lower + 8, src_lower + 8,
                                 dstStride, srcStride);
}
856
857 static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
858         int dstStride, int srcStride)
859 {
860     double ftmp[10];
861     uint64_t tmp[1];
862
863     src -= 2 * srcStride;
864
865     __asm__ volatile (
866         ".set       push                                                \n\t"
867         ".set       noreorder                                           \n\t"
868         "dli        %[tmp0],    0x02                                    \n\t"
869         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
870         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
871         "dli        %[tmp0],    0x05                                    \n\t"
872         "lwc1       %[ftmp0],   0x00(%[src])                            \n\t"
873         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
874         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
875         "lwc1       %[ftmp1],   0x00(%[src])                            \n\t"
876         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
877         "lwc1       %[ftmp2],   0x00(%[src])                            \n\t"
878         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
879         "lwc1       %[ftmp3],   0x00(%[src])                            \n\t"
880         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
881         "lwc1       %[ftmp4],   0x00(%[src])                            \n\t"
882         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
883         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
884         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
885         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
886         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
887         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
888         "lwc1       %[ftmp5],   0x00(%[src])                            \n\t"
889         "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
890         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
891         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
892         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
893         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
894         "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
895         "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]             \n\t"
896         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
897         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
898         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
899         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
900         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
901         "lwc1       %[ftmp0],   0x00(%[dst])                            \n\t"
902         "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
903         "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
904         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
905         "lwc1       %[ftmp0],   0x00(%[src])                            \n\t"
906         "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]                \n\t"
907         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
908         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
909         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
910         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
911         "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
912         "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]             \n\t"
913         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
914         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
915         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
916         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
917         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
918         "lwc1       %[ftmp1],   0x00(%[dst])                            \n\t"
919         "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
920         "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
921         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
922         "lwc1       %[ftmp1],   0x00(%[src])                            \n\t"
923         "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]                \n\t"
924         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
925         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
926         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
927         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
928         "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
929         "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
930         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
931         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
932         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
933         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
934         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
935         "lwc1       %[ftmp2],   0x00(%[dst])                            \n\t"
936         "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
937         "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
938         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
939         "lwc1       %[ftmp2],   0x00(%[src])                            \n\t"
940         "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
941         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
942         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
943         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
944         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
945         "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
946         "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
947         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
948         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
949         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
950         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
951         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
952         "lwc1       %[ftmp3],   0x00(%[dst])                            \n\t"
953         "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
954         "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
955         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
956         ".set       pop                                                 \n\t"
957         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
958           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
959           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
960           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
961           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
962           [tmp0]"=&r"(tmp[0]),
963           [src]"+&r"(src),              [dst]"+&r"(dst)
964         : [dstStride]"r"((mips_reg)dstStride),
965           [srcStride]"r"((mips_reg)srcStride),
966           [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
967         : "memory"
968     );
969 }
970
/*
 * Vertical low-pass ("v_lowpass") over an 8x8 block, with the result averaged
 * into dst (pavgb against the bytes already stored there).
 *
 * The block is processed as two 4-byte-wide column strips (w = 2).  Each strip
 * runs the asm below once and produces h = 8 output rows.  For every output
 * row the asm evaluates, per 16-bit lane,
 *
 *     ((((c + d) << 2) - b - e) * ff_pw_5 + a + f + ff_pw_16) >> 5
 *
 * where a..f are six vertically consecutive source rows; the result is packed
 * with packushb, averaged with the 4 bytes at dst and stored back.
 * NOTE(review): if ff_pw_5 / ff_pw_16 hold 5 / 16 in every lane this is the
 * (1, -5, 20, 20, -5, 1) 6-tap filter with rounding -- both constants are
 * defined outside this chunk, confirm there.
 *
 * dst        destination block; read (for the average) and written
 * src        source block; rows -2 .. +10 relative to this pointer are loaded
 * dstStride  byte step between destination rows
 * srcStride  byte step between source rows
 */
971 static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
972         int dstStride, int srcStride)
973 {
974     int w = 2;
975     int h = 8;
976     double ftmp[10];
977     uint64_t tmp[1];
978     uint64_t low32;
979
     /* The first filter tap sits two rows above the first output row. */
980     src -= 2 * srcStride;
981
     /* One iteration per 4-byte-wide strip. */
982     while (w--) {
983         __asm__ volatile (
984             ".set       push                                            \n\t"
985             ".set       noreorder                                       \n\t"
                 /*
                  * Constants: ftmp9 = 2 (psllh count), ftmp8 = 5 (psrah count),
                  * ftmp7 = 0 (second operand of every punpcklbh, presumably to
                  * widen bytes to halfwords).  Interleaved with them, the first
                  * five source rows are loaded into ftmp0..ftmp4; each row goes
                  * through a GPR (uld into low32) and is then moved with mtc1.
                  */
986             "dli        %[tmp0],    0x02                                \n\t"
987             "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
988             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
989             "dli        %[tmp0],    0x05                                \n\t"
990             "uld        %[low32],   0x00(%[src])                        \n\t"
991             "mtc1       %[low32],   %[ftmp0]                            \n\t"
992             "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
993             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
994             "uld        %[low32],   0x00(%[src])                        \n\t"
995             "mtc1       %[low32],   %[ftmp1]                            \n\t"
996             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
997             "uld        %[low32],   0x00(%[src])                        \n\t"
998             "mtc1       %[low32],   %[ftmp2]                            \n\t"
999             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1000             "uld        %[low32],   0x00(%[src])                        \n\t"
1001             "mtc1       %[low32],   %[ftmp3]                            \n\t"
1002             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1003             "uld        %[low32],   0x00(%[src])                        \n\t"
1004             "mtc1       %[low32],   %[ftmp4]                            \n\t"
1005             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1006             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1007             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1008             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1009             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1010             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
                 /*
                  * Output row 0: taps a..f = ftmp0,1,2,3,4,5 (f is loaded here).
                  * ftmp6 accumulates the result.  Once the row is stored, the
                  * register that held tap a is reused first for the dst bytes
                  * fed to pavgb and then for the next source row, so the six
                  * row registers rotate by one position per output row.
                  */
1011             "uld        %[low32],   0x00(%[src])                        \n\t"
1012             "mtc1       %[low32],   %[ftmp5]                            \n\t"
1013             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1014             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1015             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1016             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1017             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1018             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1019             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1020             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1021             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1022             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1023             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1024             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1025             "lwc1       %[ftmp0],   0x00(%[dst])                        \n\t"
1026             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1027             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1028             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
                 /* Output row 1: taps a..f = ftmp1,2,3,4,5,0. */
1029             "uld        %[low32],   0x00(%[src])                        \n\t"
1030             "mtc1       %[low32],   %[ftmp0]                            \n\t"
1031             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1032             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1033             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1034             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1035             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1036             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1037             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1038             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1039             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1040             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1041             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1042             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1043             "lwc1       %[ftmp1],   0x00(%[dst])                        \n\t"
1044             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1045             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1046             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
                 /* Output row 2: taps a..f = ftmp2,3,4,5,0,1. */
1047             "uld        %[low32],   0x00(%[src])                        \n\t"
1048             "mtc1       %[low32],   %[ftmp1]                            \n\t"
1049             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1050             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1051             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1052             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1053             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1054             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1055             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1056             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1057             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1058             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1059             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1060             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1061             "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
1062             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1063             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1064             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
                 /* Output row 3: taps a..f = ftmp3,4,5,0,1,2. */
1065             "uld        %[low32],   0x00(%[src])                        \n\t"
1066             "mtc1       %[low32],   %[ftmp2]                            \n\t"
1067             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1068             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1069             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1070             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1071             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1072             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1073             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1074             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1075             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1076             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1077             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1078             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1079             "lwc1       %[ftmp3],   0x00(%[dst])                        \n\t"
1080             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1081             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1082             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
                 /* Output row 4: taps a..f = ftmp4,5,0,1,2,3. */
1083             "uld        %[low32],   0x00(%[src])                        \n\t"
1084             "mtc1       %[low32],   %[ftmp3]                            \n\t"
1085             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
1086             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1087             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1088             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1089             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1090             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1091             "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
1092             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1093             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
1094             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1095             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1096             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1097             "lwc1       %[ftmp4],   0x00(%[dst])                        \n\t"
1098             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1099             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1100             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
                 /* Output row 5: taps a..f = ftmp5,0,1,2,3,4. */
1101             "uld        %[low32],   0x00(%[src])                        \n\t"
1102             "mtc1       %[low32],   %[ftmp4]                            \n\t"
1103             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
1104             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1105             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1106             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1107             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1108             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1109             "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
1110             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1111             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
1112             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1113             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1114             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1115             "lwc1       %[ftmp5],   0x00(%[dst])                        \n\t"
1116             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1117             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1118             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
                 /* Output row 6: taps a..f = ftmp0,1,2,3,4,5 (rotation wrapped). */
1119             "uld        %[low32],   0x00(%[src])                        \n\t"
1120             "mtc1       %[low32],   %[ftmp5]                            \n\t"
1121             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1122             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1123             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1124             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1125             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1126             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1127             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1128             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1129             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1130             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1131             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1132             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1133             "lwc1       %[ftmp0],   0x00(%[dst])                        \n\t"
1134             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1135             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1136             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
                 /* Output row 7: taps a..f = ftmp1,2,3,4,5,0. */
1137             "uld        %[low32],   0x00(%[src])                        \n\t"
1138             "mtc1       %[low32],   %[ftmp0]                            \n\t"
1139             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1140             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1141             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1142             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1143             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1144             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1145             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1146             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1147             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1148             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1149             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1150             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1151             "lwc1       %[ftmp1],   0x00(%[dst])                        \n\t"
1152             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1153             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
                 /*
                  * Skip the remaining rows unless h == 0x10.  h is 8 in this
                  * function and is never written, so the branch is always taken
                  * and rows 8..15 below do not run here.  Under .set noreorder
                  * the PTR_ADDU right after the bne occupies the branch delay
                  * slot, so dst is still advanced past row 7; the rewind after
                  * the asm (h * dstStride) relies on that.
                  */
1154             "bne        %[h],       0x10,           2f                  \n\t"
1155             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
                 /* Rows 8..15 (h == 0x10 only): same step, same register rotation. */
1156             "uld        %[low32],   0x00(%[src])                        \n\t"
1157             "mtc1       %[low32],   %[ftmp1]                            \n\t"
1158             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1159             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1160             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1161             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1162             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1163             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1164             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1165             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1166             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1167             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1168             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1169             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1170             "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
1171             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1172             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1173             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1174             "uld        %[low32],   0x00(%[src])                        \n\t"
1175             "mtc1       %[low32],   %[ftmp2]                            \n\t"
1176             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1177             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1178             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1179             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1180             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1181             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1182             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1183             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1184             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1185             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1186             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1187             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1188             "lwc1       %[ftmp3],   0x00(%[dst])                        \n\t"
1189             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1190             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1191             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1192             "uld        %[low32],   0x00(%[src])                        \n\t"
1193             "mtc1       %[low32],   %[ftmp3]                            \n\t"
1194             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
1195             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1196             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1197             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1198             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1199             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1200             "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
1201             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1202             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
1203             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1204             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1205             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1206             "lwc1       %[ftmp4],   0x00(%[dst])                        \n\t"
1207             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1208             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1209             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1210             "uld        %[low32],   0x00(%[src])                        \n\t"
1211             "mtc1       %[low32],   %[ftmp4]                            \n\t"
1212             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
1213             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1214             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1215             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1216             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1217             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1218             "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
1219             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1220             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
1221             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1222             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1223             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1224             "lwc1       %[ftmp5],   0x00(%[dst])                        \n\t"
1225             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1226             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1227             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1228             "uld        %[low32],   0x00(%[src])                        \n\t"
1229             "mtc1       %[low32],   %[ftmp5]                            \n\t"
1230             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1231             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1232             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1233             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1234             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1235             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1236             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1237             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1238             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1239             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1240             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1241             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1242             "lwc1       %[ftmp0],   0x00(%[dst])                        \n\t"
1243             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1244             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1245             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1246             "uld        %[low32],   0x00(%[src])                        \n\t"
1247             "mtc1       %[low32],   %[ftmp0]                            \n\t"
1248             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1249             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1250             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1251             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1252             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1253             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1254             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1255             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1256             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1257             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1258             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1259             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1260             "lwc1       %[ftmp1],   0x00(%[dst])                        \n\t"
1261             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1262             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1263             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1264             "uld        %[low32],   0x00(%[src])                        \n\t"
1265             "mtc1       %[low32],   %[ftmp1]                            \n\t"
1266             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1267             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1268             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1269             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1270             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1271             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1272             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1273             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1274             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1275             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1276             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1277             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1278             "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
1279             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1280             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1281             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1282             "uld        %[low32],   0x00(%[src])                        \n\t"
1283             "mtc1       %[low32],   %[ftmp2]                            \n\t"
1284             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1285             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
1286             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1287             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1288             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1289             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1290             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1291             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1292             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1293             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1294             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
1295             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
1296             "lwc1       %[ftmp3],   0x00(%[dst])                        \n\t"
1297             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1298             "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
1299             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1300             "2:                                                         \n\t"
1301             ".set       pop                                             \n\t"
                 /*
                  * Outputs: all scratch registers are early-clobber; src and dst
                  * are advanced in place.  h is declared read-write although no
                  * instruction above writes it.
                  */
1302             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1303               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1304               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1305               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1306               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1307               [tmp0]"=&r"(tmp[0]),
1308               [src]"+&r"(src),              [dst]"+&r"(dst),
1309               [h]"+&r"(h),
1310               [low32]"=&r"(low32)
1311             : [dstStride]"r"((mips_reg)dstStride),
1312               [srcStride]"r"((mips_reg)srcStride),
1313               [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
1314             : "memory"
1315         );
1316
         /*
          * The asm stepped src down h + 5 rows and dst down h rows.  Rewind
          * both to the top of the block and move 4 bytes right to the next
          * column strip.
          */
1317         src += 4 - (h + 5) * srcStride;
1318         dst += 4 - h * dstStride;
1319     }
1320 }
1321
/*
 * 16x16 averaging vertical low-pass, built from four calls to
 * avg_h264_qpel8_v_lowpass_mmi: the upper 8 rows first (left half, then the
 * half starting 8 bytes to the right), followed by the lower 8 rows in the
 * same left/right order.  dst and src are stepped by their own strides.
 */
1322 static void avg_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
1323         int dstStride, int srcStride)
1324 {
1325     uint8_t *const dst_lower = dst + 8 * dstStride;       /* row 8 of dst */
1326     const uint8_t *const src_lower = src + 8 * srcStride; /* row 8 of src */
1327     avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
1328     avg_h264_qpel8_v_lowpass_mmi(dst + 8, src + 8, dstStride, srcStride);
1329     avg_h264_qpel8_v_lowpass_mmi(dst_lower, src_lower, dstStride, srcStride);
1330     avg_h264_qpel8_v_lowpass_mmi(dst_lower + 8, src_lower + 8, dstStride, srcStride);
1331 }
1332
/*
 * Two-pass (horizontal, then vertical) 6-tap lowpass for a 4x4 block.
 *
 * Pass 1 (inline asm): for nine source rows, starting two rows above src,
 * compute the horizontal filter 20*(p0+p1) - 5*(p-1+p2) + (p-2+p3) for four
 * adjacent pixels and store the four 16-bit results into the local buffer
 * _tmp (9 rows * 4 values = 36 entries).
 *
 * Pass 2 (C loop): for each of the four columns, run the same tap pattern
 * vertically over the nine intermediate values and hand the sum to
 * op2_put(), which is defined outside this block (presumably it rounds,
 * scales and clips before storing into dst).
 *
 * dst/src:             destination and source pixel pointers
 * dstStride/srcStride: byte distance between consecutive rows
 */
1333 static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
1334         int dstStride, int srcStride)
1335 {
1336     INIT_CLIP
1337     int i;
1338     int16_t _tmp[36];
1339     int16_t *tmp = _tmp;
1340     double ftmp[10];
1341     uint64_t tmp0;
1342     uint64_t low32;
1343
         /* The vertical taps reach two rows above the block. */
1344     src -= 2*srcStride;
1345
1346     __asm__ volatile (
             /* ftmp0 = 0, used as the zero operand of the byte unpacks;
              * tmp0 = 9 is the row counter. */
1347         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1348         "dli        %[tmp0],    0x09                                    \n\t"
1349         "1:                                                             \n\t"
             /* Six overlapping loads at byte offsets -2..+3; only the low
              * 32 bits of each are moved into the FP register. */
1350         "uld        %[low32],   -0x02(%[src])                           \n\t"
1351         "mtc1       %[low32],   %[ftmp1]                                \n\t"
1352         "uld        %[low32],   -0x01(%[src])                           \n\t"
1353         "mtc1       %[low32],   %[ftmp2]                                \n\t"
1354         "uld        %[low32],   0x00(%[src])                            \n\t"
1355         "mtc1       %[low32],   %[ftmp3]                                \n\t"
1356         "uld        %[low32],   0x01(%[src])                            \n\t"
1357         "mtc1       %[low32],   %[ftmp4]                                \n\t"
1358         "uld        %[low32],   0x02(%[src])                            \n\t"
1359         "mtc1       %[low32],   %[ftmp5]                                \n\t"
1360         "uld        %[low32],   0x03(%[src])                            \n\t"
1361         "mtc1       %[low32],   %[ftmp6]                                \n\t"
             /* Interleave each set of four bytes with zero (ftmp0). */
1362         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1363         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1364         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1365         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1366         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1367         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
             /* ftmp9 = (ftmp3+ftmp4)*ff_pw_20 - (ftmp2+ftmp5)*ff_pw_5
              *         + (ftmp1+ftmp6)
              * ff_pw_20 / ff_pw_5 are declared elsewhere; presumably 20 and
              * 5 replicated in every 16-bit lane. */
1368         "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
1369         "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
1370         "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
1371         "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
1372         "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
1373         "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1374         "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
             /* Store one 8-byte row of intermediates, then step src by one
              * row and tmp by tmpStride (8 bytes). */
1375         "sdc1       %[ftmp9],   0x00(%[tmp])                            \n\t"
1376         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
1377         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
1378         PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t"
1379         "bnez       %[tmp0],    1b                                      \n\t"
1380         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1381           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1382           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1383           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1384           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1385           [tmp0]"=&r"(tmp0),
1386           [tmp]"+&r"(tmp),                  [src]"+&r"(src),
1387           [low32]"=&r"(low32)
1388         : [tmpStride]"r"(8),
1389           [srcStride]"r"((mips_reg)srcStride),
1390           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5)
1391         : "memory"
1392     );
1393
         /* The asm advanced tmp by 9 rows * 8 bytes = 36 int16_t entries, i.e.
          * to the end of _tmp. Stepping back 28 leaves tmp at _tmp + 8, so that
          * tmp[-8] below addresses the first stored row and tmp[24] the last. */
1394     tmp -= 28;
1395
         /* One column per iteration; consecutive rows are 4 entries apart. */
1396     for (i=0; i<4; i++) {
             /* Note: the local tmp0 here shadows the uint64_t tmp0 declared at
              * function scope (used only as the asm row counter). */
1397         const int16_t tmpB= tmp[-8];
1398         const int16_t tmpA= tmp[-4];
1399         const int16_t tmp0= tmp[ 0];
1400         const int16_t tmp1= tmp[ 4];
1401         const int16_t tmp2= tmp[ 8];
1402         const int16_t tmp3= tmp[12];
1403         const int16_t tmp4= tmp[16];
1404         const int16_t tmp5= tmp[20];
1405         const int16_t tmp6= tmp[24];
1406         op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
1407         op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
1408         op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
1409         op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
1410         dst++;
1411         tmp++;
1412     }
1413 }
1414
/*
 * First (vertical) pass of the 8x8 / 16x16 "hv" lowpass.
 *
 * Works on strips of four columns; (size + 8) >> 2 strips are processed
 * (4 strips for size 8, 6 strips for size 16). For every strip the unrolled
 * asm produces `size` output rows, each computed from six consecutive source
 * rows a..f as
 *     ((c + d) * 4 - b - e) * ff_pw_5 + a + f + ff_pw_16
 * and stored as four 16-bit values into tmp, with output rows 0x30 bytes
 * apart. ff_pw_5 / ff_pw_16 are declared elsewhere; presumably 5 and 16
 * replicated in every 16-bit lane.
 *
 * tmp:       intermediate buffer written by this function
 * src:       source pixels; reading starts two rows above and two columns
 *            to the left of this pointer
 * tmpStride: not used in this function; the row pitch of tmp is hard-coded
 *            as 0x30 bytes in the store offsets
 * srcStride: byte distance between consecutive source rows
 * size:      the asm only distinguishes size == 0x10 (16 rows) from anything
 *            else (8 rows)
 */
1415 static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
1416         const uint8_t *src, ptrdiff_t tmpStride, ptrdiff_t srcStride, int size)
1417 {
1418     int w = (size + 8) >> 2;
1419     double ftmp[11];
1420     uint64_t tmp0;
1421     uint64_t low32;
1422
1423     src -= 2 * srcStride + 2;
1424
1425     while (w--) {
1426         __asm__ volatile (
                 /* ftmp10 = 2 (shift count for the "* 4"), ftmp7 = 0 (zero
                  * operand for the byte unpacks). Prime the window with the
                  * first five source rows in ftmp0..ftmp4. */
1427             "dli        %[tmp0],    0x02                                \n\t"
1428             "uld        %[low32],   0x00(%[src])                        \n\t"
1429             "mtc1       %[low32],   %[ftmp0]                            \n\t"
1430             "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
1431             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1432             "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
1433             "uld        %[low32],   0x00(%[src])                        \n\t"
1434             "mtc1       %[low32],   %[ftmp1]                            \n\t"
1435             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1436             "uld        %[low32],   0x00(%[src])                        \n\t"
1437             "mtc1       %[low32],   %[ftmp2]                            \n\t"
1438             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1439             "uld        %[low32],   0x00(%[src])                        \n\t"
1440             "mtc1       %[low32],   %[ftmp3]                            \n\t"
1441             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1442             "uld        %[low32],   0x00(%[src])                        \n\t"
1443             "mtc1       %[low32],   %[ftmp4]                            \n\t"
1444             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1445             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1446             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1447             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1448             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1449             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
                 /* Output row 0. Every following group repeats this pattern:
                  * load the next source row into the register that held the
                  * oldest row, evaluate the filter into ftmp6, store it, and
                  * advance src. ftmp0..ftmp5 thus rotate as a six-row
                  * window. */
1450             "uld        %[low32],   0x00(%[src])                        \n\t"
1451             "mtc1       %[low32],   %[ftmp5]                            \n\t"
1452             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1453             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1454             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1455             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1456             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1457             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1458             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1459             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1460             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1461             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1462             "sdc1       %[ftmp6],   0x00(%[tmp])                        \n\t"
                 /* Output row 1 */
1463             "uld        %[low32],   0x00(%[src])                        \n\t"
1464             "mtc1       %[low32],   %[ftmp0]                            \n\t"
1465             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1466             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1467             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1468             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1469             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1470             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1471             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1472             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1473             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1474             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1475             "sdc1       %[ftmp6],   0x30(%[tmp])                        \n\t"
                 /* Output row 2 */
1476             "uld        %[low32],   0x00(%[src])                        \n\t"
1477             "mtc1       %[low32],   %[ftmp1]                            \n\t"
1478             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1479             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1480             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1481             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1482             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1483             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1484             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1485             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1486             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1487             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1488             "sdc1       %[ftmp6],   0x60(%[tmp])                        \n\t"
                 /* Output row 3 */
1489             "uld        %[low32],   0x00(%[src])                        \n\t"
1490             "mtc1       %[low32],   %[ftmp2]                            \n\t"
1491             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1492             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1493             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1494             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1495             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1496             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1497             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1498             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1499             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1500             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1501             "sdc1       %[ftmp6],   0x90(%[tmp])                        \n\t"
                 /* Output row 4 */
1502             "uld        %[low32],   0x00(%[src])                        \n\t"
1503             "mtc1       %[low32],   %[ftmp3]                            \n\t"
1504             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
1505             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1506             "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
1507             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1508             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1509             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1510             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1511             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
1512             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1513             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1514             "sdc1       %[ftmp6],   0xc0(%[tmp])                        \n\t"
                 /* Output row 5 */
1515             "uld        %[low32],   0x00(%[src])                        \n\t"
1516             "mtc1       %[low32],   %[ftmp4]                            \n\t"
1517             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
1518             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1519             "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
1520             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1521             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1522             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1523             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1524             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
1525             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1526             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1527             "sdc1       %[ftmp6],   0xf0(%[tmp])                        \n\t"
                 /* Output row 6 */
1528             "uld        %[low32],   0x00(%[src])                        \n\t"
1529             "mtc1       %[low32],   %[ftmp5]                            \n\t"
1530             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1531             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1532             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1533             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1534             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1535             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1536             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1537             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1538             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1539             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1540             "sdc1       %[ftmp6],   0x120(%[tmp])                       \n\t"
                 /* Output row 7 */
1541             "uld        %[low32],   0x00(%[src])                        \n\t"
1542             "mtc1       %[low32],   %[ftmp0]                            \n\t"
1543             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1544             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1545             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1546             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1547             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1548             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1549             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1550             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1551             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1552             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1553             "sdc1       %[ftmp6],   0x150(%[tmp])                       \n\t"
                 /* Eight rows are done; skip the remaining eight unless
                  * size == 16. */
1554             "bne        %[size],    0x10,           2f                  \n\t"
1555
                 /* Output row 8 */
1556             "uld        %[low32],   0x00(%[src])                        \n\t"
1557             "mtc1       %[low32],   %[ftmp1]                            \n\t"
1558             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1559             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1560             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1561             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1562             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1563             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1564             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1565             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1566             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1567             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1568             "sdc1       %[ftmp6],   0x180(%[tmp])                       \n\t"
                 /* Output row 9 */
1569             "uld        %[low32],   0x00(%[src])                        \n\t"
1570             "mtc1       %[low32],   %[ftmp2]                            \n\t"
1571             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1572             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1573             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1574             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1575             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1576             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1577             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1578             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1579             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1580             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1581             "sdc1       %[ftmp6],   0x1b0(%[tmp])                       \n\t"
                 /* Output row 10 */
1582             "uld        %[low32],   0x00(%[src])                        \n\t"
1583             "mtc1       %[low32],   %[ftmp3]                            \n\t"
1584             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
1585             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1586             "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
1587             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1588             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1589             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1590             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1591             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
1592             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1593             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1594             "sdc1       %[ftmp6],   0x1e0(%[tmp])                       \n\t"
                 /* Output row 11 */
1595             "uld        %[low32],   0x00(%[src])                        \n\t"
1596             "mtc1       %[low32],   %[ftmp4]                            \n\t"
1597             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
1598             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1599             "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
1600             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1601             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1602             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
1603             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1604             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
1605             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1606             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1607             "sdc1       %[ftmp6],   0x210(%[tmp])                       \n\t"
                 /* Output row 12 */
1608             "uld        %[low32],   0x00(%[src])                        \n\t"
1609             "mtc1       %[low32],   %[ftmp5]                            \n\t"
1610             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
1611             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1612             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
1613             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1614             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1615             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
1616             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1617             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1618             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1619             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1620             "sdc1       %[ftmp6],   0x240(%[tmp])                       \n\t"
                 /* Output row 13 */
1621             "uld        %[low32],   0x00(%[src])                        \n\t"
1622             "mtc1       %[low32],   %[ftmp0]                            \n\t"
1623             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
1624             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1625             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
1626             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1627             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
1628             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
1629             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1630             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1631             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1632             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1633             "sdc1       %[ftmp6],   0x270(%[tmp])                       \n\t"
                 /* Output row 14 */
1634             "uld        %[low32],   0x00(%[src])                        \n\t"
1635             "mtc1       %[low32],   %[ftmp1]                            \n\t"
1636             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
1637             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1638             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
1639             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1640             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1641             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
1642             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1643             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
1644             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1645             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
1646             "sdc1       %[ftmp6],   0x2a0(%[tmp])                       \n\t"
                 /* Output row 15 */
1647             "uld        %[low32],   0x00(%[src])                        \n\t"
1648             "mtc1       %[low32],   %[ftmp2]                            \n\t"
1649             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
1650             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
1651             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
1652             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
1653             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
1654             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
1655             "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
1656             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
1657             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
1658             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
1659             "sdc1       %[ftmp6],   0x2d0(%[tmp])                       \n\t"
1660             "2:                                                         \n\t"
                 /* ftmp8 and ftmp9 are listed as outputs but are not
                  * referenced by the asm text above. tmp is only an input
                  * operand; the stores through it are covered by the "memory"
                  * clobber. */
1661             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1662               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1663               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1664               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1665               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1666               [ftmp10]"=&f"(ftmp[10]),
1667               [tmp0]"=&r"(tmp0),
1668               [src]"+&r"(src),
1669               [low32]"=&r"(low32)
1670             : [tmp]"r"(tmp),                [size]"r"(size),
1671               [srcStride]"r"((mips_reg)srcStride),
1672               [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
1673             : "memory"
1674         );
1675
             /* Next strip: four int16_t columns further in tmp; src was moved
              * down size + 5 rows by the asm, so rewind that and step four
              * pixels to the right. */
1676         tmp += 4;
1677         src += 4 - (size + 5) * srcStride;
1678     }
1679 }
1680
/*
 * Second (horizontal) pass of the 8x8 / 16x16 "hv" lowpass.
 *
 * Reads the 16-bit intermediates in tmp (rows 0x30 bytes apart) and writes
 * eight output bytes per row to dst. With
 *     P = tmp[x] + tmp[x+5],  Q = tmp[x+1] + tmp[x+4],  R = tmp[x+2] + tmp[x+3]
 * each output is computed as
 *     ((((P - Q) >> 2) - Q + R) >> 2) + R) >> 6
 * and then packed to bytes with packushb.
 *
 * The outer do/while runs (size >> 4) + 1 times, i.e. once for size 8 and
 * twice for size 16, handling one 8-column strip of `size` rows per pass.
 *
 * dst:       destination pixels
 * tmp:       intermediate buffer (read only by this function)
 * dstStride: byte distance between consecutive destination rows
 * tmpStride: not used in this function; the row pitch of tmp is hard-coded
 *            (0x30 bytes in the asm, 24 int16_t in the C rewind below)
 * size:      block size; determines the row count and the strip count
 */
1681 static void put_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
1682         int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size)
1683 {
1684     int w = size >> 4;
1685     double ftmp[10];
1686     uint64_t tmp0;
1687
1688     do {
1689         int h = size;
1690
1691         __asm__ volatile (
                 /* Shift counts: ftmp8 = 2, ftmp9 = 6. */
1692             "dli        %[tmp0],    0x02                                \n\t"
1693             "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
1694             "dli        %[tmp0],    0x06                                \n\t"
1695             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
1696             "1:                                                         \n\t"
                 /* Aligned loads at element offsets 0, 4, 8 and gsldlc1/gsldrc1
                  * pairs (unaligned 8-byte loads) at element offsets 1, 5, 9. */
1697             "ldc1       %[ftmp0],   0x00(%[tmp])                        \n\t"
1698             "ldc1       %[ftmp3],   0x08(%[tmp])                        \n\t"
1699             "ldc1       %[ftmp6],   0x10(%[tmp])                        \n\t"
1700             "gsldlc1    %[ftmp1],   0x09(%[tmp])                        \n\t"
1701             "gsldrc1    %[ftmp1],   0x02(%[tmp])                        \n\t"
1702             "gsldlc1    %[ftmp4],   0x11(%[tmp])                        \n\t"
1703             "gsldrc1    %[ftmp4],   0x0a(%[tmp])                        \n\t"
1704             "gsldlc1    %[ftmp5],   0x19(%[tmp])                        \n\t"
1705             "gsldrc1    %[ftmp5],   0x12(%[tmp])                        \n\t"
                 /* ftmp0/ftmp1 = P/Q for outputs 0..3,
                  * ftmp3/ftmp4 = P/Q for outputs 4..7. */
1706             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
1707             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
1708             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
1709             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t"
                 /* Element offsets 2, 3, 6, 7 give R: ftmp2 for outputs 0..3,
                  * ftmp5 for outputs 4..7. */
1710             "gsldlc1    %[ftmp2],   0x0b(%[tmp])                        \n\t"
1711             "gsldrc1    %[ftmp2],   0x04(%[tmp])                        \n\t"
1712             "gsldlc1    %[ftmp6],   0x0d(%[tmp])                        \n\t"
1713             "gsldrc1    %[ftmp6],   0x06(%[tmp])                        \n\t"
1714             "gsldlc1    %[ftmp5],   0x13(%[tmp])                        \n\t"
1715             "gsldrc1    %[ftmp5],   0x0c(%[tmp])                        \n\t"
1716             "gsldlc1    %[ftmp7],   0x15(%[tmp])                        \n\t"
1717             "gsldrc1    %[ftmp7],   0x0e(%[tmp])                        \n\t"
1718             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
1719             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
                 /* Staged evaluation: (P - Q) >> 2, minus Q, plus R, >> 2,
                  * plus R, >> 6, performed on both halves in parallel. */
1720             "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
1721             "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
1722             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp8]            \n\t"
1723             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
1724             "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
1725             "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
1726             "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
1727             "paddsh     %[ftmp3] ,  %[ftmp3],       %[ftmp5]            \n\t"
1728             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp8]            \n\t"
1729             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
1730             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
1731             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
1732             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
1733             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
                 /* Pack both halves to eight bytes and store them with an
                  * unaligned-store pair; then advance tmp by one row (0x30
                  * bytes) and dst by dstStride. */
1734             "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
1735             "addi       %[h],       %[h],           -0x01               \n\t"
1736             "gssdlc1    %[ftmp0],   0x07(%[dst])                        \n\t"
1737             "gssdrc1    %[ftmp0],   0x00(%[dst])                        \n\t"
1738             PTR_ADDIU  "%[tmp],     %[tmp],         0x30                \n\t"
1739             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
1740             "bnez       %[h],       1b                                  \n\t"
1741             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1742               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1743               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1744               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1745               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1746               [tmp0]"=&r"(tmp0),
1747               [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
1748               [h]"+&r"(h)
1749             : [dstStride]"r"((mips_reg)dstStride)
1750             : "memory"
1751         );
1752
             /* The asm moved tmp down by size rows of 24 int16_t (0x30 bytes)
              * and dst down by size rows; rewind both and step eight columns
              * to the right for the next strip. */
1753         tmp += 8 - size * 24;
1754         dst += 8 - size * dstStride;
1755     } while (w--);
1756 }
1757
/*
 * Complete "hv" lowpass for a block of the given size: runs the hv1 pass
 * (src -> tmp) followed by the hv2 pass (tmp -> dst), forwarding the strides
 * and size unchanged.
 *
 * dst:  destination pixels
 * tmp:  caller-provided buffer for the 16-bit intermediates
 * src:  source pixels
 * size: forwarded to both passes (the visible callers pass 8 or 16)
 */
1758 static void put_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
1759         const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
1760         ptrdiff_t srcStride, int size)
1761 {
1762     put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src, tmpStride, srcStride, size);
1763     put_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp, dstStride, tmpStride, size);
1764 }
1765
/*
 * Size-8 entry point of the "hv" lowpass: forwards all arguments to
 * put_h264_qpel8or16_hv_lowpass_mmi() with size fixed to 8.
 */
1766 static void put_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
1767         const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
1768         ptrdiff_t srcStride)
1769 {
1770     put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
1771             srcStride, 8);
1772 }
1773
/*
 * Size-16 entry point of the "hv" lowpass: forwards all arguments to
 * put_h264_qpel8or16_hv_lowpass_mmi() with size fixed to 16.
 */
1774 static void put_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
1775         const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
1776         ptrdiff_t srcStride)
1777 {
1778     put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride,
1779             srcStride, 16);
1780 }
1781
/**
 * Horizontal 6-tap low-pass of 8 rows of 8 pixels, averaged with a second
 * source and written to dst.
 *
 * For every output pixel x of a row the asm evaluates, in 16-bit lanes,
 *   (20 * (src[x]   + src[x+1])
 *    - 5 * (src[x-1] + src[x+2])
 *    +     (src[x-2] + src[x+3]) + 16) >> 5
 * packs the eight results to unsigned bytes (packushb), averages them
 * byte-wise (pavgb) with 8 bytes loaded from src2 and stores 8 bytes to dst.
 *
 * @param dst        destination; one sdc1 (8-byte store) per row, then
 *                   advanced by dstStride
 * @param src        source row pointer; loads reach from src - 2 up to
 *                   src + 14 because the tap at src + 7 is fetched with an
 *                   8-byte uld. Advanced by dstStride (there is no separate
 *                   source stride argument)
 * @param src2       second source, 8 bytes per row, advanced by src2Stride
 * @param dstStride  byte step for both dst and src
 * @param src2Stride byte step for src2
 *
 * NOTE(review): the two outer taps only need 4 bytes each but are loaded
 * with uld (8 bytes) before mtc1 keeps the low word - confirm the callers'
 * buffers tolerate the extra read.
 * NOTE(review): the row counter h is an int but is decremented with
 * PTR_ADDU and an immediate operand; other loops in this file use
 * addi/daddi - confirm this is intentional.
 */
1782 static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
1783         const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
1784 {
     /* h counts the 8 rows; ftmp[] only provides storage for the asm operands */
1785     int h = 8;
1786     double ftmp[9];
1787     uint64_t tmp[1];
1788     uint64_t low32;
1789
1790     __asm__ volatile (
             /* constants: ftmp7 = 2 (shift for *4), ftmp8 = 5 (final shift), ftmp0 = 0 */
1791         "dli        %[tmp0],    0x02                                    \n\t"
1792         "mtc1       %[tmp0],    %[ftmp7]                                \n\t"
1793         "dli        %[tmp0],    0x05                                    \n\t"
1794         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1795         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
1796         "1:                                                             \n\t"
             /* centre taps: ftmp1 (lo) / ftmp2 (hi) = 4 * (src[x] + src[x+1]) */
1797         "gsldlc1    %[ftmp1],   0x07(%[src])                            \n\t"
1798         "gsldrc1    %[ftmp1],   0x00(%[src])                            \n\t"
1799         "gsldlc1    %[ftmp3],   0x08(%[src])                            \n\t"
1800         "gsldrc1    %[ftmp3],   0x01(%[src])                            \n\t"
1801         "punpckhbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
1802         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1803         "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
1804         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1805         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
1806         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
1807         "psllh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
1808         "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
             /* inner taps: ftmp3 (lo) / ftmp6 (hi) = src[x-1] + src[x+2];
              * ftmp4 = src[3..6] and ftmp5 = src[2..5] stay live for the outer taps */
1809         "gsldlc1    %[ftmp3],   0x06(%[src])                            \n\t"
1810         "gsldrc1    %[ftmp3],   -0x01(%[src])                           \n\t"
1811         "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
1812         "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
1813         "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
1814         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1815         "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
1816         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1817         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
1818         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
             /* (4*(c+d) - (b+e)) * 5  ==  20*(c+d) - 5*(b+e) */
1819         "psubh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
1820         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
1821         "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
1822         "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
             /* outer taps: src[-2..1] + src[3..6] for x = 0..3 (ftmp3),
              *             src[2..5]  + src[7..10] for x = 4..7 (ftmp5) */
1823         "uld        %[low32],   -0x02(%[src])                           \n\t"
1824         "mtc1       %[low32],   %[ftmp3]                                \n\t"
1825         "uld        %[low32],   0x07(%[src])                            \n\t"
1826         "mtc1       %[low32],   %[ftmp6]                                \n\t"
1827         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1828         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
1829         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
1830         "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
             /* add the rounding constant 16, sum all taps, shift right by 5 */
1831         "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
1832         "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]             \n\t"
1833         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
1834         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
1835         "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
1836         "psrah      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
             /* pack to bytes, average with the src2 row, store; pointer and
              * counter updates are interleaved with the data instructions */
1837         "gsldlc1    %[ftmp5],   0x07(%[src2])                           \n\t"
1838         "gsldrc1    %[ftmp5],   0x00(%[src2])                           \n\t"
1839         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
1840         PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t"
1841         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
1842         PTR_ADDU   "%[h],       %[h],           -0x01                   \n\t"
1843         "sdc1       %[ftmp1],   0x00(%[dst])                            \n\t"
1844         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
1845         PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
1846         "bgtz       %[h],       1b                                      \n\t"
1847         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1848           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1849           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1850           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1851           [ftmp8]"=&f"(ftmp[8]),
1852           [tmp0]"=&r"(tmp[0]),
1853           [src]"+&r"(src),                  [dst]"+&r"(dst),
1854           [src2]"+&r"(src2),                [h]"+&r"(h),
1855           [low32]"=&r"(low32)
1856         : [src2Stride]"r"((mips_reg)src2Stride),
1857           [dstStride]"r"((mips_reg)dstStride),
1858           [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
1859         : "memory"
1860     );
1861 }
1862
/**
 * Turn 16-bit intermediates into pixels (>> 5, packed to unsigned bytes),
 * average them with an 8-bit source and store the result, 8 pixels wide.
 *
 * Every loop iteration handles two rows: the 8 int16 values at src16 and
 * the 8 values 0x30 bytes further on are shifted and packed, averaged
 * (pavgb) with the 8 bytes at src8 and at src8 + src8Stride, and stored to
 * dst and dst + dstStride.
 *
 * @param dst        destination, written with sdc1 / gssdxc1 (8 bytes each)
 * @param src16      16-bit input; consecutive rows are 0x30 bytes apart and
 *                   the pointer moves on by 48 elements (two such rows) per
 *                   iteration
 * @param src8       8-bit input that is averaged in, read with ldc1 / gsldxc1
 * @param dstStride  byte step between dst rows
 * @param src8Stride byte step between src8 rows
 * @param h          row count; the loop only stops when "h -= 2" yields 0,
 *                   so h has to be a positive even number
 */
1863 static void put_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
1864         const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h)
1865 {
1866     double ftmp[7];
1867     uint64_t tmp0;
1868
1869     do {
1870         __asm__ volatile (
                 /* ftmp6 = 5 (shift count); ftmp0/ftmp1 = row 0, ftmp2/ftmp3 = row 1 */
1871             "dli        %[tmp0],    0x05                                \n\t"
1872             "gsldlc1    %[ftmp0],   0x07(%[src16])                      \n\t"
1873             "gsldrc1    %[ftmp0],   0x00(%[src16])                      \n\t"
1874             "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
1875             "gsldlc1    %[ftmp1],   0x0f(%[src16])                      \n\t"
1876             "gsldrc1    %[ftmp1],   0x08(%[src16])                      \n\t"
1877             "gsldlc1    %[ftmp2],   0x37(%[src16])                      \n\t"
1878             "gsldrc1    %[ftmp2],   0x30(%[src16])                      \n\t"
1879             "gsldlc1    %[ftmp3],   0x3f(%[src16])                      \n\t"
1880             "gsldrc1    %[ftmp3],   0x38(%[src16])                      \n\t"
                 /* arithmetic >> 5 on all 16 values, then pack each row to 8 bytes */
1881             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
1882             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
1883             "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
1884             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
1885             "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
1886             "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
                 /* average with the two src8 rows and store both result rows */
1887             "ldc1       %[ftmp5],   0x00(%[src8])                       \n\t"
1888             "gsldxc1    %[ftmp4],   0x00(%[src8],   %[src8Stride])      \n\t"
1889             "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
1890             "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
1891             "sdc1       %[ftmp0],   0x00(%[dst])                        \n\t"
1892             "gssdxc1    %[ftmp2],   0x00(%[dst],    %[dstStride])       \n\t"
1893             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1894               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1895               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1896               [ftmp6]"=&f"(ftmp[6]),
1897               [tmp0]"=&r"(tmp0)
1898             : [src8]"r"(src8),              [src16]"r"(src16),
1899               [dst]"r"(dst),
1900               [src8Stride]"r"((mips_reg)src8Stride),
1901               [dstStride]"r"((mips_reg)dstStride)
1902             : "memory"
1903         );
1904
             /* pointers are inputs only to the asm, so advance them here by two rows */
1905         src8  += 2 * src8Stride;
1906         src16 += 48;
1907         dst   += 2 * dstStride;
1908     } while (h -= 2);
1909 }
1910
/**
 * 16-wide, 16-row version of put_h264_qpel8_h_lowpass_l2_mmi().
 *
 * The area is covered by four calls to the 8-wide routine: the left and
 * right halves of the upper eight rows, followed by the left and right
 * halves of the lower eight rows. The row offset of src is derived from
 * dstStride and that of src2 from src2Stride.
 */
static void put_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
{
    int row;

    for (row = 0; row < 16; row += 8) {
        uint8_t       *out = dst  + row * dstStride;
        const uint8_t *in  = src  + row * dstStride;
        const uint8_t *in2 = src2 + row * src2Stride;

        /* left half, then right half of this band of eight rows */
        put_h264_qpel8_h_lowpass_l2_mmi(out, in, in2, dstStride, src2Stride);
        put_h264_qpel8_h_lowpass_l2_mmi(out + 8, in + 8, in2 + 8, dstStride,
                                        src2Stride);
    }
}
1926
/**
 * 16-wide version of put_pixels8_l2_shift5_mmi().
 *
 * Runs the 8-wide routine twice over the same h rows, first at column 0 and
 * then at column 8 of dst, src16 and src8.
 */
static void put_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        put_pixels8_l2_shift5_mmi(dst + col, src16 + col, src8 + col,
                                  dstStride, src8Stride, h);
}
1934
/**
 * 4x4 horizontal + vertical low-pass whose result is merged into dst through
 * the op2_avg() macro.
 *
 * Pass 1 (asm): for 9 source rows, beginning two rows above src, computes
 *   20 * (src[x] + src[x+1]) - 5 * (src[x-1] + src[x+2]) + (src[x-2] + src[x+3])
 * for x = 0..3 with saturating 16-bit adds/subtracts (paddsh/psubsh) and
 * stores the four int16 results of each row to the local buffer _tmp.
 * Pass 2 (C): for each of the 4 columns applies the same 20 / -5 / 1 weights
 * vertically to those intermediates and passes every sum, together with the
 * destination pixel, to op2_avg().
 *
 * INIT_CLIP and op2_avg are macros defined outside this block; scaling and
 * clipping of the final sum presumably happen inside op2_avg - confirm
 * against its definition.
 *
 * @param dst       destination 4x4 block (read and written via op2_avg)
 * @param src       source; rows src - 2*srcStride .. src + 6*srcStride are read
 * @param dstStride byte step between dst rows
 * @param srcStride byte step between src rows
 *
 * NOTE(review): each 4-byte tap group is fetched with an 8-byte uld, so the
 * loads touch up to src + 10 in every row.
 * NOTE(review): the loop-local "tmp0" in pass 2 shadows the uint64_t tmp0
 * that serves as the asm row counter.
 */
1935 static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
1936         int dstStride, int srcStride)
1937 {
1938     INIT_CLIP
1939     int i;
         /* 9 rows x 4 int16 intermediates from the horizontal pass */
1940     int16_t _tmp[36];
1941     int16_t *tmp = _tmp;
1942     double ftmp[10];
1943     uint64_t tmp0;
1944     uint64_t low32;
1945
         /* the vertical filter needs two rows above the block */
1946     src -= 2*srcStride;
1947
1948     __asm__ volatile (
             /* ftmp0 = 0 for unpacking, tmp0 = 9 rows to process */
1949         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1950         "dli        %[tmp0],    0x09                                    \n\t"
1951         "1:                                                             \n\t"
             /* ftmp1..ftmp6 = low words of the loads at src-2 .. src+3 */
1952         "uld        %[low32],   -0x02(%[src])                           \n\t"
1953         "mtc1       %[low32],   %[ftmp1]                                \n\t"
1954         "uld        %[low32],   -0x01(%[src])                           \n\t"
1955         "mtc1       %[low32],   %[ftmp2]                                \n\t"
1956         "uld        %[low32],   0x00(%[src])                            \n\t"
1957         "mtc1       %[low32],   %[ftmp3]                                \n\t"
1958         "uld        %[low32],   0x01(%[src])                            \n\t"
1959         "mtc1       %[low32],   %[ftmp4]                                \n\t"
1960         "uld        %[low32],   0x02(%[src])                            \n\t"
1961         "mtc1       %[low32],   %[ftmp5]                                \n\t"
1962         "uld        %[low32],   0x03(%[src])                            \n\t"
1963         "mtc1       %[low32],   %[ftmp6]                                \n\t"
             /* widen the four bytes of each tap group to 16-bit lanes */
1964         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1965         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1966         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1967         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1968         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
1969         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
             /* ftmp7 = centre pair, ftmp8 = inner pair, ftmp9 = outer pair;
              * result in ftmp9 = 20*ftmp7 - 5*ftmp8 + ftmp9 */
1970         "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
1971         "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
1972         "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
1973         "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
1974         "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
1975         "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
1976         "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
             /* store 4 int16, then step tmp by 8 bytes and src by one row */
1977         "sdc1       %[ftmp9],   0x00(%[tmp])                            \n\t"
1978         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
1979         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
1980         PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t"
1981         "bnez       %[tmp0],    1b                                      \n\t"
1982         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1983           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1984           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1985           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1986           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1987           [tmp0]"=&r"(tmp0),
1988           [tmp]"+&r"(tmp),                  [src]"+&r"(src),
1989           [low32]"=&r"(low32)
1990         : [tmpStride]"r"(8),
1991           [srcStride]"r"((mips_reg)srcStride),
1992           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5)
1993         : "memory"
1994     );
1995
         /* the asm left tmp 36 elements past _tmp; go back to element 8, i.e.
          * the third stored row, so that tmp[-8] and tmp[-4] are the two rows
          * above it */
1996     tmp -= 28;
1997
         /* one column per iteration; vertically adjacent intermediates are
          * 4 elements apart */
1998     for (i=0; i<4; i++) {
1999         const int16_t tmpB= tmp[-8];
2000         const int16_t tmpA= tmp[-4];
2001         const int16_t tmp0= tmp[ 0];
2002         const int16_t tmp1= tmp[ 4];
2003         const int16_t tmp2= tmp[ 8];
2004         const int16_t tmp3= tmp[12];
2005         const int16_t tmp4= tmp[16];
2006         const int16_t tmp5= tmp[20];
2007         const int16_t tmp6= tmp[24];
2008         op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
2009         op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
2010         op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
2011         op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
2012         dst++;
2013         tmp++;
2014     }
2015 }
2016
/**
 * Second (horizontal) stage of the 8/16-wide hv low-pass, "avg" flavour:
 * filters rows of 16-bit intermediates and averages the result into dst.
 *
 * With t pointing at one row of int16 intermediates and, for output x,
 *   A = t[x]   + t[x+5]
 *   B = t[x+1] + t[x+4]
 *   C = t[x+2] + t[x+3]
 * the asm evaluates
 *   (((((A - B) >> 2) - B + C) >> 2) + C) >> 6
 * (the "+ C" of the middle step uses the saturating paddsh), packs eight
 * such values to unsigned bytes, averages them (pavgb) with the 8 bytes
 * already in dst and stores them back.
 *
 * The outer do/while runs (size >> 4) + 1 times, each pass covering an
 * 8-pixel-wide strip of "size" rows.
 *
 * @param dst       destination, read and written 8 bytes at a time with
 *                  ldc1/sdc1
 * @param tmp       intermediates; rows are 0x30 bytes (24 int16) apart and
 *                  t[0] .. t[12] of each row are read per strip
 * @param dstStride byte step between dst rows
 * @param tmpStride not referenced in this function; the row pitch of tmp is
 *                  hard-coded as 0x30 bytes
 * @param size      number of rows per strip (and selector for one or two
 *                  strips)
 */
2017 static void avg_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
2018         int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size)
2019 {
2020     int w = size >> 4;
2021     double ftmp[11];
2022     uint64_t tmp0;
2023
2024     do {
2025         int h = size;
2026         __asm__ volatile (
                 /* shift counts: ftmp9 = 2, ftmp10 = 6 */
2027             "dli        %[tmp0],    0x02                                \n\t"
2028             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
2029             "dli        %[tmp0],    0x06                                \n\t"
2030             "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
2031             "1:                                                         \n\t"
                 /* byte offsets 0x00/0x08/0x10 are t[0]/t[4]/t[8]; 0x02, 0x0a and
                  * 0x12 are the same vectors shifted by one element */
2032             "ldc1       %[ftmp0],   0x00(%[tmp])                        \n\t"
2033             "ldc1       %[ftmp3],   0x08(%[tmp])                        \n\t"
2034             "gsldlc1    %[ftmp1],   0x09(%[tmp])                        \n\t"
2035             "gsldrc1    %[ftmp1],   0x02(%[tmp])                        \n\t"
2036             "gsldlc1    %[ftmp4],   0x11(%[tmp])                        \n\t"
2037             "gsldrc1    %[ftmp4],   0x0a(%[tmp])                        \n\t"
2038             "ldc1       %[ftmp7],   0x10(%[tmp])                        \n\t"
2039             "gsldlc1    %[ftmp8],   0x19(%[tmp])                        \n\t"
2040             "gsldrc1    %[ftmp8],   0x12(%[tmp])                        \n\t"
                 /* ftmp0 = A, ftmp1 = B for x = 0..3; ftmp3 = A, ftmp4 = B for x = 4..7 */
2041             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
2042             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
2043             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
2044             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
                 /* ftmp2 = C for x = 0..3, ftmp5 = C for x = 4..7 */
2045             "gsldlc1    %[ftmp2],   0x0b(%[tmp])                        \n\t"
2046             "gsldrc1    %[ftmp2],   0x04(%[tmp])                        \n\t"
2047             "gsldlc1    %[ftmp5],   0x13(%[tmp])                        \n\t"
2048             "gsldrc1    %[ftmp5],   0x0c(%[tmp])                        \n\t"
2049             "gsldlc1    %[ftmp7],   0x0d(%[tmp])                        \n\t"
2050             "gsldrc1    %[ftmp7],   0x06(%[tmp])                        \n\t"
2051             "gsldlc1    %[ftmp8],   0x15(%[tmp])                        \n\t"
2052             "gsldrc1    %[ftmp8],   0x0e(%[tmp])                        \n\t"
2053             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
2054             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp8]            \n\t"
                 /* (((((A - B) >> 2) - B + C) >> 2) + C) >> 6 */
2055             "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
2056             "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
2057             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
2058             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
2059             "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
2060             "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
2061             "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
2062             "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
2063             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
2064             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
2065             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
2066             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
2067             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp10]           \n\t"
2068             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
                 /* pack, average with the existing dst row, store, next row */
2069             "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
2070             "ldc1       %[ftmp6],   0x00(%[dst])                        \n\t"
2071             "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
2072             "sdc1       %[ftmp0],   0x00(%[dst])                        \n\t"
2073             "addi       %[h],       %[h],           -0x01               \n\t"
2074             PTR_ADDI   "%[tmp],     %[tmp],         0x30                \n\t"
2075             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
2076             "bnez       %[h],       1b                                  \n\t"
2077             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2078               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2079               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2080               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2081               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2082               [ftmp10]"=&f"(ftmp[10]),
2083               [tmp0]"=&r"(tmp0),
2084               [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
2085               [h]"+&r"(h)
2086             : [dstStride]"r"((mips_reg)dstStride)
2087             : "memory"
2088         );
2089
             /* rewind the "size" rows the asm walked (24 int16 per tmp row) and
              * move 8 columns to the right for the next strip */
2090         tmp += 8 - size * 24;
2091         dst += 8 - size * dstStride;
2092     } while (w--);
2093 }
2094
/**
 * hv low-pass for 8- or 16-wide blocks, "avg" flavour, built from two
 * stages that communicate through the caller-supplied tmp buffer.
 *
 * Stage 1 is the shared put_h264_qpel8or16_hv1_lowpass_mmi() (defined
 * elsewhere; it receives tmp and src). Stage 2,
 * avg_h264_qpel8or16_hv2_lowpass_mmi(), filters tmp and averages the result
 * into dst.
 */
static void avg_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
        ptrdiff_t srcStride, int size)
{
    /* stage 1: src -> tmp */
    put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src,
                                       tmpStride, srcStride, size);

    /* stage 2: tmp -> dst */
    avg_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp,
                                       dstStride, tmpStride, size);
}
2102
/**
 * "avg" hv low-pass entry point for the 8-wide case.
 *
 * Passes its arguments straight to avg_h264_qpel8or16_hv_lowpass_mmi() and
 * supplies 8 as the size.
 */
static void avg_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
        ptrdiff_t srcStride)
{
    enum { BLOCK_SIZE = 8 };

    avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src,
                                      dstStride, tmpStride, srcStride,
                                      BLOCK_SIZE);
}
2110
/**
 * "avg" hv low-pass entry point for the 16-wide case.
 *
 * Passes its arguments straight to avg_h264_qpel8or16_hv_lowpass_mmi() and
 * supplies 16 as the size.
 */
static void avg_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
        ptrdiff_t srcStride)
{
    enum { BLOCK_SIZE = 16 };

    avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src,
                                      dstStride, tmpStride, srcStride,
                                      BLOCK_SIZE);
}
2118
/**
 * Horizontal 6-tap low-pass of 8 rows of 8 pixels, averaged first with a
 * second source and then with the pixels already present in dst.
 *
 * For every output pixel x of a row the asm evaluates, in 16-bit lanes,
 *   (20 * (src[x]   + src[x+1])
 *    - 5 * (src[x-1] + src[x+2])
 *    +     (src[x-2] + src[x+3]) + 16) >> 5
 * packs the eight results to unsigned bytes (packushb), averages them
 * (pavgb) with 8 bytes from src2, averages that with the 8 bytes read from
 * dst, and stores the outcome to dst.
 *
 * @param dst        destination; read with ldc1 and written with sdc1 once
 *                   per row, then advanced by dstStride
 * @param src        source row pointer; loads reach from src - 2 up to
 *                   src + 14 (the tap at src + 7 is fetched with an 8-byte
 *                   uld). Advanced by dstStride
 * @param src2       second source, 8 bytes per row, advanced by src2Stride
 * @param dstStride  byte step for both dst and src
 * @param src2Stride byte step for src2
 *
 * NOTE(review): the outer taps need 4 bytes each but are loaded with uld
 * (8 bytes) before mtc1 keeps the low word - confirm the over-read is safe
 * for all callers.
 */
2119 static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
2120         const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
2121 {
2122     double ftmp[10];
2123     uint64_t tmp[2];
2124     uint64_t low32;
2125
2126     __asm__ volatile (
             /* tmp0 = 8 (row counter); ftmp7 = 2 and ftmp8 = 5 (shift counts); ftmp0 = 0 */
2127         "dli        %[tmp1],    0x02                                    \n\t"
2128         "ori        %[tmp0],    $0,             0x8                     \n\t"
2129         "mtc1       %[tmp1],    %[ftmp7]                                \n\t"
2130         "dli        %[tmp1],    0x05                                    \n\t"
2131         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
2132         "mtc1       %[tmp1],    %[ftmp8]                                \n\t"
2133         "1:                                                             \n\t"
             /* centre taps: ftmp1 (lo) / ftmp3 (hi) = 4 * (src[x] + src[x+1]) */
2134         "gsldlc1    %[ftmp1],   0x07(%[src])                            \n\t"
2135         "gsldrc1    %[ftmp1],   0x00(%[src])                            \n\t"
2136         "gsldlc1    %[ftmp2],   0x08(%[src])                            \n\t"
2137         "gsldrc1    %[ftmp2],   0x01(%[src])                            \n\t"
2138         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
2139         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
2140         "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
2141         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
2142         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
2143         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
2144         "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
2145         "psllh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
             /* inner taps: ftmp2 (lo) / ftmp6 (hi) = src[x-1] + src[x+2];
              * ftmp4 = src[3..6] and ftmp5 = src[2..5] stay live for the outer taps */
2146         "gsldlc1    %[ftmp2],   0x06(%[src])                            \n\t"
2147         "gsldrc1    %[ftmp2],   -0x01(%[src])                           \n\t"
2148         "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
2149         "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
2150         "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
2151         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
2152         "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
2153         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
2154         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
2155         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
             /* (4*(c+d) - (b+e)) * 5  ==  20*(c+d) - 5*(b+e) */
2156         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
2157         "psubh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
2158         "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
2159         "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
             /* outer taps: src[-2..1] + src[3..6] for x = 0..3 (ftmp2),
              *             src[2..5]  + src[7..10] for x = 4..7 (ftmp5) */
2160         "uld        %[low32],   -0x02(%[src])                           \n\t"
2161         "mtc1       %[low32],   %[ftmp2]                                \n\t"
2162         "uld        %[low32],   0x07(%[src])                            \n\t"
2163         "mtc1       %[low32],   %[ftmp6]                                \n\t"
2164         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
2165         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
2166         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
2167         "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
             /* add the rounding constant 16, sum all taps, shift right by 5 */
2168         "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
2169         "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]             \n\t"
2170         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
2171         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
2172         "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
2173         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
             /* pack to bytes, average with src2, then with the current dst row */
2174         "gsldlc1    %[ftmp5],   0x07(%[src2])                           \n\t"
2175         "gsldrc1    %[ftmp5],   0x00(%[src2])                           \n\t"
2176         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
2177         "ldc1       %[ftmp9],   0x00(%[dst])                            \n\t"
2178         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
2179         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
2180         PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t"
2181         "sdc1       %[ftmp1],   0x00(%[dst])                            \n\t"
2182         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
2183         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
2184         PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
2185         "bgtz       %[tmp0],    1b                                      \n\t"
2186         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2187           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2188           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2189           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2190           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
2191           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
2192           [dst]"+&r"(dst),                  [src]"+&r"(src),
2193           [src2]"+&r"(src2),
2194           [low32]"=&r"(low32)
2195         : [dstStride]"r"((mips_reg)dstStride),
2196           [src2Stride]"r"((mips_reg)src2Stride),
2197           [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
2198         : "memory"
2199     );
2200 }
2201
/**
 * 16-wide, 16-row version of avg_h264_qpel8_h_lowpass_l2_mmi().
 *
 * The area is covered by four calls to the 8-wide routine: the left and
 * right halves of the upper eight rows, followed by the left and right
 * halves of the lower eight rows. The row offset of src is derived from
 * dstStride and that of src2 from src2Stride.
 */
static void avg_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
{
    int row;

    for (row = 0; row < 16; row += 8) {
        uint8_t       *out = dst  + row * dstStride;
        const uint8_t *in  = src  + row * dstStride;
        const uint8_t *in2 = src2 + row * src2Stride;

        /* left half, then right half of this band of eight rows */
        avg_h264_qpel8_h_lowpass_l2_mmi(out, in, in2, dstStride, src2Stride);
        avg_h264_qpel8_h_lowpass_l2_mmi(out + 8, in + 8, in2 + 8, dstStride,
                                        src2Stride);
    }
}
2217
/**
 * Turn 16-bit intermediates into pixels (>> 5, packed to unsigned bytes),
 * average them with an 8-bit source and then with the existing contents of
 * dst, 8 pixels wide.
 *
 * Every loop iteration handles two rows: the 8 int16 values at src16 and
 * the 8 values 0x30 bytes further on are shifted and packed, averaged
 * (pavgb) with the 8 bytes at src8 and at src8 + src8Stride, averaged again
 * with the bytes read from dst and dst + dstStride, and stored back there.
 *
 * @param dst        destination, read and written 8 bytes per row
 * @param src16      16-bit input; consecutive rows are 0x30 bytes apart and
 *                   the pointer moves on by 48 elements (two such rows) per
 *                   iteration
 * @param src8       8-bit input that is averaged in
 * @param dstStride  byte step between dst rows
 * @param src8Stride byte step between src8 rows
 * @param b          row count (called h in put_pixels8_l2_shift5_mmi); the
 *                   loop only stops when "b -= 2" yields 0, so b has to be
 *                   a positive even number
 */
2218 static void avg_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
2219         const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b)
2220 {
2221     double ftmp[8];
2222     uint64_t tmp0;
2223
2224     do {
2225         __asm__ volatile (
                 /* ftmp6 = 5 (shift count); ftmp0/ftmp1 = row 0, ftmp2/ftmp3 = row 1 */
2226             "dli        %[tmp0],    0x05                                \n\t"
2227             "gsldlc1    %[ftmp0],   0x07(%[src16])                      \n\t"
2228             "gsldrc1    %[ftmp0],   0x00(%[src16])                      \n\t"
2229             "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
2230             "gsldlc1    %[ftmp1],   0x0f(%[src16])                      \n\t"
2231             "gsldrc1    %[ftmp1],   0x08(%[src16])                      \n\t"
2232             "gsldlc1    %[ftmp2],   0x37(%[src16])                      \n\t"
2233             "gsldrc1    %[ftmp2],   0x30(%[src16])                      \n\t"
2234             "gsldlc1    %[ftmp3],   0x3f(%[src16])                      \n\t"
2235             "gsldrc1    %[ftmp3],   0x38(%[src16])                      \n\t"
                 /* arithmetic >> 5, pack each row to 8 bytes, average with src8 */
2236             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
2237             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
2238             "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
2239             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
2240             "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
2241             "ldc1       %[ftmp4],   0x00(%[src8])                       \n\t"
2242             "gsldxc1    %[ftmp5],   0x00(%[src8],   %[src8Stride])      \n\t"
2243             "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
2244             "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
2245             "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
                 /* second average against the current dst rows (ftmp7 is reused
                  * for both), then store */
2246             "ldc1       %[ftmp7],   0x00(%[dst])                        \n\t"
2247             "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
2248             "sdc1       %[ftmp0],   0x00(%[dst])                        \n\t"
2249             "gsldxc1    %[ftmp7],   0x00(%[dst],    %[dstStride])       \n\t"
2250             "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
2251             "gssdxc1    %[ftmp2],   0x00(%[dst],    %[dstStride])       \n\t"
2252             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2253               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2254               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2255               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2256               [tmp0]"=&r"(tmp0)
2257             : [src8]"r"(src8),              [src16]"r"(src16),
2258               [dst]"r"(dst),
2259               [src8Stride]"r"((mips_reg)src8Stride),
2260               [dstStride]"r"((mips_reg)dstStride)
2261             : "memory"
2262         );
2263
             /* pointers are inputs only to the asm, so advance them here by two rows */
2264         src8  += 2 * src8Stride;
2265         src16 += 48;
2266         dst   += 2 * dstStride;
2267     } while (b -= 2);
2268 }
2269
/**
 * 16-wide version of avg_pixels8_l2_shift5_mmi().
 *
 * Runs the 8-wide routine twice over the same b rows, first at column 0 and
 * then at column 8 of dst, src16 and src8.
 */
static void avg_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b)
{
    int col;

    for (col = 0; col < 16; col += 8)
        avg_pixels8_l2_shift5_mmi(dst + col, src16 + col, src8 + col,
                                  dstStride, src8Stride, b);
}
2277
2278 //DEF_H264_MC_MMI(put_, 4)
/**
 * put, 4-wide block, position mc00.
 *
 * No filtering is done here: the call is handed to ff_put_pixels4_8_mmi()
 * with 4 as its last argument (presumably the row count), using the same
 * stride for source and destination.
 */
void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK_H = 4 };

    ff_put_pixels4_8_mmi(dst, src, stride, BLK_H);
}
2284
/**
 * put, 4-wide block, position mc10.
 *
 * The horizontal low-pass of src is written to a 16-byte scratch block
 * (helper called with 4 and stride); ff_put_pixels4_l2_8_mmi() then
 * combines src with that scratch block into dst.
 */
void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK_W = 4 };
    uint8_t hpel[BLK_W * BLK_W];

    put_h264_qpel4_h_lowpass_mmi(hpel, src, BLK_W, stride);
    ff_put_pixels4_l2_8_mmi(dst, src, hpel,
                            stride, stride, BLK_W, BLK_W);
}
2292
/**
 * put, 4-wide block, position mc20.
 *
 * The horizontal low-pass helper writes directly to dst; stride is passed
 * for both of its stride arguments.
 */
void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    const ptrdiff_t dst_stride = stride;
    const ptrdiff_t src_stride = stride;

    put_h264_qpel4_h_lowpass_mmi(dst, src, dst_stride, src_stride);
}
2298
/**
 * put, 4-wide block, position mc30.
 *
 * Same as mc10, except that the pixels combined with the horizontally
 * low-passed scratch block start one byte to the right (src + 1).
 */
void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK_W = 4 };
    uint8_t hpel[BLK_W * BLK_W];
    const uint8_t *right = src + 1;

    put_h264_qpel4_h_lowpass_mmi(hpel, src, BLK_W, stride);
    ff_put_pixels4_l2_8_mmi(dst, right, hpel,
                            stride, stride, BLK_W, BLK_W);
}
2306
/**
 * put, 4-wide block, position mc01.
 *
 * Nine source rows, starting two rows above src, are copied into a packed
 * 36-byte buffer (4 bytes per row). The vertical low-pass runs on that
 * buffer from its third row onward and writes a 16-byte scratch block,
 * which ff_put_pixels4_l2_8_mmi() combines with the packed rows at the same
 * position into dst.
 */
void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK_W = 4, SRC_ROWS = BLK_W + 5 };
    uint8_t packed[BLK_W * SRC_ROWS];
    uint8_t vpel[BLK_W * BLK_W];
    uint8_t *const centre = &packed[2 * BLK_W];

    copy_block4_mmi(packed, src - 2 * stride, BLK_W, stride, SRC_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vpel, centre, BLK_W, BLK_W);
    ff_put_pixels4_l2_8_mmi(dst, centre, vpel,
                            stride, BLK_W, BLK_W, BLK_W);
}
2317
/**
 * put, 4-wide block, position mc02.
 *
 * Nine source rows, starting two rows above src, are copied into a packed
 * 36-byte buffer (4 bytes per row); the vertical low-pass then reads that
 * buffer from its third row onward and writes straight to dst.
 */
void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK_W = 4, SRC_ROWS = BLK_W + 5 };
    uint8_t packed[BLK_W * SRC_ROWS];
    uint8_t *const centre = &packed[2 * BLK_W];

    copy_block4_mmi(packed, src - 2 * stride, BLK_W, stride, SRC_ROWS);
    put_h264_qpel4_v_lowpass_mmi(dst, centre, stride, BLK_W);
}
2326
/**
 * put, 4-wide block, position mc03.
 *
 * Same as mc01, except that the packed rows combined with the vertically
 * low-passed scratch block start one packed row (4 bytes) further down.
 */
void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK_W = 4, SRC_ROWS = BLK_W + 5 };
    uint8_t packed[BLK_W * SRC_ROWS];
    uint8_t vpel[BLK_W * BLK_W];
    uint8_t *const centre = &packed[2 * BLK_W];

    copy_block4_mmi(packed, src - 2 * stride, BLK_W, stride, SRC_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vpel, centre, BLK_W, BLK_W);
    ff_put_pixels4_l2_8_mmi(dst, centre + BLK_W, vpel,
                            stride, BLK_W, BLK_W, BLK_W);
}
2337
/**
 * put, 4-wide block, position mc11.
 *
 * Two 16-byte scratch blocks are produced and then combined into dst by
 * ff_put_pixels4_l2_8_mmi():
 *  - the horizontal low-pass of src;
 *  - the vertical low-pass of nine packed rows copied from two rows above
 *    src.
 */
void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK_W = 4, SRC_ROWS = BLK_W + 5 };
    uint8_t packed[BLK_W * SRC_ROWS];
    uint8_t hpel[BLK_W * BLK_W];
    uint8_t vpel[BLK_W * BLK_W];
    uint8_t *const centre = &packed[2 * BLK_W];

    put_h264_qpel4_h_lowpass_mmi(hpel, src, BLK_W, stride);
    copy_block4_mmi(packed, src - 2 * stride, BLK_W, stride, SRC_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vpel, centre, BLK_W, BLK_W);
    ff_put_pixels4_l2_8_mmi(dst, hpel, vpel,
                            stride, BLK_W, BLK_W, BLK_W);
}
2350
/**
 * put, 4-wide block, position mc31.
 *
 * Same as mc11, except that the nine rows fed to the vertical low-pass are
 * copied from one byte to the right (src - 2*stride + 1).
 */
void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    enum { BLK_W = 4, SRC_ROWS = BLK_W + 5 };
    uint8_t packed[BLK_W * SRC_ROWS];
    uint8_t hpel[BLK_W * BLK_W];
    uint8_t vpel[BLK_W * BLK_W];
    uint8_t *const centre = &packed[2 * BLK_W];

    put_h264_qpel4_h_lowpass_mmi(hpel, src, BLK_W, stride);
    copy_block4_mmi(packed, src - 2 * stride + 1, BLK_W, stride, SRC_ROWS);
    put_h264_qpel4_v_lowpass_mmi(vpel, centre, BLK_W, BLK_W);
    ff_put_pixels4_l2_8_mmi(dst, hpel, vpel,
                            stride, BLK_W, BLK_W, BLK_W);
}
2363
/*
 * Builds two 16-byte intermediates: one from put_h264_qpel4_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel4_v_lowpass_mmi applied
 * to a packed copy of the 9 rows that start two rows above src. Both are
 * passed to ff_put_pixels4_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block4_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels4_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2376
/*
 * Builds two 16-byte intermediates: one from put_h264_qpel4_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel4_v_lowpass_mmi applied
 * to a packed copy of 9 rows taken from two rows above src, shifted one
 * byte to the right. Both are passed to ff_put_pixels4_l2_8_mmi, which
 * produces dst.
 */
void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block4_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels4_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2389
/*
 * Thin wrapper: calls put_h264_qpel4_hv_lowpass_mmi on dst and src,
 * supplying stride for both of its trailing arguments.
 */
void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const ptrdiff_t pitch = stride;

    put_h264_qpel4_hv_lowpass_mmi(dst, src, pitch, pitch);
}
2395
/*
 * Fills one 16-byte block with put_h264_qpel4_h_lowpass_mmi on src and a
 * second with put_h264_qpel4_hv_lowpass_mmi on src, then passes both to
 * ff_put_pixels4_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4 };
    uint8_t h_half[W * W];
    uint8_t hv_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src, W, stride);
    put_h264_qpel4_hv_lowpass_mmi(hv_half, src, W, stride);
    ff_put_pixels4_l2_8_mmi(dst, h_half, hv_half, stride, W, W, W);
}
2405
/*
 * Fills one 16-byte block with put_h264_qpel4_h_lowpass_mmi applied one row
 * below src and a second with put_h264_qpel4_hv_lowpass_mmi on src, then
 * passes both to ff_put_pixels4_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4 };
    uint8_t h_half[W * W];
    uint8_t hv_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src + stride, W, stride);
    put_h264_qpel4_hv_lowpass_mmi(hv_half, src, W, stride);
    ff_put_pixels4_l2_8_mmi(dst, h_half, hv_half, stride, W, W, W);
}
2415
/*
 * Fills one 16-byte block with put_h264_qpel4_v_lowpass_mmi, fed from a
 * packed copy of the 9 rows starting two rows above src, and a second with
 * put_h264_qpel4_hv_lowpass_mmi on src. Both go to ff_put_pixels4_l2_8_mmi,
 * which produces dst.
 */
void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];
    uint8_t hv_half[W * W];

    copy_block4_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    put_h264_qpel4_hv_lowpass_mmi(hv_half, src, W, stride);
    ff_put_pixels4_l2_8_mmi(dst, v_half, hv_half, stride, W, W, W);
}
2428
/*
 * Fills one 16-byte block with put_h264_qpel4_v_lowpass_mmi, fed from a
 * packed copy of 9 rows taken from two rows above src and shifted one byte
 * to the right, and a second with put_h264_qpel4_hv_lowpass_mmi on src.
 * Both go to ff_put_pixels4_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];
    uint8_t hv_half[W * W];

    copy_block4_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    put_h264_qpel4_hv_lowpass_mmi(hv_half, src, W, stride);
    ff_put_pixels4_l2_8_mmi(dst, v_half, hv_half, stride, W, W, W);
}
2441
2442 //DEF_H264_MC_MMI(avg_, 4)
/*
 * Thin wrapper: forwards dst, src and stride unchanged to
 * ff_avg_pixels4_8_mmi, with 4 as the trailing argument.
 * NOTE(review): that argument is presumably the row count - confirm.
 */
void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4 };

    ff_avg_pixels4_8_mmi(dst, src, stride, W);
}
2448
/*
 * Filters src with put_h264_qpel4_h_lowpass_mmi into a 16-byte scratch
 * block, then passes src and that block to ff_avg_pixels4_l2_8_mmi, which
 * updates dst.
 */
void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4 };
    uint8_t h_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src, W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, src, h_half, stride, stride, W, W);
}
2456
/*
 * Thin wrapper: calls avg_h264_qpel4_h_lowpass_mmi on dst and src,
 * supplying stride for both of its trailing arguments.
 */
void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const ptrdiff_t pitch = stride;

    avg_h264_qpel4_h_lowpass_mmi(dst, src, pitch, pitch);
}
2462
/*
 * Filters src with put_h264_qpel4_h_lowpass_mmi into a 16-byte scratch
 * block, then passes src + 1 and that block to ff_avg_pixels4_l2_8_mmi,
 * which updates dst.
 */
void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4 };
    uint8_t h_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src, W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, src + 1, h_half, stride, stride, W, W);
}
2470
/*
 * Copies 9 rows of 4 bytes, starting two rows above src, into a packed
 * scratch array and filters it (from byte offset 8) with
 * put_h264_qpel4_v_lowpass_mmi into a 16-byte block. The scratch data at
 * byte offset 8 and the filtered block are then passed to
 * ff_avg_pixels4_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block4_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels4_l2_8_mmi(dst, center, v_half, stride, W, W, W);
}
2481
/*
 * Copies 9 rows of 4 bytes, starting two rows above src, into a packed
 * 36-byte scratch array, then runs avg_h264_qpel4_v_lowpass_mmi on that
 * array (from byte offset 8) with dst/stride as the output.
 */
void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;

    copy_block4_mmi(padded, src - 2 * stride, W, stride, ROWS);
    avg_h264_qpel4_v_lowpass_mmi(dst, center, stride, W);
}
2490
/*
 * Copies 9 rows of 4 bytes, starting two rows above src, into a packed
 * scratch array and filters it (from byte offset 8) with
 * put_h264_qpel4_v_lowpass_mmi into a 16-byte block. The scratch data at
 * byte offset 12 and the filtered block are then passed to
 * ff_avg_pixels4_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block4_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels4_l2_8_mmi(dst, center + W, v_half, stride, W, W, W);
}
2501
/*
 * Builds two 16-byte intermediates: one from put_h264_qpel4_h_lowpass_mmi
 * applied to src, one from put_h264_qpel4_v_lowpass_mmi applied to a packed
 * copy of the 9 rows that start two rows above src. Both are passed to
 * ff_avg_pixels4_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src, W, stride);
    copy_block4_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels4_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2514
/*
 * Builds two 16-byte intermediates: one from put_h264_qpel4_h_lowpass_mmi
 * applied to src, one from put_h264_qpel4_v_lowpass_mmi applied to a packed
 * copy of 9 rows taken from two rows above src, shifted one byte to the
 * right. Both are passed to ff_avg_pixels4_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src, W, stride);
    copy_block4_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels4_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2527
/*
 * Builds two 16-byte intermediates: one from put_h264_qpel4_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel4_v_lowpass_mmi applied
 * to a packed copy of the 9 rows that start two rows above src. Both are
 * passed to ff_avg_pixels4_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block4_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels4_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2540
/*
 * Builds two 16-byte intermediates: one from put_h264_qpel4_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel4_v_lowpass_mmi applied
 * to a packed copy of 9 rows taken from two rows above src, shifted one
 * byte to the right. Both are passed to ff_avg_pixels4_l2_8_mmi, which
 * updates dst.
 */
void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block4_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels4_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2553
/*
 * Thin wrapper: calls avg_h264_qpel4_hv_lowpass_mmi on dst and src,
 * supplying stride for both of its trailing arguments.
 */
void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const ptrdiff_t pitch = stride;

    avg_h264_qpel4_hv_lowpass_mmi(dst, src, pitch, pitch);
}
2559
/*
 * Fills one 16-byte block with put_h264_qpel4_h_lowpass_mmi on src and a
 * second with put_h264_qpel4_hv_lowpass_mmi on src, then passes both to
 * ff_avg_pixels4_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4 };
    uint8_t h_half[W * W];
    uint8_t hv_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src, W, stride);
    put_h264_qpel4_hv_lowpass_mmi(hv_half, src, W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, h_half, hv_half, stride, W, W, W);
}
2569
/*
 * Fills one 16-byte block with put_h264_qpel4_h_lowpass_mmi applied one row
 * below src and a second with put_h264_qpel4_hv_lowpass_mmi on src, then
 * passes both to ff_avg_pixels4_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4 };
    uint8_t h_half[W * W];
    uint8_t hv_half[W * W];

    put_h264_qpel4_h_lowpass_mmi(h_half, src + stride, W, stride);
    put_h264_qpel4_hv_lowpass_mmi(hv_half, src, W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, h_half, hv_half, stride, W, W, W);
}
2579
/*
 * Fills one 16-byte block with put_h264_qpel4_v_lowpass_mmi, fed from a
 * packed copy of the 9 rows starting two rows above src, and a second with
 * put_h264_qpel4_hv_lowpass_mmi on src. Both go to ff_avg_pixels4_l2_8_mmi,
 * which updates dst.
 */
void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];
    uint8_t hv_half[W * W];

    copy_block4_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    put_h264_qpel4_hv_lowpass_mmi(hv_half, src, W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, v_half, hv_half, stride, W, W, W);
}
2592
/*
 * Fills one 16-byte block with put_h264_qpel4_v_lowpass_mmi, fed from a
 * packed copy of 9 rows taken from two rows above src and shifted one byte
 * to the right, and a second with put_h264_qpel4_hv_lowpass_mmi on src.
 * Both go to ff_avg_pixels4_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 4, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];
    uint8_t hv_half[W * W];

    copy_block4_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel4_v_lowpass_mmi(v_half, center, W, W);
    put_h264_qpel4_hv_lowpass_mmi(hv_half, src, W, stride);
    ff_avg_pixels4_l2_8_mmi(dst, v_half, hv_half, stride, W, W, W);
}
2605
2606 //DEF_H264_MC_MMI(put_, 8)
/*
 * Thin wrapper: forwards dst, src and stride unchanged to
 * ff_put_pixels8_8_mmi, with 8 as the trailing argument.
 * NOTE(review): that argument is presumably the row count - confirm.
 */
void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8 };

    ff_put_pixels8_8_mmi(dst, src, stride, W);
}
2612
/*
 * Filters src with put_h264_qpel8_h_lowpass_mmi into a 64-byte scratch
 * block, then passes src and that block to ff_put_pixels8_l2_8_mmi, which
 * produces dst.
 */
void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8 };
    uint8_t h_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src, W, stride);
    ff_put_pixels8_l2_8_mmi(dst, src, h_half, stride, stride, W, W);
}
2620
/*
 * Thin wrapper: calls put_h264_qpel8_h_lowpass_mmi on dst and src,
 * supplying stride for both of its trailing arguments.
 */
void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const ptrdiff_t pitch = stride;

    put_h264_qpel8_h_lowpass_mmi(dst, src, pitch, pitch);
}
2626
/*
 * Filters src with put_h264_qpel8_h_lowpass_mmi into a 64-byte scratch
 * block, then passes src + 1 and that block to ff_put_pixels8_l2_8_mmi,
 * which produces dst.
 */
void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8 };
    uint8_t h_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src, W, stride);
    ff_put_pixels8_l2_8_mmi(dst, src + 1, h_half, stride, stride, W, W);
}
2634
/*
 * Gathers source data starting two rows above src into a 104-byte scratch
 * array with copy_block8_mmi and filters it (from byte offset 16) with
 * put_h264_qpel8_v_lowpass_mmi into a 64-byte block. The scratch data at
 * byte offset 16 and the filtered block are then passed to
 * ff_put_pixels8_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels8_l2_8_mmi(dst, center, v_half, stride, W, W, W);
}
2645
/*
 * Gathers source data starting two rows above src into a 104-byte scratch
 * array with copy_block8_mmi, then runs put_h264_qpel8_v_lowpass_mmi on
 * that array (from byte offset 16) with dst/stride as the output.
 */
void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;

    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(dst, center, stride, W);
}
2654
/*
 * Gathers source data starting two rows above src into a 104-byte scratch
 * array with copy_block8_mmi and filters it (from byte offset 16) with
 * put_h264_qpel8_v_lowpass_mmi into a 64-byte block. The scratch data at
 * byte offset 24 and the filtered block are then passed to
 * ff_put_pixels8_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels8_l2_8_mmi(dst, center + W, v_half, stride, W, W, W);
}
2665
/*
 * Builds two 64-byte intermediates: one from put_h264_qpel8_h_lowpass_mmi
 * applied to src, one from put_h264_qpel8_v_lowpass_mmi applied to a
 * scratch copy (copy_block8_mmi) that starts two rows above src. Both are
 * passed to ff_put_pixels8_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src, W, stride);
    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels8_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2678
/*
 * Builds two 64-byte intermediates: one from put_h264_qpel8_h_lowpass_mmi
 * applied to src, one from put_h264_qpel8_v_lowpass_mmi applied to a
 * scratch copy (copy_block8_mmi) that starts two rows above src, shifted
 * one byte to the right. Both are passed to ff_put_pixels8_l2_8_mmi, which
 * produces dst.
 */
void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src, W, stride);
    copy_block8_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels8_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2691
/*
 * Builds two 64-byte intermediates: one from put_h264_qpel8_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel8_v_lowpass_mmi applied
 * to a scratch copy (copy_block8_mmi) that starts two rows above src. Both
 * are passed to ff_put_pixels8_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels8_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2704
/*
 * Builds two 64-byte intermediates: one from put_h264_qpel8_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel8_v_lowpass_mmi applied
 * to a scratch copy (copy_block8_mmi) that starts two rows above src,
 * shifted one byte to the right. Both are passed to
 * ff_put_pixels8_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block8_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels8_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2717
/*
 * Provides an 8-byte-aligned array of 192 uint16_t as the second argument
 * of put_h264_qpel8_hv_lowpass_mmi, which is called with dst, src, and
 * (stride, 8, stride) as its trailing arguments.
 * NOTE(review): the array is presumably working storage for the helper -
 * confirm against its definition.
 */
void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8 };
    uint16_t __attribute__ ((aligned(8))) work[192];

    put_h264_qpel8_hv_lowpass_mmi(dst, work, src, stride, W, stride);
}
2725
/*
 * Splits one 8-byte-aligned 448-byte array into a leading 64-byte block and
 * an int16_t area that starts at byte 64. put_h264_qpel8_hv_lowpass_mmi is
 * given both; the 64-byte block is then passed, together with src, to
 * put_h264_qpel8_h_lowpass_l2_mmi, which produces dst.
 */
void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[448];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    put_h264_qpel8_h_lowpass_l2_mmi(dst, src, hv_half, stride, W);
}
2736
/*
 * Splits one 8-byte-aligned 448-byte array into a leading 64-byte block and
 * an int16_t area that starts at byte 64. put_h264_qpel8_hv_lowpass_mmi is
 * given both; the 64-byte block is then passed, together with the row below
 * src, to put_h264_qpel8_h_lowpass_l2_mmi, which produces dst.
 */
void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[448];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    put_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, hv_half, stride, W);
}
2747
/*
 * Splits one 8-byte-aligned 448-byte array into a leading 64-byte block and
 * an int16_t area that starts at byte 64. put_h264_qpel8_hv_lowpass_mmi is
 * given both; put_pixels8_l2_shift5_mmi then receives the int16_t area
 * advanced by 2 elements plus the 64-byte block, and produces dst.
 * NOTE(review): the int16_t area presumably holds intermediate values left
 * by the hv pass - confirm against that helper.
 */
void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[448];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    put_pixels8_l2_shift5_mmi(dst, v_tmp + 2, hv_half, stride, W, W);
}
2758
/*
 * Splits one 8-byte-aligned 448-byte array into a leading 64-byte block and
 * an int16_t area that starts at byte 64. put_h264_qpel8_hv_lowpass_mmi is
 * given both; put_pixels8_l2_shift5_mmi then receives the int16_t area
 * advanced by 3 elements plus the 64-byte block, and produces dst.
 * NOTE(review): the int16_t area presumably holds intermediate values left
 * by the hv pass - confirm against that helper.
 */
void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[448];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    put_pixels8_l2_shift5_mmi(dst, v_tmp + 3, hv_half, stride, W, W);
}
2769
2770 //DEF_H264_MC_MMI(avg_, 8)
/*
 * Thin wrapper: forwards dst, src and stride unchanged to
 * ff_avg_pixels8_8_mmi, with 8 as the trailing argument.
 * NOTE(review): that argument is presumably the row count - confirm.
 */
void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8 };

    ff_avg_pixels8_8_mmi(dst, src, stride, W);
}
2776
/*
 * Filters src with put_h264_qpel8_h_lowpass_mmi into a 64-byte scratch
 * block, then passes src and that block to ff_avg_pixels8_l2_8_mmi, which
 * updates dst.
 */
void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8 };
    uint8_t h_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src, W, stride);
    ff_avg_pixels8_l2_8_mmi(dst, src, h_half, stride, stride, W, W);
}
2784
/*
 * Thin wrapper: calls avg_h264_qpel8_h_lowpass_mmi on dst and src,
 * supplying stride for both of its trailing arguments.
 */
void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const ptrdiff_t pitch = stride;

    avg_h264_qpel8_h_lowpass_mmi(dst, src, pitch, pitch);
}
2790
/*
 * Filters src with put_h264_qpel8_h_lowpass_mmi into a 64-byte scratch
 * block, then passes src + 1 and that block to ff_avg_pixels8_l2_8_mmi,
 * which updates dst.
 */
void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8 };
    uint8_t h_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src, W, stride);
    ff_avg_pixels8_l2_8_mmi(dst, src + 1, h_half, stride, stride, W, W);
}
2798
/*
 * Gathers source data starting two rows above src into a 104-byte scratch
 * array with copy_block8_mmi and filters it (from byte offset 16) with
 * put_h264_qpel8_v_lowpass_mmi into a 64-byte block. The scratch data at
 * byte offset 16 and the filtered block are then passed to
 * ff_avg_pixels8_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels8_l2_8_mmi(dst, center, v_half, stride, W, W, W);
}
2809
/*
 * Gathers source data starting two rows above src into a 104-byte scratch
 * array with copy_block8_mmi, then runs avg_h264_qpel8_v_lowpass_mmi on
 * that array (from byte offset 16) with dst/stride as the output.
 */
void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;

    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    avg_h264_qpel8_v_lowpass_mmi(dst, center, stride, W);
}
2818
/*
 * Gathers source data starting two rows above src into a 104-byte scratch
 * array with copy_block8_mmi and filters it (from byte offset 16) with
 * put_h264_qpel8_v_lowpass_mmi into a 64-byte block. The scratch data at
 * byte offset 24 and the filtered block are then passed to
 * ff_avg_pixels8_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels8_l2_8_mmi(dst, center + W, v_half, stride, W, W, W);
}
2829
/*
 * Builds two 64-byte intermediates: one from put_h264_qpel8_h_lowpass_mmi
 * applied to src, one from put_h264_qpel8_v_lowpass_mmi applied to a
 * scratch copy (copy_block8_mmi) that starts two rows above src. Both are
 * passed to ff_avg_pixels8_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src, W, stride);
    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels8_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2842
/*
 * Builds two 64-byte intermediates: one from put_h264_qpel8_h_lowpass_mmi
 * applied to src, one from put_h264_qpel8_v_lowpass_mmi applied to a
 * scratch copy (copy_block8_mmi) that starts two rows above src, shifted
 * one byte to the right. Both are passed to ff_avg_pixels8_l2_8_mmi, which
 * updates dst.
 */
void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src, W, stride);
    copy_block8_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels8_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2855
/*
 * Builds two 64-byte intermediates: one from put_h264_qpel8_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel8_v_lowpass_mmi applied
 * to a scratch copy (copy_block8_mmi) that starts two rows above src. Both
 * are passed to ff_avg_pixels8_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block8_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels8_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2868
/*
 * Builds two 64-byte intermediates: one from put_h264_qpel8_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel8_v_lowpass_mmi applied
 * to a scratch copy (copy_block8_mmi) that starts two rows above src,
 * shifted one byte to the right. Both are passed to
 * ff_avg_pixels8_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel8_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block8_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel8_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels8_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
2881
/*
 * Provides an 8-byte-aligned array of 192 uint16_t as the second argument
 * of avg_h264_qpel8_hv_lowpass_mmi, which is called with dst, src, and
 * (stride, 8, stride) as its trailing arguments.
 * NOTE(review): the array is presumably working storage for the helper -
 * confirm against its definition.
 */
void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8 };
    uint16_t __attribute__ ((aligned(8))) work[192];

    avg_h264_qpel8_hv_lowpass_mmi(dst, work, src, stride, W, stride);
}
2889
/*
 * Splits one 8-byte-aligned 448-byte array into a leading 64-byte block and
 * an int16_t area that starts at byte 64. put_h264_qpel8_hv_lowpass_mmi is
 * given both; the 64-byte block is then passed, together with src, to
 * avg_h264_qpel8_h_lowpass_l2_mmi, which updates dst.
 */
void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[448];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src, hv_half, stride, W);
}
2900
/*
 * Splits one 8-byte-aligned 448-byte array into a leading 64-byte block and
 * an int16_t area that starts at byte 64. put_h264_qpel8_hv_lowpass_mmi is
 * given both; the 64-byte block is then passed, together with the row below
 * src, to avg_h264_qpel8_h_lowpass_l2_mmi, which updates dst.
 */
void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[448];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, hv_half, stride, W);
}
2911
/*
 * Splits one 8-byte-aligned 448-byte array into a leading 64-byte block and
 * an int16_t area that starts at byte 64. put_h264_qpel8_hv_lowpass_mmi is
 * given both; avg_pixels8_l2_shift5_mmi then receives the int16_t area
 * advanced by 2 elements plus the 64-byte block, and updates dst.
 * NOTE(review): the int16_t area presumably holds intermediate values left
 * by the hv pass - confirm against that helper.
 */
void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[448];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    avg_pixels8_l2_shift5_mmi(dst, v_tmp + 2, hv_half, stride, W, W);
}
2922
/*
 * Splits one 8-byte-aligned 448-byte array into a leading 64-byte block and
 * an int16_t area that starts at byte 64. put_h264_qpel8_hv_lowpass_mmi is
 * given both; avg_pixels8_l2_shift5_mmi then receives the int16_t area
 * advanced by 3 elements plus the 64-byte block, and updates dst.
 * NOTE(review): the int16_t area presumably holds intermediate values left
 * by the hv pass - confirm against that helper.
 */
void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    enum { W = 8, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[448];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel8_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    avg_pixels8_l2_shift5_mmi(dst, v_tmp + 3, hv_half, stride, W, W);
}
2933
2934 //DEF_H264_MC_MMI(put_, 16)
/*
 * Thin wrapper: forwards dst, src and stride unchanged to
 * ff_put_pixels16_8_mmi, with 16 as the trailing argument.
 * NOTE(review): that argument is presumably the row count - confirm.
 */
void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16 };

    ff_put_pixels16_8_mmi(dst, src, stride, W);
}
2940
/*
 * Filters src with put_h264_qpel16_h_lowpass_mmi into a 256-byte scratch
 * block, then passes src and that block to ff_put_pixels16_l2_8_mmi, which
 * produces dst.
 */
void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16 };
    uint8_t h_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src, W, stride);
    ff_put_pixels16_l2_8_mmi(dst, src, h_half, stride, stride, W, W);
}
2948
/*
 * Thin wrapper: calls put_h264_qpel16_h_lowpass_mmi on dst and src,
 * supplying stride for both of its trailing arguments.
 */
void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const ptrdiff_t pitch = stride;

    put_h264_qpel16_h_lowpass_mmi(dst, src, pitch, pitch);
}
2954
/*
 * Filters src with put_h264_qpel16_h_lowpass_mmi into a 256-byte scratch
 * block, then passes src + 1 and that block to ff_put_pixels16_l2_8_mmi,
 * which produces dst.
 */
void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16 };
    uint8_t h_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src, W, stride);
    ff_put_pixels16_l2_8_mmi(dst, src + 1, h_half, stride, stride, W, W);
}
2962
/*
 * Gathers source data starting two rows above src into a 336-byte scratch
 * array with copy_block16_mmi and filters it (from byte offset 32) with
 * put_h264_qpel16_v_lowpass_mmi into a 256-byte block. The scratch data at
 * byte offset 32 and the filtered block are then passed to
 * ff_put_pixels16_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels16_l2_8_mmi(dst, center, v_half, stride, W, W, W);
}
2973
/*
 * Gathers source data starting two rows above src into a 336-byte scratch
 * array with copy_block16_mmi, then runs put_h264_qpel16_v_lowpass_mmi on
 * that array (from byte offset 32) with dst/stride as the output.
 */
void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;

    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(dst, center, stride, W);
}
2982
/*
 * Gathers source data starting two rows above src into a 336-byte scratch
 * array with copy_block16_mmi and filters it (from byte offset 32) with
 * put_h264_qpel16_v_lowpass_mmi into a 256-byte block. The scratch data at
 * byte offset 48 and the filtered block are then passed to
 * ff_put_pixels16_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels16_l2_8_mmi(dst, center + W, v_half, stride, W, W, W);
}
2993
/*
 * Builds two 256-byte intermediates: one from put_h264_qpel16_h_lowpass_mmi
 * applied to src, one from put_h264_qpel16_v_lowpass_mmi applied to a
 * scratch copy (copy_block16_mmi) that starts two rows above src. Both are
 * passed to ff_put_pixels16_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src, W, stride);
    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels16_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
3006
/*
 * Builds two 256-byte intermediates: one from put_h264_qpel16_h_lowpass_mmi
 * applied to src, one from put_h264_qpel16_v_lowpass_mmi applied to a
 * scratch copy (copy_block16_mmi) that starts two rows above src, shifted
 * one byte to the right. Both are passed to ff_put_pixels16_l2_8_mmi, which
 * produces dst.
 */
void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src, W, stride);
    copy_block16_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels16_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
3019
/*
 * Builds two 256-byte intermediates: one from put_h264_qpel16_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel16_v_lowpass_mmi applied
 * to a scratch copy (copy_block16_mmi) that starts two rows above src. Both
 * are passed to ff_put_pixels16_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels16_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
3032
/*
 * Builds two 256-byte intermediates: one from put_h264_qpel16_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel16_v_lowpass_mmi applied
 * to a scratch copy (copy_block16_mmi) that starts two rows above src,
 * shifted one byte to the right. Both are passed to
 * ff_put_pixels16_l2_8_mmi, which produces dst.
 */
void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block16_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_put_pixels16_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
3045
/*
 * Provides an 8-byte-aligned array of 384 uint16_t as the second argument
 * of put_h264_qpel16_hv_lowpass_mmi, which is called with dst, src, and
 * (stride, 16, stride) as its trailing arguments.
 * NOTE(review): the array is presumably working storage for the helper -
 * confirm against its definition.
 */
void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16 };
    uint16_t __attribute__ ((aligned(8))) work[384];

    put_h264_qpel16_hv_lowpass_mmi(dst, work, src, stride, W, stride);
}
3053
/*
 * Splits one 8-byte-aligned 1024-byte array into a leading 256-byte block
 * and an int16_t area that starts at byte 256.
 * put_h264_qpel16_hv_lowpass_mmi is given both; the 256-byte block is then
 * passed, together with src, to put_h264_qpel16_h_lowpass_l2_mmi, which
 * produces dst.
 */
void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[1024];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel16_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    put_h264_qpel16_h_lowpass_l2_mmi(dst, src, hv_half, stride, W);
}
3064
/*
 * Splits one 8-byte-aligned 1024-byte array into a leading 256-byte block
 * and an int16_t area that starts at byte 256.
 * put_h264_qpel16_hv_lowpass_mmi is given both; the 256-byte block is then
 * passed, together with the row below src, to
 * put_h264_qpel16_h_lowpass_l2_mmi, which produces dst.
 */
void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[1024];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel16_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    put_h264_qpel16_h_lowpass_l2_mmi(dst, src + stride, hv_half, stride, W);
}
3075
/*
 * Splits one 8-byte-aligned 1024-byte array into a leading 256-byte block
 * and an int16_t area that starts at byte 256.
 * put_h264_qpel16_hv_lowpass_mmi is given both; put_pixels16_l2_shift5_mmi
 * then receives the int16_t area advanced by 2 elements plus the 256-byte
 * block, and produces dst.
 * NOTE(review): the int16_t area presumably holds intermediate values left
 * by the hv pass - confirm against that helper.
 */
void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[1024];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel16_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    put_pixels16_l2_shift5_mmi(dst, v_tmp + 2, hv_half, stride, W, W);
}
3086
/*
 * Splits one 8-byte-aligned 1024-byte array into a leading 256-byte block
 * and an int16_t area that starts at byte 256.
 * put_h264_qpel16_hv_lowpass_mmi is given both; put_pixels16_l2_shift5_mmi
 * then receives the int16_t area advanced by 3 elements plus the 256-byte
 * block, and produces dst.
 * NOTE(review): the int16_t area presumably holds intermediate values left
 * by the hv pass - confirm against that helper.
 */
void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, HV_BYTES = W * W };
    uint8_t __attribute__ ((aligned(8))) scratch[1024];
    uint8_t *const hv_half = scratch;
    int16_t *const v_tmp = (int16_t *) (scratch + HV_BYTES);

    put_h264_qpel16_hv_lowpass_mmi(hv_half, v_tmp, src, W, W, stride);
    put_pixels16_l2_shift5_mmi(dst, v_tmp + 3, hv_half, stride, W, W);
}
3097
3098 //DEF_H264_MC_MMI(avg_, 16)
/*
 * Thin wrapper: forwards dst, src and stride unchanged to
 * ff_avg_pixels16_8_mmi, with 16 as the trailing argument.
 * NOTE(review): that argument is presumably the row count - confirm.
 */
void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16 };

    ff_avg_pixels16_8_mmi(dst, src, stride, W);
}
3104
/*
 * Filters src with put_h264_qpel16_h_lowpass_mmi into a 256-byte scratch
 * block, then passes src and that block to ff_avg_pixels16_l2_8_mmi, which
 * updates dst.
 */
void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16 };
    uint8_t h_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src, W, stride);
    ff_avg_pixels16_l2_8_mmi(dst, src, h_half, stride, stride, W, W);
}
3112
/*
 * Thin wrapper: calls avg_h264_qpel16_h_lowpass_mmi on dst and src,
 * supplying stride for both of its trailing arguments.
 */
void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    const ptrdiff_t pitch = stride;

    avg_h264_qpel16_h_lowpass_mmi(dst, src, pitch, pitch);
}
3118
/*
 * Filters src with put_h264_qpel16_h_lowpass_mmi into a 256-byte scratch
 * block, then passes src + 1 and that block to ff_avg_pixels16_l2_8_mmi,
 * which updates dst.
 */
void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16 };
    uint8_t h_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src, W, stride);
    ff_avg_pixels16_l2_8_mmi(dst, src + 1, h_half, stride, stride, W, W);
}
3126
/*
 * Gathers source data starting two rows above src into a 336-byte scratch
 * array with copy_block16_mmi and filters it (from byte offset 32) with
 * put_h264_qpel16_v_lowpass_mmi into a 256-byte block. The scratch data at
 * byte offset 32 and the filtered block are then passed to
 * ff_avg_pixels16_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels16_l2_8_mmi(dst, center, v_half, stride, W, W, W);
}
3137
/*
 * Gathers source data starting two rows above src into a 336-byte scratch
 * array with copy_block16_mmi, then runs avg_h264_qpel16_v_lowpass_mmi on
 * that array (from byte offset 32) with dst/stride as the output.
 */
void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;

    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    avg_h264_qpel16_v_lowpass_mmi(dst, center, stride, W);
}
3146
/*
 * Gathers source data starting two rows above src into a 336-byte scratch
 * array with copy_block16_mmi and filters it (from byte offset 32) with
 * put_h264_qpel16_v_lowpass_mmi into a 256-byte block. The scratch data at
 * byte offset 48 and the filtered block are then passed to
 * ff_avg_pixels16_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t v_half[W * W];

    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels16_l2_8_mmi(dst, center + W, v_half, stride, W, W, W);
}
3157
/*
 * Builds two 256-byte intermediates: one from put_h264_qpel16_h_lowpass_mmi
 * applied to src, one from put_h264_qpel16_v_lowpass_mmi applied to a
 * scratch copy (copy_block16_mmi) that starts two rows above src. Both are
 * passed to ff_avg_pixels16_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src, W, stride);
    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels16_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
3170
/*
 * Builds two 256-byte intermediates: one from put_h264_qpel16_h_lowpass_mmi
 * applied to src, one from put_h264_qpel16_v_lowpass_mmi applied to a
 * scratch copy (copy_block16_mmi) that starts two rows above src, shifted
 * one byte to the right. Both are passed to ff_avg_pixels16_l2_8_mmi, which
 * updates dst.
 */
void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src, W, stride);
    copy_block16_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels16_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
3183
/*
 * Builds two 256-byte intermediates: one from put_h264_qpel16_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel16_v_lowpass_mmi applied
 * to a scratch copy (copy_block16_mmi) that starts two rows above src. Both
 * are passed to ff_avg_pixels16_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block16_mmi(padded, src - 2 * stride, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels16_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
3196
/*
 * Builds two 256-byte intermediates: one from put_h264_qpel16_h_lowpass_mmi
 * applied one row below src, one from put_h264_qpel16_v_lowpass_mmi applied
 * to a scratch copy (copy_block16_mmi) that starts two rows above src,
 * shifted one byte to the right. Both are passed to
 * ff_avg_pixels16_l2_8_mmi, which updates dst.
 */
void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    enum { W = 16, ROWS = W + 5 };
    uint8_t padded[W * ROWS];
    uint8_t *const center = padded + 2 * W;
    uint8_t h_half[W * W];
    uint8_t v_half[W * W];

    put_h264_qpel16_h_lowpass_mmi(h_half, src + stride, W, stride);
    copy_block16_mmi(padded, src - 2 * stride + 1, W, stride, ROWS);
    put_h264_qpel16_v_lowpass_mmi(v_half, center, W, W);
    ff_avg_pixels16_l2_8_mmi(dst, h_half, v_half, stride, W, W, W);
}
3209
/*
 * 16x16 "avg" quarter-pel entry point, variant mc22.
 *
 * Forwards directly to avg_h264_qpel16_hv_lowpass_mmi(), supplying an
 * 8-byte-aligned stack scratch area of 384 16-bit elements. stride is
 * passed twice (4th and 6th argument) and 16 as the 5th.
 *
 * NOTE(review): the 4th/5th/6th arguments are presumably the dst, scratch
 * and src strides respectively -- confirm against the helper's prototype.
 */
void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint16_t __attribute__ ((aligned(8))) scratch[384];

    avg_h264_qpel16_hv_lowpass_mmi(dst, scratch, src, stride, 16, stride);
}
3217
/*
 * 16x16 "avg" quarter-pel entry point, variant mc21.
 *
 * A single 8-byte-aligned 1024-byte stack buffer is split in two:
 *   - bytes [0, 256):    hv_half, the 8-bit plane handed to
 *                        put_h264_qpel16_hv_lowpass_mmi() as its first
 *                        argument;
 *   - bytes [256, 1024): v_work, the same helper's second argument,
 *                        viewed as int16_t (384 elements).
 *
 * After the hv pass, avg_h264_qpel16_h_lowpass_l2_mmi() is called with
 * dst, src and hv_half.
 */
void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t __attribute__ ((aligned(8))) scratch[1024];
    uint8_t *const hv_half = &scratch[0];
    int16_t *const v_work = (int16_t *) &scratch[256];

    put_h264_qpel16_hv_lowpass_mmi(hv_half, v_work, src, 16, 16, stride);

    avg_h264_qpel16_h_lowpass_l2_mmi(dst, src, hv_half, stride, 16);
}
3228
/*
 * 16x16 "avg" quarter-pel entry point, variant mc23.
 *
 * A single 8-byte-aligned 1024-byte stack buffer is split in two:
 *   - bytes [0, 256):    hv_half, the 8-bit plane handed to
 *                        put_h264_qpel16_hv_lowpass_mmi() as its first
 *                        argument;
 *   - bytes [256, 1024): v_work, the same helper's second argument,
 *                        viewed as int16_t (384 elements).
 *
 * After the hv pass, avg_h264_qpel16_h_lowpass_l2_mmi() is called with
 * dst, the row after src (src + stride) and hv_half.
 */
void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t __attribute__ ((aligned(8))) scratch[1024];
    uint8_t *const hv_half = &scratch[0];
    int16_t *const v_work = (int16_t *) &scratch[256];

    put_h264_qpel16_hv_lowpass_mmi(hv_half, v_work, src, 16, 16, stride);

    avg_h264_qpel16_h_lowpass_l2_mmi(dst, src + stride, hv_half, stride, 16);
}
3239
/*
 * 16x16 "avg" quarter-pel entry point, variant mc12.
 *
 * A single 8-byte-aligned 1024-byte stack buffer is split in two:
 *   - bytes [0, 256):    hv_half, the 8-bit plane handed to
 *                        put_h264_qpel16_hv_lowpass_mmi() as its first
 *                        argument;
 *   - bytes [256, 1024): v_work, the same helper's second argument,
 *                        viewed as int16_t (384 elements).
 *
 * After the hv pass, avg_pixels16_l2_shift5_mmi() is called with dst,
 * the 16-bit data starting at element 2 of v_work, and hv_half.
 */
void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t __attribute__ ((aligned(8))) scratch[1024];
    uint8_t *const hv_half = &scratch[0];
    int16_t *const v_work = (int16_t *) &scratch[256];

    put_h264_qpel16_hv_lowpass_mmi(hv_half, v_work, src, 16, 16, stride);

    /* 16-bit operand starts two elements into the intermediate. */
    avg_pixels16_l2_shift5_mmi(dst, &v_work[2], hv_half, stride, 16, 16);
}
3250
/*
 * 16x16 "avg" quarter-pel entry point, variant mc32.
 *
 * A single 8-byte-aligned 1024-byte stack buffer is split in two:
 *   - bytes [0, 256):    hv_half, the 8-bit plane handed to
 *                        put_h264_qpel16_hv_lowpass_mmi() as its first
 *                        argument;
 *   - bytes [256, 1024): v_work, the same helper's second argument,
 *                        viewed as int16_t (384 elements).
 *
 * After the hv pass, avg_pixels16_l2_shift5_mmi() is called with dst,
 * the 16-bit data starting at element 3 of v_work, and hv_half.
 */
void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t __attribute__ ((aligned(8))) scratch[1024];
    uint8_t *const hv_half = &scratch[0];
    int16_t *const v_work = (int16_t *) &scratch[256];

    put_h264_qpel16_hv_lowpass_mmi(hv_half, v_work, src, 16, 16, stride);

    /* 16-bit operand starts three elements into the intermediate. */
    avg_pixels16_l2_shift5_mmi(dst, &v_work[3], hv_half, stride, 16, 16);
}
3261
3262 #undef op2_avg
3263 #undef op2_put