git.sesse.net Git - ffmpeg/blob - libavcodec/mips/h264chroma_mmi.c
fate/aac: Increase fuzz from of fate-aac-pns-encode from 72 to 74 for Loongson
[ffmpeg] / libavcodec / mips / h264chroma_mmi.c
1 /*
2  * Loongson SIMD optimized h264chroma
3  *
4  * Copyright (c) 2015 Loongson Technology Corporation Limited
5  * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 #include "h264chroma_mips.h"
26
/**
 * H.264 chroma motion compensation, 8 pixels wide, "put" variant
 * (Loongson MMI inline assembly).
 *
 * For each of the h rows the asm is meant to evaluate, for i = 0..7,
 *
 *   dst[i] = (A * src[i] + B * src[i + 1] +
 *             C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6
 *
 * with the bilinear weights A..D derived from the fractional offsets x, y.
 * Three code paths are selected by which weights are non-zero.
 *
 * @param dst    destination; 8 bytes are stored per row (sdc1)
 * @param src    source; read with unaligned ldl/ldr pairs, up to
 *               src[stride + 8] per row in the 2-D case
 * @param stride byte distance between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fractional offset, asserted to be in 0..7
 * @param y      vertical fractional offset, asserted to be in 0..7
 *
 * Constraint notes: every asm block names all GPRs and FP registers it
 * overwrites ($f20/$f22/$f24 are callee-saved in the MIPS ABIs) and carries
 * a "memory" clobber, because the "m" operands only describe single bytes
 * while the instructions access 8 bytes.
 * NOTE(review): the FP clobbers assume 64-bit FPRs, which the MMI
 * instructions need anyway -- confirm for the build configuration in use.
 */
void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
        int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;
    const int E = B + C;
    int i;

    av_assert2(x<8 && y<8 && x>=0 && y>=0);

    if (D) {
        /* Both offsets non-zero: full four-tap bilinear filter.
         * $2..$5 hold the 8-byte rows src, src+1, src+stride, src+stride+1;
         * $f6/$f8/$f10/$f12 hold A/B/C/D, $f14 the rounding constant 32 and
         * $f22 the shift count 6.  mtc1 moves only the low word, so the row
         * is filtered as two 4-pixel halves (dsrl32 exposes the high half)
         * which packushb finally merges into 8 bytes. */
        for (i=0; i<h; i++) {
            __asm__ volatile (
                "ldl $2, %2                 \r\n"
                "ldr $2, %1                 \r\n"
                "ldl $3, %4                 \r\n"
                "ldr $3, %3                 \r\n"
                "ldl $4, %6                 \r\n"
                "ldr $4, %5                 \r\n"
                "ldl $5, %8                 \r\n"
                "ldr $5, %7                 \r\n"
                "daddiu $6, $0, 32          \r\n"
                "mtc1 %9, $f6               \r\n"
                "mtc1 %10, $f8              \r\n"
                "mtc1 %11, $f10             \r\n"
                "mtc1 %12, $f12             \r\n"
                "mtc1 $0, $f20              \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "mtc1 $4, $f16              \r\n"
                "mtc1 $5, $f18              \r\n"
                "mtc1 $6, $f14              \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pshufh $f6, $f6, $f20      \r\n"
                "pshufh $f8, $f8, $f20      \r\n"
                "pshufh $f10, $f10, $f20    \r\n"
                "pshufh $f12, $f12, $f20    \r\n"
                "pshufh $f14, $f14, $f20    \r\n"
                "punpcklbh $f16, $f16, $f20 \r\n"
                "punpcklbh $f18, $f18, $f20 \r\n"
                "daddiu $6, $0, 6           \r\n"
                "mtc1 $6, $f22              \r\n"
                "dsrl32 $2, $2, 0           \r\n"
                "dsrl32 $3, $3, 0           \r\n"
                "dsrl32 $4, $4, 0           \r\n"
                "dsrl32 $5, $5, 0           \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "pmullh $f16, $f10, $f16    \r\n"
                "pmullh $f18, $f12, $f18    \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f4, $f4, $f16       \r\n"
                "paddh $f2, $f2, $f18       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f24, $f2, $f22      \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "mtc1 $4, $f16              \r\n"
                "mtc1 $5, $f18              \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "punpcklbh $f16, $f16, $f20 \r\n"
                "punpcklbh $f18, $f18, $f20 \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "pmullh $f16, $f10, $f16    \r\n"
                "pmullh $f18, $f12, $f18    \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f4, $f4, $f16       \r\n"
                "paddh $f2, $f2, $f18       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f2, $f2, $f22       \r\n"
                "packushb $f2, $f24, $f2    \r\n"
                "sdc1 $f2, %0               \r\n"
                : "=m"(*dst)
                : "m"(*src),"m"(*(src+7)),"m"(*(src+1)),"m"(*(src+8)),
                  "m"(*(src+stride)),"m"(*(src+stride+7)),
                  "m"(*(src+stride+1)),"m"(*(src+stride+8)),
                  "r"(A),"r"(B),"r"(C),"r"(D)
                : "$2","$3","$4","$5","$6",
                  "$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
                  "$f18","$f20","$f22","$f24","memory"
            );

            dst += stride;
            src += stride;
        }
    } else if (E) {
        /* Exactly one offset is non-zero: two-tap filter
         * (A * src[i] + E * src[i + step] + 32) >> 6, where the second tap
         * is the pixel below (C != 0) or the pixel to the right. */
        const int step = C ? stride : 1;

        for (i=0; i<h; i++) {
            __asm__ volatile (
                "daddiu $6, $0, 32          \r\n"
                "ldl $2, %2                 \r\n"
                "ldr $2, %1                 \r\n"
                "ldl $3, %4                 \r\n"
                "ldr $3, %3                 \r\n"
                "mtc1 $6, $f14              \r\n"
                "mtc1 %5, $f6               \r\n"
                "mtc1 %6, $f8               \r\n"
                "mtc1 $0, $f20              \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "daddiu $6, $0, 6           \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pshufh $f6, $f6, $f20      \r\n"
                "pshufh $f8, $f8, $f20      \r\n"
                "pshufh $f14, $f14, $f20    \r\n"
                "mtc1 $6, $f22              \r\n"
                "dsrl32 $2, $2, 0           \r\n"
                "dsrl32 $3, $3, 0           \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f24, $f2, $f22      \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f2, $f2, $f22       \r\n"
                "packushb $f2, $f24, $f2    \r\n"
                "sdc1 $f2, %0               \r\n"
                : "=m"(*dst)
                : "m"(*(src)),"m"(*(src+7)),
                  "m"(*(src+step)),"m"(*(src+step+7)),
                  "r"(A),"r"(E)
                : "$2","$3","$4","$5","$6",
                  "$f2","$f4","$f6","$f8","$f14","$f20","$f22","$f24",
                  "memory"
            );

            dst += stride;
            src += stride;
        }
    } else {
        /* D == 0 and E == 0: within the asserted range this implies
         * x == y == 0 and therefore A == 64, so (A * src[i] + 32) >> 6
         * reproduces the source pixels. */
        for (i = 0; i < h; i++) {
            __asm__ volatile (
                "daddiu $6, $0, 32          \r\n"
                "ldl $2, %2                 \r\n"
                "ldr $2, %1                 \r\n"
                "mtc1 $6, $f14              \r\n"
                "mtc1 %3, $f6               \r\n"
                "mtc1 $0, $f20              \r\n"
                "mtc1 $2, $f2               \r\n"
                "daddiu $6, $0, 6           \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "pshufh $f6, $f6, $f20      \r\n"
                "pshufh $f14, $f14, $f20    \r\n"
                "mtc1 $6, $f22              \r\n"
                "dsrl32 $2, $2, 0           \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "psrah $f24, $f2, $f22      \r\n"
                "mtc1 $2, $f2               \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "psrah $f2, $f2, $f22       \r\n"
                "packushb $f2, $f24, $f2    \r\n"
                "sdc1 $f2, %0               \r\n"
                :"=m"(*dst)
                :"m"(*src),"m"(*(src+7)),"r"(A)
                /* $6 is written by both daddiu instructions above and was
                 * missing from the original clobber list. */
                :"$2","$6",
                 "$f2","$f6","$f14","$f20","$f22","$f24","memory"
            );

            dst += stride;
            src += stride;
        }
    }
}
202
/**
 * H.264 chroma motion compensation, 8 pixels wide, "avg" variant
 * (Loongson MMI inline assembly).
 *
 * Each row is filtered like the "put" variant, i.e. the asm is meant to
 * evaluate, for i = 0..7,
 *
 *   p[i] = (A * src[i] + B * src[i + 1] +
 *           C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6
 *
 * and the result is then combined with the 8 bytes already stored at dst
 * using pavgb before being written back.
 *
 * @param dst    destination; 8 bytes per row are read (ldc1) and rewritten
 *               (sdc1)
 * @param src    source; read with unaligned ldl/ldr pairs, up to
 *               src[stride + 8] per row in the 2-D case
 * @param stride byte distance between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fractional offset, asserted to be in 0..7
 * @param y      vertical fractional offset, asserted to be in 0..7
 *
 * Constraint notes: dst is a read-write operand ("+m") because the asm
 * loads it before storing; every asm block names all GPRs and FP registers
 * it overwrites ($f20/$f22/$f24 are callee-saved in the MIPS ABIs) and
 * carries a "memory" clobber, because the "m" operands only describe single
 * bytes while the instructions access 8 bytes.
 * NOTE(review): the FP clobbers assume 64-bit FPRs, which the MMI
 * instructions need anyway -- confirm for the build configuration in use.
 */
void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
        int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;
    const int E = B + C;
    int i;

    av_assert2(x<8 && y<8 && x>=0 && y>=0);

    if (D) {
        /* Both offsets non-zero: full four-tap bilinear filter.
         * $2..$5 hold the 8-byte rows src, src+1, src+stride, src+stride+1;
         * $f6/$f8/$f10/$f12 hold A/B/C/D, $f14 the rounding constant 32 and
         * $f22 the shift count 6.  mtc1 moves only the low word, so the row
         * is filtered as two 4-pixel halves (dsrl32 exposes the high half)
         * which packushb merges before the average with dst. */
        for (i=0; i<h; i++) {
            __asm__ volatile (
                "ldl $2, %2                 \r\n"
                "ldr $2, %1                 \r\n"
                "ldl $3, %4                 \r\n"
                "ldr $3, %3                 \r\n"
                "ldl $4, %6                 \r\n"
                "ldr $4, %5                 \r\n"
                "ldl $5, %8                 \r\n"
                "ldr $5, %7                 \r\n"
                "daddiu $6, $0, 32          \r\n"
                "mtc1 %9, $f6               \r\n"
                "mtc1 %10, $f8              \r\n"
                "mtc1 %11, $f10             \r\n"
                "mtc1 %12, $f12             \r\n"
                "mtc1 $0, $f20              \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "mtc1 $4, $f16              \r\n"
                "mtc1 $5, $f18              \r\n"
                "mtc1 $6, $f14              \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pshufh $f6, $f6, $f20      \r\n"
                "pshufh $f8, $f8, $f20      \r\n"
                "pshufh $f10, $f10, $f20    \r\n"
                "pshufh $f12, $f12, $f20    \r\n"
                "pshufh $f14, $f14, $f20    \r\n"
                "punpcklbh $f16, $f16, $f20 \r\n"
                "punpcklbh $f18, $f18, $f20 \r\n"
                "daddiu $6, $0, 6           \r\n"
                "mtc1 $6, $f22              \r\n"
                "dsrl32 $2, $2, 0           \r\n"
                "dsrl32 $3, $3, 0           \r\n"
                "dsrl32 $4, $4, 0           \r\n"
                "dsrl32 $5, $5, 0           \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "pmullh $f16, $f10, $f16    \r\n"
                "pmullh $f18, $f12, $f18    \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f4, $f4, $f16       \r\n"
                "paddh $f2, $f2, $f18       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f24, $f2, $f22      \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "mtc1 $4, $f16              \r\n"
                "mtc1 $5, $f18              \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "punpcklbh $f16, $f16, $f20 \r\n"
                "punpcklbh $f18, $f18, $f20 \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "pmullh $f16, $f10, $f16    \r\n"
                "pmullh $f18, $f12, $f18    \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f4, $f4, $f16       \r\n"
                "paddh $f2, $f2, $f18       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f2, $f2, $f22       \r\n"
                "packushb $f2, $f24, $f2    \r\n"
                "ldc1 $f4, %0               \r\n"
                "pavgb $f2, $f2, $f4        \r\n"
                "sdc1 $f2, %0               \r\n"
                /* "+m": dst is loaded by ldc1 before it is stored. */
                : "+m"(*dst)
                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+1)),"m"(*(src+8)),
                  "m"(*(src+stride)),"m"(*(src+stride+7)),
                  "m"(*(src+stride+1)),"m"(*(src+stride+8)),
                  "r"(A),"r"(B),"r"(C),"r"(D)
                : "$2","$3","$4","$5","$6",
                  "$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
                  "$f18","$f20","$f22","$f24","memory"
            );

            dst += stride;
            src += stride;
        }
    } else {
        /* At most one offset is non-zero: two-tap filter
         * (A * src[i] + E * src[i + step] + 32) >> 6, where the second tap
         * is the pixel below (C != 0) or the pixel to the right.  This
         * branch also handles E == 0, in which case the second tap has
         * weight 0. */
        const int step = C ? stride : 1;

        for (i=0; i<h; i++) {
            __asm__ volatile (
                "daddiu $6, $0, 32          \r\n"
                "ldl $2, %2                 \r\n"
                "ldr $2, %1                 \r\n"
                "ldl $3, %4                 \r\n"
                "ldr $3, %3                 \r\n"
                "mtc1 $6, $f14              \r\n"
                "mtc1 %5, $f6               \r\n"
                "mtc1 %6, $f8               \r\n"
                "mtc1 $0, $f20              \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "daddiu $6, $0, 6           \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pshufh $f6, $f6, $f20      \r\n"
                "pshufh $f8, $f8, $f20      \r\n"
                "pshufh $f14, $f14, $f20    \r\n"
                "mtc1 $6, $f22              \r\n"
                "dsrl32 $2, $2, 0           \r\n"
                "dsrl32 $3, $3, 0           \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f24, $f2, $f22      \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f2, $f2, $f22       \r\n"
                "packushb $f2, $f24, $f2    \r\n"
                "ldc1 $f4, %0               \r\n"
                "pavgb $f2, $f2, $f4        \r\n"
                "sdc1 $f2, %0               \r\n"
                /* "+m": dst is loaded by ldc1 before it is stored. */
                : "+m"(*dst)
                : "m"(*(src)),"m"(*(src+7)),
                  "m"(*(src+step)),"m"(*(src+step+7)),"r"(A),"r"(E)
                : "$2","$3","$4","$5","$6",
                  "$f2","$f4","$f6","$f8","$f14","$f20","$f22","$f24",
                  "memory"
            );

            dst += stride;
            src += stride;
        }
    }
}
347
/**
 * H.264 chroma motion compensation, 4 pixels wide, "put" variant
 * (Loongson MMI inline assembly).
 *
 * For each of the h rows the asm is meant to evaluate, for i = 0..3,
 *
 *   dst[i] = (A * src[i] + B * src[i + 1] +
 *             C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6
 *
 * with the bilinear weights A..D derived from the fractional offsets x, y.
 * Three code paths are selected by which weights are non-zero.
 *
 * @param dst    destination; 4 bytes are stored per row (swc1 / sw)
 * @param src    source; the filtering paths load 8 bytes per tap with
 *               ldl/ldr although only the low 4 are moved to the FPU
 * @param stride byte distance between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fractional offset, asserted to be in 0..7
 * @param y      vertical fractional offset, asserted to be in 0..7
 *
 * Constraint notes: every asm block names all GPRs and FP registers it
 * overwrites ($f20/$f22 are callee-saved in some MIPS ABIs) and carries a
 * "memory" clobber, because the "m" operands only describe single bytes
 * while the instructions access 4 or 8 bytes.
 * NOTE(review): the 8-byte source loads touch bytes beyond the 4-pixel
 * block (up to src[stride + 8]); confirm callers guarantee that padding.
 */
void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
        int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) *  y;
    const int D = x *  y;
    const int E = B + C;
    int i;

    av_assert2(x<8 && y<8 && x>=0 && y>=0);

    if (D) {
        /* Both offsets non-zero: full four-tap bilinear filter.
         * $2..$5 hold the rows src, src+1, src+stride, src+stride+1;
         * $f6/$f8/$f10/$f12 hold A/B/C/D, $f14 the rounding constant 32 and
         * $f22 the shift count 6. */
        for (i=0; i<h; i++) {
            __asm__ volatile (
                "ldl $2, %2                 \r\n"
                "ldr $2, %1                 \r\n"
                "ldl $3, %4                 \r\n"
                "ldr $3, %3                 \r\n"
                "ldl $4, %6                 \r\n"
                "ldr $4, %5                 \r\n"
                "ldl $5, %8                 \r\n"
                "ldr $5, %7                 \r\n"
                "daddiu $6, $0, 32          \r\n"
                "mtc1 %9, $f6               \r\n"
                "mtc1 %10, $f8              \r\n"
                "mtc1 %11, $f10             \r\n"
                "mtc1 %12, $f12             \r\n"
                "mtc1 $0, $f20              \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "mtc1 $4, $f16              \r\n"
                "mtc1 $5, $f18              \r\n"
                "mtc1 $6, $f14              \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pshufh $f6, $f6, $f20      \r\n"
                "pshufh $f8, $f8, $f20      \r\n"
                "pshufh $f10, $f10, $f20    \r\n"
                "pshufh $f12, $f12, $f20    \r\n"
                "pshufh $f14, $f14, $f20    \r\n"
                "punpcklbh $f16, $f16, $f20 \r\n"
                "punpcklbh $f18, $f18, $f20 \r\n"
                "daddiu $6, $0, 6           \r\n"
                "mtc1 $6, $f22              \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "pmullh $f16, $f10, $f16    \r\n"
                "pmullh $f18, $f12, $f18    \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f4, $f4, $f16       \r\n"
                "paddh $f2, $f2, $f18       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f2, $f2, $f22       \r\n"
                "packushb $f2, $f2, $f2     \r\n"
                "swc1 $f2, %0               \r\n"
                : "=m"(*dst)
                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+1)),"m"(*(src+8)),
                  "m"(*(src+stride)),"m"(*(src+stride+7)),
                  "m"(*(src+stride+1)),"m"(*(src+stride+8)),
                  "r"(A),"r"(B),"r"(C),"r"(D)
                : "$2","$3","$4","$5","$6",
                  "$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
                  "$f18","$f20","$f22","memory"
            );

            dst += stride;
            src += stride;
        }
    } else if (E) {
        /* Exactly one offset is non-zero: two-tap filter
         * (A * src[i] + E * src[i + step] + 32) >> 6, where the second tap
         * is the pixel below (C != 0) or the pixel to the right. */
        const int step = C ? stride : 1;

        for (i=0; i<h; i++) {
            __asm__ volatile (
                "ldl $2, %2                 \r\n"
                "ldr $2, %1                 \r\n"
                "ldl $3, %4                 \r\n"
                "ldr $3, %3                 \r\n"
                "daddiu $4, $0, 32          \r\n"
                "mtc1 %5, $f6               \r\n"
                "mtc1 %6, $f8               \r\n"
                "mtc1 $0, $f20              \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "mtc1 $4, $f10              \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pshufh $f6, $f6, $f20      \r\n"
                "pshufh $f8, $f8, $f20      \r\n"
                "pshufh $f10, $f10, $f20    \r\n"
                "daddiu $4, $0, 6           \r\n"
                "mtc1 $4, $f22              \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "paddh $f2, $f2, $f10       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f2, $f2, $f22       \r\n"
                "packushb $f2, $f2, $f20    \r\n"
                "swc1 $f2, %0               \r\n"
                : "=m"(*dst)
                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+step)),
                  "m"(*(src+step+7)),"r"(A),"r"(E)
                : "$2","$3","$4","$5","$6",
                  "$f2","$f4","$f6","$f8","$f10","$f20","$f22","memory"
            );

            dst += stride;
            src += stride;
        }
    } else {
        /* D == 0 and E == 0 (x == y == 0 within the asserted range):
         * plain 4-byte copy using an unaligned lwl/lwr load and a word
         * store. */
        for (i=0; i<h; i++) {
            __asm__ volatile (
                "lwl $2, %2                 \r\n"
                "lwr $2, %1                 \r\n"
                "sw $2, %0                  \r\n"
                : "=m"(*dst)
                : "m"(*src),"m"(*(src+3))
                : "$2","memory"
            );

            dst += stride;
            src += stride;
        }
    }
}
470
/**
 * H.264 chroma motion compensation, 4 pixels wide, "avg" variant
 * (Loongson MMI inline assembly).
 *
 * Each row is filtered like the "put" variant, i.e. the asm is meant to
 * evaluate, for i = 0..3,
 *
 *   p[i] = (A * src[i] + B * src[i + 1] +
 *           C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6
 *
 * and the result is then combined with the 4 bytes already stored at dst
 * using pavgb before being written back.
 *
 * @param dst    destination; 4 bytes per row are read (lwc1) and rewritten
 *               (swc1)
 * @param src    source; 8 bytes are loaded per tap with ldl/ldr although
 *               only the low 4 are moved to the FPU
 * @param stride byte distance between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fractional offset, asserted to be in 0..7
 * @param y      vertical fractional offset, asserted to be in 0..7
 *
 * Constraint notes: dst is a read-write operand ("+m") because the asm
 * loads it before storing; every asm block names all GPRs and FP registers
 * it overwrites ($f20/$f22 are callee-saved in some MIPS ABIs) and carries
 * a "memory" clobber, because the "m" operands only describe single bytes
 * while the instructions access 4 or 8 bytes.
 * NOTE(review): the 8-byte source loads touch bytes beyond the 4-pixel
 * block (up to src[stride + 8]); confirm callers guarantee that padding.
 */
void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
        int h, int x, int y)
{
    const int A = (8 - x) *(8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;
    int i;

    av_assert2(x<8 && y<8 && x>=0 && y>=0);

    if (D) {
        /* Both offsets non-zero: full four-tap bilinear filter.
         * $2..$5 hold the rows src, src+1, src+stride, src+stride+1;
         * $f6/$f8/$f10/$f12 hold A/B/C/D, $f14 the rounding constant 32 and
         * $f22 the shift count 6. */
        for (i=0; i<h; i++) {
            __asm__ volatile (
                "ldl $2, %2                 \r\n"
                "ldr $2, %1                 \r\n"
                "ldl $3, %4                 \r\n"
                "ldr $3, %3                 \r\n"
                "ldl $4, %6                 \r\n"
                "ldr $4, %5                 \r\n"
                "ldl $5, %8                 \r\n"
                "ldr $5, %7                 \r\n"
                "daddiu $6, $0, 32          \r\n"
                "mtc1 %9, $f6               \r\n"
                "mtc1 %10, $f8              \r\n"
                "mtc1 %11, $f10             \r\n"
                "mtc1 %12, $f12             \r\n"
                "mtc1 $0, $f20              \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "mtc1 $4, $f16              \r\n"
                "mtc1 $5, $f18              \r\n"
                "mtc1 $6, $f14              \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pshufh $f6, $f6, $f20      \r\n"
                "pshufh $f8, $f8, $f20      \r\n"
                "pshufh $f10, $f10, $f20    \r\n"
                "pshufh $f12, $f12, $f20    \r\n"
                "pshufh $f14, $f14, $f20    \r\n"
                "punpcklbh $f16, $f16, $f20 \r\n"
                "punpcklbh $f18, $f18, $f20 \r\n"
                "daddiu $6, $0, 6           \r\n"
                "mtc1 $6, $f22              \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "pmullh $f16, $f10, $f16    \r\n"
                "pmullh $f18, $f12, $f18    \r\n"
                "paddh $f2, $f2, $f14       \r\n"
                "paddh $f4, $f4, $f16       \r\n"
                "paddh $f2, $f2, $f18       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f2, $f2, $f22       \r\n"
                "packushb $f2, $f2, $f2     \r\n"
                "lwc1 $f4, %0               \r\n"
                "pavgb $f2, $f2, $f4        \r\n"
                "swc1 $f2, %0               \r\n"
                /* "+m": dst is loaded by lwc1 before it is stored. */
                : "+m"(*dst)
                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+1)),"m"(*(src+8)),
                  "m"(*(src+stride)),"m"(*(src+stride+7)),
                  "m"(*(src+stride+1)),"m"(*(src+stride+8)),
                  "r"(A),"r"(B),"r"(C),"r"(D)
                : "$2","$3","$4","$5","$6",
                  "$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
                  "$f18","$f20","$f22","memory"
            );

            dst += stride;
            src += stride;
        }
    } else {
        /* At most one offset is non-zero: two-tap filter
         * (A * src[i] + E * src[i + step] + 32) >> 6, where the second tap
         * is the pixel below (C != 0) or the pixel to the right.  This
         * branch also handles E == 0, in which case the second tap has
         * weight 0. */
        const int E = B + C;
        const int step = C ? stride : 1;

        for (i=0; i<h; i++) {
            __asm__ volatile (
                "ldl $2, %2                 \r\n"
                "ldr $2, %1                 \r\n"
                "ldl $3, %4                 \r\n"
                "ldr $3, %3                 \r\n"
                "daddiu $4, $0, 32          \r\n"
                "mtc1 %5, $f6               \r\n"
                "mtc1 %6, $f8               \r\n"
                "mtc1 $0, $f20              \r\n"
                "mtc1 $2, $f2               \r\n"
                "mtc1 $3, $f4               \r\n"
                "mtc1 $4, $f10              \r\n"
                "punpcklbh $f2, $f2, $f20   \r\n"
                "punpcklbh $f4, $f4, $f20   \r\n"
                "pshufh $f6, $f6, $f20      \r\n"
                "pshufh $f8, $f8, $f20      \r\n"
                "pshufh $f10, $f10, $f20    \r\n"
                "daddiu $4, $0, 6           \r\n"
                "mtc1 $4, $f22              \r\n"
                "pmullh $f2, $f2, $f6       \r\n"
                "pmullh $f4, $f4, $f8       \r\n"
                "paddh $f2, $f2, $f10       \r\n"
                "paddh $f2, $f2, $f4        \r\n"
                "psrah $f2, $f2, $f22       \r\n"
                "packushb $f2, $f2, $f20    \r\n"
                "lwc1 $f4, %0               \r\n"
                "pavgb $f2, $f2, $f4        \r\n"
                "swc1 $f2, %0               \r\n"
                /* "+m": dst is loaded by lwc1 before it is stored. */
                : "+m"(*dst)
                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+step)),
                  "m"(*(src+step+7)),"r"(A),"r"(E)
                : "$2","$3","$4","$5","$6",
                  "$f2","$f4","$f6","$f8","$f10","$f20","$f22","memory"
            );

            dst += stride;
            src += stride;
        }
    }
}