]> git.sesse.net Git - ffmpeg/blob - libavcodec/mips/simple_idct_mmi.c
Merge commit '71d3305c2711d4f6ec8b92db09ff64cf4e19a58e'
[ffmpeg] / libavcodec / mips / simple_idct_mmi.c
1 /*
2  * Loongson SIMD optimized simple idct
3  *
4  * Copyright (c) 2015 Loongson Technology Corporation Limited
5  * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 #include "idctdsp_mips.h"
26 #include "constants.h"
27
/*
 * Fixed-point cosine constants: Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), turned
 * into an integer by adding 0.5 and truncating. C4 is the one exception: it
 * subtracts 0.5 instead, giving 16383 rather than 16384.
 * C1..C7 populate the coeffs table; C0 is not referenced by the code visible
 * in this part of the file.
 */
28 #define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
29 #define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
30 #define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
31 #define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
32 #define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (note: minus, not plus)
33 #define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
34 #define C6 8867  //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
35 #define C7 4520  //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
36
/*
 * Right-shift amounts for the two IDCT stages.
 * ROW_SHIFT also defines the rounding constant 1<<(ROW_SHIFT-1) stored in the
 * first two rows of coeffs. The assembly invocations visible in this file pass
 * the literals 11 and 20 as their shift argument instead of these macros, so
 * the values must be kept in sync by hand. COL_SHIFT is not referenced by
 * name in the code visible here.
 */
37 #define ROW_SHIFT 11
38 #define COL_SHIFT 20
39
/*
 * Constant table read by the inline assembly in ff_simple_idct_mmi().
 * Each source row below holds four int16_t values (8 bytes), so row k starts
 * at byte offset 8*k. Rows 0 and 1 are rounding constants; rows 2..13 are
 * (coefficient, coefficient) pairs repeated twice.
 * The coefficient comments on the assembly loads list a row's elements in the
 * reverse of the order written here (e.g. row "C4, -C4" is annotated
 * "-C4 C4 -C4 C4").
 * NOTE(review): the asm operand list is outside this view; the "N(%2)" loads
 * are annotated with the same coefficients as the row at byte offset N, so
 * operand %2 presumably points at this table -- confirm against the operand
 * list. DECLARE_ALIGNED(8, ...) presumably requests 8-byte alignment for the
 * 64-bit ldc1 loads -- confirm against the macro definition.
 */
40 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
41     1<<(ROW_SHIFT-1),   0, 1<<(ROW_SHIFT-1),   0, /* offset   0: rounder, passed as "(%2)" to Z_COND_IDCT  */
42     1<<(ROW_SHIFT-1),   1, 1<<(ROW_SHIFT-1),   0, /* offset   8: rounder, passed as "8(%2)" to DC_COND_IDCT; second element is 1, not 0 */
43                   C4,  C4,               C4,  C4, /* offset  16 */
44                   C4, -C4,               C4, -C4, /* offset  24 */
45                   C2,  C6,               C2,  C6, /* offset  32 */
46                   C6, -C2,               C6, -C2, /* offset  40 */
47                   C1,  C3,               C1,  C3, /* offset  48 */
48                   C5,  C7,               C5,  C7, /* offset  56 */
49                   C3, -C7,               C3, -C7, /* offset  64 */
50                  -C1, -C5,              -C1, -C5, /* offset  72 */
51                   C5, -C1,               C5, -C1, /* offset  80 */
52                   C7,  C3,               C7,  C3, /* offset  88 */
53                   C7, -C5,               C7, -C5, /* offset  96 */
54                   C3, -C1,               C3, -C1  /* offset 104 */
55 };
56
57 void ff_simple_idct_mmi(int16_t *block)
58 {
59         DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
60         int16_t * const temp= (int16_t*)align_tmp;
61
62         __asm__ volatile (
63 #undef  DC_COND_IDCT
64 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift)      \
65         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
66         "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
67         "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
68         "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
69         "ldc1 $f8, %3                   \n\t"                                \
70         "and  $f8, $f8, $f0             \n\t"                                \
71         "or $f8, $f8, $f2               \n\t"                                \
72         "or $f8, $f8, $f4               \n\t"                                \
73         "or $f8, $f8, $f6               \n\t"                                \
74         "packsswh $f8, $f8, $f8         \n\t"                                \
75         "li $11, " #shift "             \n\t"                                \
76         "mfc1 $10, $f8                  \n\t"                                \
77         "mtc1 $11, $f18                 \n\t"                                \
78         "beqz $10, 1f                   \n\t"                                \
79         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
80         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
81         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
82         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
83         "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
84         "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
85         "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
86         "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
87         "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
88         "ldc1 $f16, " #rarg "           \n\t"                                \
89         "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
90         #rounder " $f8, $f8, $f16       \n\t"                                \
91         "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
92         "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
93         "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
94         "ldc1 $f10, 56(%2)              \n\t" /* C7     C5      C7      C5 */\
95         "ldc1 $f16, " #rarg "           \n\t"                                \
96         "pmaddhw $f10, $f10, $f6        \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
97         #rounder " $f0, $f0, $f16       \n\t"                                \
98         "paddw $f2, $f2, $f0            \n\t" /* A1             a1         */\
99         "ldc1 $f16, 64(%2)              \n\t"                                \
100         "paddw $f0, $f0, $f0            \n\t"                                \
101         "psubw $f0, $f0, $f2            \n\t" /* A2             a2         */\
102         "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
103         "paddw $f14, $f14, $f10         \n\t" /* B0             b0         */\
104         "ldc1 $f10, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
105         "pmaddhw $f10, $f10, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
106         "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
107         "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
108         "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
109         "paddw $f10, $f10, $f4          \n\t" /* B1             b1         */\
110         "psraw $f14, $f14, $f18         \n\t"                                \
111         "psraw $f8, $f8, $f18           \n\t"                                \
112         "mov.d $f4, $f2                 \n\t" /* A1             a1         */\
113         "paddw $f2, $f2, $f10           \n\t" /* A1+B1          a1+b1      */\
114         "psubw $f4, $f4, $f10           \n\t" /* A1-B1          a1-b1      */\
115         "psraw $f2, $f2, $f18           \n\t"                                \
116         "psraw $f4, $f4, $f18           \n\t"                                \
117         "packsswh $f14, $f14, $f2       \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0   */\
118         "packsswh $f4, $f4, $f8         \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1   */\
119         "sdc1 $f14, " #dst "            \n\t"                                \
120         "ldc1 $f2, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
121         "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
122         "sdc1 $f4, 24+" #dst "          \n\t"                                \
123         "pmaddhw $f8, $f8, $f2          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
124         "ldc1 $f16, 96(%2)              \n\t"                                \
125         "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
126         "pmaddhw $f2, $f2, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
127         "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
128         "ldc1 $f16, 104(%2)             \n\t"                                \
129         "mov.d $f4, $f0                 \n\t" /* A2             a2         */\
130         "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
131         "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
132         "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
133         "psubw $f0, $f0, $f8            \n\t" /* a2-B2          a2-b2      */\
134         "psraw $f4, $f4, $f18           \n\t"                                \
135         "psraw $f0, $f0, $f18           \n\t"                                \
136         "mov.d $f8, $f12                \n\t" /* A3             a3         */\
137         "paddw $f6, $f6, $f2            \n\t" /* B3             b3         */\
138         "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
139         "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
140         "psraw $f12, $f12, $f18         \n\t"                                \
141         "packsswh $f4, $f4, $f12        \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2   */\
142         "sdc1 $f4, 8+" #dst "           \n\t"                                \
143         "psraw $f8, $f8, $f18           \n\t"                                \
144         "packsswh $f8, $f8, $f0         \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3   */\
145         "sdc1 $f8, 16+" #dst "          \n\t"                                \
146         "b 2f                           \n\t"                                \
147         "1:                             \n\t"                                \
148         "li $10, 16                     \n\t"                                \
149         "mtc1 $10, $f16                 \n\t"                                \
150         "psllw $f0, $f0, $f16           \n\t"                                \
151         "ldc1 $f16, %4                  \n\t"                                \
152         "paddw $f0, $f0, $f16           \n\t"                                \
153         "li $10, 13                     \n\t"                                \
154         "mtc1 $10, $f16                 \n\t"                                \
155         "psraw $f0, $f0, $f16           \n\t"                                \
156         "packsswh $f0, $f0, $f0         \n\t"                                \
157         "sdc1 $f0, " #dst "             \n\t"                                \
158         "sdc1 $f0, 8+" #dst "           \n\t"                                \
159         "sdc1 $f0, 16+" #dst "          \n\t"                                \
160         "sdc1 $f0, 24+" #dst "          \n\t"                                \
161         "2:                             \n\t"
162
163 #undef  Z_COND_IDCT
164 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift, bt)   \
165         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
166         "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
167         "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
168         "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
169         "mov.d $f8, $f0                 \n\t"                                \
170         "or $f8, $f8, $f2               \n\t"                                \
171         "or $f8, $f8, $f4               \n\t"                                \
172         "or $f8, $f8, $f6               \n\t"                                \
173         "packsswh $f8, $f8, $f8         \n\t"                                \
174         "mfc1 $10, $f8                  \n\t"                                \
175         "beqz $10, " #bt "              \n\t"                                \
176         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
177         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
178         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
179         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
180         "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
181         "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
182         "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
183         "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
184         "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
185         "ldc1 $f16, " #rarg "           \n\t"                                \
186         "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
187         #rounder " $f8, $f8, $f16       \n\t"                                \
188         "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
189         "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
190         "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
191         "ldc1 $f10, 56(%2)              \n\t" /* C7     C5      C7      C5 */\
192         "ldc1 $f16, " #rarg "           \n\t"                                \
193         "pmaddhw $f10, $f10, $f6        \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
194         #rounder " $f0, $f0, $f16       \n\t"                                \
195         "paddw $f2, $f2, $f0            \n\t" /* A1             a1         */\
196         "paddw $f0, $f0, $f0            \n\t"                                \
197         "ldc1 $f16, 64(%2)              \n\t"                                \
198         "psubw $f0, $f0, $f2            \n\t" /* A2             a2         */\
199         "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
200         "paddw $f14, $f14, $f10         \n\t" /* B0             b0         */\
201         "ldc1 $f10, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
202         "pmaddhw $f10, $f10, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
203         "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
204         "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
205         "li $10, " #shift "             \n\t"                                \
206         "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
207         "mtc1 $10, $f18                 \n\t"                                \
208         "paddw $f10, $f10, $f4          \n\t" /* B1             b1         */\
209         "psraw $f14, $f14, $f18         \n\t"                                \
210         "psraw $f8, $f8, $f18           \n\t"                                \
211         "mov.d $f4, $f2                 \n\t" /* A1             a1         */\
212         "paddw $f2, $f2, $f10           \n\t" /* A1+B1          a1+b1      */\
213         "psubw $f4, $f4, $f10           \n\t" /* A1-B1          a1-b1      */\
214         "psraw $f2, $f2, $f18           \n\t"                                \
215         "psraw $f4, $f4, $f18           \n\t"                                \
216         "packsswh $f14, $f14, $f2       \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0   */\
217         "packsswh $f4, $f4, $f8         \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1   */\
218         "sdc1 $f14, " #dst "            \n\t"                                \
219         "ldc1 $f2, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
220         "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
221         "sdc1 $f4, 24+" #dst "          \n\t"                                \
222         "pmaddhw $f8, $f8, $f2          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
223         "ldc1 $f16, 96(%2)              \n\t"                                \
224         "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
225         "pmaddhw $f2, $f2, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
226         "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
227         "ldc1 $f16, 104(%2)             \n\t"                                \
228         "mov.d $f4, $f0                 \n\t" /* A2             a2         */\
229         "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
230         "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
231         "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
232         "psubw $f0, $f0, $f8            \n\t" /* a2-B2          a2-b2      */\
233         "psraw $f4, $f4, $f18           \n\t"                                \
234         "psraw $f0, $f0, $f18           \n\t"                                \
235         "mov.d $f8, $f12                \n\t" /* A3             a3         */\
236         "paddw $f6, $f6, $f2            \n\t" /* B3             b3         */\
237         "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
238         "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
239         "psraw $f12, $f12, $f18         \n\t"                                \
240         "packsswh $f4, $f4, $f12        \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2   */\
241         "sdc1 $f4, 8+" #dst "           \n\t"                                \
242         "psraw $f8, $f8, $f18           \n\t"                                \
243         "packsswh $f8, $f8, $f0         \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3   */\
244         "sdc1 $f8, 16+" #dst "          \n\t"                                \
245
246         //IDCT(       src0,   src4,   src1,   src5,    dst,     rounder, shift)
247         DC_COND_IDCT(0(%0),  8(%0), 16(%0), 24(%0),  0(%1), paddw,8(%2), 11)
248         Z_COND_IDCT(32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddw,(%2), 11, 4f)
249         Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddw,(%2), 11, 2f)
250         Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1), paddw,(%2), 11, 1f)
251
252 #undef  IDCT
253 #define IDCT(src0, src4, src1, src5, dst, shift)                             \
254         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
255         "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
256         "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
257         "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
258         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
259         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
260         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
261         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
262         "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
263         "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
264         "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
265         "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
266         "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
267         "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
268         "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
269         "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
270         "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
271         "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
272         "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
273         "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
274         "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
275         "ldc1 $f16, 64(%2)              \n\t"                                \
276         "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
277         "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
278         "li $10, " #shift "             \n\t"                                \
279         "paddw $f14, $f14, $f2          \n\t" /* B0             b0         */\
280         "ldc1 $f2, 72(%2)               \n\t" /* -C5    -C1     -C5    -C1 */\
281         "mtc1 $10, $f18                 \n\t"                                \
282         "pmaddhw $f2, $f2, $f6          \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
283         "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
284         "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
285         "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
286         "paddw $f2, $f2, $f4            \n\t" /* B1             b1         */\
287         "psraw $f14, $f14, $f18         \n\t"                                \
288         "psraw $f8, $f8, $f18           \n\t"                                \
289         "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
290         "paddw $f0, $f0, $f2            \n\t" /* A1+B1          a1+b1      */\
291         "psubw $f4, $f4, $f2            \n\t" /* A1-B1          a1-b1      */\
292         "psraw $f0, $f0, $f18           \n\t"                                \
293         "psraw $f4, $f4, $f18           \n\t"                                \
294         "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
295         "swc1 $f14, " #dst "            \n\t"                                \
296         "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
297         "swc1 $f0, 16+" #dst "          \n\t"                                \
298         "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
299         "swc1 $f4, 96+" #dst "          \n\t"                                \
300         "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
301         "swc1 $f8, 112+" #dst "         \n\t"                                \
302         "ldc1 $f0, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
303         "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
304         "pmaddhw $f8, $f8, $f0          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
305         "ldc1 $f16, 96(%2)              \n\t"                                \
306         "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
307         "pmaddhw $f0, $f0, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
308         "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
309         "ldc1 $f16, 104(%2)             \n\t"                                \
310         "mov.d $f4, $f10                \n\t" /* A2             a2         */\
311         "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
312         "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
313         "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
314         "psubw $f10, $f10, $f8          \n\t" /* a2-B2          a2-b2      */\
315         "psraw $f4, $f4, $f18           \n\t"                                \
316         "psraw $f10, $f10, $f18         \n\t"                                \
317         "mov.d $f8, $f12                \n\t" /* A3             a3         */\
318         "paddw $f6, $f6, $f0            \n\t" /* B3             b3         */\
319         "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
320         "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
321         "psraw $f12, $f12, $f18         \n\t"                                \
322         "psraw $f8, $f8, $f18           \n\t"                                \
323         "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
324         "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
325         "swc1 $f4, 32+" #dst "          \n\t"                                \
326         "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
327         "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
328         "swc1 $f12, 48+" #dst "         \n\t"                                \
329         "swc1 $f8, 64+" #dst "          \n\t"                                \
330         "swc1 $f10, 80+" #dst "         \n\t"
331
332         //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
333         IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
334         IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
335         IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
336         IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
337         "b 9f                           \n\t"
338
339         "# .p2align 4                   \n\t"
340         "4:                             \n\t"
341         Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddw,(%2), 11, 6f)
342         Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 5f)
343
344 #undef  IDCT
345 #define IDCT(src0, src4, src1, src5, dst, shift)                             \
346         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
347         "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
348         "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
349         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
350         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
351         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
352         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
353         "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
354         "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
355         "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
356         "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
357         "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
358         "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
359         "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
360         "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
361         "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
362         "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
363         "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
364         "li $10, " #shift "             \n\t"                                \
365         "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
366         "ldc1 $f14, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
367         "mtc1 $10, $f18                 \n\t"                                \
368         "pmaddhw $f14, $f14, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
369         "paddw $f2, $f2, $f8            \n\t" /* A0+B0          a0+b0      */\
370         "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
371         "psubw $f8, $f8, $f2            \n\t" /* A0-B0          a0-b0      */\
372         "psraw $f2, $f2, $f18           \n\t"                                \
373         "psraw $f8, $f8, $f18           \n\t"                                \
374         "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
375         "paddw $f0, $f0, $f14           \n\t" /* A1+B1          a1+b1      */\
376         "psubw $f4, $f4, $f14           \n\t" /* A1-B1          a1-b1      */\
377         "psraw $f0, $f0, $f18           \n\t"                                \
378         "psraw $f4, $f4, $f18           \n\t"                                \
379         "packsswh $f2, $f2, $f2         \n\t" /* A0+B0          a0+b0      */\
380         "swc1 $f2, " #dst "             \n\t"                                \
381         "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
382         "swc1 $f0, 16+" #dst "          \n\t"                                \
383         "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
384         "swc1 $f4, 96+" #dst "          \n\t"                                \
385         "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
386         "swc1 $f8, 112+" #dst "         \n\t"                                \
387         "ldc1 $f2, 88(%2)               \n\t" /* C3     C7      C3      C7 */\
388         "ldc1 $f16, 104(%2)             \n\t"                                \
389         "pmaddhw $f2, $f2, $f6          \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
390         "mov.d $f4, $f10                \n\t" /* A2             a2         */\
391         "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
392         "paddw $f4, $f4, $f2            \n\t" /* A2+B2          a2+b2      */\
393         "psubw $f10, $f10, $f2          \n\t" /* a2-B2          a2-b2      */\
394         "psraw $f4, $f4, $f18           \n\t"                                \
395         "psraw $f10, $f10, $f18         \n\t"                                \
396         "mov.d $f2, $f12                \n\t" /* A3             a3         */\
397         "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
398         "psubw $f2, $f2, $f6            \n\t" /* a3-B3          a3-b3      */\
399         "psraw $f12, $f12, $f18         \n\t"                                \
400         "psraw $f2, $f2, $f18           \n\t"                                \
401         "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
402         "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
403         "swc1 $f4, 32+" #dst "          \n\t"                                \
404         "packsswh $f2, $f2, $f2         \n\t" /* A3-B3          a3-b3      */\
405         "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
406         "swc1 $f12, 48+" #dst "         \n\t"                                \
407         "swc1 $f2, 64+" #dst "          \n\t"                                \
408         "swc1 $f10, 80+" #dst "         \n\t"
409
410         //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
411         IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
412         IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
413         IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
414         IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
415         "b 9f                           \n\t"
416
417         "# .p2align 4                   \n\t"
418         "6:                             \n\t"
419         Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 7f)
420
421 #undef  IDCT
/*
 * IDCT variant that loads only src0 (R0/R4 in the comments below) and src5
 * (R5/R7); the src4 and src1 arguments are accepted but never referenced.
 * The A terms therefore reduce to the two C4 products (A0 == A3, A1 == A2)
 * and the B terms to the src5 products alone.  Every sum/difference is
 * shifted right by `shift`, packed with packsswh and written with swc1 as
 * one 32-bit word to dst + 0/16/32/48/64/80/96/112.
 */
422 #define IDCT(src0, src4, src1, src5, dst, shift)                             \
423         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
424         "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
425         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
426         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
427         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
428         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
429         "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
430         "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
431         "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
432         "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
433         "ldc1 $f14, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
434         "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
435         "pmaddhw $f14, $f14, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
436         "paddw $f2, $f2, $f8            \n\t" /* A0+B0          a0+b0      */\
437         "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
438         "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
439         "psubw $f8, $f8, $f2            \n\t" /* A0-B0          a0-b0      */\
440         "psraw $f2, $f2, $f18           \n\t"                                \
441         "psraw $f8, $f8, $f18           \n\t"                                \
442         "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
443         "paddw $f0, $f0, $f14           \n\t" /* A1+B1          a1+b1      */\
444         "psubw $f4, $f4, $f14           \n\t" /* A1-B1          a1-b1      */\
445         "psraw $f0, $f0, $f18           \n\t"                                \
446         "psraw $f4, $f4, $f18           \n\t"                                \
447         "packsswh $f2, $f2, $f2         \n\t" /* A0+B0          a0+b0      */\
448         "swc1 $f2, " #dst "             \n\t"                                \
449         "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
450         "swc1 $f0, 16+" #dst "          \n\t"                                \
451         "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
452         "swc1 $f4, 96+" #dst "          \n\t"                                \
453         "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
454         "swc1 $f8, 112+" #dst "         \n\t"                                \
455         "ldc1 $f2, 88(%2)               \n\t" /* C3     C7      C3      C7 */\
456         "ldc1 $f16, 104(%2)             \n\t" /* -C1    C3      -C1     C3 */\
457         "pmaddhw $f2, $f2, $f6          \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
458         "mov.d $f4, $f10                \n\t" /* A2             a2         */\
459         "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
460         "paddw $f4, $f4, $f2            \n\t" /* A2+B2          a2+b2      */\
461         "psubw $f10, $f10, $f2          \n\t" /* A2-B2          a2-b2      */\
462         "psraw $f4, $f4, $f18           \n\t"                                \
463         "psraw $f10, $f10, $f18         \n\t"                                \
464         "mov.d $f2, $f12                \n\t" /* A3             a3         */\
465         "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
466         "psubw $f2, $f2, $f6            \n\t" /* A3-B3          a3-b3      */\
467         "psraw $f12, $f12, $f18         \n\t"                                \
468         "psraw $f2, $f2, $f18           \n\t"                                \
469         "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
470         "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
471         "swc1 $f4, 32+" #dst "          \n\t"                                \
472         "packsswh $f2, $f2, $f2         \n\t" /* A3-B3          a3-b3      */\
473         "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
474         "swc1 $f12, 48+" #dst "         \n\t"                                \
475         "swc1 $f2, 64+" #dst "          \n\t"                                \
476         "swc1 $f10, 80+" #dst "         \n\t"
477
478         //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
479         IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
480         IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
481         IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
482         IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
483         "b 9f                           \n\t"
484
485         "# .p2align 4                   \n\t"
486         "2:                             \n\t"
487         Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 3f)
488
489 #undef  IDCT
/*
 * IDCT variant that loads src0 (R0/R4), src1 (R1/R3) and src5 (R5/R7); the
 * src4 argument is accepted but never referenced, so the A terms come from
 * the two C4 products only (A0 == A3, A1 == A2).  Each B term is the sum of
 * one src1 product and one src5 product.  Every sum/difference is shifted
 * right by `shift`, packed with packsswh and written with swc1 as one
 * 32-bit word to dst + 0/16/32/48/64/80/96/112.
 */
490 #define IDCT(src0, src4, src1, src5, dst, shift)                             \
491         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
492         "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
493         "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
494         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
495         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
496         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
497         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
498         "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
499         "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
500         "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
501         "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
502         "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
503         "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
504         "ldc1 $f16, 64(%2)              \n\t" /* -C7    C3      -C7     C3 */\
505         "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
506         "paddw $f14, $f14, $f2          \n\t" /* B0             b0         */\
507         "ldc1 $f2, 72(%2)               \n\t" /* -C5    -C1     -C5    -C1 */\
508         "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
509         "pmaddhw $f2, $f2, $f6          \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
510         "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
511         "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
512         "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
513         "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
514         "paddw $f2, $f2, $f4            \n\t" /* B1             b1         */\
515         "psraw $f14, $f14, $f18         \n\t"                                \
516         "psraw $f8, $f8, $f18           \n\t"                                \
517         "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
518         "paddw $f0, $f0, $f2            \n\t" /* A1+B1          a1+b1      */\
519         "psubw $f4, $f4, $f2            \n\t" /* A1-B1          a1-b1      */\
520         "psraw $f0, $f0, $f18           \n\t"                                \
521         "psraw $f4, $f4, $f18           \n\t"                                \
522         "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
523         "swc1 $f14, " #dst "            \n\t"                                \
524         "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
525         "swc1 $f0, 16+" #dst "          \n\t"                                \
526         "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
527         "swc1 $f4, 96+" #dst "          \n\t"                                \
528         "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
529         "swc1 $f8, 112+" #dst "         \n\t"                                \
530         "ldc1 $f0, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
531         "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
532         "pmaddhw $f8, $f8, $f0          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
533         "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
534         "ldc1 $f16, 96(%2)              \n\t" /* -C5    C7      -C5     C7 */\
535         "pmaddhw $f0, $f0, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
536         "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
537         "mov.d $f4, $f10                \n\t" /* A2             a2         */\
538         "ldc1 $f16, 104(%2)             \n\t" /* -C1    C3      -C1     C3 */\
539         "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
540         "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
541         "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
542         "psubw $f10, $f10, $f8          \n\t" /* A2-B2          a2-b2      */\
543         "psraw $f4, $f4, $f18           \n\t"                                \
544         "psraw $f10, $f10, $f18         \n\t"                                \
545         "mov.d $f8, $f12                \n\t" /* A3             a3         */\
546         "paddw $f6, $f6, $f0            \n\t" /* B3             b3         */\
547         "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
548         "psubw $f8, $f8, $f6            \n\t" /* A3-B3          a3-b3      */\
549         "psraw $f12, $f12, $f18         \n\t"                                \
550         "psraw $f8, $f8, $f18           \n\t"                                \
551         "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
552         "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
553         "swc1 $f4, 32+" #dst "          \n\t"                                \
554         "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
555         "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
556         "swc1 $f12, 48+" #dst "         \n\t"                                \
557         "swc1 $f8, 64+" #dst "          \n\t"                                \
558         "swc1 $f10, 80+" #dst "         \n\t"
559
560         //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
561         IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
562         IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
563         IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
564         IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
565         "b 9f                           \n\t"
566
567         "# .p2align 4                   \n\t"
568         "3:                             \n\t"
569
570 #undef  IDCT
/*
 * IDCT variant that loads only src0 (R0/R4) and src1 (R1/R3); the src4 and
 * src5 arguments are accepted but never referenced.  The A terms come from
 * the two C4 products only (A0 == A3, A1 == A2) and each B term is a single
 * src1 product.  Every sum/difference is shifted right by `shift`, packed
 * with packsswh and written with swc1 as one 32-bit word to
 * dst + 0/16/32/48/64/80/96/112.
 */
571 #define IDCT(src0, src4, src1, src5, dst, shift)                             \
572         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
573         "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
574         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
575         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
576         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
577         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
578         "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
579         "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
580         "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
581         "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
582         "ldc1 $f6, 64(%2)               \n\t" /* -C7    C3      -C7     C3 */\
583         "pmaddhw $f6, $f6, $f4          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
584         "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
585         "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
586         "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
587         "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
588         "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
589         "psraw $f14, $f14, $f18         \n\t"                                \
590         "psraw $f8, $f8, $f18           \n\t"                                \
591         "mov.d $f2, $f0                 \n\t" /* A1             a1         */\
592         "paddw $f0, $f0, $f6            \n\t" /* A1+B1          a1+b1      */\
593         "psubw $f2, $f2, $f6            \n\t" /* A1-B1          a1-b1      */\
594         "psraw $f0, $f0, $f18           \n\t"                                \
595         "psraw $f2, $f2, $f18           \n\t"                                \
596         "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
597         "swc1 $f14, " #dst "            \n\t"                                \
598         "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
599         "swc1 $f0, 16+" #dst "          \n\t"                                \
600         "packsswh $f2, $f2, $f2         \n\t" /* A1-B1          a1-b1      */\
601         "swc1 $f2, 96+" #dst "          \n\t"                                \
602         "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
603         "swc1 $f8, 112+" #dst "         \n\t"                                \
604         "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
605         "ldc1 $f16, 96(%2)              \n\t" /* -C5    C7      -C5     C7 */\
606         "pmaddhw $f8, $f8, $f4          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
607         "pmaddhw $f4, $f4, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
608         "mov.d $f2, $f10                \n\t" /* A2             a2         */\
609         "paddw $f2, $f2, $f8            \n\t" /* A2+B2          a2+b2      */\
610         "psubw $f10, $f10, $f8          \n\t" /* A2-B2          a2-b2      */\
611         "psraw $f2, $f2, $f18           \n\t"                                \
612         "psraw $f10, $f10, $f18         \n\t"                                \
613         "mov.d $f8, $f12                \n\t" /* A3             a3         */\
614         "paddw $f12, $f12, $f4          \n\t" /* A3+B3          a3+b3      */\
615         "psubw $f8, $f8, $f4            \n\t" /* A3-B3          a3-b3      */\
616         "psraw $f12, $f12, $f18         \n\t"                                \
617         "psraw $f8, $f8, $f18           \n\t"                                \
618         "packsswh $f2, $f2, $f2         \n\t" /* A2+B2          a2+b2      */\
619         "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
620         "swc1 $f2, 32+" #dst "          \n\t"                                \
621         "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
622         "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
623         "swc1 $f12, 48+" #dst "         \n\t"                                \
624         "swc1 $f8, 64+" #dst "          \n\t"                                \
625         "swc1 $f10, 80+" #dst "         \n\t"
626
627         //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
628         IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
629         IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
630         IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
631         IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
632         "b 9f                           \n\t"
633
634         "# .p2align 4                   \n\t"
635         "5:                             \n\t"
636
637 #undef  IDCT
/*
 * Even-only IDCT variant: loads src0 (R0/R4) and src4 (R2/R6) plus the
 * adjacent doublewords at 8+src0 and 8+src4; the src1 and src5 arguments are
 * accepted but never referenced, so no B terms are computed.  The A terms of
 * both doublewords are shifted right by `shift` and packed together with
 * packsswh, then each packed result is written twice with sdc1 (64-bit
 * store): A0 to dst+0 and dst+112, A1 to dst+16 and dst+96, A2 to dst+32
 * and dst+80, A3 to dst+48 and dst+64.
 */
638 #define IDCT(src0, src4, src1, src5, dst, shift)                             \
639         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
640         "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
641         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
642         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
643         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
644         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
645         "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
646         "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
647         "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
648         "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
649         "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
650         "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
651         "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
652         "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
653         "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
654         "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
655         "ldc1 $f4, 8+" #src0 "          \n\t" /* R4     R0      r4      r0 */\
656         "ldc1 $f6, 8+" #src4 "          \n\t" /* R6     R2      r6      r2 */\
657         "ldc1 $f2, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
658         "pmaddhw $f2, $f2, $f4          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
659         "ldc1 $f14, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
660         "pmaddhw $f4, $f4, $f14         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
661         "ldc1 $f14, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
662         "ldc1 $f16, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
663         "pmaddhw $f14, $f14, $f6        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
664         "pmaddhw $f6, $f6, $f16         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
665         "paddw $f14, $f14, $f2          \n\t" /* A0             a0         */\
666         "paddw $f2, $f2, $f2            \n\t" /* 2C0            2c0        */\
667         "psubw $f2, $f2, $f14           \n\t" /* A3             a3         */\
668         "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
669         "paddw $f6, $f6, $f4            \n\t" /* A1             a1         */\
670         "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
671         "paddw $f4, $f4, $f4            \n\t" /* 2C1            2c1        */\
672         "psubw $f4, $f4, $f6            \n\t" /* A2             a2         */\
673         "psraw $f8, $f8, $f18           \n\t"                                \
674         "psraw $f14, $f14, $f18         \n\t"                                \
675         "psraw $f6, $f6, $f18           \n\t"                                \
676         "packsswh $f8, $f8, $f14        \n\t" /* A0             a0         */\
677         "sdc1 $f8, " #dst "             \n\t"                                \
678         "psraw $f0, $f0, $f18           \n\t"                                \
679         "packsswh $f0, $f0, $f6         \n\t" /* A1             a1         */\
680         "sdc1 $f0, 16+" #dst "          \n\t"                                \
681         "sdc1 $f0, 96+" #dst "          \n\t"                                \
682         "sdc1 $f8, 112+" #dst "         \n\t"                                \
683         "psraw $f10, $f10, $f18         \n\t"                                \
684         "psraw $f12, $f12, $f18         \n\t"                                \
685         "psraw $f4, $f4, $f18           \n\t"                                \
686         "packsswh $f10, $f10, $f4       \n\t" /* A2             a2         */\
687         "sdc1 $f10, 32+" #dst "         \n\t"                                \
688         "psraw $f2, $f2, $f18           \n\t"                                \
689         "packsswh $f12, $f12, $f2       \n\t" /* A3             a3         */\
690         "sdc1 $f12, 48+" #dst "         \n\t"                                \
691         "sdc1 $f12, 64+" #dst "         \n\t"                                \
692         "sdc1 $f10, 80+" #dst "         \n\t"
693
694         //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
695         IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
696         IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
697         "b 9f                           \n\t"
698
699         "# .p2align 4                   \n\t"
700         "1:                             \n\t"
701
702 #undef  IDCT
/*
 * IDCT variant that loads src0 (R0/R4), src4 (R2/R6) and src1 (R1/R3); the
 * src5 argument is accepted but never referenced.  The A terms use both the
 * C4 and the C2/C6 products, while each B term is a single src1 product.
 * Every sum/difference is shifted right by `shift`, packed with packsswh
 * and written with swc1 as one 32-bit word to
 * dst + 0/16/32/48/64/80/96/112.
 */
703 #define IDCT(src0, src4, src1, src5, dst, shift)                             \
704         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
705         "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
706         "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
707         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
708         "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
709         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
710         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
711         "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
712         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
713         "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
714         "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
715         "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
716         "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
717         "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
718         "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
719         "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
720         "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
721         "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
722         "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
723         "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
724         "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
725         "ldc1 $f2, 64(%2)               \n\t" /* -C7    C3      -C7     C3 */\
726         "pmaddhw $f2, $f2, $f4          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
727         "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
728         "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
729         "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
730         "psraw $f14, $f14, $f18         \n\t"                                \
731         "psraw $f8, $f8, $f18           \n\t"                                \
732         "mov.d $f6, $f0                 \n\t" /* A1             a1         */\
733         "paddw $f0, $f0, $f2            \n\t" /* A1+B1          a1+b1      */\
734         "psubw $f6, $f6, $f2            \n\t" /* A1-B1          a1-b1      */\
735         "psraw $f0, $f0, $f18           \n\t"                                \
736         "psraw $f6, $f6, $f18           \n\t"                                \
737         "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
738         "swc1 $f14, " #dst "            \n\t"                                \
739         "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
740         "swc1 $f0, 16+" #dst "          \n\t"                                \
741         "packsswh $f6, $f6, $f6         \n\t" /* A1-B1          a1-b1      */\
742         "swc1 $f6, 96+" #dst "          \n\t"                                \
743         "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
744         "swc1 $f8, 112+" #dst "         \n\t"                                \
745         "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
746         "ldc1 $f16, 96(%2)              \n\t" /* -C5    C7      -C5     C7 */\
747         "pmaddhw $f8, $f8, $f4          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
748         "pmaddhw $f4, $f4, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
749         "mov.d $f6, $f10                \n\t" /* A2             a2         */\
750         "paddw $f6, $f6, $f8            \n\t" /* A2+B2          a2+b2      */\
751         "psubw $f10, $f10, $f8          \n\t" /* A2-B2          a2-b2      */\
752         "psraw $f6, $f6, $f18           \n\t"                                \
753         "psraw $f10, $f10, $f18         \n\t"                                \
754         "mov.d $f8, $f12                \n\t" /* A3             a3         */\
755         "paddw $f12, $f12, $f4          \n\t" /* A3+B3          a3+b3      */\
756         "psubw $f8, $f8, $f4            \n\t" /* A3-B3          a3-b3      */\
757         "psraw $f12, $f12, $f18         \n\t"                                \
758         "packsswh $f6, $f6, $f6         \n\t" /* A2+B2          a2+b2      */\
759         "swc1 $f6, 32+" #dst "          \n\t"                                \
760         "psraw $f8, $f8, $f18           \n\t"                                \
761         "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
762         "swc1 $f12, 48+" #dst "         \n\t"                                \
763         "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
764         "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
765         "swc1 $f8, 64+" #dst "          \n\t"                                \
766         "swc1 $f10, 80+" #dst "         \n\t"
767
768         //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
769         IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
770         IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
771         IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
772         IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
773         "b 9f                           \n\t"
774
775         "# .p2align 4                   \n\t"
776         "7:                             \n\t"
777
778 #undef  IDCT
/*
 * Smallest IDCT variant: loads only src0 and the adjacent doubleword at
 * 8+src0 (R0/R4 per the comments below); src4, src1 and src5 are accepted
 * but never referenced.  Only two distinct results exist per column,
 * C4R4+C4R0 and -C4R4+C4R0.  Both are shifted right by `shift`, the two
 * doublewords are packed together with packsswh, and the packed values are
 * replicated with sdc1 (64-bit store): the first to dst + 0/48/64/112, the
 * second to dst + 16/32/80/96.
 */
779 #define IDCT(src0, src4, src1, src5, dst, shift)                             \
780         "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
781         "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
782         "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
783         "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
784         "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
785         "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
786         "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
787         "psraw $f8, $f8, $f18           \n\t"                                \
788         "psraw $f0, $f0, $f18           \n\t"                                \
789         "ldc1 $f4, 8+" #src0 "          \n\t" /* R4     R0      r4      r0 */\
790         "ldc1 $f2, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
791         "pmaddhw $f2, $f2, $f4          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
792         "ldc1 $f14, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
793         "pmaddhw $f4, $f4, $f14         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
794         "ldc1 $f14, 32(%2)              \n\t" /* C6 C2; $f14 not read below */\
795         "psraw $f2, $f2, $f18           \n\t"                                \
796         "packsswh $f8, $f8, $f2         \n\t" /* A0             a0         */\
797         "sdc1 $f8, " #dst "             \n\t"                                \
798         "psraw $f4, $f4, $f18           \n\t"                                \
799         "packsswh $f0, $f0, $f4         \n\t" /* A1             a1         */\
800         "sdc1 $f0, 16+" #dst "          \n\t"                                \
801         "sdc1 $f0, 96+" #dst "          \n\t"                                \
802         "sdc1 $f8, 112+" #dst "         \n\t"                                \
803         "sdc1 $f0, 32+" #dst "          \n\t"                                \
804         "sdc1 $f8, 48+" #dst "          \n\t"                                \
805         "sdc1 $f8, 64+" #dst "          \n\t"                                \
806         "sdc1 $f0, 80+" #dst "          \n\t"
807
808         //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
809         IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
810         IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
811
812         "9:                             \n\t"
813         ::"r"(block),"r"(temp),"r"(coeffs),"m"(ff_wm1010),"m"(ff_d40000)
814         : "$10","$11"
815     );
816 }