]> git.sesse.net Git - ffmpeg/blob - libavcodec/sh4/idct_sh4.c
Merge remote-tracking branch 'qatar/master'
[ffmpeg] / libavcodec / sh4 / idct_sh4.c
1 /*
2  * idct for sh4
3  *
4  * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 #include "libavcodec/dsputil.h"
24 #include "dsputil_sh4.h"
25 #include "sh4.h"
26
/*
 * Cosine constants ck = sqrt(2)*cos(k*pi/16), k = 1..7.
 * They exist only to spell out the two coefficient tables below and are
 * #undef'd again immediately after the tables.
 */
27 #define c1      1.38703984532214752434  /* sqrt(2)*cos(1*pi/16) */
28 #define c2      1.30656296487637657577  /* sqrt(2)*cos(2*pi/16) */
29 #define c3      1.17587560241935884520  /* sqrt(2)*cos(3*pi/16) */
30 #define c4      1.00000000000000000000  /* sqrt(2)*cos(4*pi/16) */
31 #define c5      0.78569495838710234903  /* sqrt(2)*cos(5*pi/16) */
32 #define c6      0.54119610014619712324  /* sqrt(2)*cos(6*pi/16) */
33 #define c7      0.27589937928294311353  /* sqrt(2)*cos(7*pi/16) */
34
/*
 * 16 coefficients used by idct_sh4() for the even-indexed inputs
 * (elements 0, 2, 4, 6 of a row or column).  load_matrix() reads the
 * table front to back with eight register-pair loads.
 * NOTE(review): the 8-byte alignment is presumably required by those
 * pair-sized loads -- confirm against the SH-4 manual.
 * NOTE(review): each group of four values appears to be one matrix
 * column as seen by ftrv, i.e. the weights of one input in the four
 * outputs -- confirm against the FTRV definition.
 */
35 static const float even_table[] __attribute__ ((aligned(8))) = {
36         c4, c4, c4, c4,
37         c2, c6,-c6,-c2,
38         c4,-c4,-c4, c4,
39         c6,-c2, c2,-c6
40 };
41
/*
 * Same layout as even_table, but holding the coefficients applied to the
 * odd-indexed inputs (elements 1, 3, 5, 7 of a row or column).
 */
42 static const float odd_table[] __attribute__ ((aligned(8))) = {
43         c1, c3, c5, c7,
44         c3,-c7,-c1,-c5,
45         c5,-c1, c7, c3,
46         c7,-c5, c3,-c1
47 };
48
/* The constants are not needed past this point; keep the short names free. */
49 #undef  c1
50 #undef  c2
51 #undef  c3
52 #undef  c4
53 #undef  c5
54 #undef  c6
55 #undef  c7
56
/*
 * load_matrix(table): copy the coefficient table at `table` into the
 * register pairs xd0, xd2, ... xd14 with eight post-incrementing fmov
 * instructions.
 *
 * A local copy `t` of the pointer is passed as a read-write ("+r")
 * operand because the asm advances it; the caller's expression is not
 * modified.
 *
 * NOTE(review): the two fschg instructions presumably toggle the FPSCR
 * transfer-size bit so that each fmov moves a 64-bit pair, and then
 * toggle it back -- confirm against the SH-4 manual.
 * NOTE(review): xd0..xd14 are presumably the bank that ftrv addresses
 * as xmtrx -- confirm.
 * NOTE(review): the asm lists neither the xd registers as clobbers nor
 * the table memory as an input; it relies on `volatile` for ordering --
 * confirm this is sufficient for the compiler in use.
 */
57 #define         load_matrix(table) \
58     do { \
59         const float *t = table; \
60         __asm__ volatile( \
61         "       fschg\n" \
62         "       fmov   @%0+,xd0\n" \
63         "       fmov   @%0+,xd2\n" \
64         "       fmov   @%0+,xd4\n" \
65         "       fmov   @%0+,xd6\n" \
66         "       fmov   @%0+,xd8\n" \
67         "       fmov   @%0+,xd10\n" \
68         "       fmov   @%0+,xd12\n" \
69         "       fmov   @%0+,xd14\n" \
70         "       fschg\n" \
71         : "+r"(t) \
72         ); \
73     } while (0)
74
/*
 * ftrv(): issue the SH-4 "ftrv xmtrx,fv0" instruction.
 *
 * The four register variables fr0..fr3 declared by DEFREG are passed as
 * read-write operands, so the macro can only be used in a scope where
 * DEFREG has been expanded.  Per the SH-4 instruction set this multiplies
 * the 4x4 matrix previously stored by load_matrix() with the vector
 * (fr0, fr1, fr2, fr3) and leaves the product in the same registers
 * (see the SH-4 programming manual for the exact definition).
 *
 * The expansion is a single expression statement WITHOUT a trailing
 * semicolon; callers write `ftrv();`.  This keeps the macro safe inside
 * unbraced if/else bodies.
 */
#define         ftrv() \
                __asm__ volatile("ftrv xmtrx,fv0" \
                : "+f"(fr0),"+f"(fr1),"+f"(fr2),"+f"(fr3))
78
/*
 * DEFREG: declare four local float variables bound to the hardware
 * registers fr0..fr3 (GCC explicit register variables).
 *
 * The ftrv() macro refers to these variables by name, so DEFREG must be
 * expanded at the top of every function that uses ftrv().  The last
 * declaration carries no semicolon; the user supplies it (`DEFREG;`).
 */
79 #define         DEFREG        \
80         register float fr0 __asm__("fr0"); \
81         register float fr1 __asm__("fr1"); \
82         register float fr2 __asm__("fr2"); \
83         register float fr3 __asm__("fr3")
84
/*
 * DESCALE(x, n): scale the float expression x by 2^-n, i.e. x / (1 << n).
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand inside larger expressions (e.g. `a / DESCALE(b, n)`).
 */
#define         DESCALE(x,n)    ((x)*(1.0f/(1<<(n))))
86
87 /* This code works worse with gcc from CVS; gcc 3.2.3 works fine. */
88
89
90 //optimized
91
/*
 * In-place 8x8 inverse DCT built on the ftrv instruction.
 *
 * block: 64 coefficients laid out as 8 rows of 8; read as input and
 *        overwritten with the result.
 *
 * The transform runs in two passes over a float scratch buffer:
 *   1. row pass    - block -> tblock
 *   2. column pass - tblock -> block, scaled by 1/8 (DESCALE(...,3))
 * Each pass is split into an "even" half (inputs 0,2,4,6 times
 * even_table) and an "odd" half (inputs 1,3,5,7 times odd_table); the
 * eight outputs are even+odd for 0..3 and even-odd, mirrored, for 7..4.
 *
 * NOTE(review): fp_single_enter()/fp_single_leave() are not defined in
 * this file; presumably they save FPSCR in `fpscr`, select
 * single-precision mode and restore it afterwards -- confirm.
 * NOTE(review): the final stores convert float to DCTELEM by plain
 * assignment; if DCTELEM is an integer type this truncates instead of
 * rounding -- confirm that is intended.
 */
92 void idct_sh4(DCTELEM *block)
93 {
94         DEFREG;
95
96         int i;
97         float        tblock[8*8],*fblock;
        /* byte offsets (not element counts) of rows 2, 4 and 6, used via OA() */
98         int ofs1,ofs2,ofs3;
99         int fpscr;
100
101         fp_single_enter(fpscr);
102
103         /* row */
104
105         /* even part */
106         load_matrix(even_table);
107
        /*
         * fblock starts one past the first four slots of row 0 because the
         * results are stored with pre-decrement, filling tblock[8*r+3..8*r+0].
         */
108         fblock = tblock+4;
109         i = 8;
110         do {
111                 fr0 = block[0];
112                 fr1 = block[2];
113                 fr2 = block[4];
114                 fr3 = block[6];
115                 block+=8;
116                 ftrv();
117                 *--fblock = fr3;
118                 *--fblock = fr2;
119                 *--fblock = fr1;
120                 *--fblock = fr0;
                /* advance to "row start + 4" of the next row */
121                 fblock+=8+4;
122         } while(--i);
        /* rewind both pointers to the start of their buffers */
123         block-=8*8;
124         fblock-=8*8+4;
125
126         load_matrix(odd_table);
127
128         i = 8;
129
130         do {
131                 float t0,t1,t2,t3;
132                 fr0 = block[1];
133                 fr1 = block[3];
134                 fr2 = block[5];
135                 fr3 = block[7];
136                 block+=8;
137                 ftrv();
                /* fetch the even-part results stored by the previous loop */
138                 t0 = *fblock++;
139                 t1 = *fblock++;
140                 t2 = *fblock++;
141                 t3 = *fblock++;
                /* move to the end of the row, then fill it back to front: */
                /* slots 7..4 get even-odd (mirrored), slots 3..0 get even+odd */
142                 fblock+=4;
143                 *--fblock = t0 - fr0;
144                 *--fblock = t1 - fr1;
145                 *--fblock = t2 - fr2;
146                 *--fblock = t3 - fr3;
147                 *--fblock = t3 + fr3;
148                 *--fblock = t2 + fr2;
149                 *--fblock = t1 + fr1;
150                 *--fblock = t0 + fr0;
151                 fblock+=8;
152         } while(--i);
        /* rewind both pointers to the start of their buffers */
153         block-=8*8;
154         fblock-=8*8;
155
156         /* col */
157
158         /* even part */
159         load_matrix(even_table);
160
161         ofs1 = sizeof(float)*2*8;
162         ofs2 = sizeof(float)*4*8;
163         ofs3 = sizeof(float)*6*8;
164
165         i = 8;
166
        /*
         * OA(p, ofs): the float located `ofs` BYTES after p.  Note that the
         * macro is defined inside the function body and is never #undef'd.
         */
167 #define        OA(fblock,ofs)   *(float*)((char*)fblock + ofs)
168
        /* transform rows 0,2,4,6 of each column in place, one column per pass */
169         do {
170                 fr0 = OA(fblock,   0);
171                 fr1 = OA(fblock,ofs1);
172                 fr2 = OA(fblock,ofs2);
173                 fr3 = OA(fblock,ofs3);
174                 ftrv();
175                 OA(fblock,0   ) = fr0;
176                 OA(fblock,ofs1) = fr1;
177                 OA(fblock,ofs2) = fr2;
178                 OA(fblock,ofs3) = fr3;
179                 fblock++;
180         } while(--i);
        /* back to column 0 */
181         fblock-=8;
182
183         load_matrix(odd_table);
184
185         i=8;
186         do {
187                 float t0,t1,t2,t3;
                /* even-part results left in rows 0,2,4,6 by the loop above */
188                 t0 = OA(fblock,   0); /* [8*0] */
189                 t1 = OA(fblock,ofs1); /* [8*2] */
190                 t2 = OA(fblock,ofs2); /* [8*4] */
191                 t3 = OA(fblock,ofs3); /* [8*6] */
                /* step down one row so the same offsets address rows 1,3,5,7 */
192                 fblock+=8;
193                 fr0 = OA(fblock,   0); /* [8*1] */
194                 fr1 = OA(fblock,ofs1); /* [8*3] */
195                 fr2 = OA(fblock,ofs2); /* [8*5] */
196                 fr3 = OA(fblock,ofs3); /* [8*7] */
                /* undo the row step and advance to the next column */
197                 fblock+=-8+1;
198                 ftrv();
                /* combine halves, scale by 1/8 and write the column back to block */
199                 block[8*0] = DESCALE(t0 + fr0,3);
200                 block[8*7] = DESCALE(t0 - fr0,3);
201                 block[8*1] = DESCALE(t1 + fr1,3);
202                 block[8*6] = DESCALE(t1 - fr1,3);
203                 block[8*2] = DESCALE(t2 + fr2,3);
204                 block[8*5] = DESCALE(t2 - fr2,3);
205                 block[8*3] = DESCALE(t3 + fr3,3);
206                 block[8*4] = DESCALE(t3 - fr3,3);
207                 block++;
208         } while(--i);
209
210         fp_single_leave(fpscr);
211 }