1 /*****************************************************************************
\r
3 * XVID MPEG-4 VIDEO CODEC
\r
6 * Copyright (C) 2006-2011 Xvid Solutions GmbH
\r
8 * This program is free software ; you can redistribute it and/or modify
\r
9 * it under the terms of the GNU General Public License as published by
\r
10 * the Free Software Foundation ; either version 2 of the License, or
\r
11 * (at your option) any later version.
\r
13 * This program is distributed in the hope that it will be useful,
\r
14 * but WITHOUT ANY WARRANTY ; without even the implied warranty of
\r
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
16 * GNU General Public License for more details.
\r
18 * You should have received a copy of the GNU General Public License
\r
19 * along with this program ; if not, write to the Free Software
\r
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
22 * $Id: fdct.c 1986 2011-05-18 09:07:40Z Isibaar $
\r
24 ****************************************************************************/
\r
29 * "Fast and precise" LLM implementation of FDCT/IDCT, where
\r
30 * rotations are decomposed using:
\r
32 * x' = tmp + y.(sin t - cos t)
\r
33 * y' = tmp - x.(sin t + cos t)
\r
35 * See details at the end of this file...
\r
38 * Loeffler C., Ligtenberg A., and Moschytz C.S.:
\r
39 * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
\r
40 * Proc. ICASSP 1989, 988-991.
\r
42 * IEEE-1180-like error specs for FDCT:
\r
43 * Peak error: 1.0000
\r
45 * Overall MSE: 0.0200
\r
47 * Overall ME: -0.0033
\r
49 ********************************************************/
\r
55 /* function pointer */
\r
59 //////////////////////////////////////////////////////////
\r
62 #define BUTF(a, b, tmp) \
\r
67 #define LOAD_BUTF(m1, m2, a, b, tmp, S) \
\r
68 (m1) = (S)[(a)] + (S)[(b)]; \
\r
69 (m2) = (S)[(a)] - (S)[(b)]
\r
71 #define ROTATE(m1,m2,c,k1,k2,tmp,Fix,Rnd) \
\r
72 (tmp) = ( (m1) + (m2) )*(c); \
\r
76 (m1) = ((m1)+(tmp))>>(Fix); \
\r
77 (m2) = ((m2)+(tmp))>>(Fix);
\r
79 #define ROTATE2(m1,m2,c,k1,k2,tmp) \
\r
80 (tmp) = ( (m1) + (m2) )*(c); \
\r
83 (m1) = (m1)+(tmp); \
\r
86 #define ROTATE0(m1,m2,c,k1,k2,tmp) \
\r
87 (m1) = ( (m2) )*(c); \
\r
88 (m2) = (m2)*k2+(m1);
\r
90 #define SHIFTL(x,n) ((x)<<(n))
\r
91 #define SHIFTR(x, n) ((x)>>(n))
\r
92 #define HALF(n) (1<<((n)-1))
\r
100 #define ROT6_C 35468
\r
101 #define ROT6_SmC 50159
\r
102 #define ROT6_SpC 121095
\r
103 #define ROT17_C 77062
\r
104 #define ROT17_SmC 25571
\r
105 #define ROT17_SpC 128553
\r
106 #define ROT37_C 58981
\r
107 #define ROT37_SmC 98391
\r
108 #define ROT37_SpC 19571
\r
109 #define ROT13_C 167963
\r
110 #define ROT13_SmC 134553
\r
111 #define ROT13_SpC 201373
\r
116 #define FX(x) ( (int)floor((x)*(1<<FIX) + .5 ) )
\r
118 static const double c1 = cos(1.*M_PI/16);
\r
119 static const double c2 = cos(2.*M_PI/16);
\r
120 static const double c3 = cos(3.*M_PI/16);
\r
121 static const double c4 = cos(4.*M_PI/16);
\r
122 static const double c5 = cos(5.*M_PI/16);
\r
123 static const double c6 = cos(6.*M_PI/16);
\r
124 static const double c7 = cos(7.*M_PI/16);
\r
126 static const int ROT6_C = FX(c2-c6); // 0.541
\r
127 static const int ROT6_SmC = FX(2*c6); // 0.765
\r
128 static const int ROT6_SpC = FX(2*c2); // 1.847
\r
130 static const int ROT17_C = FX(c1+c7); // 1.175
\r
131 static const int ROT17_SmC = FX(2*c7); // 0.390
\r
132 static const int ROT17_SpC = FX(2*c1); // 1.961
\r
134 static const int ROT37_C = FX((c3-c7)/c4); // 0.899
\r
135 static const int ROT37_SmC = FX(2*(c5+c7)); // 1.501
\r
136 static const int ROT37_SpC = FX(2*(c1-c3)); // 0.298
\r
138 static const int ROT13_C = FX((c1+c3)/c4); // 2.562
\r
139 static const int ROT13_SmC = FX(2*(c3+c7)); // 2.053
\r
140 static const int ROT13_SpC = FX(2*(c1+c5)); // 3.072
\r
145 //////////////////////////////////////////////////////////
\r
146 // Forward transform
\r
149 void fdct_int32( short *const In )
\r
157 int mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, Spill;
\r
161 LOAD_BUTF(mm1,mm6, 1, 6, mm0, pIn);
\r
162 LOAD_BUTF(mm2,mm5, 2, 5, mm0, pIn);
\r
163 LOAD_BUTF(mm3,mm4, 3, 4, mm0, pIn);
\r
164 LOAD_BUTF(mm0,mm7, 0, 7, Spill, pIn);
\r
166 BUTF(mm1, mm2, Spill);
\r
167 BUTF(mm0, mm3, Spill);
\r
169 ROTATE(mm3, mm2, ROT6_C, ROT6_SmC, -ROT6_SpC, Spill, FIX-FPASS, HALF(FIX-FPASS));
\r
173 BUTF(mm0, mm1, Spill);
\r
174 pIn[0] = SHIFTL(mm0, FPASS);
\r
175 pIn[4] = SHIFTL(mm1, FPASS);
\r
182 ROTATE(mm2, mm3, ROT17_C, -ROT17_SpC, -ROT17_SmC, mm0, FIX-FPASS, HALF(FIX-FPASS));
\r
183 ROTATE(mm4, mm7, -ROT37_C, ROT37_SpC, ROT37_SmC, mm0, FIX-FPASS, HALF(FIX-FPASS));
\r
189 ROTATE(mm5, mm6, -ROT13_C, ROT13_SmC, ROT13_SpC, mm0, FIX-FPASS, HALF(FIX-FPASS));
\r
201 int mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, Spill;
\r
205 LOAD_BUTF(mm1,mm6, 1*8, 6*8, mm0, pIn);
\r
206 LOAD_BUTF(mm2,mm5, 2*8, 5*8, mm0, pIn);
\r
207 BUTF(mm1, mm2, mm0);
\r
209 LOAD_BUTF(mm3,mm4, 3*8, 4*8, mm0, pIn);
\r
210 LOAD_BUTF(mm0,mm7, 0*8, 7*8, Spill, pIn);
\r
211 BUTF(mm0, mm3, Spill);
\r
213 ROTATE(mm3, mm2, ROT6_C, ROT6_SmC, -ROT6_SpC, Spill, 0, HALF(FIX+FPASS+3));
\r
214 pIn[2*8] = (int16_t)SHIFTR(mm3,FIX+FPASS+3);
\r
215 pIn[6*8] = (int16_t)SHIFTR(mm2,FIX+FPASS+3);
\r
217 mm0 += HALF(FPASS+3) - 1;
\r
218 BUTF(mm0, mm1, Spill);
\r
219 pIn[0*8] = (int16_t)SHIFTR(mm0, FPASS+3);
\r
220 pIn[4*8] = (int16_t)SHIFTR(mm1, FPASS+3);
\r
227 ROTATE(mm2, mm3, ROT17_C, -ROT17_SpC, -ROT17_SmC, mm0, 0, HALF(FIX+FPASS+3));
\r
228 ROTATE2(mm4, mm7, -ROT37_C, ROT37_SpC, ROT37_SmC, mm0);
\r
231 pIn[7*8] = (int16_t)SHIFTR(mm4,FIX+FPASS+3);
\r
232 pIn[1*8] = (int16_t)SHIFTR(mm7,FIX+FPASS+3);
\r
234 ROTATE2(mm5, mm6, -ROT13_C, ROT13_SmC, ROT13_SpC, mm0);
\r
237 pIn[5*8] = (int16_t)SHIFTR(mm5,FIX+FPASS+3);
\r
238 pIn[3*8] = (int16_t)SHIFTR(mm6,FIX+FPASS+3);
\r
254 //////////////////////////////////////////////////////////
\r
255 // - Data flow schematics for FDCT -
\r
256 // Output is scaled by 2.sqrt(2)
\r
257 // Initial butterflies (in0/in7, etc.) are not fully depicted.
\r
258 // Note: Rot6 coeffs are multiplied by sqrt(2).
\r
259 //////////////////////////////////////////////////////////
\r
261 <---------Stage1 =even part=----------->
\r
263 in3 mm3 +_____.___-___________.____* out6
\r
267 in0 mm0 +_____o___+__.___-___ | ___* out4
\r
271 in1 mm1 +_____o___+__o___+___ | ___* out0
\r
275 in2 mm2 +_____.___-___________o____* out2
\r
279 <---------Stage2 =odd part=---------------->
\r
281 mm7*___._________.___-___[xSqrt2]___* out3
\r
285 mm5*__ | ___o____o___+___.___-______* out7
\r
289 mm6*__ |____.____o___+___o___+______* out1
\r
293 mm4*___o_________.___-___[xSqrt2]___* out5
\r
297 Alternative schematics for stage 2:
\r
298 -----------------------------------
\r
300 mm7 *___[xSqrt2]____o___+____o_______* out1
\r
304 mm6 *____o___+______.___-___ | __.___* out5
\r
308 mm5 *____.___-______.___-____.__ | __* out7
\r
312 mm4 *___[xSqrt2]____o___+________o___* out3
\r