4 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 void ff_bfin_fdct (int16_t *buf);
25 This implementation works only for 8x8 input. The range of input
26 must be -256 to 255 i.e. 8bit input represented in a 16bit data
27 word. The original data must be sign extended into the 16bit data
34 X(m) = sum (x(n) * cos ((2n+1)*m*pi/16))
38 0 --*-------------*0+7---*-----*0+3-------*-*-------------------> 0
40 1 --*-\---------/-*1+6---*-\-/-*1+2-------*-*-------------------> 4
42 2 --*---\-----/---*2+5---*-/-\-*1-2---------------*-*-----------> 2
44 3 --*-----\-/-----*3+4---*-----*0-3---------------*-*-----------> 6
46 4 --*-----/-\-----*3-4------------*-*4+5--*-----*---------------> 1
48 5 --*---/-----\---*2-5---*-*------*=*4-5----\-/------*-*--------> 5
49 / \ X S4,S4 / X S3,-S3
50 6 --*-/---------\-*1-6---*-*------*=*7-6----/-\------*-*--------> 3
52 --*-------------*0-7------------*-*7+6--*-----*---------------> 7
56 Cn = cos(n*pi/16) used throughout the code.
60 R0, R1, R2, R3, R4, R5, R6,R7, P0, P1, P2, P3, P4, P5, A0, A1.
62 I0, I1, I2, I3, B0, B2, B3, M0, M1, L3 registers and LC0.
64 Input - r0 - pointer to start of int16_t *block
66 Output - The DCT output coefficients in the int16_t *block
69 This code is called from jpeg_encode.
70 R6, R5, R4 if modified should be stored and restored.
73 Performance: (Timer version 0.6.33)
74 Code Size : 240 Bytes.
76 Input Matrix : 8 * 8 * 2 Bytes.
77 Coefficients : 16 Bytes
78 Temporary matrix: 8 * 8 * 2 Bytes.
79 Cycle Count :26+{18+8*(14+2S)}*2 where S -> Stalls
81 -----------------------------------------
82 | Size | Forward DCT | Inverse DCT |
83 -----------------------------------------
84 | 8x8 | 284 Cycles | 311 Cycles |
85 -----------------------------------------
87 Ck = int16(cos(k/16*pi)*32767+.5)/2
93 Sk = int16(sin(k/16*pi)*32767+.5)/2
99 the coefficients are ordered as follows:
106 -----------------------------------------------------------
107 FFmpeg conformance testing results
108 -----------------------------------------------------------
109 dct-test: modified with the following
110 dct_error("BFINfdct", 0, ff_bfin_fdct, fdct, test);
111 produces the following output:
113 root:/u/ffmpeg/bhead/libavcodec> ./dct-test
116 2 -131 -6 -48 -36 33 -83 24
117 34 52 -24 -15 5 92 57 143
118 -67 -43 -1 74 -16 5 -71 32
119 -78 106 92 -34 -38 81 20 -18
120 7 -62 40 2 -15 90 -62 -83
121 -83 1 -104 -13 43 -19 7 11
122 -63 31 12 -29 83 72 21 10
123 -17 -63 -15 73 50 -91 159 -14
124 DCT BFINfdct: err_inf=2 err2=0.16425938 syserr=0.00795000 maxout=2098 blockSumErr=27
125 DCT BFINfdct: 92.1 kdct/s
126 root:/u/ffmpeg/bhead/libavcodec>
131 #include "config_bfin.h"
133 #if defined(__FDPIC__) && CONFIG_SRAM
134 .section .l1.data.B,"aw",@progbits
140 .short 0x5a82, 0x2d41, 0x187e, 0x3b21, 0x0c7c, 0x3ec5, 0x238e, 0x3537;
142 #if defined(__FDPIC__) && CONFIG_SRAM
143 .section .l1.data.A,"aw",@progbits
151 [--SP] = (R7:4, P5:3); // Push the registers onto the stack.
154 RELOC(r0, P3, dct_coeff);
159 L3 = 16; // L3 is set to 16 to make the coefficient
163 //----------------------------------------------------------------------------
166 * I0, I1, and I2 registers are used to read the input data. I3 register is used
167 * to read the coefficients. P0 and P1 registers are used for writing the output
170 M0 = 12 (X); // All these initializations are used in the
171 M1 = 16 (X); // modification of address offsets.
182 // Prescale the input to get the correct precision.
186 lsetup (.0, .1) LC0 = P3;
188 .0: r1=r0<<3 (v) || r0=[i0++] ;
192 * B0 points to the "in" buffer.
193 * B2 points to "temp" buffer in the first iteration.
196 lsetup (.2, .3) LC0 = P0;
198 I0 = B0; // I0 points to Input Element (0, 0).
199 I1 = B0; // Element 1 and 0 is read in R0.
200 I1 += M0 || R0 = [I0++]; // I1 points to Input Element (0, 6).
201 I2 = I1; // Element 6 is read into R3.H.
202 I2 -= 4 || R3.H = W[I1++]; // I2 points to Input Element (0, 4).
204 I3 = B3; // I3 points to Coefficients.
205 P0 = B2; // P0 points to temporary array Element
207 P1 = B2; // P1 points to temporary array.
208 R7 = [P1++P2] || R2 = [I2++]; // P1 points to temporary array
210 // R7 is a dummy read. X4,X5
212 R3.L = W[I1--]; // X7 is read into R3.L.
213 R1.H = W[I0++]; // X2 is read into R1.H.
217 * X0 = (X0 + X7) / 2.
218 * X1 = (X1 + X6) / 2.
219 * X6 = (X1 - X6) / 2.
220 * X7 = (X0 - X7) / 2.
221 * It also reads X3 into R1.L.
224 R0 = R0 +|+ R3, R3 = R0 -|- R3 || R1.L = W[I0++] || NOP;
227 * X2 = (X2 + X5) / 2.
228 * X3 = (X3 + X4) / 2.
229 * X4 = (X3 - X4) / 2.
230 * X5 = (X2 - X5) / 2.
231 * R7 = C4 = cos(4*pi/16)
234 R1 = R1 +|+ R2, R2 = R1 -|- R2 (CO) || NOP || R7 = [I3++];
237 * At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and
239 * Where the notation (x, y) represents upper/lower half pairs.
248 R0 = R0 +|+ R1, R1 = R0 -|- R1;
250 lsetup (.row0, .row1) LC1 = P2 >> 1; // 1d dct, loops 8x
254 * This is part 2 computation continued.....
255 * A1 = X6 * cos(pi/4)
256 * A0 = X6 * cos(pi/4)
257 * A1 = A1 - X5 * cos(pi/4)
258 * A0 = A0 + X5 * cos(pi/4).
259 * The store W[I0] = R3.L saves R3.L to memory so that it can later be reloaded into R2.L.
262 A1=R3.H*R7.l, A0=R3.H*R7.l || I1+=M1 || W[I0] = R3.L;
263 R4.H=(A1-=R2.L*R7.l), R4.L=(A0+=R2.L*R7.l) || I2+=M0 || NOP;
265 /* R0 = (X1,X0) R1 = (X2,X3) R4 = (X5, X6). */
268 * A1 = X0 * cos(pi/4)
269 * A0 = X0 * cos(pi/4)
270 * A1 = A1 - X1 * cos(pi/4)
271 * A0 = A0 + X1 * cos(pi/4)
274 A1=R0.L*R7.h, A0=R0.L*R7.h || NOP || R3.H=W[I1++];
275 R5.H=(A1-=R0.H*R7.h),R5.L=(A0+=R0.H*R7.h) || R7=[I3++] || NOP;
278 * A1 = X2 * cos(3pi/8)
279 * A0 = X3 * cos(3pi/8)
280 * A1 = A1 + X3 * cos(pi/8)
281 * A0 = A0 - X2 * cos(pi/8)
283 * R7 = (cos(7pi/8),cos(pi/8))
289 A1=R1.H*R7.L, A0=R1.L*R7.L || W[P0++P3]=R5.L || R2.L=W[I0];
290 R2=R2+|+R4, R4=R2-|-R4 || I0+=4 || R3.L=W[I1--];
291 R6.H=(A1+=R1.L*R7.H),R6.L=(A0 -= R1.H * R7.H) || I0+=4 || R7=[I3++];
293 /* R2 = (X4, X7) R4 = (X5,X6) R5 = (X1, X0) R6 = (X2,X3). */
296 * A1 = X4 * cos(7pi/16)
297 * A0 = X7 * cos(7pi/16)
298 * A1 = A1 + X7 * cos(pi/16)
299 * A0 = A0 - X4 * cos(pi/16)
302 A1=R2.H*R7.L, A0=R2.L*R7.L || W[P0++P3]=R6.H || R0=[I0++];
303 R2.H=(A1+=R2.L*R7.H),R2.L=(A0-=R2.H*R7.H) || W[P0++P3]=R5.H || R7=[I3++];
306 * A1 = X5 * cos(3pi/16)
307 * A0 = X6 * cos(3pi/16)
308 * A1 = A1 + X6 * cos(5pi/16)
309 * A0 = A0 - X5 * cos(5pi/16)
310 * The output values are written.
313 A1=R4.H*R7.H, A0=R4.L*R7.H || W[P0++P2]=R6.L || R1.H=W[I0++];
314 R4.H=(A1+=R4.L*R7.L),R4.L=(A0-=R4.H*R7.L) || W[P0++P4]=R2.L || R1.L=W[I0++];
317 /* Beginning of the next stage, **pipelined**, while draining and
318 storing the remaining outputs of the current column. */
320 R0=R0+|+R3,R3=R0-|-R3 || W[P1++P3]=R2.H || R2=[I2++];
321 R1=R1+|+R2,R2=R1-|-R2 (CO) || W[P1++P3]=R4.L || R7=[I3++];
322 .row1: R0=R0+|+R1,R1=R0-|-R1 || W[P1++P5]=R4.H || NOP;
324 // Exchange input with output.
330 (r7:4,p5:3) = [sp++];