git.sesse.net Git - vlc/blob - plugins/idct/idctaltivec.h

   1 /***************************************************************
   2  *
   3  * Copyright:   (c) Copyright Motorola Inc. 1998
   4  *
   5  * Date:        April 17, 1998
   6  *
   7  * Function:    Matrix_Transpose
   8  *
   9  * Description: The following Matrix Transpose is adapted
  10  *              from an algorithm developed by Brett Olsson
  11  *              from IBM. It performs a 8x8 16-bit element
  12  *              full matrix transpose.
  13  *
  14  * Inputs:      array elements stored in input
  15  *               input[0] = [ 00 01 02 03 04 05 06 07 ]
  16  *               input[1] = [ 10 11 12 13 14 15 16 17 ]
  17  *               input[2] = [ 20 21 22 23 24 25 26 27 ]
  18  *               input[3] = [ 30 31 32 33 34 35 36 37 ]
  19  *               input[4] = [ 40 41 42 43 44 45 46 47 ]
  20  *               input[5] = [ 50 51 52 53 54 55 56 57 ]
  21  *               input[6] = [ 60 61 62 63 64 65 66 67 ]
  22  *               input[7] = [ 70 71 72 73 74 75 76 77 ]
  23  *
  24  * Outputs:     transposed elements in output
  25  *
  26  **************************************************************/
  27
  28 static __inline__ void Matrix_Transpose ( vector signed short *input,
  29                                vector signed short *output )
  30 {
  31   vector signed short a0, a1, a2, a3, a4, a5, a6, a7;
  32   vector signed short b0, b1, b2, b3, b4, b5, b6, b7;
  33
  34   b0 = vec_mergeh( input[0], input[4] );     /* [ 00 40 01 41 02 42 03 43 ]*/
  35   b1 = vec_mergel( input[0], input[4] );     /* [ 04 44 05 45 06 46 07 47 ]*/
  36   b2 = vec_mergeh( input[1], input[5] );     /* [ 10 50 11 51 12 52 13 53 ]*/
  37   b3 = vec_mergel( input[1], input[5] );     /* [ 14 54 15 55 16 56 17 57 ]*/
  38   b4 = vec_mergeh( input[2], input[6] );     /* [ 20 60 21 61 22 62 23 63 ]*/
  39   b5 = vec_mergel( input[2], input[6] );     /* [ 24 64 25 65 26 66 27 67 ]*/
  40   b6 = vec_mergeh( input[3], input[7] );     /* [ 30 70 31 71 32 72 33 73 ]*/
  41   b7 = vec_mergel( input[3], input[7] );     /* [ 34 74 35 75 36 76 37 77 ]*/
  42
  43   a0 = vec_mergeh( b0, b4 );                 /* [ 00 20 40 60 01 21 41 61 ]*/
  44   a1 = vec_mergel( b0, b4 );                 /* [ 02 22 42 62 03 23 43 63 ]*/
  45   a2 = vec_mergeh( b1, b5 );                 /* [ 04 24 44 64 05 25 45 65 ]*/
  46   a3 = vec_mergel( b1, b5 );                 /* [ 06 26 46 66 07 27 47 67 ]*/
  47   a4 = vec_mergeh( b2, b6 );                 /* [ 10 30 50 70 11 31 51 71 ]*/
  48   a5 = vec_mergel( b2, b6 );                 /* [ 12 32 52 72 13 33 53 73 ]*/
  49   a6 = vec_mergeh( b3, b7 );                 /* [ 14 34 54 74 15 35 55 75 ]*/
  50   a7 = vec_mergel( b3, b7 );                 /* [ 16 36 56 76 17 37 57 77 ]*/
  51
  52   output[0] = vec_mergeh( a0, a4 );          /* [ 00 10 20 30 40 50 60 70 ]*/
  53   output[1] = vec_mergel( a0, a4 );          /* [ 01 11 21 31 41 51 61 71 ]*/
  54   output[2] = vec_mergeh( a1, a5 );          /* [ 02 12 22 32 42 52 62 72 ]*/
  55   output[3] = vec_mergel( a1, a5 );          /* [ 03 13 23 33 43 53 63 73 ]*/
  56   output[4] = vec_mergeh( a2, a6 );          /* [ 04 14 24 34 44 54 64 74 ]*/
  57   output[5] = vec_mergel( a2, a6 );          /* [ 05 15 25 35 45 55 65 75 ]*/
  58   output[6] = vec_mergeh( a3, a7 );          /* [ 06 16 26 36 46 56 66 76 ]*/
  59   output[7] = vec_mergel( a3, a7 );          /* [ 07 17 27 37 47 57 67 77 ]*/
  60
  61 }
  62
  63
  64 /***************************************************************
  65  *
  66  * Copyright:   (c) Copyright Motorola Inc. 1998
  67  *
  68  * Date:        April 20, 1998
  69  *
  70  * Macro:       IDCT_Transform
  71  *
  72  * Description: Discrete Cosine Transform implemented by the
  73  *              Scaled Chen (III) Algorithm developed by Haifa
  74  *              Research Lab.  The major difference between this
  75  *              algorithm and the Scaled Chen (I) is that
  76  *              certain multiply-subtracts are replaced by
  77  *              multiply adds.  A full description of the
  78  *              Scaled Chen (I) algorithm can be found in:
  79  *              W.C.Chen, C.H.Smith and S.C.Fralick, "A Fast
  80  *              Computational Algorithm for the Discrete Cosine
  81  *              Transform", IEEE Transactions on Communications,
  82  *              Vol. COM-25, No. 9, pp 1004-1009, Sept. 1977.
  83  *
  84  * Inputs:      vx     : array of vector short
  85  *              t0-t9  : temporary vector variables set up by caller
  86  *              c4     : cos(4*pi/16)
  87  *              mc4    : -c4
  88  *              a0     : c6/c2
  89  *              a1     : c7/c1
  90  *              a2     : c5/c3
  91  *              ma2    : -a2
  92  *              zero   : an array of zero elements
  93  *
  94  * Outputs:     vy     : array of vector short
  95  *
  96  **************************************************************/
  97
  98 #define IDCT_Transform(vx,vy) \
  99                                                                   \
 100   /* 1st stage. */                                                \
 101   t9 = vec_mradds( a1, vx[1], zero );  /* t8 = (a1) * x1 - x7  */ \
 102   t8 = vec_subs( t9, vx[7]);                                      \
 103   t1 = vec_mradds( a1, vx[7], vx[1] ); /* t1 = (a1) * x7 + x1  */ \
 104   t7 = vec_mradds( a2, vx[5], vx[3] ); /* t7 = (a2) * x5 + x3  */ \
 105   t3 = vec_mradds( ma2, vx[3], vx[5] );/* t3 = (-a2) * x5 + x3 */ \
 106                                                                   \
 107   /* 2nd stage */                                                 \
 108   t5 = vec_adds( vx[0], vx[4] );        /* t5 = x0 + x4 */        \
 109   t0 = vec_subs( vx[0], vx[4] );        /* t0 = x0 - x4 */        \
 110   t9 = vec_mradds( a0, vx[2], zero );   /* t4 = (a0) * x2 - x6 */ \
 111   t4 = vec_subs( t9, vx[6] );                                     \
 112   t2 = vec_mradds( a0, vx[6], vx[2] );  /* t2 = (a0) * x6 + x2 */ \
 113                                                                   \
 114   t6 = vec_adds( t8, t3 );              /* t6 = t8 + t3 */        \
 115   t3 = vec_subs( t8, t3 );              /* t3 = t8 - t3 */        \
 116   t8 = vec_subs( t1, t7 );              /* t8 = t1 - t7 */        \
 117   t1 = vec_adds( t1, t7 );              /* t1 = t1 + t7 */        \
 118                                                                   \
 119   /* 3rd stage. */                                                \
 120   t7 = vec_adds( t5, t2 );              /* t7 = t5 + t2 */        \
 121   t2 = vec_subs( t5, t2 );              /* t2 = t5 - t2 */        \
 122   t5 = vec_adds( t0, t4 );              /* t5 = t0 + t4 */        \
 123   t0 = vec_subs( t0, t4 );              /* t0 = t0 - t4 */        \
 124                                                                   \
 125   t4 = vec_subs( t8, t3 );              /* t4 = t8 - t3 */        \
 126   t3 = vec_adds( t8, t3 );              /* t3 = t8 + t3 */        \
 127                                                                   \
 128   /* 4th stage. */                                                \
 129   vy[0] = vec_adds( t7, t1 );        /* y0 = t7 + t1 */           \
 130   vy[7] = vec_subs( t7, t1 );        /* y7 = t7 - t1 */           \
 131   vy[1] = vec_mradds( c4, t3, t5 );  /* y1 = (c4) * t3 + t5  */   \
 132   vy[6] = vec_mradds( mc4, t3, t5 ); /* y6 = (-c4) * t3 + t5 */   \
 133   vy[2] = vec_mradds( c4, t4, t0 );  /* y2 = (c4) * t4 + t0  */   \
 134   vy[5] = vec_mradds( mc4, t4, t0 ); /* y5 = (-c4) * t4 + t0 */   \
 135   vy[3] = vec_adds( t2, t6 );        /* y3 = t2 + t6 */           \
 136   vy[4] = vec_subs( t2, t6 );        /* y4 = t2 - t6 */
 137
 138
 139 /* Pre-Scaling matrix -- scaled by 1 */
 140 static vector signed short PreScale[8] = {
 141     (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
 142     (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 ),
 143     (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
 144     (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
 145     (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
 146     (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
 147     (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
 148     (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 )
 149 };
 150
 151 /***************************************************************
 152  *
 153  * Copyright:   (c) Copyright Motorola Inc. 1998
 154  *
 155  * Date:        April 17, 1998
 156  *
 157  * Function:    IDCT
 158  *
 159  * Description: Scaled Chen (III) algorithm for IDCT
 160  *              Arithmetic is 16-bit fixed point.
 161  *
 162  * Inputs:      input - Pointer to input data (short), which
 163  *                      must be between -2048 to +2047.
 164  *                      It is assumed that the allocated array
 165  *                      has been 128-bit aligned and contains
 166  *                      8x8 short elements.
 167  *
 168  * Outputs:     output - Pointer to output area for the transformed
 169  *                       data. The output values are between -255
 170  *                       and 255 . It is assumed that a 128-bit
 171  *                       aligned 8x8 array of short has been
 172  *                       pre-allocated.
 173  *
 174  * Return:      None
 175  *
 176  ***************************************************************/
 177
 178 static __inline__ void IDCT(short *input, short *output) {
 179
 180   vector signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
 181   vector signed short a0, a1, a2, ma2, c4, mc4, zero;
 182   vector signed short vx[8], vy[8];
 183   vector signed short *vec_ptr;  /* used for conversion between
 184                                     arrays of short and vector
 185                                     signed short array.  */
 186
 187
 188   /* Load the multiplication constants.  Note: these constants
 189    * could all be loaded directly ( like zero case ), but using the
 190    * SpecialConstants approach causes vsplth instructions to be
 191    * generated instead of lvx which is more efficient given the remainder
 192    * of the instruction mix.
 193    */
 194   vector signed short SpecialConstants =
 195      (vector signed short)( 23170, 13573, 6518, 21895, -23170, -21895, 0 , 0
 196 );
 197
 198   c4   = vec_splat( SpecialConstants, 0 );  /* c4 = cos(4*pi/16)  */
 199   a0   = vec_splat( SpecialConstants, 1 );  /* a0 = c6/c2         */
 200   a1   = vec_splat( SpecialConstants, 2 );  /* a1 = c7/c1         */
 201   a2   = vec_splat( SpecialConstants, 3 );  /* a2 = c5/c3         */
 202   mc4  = vec_splat( SpecialConstants, 4 );  /* -c4                */
 203   ma2  = vec_splat( SpecialConstants, 5 );  /* -a2                */
 204   zero = (vector signed short)(0);
 205
 206   /* Load the rows of input data and Pre-Scale them. */
 207   vec_ptr = ( vector signed short * ) input;
 208   vx[0] = vec_mradds( vec_ptr[0], PreScale[0], zero );
 209   vx[1] = vec_mradds( vec_ptr[1], PreScale[1], zero );
 210   vx[2] = vec_mradds( vec_ptr[2], PreScale[2], zero );
 211   vx[3] = vec_mradds( vec_ptr[3], PreScale[3], zero );
 212   vx[4] = vec_mradds( vec_ptr[4], PreScale[4], zero );
 213   vx[5] = vec_mradds( vec_ptr[5], PreScale[5], zero );
 214   vx[6] = vec_mradds( vec_ptr[6], PreScale[6], zero );
 215   vx[7] = vec_mradds( vec_ptr[7], PreScale[7], zero );
 216
 217   /* Perform IDCT first on the 8 columns */
 218   IDCT_Transform( vx, vy );
 219
 220   /* Transpose matrix to work on rows */
 221   Matrix_Transpose( vy, vx );
 222
 223   /* Perform IDCT next on the 8 rows */
 224   IDCT_Transform( vx, vy );
 225
 226   /* Post-scale and store result. */
 227   vec_ptr = (vector signed short *) output;
 228   vec_ptr[0] = vy[0];
 229   vec_ptr[1] = vy[1];
 230   vec_ptr[2] = vy[2];
 231   vec_ptr[3] = vy[3];
 232   vec_ptr[4] = vy[4];
 233   vec_ptr[5] = vy[5];
 234   vec_ptr[6] = vy[6];
 235   vec_ptr[7] = vy[7];
 236
 237 }
 238