SSSE3/SSE4/AVX 9-way fully merged i8x8 analysis (sa8d_x9)

[x264] / common / dct.h
diff --git a/common/dct.h b/common/dct.h

index c492951c2c242d3ae3d1207e47d6a69899a85062..044ad1e149e9e680d4156118a671f971f6138b9b 100644 (file)
--- a/common/dct.h
+++ b/common/dct.h
@@ -1,10 +1,9 @@
  /*****************************************************************************
- * dct.h: h264 encoder library
+ * dct.h: transform and zigzag
   *****************************************************************************
- * Copyright (C) 2003 Laurent Aimar
- * $Id: dct.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ * Copyright (C) 2004-2011 x264 project
   *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ * Authors: Loren Merritt <lorenm@u.washington.edu>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -18,11 +17,14 @@
   *
   * You should have received a copy of the GNU General Public License
   * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
   *****************************************************************************/
  
-#ifndef _DCT_H
-#define _DCT_H 1
+#ifndef X264_DCT_H
+#define X264_DCT_H
  
  /* the inverse of the scaling factors introduced by 8x8 fdct */
  #define W(i) (i==0 ? FIX8(1.0000) :\
@@ -31,7 +33,7 @@
                i==3 ? FIX8(0.9415) :\
                i==4 ? FIX8(1.2651) :\
                i==5 ? FIX8(1.1910) :0)
-static const int x264_dct8_weight_tab[64] = {
+static const uint16_t x264_dct8_weight_tab[64] = {
      W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
      W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
      W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
@@ -44,20 +46,27 @@ static const int x264_dct8_weight_tab[64] = {
  };
  #undef W
  
+#define W(i) (i==0 ? FIX8(1.76777) :\
+              i==1 ? FIX8(1.11803) :\
+              i==2 ? FIX8(0.70711) :0)
+static const uint16_t x264_dct4_weight_tab[16] = {
+    W(0), W(1), W(0), W(1), /* W(0): 1.76777^2 ~= 3.125 */
+    W(1), W(2), W(1), W(2), /* W(1): 1.11803^2 ~= 1.25  */
+    W(0), W(1), W(0), W(1), /* W(2): 0.70711^2 ~= 0.5   */
+    W(1), W(2), W(1), W(2)  /* i.e. the square roots of the constants used for x264_dct4_weight2_tab */
+};
+#undef W /* W is redefined for the next table */
+
  /* inverse squared */
  #define W(i) (i==0 ? FIX8(3.125) :\
                i==1 ? FIX8(1.25) :\
                i==2 ? FIX8(0.5) :0)
-static const int x264_dct4_weight2_tab[16] = {
+static const uint16_t x264_dct4_weight2_tab[16] = { // same W(0)/W(1)/W(2) placement as x264_dct4_weight_tab
      W(0), W(1), W(0), W(1),
      W(1), W(2), W(1), W(2),
      W(0), W(1), W(0), W(1),
      W(1), W(2), W(1), W(2)
  };
-static const int x264_dct4_weight2_zigzag[16] = {
-    W(0), W(1), W(1), W(0), W(2), W(0), W(1), W(1),
-    W(1), W(1), W(2), W(0), W(2), W(1), W(1), W(2)
-};
  #undef W
  
  #define W(i) (i==0 ? FIX8(1.00000) :\
@@ -66,7 +75,7 @@ static const int x264_dct4_weight2_zigzag[16] = {
                i==3 ? FIX8(0.88637) :\
                i==4 ? FIX8(1.60040) :\
                i==5 ? FIX8(1.41850) :0)
-static const int x264_dct8_weight2_tab[64] = {
+static const uint16_t x264_dct8_weight2_tab[64] = {
      W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
      W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
      W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
@@ -77,43 +86,56 @@ static const int x264_dct8_weight2_tab[64] = {
      W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
      W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
  };
-static const int x264_dct8_weight2_zigzag[64] = {
-    W(0), W(3), W(3), W(4), W(1), W(4), W(3), W(5),
-    W(5), W(3), W(0), W(1), W(2), W(1), W(0), W(3),
-    W(3), W(5), W(5), W(3), W(3), W(4), W(1), W(4),
-    W(1), W(4), W(1), W(4), W(3), W(5), W(5), W(3),
-    W(3), W(5), W(5), W(3), W(1), W(2), W(1), W(0),
-    W(1), W(2), W(1), W(5), W(5), W(3), W(3), W(5),
-    W(5), W(1), W(4), W(1), W(4), W(1), W(3), W(5),
-    W(5), W(3), W(1), W(2), W(1), W(5), W(5), W(1)
-};
  #undef W
  
+extern uint16_t x264_dct4_weight2_zigzag[2][16]; // first index: [0] = frame, [1] = field; defined outside this header
+extern uint16_t x264_dct8_weight2_zigzag[2][64]; // presumably the same {frame, field} first index -- confirm at the definition
+
  typedef struct
  {
-    void (*sub4x4_dct)   ( int16_t dct[4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add4x4_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
+    // None of these functions take a stride argument; the strides are fixed:
+    //   pix1           -> FENC_STRIDE
+    //   pix2 and p_dst -> FDEC_STRIDE
+    void (*sub4x4_dct)   ( dctcoef dct[16], pixel *pix1, pixel *pix2 );
+    void (*add4x4_idct)  ( pixel *p_dst, dctcoef dct[16] );
+
+    void (*sub8x8_dct)   ( dctcoef dct[4][16], pixel *pix1, pixel *pix2 );
+    void (*sub8x8_dct_dc)( dctcoef dct[4], pixel *pix1, pixel *pix2 );
+    void (*add8x8_idct)  ( pixel *p_dst, dctcoef dct[4][16] );
+    void (*add8x8_idct_dc) ( pixel *p_dst, dctcoef dct[4] );
  
-    void (*sub8x8_dct)   ( int16_t dct[4][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add8x8_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
+    void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 );
  
-    void (*sub16x16_dct)   ( int16_t dct[16][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add16x16_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+    void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
+    void (*add16x16_idct)( pixel *p_dst, dctcoef dct[16][16] );
+    void (*add16x16_idct_dc) ( pixel *p_dst, dctcoef dct[16] );
  
-    void (*sub8x8_dct8)   ( int16_t dct[8][8],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add8x8_idct8)  ( uint8_t *p_dst, int i_dst, int16_t dct[8][8] );
+    void (*sub8x8_dct8)  ( dctcoef dct[64], pixel *pix1, pixel *pix2 );
+    void (*add8x8_idct8) ( pixel *p_dst, dctcoef dct[64] );
  
-    void (*sub16x16_dct8)   ( int16_t dct[4][8][8],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add16x16_idct8)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] );
+    void (*sub16x16_dct8) ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
+    void (*add16x16_idct8)( pixel *p_dst, dctcoef dct[4][64] );
  
-    void (*dct4x4dc) ( int16_t d[4][4] );
-    void (*idct4x4dc)( int16_t d[4][4] );
+    void (*dct4x4dc) ( dctcoef d[16] );
+    void (*idct4x4dc)( dctcoef d[16] );
  
-    void (*dct2x2dc) ( int16_t d[2][2] );
-    void (*idct2x2dc)( int16_t d[2][2] );
+    void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] );
  
  } x264_dct_function_t;
  
+typedef struct // table of function pointers; x264_zigzag_init() takes two of these (pf_progressive, pf_interlaced)
+{
+    void (*scan_8x8)( dctcoef level[64], dctcoef dct[64] );
+    void (*scan_4x4)( dctcoef level[16], dctcoef dct[16] );
+    int  (*sub_8x8)  ( dctcoef level[64], const pixel *p_src, pixel *p_dst ); // NOTE(review): the meaning of the int result of the sub_* entries is not stated here -- confirm at the implementations
+    int  (*sub_4x4)  ( dctcoef level[16], const pixel *p_src, pixel *p_dst );
+    int  (*sub_4x4ac)( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
+    void (*interleave_8x8_cavlc)( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+
+} x264_zigzag_function_t;
+
  void x264_dct_init( int cpu, x264_dct_function_t *dctf );
+void x264_dct_init_weights( void ); // presumably fills the extern x264_dct{4,8}_weight2_zigzag tables -- confirm at the definition
+void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced ); // takes separate tables for progressive and interlaced
  
  #endif