Change some macros to be more sensitive to memory alignment, thus avoiding

author David Wolstencroft <wolstencroft@alum.rpi.edu>

Sat, 22 Nov 2008 16:54:38 +0000 (17:54 +0100)

committer Guillaume Poirier <gpoirier@mplayerhq.hu>

Tue, 25 Nov 2008 16:29:00 +0000 (17:29 +0100)
author David Wolstencroft <wolstencroft@alum.rpi.edu>
Sat, 22 Nov 2008 16:54:38 +0000 (17:54 +0100)
committer Guillaume Poirier <gpoirier@mplayerhq.hu>
Tue, 25 Nov 2008 16:29:00 +0000 (17:29 +0100)
diff --git a/AUTHORS b/AUTHORS

index 31c7bbbc63cf3f2a830c113d0e56da3deae31cf5..c31dad192ccce52edf54de894d8529eaf3a1032d 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -23,6 +23,9 @@ N: Christian Heine
  E: sennindemokrit AT gmx DOT net
  D: x86 asm
  
+N: David Wolstencroft
+D: Altivec optimizations
+
  N: Eric Petit
  E: eric.petit AT lapsus DOT org
  C: titer
diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c

index f127cec886c65fc4345241290c163d7efc2c4b0b..86816caf389455db3100bf8b75bb87d95c98557f 100644 (file)
--- a/common/ppc/deblock.c
+++ b/common/ppc/deblock.c
@@ -72,28 +72,26 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
      *(dst_int+15*int_dst_stride) = *(src_int + 15);
  }
  
-/** \brief performs a 6x16 transpose of data in src, and stores it to dst
-    \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
-    out of unaligned_load() */
+/** \brief performs a 6x16 transpose of data in src, and stores it to dst */
  #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
      register vec_u8_t r0, r1, r2, r3, r4, r5, r6, r7, r14, r15;\
-    VEC_LOAD(src,                  r0, 16, vec_u8_t);          \
-    VEC_LOAD(src +    src_stride,  r1, 16, vec_u8_t);          \
-    VEC_LOAD(src +  2*src_stride,  r2, 16, vec_u8_t);          \
-    VEC_LOAD(src +  3*src_stride,  r3, 16, vec_u8_t);          \
-    VEC_LOAD(src +  4*src_stride,  r4, 16, vec_u8_t);          \
-    VEC_LOAD(src +  5*src_stride,  r5, 16, vec_u8_t);          \
-    VEC_LOAD(src +  6*src_stride,  r6, 16, vec_u8_t);          \
-    VEC_LOAD(src +  7*src_stride,  r7, 16, vec_u8_t);          \
-    VEC_LOAD(src + 14*src_stride, r14, 16, vec_u8_t);          \
-    VEC_LOAD(src + 15*src_stride, r15, 16, vec_u8_t);          \
+    VEC_LOAD(src,                  r0, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src +    src_stride,  r1, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src +  2*src_stride,  r2, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src +  3*src_stride,  r3, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src +  4*src_stride,  r4, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src +  5*src_stride,  r5, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src +  6*src_stride,  r6, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src +  7*src_stride,  r7, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src + 14*src_stride, r14, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src + 15*src_stride, r15, 16, vec_u8_t, pix );    \
                                                                 \
-    VEC_LOAD(src + 8*src_stride,   r8, 16, vec_u8_t);          \
-    VEC_LOAD(src + 9*src_stride,   r9, 16, vec_u8_t);          \
-    VEC_LOAD(src + 10*src_stride, r10, 16, vec_u8_t);          \
-    VEC_LOAD(src + 11*src_stride, r11, 16, vec_u8_t);          \
-    VEC_LOAD(src + 12*src_stride, r12, 16, vec_u8_t);          \
-    VEC_LOAD(src + 13*src_stride, r13, 16, vec_u8_t);          \
+    VEC_LOAD(src + 8*src_stride,   r8, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src + 9*src_stride,   r9, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src + 10*src_stride, r10, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src + 11*src_stride, r11, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src + 12*src_stride, r12, 16, vec_u8_t, pix );    \
+    VEC_LOAD(src + 13*src_stride, r13, 16, vec_u8_t, pix );    \
                                                                 \
      /*Merge first pairs*/                                      \
      r0 = vec_mergeh(r0, r8);    /*0, 8*/                       \
@@ -294,6 +292,7 @@ void x264_deblock_h_luma_altivec(uint8_t *pix, int stride, int alpha, int beta,
      if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
          return;
      PREP_LOAD;
+    vec_u8_t _pix_ = vec_lvsl(0, pix-3);
      readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
      h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
      transpose4x16(line1, line2, line3, line4);
diff --git a/common/ppc/mc.c b/common/ppc/mc.c

index b9a7e4e6fb722664827712f312fbd837dd668424..7f3509d9d709da966cbf6f63415b89b6d7e9d1d6 100644 (file)
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -40,6 +40,11 @@
  typedef void (*pf_mc_t)( uint8_t *src, int i_src,
                           uint8_t *dst, int i_dst, int i_height );
  
+
+static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+
+
  static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
  {
      return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
@@ -52,11 +57,10 @@ static inline int x264_tapfilter1( uint8_t *pix )
             pix[ 3];
  }
  
-/* pixel_avg */
-static inline void pixel_avg_w4( uint8_t *dst,  int i_dst,
+
+static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst,  int i_dst,
                                   uint8_t *src1, int i_src1,
-                                 uint8_t *src2, int i_src2,
-                                 int i_height )
+                                 uint8_t *src2, int i_height )
  {
      int x, y;
      for( y = 0; y < i_height; y++ )
@@ -67,57 +71,70 @@ static inline void pixel_avg_w4( uint8_t *dst,  int i_dst,
          }
          dst  += i_dst;
          src1 += i_src1;
-        src2 += i_src2;
+        src2 += i_src1;
      }
  }
-static inline void pixel_avg_w8( uint8_t *dst,  int i_dst,
+
+static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst,  int i_dst,
                                   uint8_t *src1, int i_src1,
-                                 uint8_t *src2, int i_src2,
-                                 int i_height )
+                                 uint8_t *src2, int i_height )
  {
      int y;
      vec_u8_t src1v, src2v;
-    LOAD_ZERO;
      PREP_LOAD;
      PREP_STORE8;
+    PREP_LOAD_SRC( src1 );
+    PREP_LOAD_SRC( src2 );
+
      for( y = 0; y < i_height; y++ )
      {
-        VEC_LOAD( src1, src1v, 8, vec_u8_t );
-        VEC_LOAD( src2, src2v, 8, vec_u8_t );
+        VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
+        VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
          src1v = vec_avg( src1v, src2v );
          VEC_STORE8( src1v, dst );
  
          dst  += i_dst;
          src1 += i_src1;
-        src2 += i_src2;
+        src2 += i_src1;
      }
  }
-static inline void pixel_avg_w16( uint8_t *dst,  int i_dst,
+
+static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst,  int i_dst,
                                    uint8_t *src1, int i_src1,
-                                  uint8_t *src2, int i_src2,
-                                  int i_height )
+                                  uint8_t *src2, int i_height )
  {
      int y;
      vec_u8_t src1v, src2v;
      PREP_LOAD;
-    PREP_STORE16;
+    PREP_LOAD_SRC( src1 );
+    PREP_LOAD_SRC( src2 );
+
      for( y = 0; y < i_height; y++ )
      {
-        VEC_LOAD( src1, src1v, 16, vec_u8_t );
-        VEC_LOAD( src2, src2v, 16, vec_u8_t );
+        VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
+        VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
          src1v = vec_avg( src1v, src2v );
-        VEC_STORE16( src1v, dst );
+        vec_st(src1v, 0, dst);
  
          dst  += i_dst;
          src1 += i_src1;
-        src2 += i_src2;
+        src2 += i_src1;
      }
  }
  
+static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst,  int i_dst,
+                                uint8_t *src1, int i_src1,
+                                uint8_t *src2, int i_height )
+{
+    x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
+    x264_pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height);
+}
+
  /* mc_copy: plain c */
+
  #define MC_COPY( name, a )                                \
-static void name( uint8_t *src, int i_src,                \
-                  uint8_t *dst, int i_dst, int i_height ) \
+static void name( uint8_t *dst, int i_dst,                \
+                  uint8_t *src, int i_src, int i_height ) \
  {                                                         \
      int y;                                                \
      for( y = 0; y < i_height; y++ )                       \
@@ -127,118 +144,99 @@ static void name( uint8_t *src, int i_src,                \
          dst += i_dst;                                     \
      }                                                     \
  }
-MC_COPY( mc_copy_w4,  4  )
-MC_COPY( mc_copy_w8,  8  )
-MC_COPY( mc_copy_w16, 16 )
+MC_COPY( x264_mc_copy_w4_altivec,  4  )
+MC_COPY( x264_mc_copy_w8_altivec,  8  )
  
-void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
+static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,                
+                                     uint8_t *src, int i_src, int i_height ) 
+{
+    int y;
+    vec_u8_t cpyV;
+    PREP_LOAD;
+    PREP_LOAD_SRC( src );
+
+    for( y = 0; y < i_height; y++)
+    {
+        VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
+        vec_st(cpyV, 0, dst);
+        
+        src += i_src;
+        dst += i_dst;
+    }
+}
+
+
+static void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
                        uint8_t *src[4], int i_src_stride,
                        int mvx, int mvy,
                        int i_width, int i_height )
  {
-    uint8_t *src1, *src2;
-    
-    /* todo : fixme... */
-    int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1:0;
-    
-    int hpel1x = mvx>>1;
-    int hpel1y = (mvy+1-correction)>>1;
-    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
-    
-    
-    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
-    
-    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
      {
-        int hpel2x = (mvx+1)>>1;
-        int hpel2y = (mvy+correction)>>1;
-        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
-        
-        src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
          
          switch(i_width) {
          case 4:
-            pixel_avg_w4( dst, i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
+            x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
              break;
          case 8:
-            pixel_avg_w8( dst, i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
+            x264_pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
              break;
          case 16:
          default:
-            pixel_avg_w16( dst, i_dst_stride, src1, i_src_stride,
-                           src2, i_src_stride, i_height );
+            x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
          }
-        
+
      }
      else
      {
          switch(i_width) {
          case 4:
-            mc_copy_w4( src1, i_src_stride, dst, i_dst_stride, i_height );
+            x264_mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
              break;
          case 8:
-            mc_copy_w8( src1, i_src_stride, dst, i_dst_stride, i_height );
+            x264_mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
              break;
          case 16:
-            mc_copy_w16( src1, i_src_stride, dst, i_dst_stride, i_height );
+            x264_mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
              break;
          }
-        
      }
  }
  
-uint8_t *get_ref_altivec( uint8_t *dst,    int * i_dst_stride,
+
+
+static uint8_t *get_ref_altivec( uint8_t *dst,   int *i_dst_stride,
                            uint8_t *src[4], int i_src_stride,
                            int mvx, int mvy,
                            int i_width, int i_height )
  {
-    uint8_t *src1, *src2;
-    
-    /* todo : fixme... */
-    int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1:0;
-    
-    int hpel1x = mvx>>1;
-    int hpel1y = (mvy+1-correction)>>1;
-    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
-    
-    
-    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
-    
-    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
      {
-        int hpel2x = (mvx+1)>>1;
-        int hpel2y = (mvy+correction)>>1;
-        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
-        
-        src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
-        
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
          switch(i_width) {
          case 4:
-            pixel_avg_w4( dst, *i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
+            x264_pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
              break;
          case 8:
-            pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
+            x264_pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
              break;
          case 12:
          case 16:
          default:
-            pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
+            x264_pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
              break;
          case 20:
-            //FIXME suboptimal
-            pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
-            pixel_avg_w4( dst+16, *i_dst_stride, src1+16, i_src_stride,
-                          src2+16, i_src_stride, i_height );
+            x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
              break;
          }
          return dst;
-
      }
      else
      {
@@ -273,6 +271,8 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
      
      LOAD_ZERO;
      PREP_LOAD;
+    PREP_LOAD_SRC( src );
+    PREP_LOAD_SRC( srcp );
      PREP_STORE4;
      vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
      vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
@@ -292,14 +292,14 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
      permv   = vec_lvsl( 0, (uint8_t *) 1 );
      shiftv  = vec_splat_u16( 6 );
  
-    VEC_LOAD( src, src2v_8, 5, vec_u8_t );
+    VEC_LOAD( src, src2v_8, 5, vec_u8_t, src );
      src3v_8 = vec_perm( src2v_8, src2v_8, permv );
  
      for( y = 0; y < i_height; y++ )
      {
          src0v_8 = src2v_8;
          src1v_8 = src3v_8;
-        VEC_LOAD( srcp, src2v_8, 5, vec_u8_t );
+        VEC_LOAD( srcp, src2v_8, 5, vec_u8_t, srcp );
          src3v_8 = vec_perm( src2v_8, src2v_8, permv );
  
          dstv_16 = k32v;
@@ -339,6 +339,8 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
      
      LOAD_ZERO;
      PREP_LOAD;
+    PREP_LOAD_SRC( src );
+    PREP_LOAD_SRC( srcp );
      PREP_STORE8;
      vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
      vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
@@ -358,14 +360,14 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
      permv   = vec_lvsl( 0, (uint8_t *) 1 );
      shiftv  = vec_splat_u16( 6 );
  
-    VEC_LOAD( src, src2v_8, 9, vec_u8_t );
+    VEC_LOAD( src, src2v_8, 9, vec_u8_t, src);
      src3v_8 = vec_perm( src2v_8, src2v_8, permv );
  
      for( y = 0; y < i_height; y++ )
      {
          src0v_8 = src2v_8;
          src1v_8 = src3v_8;
-        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t );
+        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, srcp );
          src3v_8 = vec_perm( src2v_8, src2v_8, permv );
  
          dstv_16 = k32v;
@@ -431,8 +433,8 @@ static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
  
  #define HPEL_FILTER_HORIZONTAL()                            \
  {                                                           \
-    VEC_LOAD( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+14+i_stride*y], src6v, 16, vec_u8_t ); \
+    VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
+    VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
                                                              \
      src2v = vec_sld( src1v, src6v,  1 );                    \
      src3v = vec_sld( src1v, src6v,  2 );                    \
@@ -468,17 +470,17 @@ static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
                                                              \
      destv = vec_packsu( dest1v, dest2v );                   \
                                                              \
-    VEC_STORE16( destv, &dsth[x+i_stride*y] );              \
+    VEC_STORE16( destv, &dsth[x+i_stride*y], dsth );        \
  }
  
  #define HPEL_FILTER_VERTICAL()                               \
  {                                                            \
-    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t ); \
+    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
                                                               \
      temp1v = vec_u8_to_s16_h( src1v );                       \
      temp2v = vec_u8_to_s16_h( src2v );                       \
@@ -508,7 +510,7 @@ static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
                                                               \
      destv = vec_packsu( dest1v, dest2v );                    \
                                                               \
-    VEC_STORE16( destv, &dstv[x+i_stride*y] );               \
+    VEC_STORE16( destv, &dstv[x+i_stride*y], dsth );         \
  }
  
  #define HPEL_FILTER_CENTRAL()                     \
@@ -541,7 +543,7 @@ static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
                                                    \
      destv = vec_packsu( dest1v, dest2v );         \
                                                    \
-    VEC_STORE16( destv, &dstc[x-16+i_stride*y] ); \
+    VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
  }
  
  void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
@@ -556,7 +558,9 @@ void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint
      vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
  
      PREP_LOAD;
+    PREP_LOAD_SRC( src);
      PREP_STORE16;
+    PREP_STORE16_DST( dsth );
      LOAD_ZERO;
  
      vec_u16_t twov, fourv, fivev, sixv;
@@ -612,12 +616,12 @@ void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint
          }
  
          /* Partial vertical filter */
-        VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );
  
          temp1v = vec_u8_to_s16_h( src1v );
          temp2v = vec_u8_to_s16_h( src2v );
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c

index adf728f6ac2c7bf45a19ca1a4be46efbcff7a290..6a7218c451d4ed2ce74df63d322957ac520702df 100644 (file)
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -45,8 +45,8 @@ static int name( uint8_t *pix1, int i_pix1,            \
      vec_s32_t sumv = zero_s32v;                        \
      for( y = 0; y < ly; y++ )                          \
      {                                                  \
-        VEC_LOAD( pix1, pix1v, lx, vec_u8_t );         \
-        VEC_LOAD( pix2, pix2v, lx, vec_u8_t );         \
+        VEC_LOAD_G( pix1, pix1v, lx, vec_u8_t );       \
+        VEC_LOAD_G( pix2, pix2v, lx, vec_u8_t );       \
          sumv = (vec_s32_t) vec_sum4s(                  \
                     vec_sub( vec_max( pix1v, pix2v ),   \
                              vec_min( pix1v, pix2v ) ), \
@@ -123,14 +123,20 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
      DECLARE_ALIGNED_16( int i_satd );
  
      PREP_DIFF;
+    PREP_LOAD_SRC( pix1 );
      vec_s16_t diff0v, diff1v, diff2v, diff3v;
      vec_s16_t temp0v, temp1v, temp2v, temp3v;
      vec_s32_t satdv;
  
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
  
      /* Hadamar H */
      VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
@@ -167,10 +173,14 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
      vec_s16_t temp0v, temp1v, temp2v, temp3v;
      vec_s32_t satdv;
  
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+    PREP_LOAD_SRC( pix1 );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
      VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                   temp0v, temp1v, temp2v, temp3v );
      VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
@@ -182,10 +192,10 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
      VEC_ADD_ABS( temp2v, satdv,     satdv );
      VEC_ADD_ABS( temp3v, satdv,     satdv );
  
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
      VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                   temp0v, temp1v, temp2v, temp3v );
      VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
@@ -219,10 +229,16 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
                temp4v, temp5v, temp6v, temp7v;
      vec_s32_t satdv;
  
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+
+    PREP_LOAD_SRC( pix1 );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
  
      VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                   temp0v, temp1v, temp2v, temp3v );
@@ -268,14 +284,19 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
                temp4v, temp5v, temp6v, temp7v;
      vec_s32_t satdv;
  
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    PREP_LOAD_SRC( pix1 );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
  
      VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                   temp0v, temp1v, temp2v, temp3v );
@@ -323,14 +344,18 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
                temp4v, temp5v, temp6v, temp7v;
      vec_s32_t satdv;
  
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    PREP_LOAD_SRC( pix1 );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v , offset1v);
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
      VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                   temp0v, temp1v, temp2v, temp3v );
      VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
@@ -352,14 +377,14 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
      VEC_ADD_ABS( temp6v, satdv,     satdv );
      VEC_ADD_ABS( temp7v, satdv,     satdv );
  
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
      VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                   temp0v, temp1v, temp2v, temp3v );
      VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
@@ -398,6 +423,7 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
  
      LOAD_ZERO;
      PREP_LOAD;
+    PREP_LOAD_SRC( pix2 );
      vec_s32_t satdv;
      vec_s16_t pix1v, pix2v;
      vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
@@ -489,6 +515,8 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
                diffl4v, diffl5v, diffl6v, diffl7v;
      vec_s16_t temp0v, temp1v, temp2v, temp3v,
                temp4v, temp5v, temp6v, temp7v;
+    PREP_LOAD_SRC( pix2 );
+
  
      VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
      VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
@@ -1715,18 +1743,20 @@ static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2
      int32_t i_satd=0;
  
      PREP_DIFF;
+    PREP_LOAD_SRC( pix1 );
+    PREP_LOAD_SRC( pix2 );
  
      vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
  
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, pix2 );
  
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, pix2 );
  
      vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;
  
@@ -1806,14 +1836,16 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
      vec_u8_t pix1v, pix2v;
      vec_u32_t s1v, s2v, ssv, s12v;
      PREP_LOAD;
+    PREP_LOAD_SRC (pix1);
+    PREP_LOAD_SRC (pix2);
      LOAD_ZERO;
  
      s1v = s2v = ssv = s12v = zero_u32v;
  
      for(y=0; y<4; y++)
      {
-        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t );
-        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t );
+        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t, pix1 );
+        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t, pix2 );
  
          s1v = vec_sum4s( pix1v, s1v );
          s2v = vec_sum4s( pix2v, s2v );
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h

index 2756e3851fc6aedfd02eb9be60705963f6ed4e59..9ad97c6b962a89787f56428ad7f1a2934a4637fc 100644 (file)
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -80,16 +80,36 @@ typedef union {
  
  /***********************************************************************
   * PREP_LOAD: declares two vectors required to perform unaligned loads
- * VEC_LOAD:  loads n bytes from u8 * p into vector v of type t
+ * VEC_LOAD:  loads n bytes from u8 * p into vector v of type t, using the offset vector declared by PREP_LOAD_SRC( g )
+ * VEC_LOAD_G: loads n bytes from u8 * p into vector v of type t - use when offset is not known
+ * VEC_LOAD_OFFSET: as above, but with offset vector known in advance
   **********************************************************************/
  #define PREP_LOAD \
      vec_u8_t _hv, _lv
  
-#define VEC_LOAD( p, v, n, t )                  \
+#define PREP_LOAD_SRC( src ) \
+    vec_u8_t _##src##_ = vec_lvsl(0, src) 
+
+#define VEC_LOAD_G( p, v, n, t )                  \
      _hv = vec_ld( 0, p );                       \
      v   = (t) vec_lvsl( 0, p );                 \
      _lv = vec_ld( n - 1, p );                   \
-    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
+    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) v ) 
+
+#define VEC_LOAD( p, v, n, t, g )              \
+    _hv = vec_ld( 0, p );                       \
+    _lv = vec_ld( n - 1, p );                   \
+    v = (t) vec_perm( _hv, _lv, (vec_u8_t) _##g##_ )
+
+#define VEC_LOAD_OFFSET( p, v, n, t, o )        \
+    _hv = vec_ld( 0, p);                        \
+    _lv = vec_ld( n - 1, p );                   \
+    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) o )
+
+#define VEC_LOAD_PARTIAL( p, v, n, t, g)    \
+    _hv = vec_ld( 0, p);                        \
+    v   = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )
+    
  
  /***********************************************************************
   * PREP_STORE##n: declares required vectors to store n bytes to a
@@ -97,59 +117,35 @@ typedef union {
   * VEC_STORE##n:  stores n bytes from vector v to address p
   **********************************************************************/
  #define PREP_STORE16 \
-    vec_u8_t _tmp1v, _tmp2v \
+    vec_u8_t _tmp1v\
  
-#define VEC_STORE16( v, p ) \
+#define PREP_STORE16_DST( dst ) \
+    vec_u8_t _##dst##l_ = vec_lvsl(0, dst); \
+    vec_u8_t _##dst##r_ = vec_lvsr(0, dst);
+
+#define VEC_STORE16( v, p, o ) \
      _hv    = vec_ld( 0, p ); \
-    _tmp2v = vec_lvsl( 0, p ); \
      _lv    = vec_ld( 15, p ); \
-    _tmp1v = vec_perm( _lv, _hv, _tmp2v ); \
-    _tmp2v = vec_lvsr( 0, p ); \
-    _lv    = vec_perm( (vec_u8_t) v, _tmp1v, _tmp2v ); \
+    _tmp1v = vec_perm( _lv, _hv, _##o##l_ ); \
+    _lv    = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
      vec_st( _lv, 15, (uint8_t *) p ); \
-    _hv    = vec_perm( _tmp1v, (vec_u8_t) v, _tmp2v ); \
-    vec_st( _hv, 0, (uint8_t *) p )
+    _hv    = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
+    vec_st( _hv, 0, (uint8_t *) p ) 
+
  
  #define PREP_STORE8 \
-    PREP_STORE16; \
-    vec_u8_t _tmp3v, _tmp4v; \
-    const vec_u8_t sel_h = \
-        (vec_u8_t) CV(-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0)
-
-#define PREP_STORE8_HL \
-    PREP_STORE8; \
-    const vec_u8_t sel_l = \
-        (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1)
-
-#define VEC_STORE8 VEC_STORE8_H
-
-#define VEC_STORE8_H( v, p ) \
-    _tmp3v = vec_lvsr( 0, (uint8_t *) p ); \
-    _tmp4v = vec_perm( (vec_u8_t) v, (vec_u8_t) v, _tmp3v ); \
-    _lv    = vec_ld( 7, (uint8_t *) p ); \
-    _tmp1v = vec_perm( sel_h, zero_u8v, _tmp3v ); \
-    _lv    = vec_sel( _lv, _tmp4v, _tmp1v ); \
-    vec_st( _lv, 7, (uint8_t *) p ); \
-    _hv    = vec_ld( 0, (uint8_t *) p ); \
-    _tmp2v = vec_perm( zero_u8v, sel_h, _tmp3v ); \
-    _hv    = vec_sel( _hv, _tmp4v, _tmp2v ); \
-    vec_st( _hv, 0, (uint8_t *) p )
-
-#define VEC_STORE8_L( v, p ) \
-    _tmp3v = vec_lvsr( 8, (uint8_t *) p ); \
-    _tmp4v = vec_perm( (vec_u8_t) v, (vec_u8_t) v, _tmp3v ); \
-    _lv    = vec_ld( 7, (uint8_t *) p ); \
-    _tmp1v = vec_perm( sel_l, zero_u8v, _tmp3v ); \
-    _lv    = vec_sel( _lv, _tmp4v, _tmp1v ); \
-    vec_st( _lv, 7, (uint8_t *) p ); \
-    _hv    = vec_ld( 0, (uint8_t *) p ); \
-    _tmp2v = vec_perm( zero_u8v, sel_l, _tmp3v ); \
-    _hv    = vec_sel( _hv, _tmp4v, _tmp2v ); \
-    vec_st( _hv, 0, (uint8_t *) p )
+    vec_u8_t _tmp3v  \
+
+#define VEC_STORE8( v, p ) \
+    _tmp3v = vec_lvsl(0, p); \
+    v = vec_perm(v, v, _tmp3v); \
+    vec_ste((vec_u32_t)v,0,(uint32_t*)p); \
+    vec_ste((vec_u32_t)v,4,(uint32_t*)p)
+
  
  #define PREP_STORE4 \
      PREP_STORE16; \
-    vec_u8_t _tmp3v; \
+    vec_u8_t _tmp2v, _tmp3v; \
      const vec_u8_t sel = \
          (vec_u8_t) CV(-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0)
  
@@ -226,17 +222,18 @@ typedef union {
   * d:         s16v
   *
   * Loads n bytes from p1 and p2, do the diff of the high elements into
- * d, increments p1 and p2 by i1 and i2
+ * d, increments p1 and p2 by i1 and i2; p2 is loaded using the known offset of g
   **********************************************************************/
  #define PREP_DIFF           \
      LOAD_ZERO;              \
      PREP_LOAD;              \
      vec_s16_t pix1v, pix2v;
  
-#define VEC_DIFF_H(p1,i1,p2,i2,n,d)      \
-    VEC_LOAD( p1, pix1v, n, vec_s16_t ); \
+
+#define VEC_DIFF_H(p1,i1,p2,i2,n,d,g)      \
+    VEC_LOAD_PARTIAL( p1, pix1v, n, vec_s16_t, p1); \
      pix1v = vec_u8_to_s16( pix1v );      \
-    VEC_LOAD( p2, pix2v, n, vec_s16_t ); \
+    VEC_LOAD( p2, pix2v, n, vec_s16_t, g); \
      pix2v = vec_u8_to_s16( pix2v );      \
      d     = vec_sub( pix1v, pix2v );     \
      p1   += i1;                          \
@@ -254,10 +251,10 @@ typedef union {
   * and i2
   **********************************************************************/
  #define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl)    \
-    VEC_LOAD( p1, pix1v, 16, vec_s16_t ); \
+    pix1v = vec_ld(0, p1);                  \
      temp0v = vec_u8_to_s16_h( pix1v );    \
      temp1v = vec_u8_to_s16_l( pix1v );    \
-    VEC_LOAD( p2, pix2v, 16, vec_s16_t ); \
+    VEC_LOAD( p2, pix2v, 16, vec_s16_t, p2); \
      temp2v = vec_u8_to_s16_h( pix2v );    \
      temp3v = vec_u8_to_s16_l( pix2v );    \
      dh     = vec_sub( temp0v, temp2v );   \
author	David Wolstencroft <wolstencroft@alum.rpi.edu>
	Sat, 22 Nov 2008 16:54:38 +0000 (17:54 +0100)
committer	Guillaume Poirier <gpoirier@mplayerhq.hu>
	Tue, 25 Nov 2008 16:29:00 +0000 (17:29 +0100)
AUTHORS		patch \| blob \| history
common/ppc/deblock.c		patch \| blob \| history
common/ppc/mc.c		patch \| blob \| history
common/ppc/pixel.c		patch \| blob \| history
common/ppc/ppccommon.h		patch \| blob \| history