/* common/ppc/mc.c — AltiVec (PowerPC) motion compensation for x264. */
1 /*****************************************************************************
2  * mc.c: h264 encoder library (Motion Compensation)
3  *****************************************************************************
4  * Copyright (C) 2003-2008 x264 project
5  *
6  * Authors: Eric Petit <eric.petit@lapsus.org>
7  *          Guillaume Poirier <gpoirier@mplayerhq.hu>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *****************************************************************************/
23
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <stdint.h>
28 #include <stdarg.h>
29
30 #include "x264.h"
31 #include "common/common.h"
32 #include "common/mc.h"
33 #include "mc.h"
34 #include "ppccommon.h"
35
/* Generic motion-compensation function pointer: processes an
 * i_height-tall block from src (stride i_src) into dst (stride i_dst).
 * NOTE(review): not referenced in the visible part of this file. */
typedef void (*pf_mc_t)( uint8_t *src, int i_src,
                         uint8_t *dst, int i_dst, int i_height );
38
39
/* For each quarter-pel position (qpel_idx = ((mvy&3)<<2) + (mvx&3)),
 * indices into the src[4] plane array selecting the two half-pel
 * planes whose average gives the quarter-pel prediction.  hpel_ref0
 * is the first plane, hpel_ref1 the second (used only when
 * qpel_idx & 5, i.e. when interpolation is actually needed). */
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
42
43
/* 6-tap half-pel filter (taps 1,-5,20,20,-5,1) along a direction given
 * by the sample step i_pix_next.  Returns the unscaled, unclipped sum. */
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
    const int outer  = pix[-2*i_pix_next] + pix[ 3*i_pix_next];
    const int inner  = pix[-1*i_pix_next] + pix[ 2*i_pix_next];
    const int center = pix[0]             + pix[ 1*i_pix_next];
    return outer - 5*inner + 20*center;
}
/* Unit-stride specialization of the 6-tap filter above. */
static inline int x264_tapfilter1( uint8_t *pix )
{
    const int outer  = pix[-2] + pix[ 3];
    const int inner  = pix[-1] + pix[ 2];
    const int center = pix[ 0] + pix[ 1];
    return outer - 5*inner + 20*center;
}
55
56
/* Rounded average of two 4-wide blocks: dst = (src1 + src2 + 1) >> 1.
 * Plain C — a 4-byte row is too narrow to benefit from AltiVec.
 * Both sources advance by the same stride i_src1. */
static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst,  int i_dst,
                                               uint8_t *src1, int i_src1,
                                               uint8_t *src2, int i_height )
{
    int line;
    for( line = 0; line < i_height; line++ )
    {
        dst[0] = ( src1[0] + src2[0] + 1 ) >> 1;
        dst[1] = ( src1[1] + src2[1] + 1 ) >> 1;
        dst[2] = ( src1[2] + src2[2] + 1 ) >> 1;
        dst[3] = ( src1[3] + src2[3] + 1 ) >> 1;
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
73
/* Rounded average of two 8-wide blocks using AltiVec:
 * dst = (src1 + src2 + 1) >> 1 per byte (vec_avg rounds up).
 * Both sources advance by the same stride i_src1. */
static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst,  int i_dst,
                                               uint8_t *src1, int i_src1,
                                               uint8_t *src2, int i_height )
{
    int y;
    vec_u8_t src1v, src2v;
    PREP_LOAD;             /* ppccommon.h unaligned load/store helpers */
    PREP_STORE8;
    PREP_LOAD_SRC( src1 );
    PREP_LOAD_SRC( src2 );

    for( y = 0; y < i_height; y++ )
    {
        VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
        VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
        src1v = vec_avg( src1v, src2v ); /* per-byte rounded average */
        VEC_STORE8( src1v, dst );        /* only 8 bytes written */

        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
97
/* Rounded average of two 16-wide blocks using AltiVec.
 * NOTE(review): vec_st stores 16 aligned bytes, so dst and i_dst are
 * assumed 16-byte aligned — confirm at call sites. */
static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst,  int i_dst,
                                                uint8_t *src1, int i_src1,
                                                uint8_t *src2, int i_height )
{
    int y;
    vec_u8_t src1v, src2v;
    PREP_LOAD;
    PREP_LOAD_SRC( src1 );
    PREP_LOAD_SRC( src2 );

    for( y = 0; y < i_height; y++ )
    {
        VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
        VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
        src1v = vec_avg( src1v, src2v ); /* per-byte rounded average */
        vec_st(src1v, 0, dst);           /* aligned 16-byte store */

        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
120
/* Rounded average of two 20-wide blocks: a scalar 4-wide tail next to
 * a vectorized 16-wide body.  The two regions are disjoint, so the
 * order of the calls does not matter. */
static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst,  int i_dst,
                                                uint8_t *src1, int i_src1,
                                                uint8_t *src2, int i_height )
{
    x264_pixel_avg2_w4_altivec( dst + 16, i_dst, src1 + 16, i_src1, src2 + 16, i_height );
    x264_pixel_avg2_w16_altivec( dst, i_dst, src1, i_src1, src2, i_height );
}
128
129 /* mc_copy: plain c */
130
/* Generate a plain-C block copy: i_height rows of `a` bytes each,
 * from src (stride i_src) into dst (stride i_dst). */
#define MC_COPY( name, a )                                \
static void name( uint8_t *dst, int i_dst,                \
                  uint8_t *src, int i_src, int i_height ) \
{                                                         \
    int line;                                             \
    for( line = i_height; line > 0; line-- )              \
    {                                                     \
        memcpy( dst, src, a );                            \
        dst += i_dst;                                     \
        src += i_src;                                     \
    }                                                     \
}
MC_COPY( x264_mc_copy_w4_altivec,  4  )
MC_COPY( x264_mc_copy_w8_altivec,  8  )
145
/* 16-wide block copy using AltiVec: one unaligned vector load and one
 * aligned vector store per row.
 * NOTE(review): vec_st assumes dst/i_dst are 16-byte aligned — confirm
 * at call sites. */
static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,
                                      uint8_t *src, int i_src, int i_height )
{
    int y;
    vec_u8_t cpyV;
    PREP_LOAD;
    PREP_LOAD_SRC( src );

    for( y = 0; y < i_height; y++)
    {
        VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
        vec_st(cpyV, 0, dst);

        src += i_src;
        dst += i_dst;
    }
}
163
164
165 static void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
166                              uint8_t *src[4], int i_src_stride,
167                              int mvx, int mvy,
168                              int i_width, int i_height )
169 {
170     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
171     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
172     uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
173     if( qpel_idx & 5 ) /* qpel interpolation needed */
174     {
175         uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
176
177         switch(i_width) {
178         case 4:
179             x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
180             break;
181         case 8:
182             x264_pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
183             break;
184         case 16:
185         default:
186             x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
187         }
188
189     }
190     else
191     {
192         switch(i_width) {
193         case 4:
194             x264_mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
195             break;
196         case 8:
197             x264_mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
198             break;
199         case 16:
200             x264_mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
201             break;
202         }
203     }
204 }
205
206
207
208 static uint8_t *get_ref_altivec( uint8_t *dst,   int *i_dst_stride,
209                                  uint8_t *src[4], int i_src_stride,
210                                  int mvx, int mvy,
211                                  int i_width, int i_height )
212 {
213     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
214     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
215     uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
216     if( qpel_idx & 5 ) /* qpel interpolation needed */
217     {
218         uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
219         switch(i_width) {
220         case 4:
221             x264_pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
222             break;
223         case 8:
224             x264_pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
225             break;
226         case 12:
227         case 16:
228         default:
229             x264_pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
230             break;
231         case 20:
232             x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
233             break;
234         }
235         return dst;
236     }
237     else
238     {
239         *i_dst_stride = i_src_stride;
240         return src1;
241     }
242 }
243
/* Widen the 8-bit source vector src<a>v_8 to 16 bits, multiply it by
 * the broadcast bilinear weight coeff<a>v, and accumulate into dstv_16. */
#define DO_PROCESS(a) \
        src##a##v_16 = vec_u8_to_u16( src##a##v_8 ); \
        src##a##v_16 = vec_mladd( coeff##a##v, src##a##v_16, zero_u16v ); \
        dstv_16      = vec_add( dstv_16, src##a##v_16 )
248
/* 4-wide chroma MC: bilinear interpolation between the four
 * neighbouring full-pel samples with 1/8-pel weights taken from the
 * low 3 bits of the MV; dst = (weighted sum + 32) >> 6. */
static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
                                   uint8_t *src, int i_src_stride,
                                   int mvx, int mvy,
                                   int i_height )
{
    uint8_t *srcp;
    int y;
    int d8x = mvx & 0x07; /* 1/8-pel fractional MV components */
    int d8y = mvy & 0x07;

    /* Bilinear weights; the four always sum to 64. */
    DECLARE_ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x    *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x    *d8y;

    /* Integer MV part selects the top-left source sample. */
    src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
    srcp  = &src[i_src_stride]; /* row below the current one */

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    PREP_STORE4;
    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
    vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16;
    vec_u8_t    dstv_8;
    vec_u16_t   dstv_16;
    vec_u8_t    permv;
    vec_u16_t   shiftv;
    vec_u16_t   k32v;

    /* Broadcast each weight across its own vector. */
    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); /* rounding bias 32 */
    permv   = vec_lvsl( 0, (uint8_t *) 1 );  /* permute = shift left one byte (x+1) */
    shiftv  = vec_splat_u16( 6 );            /* >> 6 == / 64 */

    /* Prime the pipeline: current row and its one-byte-shifted copy. */
    VEC_LOAD( src, src2v_8, 5, vec_u8_t, src );
    src3v_8 = vec_perm( src2v_8, src2v_8, permv );

    for( y = 0; y < i_height; y++ )
    {
        /* Reuse last iteration's bottom row as this iteration's top row. */
        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        VEC_LOAD_G( srcp, src2v_8, 5, vec_u8_t );
        src3v_8 = vec_perm( src2v_8, src2v_8, permv );

        dstv_16 = k32v; /* start accumulator at the rounding bias */

        DO_PROCESS( 0 ); /* + coeff0 * src[x,   y]   */
        DO_PROCESS( 1 ); /* + coeff1 * src[x+1, y]   */
        DO_PROCESS( 2 ); /* + coeff2 * src[x,   y+1] */
        DO_PROCESS( 3 ); /* + coeff3 * src[x+1, y+1] */

        dstv_16 = vec_sr( dstv_16, shiftv );
        dstv_8  = vec_u16_to_u8( dstv_16 );
        VEC_STORE4( dstv_8, dst ); /* only 4 bytes written */

        dst  += i_dst_stride;
        srcp += i_src_stride;
    }
}
315
/* 8-wide chroma MC: same bilinear scheme as the 4xh version, but
 * loading 9 source bytes per row and storing 8 output bytes. */
static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
                                   uint8_t *src, int i_src_stride,
                                   int mvx, int mvy,
                                   int i_height )
{
    uint8_t *srcp;
    int y;
    int d8x = mvx & 0x07; /* 1/8-pel fractional MV components */
    int d8y = mvy & 0x07;

    /* Bilinear weights; the four always sum to 64. */
    DECLARE_ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x    *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x    *d8y;

    /* Integer MV part selects the top-left source sample. */
    src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
    srcp  = &src[i_src_stride]; /* row below the current one */

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    PREP_STORE8;
    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
    vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16;
    vec_u8_t    dstv_8;
    vec_u16_t   dstv_16;
    vec_u8_t    permv;
    vec_u16_t   shiftv;
    vec_u16_t   k32v;

    /* Broadcast each weight across its own vector. */
    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); /* rounding bias 32 */
    permv   = vec_lvsl( 0, (uint8_t *) 1 );  /* permute = shift left one byte (x+1) */
    shiftv  = vec_splat_u16( 6 );            /* >> 6 == / 64 */

    /* Prime the pipeline: current row and its one-byte-shifted copy. */
    VEC_LOAD( src, src2v_8, 9, vec_u8_t, src);
    src3v_8 = vec_perm( src2v_8, src2v_8, permv );

    for( y = 0; y < i_height; y++ )
    {
        /* Reuse last iteration's bottom row as this iteration's top row. */
        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        VEC_LOAD_G( srcp, src2v_8, 9, vec_u8_t );
        src3v_8 = vec_perm( src2v_8, src2v_8, permv );

        dstv_16 = k32v; /* start accumulator at the rounding bias */

        DO_PROCESS( 0 ); /* + coeff0 * src[x,   y]   */
        DO_PROCESS( 1 ); /* + coeff1 * src[x+1, y]   */
        DO_PROCESS( 2 ); /* + coeff2 * src[x,   y+1] */
        DO_PROCESS( 3 ); /* + coeff3 * src[x+1, y+1] */

        dstv_16 = vec_sr( dstv_16, shiftv );
        dstv_8  = vec_u16_to_u8( dstv_16 );
        VEC_STORE8( dstv_8, dst ); /* only 8 bytes written */

        dst  += i_dst_stride;
        srcp += i_src_stride;
    }
}
382
/* Chroma MC dispatcher: width 8 takes the 8-wide path, everything
 * else (i.e. width 4, as before) takes the 4-wide path. */
static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
                               uint8_t *src, int i_src_stride,
                               int mvx, int mvy,
                               int i_width, int i_height )
{
    if( i_width != 8 )
        mc_chroma_altivec_4xh( dst, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
    else
        mc_chroma_altivec_8xh( dst, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
}
399
/* 6-tap filter core on 16-bit lanes (taps 1,-5,20,20,-5,1):
 * result in t1v = a - 5*b + 20*c, where a = t1+t6, b = t2+t5,
 * c = t3+t4.  t2v and t3v are clobbered.  Uses twov/fourv from the
 * caller's scope as shift amounts. */
#define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \
{                                                     \
    t1v = vec_add( t1v, t6v );  /* a = t1+t6 */       \
    t2v = vec_add( t2v, t5v );  /* b = t2+t5 */       \
    t3v = vec_add( t3v, t4v );  /* c = t3+t4 */       \
                                                      \
    t1v = vec_sub( t1v, t2v );   /* (a-b) */          \
    t2v = vec_sub( t2v, t3v );   /* (b-c) */          \
    t2v = vec_sl(  t2v, twov );  /* (b-c)*4 */        \
    t1v = vec_sub( t1v, t2v );   /* a-5*b+4*c */      \
    t3v = vec_sl(  t3v, fourv ); /* 16*c */           \
    t1v = vec_add( t1v, t3v );   /* a-5*b+20*c */     \
}
413
/* Same 6-tap filter but pre-scaled by ~1/16 via arithmetic shifts, so
 * the intermediates of the 2-D central filter stay within 16 bits.
 * Result in t1v; t2v/t3v clobbered. */
#define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) \
{                                                     \
    t1v = vec_add( t1v, t6v );                        \
    t2v = vec_add( t2v, t5v );                        \
    t3v = vec_add( t3v, t4v );                        \
                                                      \
    t1v = vec_sub( t1v, t2v );  /* (a-b) */           \
    t1v = vec_sra( t1v, twov ); /* (a-b)/4 */         \
    t1v = vec_sub( t1v, t2v );  /* (a-b)/4-b */       \
    t1v = vec_add( t1v, t3v );  /* (a-b)/4-b+c */     \
    t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ \
    t1v = vec_add( t1v, t3v );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
}
427
/* Horizontal half-pel filter for 16 output pixels at (x..x+15, y):
 * dsth[x] = clip( (6-tap(src[x-2..x+3]) + 16) >> 5 ).  Loads 21 source
 * bytes, derives the six tap vectors with vec_sld, filters the high
 * and low 8 lanes in 16-bit precision, then packs with saturation. */
#define HPEL_FILTER_HORIZONTAL()                             \
{                                                            \
    VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
    VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
                                                             \
    /* taps at offsets -1..+3 relative to src1v's -2 start */\
    src2v = vec_sld( src1v, src6v,  1 );                     \
    src3v = vec_sld( src1v, src6v,  2 );                     \
    src4v = vec_sld( src1v, src6v,  3 );                     \
    src5v = vec_sld( src1v, src6v,  4 );                     \
    src6v = vec_sld( src1v, src6v,  5 );                     \
                                                             \
    /* high 8 lanes, widened to s16 */                       \
    temp1v = vec_u8_to_s16_h( src1v );                       \
    temp2v = vec_u8_to_s16_h( src2v );                       \
    temp3v = vec_u8_to_s16_h( src3v );                       \
    temp4v = vec_u8_to_s16_h( src4v );                       \
    temp5v = vec_u8_to_s16_h( src5v );                       \
    temp6v = vec_u8_to_s16_h( src6v );                       \
                                                             \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
                   temp4v, temp5v, temp6v );                 \
                                                             \
    dest1v = vec_add( temp1v, sixteenv );                    \
    dest1v = vec_sra( dest1v, fivev );   /* (+16)>>5 */      \
                                                             \
    /* low 8 lanes */                                        \
    temp1v = vec_u8_to_s16_l( src1v );                       \
    temp2v = vec_u8_to_s16_l( src2v );                       \
    temp3v = vec_u8_to_s16_l( src3v );                       \
    temp4v = vec_u8_to_s16_l( src4v );                       \
    temp5v = vec_u8_to_s16_l( src5v );                       \
    temp6v = vec_u8_to_s16_l( src6v );                       \
                                                             \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
                   temp4v, temp5v, temp6v );                 \
                                                             \
    dest2v = vec_add( temp1v, sixteenv );                    \
    dest2v = vec_sra( dest2v, fivev );                       \
                                                             \
    destv = vec_packsu( dest1v, dest2v ); /* saturating pack */ \
                                                             \
    VEC_STORE16( destv, &dsth[x+i_stride*y], dsth );         \
}
469
/* Vertical half-pel filter for 16 output pixels at (x, y..):
 * dstv[x] = clip( (6-tap over rows y-2..y+3 + 16) >> 5 ).
 * The unscaled 16-bit tap sums are deliberately left in temp1v (high
 * lanes) and temp4v (low lanes): the caller saves them as input to
 * the central (HV) filter. */
#define HPEL_FILTER_VERTICAL()                                    \
{                                                                 \
    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
                                                                  \
    /* high 8 lanes */                                            \
    temp1v = vec_u8_to_s16_h( src1v );                            \
    temp2v = vec_u8_to_s16_h( src2v );                            \
    temp3v = vec_u8_to_s16_h( src3v );                            \
    temp4v = vec_u8_to_s16_h( src4v );                            \
    temp5v = vec_u8_to_s16_h( src5v );                            \
    temp6v = vec_u8_to_s16_h( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                        \
                   temp4v, temp5v, temp6v );                      \
                                                                  \
    dest1v = vec_add( temp1v, sixteenv );                         \
    dest1v = vec_sra( dest1v, fivev );   /* (+16)>>5 */           \
                                                                  \
    /* low 8 lanes, in temp4v..temp9v so temp1v survives */       \
    temp4v = vec_u8_to_s16_l( src1v );                            \
    temp5v = vec_u8_to_s16_l( src2v );                            \
    temp6v = vec_u8_to_s16_l( src3v );                            \
    temp7v = vec_u8_to_s16_l( src4v );                            \
    temp8v = vec_u8_to_s16_l( src5v );                            \
    temp9v = vec_u8_to_s16_l( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp4v, temp5v, temp6v,                        \
                   temp7v, temp8v, temp9v );                      \
                                                                  \
    dest2v = vec_add( temp4v, sixteenv );                         \
    dest2v = vec_sra( dest2v, fivev );                            \
                                                                  \
    destv = vec_packsu( dest1v, dest2v );                         \
                                                                  \
    /* store constants prepared for dsth are reused for dstv */   \
    VEC_STORE16( destv, &dstv[x+i_stride*y], dsth );              \
}
509
/* Central (HV) half-pel filter: applies the horizontal 6-tap to the
 * vertically-filtered 16-bit intermediates held in tempav..tempdv
 * (8 lanes each, forming a sliding window), with HPEL_FILTER_2's /16
 * pre-scale keeping the math in 16 bits.  Output (val + 32) >> 6,
 * stored one vector behind the current position at dstc[x-16]. */
#define HPEL_FILTER_CENTRAL()                           \
{                                                       \
    /* taps -2..+3 around tempbv via 16-bit shifts */   \
    temp1v = vec_sld( tempav, tempbv, 12 );             \
    temp2v = vec_sld( tempav, tempbv, 14 );             \
    temp3v = tempbv;                                    \
    temp4v = vec_sld( tempbv, tempcv,  2 );             \
    temp5v = vec_sld( tempbv, tempcv,  4 );             \
    temp6v = vec_sld( tempbv, tempcv,  6 );             \
                                                        \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                   temp4v, temp5v, temp6v );            \
                                                        \
    dest1v = vec_add( temp1v, thirtytwov );             \
    dest1v = vec_sra( dest1v, sixv );   /* (+32)>>6 */  \
                                                        \
    /* same again, one vector (8 lanes) further right */\
    temp1v = vec_sld( tempbv, tempcv, 12 );             \
    temp2v = vec_sld( tempbv, tempcv, 14 );             \
    temp3v = tempcv;                                    \
    temp4v = vec_sld( tempcv, tempdv,  2 );             \
    temp5v = vec_sld( tempcv, tempdv,  4 );             \
    temp6v = vec_sld( tempcv, tempdv,  6 );             \
                                                        \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                   temp4v, temp5v, temp6v );            \
                                                        \
    dest2v = vec_add( temp1v, thirtytwov );             \
    dest2v = vec_sra( dest2v, sixv );                   \
                                                        \
    destv = vec_packsu( dest1v, dest2v );               \
                                                        \
    VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
}
542
/* Compute the three half-pel interpolated planes of src in one pass:
 * dsth (horizontal), dstv (vertical) and dstc (central/HV).  The
 * central filter consumes the vertical filter's 16-bit intermediates,
 * so it runs one 16-pixel vector behind the H/V filters, with
 * tempav..tempev acting as a sliding window of those intermediates.
 * NOTE(review): assumes i_width is handled in 16-pixel steps and that
 * src has the usual row padding for the x-2..x+19 / y-2..y+3 accesses
 * — confirm against the caller's frame allocation. */
void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                               int i_stride, int i_width, int i_height )
{
    int x, y;

    vec_u8_t destv;
    vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
    vec_s16_t dest1v, dest2v;
    vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
    vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;

    PREP_LOAD;
    PREP_LOAD_SRC( src);
    PREP_STORE16;
    PREP_STORE16_DST( dsth );
    LOAD_ZERO;

    /* Splatted shift amounts and rounding constants used by the
     * HPEL_FILTER_* macros above. */
    vec_u16_t twov, fourv, fivev, sixv;
    vec_s16_t sixteenv, thirtytwov;
    vect_ushort_u temp_u;

    temp_u.s[0]=2;
    twov = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=4;
    fourv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=5;
    fivev = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=6;
    sixv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=16;
    sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
    temp_u.s[0]=32;
    thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );

    for( y = 0; y < i_height; y++ )
    {
        x = 0;

        /* Warm-up column: H and V filters for x=0..15; no central
         * output yet (it lags by one vector). */

        /* horizontal_filter */
        HPEL_FILTER_HORIZONTAL();

        /* vertical_filter */
        HPEL_FILTER_VERTICAL();

        /* central_filter: seed the sliding window.  The left edge is
         * replicated via vec_splat of the first vertical lane; the
         * tempav/tempbv values assigned here are overwritten before
         * their first use inside the x loop. */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = vec_splat( temp1v, 0 ); /* first only */
        tempdv = temp1v;   /* vertical intermediates, high 8 lanes */
        tempev = temp4v;   /* vertical intermediates, low 8 lanes  */

        for( x = 16; x < i_width; x+=16 )
        {
            /* horizontal_filter */
            HPEL_FILTER_HORIZONTAL();

            /* vertical_filter */
            HPEL_FILTER_VERTICAL();

            /* central_filter: slide the window one vector and emit
             * the central output for the previous column. */
            tempav = tempcv;
            tempbv = tempdv;
            tempcv = tempev;
            tempdv = temp1v;
            tempev = temp4v;

            HPEL_FILTER_CENTRAL();
        }

        /* Partial vertical filter: one extra column of vertical
         * intermediates past the row end, needed as right-hand taps
         * for the final central output. */
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );

        temp1v = vec_u8_to_s16_h( src1v );
        temp2v = vec_u8_to_s16_h( src2v );
        temp3v = vec_u8_to_s16_h( src3v );
        temp4v = vec_u8_to_s16_h( src4v );
        temp5v = vec_u8_to_s16_h( src5v );
        temp6v = vec_u8_to_s16_h( src6v );

        HPEL_FILTER_1( temp1v, temp2v, temp3v,
                       temp4v, temp5v, temp6v );

        /* central_filter: final slide and last central output of the row */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = tempev;
        tempdv = temp1v;
        /* tempev is not used */

        HPEL_FILTER_CENTRAL();
    }
}
640
641 void x264_mc_altivec_init( x264_mc_functions_t *pf )
642 {
643     pf->mc_luma   = mc_luma_altivec;
644     pf->get_ref   = get_ref_altivec;
645     pf->mc_chroma = mc_chroma_altivec;
646
647     pf->hpel_filter = x264_hpel_filter_altivec;
648 }