git.sesse.net Git - x264/blob - common/ppc/mc.c

   1 /*****************************************************************************
   2  * mc.c: h264 encoder library (Motion Compensation)
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: mc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
   6  *
   7  * Authors: Eric Petit <titer@m0k.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27 #include <stdint.h>
  28 #include <stdarg.h>
  29
  30 #ifdef SYS_LINUX
  31 #include <altivec.h>
  32 #endif
  33
  34 #include "x264.h"
  35 #include "common/mc.h"
  36 #include "common/clip1.h"
  37 #include "mc.h"
  38 #include "ppccommon.h"
  39
  40 typedef void (*pf_mc_t)( uint8_t *src, int i_src,
  41                          uint8_t *dst, int i_dst, int i_height );
  42
  43 static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
  44 {
  45     return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
  46            pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
  47            pix[ 3*i_pix_next];
  48 }
  49 static inline int x264_tapfilter1( uint8_t *pix )
  50 {
  51     return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] +
  52            pix[ 3];
  53 }
  54
  55 /* pixel_avg */
  56 static inline void pixel_avg_w4( uint8_t *dst,  int i_dst,
  57                                  uint8_t *src1, int i_src1,
  58                                  uint8_t *src2, int i_src2,
  59                                  int i_height )
  60 {
  61     int x, y;
  62     for( y = 0; y < i_height; y++ )
  63     {
  64         for( x = 0; x < 4; x++ )
  65         {
  66             dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
  67         }
  68         dst  += i_dst;
  69         src1 += i_src1;
  70         src2 += i_src2;
  71     }
  72 }
  73 static inline void pixel_avg_w8( uint8_t *dst,  int i_dst,
  74                                  uint8_t *src1, int i_src1,
  75                                  uint8_t *src2, int i_src2,
  76                                  int i_height )
  77 {
  78     /* TODO - optimize */
  79     pixel_avg_w4( &dst[0], i_dst, &src1[0], i_src1, &src2[0], i_src2,
  80                   i_height );
  81     pixel_avg_w4( &dst[4], i_dst, &src1[4], i_src1, &src2[4], i_src2,
  82                   i_height );
  83 }
  84 static inline void pixel_avg_w16( uint8_t *dst,  int i_dst,
  85                                   uint8_t *src1, int i_src1,
  86                                   uint8_t *src2, int i_src2,
  87                                   int i_height )
  88 {
  89     int y;
  90     vec_u8_t src1v, src2v;
  91     for( y = 0; y < i_height; y++ )
  92     {
  93         LOAD_16( src1, src1v );
  94         LOAD_16( src2, src2v );
  95         src1v = vec_avg( src1v, src2v );
  96         STORE_16( src1v, dst );
  97
  98         dst  += i_dst;
  99         src1 += i_src1;
 100         src2 += i_src2;
 101     }
 102 }
 103
 104 /* mc_copy: plain c */
 105 #define MC_COPY( name, a )                                \
 106 static void name( uint8_t *src, int i_src,                \
 107                   uint8_t *dst, int i_dst, int i_height ) \
 108 {                                                         \
 109     int y;                                                \
 110     for( y = 0; y < i_height; y++ )                       \
 111     {                                                     \
 112         memcpy( dst, src, a );                            \
 113         src += i_src;                                     \
 114         dst += i_dst;                                     \
 115     }                                                     \
 116 }
 117 MC_COPY( mc_copy_w4,  4  )
 118 MC_COPY( mc_copy_w8,  8  )
 119 MC_COPY( mc_copy_w16, 16 )
 120
 121 /* TAP_FILTER:
 122    a is source (vec_s16_t [6])
 123    b is a temporary vec_s16_t
 124    c is the result
 125
 126    c   = src[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
 127    c  += 16;
 128    c >>= 5;
 129    c  += 80; */
 130 #define TAP_FILTER( a, b, c )                       \
 131     c = vec_add( a[0], a[5] );                      \
 132     b = vec_add( a[1], a[4] );                      \
 133     c = vec_sub( c, b );                            \
 134     b = vec_sl( b, vec_splat_u16( 2 ) );            \
 135     c = vec_sub( c, b );                            \
 136     b = vec_add( a[2], a[3] );                      \
 137     b = vec_sl( b, vec_splat_u16( 2 ) );            \
 138     c = vec_add( c, b );                            \
 139     b = vec_sl( b, vec_splat_u16( 2 ) );            \
 140     c = vec_add( c, b );                            \
 141     c = vec_add( c, vec_splat_s16( 8 ) );           \
 142     c = vec_add( c, vec_splat_s16( 8 ) );           \
 143     c = vec_sr( c, vec_splat_u16( 5 ) );            \
 144     c = vec_add( c, vec_sl( vec_splat_s16( 5 ),     \
 145                             vec_splat_u16( 4 ) ) );
 146
 147 /* mc_hh */
 148 static inline void mc_hh_w4( uint8_t *src, int i_src,
 149                              uint8_t *dst, int i_dst, int i_height )
 150 {
 151     int x, y;
 152     for( y = 0; y < i_height; y++ )
 153     {
 154         for( x = 0; x < 4; x++ )
 155         {
 156             dst[x] = x264_mc_clip1( ( x264_tapfilter1( &src[x] ) +
 157                                       16 ) >> 5 );
 158         }
 159         src += i_src;
 160         dst += i_dst;
 161     }
 162 }
 163 static inline void mc_hh_w8( uint8_t *src, int i_src,
 164                              uint8_t *dst, int i_dst, int i_height )
 165 {
 166     long x, y;
 167     DECLARE_ALIGNED( int16_t, tmp[8], 16 );
 168
 169     LOAD_ZERO;
 170     vec_u8_t    loadv;
 171     vec_s16_t   srcv[6];
 172     vec_u8_t  * _srcv = (vec_u8_t*) srcv;
 173     vec_s16_t   dstv;
 174     vec_s16_t   tmpv;
 175
 176     for( y = 0; y < i_height; y++ )
 177     {
 178         LOAD_16( &src[-2], loadv );
 179
 180         for( x = 0; x < 6; x++ )
 181         {
 182             _srcv[x] = vec_perm( loadv, zero_u8v,
 183                                  vec_lvsl( 0, (int*) x ) );
 184             CONVERT_U8_TO_S16( srcv[x], srcv[x] );
 185         }
 186
 187         TAP_FILTER( srcv, tmpv, dstv );
 188         vec_st( dstv, 0, tmp );
 189
 190         for( x = 0; x < 8; x++ )
 191         {
 192             dst[x] = x264_mc_clip1_table[tmp[x]];
 193         }
 194
 195         src += i_src;
 196         dst += i_dst;
 197     }
 198 }
 199 static inline void mc_hh_w16( uint8_t *src, int i_src,
 200                               uint8_t *dst, int i_dst, int i_height )
 201 {
 202     mc_hh_w8( &src[0], i_src, &dst[0], i_dst, i_height );
 203     mc_hh_w8( &src[8], i_src, &dst[8], i_dst, i_height );
 204 }
 205
 206 /* mc_hv */
 207 static inline void mc_hv_w4( uint8_t *src, int i_src,
 208                              uint8_t *dst, int i_dst, int i_height )
 209 {
 210     int x, y;
 211     for( y = 0; y < i_height; y++ )
 212     {
 213         for( x = 0; x < 4; x++ )
 214         {
 215             dst[x] = x264_mc_clip1( ( x264_tapfilter( &src[x], i_src ) +
 216                                       16 ) >> 5 );
 217         }
 218         src += i_src;
 219         dst += i_dst;
 220     }
 221 }
 222 static inline void mc_hv_w8( uint8_t *src, int i_src,
 223                              uint8_t *dst, int i_dst, int i_height )
 224 {
 225     int x, y;
 226     DECLARE_ALIGNED( int16_t, tmp[8], 16 );
 227
 228     LOAD_ZERO;
 229     vec_s16_t   srcv[6];
 230     vec_u8_t  * _srcv = (vec_u8_t*) srcv;
 231     vec_s16_t   dstv;
 232     vec_s16_t   tmpv;
 233
 234     for( y = 0; y < i_height; y++ )
 235     {
 236         if( y )
 237         {
 238             for( x = 0; x < 5; x++ )
 239             {
 240                 srcv[x] = srcv[x+1];
 241             }
 242             LOAD_8( &src[3*i_src], _srcv[5] );
 243             CONVERT_U8_TO_S16( srcv[5], srcv[5] );
 244         }
 245         else
 246         {
 247             for( x = 0; x < 6; x++ )
 248             {
 249                 LOAD_8( &src[(x-2)*i_src], _srcv[x] );
 250                 CONVERT_U8_TO_S16( srcv[x], srcv[x] );
 251             }
 252         }
 253
 254         TAP_FILTER( srcv, tmpv, dstv );
 255         vec_st( dstv, 0, tmp );
 256
 257         for( x = 0; x < 8; x++ )
 258         {
 259             dst[x] = x264_mc_clip1_table[tmp[x]];
 260         }
 261         src += i_src;
 262         dst += i_dst;
 263     }
 264 }
 265 static inline void mc_hv_w16( uint8_t *src, int i_src,
 266                               uint8_t *dst, int i_dst, int i_height )
 267 {
 268     mc_hv_w8( &src[0], i_src, &dst[0], i_dst, i_height );
 269     mc_hv_w8( &src[8], i_src, &dst[8], i_dst, i_height );
 270 }
 271
 272 /* mc_hc */
 273 static inline void mc_hc_w4( uint8_t *src, int i_src,
 274                              uint8_t *dst, int i_dst, int i_height )
 275 {
 276     uint8_t *out;
 277     uint8_t *pix;
 278     int x, y;
 279
 280     for( x = 0; x < 4; x++ )
 281     {
 282         int tap[6];
 283
 284         pix = &src[x];
 285         out = &dst[x];
 286
 287         tap[0] = x264_tapfilter1( &pix[-2*i_src] );
 288         tap[1] = x264_tapfilter1( &pix[-1*i_src] );
 289         tap[2] = x264_tapfilter1( &pix[ 0*i_src] );
 290         tap[3] = x264_tapfilter1( &pix[ 1*i_src] );
 291         tap[4] = x264_tapfilter1( &pix[ 2*i_src] );
 292
 293         for( y = 0; y < i_height; y++ )
 294         {
 295             tap[5] = x264_tapfilter1( &pix[ 3*i_src] );
 296
 297             *out = x264_mc_clip1( ( tap[0] - 5*tap[1] + 20 * tap[2] +
 298                                     20 * tap[3] -5*tap[4] + tap[5] +
 299                                     512 ) >> 10 );
 300
 301             /* Next line */
 302             pix += i_src;
 303             out += i_dst;
 304             tap[0] = tap[1];
 305             tap[1] = tap[2];
 306             tap[2] = tap[3];
 307             tap[3] = tap[4];
 308             tap[4] = tap[5];
 309         }
 310     }
 311 }
 312 static inline void mc_hc_w8( uint8_t *src, int i_src,
 313                              uint8_t *dst, int i_dst, int i_height )
 314 {
 315     /* TODO: optimize */
 316     mc_hc_w4( &src[0], i_src, &dst[0], i_dst, i_height );
 317     mc_hc_w4( &src[4], i_src, &dst[4], i_dst, i_height );
 318 }
 319 static inline void mc_hc_w16( uint8_t *src, int i_src,
 320                               uint8_t *dst, int i_dst, int i_height )
 321 {
 322     mc_hc_w8( &src[0], i_src, &dst[0], i_dst, i_height );
 323     mc_hc_w8( &src[8], i_src, &dst[8], i_dst, i_height );
 324 }
 325
 326 /* mc I+H */
 327 static void mc_xy10_w4( uint8_t *src, int i_src,
 328                         uint8_t *dst, int i_dst, int i_height )
 329 {
 330     uint8_t tmp[16*4];
 331     mc_hh_w4( src, i_src, tmp, 4, i_height );
 332     pixel_avg_w4( dst, i_dst, src, i_src, tmp, 4, i_height );
 333 }
 334 static void mc_xy10_w8( uint8_t *src, int i_src,
 335                         uint8_t *dst, int i_dst, int i_height )
 336 {
 337     uint8_t tmp[16*8];
 338     mc_hh_w8( src, i_src, tmp, 8, i_height );
 339     pixel_avg_w8( dst, i_dst, src, i_src, tmp, 8, i_height );
 340 }
 341 static void mc_xy10_w16( uint8_t *src, int i_src,
 342                          uint8_t *dst, int i_dst, int i_height )
 343 {
 344     uint8_t tmp[16*16];
 345     mc_hh_w16( src, i_src, tmp, 16, i_height );
 346     pixel_avg_w16( dst, i_dst, src, i_src, tmp, 16, i_height );
 347 }
 348
 349 static void mc_xy30_w4( uint8_t *src, int i_src,
 350                         uint8_t *dst, int i_dst, int i_height )
 351 {
 352     uint8_t tmp[16*4];
 353     mc_hh_w4( src, i_src, tmp, 4, i_height );
 354     pixel_avg_w4( dst, i_dst, src + 1, i_src, tmp, 4, i_height );
 355 }
 356 static void mc_xy30_w8( uint8_t *src, int i_src,
 357                         uint8_t *dst, int i_dst, int i_height )
 358 {
 359     uint8_t tmp[16*8];
 360     mc_hh_w8( src, i_src, tmp, 8, i_height );
 361     pixel_avg_w8( dst, i_dst, src + 1, i_src, tmp, 8, i_height );
 362 }
 363 static void mc_xy30_w16( uint8_t *src, int i_src,
 364                          uint8_t *dst, int i_dst, int i_height )
 365 {
 366     uint8_t tmp[16*16];
 367     mc_hh_w16( src, i_src, tmp, 16, i_height );
 368     pixel_avg_w16( dst, i_dst, src + 1, i_src, tmp, 16, i_height );
 369 }
 370
 371 /* mc I+V */
 372 static void mc_xy01_w4( uint8_t *src, int i_src,
 373                         uint8_t *dst, int i_dst, int i_height )
 374 {
 375     uint8_t tmp[16*4];
 376     mc_hv_w4( src, i_src, tmp, 4, i_height );
 377     pixel_avg_w4( dst, i_dst, src, i_src, tmp, 4, i_height );
 378 }
 379 static void mc_xy01_w8( uint8_t *src, int i_src,
 380                         uint8_t *dst, int i_dst, int i_height )
 381 {
 382     uint8_t tmp[16*8];
 383     mc_hv_w8( src, i_src, tmp, 8, i_height );
 384     pixel_avg_w8( dst, i_dst, src, i_src, tmp, 8, i_height );
 385 }
 386 static void mc_xy01_w16( uint8_t *src, int i_src,
 387                          uint8_t *dst, int i_dst, int i_height )
 388 {
 389     uint8_t tmp[16*16];
 390     mc_hv_w16( src, i_src, tmp, 16, i_height );
 391     pixel_avg_w16( dst, i_dst, src, i_src, tmp, 16, i_height );
 392 }
 393
 394 static void mc_xy03_w4( uint8_t *src, int i_src,
 395                         uint8_t *dst, int i_dst, int i_height )
 396 {
 397     uint8_t tmp[16*4];
 398     mc_hv_w4( src, i_src, tmp, 4, i_height );
 399     pixel_avg_w4( dst, i_dst, src + i_src, i_src, tmp, 4, i_height );
 400 }
 401 static void mc_xy03_w8( uint8_t *src, int i_src,
 402                         uint8_t *dst, int i_dst, int i_height )
 403 {
 404     uint8_t tmp[16*8];
 405     mc_hv_w8( src, i_src, tmp, 8, i_height );
 406     pixel_avg_w8( dst, i_dst, src + i_src, i_src, tmp, 8, i_height );
 407 }
 408 static void mc_xy03_w16( uint8_t *src, int i_src,
 409                          uint8_t *dst, int i_dst, int i_height )
 410 {
 411     uint8_t tmp[16*16];
 412     mc_hv_w16( src, i_src, tmp, 16, i_height );
 413     pixel_avg_w16( dst, i_dst, src + i_src, i_src, tmp, 16, i_height );
 414 }
 415
 416 /* H+V */
 417 static void mc_xy11_w4( uint8_t *src, int i_src,
 418                         uint8_t *dst, int i_dst, int i_height )
 419 {
 420     uint8_t tmp1[16*4];
 421     uint8_t tmp2[16*4];
 422     mc_hv_w4( src, i_src, tmp1, 4, i_height );
 423     mc_hh_w4( src, i_src, tmp2, 4, i_height );
 424     pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
 425 }
 426 static void mc_xy11_w8( uint8_t *src, int i_src,
 427                         uint8_t *dst, int i_dst, int i_height )
 428 {
 429     uint8_t tmp1[16*8];
 430     uint8_t tmp2[16*8];
 431     mc_hv_w8( src, i_src, tmp1, 8, i_height );
 432     mc_hh_w8( src, i_src, tmp2, 8, i_height );
 433     pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
 434 }
 435 static void mc_xy11_w16( uint8_t *src, int i_src,
 436                          uint8_t *dst, int i_dst, int i_height )
 437 {
 438     uint8_t tmp1[16*16];
 439     uint8_t tmp2[16*16];
 440     mc_hv_w16( src, i_src, tmp1, 16, i_height );
 441     mc_hh_w16( src, i_src, tmp2, 16, i_height );
 442     pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
 443 }
 444
 445 static void mc_xy31_w4( uint8_t *src, int i_src,
 446                         uint8_t *dst, int i_dst, int i_height )
 447 {
 448     uint8_t tmp1[16*4];
 449     uint8_t tmp2[16*4];
 450     mc_hv_w4( src+1, i_src, tmp1, 4, i_height );
 451     mc_hh_w4( src,   i_src, tmp2, 4, i_height );
 452     pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
 453 }
 454 static void mc_xy31_w8( uint8_t *src, int i_src,
 455                         uint8_t *dst, int i_dst, int i_height )
 456 {
 457     uint8_t tmp1[16*8];
 458     uint8_t tmp2[16*8];
 459     mc_hv_w8( src+1, i_src, tmp1, 8, i_height );
 460     mc_hh_w8( src,   i_src, tmp2, 8, i_height );
 461     pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
 462 }
 463 static void mc_xy31_w16( uint8_t *src, int i_src,
 464                          uint8_t *dst, int i_dst, int i_height )
 465 {
 466     uint8_t tmp1[16*16];
 467     uint8_t tmp2[16*16];
 468     mc_hv_w16( src+1, i_src, tmp1, 16, i_height );
 469     mc_hh_w16( src,   i_src, tmp2, 16, i_height );
 470     pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
 471 }
 472
 473 static void mc_xy13_w4( uint8_t *src, int i_src,
 474                         uint8_t *dst, int i_dst, int i_height )
 475 {
 476     uint8_t tmp1[16*4];
 477     uint8_t tmp2[16*4];
 478     mc_hv_w4( src,       i_src, tmp1, 4, i_height );
 479     mc_hh_w4( src+i_src, i_src, tmp2, 4, i_height );
 480     pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
 481 }
 482 static void mc_xy13_w8( uint8_t *src, int i_src,
 483                         uint8_t *dst, int i_dst, int i_height )
 484 {
 485     uint8_t tmp1[16*8];
 486     uint8_t tmp2[16*8];
 487     mc_hv_w8( src,       i_src, tmp1, 8, i_height );
 488     mc_hh_w8( src+i_src, i_src, tmp2, 8, i_height );
 489     pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
 490 }
 491 static void mc_xy13_w16( uint8_t *src, int i_src,
 492                          uint8_t *dst, int i_dst, int i_height )
 493 {
 494     uint8_t tmp1[16*16];
 495     uint8_t tmp2[16*16];
 496     mc_hv_w16( src,       i_src, tmp1, 16, i_height );
 497     mc_hh_w16( src+i_src, i_src, tmp2, 16, i_height );
 498     pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
 499 }
 500
 501 static void mc_xy33_w4( uint8_t *src, int i_src,
 502                         uint8_t *dst, int i_dst, int i_height )
 503 {
 504     uint8_t tmp1[16*4];
 505     uint8_t tmp2[16*4];
 506     mc_hv_w4( src+1,     i_src, tmp1, 4, i_height );
 507     mc_hh_w4( src+i_src, i_src, tmp2, 4, i_height );
 508     pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
 509 }
 510 static void mc_xy33_w8( uint8_t *src, int i_src,
 511                         uint8_t *dst, int i_dst, int i_height )
 512 {
 513     uint8_t tmp1[16*8];
 514     uint8_t tmp2[16*8];
 515     mc_hv_w8( src+1,     i_src, tmp1, 8, i_height );
 516     mc_hh_w8( src+i_src, i_src, tmp2, 8, i_height );
 517     pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
 518 }
 519 static void mc_xy33_w16( uint8_t *src, int i_src,
 520                          uint8_t *dst, int i_dst, int i_height )
 521 {
 522     uint8_t tmp1[16*16];
 523     uint8_t tmp2[16*16];
 524     mc_hv_w16( src+1,     i_src, tmp1, 16, i_height );
 525     mc_hh_w16( src+i_src, i_src, tmp2, 16, i_height );
 526     pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
 527 }
 528
 529 static void mc_xy21_w4( uint8_t *src, int i_src,
 530                         uint8_t *dst, int i_dst, int i_height )
 531 {
 532     uint8_t tmp1[16*4];
 533     uint8_t tmp2[16*4];
 534     mc_hc_w4( src, i_src, tmp1, 4, i_height );
 535     mc_hh_w4( src, i_src, tmp2, 4, i_height );
 536     pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
 537 }
 538 static void mc_xy21_w8( uint8_t *src, int i_src,
 539                         uint8_t *dst, int i_dst, int i_height )
 540 {
 541     uint8_t tmp1[16*8];
 542     uint8_t tmp2[16*8];
 543     mc_hc_w8( src, i_src, tmp1, 8, i_height );
 544     mc_hh_w8( src, i_src, tmp2, 8, i_height );
 545     pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
 546 }
 547 static void mc_xy21_w16( uint8_t *src, int i_src,
 548                          uint8_t *dst, int i_dst, int i_height )
 549 {
 550     uint8_t tmp1[16*16];
 551     uint8_t tmp2[16*16];
 552     mc_hc_w16( src, i_src, tmp1, 16, i_height );
 553     mc_hh_w16( src, i_src, tmp2, 16, i_height );
 554     pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
 555 }
 556
 557 static void mc_xy12_w4( uint8_t *src, int i_src,
 558                         uint8_t *dst, int i_dst, int i_height )
 559 {
 560     uint8_t tmp1[16*4];
 561     uint8_t tmp2[16*4];
 562     mc_hc_w4( src, i_src, tmp1, 4, i_height );
 563     mc_hv_w4( src, i_src, tmp2, 4, i_height );
 564     pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
 565 }
 566 static void mc_xy12_w8( uint8_t *src, int i_src,
 567                         uint8_t *dst, int i_dst, int i_height )
 568 {
 569     uint8_t tmp1[16*8];
 570     uint8_t tmp2[16*8];
 571     mc_hc_w8( src, i_src, tmp1, 8, i_height );
 572     mc_hv_w8( src, i_src, tmp2, 8, i_height );
 573     pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
 574 }
 575 static void mc_xy12_w16( uint8_t *src, int i_src,
 576                          uint8_t *dst, int i_dst, int i_height )
 577 {
 578     uint8_t tmp1[16*16];
 579     uint8_t tmp2[16*16];
 580     mc_hc_w16( src, i_src, tmp1, 16, i_height );
 581     mc_hv_w16( src, i_src, tmp2, 16, i_height );
 582     pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
 583 }
 584
 585 static void mc_xy32_w4( uint8_t *src, int i_src,
 586                         uint8_t *dst, int i_dst, int i_height )
 587 {
 588     uint8_t tmp1[16*4];
 589     uint8_t tmp2[16*4];
 590     mc_hc_w4( src,   i_src, tmp1, 4, i_height );
 591     mc_hv_w4( src+1, i_src, tmp2, 4, i_height );
 592     pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
 593 }
 594 static void mc_xy32_w8( uint8_t *src, int i_src,
 595                         uint8_t *dst, int i_dst, int i_height )
 596 {
 597     uint8_t tmp1[16*8];
 598     uint8_t tmp2[16*8];
 599     mc_hc_w8( src,   i_src, tmp1, 8, i_height );
 600     mc_hv_w8( src+1, i_src, tmp2, 8, i_height );
 601     pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
 602 }
 603 static void mc_xy32_w16( uint8_t *src, int i_src,
 604                          uint8_t *dst, int i_dst, int i_height )
 605 {
 606     uint8_t tmp1[16*16];
 607     uint8_t tmp2[16*16];
 608     mc_hc_w16( src,   i_src, tmp1, 16, i_height );
 609     mc_hv_w16( src+1, i_src, tmp2, 16, i_height );
 610     pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
 611 }
 612
 613 static void mc_xy23_w4( uint8_t *src, int i_src,
 614                         uint8_t *dst, int i_dst, int i_height )
 615 {
 616     uint8_t tmp1[16*4];
 617     uint8_t tmp2[16*4];
 618     mc_hc_w4( src,       i_src, tmp1, 4, i_height );
 619     mc_hh_w4( src+i_src, i_src, tmp2, 4, i_height );
 620     pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
 621 }
 622 static void mc_xy23_w8( uint8_t *src, int i_src,
 623                         uint8_t *dst, int i_dst, int i_height )
 624 {
 625     uint8_t tmp1[16*8];
 626     uint8_t tmp2[16*8];
 627     mc_hc_w8( src,       i_src, tmp1, 8, i_height );
 628     mc_hh_w8( src+i_src, i_src, tmp2, 8, i_height );
 629     pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
 630 }
 631 static void mc_xy23_w16( uint8_t *src, int i_src,
 632                          uint8_t *dst, int i_dst, int i_height )
 633 {
 634     uint8_t tmp1[16*16];
 635     uint8_t tmp2[16*16];
 636     mc_hc_w16( src,       i_src, tmp1, 16, i_height );
 637     mc_hh_w16( src+i_src, i_src, tmp2, 16, i_height );
 638     pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
 639 }
 640
 641 static void motion_compensation_luma( uint8_t *src, int i_src,
 642                                       uint8_t *dst, int i_dst,
 643                                       int mvx,int mvy,
 644                                       int i_width, int i_height )
 645 {
 646     static const pf_mc_t pf_mc[3][4][4] =    /*XXX [dqy][dqx] */
 647     {
 648         {
 649             { mc_copy_w4,  mc_xy10_w4,    mc_hh_w4,      mc_xy30_w4 },
 650             { mc_xy01_w4,  mc_xy11_w4,    mc_xy21_w4,    mc_xy31_w4 },
 651             { mc_hv_w4,    mc_xy12_w4,    mc_hc_w4,      mc_xy32_w4 },
 652             { mc_xy03_w4,  mc_xy13_w4,    mc_xy23_w4,    mc_xy33_w4 },
 653         },
 654         {
 655             { mc_copy_w8,  mc_xy10_w8,    mc_hh_w8,      mc_xy30_w8 },
 656             { mc_xy01_w8,  mc_xy11_w8,    mc_xy21_w8,    mc_xy31_w8 },
 657             { mc_hv_w8,    mc_xy12_w8,    mc_hc_w8,      mc_xy32_w8 },
 658             { mc_xy03_w8,  mc_xy13_w8,    mc_xy23_w8,    mc_xy33_w8 },
 659         },
 660         {
 661             { mc_copy_w16,  mc_xy10_w16,    mc_hh_w16,      mc_xy30_w16 },
 662             { mc_xy01_w16,  mc_xy11_w16,    mc_xy21_w16,    mc_xy31_w16 },
 663             { mc_hv_w16,    mc_xy12_w16,    mc_hc_w16,      mc_xy32_w16 },
 664             { mc_xy03_w16,  mc_xy13_w16,    mc_xy23_w16,    mc_xy33_w16 },
 665         }
 666     };
 667
 668     src += (mvy >> 2) * i_src + (mvx >> 2);
 669     if( i_width == 4 )
 670     {
 671         pf_mc[0][mvy&0x03][mvx&0x03]( src, i_src, dst, i_dst, i_height );
 672     }
 673     else if( i_width == 8 )
 674     {
 675         pf_mc[1][mvy&0x03][mvx&0x03]( src, i_src, dst, i_dst, i_height );
 676     }
 677     else if( i_width == 16 )
 678     {
 679         pf_mc[2][mvy&0x03][mvx&0x03]( src, i_src, dst, i_dst, i_height );
 680     }
 681 }
 682
 683 void mc_luma_altivec( uint8_t *src[4], int i_src_stride,
 684                       uint8_t *dst,    int i_dst_stride,
 685                       int mvx, int mvy,
 686                       int i_width, int i_height )
 687 {
 688     uint8_t *src1, *src2;
 689
 690     /* todo : fixme... */
 691     int correction = ((mvx&3) == 3 && (mvy&3) == 1 || (mvx&3) == 1 && (mvy&3) == 3) ? 1:0;
 692
 693     int hpel1x = mvx>>1;
 694     int hpel1y = (mvy+1-correction)>>1;
 695     int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
 696
 697
 698     src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
 699
 700     if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
 701     {
 702         int hpel2x = (mvx+1)>>1;
 703         int hpel2y = (mvy+correction)>>1;
 704         int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
 705
 706         src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
 707
 708         switch(i_width) {
 709         case 4:
 710             pixel_avg_w4( dst, i_dst_stride, src1, i_src_stride,
 711                           src2, i_src_stride, i_height );
 712             break;
 713         case 8:
 714             pixel_avg_w8( dst, i_dst_stride, src1, i_src_stride,
 715                           src2, i_src_stride, i_height );
 716             break;
 717         case 16:
 718         default:
 719             pixel_avg_w16( dst, i_dst_stride, src1, i_src_stride,
 720                            src2, i_src_stride, i_height );
 721         }
 722
 723     }
 724     else
 725     {
 726         switch(i_width) {
 727         case 4:
 728             mc_copy_w4( src1, i_src_stride, dst, i_dst_stride, i_height );
 729             break;
 730         case 8:
 731             mc_copy_w8( src1, i_src_stride, dst, i_dst_stride, i_height );
 732             break;
 733         case 16:
 734             mc_copy_w16( src1, i_src_stride, dst, i_dst_stride, i_height );
 735             break;
 736         }
 737
 738     }
 739 }
 740
 741 uint8_t *get_ref_altivec( uint8_t *src[4], int i_src_stride,
 742                           uint8_t *dst,    int * i_dst_stride,
 743                           int mvx, int mvy,
 744                           int i_width, int i_height )
 745 {
 746     uint8_t *src1, *src2;
 747
 748     /* todo : fixme... */
 749     int correction = ((mvx&3) == 3 && (mvy&3) == 1 || (mvx&3) == 1 && (mvy&3) == 3) ? 1:0;
 750
 751     int hpel1x = mvx>>1;
 752     int hpel1y = (mvy+1-correction)>>1;
 753     int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
 754
 755
 756     src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
 757
 758     if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
 759     {
 760         int hpel2x = (mvx+1)>>1;
 761         int hpel2y = (mvy+correction)>>1;
 762         int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
 763
 764         src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
 765
 766         switch(i_width) {
 767         case 4:
 768             pixel_avg_w4( dst, *i_dst_stride, src1, i_src_stride,
 769                           src2, i_src_stride, i_height );
 770             break;
 771         case 8:
 772             pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
 773                           src2, i_src_stride, i_height );
 774             break;
 775         case 16:
 776         default:
 777             pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
 778                           src2, i_src_stride, i_height );
 779         }
 780         return dst;
 781
 782     }
 783     else
 784     {
 785         *i_dst_stride = i_src_stride;
 786         return src1;
 787     }
 788 }
 789
 790 static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
 791                                uint8_t *dst, int i_dst_stride,
 792                                int mvx, int mvy,
 793                                int i_width, int i_height )
 794 {
 795     uint8_t *srcp;
 796     int x, y;
 797     int d8x = mvx & 0x07;
 798     int d8y = mvy & 0x07;
 799
 800     DECLARE_ALIGNED( uint16_t, coeff[4], 16 );
 801     coeff[0] = (8-d8x)*(8-d8y);
 802     coeff[1] = d8x    *(8-d8y);
 803     coeff[2] = (8-d8x)*d8y;
 804     coeff[3] = d8x    *d8y;
 805
 806     src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
 807     srcp  = &src[i_src_stride];
 808
 809     if( i_width < 8 )
 810     {
 811         /* TODO: optimize */
 812         for( y = 0; y < i_height; y++ )
 813         {
 814             for( x = 0; x < i_width; x++ )
 815             {
 816                 dst[x] = ( coeff[0]*src[x]  + coeff[1]*src[x+1] +
 817                            coeff[2]*srcp[x] + coeff[3]*srcp[x+1] + 32 ) >> 6;
 818             }
 819             dst  += i_dst_stride;
 820
 821             src   = srcp;
 822             srcp += i_src_stride;
 823         }
 824         return;
 825     }
 826
 827     /* We now assume that i_width == 8 */
 828     LOAD_ZERO;
 829     vec_u16_t   coeffv[4];
 830     vec_u16_t   k32v;
 831     vec_u8_t    srcv_8[4];
 832     vec_u16_t   srcv_16[4];
 833     vec_u8_t    dstv_8;
 834     vec_u16_t   dstv_16;
 835     vec_u8_t    permv;
 836     vec_u16_t   shiftv;
 837
 838     coeffv[0] = vec_ld( 0, coeff );
 839     coeffv[3] = vec_splat( coeffv[0], 3 );
 840     coeffv[2] = vec_splat( coeffv[0], 2 );
 841     coeffv[1] = vec_splat( coeffv[0], 1 );
 842     coeffv[0] = vec_splat( coeffv[0], 0 );
 843     k32v      = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
 844     permv     = vec_lvsl( 0, (uint8_t *) 1 );
 845     shiftv    = vec_splat_u16( 6 );
 846
 847     LOAD_16( src, srcv_8[2] );
 848     srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
 849
 850     for( y = 0; y < i_height; y++ )
 851     {
 852         int i;
 853
 854         srcv_8[0] = srcv_8[2];
 855         srcv_8[1] = srcv_8[3];
 856         LOAD_16( srcp, srcv_8[2] );
 857         srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
 858
 859         dstv_16 = k32v;
 860         for( i = 0; i < 4; i++ )
 861         {
 862             CONVERT_U8_TO_U16( srcv_8[i], srcv_16[i] );
 863             srcv_16[i] = vec_mladd( coeffv[i], srcv_16[i], zero_u16v );
 864             dstv_16 = vec_add( dstv_16, srcv_16[i] );
 865         }
 866         dstv_16 = vec_sr( dstv_16, shiftv );
 867         CONVERT_U16_TO_U8( dstv_16, dstv_8 );
 868         STORE_8( dstv_8, dst );
 869
 870         dst  += i_dst_stride;
 871         srcp += i_src_stride;
 872     }
 873 }
 874
 875 void x264_mc_altivec_init( x264_mc_functions_t *pf )
 876 {
 877     pf->mc_luma   = mc_luma_altivec;
 878     pf->get_ref   = get_ref_altivec;
 879     pf->mc_chroma = mc_chroma_altivec;
 880 }