1 /*****************************************************************************
2 * mc.c: h264 encoder library (Motion Compensation)
3 *****************************************************************************
4 * Copyright (C) 2003 Laurent Aimar
5 * $Id: mc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
7 * Authors: Eric Petit <titer@m0k.org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 *****************************************************************************/
35 #include "common/mc.h"
36 #include "common/clip1.h"
38 #include "ppccommon.h"
/* pf_mc_t: pointer to a motion-compensation routine. It receives a source
 * pointer with its row stride (src, i_src), a destination pointer with its
 * row stride (dst, i_dst) and a row count (i_height). */
40 typedef void (*pf_mc_t)( uint8_t *src, int i_src,
41 uint8_t *dst, int i_dst, int i_height );
/* x264_tapfilter: weighted sum of samples spaced i_pix_next bytes apart
 * around pix. The visible terms use weights 1, -5, 20, 20, -5 for offsets
 * -2 .. +2. The expression continues on a line that is not visible in this
 * listing, so its last term is not shown. No rounding or shift is applied
 * in the visible part. */
43 static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
45 return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
46 pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
/* x264_tapfilter1: same weighted sum as x264_tapfilter but over adjacent
 * bytes (pix[-2] .. pix[2] in the visible part). The expression continues
 * on a line that is not visible in this listing. */
49 static inline int x264_tapfilter1( uint8_t *pix )
51 return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] +
/* pixel_avg_w4: for each of i_height rows, writes the rounded average
 * ( src1[x] + src2[x] + 1 ) >> 1 of two 4-pixel-wide inputs to dst.
 * NOTE(review): the per-row pointer updates are not visible in this
 * listing; presumably dst, src1 and src2 each advance by their own stride
 * (i_dst, i_src1, i_src2) - confirm. */
56 static inline void pixel_avg_w4( uint8_t *dst, int i_dst,
57 uint8_t *src1, int i_src1,
58 uint8_t *src2, int i_src2,
62 for( y = 0; y < i_height; y++ )
64 for( x = 0; x < 4; x++ )
66 dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
/* pixel_avg_w8: 8-pixel-wide average built from two pixel_avg_w4 calls,
 * one on columns 0..3 and one on columns 4..7, with unchanged strides. */
73 static inline void pixel_avg_w8( uint8_t *dst, int i_dst,
74 uint8_t *src1, int i_src1,
75 uint8_t *src2, int i_src2,
79 pixel_avg_w4( &dst[0], i_dst, &src1[0], i_src1, &src2[0], i_src2,
81 pixel_avg_w4( &dst[4], i_dst, &src1[4], i_src1, &src2[4], i_src2,
/* pixel_avg_w16: vector version of the average for 16-pixel-wide rows.
 * Per row it loads 16 bytes from src1 and src2 (LOAD_16), combines them
 * with vec_avg and stores the result to dst (STORE_16).
 * NOTE(review): LOAD_16 / STORE_16 are defined outside this file
 * (presumably ppccommon.h); their alignment handling and the per-row
 * pointer updates are not visible here - confirm. */
84 static inline void pixel_avg_w16( uint8_t *dst, int i_dst,
85 uint8_t *src1, int i_src1,
86 uint8_t *src2, int i_src2,
90 vec_u8_t src1v, src2v;
91 for( y = 0; y < i_height; y++ )
93 LOAD_16( src1, src1v );
94 LOAD_16( src2, src2v );
95 src1v = vec_avg( src1v, src2v );
96 STORE_16( src1v, dst );
104 /* mc_copy: plain c */
/* MC_COPY( name, a ): defines a static function `name` with the pf_mc_t
 * parameter list that memcpy()s `a` bytes from src to dst once per row,
 * for i_height rows.
 * NOTE(review): the lines that advance src and dst between rows are not
 * visible in this listing; presumably by i_src and i_dst - confirm. */
105 #define MC_COPY( name, a ) \
106 static void name( uint8_t *src, int i_src, \
107 uint8_t *dst, int i_dst, int i_height ) \
110 for( y = 0; y < i_height; y++ ) \
112 memcpy( dst, src, a ); \
/* Instantiate the plain-copy routines for row widths of 4, 8 and 16 bytes. */
117 MC_COPY( mc_copy_w4, 4 )
118 MC_COPY( mc_copy_w8, 8 )
119 MC_COPY( mc_copy_w16, 16 )
122 a is source (vec_s16_t [6])
123 b is a temporary vec_s16_t
126 c = a[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
/* TAP_FILTER( a, b, c ): a is an array of six vectors, b is a scratch
 * vector, c receives the result. Step by step:
 *   c   = a[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] )
 *         (5*v is built as v + (v << 2), 20*v as (v << 2) + (v << 4))
 *   c  += 16      (added as 8 + 8)
 *   c >>= 5       (vec_sr)
 *   c  += 80      (built as 5 << 4)
 * The macro expands to several statements and names its arguments many
 * times, so pass plain variables and use it as a statement on its own.
 * NOTE(review): the 8 + 8 and 5 << 4 forms are presumably there because
 * the splat-immediate intrinsics only take small constants - confirm.
 * NOTE(review): the final +80 looks like an index bias for the
 * x264_mc_clip1_table lookups done by the callers - confirm against
 * common/clip1.h.
 * NOTE(review): vec_sr is documented as a logical (zero-fill) shift. If c
 * is a signed vector and the sum can be negative, this differs from an
 * arithmetic shift (vec_sra) - verify which one is intended. */
130 #define TAP_FILTER( a, b, c ) \
131 c = vec_add( a[0], a[5] ); \
132 b = vec_add( a[1], a[4] ); \
133 c = vec_sub( c, b ); \
134 b = vec_sl( b, vec_splat_u16( 2 ) ); \
135 c = vec_sub( c, b ); \
136 b = vec_add( a[2], a[3] ); \
137 b = vec_sl( b, vec_splat_u16( 2 ) ); \
138 c = vec_add( c, b ); \
139 b = vec_sl( b, vec_splat_u16( 2 ) ); \
140 c = vec_add( c, b ); \
141 c = vec_add( c, vec_splat_s16( 8 ) ); \
142 c = vec_add( c, vec_splat_s16( 8 ) ); \
143 c = vec_sr( c, vec_splat_u16( 5 ) ); \
144 c = vec_add( c, vec_sl( vec_splat_s16( 5 ), \
145 vec_splat_u16( 4 ) ) );
/* mc_hh_w4: scalar filter along each row for a 4-pixel-wide block. Every
 * output pixel is x264_mc_clip1() of an expression based on
 * x264_tapfilter1( &src[x] ). The rest of that expression and the per-row
 * pointer updates are on lines not visible in this listing. */
148 static inline void mc_hh_w4( uint8_t *src, int i_src,
149 uint8_t *dst, int i_dst, int i_height )
152 for( y = 0; y < i_height; y++ )
154 for( x = 0; x < 4; x++ )
156 dst[x] = x264_mc_clip1( ( x264_tapfilter1( &src[x] ) +
/* mc_hh_w8: vector filter along each row for an 8-pixel-wide block.
 * Per row:
 *   1. load 16 bytes starting two pixels left of src (&src[-2]);
 *   2. build six byte-shifted copies of that load (shift = 0..5) and widen
 *      each to 16-bit lanes, giving the six TAP_FILTER inputs;
 *   3. run TAP_FILTER and store the eight results to the aligned tmp;
 *   4. map each result through x264_mc_clip1_table into dst.
 * The declarations of loadv, srcv, tmpv, dstv and the per-row pointer
 * updates are on lines not visible in this listing. */
163 static inline void mc_hh_w8( uint8_t *src, int i_src,
164 uint8_t *dst, int i_dst, int i_height )
167 DECLARE_ALIGNED( int16_t, tmp[8], 16 );
/* _srcv views the srcv array as byte vectors so the permuted bytes can be
 * written in place before conversion. */
172 vec_u8_t * _srcv = (vec_u8_t*) srcv;
176 for( y = 0; y < i_height; y++ )
178 LOAD_16( &src[-2], loadv );
180 for( x = 0; x < 6; x++ )
/* The integer x is cast to a pointer only so that vec_lvsl can derive a
 * shift-left-by-x permute mask from it; the intent is that nothing is
 * dereferenced (NOTE(review): confirm against the vec_lvsl definition). */
182 _srcv[x] = vec_perm( loadv, zero_u8v,
183 vec_lvsl( 0, (int*) x ) );
184 CONVERT_U8_TO_S16( srcv[x], srcv[x] );
187 TAP_FILTER( srcv, tmpv, dstv );
188 vec_st( dstv, 0, tmp );
190 for( x = 0; x < 8; x++ )
192 dst[x] = x264_mc_clip1_table[tmp[x]];
/* mc_hh_w16: 16-pixel-wide variant built from two mc_hh_w8 calls, one on
 * columns 0..7 and one on columns 8..15. */
199 static inline void mc_hh_w16( uint8_t *src, int i_src,
200 uint8_t *dst, int i_dst, int i_height )
202 mc_hh_w8( &src[0], i_src, &dst[0], i_dst, i_height );
203 mc_hh_w8( &src[8], i_src, &dst[8], i_dst, i_height );
/* mc_hv_w4: scalar filter down each column for a 4-pixel-wide block.
 * Every output pixel is x264_mc_clip1() of an expression based on
 * x264_tapfilter( &src[x], i_src ), i.e. samples taken i_src bytes apart.
 * The rest of that expression and the per-row pointer updates are on lines
 * not visible in this listing. */
207 static inline void mc_hv_w4( uint8_t *src, int i_src,
208 uint8_t *dst, int i_dst, int i_height )
211 for( y = 0; y < i_height; y++ )
213 for( x = 0; x < 4; x++ )
215 dst[x] = x264_mc_clip1( ( x264_tapfilter( &src[x], i_src ) +
/* mc_hv_w8: vector filter down each column for an 8-pixel-wide block.
 * The six TAP_FILTER inputs are rows of 8 pixels widened to 16-bit lanes.
 * Two ways of filling them are visible:
 *   - a loop over the first five slots followed by a single load of the
 *     row at 3*i_src into slot 5;
 *   - a loop that loads all six rows, (x-2)*i_src for x = 0..5.
 * NOTE(review): the condition choosing between the two paths and the body
 * of the five-slot loop are not visible in this listing. Presumably the
 * first path slides the window down by one row on rows after the first
 * and the second path primes it - confirm.
 * After filtering, the eight results go through x264_mc_clip1_table into
 * dst, as in mc_hh_w8. */
222 static inline void mc_hv_w8( uint8_t *src, int i_src,
223 uint8_t *dst, int i_dst, int i_height )
226 DECLARE_ALIGNED( int16_t, tmp[8], 16 );
230 vec_u8_t * _srcv = (vec_u8_t*) srcv;
234 for( y = 0; y < i_height; y++ )
238 for( x = 0; x < 5; x++ )
242 LOAD_8( &src[3*i_src], _srcv[5] );
243 CONVERT_U8_TO_S16( srcv[5], srcv[5] );
247 for( x = 0; x < 6; x++ )
249 LOAD_8( &src[(x-2)*i_src], _srcv[x] );
250 CONVERT_U8_TO_S16( srcv[x], srcv[x] );
254 TAP_FILTER( srcv, tmpv, dstv );
255 vec_st( dstv, 0, tmp );
257 for( x = 0; x < 8; x++ )
259 dst[x] = x264_mc_clip1_table[tmp[x]];
/* mc_hv_w16: 16-pixel-wide variant built from two mc_hv_w8 calls, one on
 * columns 0..7 and one on columns 8..15. */
265 static inline void mc_hv_w16( uint8_t *src, int i_src,
266 uint8_t *dst, int i_dst, int i_height )
268 mc_hv_w8( &src[0], i_src, &dst[0], i_dst, i_height );
269 mc_hv_w8( &src[8], i_src, &dst[8], i_dst, i_height );
/* mc_hc_w4: scalar two-pass filter for a 4-pixel-wide block, processed
 * one column at a time. For each column, tap[0..4] are primed with
 * x264_tapfilter1() of the rows at -2 .. +2. Then for every output row
 * tap[5] is computed from the row at +3 and the six taps are combined with
 * weights 1, -5, 20, 20, -5, 1 and passed through x264_mc_clip1().
 * NOTE(review): the declarations of tap, pix and out, the rounding term of
 * the final expression and the code that shifts the tap window between
 * rows are on lines not visible in this listing. */
273 static inline void mc_hc_w4( uint8_t *src, int i_src,
274 uint8_t *dst, int i_dst, int i_height )
280 for( x = 0; x < 4; x++ )
287 tap[0] = x264_tapfilter1( &pix[-2*i_src] );
288 tap[1] = x264_tapfilter1( &pix[-1*i_src] );
289 tap[2] = x264_tapfilter1( &pix[ 0*i_src] );
290 tap[3] = x264_tapfilter1( &pix[ 1*i_src] );
291 tap[4] = x264_tapfilter1( &pix[ 2*i_src] );
293 for( y = 0; y < i_height; y++ )
295 tap[5] = x264_tapfilter1( &pix[ 3*i_src] );
297 *out = x264_mc_clip1( ( tap[0] - 5*tap[1] + 20 * tap[2] +
298 20 * tap[3] -5*tap[4] + tap[5] +
/* mc_hc_w8: 8-pixel-wide variant built from two mc_hc_w4 calls, one on
 * columns 0..3 and one on columns 4..7. */
312 static inline void mc_hc_w8( uint8_t *src, int i_src,
313 uint8_t *dst, int i_dst, int i_height )
316 mc_hc_w4( &src[0], i_src, &dst[0], i_dst, i_height );
317 mc_hc_w4( &src[4], i_src, &dst[4], i_dst, i_height );
/* mc_hc_w16: 16-pixel-wide variant built from two mc_hc_w8 calls, one on
 * columns 0..7 and one on columns 8..15. */
319 static inline void mc_hc_w16( uint8_t *src, int i_src,
320 uint8_t *dst, int i_dst, int i_height )
322 mc_hc_w8( &src[0], i_src, &dst[0], i_dst, i_height );
323 mc_hc_w8( &src[8], i_src, &dst[8], i_dst, i_height );
/* mc_xy10_w4: filters src with mc_hh_w4 into tmp (row pitch 4), then
 * writes the pixel_avg_w4 average of src and tmp to dst. The declaration
 * of tmp is on a line not visible in this listing. */
327 static void mc_xy10_w4( uint8_t *src, int i_src,
328 uint8_t *dst, int i_dst, int i_height )
331 mc_hh_w4( src, i_src, tmp, 4, i_height );
332 pixel_avg_w4( dst, i_dst, src, i_src, tmp, 4, i_height );
/* mc_xy10_w8: filters src with mc_hh_w8 into tmp (row pitch 8), then
 * writes the pixel_avg_w8 average of src and tmp to dst. The declaration
 * of tmp is on a line not visible in this listing. */
334 static void mc_xy10_w8( uint8_t *src, int i_src,
335 uint8_t *dst, int i_dst, int i_height )
338 mc_hh_w8( src, i_src, tmp, 8, i_height );
339 pixel_avg_w8( dst, i_dst, src, i_src, tmp, 8, i_height );
/* mc_xy10_w16: filters src with mc_hh_w16 into tmp (row pitch 16), then
 * writes the pixel_avg_w16 average of src and tmp to dst. The declaration
 * of tmp is on a line not visible in this listing. */
341 static void mc_xy10_w16( uint8_t *src, int i_src,
342 uint8_t *dst, int i_dst, int i_height )
345 mc_hh_w16( src, i_src, tmp, 16, i_height );
346 pixel_avg_w16( dst, i_dst, src, i_src, tmp, 16, i_height );
/* mc_xy30_w4: filters src with mc_hh_w4 into tmp (row pitch 4), then
 * averages tmp with the source shifted by one byte (src + 1) into dst.
 * The declaration of tmp is on a line not visible in this listing. */
349 static void mc_xy30_w4( uint8_t *src, int i_src,
350 uint8_t *dst, int i_dst, int i_height )
353 mc_hh_w4( src, i_src, tmp, 4, i_height );
354 pixel_avg_w4( dst, i_dst, src + 1, i_src, tmp, 4, i_height );
/* mc_xy30_w8: filters src with mc_hh_w8 into tmp (row pitch 8), then
 * averages tmp with the source shifted by one byte (src + 1) into dst.
 * The declaration of tmp is on a line not visible in this listing. */
356 static void mc_xy30_w8( uint8_t *src, int i_src,
357 uint8_t *dst, int i_dst, int i_height )
360 mc_hh_w8( src, i_src, tmp, 8, i_height );
361 pixel_avg_w8( dst, i_dst, src + 1, i_src, tmp, 8, i_height );
/* mc_xy30_w16: filters src with mc_hh_w16 into tmp (row pitch 16), then
 * averages tmp with the source shifted by one byte (src + 1) into dst.
 * The declaration of tmp is on a line not visible in this listing. */
363 static void mc_xy30_w16( uint8_t *src, int i_src,
364 uint8_t *dst, int i_dst, int i_height )
367 mc_hh_w16( src, i_src, tmp, 16, i_height );
368 pixel_avg_w16( dst, i_dst, src + 1, i_src, tmp, 16, i_height );
/* mc_xy01_w4: filters src with mc_hv_w4 into tmp (row pitch 4), then
 * writes the pixel_avg_w4 average of src and tmp to dst. The declaration
 * of tmp is on a line not visible in this listing. */
372 static void mc_xy01_w4( uint8_t *src, int i_src,
373 uint8_t *dst, int i_dst, int i_height )
376 mc_hv_w4( src, i_src, tmp, 4, i_height );
377 pixel_avg_w4( dst, i_dst, src, i_src, tmp, 4, i_height );
/* mc_xy01_w8: filters src with mc_hv_w8 into tmp (row pitch 8), then
 * writes the pixel_avg_w8 average of src and tmp to dst. The declaration
 * of tmp is on a line not visible in this listing. */
379 static void mc_xy01_w8( uint8_t *src, int i_src,
380 uint8_t *dst, int i_dst, int i_height )
383 mc_hv_w8( src, i_src, tmp, 8, i_height );
384 pixel_avg_w8( dst, i_dst, src, i_src, tmp, 8, i_height );
/* mc_xy01_w16: filters src with mc_hv_w16 into tmp (row pitch 16), then
 * writes the pixel_avg_w16 average of src and tmp to dst. The declaration
 * of tmp is on a line not visible in this listing. */
386 static void mc_xy01_w16( uint8_t *src, int i_src,
387 uint8_t *dst, int i_dst, int i_height )
390 mc_hv_w16( src, i_src, tmp, 16, i_height );
391 pixel_avg_w16( dst, i_dst, src, i_src, tmp, 16, i_height );
/* mc_xy03_w4: filters src with mc_hv_w4 into tmp (row pitch 4), then
 * averages tmp with the source one row further on (src + i_src) into dst.
 * The declaration of tmp is on a line not visible in this listing. */
394 static void mc_xy03_w4( uint8_t *src, int i_src,
395 uint8_t *dst, int i_dst, int i_height )
398 mc_hv_w4( src, i_src, tmp, 4, i_height );
399 pixel_avg_w4( dst, i_dst, src + i_src, i_src, tmp, 4, i_height );
/* mc_xy03_w8: filters src with mc_hv_w8 into tmp (row pitch 8), then
 * averages tmp with the source one row further on (src + i_src) into dst.
 * The declaration of tmp is on a line not visible in this listing. */
401 static void mc_xy03_w8( uint8_t *src, int i_src,
402 uint8_t *dst, int i_dst, int i_height )
405 mc_hv_w8( src, i_src, tmp, 8, i_height );
406 pixel_avg_w8( dst, i_dst, src + i_src, i_src, tmp, 8, i_height );
/* mc_xy03_w16: filters src with mc_hv_w16 into tmp (row pitch 16), then
 * averages tmp with the source one row further on (src + i_src) into dst.
 * The declaration of tmp is on a line not visible in this listing. */
408 static void mc_xy03_w16( uint8_t *src, int i_src,
409 uint8_t *dst, int i_dst, int i_height )
412 mc_hv_w16( src, i_src, tmp, 16, i_height );
413 pixel_avg_w16( dst, i_dst, src + i_src, i_src, tmp, 16, i_height );
/* mc_xy11_w4: averages mc_hv_w4( src ) and mc_hh_w4( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 4;
 * their declarations are on lines not visible in this listing. */
417 static void mc_xy11_w4( uint8_t *src, int i_src,
418 uint8_t *dst, int i_dst, int i_height )
422 mc_hv_w4( src, i_src, tmp1, 4, i_height );
423 mc_hh_w4( src, i_src, tmp2, 4, i_height );
424 pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
/* mc_xy11_w8: averages mc_hv_w8( src ) and mc_hh_w8( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 8;
 * their declarations are on lines not visible in this listing. */
426 static void mc_xy11_w8( uint8_t *src, int i_src,
427 uint8_t *dst, int i_dst, int i_height )
431 mc_hv_w8( src, i_src, tmp1, 8, i_height );
432 mc_hh_w8( src, i_src, tmp2, 8, i_height );
433 pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
/* mc_xy11_w16: averages mc_hv_w16( src ) and mc_hh_w16( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 16;
 * their declarations are on lines not visible in this listing. */
435 static void mc_xy11_w16( uint8_t *src, int i_src,
436 uint8_t *dst, int i_dst, int i_height )
440 mc_hv_w16( src, i_src, tmp1, 16, i_height );
441 mc_hh_w16( src, i_src, tmp2, 16, i_height );
442 pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
/* mc_xy31_w4: averages mc_hv_w4( src + 1 ) and mc_hh_w4( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 4;
 * their declarations are on lines not visible in this listing. */
445 static void mc_xy31_w4( uint8_t *src, int i_src,
446 uint8_t *dst, int i_dst, int i_height )
450 mc_hv_w4( src+1, i_src, tmp1, 4, i_height );
451 mc_hh_w4( src, i_src, tmp2, 4, i_height );
452 pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
/* mc_xy31_w8: averages mc_hv_w8( src + 1 ) and mc_hh_w8( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 8;
 * their declarations are on lines not visible in this listing. */
454 static void mc_xy31_w8( uint8_t *src, int i_src,
455 uint8_t *dst, int i_dst, int i_height )
459 mc_hv_w8( src+1, i_src, tmp1, 8, i_height );
460 mc_hh_w8( src, i_src, tmp2, 8, i_height );
461 pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
/* mc_xy31_w16: averages mc_hv_w16( src + 1 ) and mc_hh_w16( src ) into
 * dst. Both intermediate results go through tmp1 / tmp2 with row pitch 16;
 * their declarations are on lines not visible in this listing. */
463 static void mc_xy31_w16( uint8_t *src, int i_src,
464 uint8_t *dst, int i_dst, int i_height )
468 mc_hv_w16( src+1, i_src, tmp1, 16, i_height );
469 mc_hh_w16( src, i_src, tmp2, 16, i_height );
470 pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
/* mc_xy13_w4: averages mc_hv_w4( src ) and mc_hh_w4( src + i_src ) into
 * dst. Both intermediate results go through tmp1 / tmp2 with row pitch 4;
 * their declarations are on lines not visible in this listing. */
473 static void mc_xy13_w4( uint8_t *src, int i_src,
474 uint8_t *dst, int i_dst, int i_height )
478 mc_hv_w4( src, i_src, tmp1, 4, i_height );
479 mc_hh_w4( src+i_src, i_src, tmp2, 4, i_height );
480 pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
/* mc_xy13_w8: averages mc_hv_w8( src ) and mc_hh_w8( src + i_src ) into
 * dst. Both intermediate results go through tmp1 / tmp2 with row pitch 8;
 * their declarations are on lines not visible in this listing. */
482 static void mc_xy13_w8( uint8_t *src, int i_src,
483 uint8_t *dst, int i_dst, int i_height )
487 mc_hv_w8( src, i_src, tmp1, 8, i_height );
488 mc_hh_w8( src+i_src, i_src, tmp2, 8, i_height );
489 pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
/* mc_xy13_w16: averages mc_hv_w16( src ) and mc_hh_w16( src + i_src )
 * into dst. Both intermediate results go through tmp1 / tmp2 with row
 * pitch 16; their declarations are on lines not visible in this listing. */
491 static void mc_xy13_w16( uint8_t *src, int i_src,
492 uint8_t *dst, int i_dst, int i_height )
496 mc_hv_w16( src, i_src, tmp1, 16, i_height );
497 mc_hh_w16( src+i_src, i_src, tmp2, 16, i_height );
498 pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
/* mc_xy33_w4: averages mc_hv_w4( src + 1 ) and mc_hh_w4( src + i_src )
 * into dst. Both intermediate results go through tmp1 / tmp2 with row
 * pitch 4; their declarations are on lines not visible in this listing. */
501 static void mc_xy33_w4( uint8_t *src, int i_src,
502 uint8_t *dst, int i_dst, int i_height )
506 mc_hv_w4( src+1, i_src, tmp1, 4, i_height );
507 mc_hh_w4( src+i_src, i_src, tmp2, 4, i_height );
508 pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
/* mc_xy33_w8: averages mc_hv_w8( src + 1 ) and mc_hh_w8( src + i_src )
 * into dst. Both intermediate results go through tmp1 / tmp2 with row
 * pitch 8; their declarations are on lines not visible in this listing. */
510 static void mc_xy33_w8( uint8_t *src, int i_src,
511 uint8_t *dst, int i_dst, int i_height )
515 mc_hv_w8( src+1, i_src, tmp1, 8, i_height );
516 mc_hh_w8( src+i_src, i_src, tmp2, 8, i_height );
517 pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
/* mc_xy33_w16: averages mc_hv_w16( src + 1 ) and mc_hh_w16( src + i_src )
 * into dst. Both intermediate results go through tmp1 / tmp2 with row
 * pitch 16; their declarations are on lines not visible in this listing. */
519 static void mc_xy33_w16( uint8_t *src, int i_src,
520 uint8_t *dst, int i_dst, int i_height )
524 mc_hv_w16( src+1, i_src, tmp1, 16, i_height );
525 mc_hh_w16( src+i_src, i_src, tmp2, 16, i_height );
526 pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
/* mc_xy21_w4: averages mc_hc_w4( src ) and mc_hh_w4( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 4;
 * their declarations are on lines not visible in this listing. */
529 static void mc_xy21_w4( uint8_t *src, int i_src,
530 uint8_t *dst, int i_dst, int i_height )
534 mc_hc_w4( src, i_src, tmp1, 4, i_height );
535 mc_hh_w4( src, i_src, tmp2, 4, i_height );
536 pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
/* mc_xy21_w8: averages mc_hc_w8( src ) and mc_hh_w8( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 8;
 * their declarations are on lines not visible in this listing. */
538 static void mc_xy21_w8( uint8_t *src, int i_src,
539 uint8_t *dst, int i_dst, int i_height )
543 mc_hc_w8( src, i_src, tmp1, 8, i_height );
544 mc_hh_w8( src, i_src, tmp2, 8, i_height );
545 pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
/* mc_xy21_w16: averages mc_hc_w16( src ) and mc_hh_w16( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 16;
 * their declarations are on lines not visible in this listing. */
547 static void mc_xy21_w16( uint8_t *src, int i_src,
548 uint8_t *dst, int i_dst, int i_height )
552 mc_hc_w16( src, i_src, tmp1, 16, i_height );
553 mc_hh_w16( src, i_src, tmp2, 16, i_height );
554 pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
/* mc_xy12_w4: averages mc_hc_w4( src ) and mc_hv_w4( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 4;
 * their declarations are on lines not visible in this listing. */
557 static void mc_xy12_w4( uint8_t *src, int i_src,
558 uint8_t *dst, int i_dst, int i_height )
562 mc_hc_w4( src, i_src, tmp1, 4, i_height );
563 mc_hv_w4( src, i_src, tmp2, 4, i_height );
564 pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
/* mc_xy12_w8: averages mc_hc_w8( src ) and mc_hv_w8( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 8;
 * their declarations are on lines not visible in this listing. */
566 static void mc_xy12_w8( uint8_t *src, int i_src,
567 uint8_t *dst, int i_dst, int i_height )
571 mc_hc_w8( src, i_src, tmp1, 8, i_height );
572 mc_hv_w8( src, i_src, tmp2, 8, i_height );
573 pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
/* mc_xy12_w16: averages mc_hc_w16( src ) and mc_hv_w16( src ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 16;
 * their declarations are on lines not visible in this listing. */
575 static void mc_xy12_w16( uint8_t *src, int i_src,
576 uint8_t *dst, int i_dst, int i_height )
580 mc_hc_w16( src, i_src, tmp1, 16, i_height );
581 mc_hv_w16( src, i_src, tmp2, 16, i_height );
582 pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
/* mc_xy32_w4: averages mc_hc_w4( src ) and mc_hv_w4( src + 1 ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 4;
 * their declarations are on lines not visible in this listing. */
585 static void mc_xy32_w4( uint8_t *src, int i_src,
586 uint8_t *dst, int i_dst, int i_height )
590 mc_hc_w4( src, i_src, tmp1, 4, i_height );
591 mc_hv_w4( src+1, i_src, tmp2, 4, i_height );
592 pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
/* mc_xy32_w8: averages mc_hc_w8( src ) and mc_hv_w8( src + 1 ) into dst.
 * Both intermediate results go through tmp1 / tmp2 with row pitch 8;
 * their declarations are on lines not visible in this listing. */
594 static void mc_xy32_w8( uint8_t *src, int i_src,
595 uint8_t *dst, int i_dst, int i_height )
599 mc_hc_w8( src, i_src, tmp1, 8, i_height );
600 mc_hv_w8( src+1, i_src, tmp2, 8, i_height );
601 pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
/* mc_xy32_w16: averages mc_hc_w16( src ) and mc_hv_w16( src + 1 ) into
 * dst. Both intermediate results go through tmp1 / tmp2 with row pitch 16;
 * their declarations are on lines not visible in this listing. */
603 static void mc_xy32_w16( uint8_t *src, int i_src,
604 uint8_t *dst, int i_dst, int i_height )
608 mc_hc_w16( src, i_src, tmp1, 16, i_height );
609 mc_hv_w16( src+1, i_src, tmp2, 16, i_height );
610 pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
/* mc_xy23_w4: averages mc_hc_w4( src ) and mc_hh_w4( src + i_src ) into
 * dst. Both intermediate results go through tmp1 / tmp2 with row pitch 4;
 * their declarations are on lines not visible in this listing. */
613 static void mc_xy23_w4( uint8_t *src, int i_src,
614 uint8_t *dst, int i_dst, int i_height )
618 mc_hc_w4( src, i_src, tmp1, 4, i_height );
619 mc_hh_w4( src+i_src, i_src, tmp2, 4, i_height );
620 pixel_avg_w4( dst, i_dst, tmp1, 4, tmp2, 4, i_height );
/* mc_xy23_w8: averages mc_hc_w8( src ) and mc_hh_w8( src + i_src ) into
 * dst. Both intermediate results go through tmp1 / tmp2 with row pitch 8;
 * their declarations are on lines not visible in this listing. */
622 static void mc_xy23_w8( uint8_t *src, int i_src,
623 uint8_t *dst, int i_dst, int i_height )
627 mc_hc_w8( src, i_src, tmp1, 8, i_height );
628 mc_hh_w8( src+i_src, i_src, tmp2, 8, i_height );
629 pixel_avg_w8( dst, i_dst, tmp1, 8, tmp2, 8, i_height );
/* mc_xy23_w16: averages mc_hc_w16( src ) and mc_hh_w16( src + i_src )
 * into dst. Both intermediate results go through tmp1 / tmp2 with row
 * pitch 16; their declarations are on lines not visible in this listing. */
631 static void mc_xy23_w16( uint8_t *src, int i_src,
632 uint8_t *dst, int i_dst, int i_height )
636 mc_hc_w16( src, i_src, tmp1, 16, i_height );
637 mc_hh_w16( src+i_src, i_src, tmp2, 16, i_height );
638 pixel_avg_w16( dst, i_dst, tmp1, 16, tmp2, 16, i_height );
/* motion_compensation_luma: table-driven dispatcher for luma prediction.
 * src is first moved by the whole-pixel part of the motion vector
 * ((mvy >> 2) rows and (mvx >> 2) columns). The routine is then chosen
 * from pf_mc by block width (first index: 0 for the _w4 routines, 1 for
 * _w8 when i_width == 8, 2 for _w16 when i_width == 16) and by the low two
 * bits of mvy (second index) and mvx (third index).
 * The mvx / mvy parameter line and the condition guarding the _w4 call are
 * on lines not visible in this listing. Widths other than those tested
 * reach none of the visible calls. */
641 static void motion_compensation_luma( uint8_t *src, int i_src,
642 uint8_t *dst, int i_dst,
644 int i_width, int i_height )
646 static const pf_mc_t pf_mc[3][4][4] = /*XXX [dqy][dqx] */
649 { mc_copy_w4, mc_xy10_w4, mc_hh_w4, mc_xy30_w4 },
650 { mc_xy01_w4, mc_xy11_w4, mc_xy21_w4, mc_xy31_w4 },
651 { mc_hv_w4, mc_xy12_w4, mc_hc_w4, mc_xy32_w4 },
652 { mc_xy03_w4, mc_xy13_w4, mc_xy23_w4, mc_xy33_w4 },
655 { mc_copy_w8, mc_xy10_w8, mc_hh_w8, mc_xy30_w8 },
656 { mc_xy01_w8, mc_xy11_w8, mc_xy21_w8, mc_xy31_w8 },
657 { mc_hv_w8, mc_xy12_w8, mc_hc_w8, mc_xy32_w8 },
658 { mc_xy03_w8, mc_xy13_w8, mc_xy23_w8, mc_xy33_w8 },
661 { mc_copy_w16, mc_xy10_w16, mc_hh_w16, mc_xy30_w16 },
662 { mc_xy01_w16, mc_xy11_w16, mc_xy21_w16, mc_xy31_w16 },
663 { mc_hv_w16, mc_xy12_w16, mc_hc_w16, mc_xy32_w16 },
664 { mc_xy03_w16, mc_xy13_w16, mc_xy23_w16, mc_xy33_w16 },
668 src += (mvy >> 2) * i_src + (mvx >> 2);
671 pf_mc[0][mvy&0x03][mvx&0x03]( src, i_src, dst, i_dst, i_height );
673 else if( i_width == 8 )
675 pf_mc[1][mvy&0x03][mvx&0x03]( src, i_src, dst, i_dst, i_height );
677 else if( i_width == 16 )
679 pf_mc[2][mvy&0x03][mvx&0x03]( src, i_src, dst, i_dst, i_height );
/* mc_luma_altivec: luma prediction from four source planes, src[0..3].
 * The plane index is built from the low bit of a halved x coordinate
 * (bit 0) and of a halved y coordinate (bit 1).
 * NOTE(review): presumably the planes are the full-pel picture and its
 * pre-filtered half-pel versions - confirm with the caller.
 *   - A first position (src1) is always computed from hpel1x / hpel1y.
 *   - If either mvx or mvy is odd, a second position (src2) is computed
 *     and dst receives the average of src1 and src2 via pixel_avg_w4,
 *     pixel_avg_w8 or pixel_avg_w16.
 *   - The mc_copy_w4 / w8 / w16 calls copy src1 to dst; presumably this is
 *     the branch taken when no averaging is needed - confirm.
 * `correction` is 1 only when the low two bits of (mvx, mvy) are (3, 1) or
 * (1, 3); it moves one half-row step from the first position to the
 * second. The mvx / mvy parameter line, the hpel1x declaration and the
 * width-selection statements are on lines not visible in this listing. */
683 void mc_luma_altivec( uint8_t *src[4], int i_src_stride,
684 uint8_t *dst, int i_dst_stride,
686 int i_width, int i_height )
688 uint8_t *src1, *src2;
690 /* todo : fixme... */
691 int correction = ((mvx&3) == 3 && (mvy&3) == 1 || (mvx&3) == 1 && (mvy&3) == 3) ? 1:0;
694 int hpel1y = (mvy+1-correction)>>1;
695 int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
698 src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
700 if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
702 int hpel2x = (mvx+1)>>1;
703 int hpel2y = (mvy+correction)>>1;
704 int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
706 src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
710 pixel_avg_w4( dst, i_dst_stride, src1, i_src_stride,
711 src2, i_src_stride, i_height );
714 pixel_avg_w8( dst, i_dst_stride, src1, i_src_stride,
715 src2, i_src_stride, i_height );
719 pixel_avg_w16( dst, i_dst_stride, src1, i_src_stride,
720 src2, i_src_stride, i_height );
728 mc_copy_w4( src1, i_src_stride, dst, i_dst_stride, i_height );
731 mc_copy_w8( src1, i_src_stride, dst, i_dst_stride, i_height );
734 mc_copy_w16( src1, i_src_stride, dst, i_dst_stride, i_height );
/* get_ref_altivec: same position arithmetic as mc_luma_altivec, but it
 * returns a pointer and takes the destination stride by pointer.
 *   - If either mvx or mvy is odd, the two selected positions are averaged
 *     into dst using *i_dst_stride as the destination pitch.
 *   - *i_dst_stride is overwritten with i_src_stride on the line after
 *     the averaging calls.
 * NOTE(review): presumably that line belongs to the no-averaging branch,
 * which hands back a pointer into the source plane instead of copying -
 * the return statements and the branch structure are on lines not visible
 * in this listing, so confirm. */
741 uint8_t *get_ref_altivec( uint8_t *src[4], int i_src_stride,
742 uint8_t *dst, int * i_dst_stride,
744 int i_width, int i_height )
746 uint8_t *src1, *src2;
748 /* todo : fixme... */
749 int correction = ((mvx&3) == 3 && (mvy&3) == 1 || (mvx&3) == 1 && (mvy&3) == 3) ? 1:0;
752 int hpel1y = (mvy+1-correction)>>1;
753 int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
756 src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
758 if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
760 int hpel2x = (mvx+1)>>1;
761 int hpel2y = (mvy+correction)>>1;
762 int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
764 src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
768 pixel_avg_w4( dst, *i_dst_stride, src1, i_src_stride,
769 src2, i_src_stride, i_height );
772 pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
773 src2, i_src_stride, i_height );
777 pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
778 src2, i_src_stride, i_height );
785 *i_dst_stride = i_src_stride;
/* mc_chroma_altivec: chroma prediction by weighted 2x2 interpolation.
 * d8x / d8y are the low three bits of mvx / mvy. src is moved by
 * (mvy >> 3) rows and (mvx >> 3) columns, and srcp points one row below.
 * Each output pixel is
 *   ( coeff[0]*src[x] + coeff[1]*src[x+1] +
 *     coeff[2]*srcp[x] + coeff[3]*srcp[x+1] + 32 ) >> 6.
 * Two implementations are visible: a scalar loop over i_width columns and
 * a vector path that assumes i_width == 8.
 * NOTE(review): the assignment of coeff[3], the test that selects the
 * scalar path, the declarations of the vector temporaries and the reset of
 * dstv_16 before accumulation are on lines not visible in this listing. */
790 static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
791 uint8_t *dst, int i_dst_stride,
793 int i_width, int i_height )
797 int d8x = mvx & 0x07;
798 int d8y = mvy & 0x07;
800 DECLARE_ALIGNED( uint16_t, coeff[4], 16 );
801 coeff[0] = (8-d8x)*(8-d8y);
802 coeff[1] = d8x *(8-d8y);
803 coeff[2] = (8-d8x)*d8y;
806 src += (mvy >> 3) * i_src_stride + (mvx >> 3);
807 srcp = &src[i_src_stride];
812 for( y = 0; y < i_height; y++ )
814 for( x = 0; x < i_width; x++ )
816 dst[x] = ( coeff[0]*src[x] + coeff[1]*src[x+1] +
817 coeff[2]*srcp[x] + coeff[3]*srcp[x+1] + 32 ) >> 6;
822 srcp += i_src_stride;
827 /* We now assume that i_width == 8 */
832 vec_u16_t srcv_16[4];
/* Broadcast each of the four weights to every lane of its own vector.
 * coeffv[0] is splatted last because it is the source of the other three. */
838 coeffv[0] = vec_ld( 0, coeff );
839 coeffv[3] = vec_splat( coeffv[0], 3 );
840 coeffv[2] = vec_splat( coeffv[0], 2 );
841 coeffv[1] = vec_splat( coeffv[0], 1 );
842 coeffv[0] = vec_splat( coeffv[0], 0 );
/* k32v = 1 << 5 = 32, shiftv = 6: the same rounding constant and shift
 * as the scalar formula above. permv is a shift-left-by-one-byte mask
 * used to form the x+1 neighbours. */
843 k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
844 permv = vec_lvsl( 0, (uint8_t *) 1 );
845 shiftv = vec_splat_u16( 6 );
847 LOAD_16( src, srcv_8[2] );
848 srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
850 for( y = 0; y < i_height; y++ )
/* The row loaded as the lower row on the previous pass becomes the upper
 * row now, so only one new row is loaded per iteration. */
854 srcv_8[0] = srcv_8[2];
855 srcv_8[1] = srcv_8[3];
856 LOAD_16( srcp, srcv_8[2] );
857 srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
860 for( i = 0; i < 4; i++ )
862 CONVERT_U8_TO_U16( srcv_8[i], srcv_16[i] );
863 srcv_16[i] = vec_mladd( coeffv[i], srcv_16[i], zero_u16v );
864 dstv_16 = vec_add( dstv_16, srcv_16[i] );
866 dstv_16 = vec_sr( dstv_16, shiftv );
867 CONVERT_U16_TO_U8( dstv_16, dstv_8 );
868 STORE_8( dstv_8, dst );
871 srcp += i_src_stride;
/* x264_mc_altivec_init: installs the routines of this file into the
 * function table pf by setting its mc_luma, get_ref and mc_chroma
 * members. No other member of pf is touched in the visible lines. */
875 void x264_mc_altivec_init( x264_mc_functions_t *pf )
877 pf->mc_luma = mc_luma_altivec;
878 pf->get_ref = get_ref_altivec;
879 pf->mc_chroma = mc_chroma_altivec;