1 /*****************************************************************************
2 * mc.c: h264 encoder library (Motion Compensation)
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Eric Petit <eric.petit@lapsus.org>
7 * Guillaume Poirier <gpoirier@mplayerhq.hu>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 *****************************************************************************/
31 #include "common/common.h"
32 #include "common/mc.h"
34 #include "ppccommon.h"
/* Generic scalar motion-compensation function pointer:
 * (src, src_stride, dst, dst_stride, height). */
36 typedef void (*pf_mc_t)( uint8_t *src, int i_src,
37 uint8_t *dst, int i_dst, int i_height );
/* For each quarter-pel position (qpel_idx = (fracy&3)<<2 | (fracx&3))
 * these tables pick which of the four src[] planes feed the qpel
 * average.  NOTE(review): presumably 0 = full-pel, 1 = horizontal
 * half-pel, 2 = vertical half-pel, 3 = centre — confirm against the
 * frame's plane layout. */
40 static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
41 static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
/* H.264 6-tap half-pel filter sampled along stride i_pix_next:
 * pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[2] + pix[3]
 * (tap offsets scaled by i_pix_next).  The result is unnormalized;
 * the caller applies rounding and the final shift.
 * Restored the function braces and the final pix[3*i_pix_next] tap,
 * which were missing from this excerpt. */
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
    return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
           pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
           pix[ 3*i_pix_next];
}
/* Same 6-tap filter as x264_tapfilter with an implicit stride of 1
 * (horizontal direction): pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1])
 * - 5*pix[2] + pix[3], unnormalized.
 * Restored the function braces and the final pix[3] tap, which were
 * missing from this excerpt. */
static inline int x264_tapfilter1( uint8_t *pix )
{
    return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] +
           pix[ 3];
}
/* Rounding average of two 4-pixel-wide blocks, scalar code
 * (4 bytes is too narrow to benefit from AltiVec):
 * dst[x] = (src1[x] + src2[x] + 1) >> 1 per pixel.
 * Both sources advance by i_src1 per row, matching the other
 * avg2 widths.  Restored the braces, the loop-counter declaration
 * and the per-row pointer increments missing from this excerpt. */
static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, int i_dst,
                                               uint8_t *src1, int i_src1,
                                               uint8_t *src2, int i_height )
{
    int x, y;
    for( y = 0; y < i_height; y++ )
    {
        for( x = 0; x < 4; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
/* Rounding average of two 8-pixel-wide blocks using vec_avg
 * ((a+b+1)>>1 per byte lane); one row per loop iteration.
 * NOTE(review): this excerpt appears to be missing lines (opening
 * brace, 'int y;' declaration, the store-prep macro and the per-row
 * pointer increments) — confirm against the complete file. */
74 static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, int i_dst,
75 uint8_t *src1, int i_src1,
76 uint8_t *src2, int i_height )
79 vec_u8_t src1v, src2v;
82 PREP_LOAD_SRC( src1 );
83 PREP_LOAD_SRC( src2 );
85 for( y = 0; y < i_height; y++ )
87 VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
88 VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
89 src1v = vec_avg( src1v, src2v );
90 VEC_STORE8( src1v, dst );
/* Rounding average of two 16-pixel-wide blocks with vec_avg.
 * vec_st ignores the low 4 address bits, so dst must be 16-byte
 * aligned here (unaligned loads are handled by the VEC_LOAD macros).
 * NOTE(review): this excerpt appears to be missing lines (opening
 * brace, 'int y;' declaration and the per-row pointer increments) —
 * confirm against the complete file. */
98 static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, int i_dst,
99 uint8_t *src1, int i_src1,
100 uint8_t *src2, int i_height )
103 vec_u8_t src1v, src2v;
105 PREP_LOAD_SRC( src1 );
106 PREP_LOAD_SRC( src2 );
108 for( y = 0; y < i_height; y++ )
110 VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
111 VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
112 src1v = vec_avg( src1v, src2v );
113 vec_st(src1v, 0, dst);
/* 20-pixel-wide rounding average: the 16-wide vector kernel covers
 * the first 16 columns, the scalar 4-wide kernel handles the 4-pixel
 * tail.  Restored the function braces missing from this excerpt. */
static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, int i_dst,
                                                uint8_t *src1, int i_src1,
                                                uint8_t *src2, int i_height )
{
    x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
    x264_pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height);
}
/* mc_copy: plain C row-by-row block copy.
 * MC_COPY(name, a) instantiates a copy function for width 'a';
 * AltiVec is not worthwhile below 16 bytes per row.
 * Restored the braces, 'int y;' and per-row pointer increments
 * missing from this excerpt. */
#define MC_COPY( name, a )                                \
static void name( uint8_t *dst, int i_dst,                \
                  uint8_t *src, int i_src, int i_height ) \
{                                                         \
    int y;                                                \
    for( y = 0; y < i_height; y++ )                       \
    {                                                     \
        memcpy( dst, src, a );                            \
        src += i_src;                                     \
        dst += i_dst;                                     \
    }                                                     \
}

MC_COPY( x264_mc_copy_w4_altivec,  4  )
MC_COPY( x264_mc_copy_w8_altivec,  8  )
/* 16-pixel-wide block copy: one unaligned vector load plus one
 * aligned vec_st per row (dst must be 16-byte aligned).
 * NOTE(review): this excerpt appears to be missing lines (opening
 * brace, 'int y;'/'cpyV' declarations and the per-row pointer
 * increments) — confirm against the complete file. */
146 static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,
147 uint8_t *src, int i_src, int i_height )
152 PREP_LOAD_SRC( src );
154 for( y = 0; y < i_height; y++)
156 VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
157 vec_st(cpyV, 0, dst);
165 static void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
166 uint8_t *src[4], int i_src_stride,
168 int i_width, int i_height )
170 int qpel_idx = ((mvy&3)<<2) + (mvx&3);
171 int offset = (mvy>>2)*i_src_stride + (mvx>>2);
172 uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
173 if( qpel_idx & 5 ) /* qpel interpolation needed */
175 uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
179 x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
182 x264_pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
186 x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
194 x264_mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
197 x264_mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
200 x264_mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
208 static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride,
209 uint8_t *src[4], int i_src_stride,
211 int i_width, int i_height )
213 int qpel_idx = ((mvy&3)<<2) + (mvx&3);
214 int offset = (mvy>>2)*i_src_stride + (mvx>>2);
215 uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
216 if( qpel_idx & 5 ) /* qpel interpolation needed */
218 uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
221 x264_pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
224 x264_pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
229 x264_pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
232 x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
239 *i_dst_stride = i_src_stride;
/* Widen the 8-bit source row src<a>v_8 to 16-bit lanes, multiply by
 * its bilinear coefficient vector coeff<a>v and accumulate into
 * dstv_16.  'a' (0..3) is token-pasted into the variable names. */
244 #define DO_PROCESS(a) \
245 src##a##v_16 = vec_u8_to_u16( src##a##v_8 ); \
246 src##a##v_16 = vec_mladd( coeff##a##v, src##a##v_16, zero_u16v ); \
247 dstv_16 = vec_add( dstv_16, src##a##v_16 )
/* Bilinear chroma MC for 4-pixel-wide blocks at eighth-pel position
 * (mvx, mvy): per-pixel weighted sum of the four neighbouring source
 * pixels with weights (8-dx)(8-dy), dx(8-dy), (8-dx)dy, dx*dy,
 * rounded by k32v (32) and shifted right by 6 (shiftv).
 * src3v_8 is src2v_8 shifted left one byte (vec_perm with lvsl(0,1)),
 * i.e. the pixel-to-the-right row needed by the horizontal taps.
 * NOTE(review): this excerpt appears to be missing lines — e.g. the
 * 'coeff[3] = d8x*d8y;' store, the 'mvx/mvy/i_height' parameter line,
 * 'srcp'/'y' declarations, loop braces, the dstv_16 accumulation
 * (DO_PROCESS calls) and the row-rotation of src0/src1 — confirm
 * against the complete file. */
249 static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
250 uint8_t *src, int i_src_stride,
256 int d8x = mvx & 0x07;
257 int d8y = mvy & 0x07;
259 DECLARE_ALIGNED_16( uint16_t coeff[4] );
260 coeff[0] = (8-d8x)*(8-d8y);
261 coeff[1] = d8x *(8-d8y);
262 coeff[2] = (8-d8x)*d8y;
265 src += (mvy >> 3) * i_src_stride + (mvx >> 3);
266 srcp = &src[i_src_stride];
270 PREP_LOAD_SRC( src );
272 vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
273 vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
274 vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16;
281 coeff0v = vec_ld( 0, coeff );
282 coeff3v = vec_splat( coeff0v, 3 );
283 coeff2v = vec_splat( coeff0v, 2 );
284 coeff1v = vec_splat( coeff0v, 1 );
285 coeff0v = vec_splat( coeff0v, 0 );
286 k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
287 permv = vec_lvsl( 0, (uint8_t *) 1 );
288 shiftv = vec_splat_u16( 6 );
290 VEC_LOAD( src, src2v_8, 5, vec_u8_t, src );
291 src3v_8 = vec_perm( src2v_8, src2v_8, permv );
293 for( y = 0; y < i_height; y++ )
297 VEC_LOAD_G( srcp, src2v_8, 5, vec_u8_t );
298 src3v_8 = vec_perm( src2v_8, src2v_8, permv );
307 dstv_16 = vec_sr( dstv_16, shiftv );
308 dstv_8 = vec_u16_to_u8( dstv_16 );
309 VEC_STORE4( dstv_8, dst );
312 srcp += i_src_stride;
/* Bilinear chroma MC for 8-pixel-wide blocks — identical algorithm to
 * mc_chroma_altivec_4xh (bilinear weights, +32, >>6) but loading 9
 * source bytes per row and storing 8 output pixels.
 * NOTE(review): this excerpt appears to be missing lines — e.g. the
 * 'coeff[3] = d8x*d8y;' store, the 'mvx/mvy/i_height' parameter line,
 * 'srcp'/'y' declarations, loop braces, the DO_PROCESS accumulation
 * and the row-rotation of src0/src1 — confirm against the complete
 * file. */
316 static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
317 uint8_t *src, int i_src_stride,
323 int d8x = mvx & 0x07;
324 int d8y = mvy & 0x07;
326 DECLARE_ALIGNED_16( uint16_t coeff[4] );
327 coeff[0] = (8-d8x)*(8-d8y);
328 coeff[1] = d8x *(8-d8y);
329 coeff[2] = (8-d8x)*d8y;
332 src += (mvy >> 3) * i_src_stride + (mvx >> 3);
333 srcp = &src[i_src_stride];
337 PREP_LOAD_SRC( src );
339 vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
340 vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
341 vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16;
348 coeff0v = vec_ld( 0, coeff );
349 coeff3v = vec_splat( coeff0v, 3 );
350 coeff2v = vec_splat( coeff0v, 2 );
351 coeff1v = vec_splat( coeff0v, 1 );
352 coeff0v = vec_splat( coeff0v, 0 );
353 k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
354 permv = vec_lvsl( 0, (uint8_t *) 1 );
355 shiftv = vec_splat_u16( 6 );
357 VEC_LOAD( src, src2v_8, 9, vec_u8_t, src);
358 src3v_8 = vec_perm( src2v_8, src2v_8, permv );
360 for( y = 0; y < i_height; y++ )
364 VEC_LOAD_G( srcp, src2v_8, 9, vec_u8_t );
365 src3v_8 = vec_perm( src2v_8, src2v_8, permv );
374 dstv_16 = vec_sr( dstv_16, shiftv );
375 dstv_8 = vec_u16_to_u8( dstv_16 );
376 VEC_STORE8( dstv_8, dst );
379 srcp += i_src_stride;
/* Chroma MC dispatcher: 8-wide blocks go to the vectorised 8xh
 * kernel, everything narrower to the 4xh kernel.
 * Restored the 'int mvx, int mvy' parameter line and the if/else
 * scaffolding missing from this excerpt. */
static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
                               uint8_t *src, int i_src_stride,
                               int mvx, int mvy,
                               int i_width, int i_height )
{
    if( i_width == 8 )
        mc_chroma_altivec_8xh( dst, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
    else
        mc_chroma_altivec_4xh( dst, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
}
/* Exact 6-tap half-pel filter core on 16-bit lanes.  With the six
 * taps paired as a = t1+t6, b = t2+t5, c = t3+t4, it computes
 * a - 5*b + 20*c and leaves the result in t1v (t2v and t3v are
 * clobbered).  twov/fourv are splatted shift counts (2 and 4). */
400 #define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \
402 t1v = vec_add( t1v, t6v ); \
403 t2v = vec_add( t2v, t5v ); \
404 t3v = vec_add( t3v, t4v ); \
406 t1v = vec_sub( t1v, t2v ); /* (a-b) */ \
407 t2v = vec_sub( t2v, t3v ); /* (b-c) */ \
408 t2v = vec_sl( t2v, twov ); /* (b-c)*4 */ \
409 t1v = vec_sub( t1v, t2v ); /* a-5*b+4*c */ \
410 t3v = vec_sl( t3v, fourv ); /* 16*c */ \
411 t1v = vec_add( t1v, t3v ); /* a-5*b+20*c */ \
/* Pre-scaled variant of the 6-tap core used by the centre filter:
 * with a = t1+t6, b = t2+t5, c = t3+t4 it evaluates
 * ((a-b)/4 - b + c)/4 + c = (a - 5*b + 20*c)/16, keeping the
 * intermediate magnitudes within 16-bit range.  The vec_sra
 * arithmetic shifts truncate toward -inf, so this is an
 * approximation of the exact division by 16.  Result lands in t1v;
 * t2v/t3v are clobbered. */
414 #define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) \
416 t1v = vec_add( t1v, t6v ); \
417 t2v = vec_add( t2v, t5v ); \
418 t3v = vec_add( t3v, t4v ); \
420 t1v = vec_sub( t1v, t2v ); /* (a-b) */ \
421 t1v = vec_sra( t1v, twov ); /* (a-b)/4 */ \
422 t1v = vec_sub( t1v, t2v ); /* (a-b)/4-b */ \
423 t1v = vec_add( t1v, t3v ); /* (a-b)/4-b+c */ \
424 t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ \
425 t1v = vec_add( t1v, t3v ); /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
/* One 16-pixel horizontal half-pel step: load 21 source bytes
 * (x-2 .. x+18) via two vector loads, build the six shifted tap rows
 * with vec_sld, run HPEL_FILTER_1 separately on the high and low
 * 8-lane halves, round with +16 (sixteenv) and >>5 (fivev), then
 * pack both halves with unsigned saturation and store 16 output
 * pixels to dsth.  Uses y, x, i_stride, src and the temp*/dest*
 * vectors from the enclosing function. */
428 #define HPEL_FILTER_HORIZONTAL() \
430 VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
431 VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
433 src2v = vec_sld( src1v, src6v, 1 ); \
434 src3v = vec_sld( src1v, src6v, 2 ); \
435 src4v = vec_sld( src1v, src6v, 3 ); \
436 src5v = vec_sld( src1v, src6v, 4 ); \
437 src6v = vec_sld( src1v, src6v, 5 ); \
439 temp1v = vec_u8_to_s16_h( src1v ); \
440 temp2v = vec_u8_to_s16_h( src2v ); \
441 temp3v = vec_u8_to_s16_h( src3v ); \
442 temp4v = vec_u8_to_s16_h( src4v ); \
443 temp5v = vec_u8_to_s16_h( src5v ); \
444 temp6v = vec_u8_to_s16_h( src6v ); \
446 HPEL_FILTER_1( temp1v, temp2v, temp3v, \
447 temp4v, temp5v, temp6v ); \
449 dest1v = vec_add( temp1v, sixteenv ); \
450 dest1v = vec_sra( dest1v, fivev ); \
452 temp1v = vec_u8_to_s16_l( src1v ); \
453 temp2v = vec_u8_to_s16_l( src2v ); \
454 temp3v = vec_u8_to_s16_l( src3v ); \
455 temp4v = vec_u8_to_s16_l( src4v ); \
456 temp5v = vec_u8_to_s16_l( src5v ); \
457 temp6v = vec_u8_to_s16_l( src6v ); \
459 HPEL_FILTER_1( temp1v, temp2v, temp3v, \
460 temp4v, temp5v, temp6v ); \
462 dest2v = vec_add( temp1v, sixteenv ); \
463 dest2v = vec_sra( dest2v, fivev ); \
465 destv = vec_packsu( dest1v, dest2v ); \
467 VEC_STORE16( destv, &dsth[x+i_stride*y], dsth ); \
/* One 16-pixel vertical half-pel step: load the six rows y-2 .. y+3
 * at column x, run HPEL_FILTER_1 on the high half (into temp1v) and
 * — using temp4v..temp9v so the high-half taps survive for the
 * centre filter — on the low half (into temp4v), round (+16, >>5),
 * pack with saturation and store to dstv.
 * NOTE(review): the store macro is tagged 'dsth' (VEC_STORE16(...,
 * &dstv[...], dsth)) — presumably reusing dsth's store preparation;
 * confirm this is intentional in the full file. */
470 #define HPEL_FILTER_VERTICAL() \
472 VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
473 VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
474 VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
475 VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
476 VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
477 VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
479 temp1v = vec_u8_to_s16_h( src1v ); \
480 temp2v = vec_u8_to_s16_h( src2v ); \
481 temp3v = vec_u8_to_s16_h( src3v ); \
482 temp4v = vec_u8_to_s16_h( src4v ); \
483 temp5v = vec_u8_to_s16_h( src5v ); \
484 temp6v = vec_u8_to_s16_h( src6v ); \
486 HPEL_FILTER_1( temp1v, temp2v, temp3v, \
487 temp4v, temp5v, temp6v ); \
489 dest1v = vec_add( temp1v, sixteenv ); \
490 dest1v = vec_sra( dest1v, fivev ); \
492 temp4v = vec_u8_to_s16_l( src1v ); \
493 temp5v = vec_u8_to_s16_l( src2v ); \
494 temp6v = vec_u8_to_s16_l( src3v ); \
495 temp7v = vec_u8_to_s16_l( src4v ); \
496 temp8v = vec_u8_to_s16_l( src5v ); \
497 temp9v = vec_u8_to_s16_l( src6v ); \
499 HPEL_FILTER_1( temp4v, temp5v, temp6v, \
500 temp7v, temp8v, temp9v ); \
502 dest2v = vec_add( temp4v, sixteenv ); \
503 dest2v = vec_sra( dest2v, fivev ); \
505 destv = vec_packsu( dest1v, dest2v ); \
507 VEC_STORE16( destv, &dstv[x+i_stride*y], dsth ); \
/* Centre (diagonal) half-pel step: horizontally filter the 16-bit
 * intermediate results of the vertical pass.  tempav..tempdv hold
 * consecutive 8-lane chunks of the vertical filter output; vec_sld
 * across adjacent chunks forms the -2..+3 horizontal taps.  The
 * pre-scaled HPEL_FILTER_2 ((a-5b+20c)/16) is used, then +32
 * (thirtytwov), >>6 (sixv), saturating pack, and store to
 * dstc[x-16 ..] — one chunk behind, since the centre result for a
 * column needs the vertical output of the following column.
 * NOTE(review): temp3v is consumed by HPEL_FILTER_2 but never set
 * in this excerpt — presumably assigned from tempbv/tempcv on lines
 * missing here (original lines 514/527); confirm against the
 * complete file. */
510 #define HPEL_FILTER_CENTRAL() \
512 temp1v = vec_sld( tempav, tempbv, 12 ); \
513 temp2v = vec_sld( tempav, tempbv, 14 ); \
515 temp4v = vec_sld( tempbv, tempcv, 2 ); \
516 temp5v = vec_sld( tempbv, tempcv, 4 ); \
517 temp6v = vec_sld( tempbv, tempcv, 6 ); \
519 HPEL_FILTER_2( temp1v, temp2v, temp3v, \
520 temp4v, temp5v, temp6v ); \
522 dest1v = vec_add( temp1v, thirtytwov ); \
523 dest1v = vec_sra( dest1v, sixv ); \
525 temp1v = vec_sld( tempbv, tempcv, 12 ); \
526 temp2v = vec_sld( tempbv, tempcv, 14 ); \
528 temp4v = vec_sld( tempcv, tempdv, 2 ); \
529 temp5v = vec_sld( tempcv, tempdv, 4 ); \
530 temp6v = vec_sld( tempcv, tempdv, 6 ); \
532 HPEL_FILTER_2( temp1v, temp2v, temp3v, \
533 temp4v, temp5v, temp6v ); \
535 dest2v = vec_add( temp1v, thirtytwov ); \
536 dest2v = vec_sra( dest2v, sixv ); \
538 destv = vec_packsu( dest1v, dest2v ); \
540 VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
/* Produce the three half-pel interpolation planes of a luma plane:
 * dsth (horizontal), dstv (vertical) and dstc (centre/diagonal),
 * processing 16 pixels per inner iteration.  The centre plane is
 * computed from the vertical pass's 16-bit intermediates, one chunk
 * behind the vertical filter; the tail of each row uses partial
 * loads and a final HPEL_FILTER_CENTRAL flush.
 * NOTE(review): this excerpt is missing many lines — loop braces,
 * 'x'/'y' declarations, the temp_u.s[] stores that feed each
 * vec_splat constant (2, 4, 5, 6, 16, 32 judging by the variable
 * names), and the tempav..tempdv rotation between iterations —
 * confirm against the complete file before modifying. */
543 void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
544 int i_stride, int i_width, int i_height )
549 vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
550 vec_s16_t dest1v, dest2v;
551 vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
552 vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
557 PREP_STORE16_DST( dsth );
560 vec_u16_t twov, fourv, fivev, sixv;
561 vec_s16_t sixteenv, thirtytwov;
562 vect_ushort_u temp_u;
565 twov = vec_splat( temp_u.v, 0 );
567 fourv = vec_splat( temp_u.v, 0 );
569 fivev = vec_splat( temp_u.v, 0 );
571 sixv = vec_splat( temp_u.v, 0 );
573 sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
575 thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );
577 for( y = 0; y < i_height; y++ )
581 /* horizontal_filter */
582 HPEL_FILTER_HORIZONTAL();
584 /* vertical_filter */
585 HPEL_FILTER_VERTICAL();
590 tempcv = vec_splat( temp1v, 0 ); /* first only */
594 for( x = 16; x < i_width; x+=16 )
596 /* horizontal_filter */
597 HPEL_FILTER_HORIZONTAL();
599 /* vertical_filter */
600 HPEL_FILTER_VERTICAL();
609 HPEL_FILTER_CENTRAL();
612 /* Partial vertical filter */
613 VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
614 VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
615 VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
616 VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
617 VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
618 VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );
620 temp1v = vec_u8_to_s16_h( src1v );
621 temp2v = vec_u8_to_s16_h( src2v );
622 temp3v = vec_u8_to_s16_h( src3v );
623 temp4v = vec_u8_to_s16_h( src4v );
624 temp5v = vec_u8_to_s16_h( src5v );
625 temp6v = vec_u8_to_s16_h( src6v );
627 HPEL_FILTER_1( temp1v, temp2v, temp3v,
628 temp4v, temp5v, temp6v );
635 /* tempev is not used */
637 HPEL_FILTER_CENTRAL();
641 void x264_mc_altivec_init( x264_mc_functions_t *pf )
643 pf->mc_luma = mc_luma_altivec;
644 pf->get_ref = get_ref_altivec;
645 pf->mc_chroma = mc_chroma_altivec;
647 pf->hpel_filter = x264_hpel_filter_altivec;