/* Source: git.sesse.net Git - x264/blob - common/ppc/mc.c
 * Commit: "Update file headers throughout x264"
 * Path:   [x264] / common / ppc / mc.c */
1 /*****************************************************************************
2  * mc.c: h264 encoder library (Motion Compensation)
3  *****************************************************************************
4  * Copyright (C) 2003-2008 x264 project
5  *
6  * Authors: Eric Petit <titer@m0k.org>
7  *          Guillaume Poirier <gpoirier@mplayerhq.hu>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *****************************************************************************/
23
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <stdint.h>
28 #include <stdarg.h>
29
30 #ifdef SYS_LINUX
31 #include <altivec.h>
32 #endif
33
34 #include "x264.h"
35 #include "common/common.h"
36 #include "common/mc.h"
37 #include "mc.h"
38 #include "ppccommon.h"
39
40 typedef void (*pf_mc_t)( uint8_t *src, int i_src,
41                          uint8_t *dst, int i_dst, int i_height );
42
43 static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
44 {
45     return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
46            pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
47            pix[ 3*i_pix_next];
48 }
/* 6-tap (1, -5, 20, 20, -5, 1) filter over the six consecutive samples
 * pix[-2] .. pix[3].  The result is neither rounded nor normalised. */
static inline int x264_tapfilter1( uint8_t *pix )
{
    int acc = 20 * ( pix[0] + pix[1] );

    acc -= 5 * ( pix[-1] + pix[2] );
    acc += pix[-2] + pix[3];
    return acc;
}
54
55 /* pixel_avg */
/* Rounded average of two 4-pixel-wide blocks: dst = (src1 + src2 + 1) >> 1,
 * processed row by row for i_height rows with independent strides. */
static inline void pixel_avg_w4( uint8_t *dst,  int i_dst,
                                 uint8_t *src1, int i_src1,
                                 uint8_t *src2, int i_src2,
                                 int i_height )
{
    int rows_left, col;

    for( rows_left = i_height; rows_left > 0; rows_left-- )
    {
        for( col = 0; col < 4; col++ )
            dst[col] = ( src1[col] + src2[col] + 1 ) >> 1;

        src1 += i_src1;
        src2 += i_src2;
        dst  += i_dst;
    }
}
73 static inline void pixel_avg_w8( uint8_t *dst,  int i_dst,
74                                  uint8_t *src1, int i_src1,
75                                  uint8_t *src2, int i_src2,
76                                  int i_height )
77 {
78     int y;
79     vec_u8_t src1v, src2v;
80     LOAD_ZERO;
81     PREP_LOAD;
82     PREP_STORE8;
83     for( y = 0; y < i_height; y++ )
84     {
85         VEC_LOAD( src1, src1v, 8, vec_u8_t );
86         VEC_LOAD( src2, src2v, 8, vec_u8_t );
87         src1v = vec_avg( src1v, src2v );
88         VEC_STORE8( src1v, dst );
89
90         dst  += i_dst;
91         src1 += i_src1;
92         src2 += i_src2;
93     }
94 }
95 static inline void pixel_avg_w16( uint8_t *dst,  int i_dst,
96                                   uint8_t *src1, int i_src1,
97                                   uint8_t *src2, int i_src2,
98                                   int i_height )
99 {
100     int y;
101     vec_u8_t src1v, src2v;
102     PREP_LOAD;
103     PREP_STORE16;
104     for( y = 0; y < i_height; y++ )
105     {
106         VEC_LOAD( src1, src1v, 16, vec_u8_t );
107         VEC_LOAD( src2, src2v, 16, vec_u8_t );
108         src1v = vec_avg( src1v, src2v );
109         VEC_STORE16( src1v, dst );
110
111         dst  += i_dst;
112         src1 += i_src1;
113         src2 += i_src2;
114     }
115 }
116
117 /* mc_copy: plain c */
/* Generates a static function `name` that copies i_height rows of `a` bytes
 * each from src to dst, advancing each pointer by its own stride. */
#define MC_COPY( name, a )                                \
static void name( uint8_t *src, int i_src,                \
                  uint8_t *dst, int i_dst, int i_height ) \
{                                                         \
    int rows_left;                                        \
    for( rows_left = i_height; rows_left > 0; rows_left-- ) \
    {                                                     \
        memcpy( dst, src, a );                            \
        dst += i_dst;                                     \
        src += i_src;                                     \
    }                                                     \
}
MC_COPY( mc_copy_w4,  4  )
MC_COPY( mc_copy_w8,  8  )
MC_COPY( mc_copy_w16, 16 )
133
/* Luma motion compensation into dst.
 *
 * dst, i_dst_stride : destination block and its stride
 * src, i_src_stride : four reference planes sharing one stride; the plane
 *                     is chosen by (x half-pel bit) + 2 * (y half-pel bit)
 *                     (presumably the full-pel plane plus the three
 *                     half-pel interpolated planes - confirm with caller)
 * mvx, mvy          : motion vector; the low bit of each component selects
 *                     quarter-pel interpolation
 * i_width, i_height : block size; widths 4 and 8 have dedicated paths and
 *                     every other width is handled as 16
 *
 * When either component has its low bit set, the result is the rounded
 * average of the two nearest half-pel samples; otherwise the selected
 * plane is copied as is. */
void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
                      uint8_t *src[4], int i_src_stride,
                      int mvx, int mvy,
                      int i_width, int i_height )
{
    uint8_t *src1;

    /* todo : fixme... */
    int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1:0;

    /* First half-pel reference: position in half-pel units, then plane
     * selection from the two low bits and the integer offset from the rest. */
    int hpel1x = mvx>>1;
    int hpel1y = (mvy+1-correction)>>1;
    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );

    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);

    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
    {
        /* Second half-pel reference, rounded the opposite way. */
        int hpel2x = (mvx+1)>>1;
        int hpel2y = (mvy+correction)>>1;
        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );

        uint8_t *src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);

        switch(i_width) {
        case 4:
            pixel_avg_w4( dst, i_dst_stride, src1, i_src_stride,
                          src2, i_src_stride, i_height );
            break;
        case 8:
            pixel_avg_w8( dst, i_dst_stride, src1, i_src_stride,
                          src2, i_src_stride, i_height );
            break;
        case 16:
        default:
            pixel_avg_w16( dst, i_dst_stride, src1, i_src_stride,
                           src2, i_src_stride, i_height );
            break;
        }
    }
    else
    {
        switch(i_width) {
        case 4:
            mc_copy_w4( src1, i_src_stride, dst, i_dst_stride, i_height );
            break;
        case 8:
            mc_copy_w8( src1, i_src_stride, dst, i_dst_stride, i_height );
            break;
        case 16:
        default:
            /* Same fallback as the qpel path, so an unexpected width never
             * leaves dst unwritten. */
            mc_copy_w16( src1, i_src_stride, dst, i_dst_stride, i_height );
            break;
        }
    }
}
191
/* Return a pointer to the reference block for the motion vector (mvx, mvy).
 *
 * If neither component has its low bit set, no interpolation is needed:
 * the function returns a pointer straight into the selected source plane
 * and writes i_src_stride to *i_dst_stride.  Otherwise the two nearest
 * half-pel blocks are averaged into dst (using the stride already stored
 * in *i_dst_stride) and dst is returned.
 *
 * Width 4 and 8 have dedicated paths, width 20 is done as 16 + 4, and every
 * other width (including 12) is processed 16 pixels wide. */
uint8_t *get_ref_altivec( uint8_t *dst,    int * i_dst_stride,
                          uint8_t *src[4], int i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height )
{
    const int frac_x = mvx & 3;
    const int frac_y = mvy & 3;

    /* todo : fixme... */
    const int correction = ( frac_x == 3 && frac_y == 1 ) ||
                           ( frac_x == 1 && frac_y == 3 );

    /* First half-pel reference. */
    const int hx_a = mvx >> 1;
    const int hy_a = ( mvy + 1 - correction ) >> 1;
    uint8_t *ref_a = src[ (hx_a & 1) + ( (hy_a & 1) << 1 ) ]
                   + (hy_a >> 1) * i_src_stride + (hx_a >> 1);

    if( !( (mvx|mvy) & 1 ) )
    {
        /* No qpel interpolation needed: hand back the plane itself. */
        *i_dst_stride = i_src_stride;
        return ref_a;
    }

    {
        /* Second half-pel reference, rounded the opposite way. */
        const int hx_b = ( mvx + 1 ) >> 1;
        const int hy_b = ( mvy + correction ) >> 1;
        uint8_t *ref_b = src[ (hx_b & 1) + ( (hy_b & 1) << 1 ) ]
                       + (hy_b >> 1) * i_src_stride + (hx_b >> 1);

        if( i_width == 4 )
        {
            pixel_avg_w4( dst, *i_dst_stride, ref_a, i_src_stride,
                          ref_b, i_src_stride, i_height );
        }
        else if( i_width == 8 )
        {
            pixel_avg_w8( dst, *i_dst_stride, ref_a, i_src_stride,
                          ref_b, i_src_stride, i_height );
        }
        else if( i_width == 20 )
        {
            //FIXME suboptimal
            pixel_avg_w16( dst, *i_dst_stride, ref_a, i_src_stride,
                           ref_b, i_src_stride, i_height );
            pixel_avg_w4( dst+16, *i_dst_stride, ref_a+16, i_src_stride,
                          ref_b+16, i_src_stride, i_height );
        }
        else
        {
            /* 12, 16 and anything unexpected */
            pixel_avg_w16( dst, *i_dst_stride, ref_a, i_src_stride,
                           ref_b, i_src_stride, i_height );
        }
    }
    return dst;
}
249
/* Accumulate one bilinear tap into dstv_16: widen src<tap>v_8 to 16 bits,
 * multiply it by coeff<tap>v and add the product to the running sum.
 * Expects src<tap>v_8, src<tap>v_16, coeff<tap>v, dstv_16 and zero_u16v to
 * be in scope at the point of use. */
#define DO_PROCESS(tap) \
        src##tap##v_16 = vec_u8_to_u16( src##tap##v_8 ); \
        src##tap##v_16 = vec_mladd( coeff##tap##v, src##tap##v_16, zero_u16v ); \
        dstv_16        = vec_add( dstv_16, src##tap##v_16 )
254
255 static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
256                                    uint8_t *src, int i_src_stride,
257                                    int mvx, int mvy,
258                                    int i_height )
259 {
260     uint8_t *srcp;
261     int y;
262     int d8x = mvx & 0x07;
263     int d8y = mvy & 0x07;
264
265     DECLARE_ALIGNED_16( uint16_t coeff[4] );
266     coeff[0] = (8-d8x)*(8-d8y);
267     coeff[1] = d8x    *(8-d8y);
268     coeff[2] = (8-d8x)*d8y;
269     coeff[3] = d8x    *d8y;
270
271     src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
272     srcp  = &src[i_src_stride];
273     
274     LOAD_ZERO;
275     PREP_LOAD;
276     PREP_STORE4;
277     vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
278     vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
279     vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16;
280     vec_u8_t    dstv_8;
281     vec_u16_t   dstv_16;
282     vec_u8_t    permv;
283     vec_u16_t   shiftv;
284     vec_u16_t   k32v;
285     
286     coeff0v = vec_ld( 0, coeff );
287     coeff3v = vec_splat( coeff0v, 3 );
288     coeff2v = vec_splat( coeff0v, 2 );
289     coeff1v = vec_splat( coeff0v, 1 );
290     coeff0v = vec_splat( coeff0v, 0 );
291     k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
292     permv   = vec_lvsl( 0, (uint8_t *) 1 );
293     shiftv  = vec_splat_u16( 6 );
294
295     VEC_LOAD( src, src2v_8, 5, vec_u8_t );
296     src3v_8 = vec_perm( src2v_8, src2v_8, permv );
297
298     for( y = 0; y < i_height; y++ )
299     {
300         src0v_8 = src2v_8;
301         src1v_8 = src3v_8;
302         VEC_LOAD( srcp, src2v_8, 5, vec_u8_t );
303         src3v_8 = vec_perm( src2v_8, src2v_8, permv );
304
305         dstv_16 = k32v;
306
307         DO_PROCESS( 0 );
308         DO_PROCESS( 1 );
309         DO_PROCESS( 2 );
310         DO_PROCESS( 3 );
311
312         dstv_16 = vec_sr( dstv_16, shiftv );
313         dstv_8  = vec_u16_to_u8( dstv_16 );
314         VEC_STORE4( dstv_8, dst );
315
316         dst  += i_dst_stride;
317         srcp += i_src_stride;
318     }
319 }
320
321 static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
322                                    uint8_t *src, int i_src_stride,
323                                    int mvx, int mvy,
324                                    int i_height )
325 {
326     uint8_t *srcp;
327     int y;
328     int d8x = mvx & 0x07;
329     int d8y = mvy & 0x07;
330
331     DECLARE_ALIGNED_16( uint16_t coeff[4] );
332     coeff[0] = (8-d8x)*(8-d8y);
333     coeff[1] = d8x    *(8-d8y);
334     coeff[2] = (8-d8x)*d8y;
335     coeff[3] = d8x    *d8y;
336
337     src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
338     srcp  = &src[i_src_stride];
339     
340     LOAD_ZERO;
341     PREP_LOAD;
342     PREP_STORE8;
343     vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
344     vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
345     vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16;
346     vec_u8_t    dstv_8;
347     vec_u16_t   dstv_16;
348     vec_u8_t    permv;
349     vec_u16_t   shiftv;
350     vec_u16_t   k32v;
351     
352     coeff0v = vec_ld( 0, coeff );
353     coeff3v = vec_splat( coeff0v, 3 );
354     coeff2v = vec_splat( coeff0v, 2 );
355     coeff1v = vec_splat( coeff0v, 1 );
356     coeff0v = vec_splat( coeff0v, 0 );
357     k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
358     permv   = vec_lvsl( 0, (uint8_t *) 1 );
359     shiftv  = vec_splat_u16( 6 );
360
361     VEC_LOAD( src, src2v_8, 9, vec_u8_t );
362     src3v_8 = vec_perm( src2v_8, src2v_8, permv );
363
364     for( y = 0; y < i_height; y++ )
365     {
366         src0v_8 = src2v_8;
367         src1v_8 = src3v_8;
368         VEC_LOAD( srcp, src2v_8, 9, vec_u8_t );
369         src3v_8 = vec_perm( src2v_8, src2v_8, permv );
370
371         dstv_16 = k32v;
372
373         DO_PROCESS( 0 );
374         DO_PROCESS( 1 );
375         DO_PROCESS( 2 );
376         DO_PROCESS( 3 );
377
378         dstv_16 = vec_sr( dstv_16, shiftv );
379         dstv_8  = vec_u16_to_u8( dstv_16 );
380         VEC_STORE8( dstv_8, dst );
381
382         dst  += i_dst_stride;
383         srcp += i_src_stride;
384     }
385 }
386
/* Scalar bilinear chroma interpolation for 2-pixel-wide blocks.
 * Uses the same weights and rounding as the vector kernels:
 *   ( cA*A + cB*B + cC*C + cD*D + 32 ) >> 6
 * but writes exactly two pixels per row. */
static void mc_chroma_2xh( uint8_t *dst, int i_dst_stride,
                           uint8_t *src, int i_src_stride,
                           int mvx, int mvy,
                           int i_height )
{
    uint8_t *srcp;
    int y;
    int d8x = mvx & 0x07;
    int d8y = mvy & 0x07;

    int cA = (8-d8x)*(8-d8y);
    int cB = d8x    *(8-d8y);
    int cC = (8-d8x)*d8y;
    int cD = d8x    *d8y;

    src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
    srcp  = &src[i_src_stride];

    for( y = 0; y < i_height; y++ )
    {
        dst[0] = ( cA*src[0] + cB*src[1] + cC*srcp[0] + cD*srcp[1] + 32 ) >> 6;
        dst[1] = ( cA*src[1] + cB*src[2] + cC*srcp[1] + cD*srcp[2] + 32 ) >> 6;

        src  += i_src_stride;
        srcp += i_src_stride;
        dst  += i_dst_stride;
    }
}

/* Chroma motion compensation dispatcher.
 *
 * i_width == 8 uses the 8-wide vector kernel and i_width == 2 the scalar
 * 2-wide path.  Every other width keeps using the 4-wide vector kernel, as
 * before.  Width 2 needs its own path because the 4-wide kernel stores each
 * row through VEC_STORE4 and would therefore write past the right edge of a
 * 2-pixel-wide destination block. */
static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
                               uint8_t *src, int i_src_stride,
                               int mvx, int mvy,
                               int i_width, int i_height )
{
    if( i_width == 8 )
    {
        mc_chroma_altivec_8xh( dst, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
    }
    else if( i_width == 2 )
    {
        mc_chroma_2xh( dst, i_dst_stride, src, i_src_stride,
                       mvx, mvy, i_height );
    }
    else
    {
        mc_chroma_altivec_4xh( dst, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
    }
}
403
/* Unnormalised 6-tap filter on six vectors of 16-bit lanes.
 * With a = outer_l + outer_r, b = mid_l + mid_r, c = inner_l + inner_r,
 * the first argument receives a - 5*b + 20*c.  The second and third
 * arguments are clobbered.  Needs twov and fourv in scope. */
#define HPEL_FILTER_1( outer_l, mid_l, inner_l, inner_r, mid_r, outer_r ) \
{                                                                   \
    outer_l = vec_add( outer_l, outer_r );  /* a */                 \
    mid_l   = vec_add( mid_l,   mid_r );    /* b */                 \
    inner_l = vec_add( inner_l, inner_r );  /* c */                 \
                                                                    \
    outer_l = vec_sub( outer_l, mid_l );    /* a-b */               \
    mid_l   = vec_sub( mid_l,   inner_l );  /* b-c */               \
    mid_l   = vec_sl(  mid_l,   twov );     /* 4*(b-c) */           \
    outer_l = vec_sub( outer_l, mid_l );    /* a-5*b+4*c */         \
    inner_l = vec_sl(  inner_l, fourv );    /* 16*c */              \
    outer_l = vec_add( outer_l, inner_l );  /* a-5*b+20*c */        \
}
417
/* 6-tap filter with the division by 16 folded in, computed with two
 * arithmetic right shifts by 2 so intermediates stay within 16 bits.
 * With a = outer_l + outer_r, b = mid_l + mid_r, c = inner_l + inner_r,
 * the first argument receives ((a-b)/4 - b + c)/4 + c, i.e. approximately
 * (a - 5*b + 20*c)/16.  The second and third arguments are overwritten
 * with b and c.  Needs twov in scope. */
#define HPEL_FILTER_2( outer_l, mid_l, inner_l, inner_r, mid_r, outer_r ) \
{                                                                   \
    outer_l = vec_add( outer_l, outer_r );  /* a */                 \
    mid_l   = vec_add( mid_l,   mid_r );    /* b */                 \
    inner_l = vec_add( inner_l, inner_r );  /* c */                 \
                                                                    \
    outer_l = vec_sub( outer_l, mid_l );    /* a-b */               \
    outer_l = vec_sra( outer_l, twov );     /* (a-b)/4 */           \
    outer_l = vec_sub( outer_l, mid_l );    /* (a-b)/4-b */         \
    outer_l = vec_add( outer_l, inner_l );  /* (a-b)/4-b+c */       \
    outer_l = vec_sra( outer_l, twov );     /* ((a-b)/4-b+c)/4 */   \
    outer_l = vec_add( outer_l, inner_l );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
}
431
/* Horizontal half-pel filter for the 16 pixels starting at column x of
 * row y.  Loads 32 source bytes starting two pixels to the left, builds the
 * six shifted tap vectors, filters the high and low 8 lanes separately,
 * rounds with (v + 16) >> 5, saturates to bytes and stores to dsth.
 * Operates entirely on variables of the enclosing function and clobbers
 * src1v..src6v, temp1v..temp6v, dest1v, dest2v and destv. */
#define HPEL_FILTER_HORIZONTAL()                            \
{                                                           \
    VEC_LOAD( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t ); \
    VEC_LOAD( &src[x+14+i_stride*y], src6v, 16, vec_u8_t ); \
                                                            \
    /* taps at offsets -1, 0, +1, +2 */                     \
    src5v = vec_sld( src1v, src6v,  4 );                    \
    src4v = vec_sld( src1v, src6v,  3 );                    \
    src3v = vec_sld( src1v, src6v,  2 );                    \
    src2v = vec_sld( src1v, src6v,  1 );                    \
    /* tap at +3; last, since it overwrites src6v */        \
    src6v = vec_sld( src1v, src6v,  5 );                    \
                                                            \
    /* high 8 pixels */                                     \
    temp6v = vec_u8_to_s16_h( src6v );                      \
    temp5v = vec_u8_to_s16_h( src5v );                      \
    temp4v = vec_u8_to_s16_h( src4v );                      \
    temp3v = vec_u8_to_s16_h( src3v );                      \
    temp2v = vec_u8_to_s16_h( src2v );                      \
    temp1v = vec_u8_to_s16_h( src1v );                      \
                                                            \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                  \
                   temp4v, temp5v, temp6v );                \
                                                            \
    dest1v = vec_sra( vec_add( temp1v, sixteenv ), fivev ); \
                                                            \
    /* low 8 pixels */                                      \
    temp6v = vec_u8_to_s16_l( src6v );                      \
    temp5v = vec_u8_to_s16_l( src5v );                      \
    temp4v = vec_u8_to_s16_l( src4v );                      \
    temp3v = vec_u8_to_s16_l( src3v );                      \
    temp2v = vec_u8_to_s16_l( src2v );                      \
    temp1v = vec_u8_to_s16_l( src1v );                      \
                                                            \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                  \
                   temp4v, temp5v, temp6v );                \
                                                            \
    dest2v = vec_sra( vec_add( temp1v, sixteenv ), fivev ); \
                                                            \
    destv = vec_packsu( dest1v, dest2v );                   \
                                                            \
    VEC_STORE16( destv, &dsth[x+i_stride*y] );              \
}
473
/* Vertical half-pel filter for the 16 pixels starting at column x of row y.
 * Loads rows y-2 .. y+3, filters the high and low 8 lanes separately,
 * rounds with (v + 16) >> 5, saturates to bytes and stores to dstv.
 * On exit temp1v (high lanes) and temp4v (low lanes) still hold the
 * unrounded filter sums; the enclosing function feeds them to the central
 * filter, so they must not be altered after the two HPEL_FILTER_1 calls. */
#define HPEL_FILTER_VERTICAL()                               \
{                                                            \
    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t ); \
    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t ); \
    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t ); \
    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t ); \
    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t ); \
    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t ); \
                                                             \
    /* high 8 pixels -> temp1v */                            \
    temp6v = vec_u8_to_s16_h( src6v );                       \
    temp5v = vec_u8_to_s16_h( src5v );                       \
    temp4v = vec_u8_to_s16_h( src4v );                       \
    temp3v = vec_u8_to_s16_h( src3v );                       \
    temp2v = vec_u8_to_s16_h( src2v );                       \
    temp1v = vec_u8_to_s16_h( src1v );                       \
                                                             \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
                   temp4v, temp5v, temp6v );                 \
                                                             \
    dest1v = vec_sra( vec_add( temp1v, sixteenv ), fivev );  \
                                                             \
    /* low 8 pixels -> temp4v */                             \
    temp9v = vec_u8_to_s16_l( src6v );                       \
    temp8v = vec_u8_to_s16_l( src5v );                       \
    temp7v = vec_u8_to_s16_l( src4v );                       \
    temp6v = vec_u8_to_s16_l( src3v );                       \
    temp5v = vec_u8_to_s16_l( src2v );                       \
    temp4v = vec_u8_to_s16_l( src1v );                       \
                                                             \
    HPEL_FILTER_1( temp4v, temp5v, temp6v,                   \
                   temp7v, temp8v, temp9v );                 \
                                                             \
    dest2v = vec_sra( vec_add( temp4v, sixteenv ), fivev );  \
                                                             \
    destv = vec_packsu( dest1v, dest2v );                    \
                                                             \
    VEC_STORE16( destv, &dstv[x+i_stride*y] );               \
}
513
/* Central (horizontal-of-vertical) half-pel filter.
 * tempav..tempdv are four consecutive vectors of eight 16-bit intermediate
 * values.  The 6-tap filter is applied across them at lane offsets -2 .. +3
 * for the 16 outputs aligned with tempbv and tempcv, rounded with
 * (v + 32) >> 6 after HPEL_FILTER_2's built-in division by 16, saturated to
 * bytes and stored to dstc at column x-16 of row y. */
#define HPEL_FILTER_CENTRAL()                     \
{                                                 \
    /* first 8 outputs: centred on tempbv */      \
    temp6v = vec_sld( tempbv, tempcv,  6 );       \
    temp5v = vec_sld( tempbv, tempcv,  4 );       \
    temp4v = vec_sld( tempbv, tempcv,  2 );       \
    temp3v = tempbv;                              \
    temp2v = vec_sld( tempav, tempbv, 14 );       \
    temp1v = vec_sld( tempav, tempbv, 12 );       \
                                                  \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,        \
                   temp4v, temp5v, temp6v );      \
                                                  \
    dest1v = vec_sra( vec_add( temp1v, thirtytwov ), sixv ); \
                                                  \
    /* next 8 outputs: centred on tempcv */       \
    temp6v = vec_sld( tempcv, tempdv,  6 );       \
    temp5v = vec_sld( tempcv, tempdv,  4 );       \
    temp4v = vec_sld( tempcv, tempdv,  2 );       \
    temp3v = tempcv;                              \
    temp2v = vec_sld( tempbv, tempcv, 14 );       \
    temp1v = vec_sld( tempbv, tempcv, 12 );       \
                                                  \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,        \
                   temp4v, temp5v, temp6v );      \
                                                  \
    dest2v = vec_sra( vec_add( temp1v, thirtytwov ), sixv ); \
                                                  \
    destv = vec_packsu( dest1v, dest2v );         \
                                                  \
    VEC_STORE16( destv, &dstc[x-16+i_stride*y] ); \
}
546
547 void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
548                                int i_stride, int i_width, int i_height )
549 {
550     int x, y;
551
552     vec_u8_t destv;
553     vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
554     vec_s16_t dest1v, dest2v;
555     vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
556     vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
557
558     PREP_LOAD;
559     PREP_STORE16;
560     LOAD_ZERO;
561
562     vec_u16_t twov, fourv, fivev, sixv;
563     vec_s16_t sixteenv, thirtytwov;
564     vect_ushort_u temp_u;
565
566     temp_u.s[0]=2;
567     twov = vec_splat( temp_u.v, 0 );
568     temp_u.s[0]=4;
569     fourv = vec_splat( temp_u.v, 0 );
570     temp_u.s[0]=5;
571     fivev = vec_splat( temp_u.v, 0 );
572     temp_u.s[0]=6;
573     sixv = vec_splat( temp_u.v, 0 );
574     temp_u.s[0]=16;
575     sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
576     temp_u.s[0]=32;
577     thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );
578
579     for( y = 0; y < i_height; y++ )
580     {
581         x = 0;
582
583         /* horizontal_filter */
584         HPEL_FILTER_HORIZONTAL();
585
586         /* vertical_filter */
587         HPEL_FILTER_VERTICAL();
588
589         /* central_filter */
590         tempav = tempcv;
591         tempbv = tempdv;
592         tempcv = vec_splat( temp1v, 0 ); /* first only */
593         tempdv = temp1v;
594         tempev = temp4v;
595
596         for( x = 16; x < i_width; x+=16 )
597         {
598             /* horizontal_filter */
599             HPEL_FILTER_HORIZONTAL();
600
601             /* vertical_filter */
602             HPEL_FILTER_VERTICAL();
603
604             /* central_filter */
605             tempav = tempcv;
606             tempbv = tempdv;
607             tempcv = tempev;
608             tempdv = temp1v;
609             tempev = temp4v;
610
611             HPEL_FILTER_CENTRAL();
612         }
613
614         /* Partial vertical filter */
615         VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t );
616         VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t );
617         VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t );
618         VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t );
619         VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t );
620         VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t );
621
622         temp1v = vec_u8_to_s16_h( src1v );
623         temp2v = vec_u8_to_s16_h( src2v );
624         temp3v = vec_u8_to_s16_h( src3v );
625         temp4v = vec_u8_to_s16_h( src4v );
626         temp5v = vec_u8_to_s16_h( src5v );
627         temp6v = vec_u8_to_s16_h( src6v );
628
629         HPEL_FILTER_1( temp1v, temp2v, temp3v,
630                       temp4v, temp5v, temp6v );
631
632         /* central_filter */
633         tempav = tempcv;
634         tempbv = tempdv;
635         tempcv = tempev;
636         tempdv = temp1v;
637         /* tempev is not used */
638
639         HPEL_FILTER_CENTRAL();
640     }
641 }
642
643 void x264_mc_altivec_init( x264_mc_functions_t *pf )
644 {
645     pf->mc_luma   = mc_luma_altivec;
646     pf->get_ref   = get_ref_altivec;
647     pf->mc_chroma = mc_chroma_altivec;
648
649     pf->hpel_filter = x264_hpel_filter_altivec;
650 }