/* common/ppc/mc.c — AltiVec (PowerPC) motion compensation for x264. */
1 /*****************************************************************************
2  * mc.c: h264 encoder library (Motion Compensation)
3  *****************************************************************************
4  * Copyright (C) 2003-2008 x264 project
5  *
6  * Authors: Eric Petit <eric.petit@lapsus.org>
7  *          Guillaume Poirier <gpoirier@mplayerhq.hu>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *****************************************************************************/
23
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <stdint.h>
28 #include <stdarg.h>
29
30 #include "x264.h"
31 #include "common/common.h"
32 #include "common/mc.h"
33 #include "mc.h"
34 #include "ppccommon.h"
35
/* Generic motion-compensation function pointer: processes an
 * i_height-tall block from src (stride i_src) into dst (stride i_dst).
 * NOTE(review): not referenced in the visible part of this file. */
typedef void (*pf_mc_t)( uint8_t *src, int i_src,
                         uint8_t *dst, int i_dst, int i_height );
38
39
/* For each quarter-pel position (qpel_idx = ((mvy&3)<<2) + (mvx&3)),
 * indices into the src[4] plane array selecting the two half-pel
 * planes whose average gives the quarter-pel prediction.  hpel_ref0
 * is the first plane, hpel_ref1 the second (used only when
 * qpel_idx & 5, i.e. when interpolation is actually needed). */
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
42
43
/* 6-tap half-pel filter (taps 1,-5,20,20,-5,1) along a direction given
 * by the sample step i_pix_next.  Returns the unscaled, unclipped sum. */
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
    const int outer  = pix[-2*i_pix_next] + pix[ 3*i_pix_next];
    const int inner  = pix[-1*i_pix_next] + pix[ 2*i_pix_next];
    const int center = pix[0]             + pix[ 1*i_pix_next];
    return outer - 5*inner + 20*center;
}
/* Unit-stride specialization of the 6-tap filter above. */
static inline int x264_tapfilter1( uint8_t *pix )
{
    const int outer  = pix[-2] + pix[ 3];
    const int inner  = pix[-1] + pix[ 2];
    const int center = pix[ 0] + pix[ 1];
    return outer - 5*inner + 20*center;
}
55
56
/* Rounded average of two 4-wide blocks: dst = (src1 + src2 + 1) >> 1.
 * Plain C — a 4-byte row is too narrow to benefit from AltiVec.
 * Both sources advance by the same stride i_src1. */
static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst,  int i_dst,
                                               uint8_t *src1, int i_src1,
                                               uint8_t *src2, int i_height )
{
    int line;
    for( line = 0; line < i_height; line++ )
    {
        dst[0] = ( src1[0] + src2[0] + 1 ) >> 1;
        dst[1] = ( src1[1] + src2[1] + 1 ) >> 1;
        dst[2] = ( src1[2] + src2[2] + 1 ) >> 1;
        dst[3] = ( src1[3] + src2[3] + 1 ) >> 1;
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
73
/* Rounded average of two 8-wide blocks using AltiVec:
 * dst = (src1 + src2 + 1) >> 1 per byte (vec_avg rounds up).
 * Both sources advance by the same stride i_src1. */
static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst,  int i_dst,
                                               uint8_t *src1, int i_src1,
                                               uint8_t *src2, int i_height )
{
    int y;
    vec_u8_t src1v, src2v;
    PREP_LOAD;             /* ppccommon.h unaligned load/store helpers */
    PREP_STORE8;
    PREP_LOAD_SRC( src1 );
    PREP_LOAD_SRC( src2 );

    for( y = 0; y < i_height; y++ )
    {
        VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
        VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
        src1v = vec_avg( src1v, src2v ); /* per-byte rounded average */
        VEC_STORE8( src1v, dst );        /* only 8 bytes written */

        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
97
/* Rounded average of two 16-wide blocks using AltiVec.
 * NOTE(review): vec_st stores 16 aligned bytes, so dst and i_dst are
 * assumed 16-byte aligned — confirm at call sites. */
static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst,  int i_dst,
                                                uint8_t *src1, int i_src1,
                                                uint8_t *src2, int i_height )
{
    int y;
    vec_u8_t src1v, src2v;
    PREP_LOAD;
    PREP_LOAD_SRC( src1 );
    PREP_LOAD_SRC( src2 );

    for( y = 0; y < i_height; y++ )
    {
        VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
        VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
        src1v = vec_avg( src1v, src2v ); /* per-byte rounded average */
        vec_st(src1v, 0, dst);           /* aligned 16-byte store */

        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
120
/* Rounded average of two 20-wide blocks: a scalar 4-wide tail next to
 * a vectorized 16-wide body.  The two regions are disjoint, so the
 * order of the calls does not matter. */
static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst,  int i_dst,
                                                uint8_t *src1, int i_src1,
                                                uint8_t *src2, int i_height )
{
    x264_pixel_avg2_w4_altivec( dst + 16, i_dst, src1 + 16, i_src1, src2 + 16, i_height );
    x264_pixel_avg2_w16_altivec( dst, i_dst, src1, i_src1, src2, i_height );
}
128
129 /* mc_copy: plain c */
130
/* Generate a plain-C block copy: i_height rows of `a` bytes each,
 * from src (stride i_src) into dst (stride i_dst). */
#define MC_COPY( name, a )                                \
static void name( uint8_t *dst, int i_dst,                \
                  uint8_t *src, int i_src, int i_height ) \
{                                                         \
    int line;                                             \
    for( line = i_height; line > 0; line-- )              \
    {                                                     \
        memcpy( dst, src, a );                            \
        dst += i_dst;                                     \
        src += i_src;                                     \
    }                                                     \
}
MC_COPY( x264_mc_copy_w4_altivec,  4  )
MC_COPY( x264_mc_copy_w8_altivec,  8  )
145
/* 16-wide block copy using AltiVec: one unaligned vector load and one
 * aligned vector store per row.
 * NOTE(review): vec_st assumes dst/i_dst are 16-byte aligned — confirm
 * at call sites. */
static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,
                                      uint8_t *src, int i_src, int i_height )
{
    int y;
    vec_u8_t cpyV;
    PREP_LOAD;
    PREP_LOAD_SRC( src );

    for( y = 0; y < i_height; y++)
    {
        VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
        vec_st(cpyV, 0, dst);

        src += i_src;
        dst += i_dst;
    }
}
163
164
165 static void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
166                              uint8_t *src[4], int i_src_stride,
167                              int mvx, int mvy,
168                              int i_width, int i_height )
169 {
170     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
171     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
172     uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
173     if( qpel_idx & 5 ) /* qpel interpolation needed */
174     {
175         uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
176
177         switch(i_width) {
178         case 4:
179             x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
180             break;
181         case 8:
182             x264_pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
183             break;
184         case 16:
185         default:
186             x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
187         }
188
189     }
190     else
191     {
192         switch(i_width) {
193         case 4:
194             x264_mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
195             break;
196         case 8:
197             x264_mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
198             break;
199         case 16:
200             x264_mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
201             break;
202         }
203     }
204 }
205
206
207
208 static uint8_t *get_ref_altivec( uint8_t *dst,   int *i_dst_stride,
209                                  uint8_t *src[4], int i_src_stride,
210                                  int mvx, int mvy,
211                                  int i_width, int i_height )
212 {
213     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
214     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
215     uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
216     if( qpel_idx & 5 ) /* qpel interpolation needed */
217     {
218         uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
219         switch(i_width) {
220         case 4:
221             x264_pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
222             break;
223         case 8:
224             x264_pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
225             break;
226         case 12:
227         case 16:
228         default:
229             x264_pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
230             break;
231         case 20:
232             x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
233             break;
234         }
235         return dst;
236     }
237     else
238     {
239         *i_dst_stride = i_src_stride;
240         return src1;
241     }
242 }
243
/* Widen the 8-bit source vector src<a>v_8 to 16 bits, multiply it by
 * the broadcast bilinear weight coeff<a>v, and accumulate into dstv_16. */
#define DO_PROCESS(a) \
        src##a##v_16 = vec_u8_to_u16( src##a##v_8 ); \
        src##a##v_16 = vec_mladd( coeff##a##v, src##a##v_16, zero_u16v ); \
        dstv_16      = vec_add( dstv_16, src##a##v_16 )
248
/* 4-wide chroma MC: bilinear interpolation between the four
 * neighbouring full-pel samples with 1/8-pel weights taken from the
 * low 3 bits of the MV; dst = (weighted sum + 32) >> 6. */
static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
                                   uint8_t *src, int i_src_stride,
                                   int mvx, int mvy,
                                   int i_height )
{
    uint8_t *srcp;
    int y;
    int d8x = mvx & 0x07; /* 1/8-pel fractional MV components */
    int d8y = mvy & 0x07;

    /* Bilinear weights; the four always sum to 64. */
    DECLARE_ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x    *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x    *d8y;

    /* Integer MV part selects the top-left source sample. */
    src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
    srcp  = &src[i_src_stride]; /* row below the current one */

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    PREP_STORE4;
    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
    vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16;
    vec_u8_t    dstv_8;
    vec_u16_t   dstv_16;
    vec_u8_t    permv;
    vec_u16_t   shiftv;
    vec_u16_t   k32v;

    /* Broadcast each weight across its own vector. */
    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); /* rounding bias 32 */
    permv   = vec_lvsl( 0, (uint8_t *) 1 );  /* permute = shift left one byte (x+1) */
    shiftv  = vec_splat_u16( 6 );            /* >> 6 == / 64 */

    /* Prime the pipeline: current row and its one-byte-shifted copy. */
    VEC_LOAD( src, src2v_8, 5, vec_u8_t, src );
    src3v_8 = vec_perm( src2v_8, src2v_8, permv );

    for( y = 0; y < i_height; y++ )
    {
        /* Reuse last iteration's bottom row as this iteration's top row. */
        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        VEC_LOAD_G( srcp, src2v_8, 5, vec_u8_t );
        src3v_8 = vec_perm( src2v_8, src2v_8, permv );

        dstv_16 = k32v; /* start accumulator at the rounding bias */

        DO_PROCESS( 0 ); /* + coeff0 * src[x,   y]   */
        DO_PROCESS( 1 ); /* + coeff1 * src[x+1, y]   */
        DO_PROCESS( 2 ); /* + coeff2 * src[x,   y+1] */
        DO_PROCESS( 3 ); /* + coeff3 * src[x+1, y+1] */

        dstv_16 = vec_sr( dstv_16, shiftv );
        dstv_8  = vec_u16_to_u8( dstv_16 );
        VEC_STORE4( dstv_8, dst ); /* only 4 bytes written */

        dst  += i_dst_stride;
        srcp += i_src_stride;
    }
}
315
/* 8-wide chroma MC: same bilinear scheme as the 4xh version, but
 * loading 9 source bytes per row and storing 8 output bytes. */
static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
                                   uint8_t *src, int i_src_stride,
                                   int mvx, int mvy,
                                   int i_height )
{
    uint8_t *srcp;
    int y;
    int d8x = mvx & 0x07; /* 1/8-pel fractional MV components */
    int d8y = mvy & 0x07;

    /* Bilinear weights; the four always sum to 64. */
    DECLARE_ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x    *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x    *d8y;

    /* Integer MV part selects the top-left source sample. */
    src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
    srcp  = &src[i_src_stride]; /* row below the current one */

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    PREP_STORE8;
    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
    vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16;
    vec_u8_t    dstv_8;
    vec_u16_t   dstv_16;
    vec_u8_t    permv;
    vec_u16_t   shiftv;
    vec_u16_t   k32v;

    /* Broadcast each weight across its own vector. */
    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); /* rounding bias 32 */
    permv   = vec_lvsl( 0, (uint8_t *) 1 );  /* permute = shift left one byte (x+1) */
    shiftv  = vec_splat_u16( 6 );            /* >> 6 == / 64 */

    /* Prime the pipeline: current row and its one-byte-shifted copy. */
    VEC_LOAD( src, src2v_8, 9, vec_u8_t, src);
    src3v_8 = vec_perm( src2v_8, src2v_8, permv );

    for( y = 0; y < i_height; y++ )
    {
        /* Reuse last iteration's bottom row as this iteration's top row. */
        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        VEC_LOAD_G( srcp, src2v_8, 9, vec_u8_t );
        src3v_8 = vec_perm( src2v_8, src2v_8, permv );

        dstv_16 = k32v; /* start accumulator at the rounding bias */

        DO_PROCESS( 0 ); /* + coeff0 * src[x,   y]   */
        DO_PROCESS( 1 ); /* + coeff1 * src[x+1, y]   */
        DO_PROCESS( 2 ); /* + coeff2 * src[x,   y+1] */
        DO_PROCESS( 3 ); /* + coeff3 * src[x+1, y+1] */

        dstv_16 = vec_sr( dstv_16, shiftv );
        dstv_8  = vec_u16_to_u8( dstv_16 );
        VEC_STORE8( dstv_8, dst ); /* only 8 bytes written */

        dst  += i_dst_stride;
        srcp += i_src_stride;
    }
}
382
/* Chroma MC dispatcher: width 8 takes the 8-wide path, everything
 * else (i.e. width 4, as before) takes the 4-wide path. */
static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
                               uint8_t *src, int i_src_stride,
                               int mvx, int mvy,
                               int i_width, int i_height )
{
    if( i_width != 8 )
        mc_chroma_altivec_4xh( dst, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
    else
        mc_chroma_altivec_8xh( dst, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
}
399
/* 6-tap filter core on 16-bit lanes (taps 1,-5,20,20,-5,1):
 * result in t1v = a - 5*b + 20*c, where a = t1+t6, b = t2+t5,
 * c = t3+t4.  t2v and t3v are clobbered.  Uses twov/fourv from the
 * caller's scope as shift amounts. */
#define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \
{                                                     \
    t1v = vec_add( t1v, t6v );  /* a = t1+t6 */       \
    t2v = vec_add( t2v, t5v );  /* b = t2+t5 */       \
    t3v = vec_add( t3v, t4v );  /* c = t3+t4 */       \
                                                      \
    t1v = vec_sub( t1v, t2v );   /* (a-b) */          \
    t2v = vec_sub( t2v, t3v );   /* (b-c) */          \
    t2v = vec_sl(  t2v, twov );  /* (b-c)*4 */        \
    t1v = vec_sub( t1v, t2v );   /* a-5*b+4*c */      \
    t3v = vec_sl(  t3v, fourv ); /* 16*c */           \
    t1v = vec_add( t1v, t3v );   /* a-5*b+20*c */     \
}
413
/* Same 6-tap filter but pre-scaled by ~1/16 via arithmetic shifts, so
 * the intermediates of the 2-D central filter stay within 16 bits.
 * Result in t1v; t2v/t3v clobbered. */
#define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) \
{                                                     \
    t1v = vec_add( t1v, t6v );                        \
    t2v = vec_add( t2v, t5v );                        \
    t3v = vec_add( t3v, t4v );                        \
                                                      \
    t1v = vec_sub( t1v, t2v );  /* (a-b) */           \
    t1v = vec_sra( t1v, twov ); /* (a-b)/4 */         \
    t1v = vec_sub( t1v, t2v );  /* (a-b)/4-b */       \
    t1v = vec_add( t1v, t3v );  /* (a-b)/4-b+c */     \
    t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ \
    t1v = vec_add( t1v, t3v );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
}
427
/* Horizontal half-pel filter for 16 output pixels at (x..x+15, y):
 * dsth[x] = clip( (6-tap(src[x-2..x+3]) + 16) >> 5 ).  Loads 21 source
 * bytes, derives the six tap vectors with vec_sld, filters the high
 * and low 8 lanes in 16-bit precision, then packs with saturation. */
#define HPEL_FILTER_HORIZONTAL()                             \
{                                                            \
    VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
    VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
                                                             \
    /* taps at offsets -1..+3 relative to src1v's -2 start */\
    src2v = vec_sld( src1v, src6v,  1 );                     \
    src3v = vec_sld( src1v, src6v,  2 );                     \
    src4v = vec_sld( src1v, src6v,  3 );                     \
    src5v = vec_sld( src1v, src6v,  4 );                     \
    src6v = vec_sld( src1v, src6v,  5 );                     \
                                                             \
    /* high 8 lanes, widened to s16 */                       \
    temp1v = vec_u8_to_s16_h( src1v );                       \
    temp2v = vec_u8_to_s16_h( src2v );                       \
    temp3v = vec_u8_to_s16_h( src3v );                       \
    temp4v = vec_u8_to_s16_h( src4v );                       \
    temp5v = vec_u8_to_s16_h( src5v );                       \
    temp6v = vec_u8_to_s16_h( src6v );                       \
                                                             \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
                   temp4v, temp5v, temp6v );                 \
                                                             \
    dest1v = vec_add( temp1v, sixteenv );                    \
    dest1v = vec_sra( dest1v, fivev );   /* (+16)>>5 */      \
                                                             \
    /* low 8 lanes */                                        \
    temp1v = vec_u8_to_s16_l( src1v );                       \
    temp2v = vec_u8_to_s16_l( src2v );                       \
    temp3v = vec_u8_to_s16_l( src3v );                       \
    temp4v = vec_u8_to_s16_l( src4v );                       \
    temp5v = vec_u8_to_s16_l( src5v );                       \
    temp6v = vec_u8_to_s16_l( src6v );                       \
                                                             \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
                   temp4v, temp5v, temp6v );                 \
                                                             \
    dest2v = vec_add( temp1v, sixteenv );                    \
    dest2v = vec_sra( dest2v, fivev );                       \
                                                             \
    destv = vec_packsu( dest1v, dest2v ); /* saturating pack */ \
                                                             \
    VEC_STORE16( destv, &dsth[x+i_stride*y], dsth );         \
}
469
/* Vertical half-pel filter for 16 output pixels at (x, y..):
 * dstv[x] = clip( (6-tap over rows y-2..y+3 + 16) >> 5 ).
 * The unscaled 16-bit tap sums are deliberately left in temp1v (high
 * lanes) and temp4v (low lanes): the caller saves them as input to
 * the central (HV) filter. */
#define HPEL_FILTER_VERTICAL()                                    \
{                                                                 \
    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
                                                                  \
    /* high 8 lanes */                                            \
    temp1v = vec_u8_to_s16_h( src1v );                            \
    temp2v = vec_u8_to_s16_h( src2v );                            \
    temp3v = vec_u8_to_s16_h( src3v );                            \
    temp4v = vec_u8_to_s16_h( src4v );                            \
    temp5v = vec_u8_to_s16_h( src5v );                            \
    temp6v = vec_u8_to_s16_h( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                        \
                   temp4v, temp5v, temp6v );                      \
                                                                  \
    dest1v = vec_add( temp1v, sixteenv );                         \
    dest1v = vec_sra( dest1v, fivev );   /* (+16)>>5 */           \
                                                                  \
    /* low 8 lanes, in temp4v..temp9v so temp1v survives */       \
    temp4v = vec_u8_to_s16_l( src1v );                            \
    temp5v = vec_u8_to_s16_l( src2v );                            \
    temp6v = vec_u8_to_s16_l( src3v );                            \
    temp7v = vec_u8_to_s16_l( src4v );                            \
    temp8v = vec_u8_to_s16_l( src5v );                            \
    temp9v = vec_u8_to_s16_l( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp4v, temp5v, temp6v,                        \
                   temp7v, temp8v, temp9v );                      \
                                                                  \
    dest2v = vec_add( temp4v, sixteenv );                         \
    dest2v = vec_sra( dest2v, fivev );                            \
                                                                  \
    destv = vec_packsu( dest1v, dest2v );                         \
                                                                  \
    /* store constants prepared for dsth are reused for dstv */   \
    VEC_STORE16( destv, &dstv[x+i_stride*y], dsth );              \
}
509
/* Central (HV) half-pel filter: applies the horizontal 6-tap to the
 * vertically-filtered 16-bit intermediates held in tempav..tempdv
 * (8 lanes each, forming a sliding window), with HPEL_FILTER_2's /16
 * pre-scale keeping the math in 16 bits.  Output (val + 32) >> 6,
 * stored one vector behind the current position at dstc[x-16]. */
#define HPEL_FILTER_CENTRAL()                           \
{                                                       \
    /* taps -2..+3 around tempbv via 16-bit shifts */   \
    temp1v = vec_sld( tempav, tempbv, 12 );             \
    temp2v = vec_sld( tempav, tempbv, 14 );             \
    temp3v = tempbv;                                    \
    temp4v = vec_sld( tempbv, tempcv,  2 );             \
    temp5v = vec_sld( tempbv, tempcv,  4 );             \
    temp6v = vec_sld( tempbv, tempcv,  6 );             \
                                                        \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                   temp4v, temp5v, temp6v );            \
                                                        \
    dest1v = vec_add( temp1v, thirtytwov );             \
    dest1v = vec_sra( dest1v, sixv );   /* (+32)>>6 */  \
                                                        \
    /* same again, one vector (8 lanes) further right */\
    temp1v = vec_sld( tempbv, tempcv, 12 );             \
    temp2v = vec_sld( tempbv, tempcv, 14 );             \
    temp3v = tempcv;                                    \
    temp4v = vec_sld( tempcv, tempdv,  2 );             \
    temp5v = vec_sld( tempcv, tempdv,  4 );             \
    temp6v = vec_sld( tempcv, tempdv,  6 );             \
                                                        \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                   temp4v, temp5v, temp6v );            \
                                                        \
    dest2v = vec_add( temp1v, thirtytwov );             \
    dest2v = vec_sra( dest2v, sixv );                   \
                                                        \
    destv = vec_packsu( dest1v, dest2v );               \
                                                        \
    VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
}
542
/* Compute the three half-pel interpolated planes of src in one pass:
 * dsth (horizontal), dstv (vertical) and dstc (central/HV).  The
 * central filter consumes the vertical filter's 16-bit intermediates,
 * so it runs one 16-pixel vector behind the H/V filters, with
 * tempav..tempev acting as a sliding window of those intermediates.
 * NOTE(review): assumes i_width is handled in 16-pixel steps and that
 * src has the usual row padding for the x-2..x+19 / y-2..y+3 accesses
 * — confirm against the caller's frame allocation. */
void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                               int i_stride, int i_width, int i_height )
{
    int x, y;

    vec_u8_t destv;
    vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
    vec_s16_t dest1v, dest2v;
    vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
    vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;

    PREP_LOAD;
    PREP_LOAD_SRC( src);
    PREP_STORE16;
    PREP_STORE16_DST( dsth );
    LOAD_ZERO;

    /* Splatted shift amounts and rounding constants used by the
     * HPEL_FILTER_* macros above. */
    vec_u16_t twov, fourv, fivev, sixv;
    vec_s16_t sixteenv, thirtytwov;
    vect_ushort_u temp_u;

    temp_u.s[0]=2;
    twov = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=4;
    fourv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=5;
    fivev = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=6;
    sixv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=16;
    sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
    temp_u.s[0]=32;
    thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );

    for( y = 0; y < i_height; y++ )
    {
        x = 0;

        /* Warm-up column: H and V filters for x=0..15; no central
         * output yet (it lags by one vector). */

        /* horizontal_filter */
        HPEL_FILTER_HORIZONTAL();

        /* vertical_filter */
        HPEL_FILTER_VERTICAL();

        /* central_filter: seed the sliding window.  The left edge is
         * replicated via vec_splat of the first vertical lane; the
         * tempav/tempbv values assigned here are overwritten before
         * their first use inside the x loop. */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = vec_splat( temp1v, 0 ); /* first only */
        tempdv = temp1v;   /* vertical intermediates, high 8 lanes */
        tempev = temp4v;   /* vertical intermediates, low 8 lanes  */

        for( x = 16; x < i_width; x+=16 )
        {
            /* horizontal_filter */
            HPEL_FILTER_HORIZONTAL();

            /* vertical_filter */
            HPEL_FILTER_VERTICAL();

            /* central_filter: slide the window one vector and emit
             * the central output for the previous column. */
            tempav = tempcv;
            tempbv = tempdv;
            tempcv = tempev;
            tempdv = temp1v;
            tempev = temp4v;

            HPEL_FILTER_CENTRAL();
        }

        /* Partial vertical filter: one extra column of vertical
         * intermediates past the row end, needed as right-hand taps
         * for the final central output. */
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );

        temp1v = vec_u8_to_s16_h( src1v );
        temp2v = vec_u8_to_s16_h( src2v );
        temp3v = vec_u8_to_s16_h( src3v );
        temp4v = vec_u8_to_s16_h( src4v );
        temp5v = vec_u8_to_s16_h( src5v );
        temp6v = vec_u8_to_s16_h( src6v );

        HPEL_FILTER_1( temp1v, temp2v, temp3v,
                       temp4v, temp5v, temp6v );

        /* central_filter: final slide and last central output of the row */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = tempev;
        tempdv = temp1v;
        /* tempev is not used */

        HPEL_FILTER_CENTRAL();
    }
}
640
641 void x264_mc_altivec_init( x264_mc_functions_t *pf )
642 {
643     pf->mc_luma   = mc_luma_altivec;
644     pf->get_ref   = get_ref_altivec;
645     pf->mc_chroma = mc_chroma_altivec;
646
647     pf->hpel_filter = x264_hpel_filter_altivec;
648 }