1 /*****************************************************************************
2 * mc.c: h264 encoder library (Motion Compensation)
3 *****************************************************************************
4 * Copyright (C) 2003 Laurent Aimar
5 * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $
7 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 *****************************************************************************/
34 #include "x264.h" /* DECLARE_ALIGNED */
35 #include "common/pixel.h"
36 #include "common/mc.h"
37 #include "common/clip1.h"
41 extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
42 extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
43 extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
44 extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
45 extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
46 extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
47 extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
48 extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int );
49 extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int );
50 extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int );
51 extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
54 static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
56 x264_pixel_avg_w ## W ## _mmxext( dst, i_dst, dst, i_dst, src, i_src, H ); \
/* Instantiate a WxH weighted-average wrapper around the asm primitive
 * x264_pixel_avg_weight_wW_mmxext (declared above).  The wrapper fixes the
 * block height H; presumably used for W=8/16 since the 4x4 asm variant
 * declared above takes no height argument — TODO confirm at the call site.
 * NOTE(review): the brace lines were missing from the extracted listing
 * and are restored here. */
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
{ \
    x264_pixel_avg_weight_w ## W ## _mmxext( dst, i_dst, src, i_src, i_weight_dst, H ); \
}
/* Define a file-scope 64-bit constant whose assembly symbol equals its C
 * name, so the inline asm below can reference it directly (MMX_INIT uses
 * the bare name).  GCC >= 3.3 needs __attribute__((used)) to keep a symbol
 * that is only referenced from asm; older GCC lacks "used", so "unused" is
 * substituted there merely to silence the unused-variable warning.
 * NOTE(review): the #else/#endif lines were missing from the extracted
 * listing and are restored here. */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_UINT64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((used))
#else
#define USED_UINT64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((unused))
#endif

/* 0x0010 in each 16-bit lane: the +16 rounding bias added before the >>5
 * shift of the half-pel filter (see MMX_FILTERTAP_P3). */
USED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL;
/* --- Basic MMX register helpers (GCC extended asm) --- */
/* MMZ = 0 (pxor reg,reg). */
91 #define MMX_ZERO( MMZ ) \
92 asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
/* Load the 64-bit memory constant NAME (e.g. x264_w0x10, referenced by
 * its asm symbol name) into register MMV. */
94 #define MMX_INIT( MMV, NAME ) \
95 asm volatile( "movq " #NAME ", " #MMV "\n" :: )
/* Saturating-pack the 4 words in MMP to bytes (pairing with MMZ) and
 * store the low 4 bytes at dst. */
97 #define MMX_SAVE_4P( MMP, MMZ, dst ) \
98 asm volatile( "packuswb " #MMZ "," #MMP "\n" \
99 "movd " #MMP ", (%0)" :: "r"(dst) )
/* Load 4 bytes from pix and zero-extend to 4 words in MMP.
 * MMZ must already be zero. */
101 #define MMX_LOAD_4P( MMP, MMZ, pix ) \
102 asm volatile( "movd (%0), " #MMP "\n" \
103 "punpcklbw " #MMZ ", " #MMP "\n" : : "r"(pix) )
/* Load four rows of 4 pixels (row stride i_pix) as words into MMP1..MMP4. */
105 #define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
106 MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
107 MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \
108 MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \
109 MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] )
/* Load two rows of 4 pixels as words into MMP1, MMP2. */
111 #define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix )\
112 MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
113 MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] )
/* Saturating-pack 8 words (low half MMP1, high half MMP2) to 8 bytes
 * and store them at dst. */
115 #define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) \
116 asm volatile( "packuswb " #MMP2 "," #MMP1 "\n" \
117 "movq " #MMP1 ", (%0)\n" :: "r"(dst) )
/* Load 8 bytes from pix and zero-extend: low 4 words -> MMP1,
 * high 4 words -> MMP2.  MMZ must already be zero. */
120 #define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) \
121 asm volatile( "movq (%0) , " #MMP1 "\n" \
122 "movq " #MMP1 ", " #MMP2 "\n" \
123 "punpcklbw " #MMZ ", " #MMP1 "\n" \
124 "punpckhbw " #MMZ ", " #MMP2 "\n" : : "r"(pix) )
/* Load two rows of 8 pixels as words: row 0 -> (MMP1,MMP2),
 * row 1 -> (MMP3,MMP4). */
126 #define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
127 MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \
128 MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] )
/* Interleave words: a = low-word interleave of (a,b), t = high-word
 * interleave of (old a, b).  Building block of the 4x4 word transpose. */
130 #define SBUTTERFLYwd(a,b,t )\
131 asm volatile( "movq " #a ", " #t " \n\t" \
132 "punpcklwd " #b ", " #a " \n\t" \
133 "punpckhwd " #b ", " #t " \n\t" :: )
/* Same as above but interleaving 32-bit dwords. */
135 #define SBUTTERFLYdq(a,b,t )\
136 asm volatile( "movq " #a ", " #t " \n\t" \
137 "punpckldq " #b ", " #a " \n\t" \
138 "punpckhdq " #b ", " #t " \n\t" :: )
/* 4x4 transpose of word rows held in MMA..MMD, using MMT as scratch.
 * Note the permuted register/result mapping described below. */
140 /* input ABCD output ADTC ( or 0?31-2->0123 ) */
141 #define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
142 SBUTTERFLYwd( MMA, MMB, MMT ); \
143 SBUTTERFLYwd( MMC, MMD, MMB ); \
144 SBUTTERFLYdq( MMA, MMC, MMD ); \
145 SBUTTERFLYdq( MMT, MMB, MMC )
/* The H.264 6-tap half-pel filter (1,-5,20,20,-5,1) is evaluated in three
 * passes on word vectors; -5*x is computed as -(x + 4x) and 20*x as
 * 4x + 16x using shifts. */
147 /* first pass MM0 = MM0 -5*MM1 */
148 #define MMX_FILTERTAP_P1( MMP0, MMP1 ) \
149 asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
150 "psllw $2, " #MMP1 "\n" \
151 "psubw " #MMP1 "," #MMP0 "\n" :: )
/* MM2 is clobbered (holds 16*(MM2+MM3) on exit). */
153 /* second pass MM0 = MM0 + 20*(MM2+MM3) */
154 #define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \
155 asm volatile( "paddw " #MMP3 "," #MMP2 "\n" \
157 "psllw $2, " #MMP2 "\n" \
158 "paddw " #MMP2 "," #MMP0 "\n" \
159 "psllw $2, " #MMP2 "\n" \
160 "paddw " #MMP2 "," #MMP0 "\n" :: )
/* Final pass: fold in the last two taps, add the rounding constant MMV
 * (x264_w0x10 = 16 per lane) and arithmetic-shift right by 5. */
162 /* last pass: MM0 = ( MM0 -5*MM1 + MM2 + MMV ) >> 5 */
163 #define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \
164 asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
165 "psllw $2, " #MMP1 "\n" \
166 "psubw " #MMP1 "," #MMP0 "\n" \
168 "paddw " #MMP2 "," #MMP0 "\n" \
169 "paddw " #MMV "," #MMP0 "\n" \
170 "psraw $5, " #MMP0 "\n" :: )
/* Dual-register variants: run pass 1 on two independent accumulators
 * (MMP0 -= 5*MMP1 and MMP2 -= 5*MMP3) for 8-wide processing. */
172 #define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) \
173 asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
174 "psubw " #MMP3 "," #MMP2 "\n" \
175 "psllw $2, " #MMP1 "\n" \
176 "psllw $2, " #MMP3 "\n" \
177 "psubw " #MMP1 "," #MMP0 "\n" \
178 "psubw " #MMP3 "," #MMP2 "\n" :: )
/* Dual pass 2: MMP0 += 20*(MMP1+MMP2) and MMP3 += 20*(MMP4+MMP5);
 * MMP1 and MMP4 are clobbered. */
180 /* second pass MM0 = MM0 + 20*(MM1+MM2) */
181 #define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \
182 asm volatile( "paddw " #MMP2 "," #MMP1 "\n" \
183 "paddw " #MMP5 "," #MMP4 "\n" \
185 "psllw $2, " #MMP1 "\n" \
186 "psllw $2, " #MMP4 "\n" \
187 "paddw " #MMP1 "," #MMP0 "\n" \
188 "paddw " #MMP4 "," #MMP3 "\n" \
189 "psllw $2, " #MMP1 "\n" \
190 "psllw $2, " #MMP4 "\n" \
191 "paddw " #MMP1 "," #MMP0 "\n" \
192 "paddw " #MMP4 "," #MMP3 "\n" :: )
/* --- Raw 8-byte row load/store helpers (no unpacking) --- */
/* Load one quadword from dst into m1. */
194 #define MMX_LOAD_1r( m1, dst ) \
195 asm volatile( "movq (%0), " #m1 "\n" :: "r"(dst) ); \
/* Store register m1 as one quadword at dst. */
197 #define MMX_SAVE_1r( m1, dst ) \
198 asm volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) ); \
/* Load two consecutive rows (stride i_dst bytes) into m1, m2. */
200 #define MMX_LOAD_2r( m1, m2, dst, i_dst ) \
201 asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
202 asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )
/* Store two rows from m1, m2. */
204 #define MMX_SAVE_2r( m1, m2, dst, i_dst ) \
205 asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
206 asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )
/* Store four rows from m1..m4. */
208 #define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) \
209 asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
210 asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
211 asm volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
212 asm volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )
/* Load four rows into m1..m4. */
214 #define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) \
215 asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
216 asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
217 asm volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
218 asm volatile( "movq (%0), " #m4 "\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )
/* H.264 6-tap half-pel filter (1,-5,20,20,-5,1) applied to six samples
 * spaced i_pix_next bytes apart around *pix (taps at offsets -2..+3).
 * Returns the raw weighted sum: no rounding, no shift, no clipping —
 * callers add +16>>5 (single pass) or +512>>10 (two passes) themselves.
 * NOTE(review): brace lines and listing residue from the extraction are
 * restored/removed here; the arithmetic is byte-for-byte the original. */
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
    return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] +
           20*(pix[0] + pix[1*i_pix_next]) -
           5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next];
}
/* Same 6-tap filter as x264_tapfilter but with a fixed stride of 1
 * (horizontal filtering).  Returns the raw sum, unrounded and unclipped.
 * NOTE(review): brace lines restored, listing residue removed. */
static inline int x264_tapfilter1( uint8_t *pix )
{
    return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3];
}
230 typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
232 /* Macro to define NxM functions */
/* Generate a quarter-pel MC function averaging the integer-pel plane at
 * horizontal offset `off` with the horizontal half-pel filter output
 * (positions xy10/xy30).  `cpu` selects the asm averaging primitive.
 * NOTE(review): the brace lines were missing from the extracted listing
 * and are restored here. */
#define MC_IH( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \
    \
    mc_hh_w##width( src, i_src_stride, tmp, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     src+(off), i_src_stride, \
                                     tmp, width, i_height ); \
}
/* Generate a quarter-pel MC function averaging the integer-pel plane at
 * vertical offset `off` with the vertical half-pel filter output
 * (positions xy01/xy03).
 * NOTE(review): the brace lines were missing from the extracted listing
 * and are restored here. */
#define MC_IV( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \
    \
    mc_hv_w##width( src, i_src_stride, tmp, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     src+(off), i_src_stride, \
                                     tmp, width, i_height ); \
}
/* Generate a quarter-pel MC function averaging a vertical half-pel plane
 * (source offset off1) with a horizontal half-pel plane (source offset
 * off2): positions xy11/xy31/xy13/xy33.
 * NOTE(review): the brace lines and the trailing `i_height );` argument
 * line were missing from the extracted listing and are restored here. */
#define MC_HV( name, cpu, width, height, off1, off2 ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
    DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
    \
    mc_hv_w##width( src+(off1), i_src_stride, tmp1, width, i_height ); \
    mc_hh_w##width( src+(off2), i_src_stride, tmp2, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     tmp1, width, tmp2, width, \
                                     i_height ); \
}
/* Generate a quarter-pel MC function averaging the center (diagonal)
 * half-pel plane with a horizontal half-pel plane at vertical offset
 * `off`: positions xy21/xy23.
 * NOTE(review): the brace lines and the trailing `i_height );` argument
 * line were missing from the extracted listing and are restored here. */
#define MC_CH( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
    DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
    \
    mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \
    mc_hh_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     tmp1, width, tmp2, width, \
                                     i_height ); \
}
/* Generate a quarter-pel MC function averaging the center (diagonal)
 * half-pel plane with a vertical half-pel plane at horizontal offset
 * `off`: positions xy12/xy32.
 * NOTE(review): the brace lines and the trailing `i_height );` argument
 * line were missing from the extracted listing and are restored here. */
#define MC_CV( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
    DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
    \
    mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \
    mc_hv_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     tmp1, width, tmp2, width, \
                                     i_height ); \
}
300 /*****************************************************************************
301 * MC with width == 4 (height <= 8)
302 *****************************************************************************/
/* Horizontal half-pel 6-tap filter for a 4-wide column, 4 output rows per
 * iteration.  The 4x12 source tile is transposed into a scratch buffer so
 * the horizontal filter can run as a word-wise vertical filter, then the
 * results are transposed back and packed.  %mm6 holds x264_w0x10 (the +16
 * rounding bias before >>5); %mm7 is used as the zero register for the
 * byte->word unpacks — presumably zeroed by an MMX_ZERO not visible in
 * this listing, TODO confirm.
 * NOTE(review): this extracted listing is missing lines — the y/i loop
 * variable and srct/tmp scratch declarations, braces, and the per-
 * iteration src/dst advance — verify against the original file. */
304 static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
306 const int h4 = i_height / 4;
314 MMX_INIT( %%mm6, x264_w0x10 );
316 for( y = 0; y < h4; y++ )
320 /* Preload data and transpose them */
321 MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src );
322 MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
323 MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 );
325 MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src );
326 MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
327 MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 );
329 /* we read 2 more bytes that needed */
330 MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src );
331 MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
332 MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 );
/* Filter the transposed columns: for each of the 4 output rows combine
 * 6 consecutive transposed rows with weights (1,-5,20,20,-5,1), add the
 * +16 bias from %mm6 and shift right by 5. */
335 for( i = 0; i < 4; i++ )
337 MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 );
338 MMX_FILTERTAP_P1( %%mm0, %%mm1 );
339 MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );
341 MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 );
342 MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
344 MMX_SAVE_1r( %%mm0, &tmp[i] );
/* Transpose back to row order and saturating-pack to bytes. */
347 MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 );
348 MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
349 MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] );
350 MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] );
351 MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] );
352 MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] );
/* Vertical half-pel 6-tap filter, 4 pixels wide, one output row per loop
 * iteration: combines six consecutive source rows starting at src with
 * weights (1,-5,20,20,-5,1), adds the +16 bias from %mm6, shifts right
 * by 5 and saturating-packs to bytes.  %mm7 serves as the zero register
 * for the unpacks — presumably zeroed before the loop, not visible here.
 * NOTE(review): the y declaration, braces and the per-row src/dst advance
 * are missing from this extracted listing. */
358 static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
365 MMX_INIT( %%mm6, x264_w0x10 );
367 for( y = 0; y < i_height; y++ )
/* rows 0-3 of the 6-row window */
369 MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src );
370 MMX_FILTERTAP_P1( %%mm0, %%mm1 );
371 MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );
/* rows 4-5, then finalize with rounding and >>5 */
373 MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src );
374 MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 );
375 MMX_SAVE_4P( %%mm0, %%mm7, dst );
/* Center (diagonal) half-pel, 4 wide — scalar reference path.  Per output
 * row: evaluate the vertical 6-tap filter at 9 consecutive horizontal
 * positions (columns -2..+6), then apply the horizontal 6-tap to those
 * intermediate sums with the combined rounding (+512, >>10) and clip via
 * x264_mc_clip1.
 * NOTE(review): the x/y/i and tap[] declarations, braces and the per-row
 * src/dst advance are missing from this extracted listing. */
382 static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
386 for( y = 0; y < i_height; y++ )
/* vertical taps for the 5+4 columns needed by 4 outputs */
390 for( i = 0; i < 5+4; i++ )
392 tap[i] = x264_tapfilter( &src[-2+i], i_src_stride );
395 for( x = 0; x < 4; x++ )
397 dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
/* Instantiate the 4x8 mmxext quarter-pel functions.  mc_xyYX handles the
 * fractional position (x = X/4, y = Y/4); the trailing offset argument
 * selects which neighbouring integer/half-pel plane is averaged in. */
405 MC_IH( mc_xy10, mmxext, 4, 8, 0 )
406 MC_IH( mc_xy30, mmxext, 4, 8, 1 )
408 MC_IV( mc_xy01, mmxext, 4, 8, 0 )
409 MC_IV( mc_xy03, mmxext, 4, 8, i_src_stride )
411 MC_HV( mc_xy11, mmxext, 4, 8, 0, 0 )
412 MC_HV( mc_xy31, mmxext, 4, 8, 1, 0 )
413 MC_HV( mc_xy13, mmxext, 4, 8, 0, i_src_stride )
414 MC_HV( mc_xy33, mmxext, 4, 8, 1, i_src_stride )
416 MC_CH( mc_xy21, mmxext, 4, 8, 0 )
417 MC_CH( mc_xy23, mmxext, 4, 8, i_src_stride )
419 MC_CV( mc_xy12, mmxext, 4, 8, 0 )
420 MC_CV( mc_xy32, mmxext, 4, 8, 1 )
/* Plain-C quarter-pel MC for 4-wide blocks: each mc_xyYX_w4 computes the
 * fractional position (x = X/4, y = Y/4) by averaging two of — the
 * integer-pel plane, the horizontal (mc_hh), vertical (mc_hv) or diagonal
 * (mc_hc) half-pel outputs — per the H.264 quarter-pel rules.
 * NOTE(review): the tmp/tmp1/tmp2 scratch-buffer declarations, function
 * braces and the pixel_avg_w4 helper definition are not visible in this
 * extracted listing — verify against the original file. */
423 static void mc_xy10_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
426 mc_hh_w4( src, i_src_stride, tmp, 4, i_height );
427 pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height );
429 static void mc_xy30_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
432 mc_hh_w4( src, i_src_stride, tmp, 4, i_height );
433 pixel_avg_w4( dst, i_dst_stride, src+1, i_src_stride, tmp, 4, i_height );
436 static void mc_xy01_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
439 mc_hv_w4( src, i_src_stride, tmp, 4, i_height );
440 pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height );
442 static void mc_xy03_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
445 mc_hv_w4( src, i_src_stride, tmp, 4, i_height );
446 pixel_avg_w4( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 4, i_height );
449 static void mc_xy11_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
454 mc_hv_w4( src, i_src_stride, tmp1, 4, i_height );
455 mc_hh_w4( src, i_src_stride, tmp2, 4, i_height );
456 pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
458 static void mc_xy31_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
463 mc_hv_w4( src+1, i_src_stride, tmp1, 4, i_height );
464 mc_hh_w4( src, i_src_stride, tmp2, 4, i_height );
465 pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
467 static void mc_xy13_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
472 mc_hv_w4( src, i_src_stride, tmp1, 4, i_height );
473 mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height );
474 pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
476 static void mc_xy33_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
481 mc_hv_w4( src+1, i_src_stride, tmp1, 4, i_height );
482 mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height );
483 pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
486 static void mc_xy21_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
491 mc_hc_w4( src, i_src_stride, tmp1, 4, i_height );
492 mc_hh_w4( src, i_src_stride, tmp2, 4, i_height );
493 pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
495 static void mc_xy23_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
500 mc_hc_w4( src, i_src_stride, tmp1, 4, i_height );
501 mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height );
502 pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
505 static void mc_xy12_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
510 mc_hc_w4( src, i_src_stride, tmp1, 4, i_height );
511 mc_hv_w4( src, i_src_stride, tmp2, 4, i_height );
512 pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
514 static void mc_xy32_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
519 mc_hc_w4( src, i_src_stride, tmp1, 4, i_height );
520 mc_hv_w4( src+1, i_src_stride, tmp2, 4, i_height );
521 pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );
525 /*****************************************************************************
526 * MC with width == 8 (height <= 16)
527 *****************************************************************************/
/* Horizontal half-pel filter for an 8-wide block: two independent
 * 4-wide passes over the left and right halves.
 * NOTE(review): brace lines restored, listing residue removed. */
static inline void mc_hh_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
    mc_hh_w4( &src[0], i_src, &dst[0], i_dst, i_height );
    mc_hh_w4( &src[4], i_src, &dst[4], i_dst, i_height );
}
/* Vertical half-pel 6-tap filter, 8 pixels wide, one output row per loop
 * iteration.  The 8 bytes per row are unpacked into two 4-word halves
 * (low in %mm0, high in %mm5) and filtered in parallel with the dual
 * FILTERTAP2 macros, then both halves get the +16 bias / >>5 finalize
 * and are packed back to 8 bytes.  %mm7 is the zero register for the
 * unpacks — presumably zeroed before the loop, not visible here.
 * NOTE(review): the y declaration, braces and the per-row src/dst
 * advance are missing from this extracted listing. */
534 static inline void mc_hv_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
541 MMX_INIT( %%mm6, x264_w0x10 );
543 for( y = 0; y < i_height; y++ )
/* rows 0-1: accumulate 1*row0 - 5*row1 in each half */
545 MMX_LOAD_2x8( %%mm0, %%mm5, %%mm1, %%mm2, %%mm7, &src[0*i_src], i_src );
546 MMX_FILTERTAP2_P1( %%mm0, %%mm1, %%mm5, %%mm2 );
/* rows 2-3: += 20*(row2+row3) in each half */
549 MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7, &src[2*i_src], i_src );
550 MMX_FILTERTAP2_P2( %%mm0, %%mm1, %%mm2, %%mm5, %%mm3, %%mm4 );
/* rows 4-5: -5*row4 + row5, round and >>5, per half */
552 MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7, &src[4*i_src], i_src );
553 MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
554 MMX_FILTERTAP_P3( %%mm5, %%mm3, %%mm4, %%mm6, %%mm7 );
556 MMX_SAVEPACK_8P( %%mm0, %%mm5, %%mm7, dst );
/* Center (diagonal) half-pel for an 8-wide block.  Per output row the
 * vertical 6-tap filter is evaluated at horizontal columns -2..+10:
 * the first asm block produces 8 word taps (stored to tap[0..7]), the
 * second produces 4 more (tap[8..11]), the 13th is done in scalar, and
 * the horizontal 6-tap + (+512 >> 10) + clip is applied in scalar code.
 * The asm keeps unscaled sums (no rounding/shift), matching the scalar
 * x264_tapfilter convention.  %mm7 is zeroed up front for the unpacks.
 * NOTE(review): the x/y and tap[] declarations, loop braces and the
 * per-row src/dst advance are missing from this extracted listing; the
 * asm also addresses with 32-bit %eax/leal, so this path looks
 * x86-32-only — confirm. */
563 static inline void mc_hc_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
567 asm volatile( "pxor %%mm7, %%mm7\n" : : );
569 for( y = 0; y < i_height; y++ )
/* Vertical taps for 8 columns: accumulate rows -2..+3 with weights
 * (1,-5,20,20,-5,1) into two 4-word accumulators %mm0 (low) / %mm2
 * (high); %eax = src-2*stride-2 + stride for odd row offsets. */
575 "leal (%0, %1), %%eax\n"
577 "movq (%0), %%mm0\n" /* load pix-2 */
578 "movq %%mm0, %%mm2\n"
579 "punpcklbw %%mm7, %%mm0\n"
580 "punpckhbw %%mm7, %%mm2\n"
582 "movq (%%eax),%%mm1\n" /* load pix-1 */
583 "movq %%mm1, %%mm3\n"
584 "punpcklbw %%mm7, %%mm1\n"
585 "punpckhbw %%mm7, %%mm3\n"
586 "psubw %%mm1, %%mm0\n"
588 "psubw %%mm1, %%mm0\n"
589 "psubw %%mm3, %%mm2\n"
591 "psubw %%mm3, %%mm2\n"
593 "movq (%%eax,%1),%%mm1\n" /* load pix */
594 "movq %%mm1, %%mm3\n"
595 "punpcklbw %%mm7, %%mm1\n"
596 "punpckhbw %%mm7, %%mm3\n"
598 "paddw %%mm1, %%mm0\n"
600 "paddw %%mm1, %%mm0\n"
602 "paddw %%mm3, %%mm2\n"
604 "paddw %%mm3, %%mm2\n"
606 "movq (%%eax,%1,2),%%mm1\n" /* load pix+1 */
607 "movq %%mm1, %%mm3\n"
608 "punpcklbw %%mm7, %%mm1\n"
609 "punpckhbw %%mm7, %%mm3\n"
611 "paddw %%mm1, %%mm0\n"
613 "paddw %%mm1, %%mm0\n"
615 "paddw %%mm3, %%mm2\n"
617 "paddw %%mm3, %%mm2\n"
619 "movq (%0,%1,4),%%mm1\n" /* load pix+2 */
620 "movq %%mm1, %%mm3\n"
621 "punpcklbw %%mm7, %%mm1\n"
622 "punpckhbw %%mm7, %%mm3\n"
623 "psubw %%mm1, %%mm0\n"
625 "psubw %%mm1, %%mm0\n"
626 "psubw %%mm3, %%mm2\n"
628 "psubw %%mm3, %%mm2\n"
630 "movq (%%eax,%1,4),%%mm1\n" /* load pix+3 */
631 "movq %%mm1, %%mm3\n"
632 "punpcklbw %%mm7, %%mm1\n"
633 "punpckhbw %%mm7, %%mm3\n"
634 "paddw %%mm1, %%mm0\n"
635 "paddw %%mm3, %%mm2\n"
/* store tap[4..7] (tap[0..3] presumably stored by a missing movq) */
638 "movq %%mm2, 8(%2)\n"
/* Second block: same vertical filter for 4 more columns, 4 bytes at a
 * time (movd), giving tap[8..11]. */
645 "movd (%0), %%mm0\n" /* load pix-2 */
646 "punpcklbw %%mm7, %%mm0\n"
648 "movd (%%eax),%%mm1\n" /* load pix-1 */
649 "punpcklbw %%mm7, %%mm1\n"
650 "psubw %%mm1, %%mm0\n"
652 "psubw %%mm1, %%mm0\n"
654 "movd (%%eax,%1),%%mm1\n" /* load pix */
655 "punpcklbw %%mm7, %%mm1\n"
657 "paddw %%mm1, %%mm0\n"
659 "paddw %%mm1, %%mm0\n"
661 "movd (%%eax,%1,2),%%mm1\n" /* load pix+1 */
662 "punpcklbw %%mm7, %%mm1\n"
664 "paddw %%mm1, %%mm0\n"
666 "paddw %%mm1, %%mm0\n"
668 "movd (%0,%1,4),%%mm1\n" /* load pix+2 */
669 "punpcklbw %%mm7, %%mm1\n"
670 "psubw %%mm1, %%mm0\n"
672 "psubw %%mm1, %%mm0\n"
674 "movd (%%eax,%1,4),%%mm1\n" /* load pix+3 */
675 "punpcklbw %%mm7, %%mm1\n"
676 "paddw %%mm1, %%mm0\n"
678 "movq %%mm0, 16(%2)\n"
679 : : "r"(src-2*i_src_stride-2), "r"(i_src_stride), "r"(&tap[0]) : "%eax" );
/* Last column's vertical tap, done in scalar. */
682 tap[8+4] = x264_tapfilter( &src[-2+8+4], i_src_stride );
/* Horizontal 6-tap over the vertical taps, with combined rounding/clip. */
684 for( x = 0; x < 8; x++ )
686 dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
/* Instantiate the 8x16 mmxext quarter-pel functions (same layout as the
 * 4x8 set above). */
694 MC_IH( mc_xy10, mmxext, 8, 16, 0 )
695 MC_IH( mc_xy30, mmxext, 8, 16, 1 )
697 MC_IV( mc_xy01, mmxext, 8, 16, 0 )
698 MC_IV( mc_xy03, mmxext, 8, 16, i_src_stride )
700 MC_HV( mc_xy11, mmxext, 8, 16, 0, 0 )
701 MC_HV( mc_xy31, mmxext, 8, 16, 1, 0 )
702 MC_HV( mc_xy13, mmxext, 8, 16, 0, i_src_stride )
703 MC_HV( mc_xy33, mmxext, 8, 16, 1, i_src_stride )
705 MC_CH( mc_xy21, mmxext, 8, 16, 0 )
706 MC_CH( mc_xy23, mmxext, 8, 16, i_src_stride )
708 MC_CV( mc_xy12, mmxext, 8, 16, 0 )
709 MC_CV( mc_xy32, mmxext, 8, 16, 1 )
/* Plain-C quarter-pel MC for 8-wide blocks — same scheme as the 4-wide
 * family above: average two of the integer-pel / hh / hv / hc planes.
 * NOTE(review): the tmp/tmp1/tmp2 scratch-buffer declarations, function
 * braces and the pixel_avg_w8 helper definition are not visible in this
 * extracted listing — verify against the original file. */
713 static void mc_xy10_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
716 mc_hh_w8( src, i_src_stride, tmp, 8, i_height );
717 pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height );
719 static void mc_xy30_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
722 mc_hh_w8( src, i_src_stride, tmp, 8, i_height );
723 pixel_avg_w8( dst, i_dst_stride, src+1, i_src_stride, tmp, 8, i_height );
726 static void mc_xy01_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
729 mc_hv_w8( src, i_src_stride, tmp, 8, i_height );
730 pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height );
732 static void mc_xy03_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
735 mc_hv_w8( src, i_src_stride, tmp, 8, i_height );
736 pixel_avg_w8( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 8, i_height );
739 static void mc_xy11_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
744 mc_hv_w8( src, i_src_stride, tmp1, 8, i_height );
745 mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );
746 pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
748 static void mc_xy31_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
753 mc_hv_w8( src+1, i_src_stride, tmp1, 8, i_height );
754 mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );
755 pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
757 static void mc_xy13_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
762 mc_hv_w8( src, i_src_stride, tmp1, 8, i_height );
763 mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );
764 pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
766 static void mc_xy33_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
771 mc_hv_w8( src+1, i_src_stride, tmp1, 8, i_height );
772 mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );
773 pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
775 static void mc_xy21_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
780 mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
781 mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );
782 pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
784 static void mc_xy12_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
789 mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
790 mc_hv_w8( src, i_src_stride, tmp2, 8, i_height );
791 pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
793 static void mc_xy32_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
798 mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
799 mc_hv_w8( src+1, i_src_stride, tmp2, 8, i_height );
800 pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
802 static void mc_xy23_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
807 mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
808 mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );
809 pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
813 /*****************************************************************************
814 * MC with width == 16 (height <= 16)
815 *****************************************************************************/
/* Horizontal half-pel filter for a 16-wide block: four 4-wide passes.
 * NOTE(review): brace lines restored, listing residue removed. */
static inline void mc_hh_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
    mc_hh_w4( &src[ 0], i_src, &dst[ 0], i_dst, i_height );
    mc_hh_w4( &src[ 4], i_src, &dst[ 4], i_dst, i_height );
    mc_hh_w4( &src[ 8], i_src, &dst[ 8], i_dst, i_height );
    mc_hh_w4( &src[12], i_src, &dst[12], i_dst, i_height );
}
/* Vertical half-pel filter for a 16-wide block: two 8-wide passes.
 * NOTE(review): brace lines restored, listing residue removed. */
static inline void mc_hv_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    mc_hv_w8( src, i_src_stride, dst, i_dst_stride, i_height );
    mc_hv_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height );
}
/* Center (diagonal) half-pel filter for a 16-wide block: two 8-wide
 * passes.
 * NOTE(review): brace lines restored, listing residue removed. */
static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    mc_hc_w8( src, i_src_stride, dst, i_dst_stride, i_height );
    mc_hc_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height );
}
/* Instantiate the 16x16 mmxext quarter-pel functions. */
837 MC_IH( mc_xy10, mmxext, 16, 16, 0 )
838 MC_IH( mc_xy30, mmxext, 16, 16, 1 )
840 MC_IV( mc_xy01, mmxext, 16, 16, 0 )
841 MC_IV( mc_xy03, mmxext, 16, 16, i_src_stride )
843 MC_HV( mc_xy11, mmxext, 16, 16, 0, 0 )
844 MC_HV( mc_xy31, mmxext, 16, 16, 1, 0 )
845 MC_HV( mc_xy13, mmxext, 16, 16, 0, i_src_stride )
846 MC_HV( mc_xy33, mmxext, 16, 16, 1, i_src_stride )
848 MC_CH( mc_xy21, mmxext, 16, 16, 0 )
849 MC_CH( mc_xy23, mmxext, 16, 16, i_src_stride )
851 MC_CV( mc_xy12, mmxext, 16, 16, 0 )
852 MC_CV( mc_xy32, mmxext, 16, 16, 1 )
/* SSE2 variants: same filters, but the final averaging uses the
 * x264_pixel_avg_w16_sse2 primitive. */
855 MC_IH( mc_xy10, sse2, 16, 16, 0 )
856 MC_IH( mc_xy30, sse2, 16, 16, 1 )
858 MC_IV( mc_xy01, sse2, 16, 16, 0 )
859 MC_IV( mc_xy03, sse2, 16, 16, i_src_stride )
861 MC_HV( mc_xy11, sse2, 16, 16, 0, 0 )
862 MC_HV( mc_xy31, sse2, 16, 16, 1, 0 )
863 MC_HV( mc_xy13, sse2, 16, 16, 0, i_src_stride )
864 MC_HV( mc_xy33, sse2, 16, 16, 1, i_src_stride )
866 MC_CH( mc_xy21, sse2, 16, 16, 0 )
867 MC_CH( mc_xy23, sse2, 16, 16, i_src_stride )
869 MC_CV( mc_xy12, sse2, 16, 16, 0 )
870 MC_CV( mc_xy32, sse2, 16, 16, 1 )
/* Plain-C quarter-pel MC for 16-wide blocks — same scheme as the 4/8-wide
 * families: average two of the integer-pel / hh / hv / hc planes into an
 * aligned 16x16 scratch buffer, then pixel_avg_w16 merges them.
 * NOTE(review): the function braces and the pixel_avg_w16 helper
 * definition are not visible in this extracted listing. */
875 static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
877 DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
878 mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
879 pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
881 static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
883 DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
884 mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
885 pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, tmp, 16, i_height );
888 static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
890 DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
891 mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
892 pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
894 static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
896 DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
897 mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
898 pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 16, i_height );
901 static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
903 DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
904 DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
906 mc_hv_w16( src, i_src_stride, tmp1, 16, i_height );
907 mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
908 pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
910 static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
912 DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
913 DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
915 mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height );
916 mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
917 pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
919 static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
921 DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
922 DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
924 mc_hv_w16( src, i_src_stride, tmp1, 16, i_height );
925 mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
926 pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
928 static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
930 DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
931 DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
933 mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height );
934 mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
935 pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
937 static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
939 DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
940 DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
942 mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
943 mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
944 pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
946 static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
948 DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
949 DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
951 mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
952 mc_hv_w16( src, i_src_stride, tmp2, 16, i_height );
953 pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
955 static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
957 DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
958 DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
960 mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
961 mc_hv_w16( src+1, i_src_stride, tmp2, 16, i_height );
962 pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
964 static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
966 DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
967 DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
969 mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
970 mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
971 pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height );
/* Common dispatcher body for the motion_compensation_luma_* functions:
 * applies the integer-pel part of (mvx,mvy) to src, then calls the
 * quarter-pel function selected by width and the fractional parts from
 * the caller's pf_mc[width-index][dqy][dqx] table.
 * NOTE(review): the if/brace lines were missing from the extracted
 * listing and are restored; a trailing "\n" is added to the error
 * message so it is not glued to subsequent stderr output. */
#define MOTION_COMPENSATION_LUMA \
    src += (mvy >> 2) * i_src_stride + (mvx >> 2); \
    if( i_width == 4 ) \
    { \
        pf_mc[0][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \
    } \
    else if( i_width == 8 ) \
    { \
        pf_mc[1][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \
    } \
    else if( i_width == 16 ) \
    { \
        pf_mc[2][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \
    } \
    else \
    { \
        fprintf( stderr, "Error: motion_compensation_luma called with invalid width\n" ); \
    }
/* Quarter-pel luma motion compensation, MMXEXT flavour.
 * (mvx,mvy) is the motion vector in quarter-pel units; i_width must be
 * 4, 8 or 16.  The static table maps the fractional position
 * [dqy][dqx] to one of 16 interpolators per block width. */
static void motion_compensation_luma_mmxext( uint8_t *src, int i_src_stride,
                                             uint8_t *dst, int i_dst_stride,
                                             int mvx, int mvy,
                                             int i_width, int i_height )
{
    static const pf_mc_t pf_mc[3][4][4] = /*XXX [dqy][dqx] */
    {
        {   /* width 4 */
            { x264_mc_copy_w4_mmxext, mc_xy10_w4_mmxext, mc_hh_w4, mc_xy30_w4_mmxext },
            { mc_xy01_w4_mmxext, mc_xy11_w4_mmxext, mc_xy21_w4_mmxext, mc_xy31_w4_mmxext },
            { mc_hv_w4, mc_xy12_w4_mmxext, mc_hc_w4, mc_xy32_w4_mmxext },
            { mc_xy03_w4_mmxext, mc_xy13_w4_mmxext, mc_xy23_w4_mmxext, mc_xy33_w4_mmxext },
        },
        {   /* width 8 */
            { x264_mc_copy_w8_mmxext, mc_xy10_w8_mmxext, mc_hh_w8, mc_xy30_w8_mmxext },
            { mc_xy01_w8_mmxext, mc_xy11_w8_mmxext, mc_xy21_w8_mmxext, mc_xy31_w8_mmxext },
            { mc_hv_w8, mc_xy12_w8_mmxext, mc_hc_w8, mc_xy32_w8_mmxext },
            { mc_xy03_w8_mmxext, mc_xy13_w8_mmxext, mc_xy23_w8_mmxext, mc_xy33_w8_mmxext },
        },
        {   /* width 16 */
            { x264_mc_copy_w16_mmxext, mc_xy10_w16_mmxext, mc_hh_w16, mc_xy30_w16_mmxext },
            { mc_xy01_w16_mmxext, mc_xy11_w16_mmxext, mc_xy21_w16_mmxext, mc_xy31_w16_mmxext },
            { mc_hv_w16, mc_xy12_w16_mmxext, mc_hc_w16, mc_xy32_w16_mmxext },
            { mc_xy03_w16_mmxext, mc_xy13_w16_mmxext, mc_xy23_w16_mmxext, mc_xy33_w16_mmxext },
        }
    };

    MOTION_COMPENSATION_LUMA
}
/* Quarter-pel luma motion compensation, SSE2 flavour.
 * Identical dispatch structure to the MMXEXT variant above; only the
 * width-16 row uses SSE2 routines — the narrower widths keep the
 * MMXEXT/C implementations. */
static void motion_compensation_luma_sse2( uint8_t *src, int i_src_stride,
                                           uint8_t *dst, int i_dst_stride,
                                           int mvx, int mvy,
                                           int i_width, int i_height )
{
    static const pf_mc_t pf_mc[3][4][4] = /*XXX [dqy][dqx] */
    {
        {   /* width 4: MMXEXT */
            { x264_mc_copy_w4_mmxext, mc_xy10_w4_mmxext, mc_hh_w4, mc_xy30_w4_mmxext },
            { mc_xy01_w4_mmxext, mc_xy11_w4_mmxext, mc_xy21_w4_mmxext, mc_xy31_w4_mmxext },
            { mc_hv_w4, mc_xy12_w4_mmxext, mc_hc_w4, mc_xy32_w4_mmxext },
            { mc_xy03_w4_mmxext, mc_xy13_w4_mmxext, mc_xy23_w4_mmxext, mc_xy33_w4_mmxext },
        },
        {   /* width 8: MMXEXT */
            { x264_mc_copy_w8_mmxext, mc_xy10_w8_mmxext, mc_hh_w8, mc_xy30_w8_mmxext },
            { mc_xy01_w8_mmxext, mc_xy11_w8_mmxext, mc_xy21_w8_mmxext, mc_xy31_w8_mmxext },
            { mc_hv_w8, mc_xy12_w8_mmxext, mc_hc_w8, mc_xy32_w8_mmxext },
            { mc_xy03_w8_mmxext, mc_xy13_w8_mmxext, mc_xy23_w8_mmxext, mc_xy33_w8_mmxext },
        },
        {   /* width 16: SSE2 */
            { x264_mc_copy_w16_sse2, mc_xy10_w16_sse2, mc_hh_w16, mc_xy30_w16_sse2 },
            { mc_xy01_w16_sse2, mc_xy11_w16_sse2, mc_xy21_w16_sse2, mc_xy31_w16_sse2 },
            { mc_hv_w16, mc_xy12_w16_sse2, mc_hc_w16, mc_xy32_w16_sse2 },
            { mc_xy03_w16_sse2, mc_xy13_w16_sse2, mc_xy23_w16_sse2, mc_xy33_w16_sse2 },
        }
    };

    MOTION_COMPENSATION_LUMA
}
void mc_luma_mmx( uint8_t *src[4], int i_src_stride,
                  uint8_t *dst, int i_dst_stride,
                  int mvx, int mvy,
                  int i_width, int i_height )
{
    /* src[] holds the pre-interpolated half-pel planes (0=full, 1=h, 2=v,
     * 3=hv).  A quarter-pel position is produced by averaging the two
     * nearest half-pel samples; a half/full-pel position is a plain copy. */
    int corr = (mvx&1) && (mvy&1) && ((mvx&2) ^ (mvy&2));
    int hx1 = mvx >> 1;
    int hy1 = (mvy + 1 - corr) >> 1;
    uint8_t *plane1 = src[(hx1 & 1) + ((hy1 & 1) << 1)]
                    + (hy1 >> 1) * i_src_stride + (hx1 >> 1);

    if( (mvx | mvy) & 1 ) /* qpel interpolation needed */
    {
        int hx2 = (mvx + 1) >> 1;
        int hy2 = (mvy + corr) >> 1;
        uint8_t *plane2 = src[(hx2 & 1) + ((hy2 & 1) << 1)]
                        + (hy2 >> 1) * i_src_stride + (hx2 >> 1);

        switch( i_width )
        {
            case 4:
                x264_pixel_avg_w4_mmxext( dst, i_dst_stride, plane1, i_src_stride,
                                          plane2, i_src_stride, i_height );
                break;
            case 8:
                x264_pixel_avg_w8_mmxext( dst, i_dst_stride, plane1, i_src_stride,
                                          plane2, i_src_stride, i_height );
                break;
            case 16:
                x264_pixel_avg_w16_mmxext( dst, i_dst_stride, plane1, i_src_stride,
                                           plane2, i_src_stride, i_height );
                break;
        }
    }
    else
    {
        switch( i_width )
        {
            case 4:
                x264_mc_copy_w4_mmxext( plane1, i_src_stride, dst, i_dst_stride, i_height );
                break;
            case 8:
                x264_mc_copy_w8_mmxext( plane1, i_src_stride, dst, i_dst_stride, i_height );
                break;
            case 16:
                x264_mc_copy_w16_mmxext( plane1, i_src_stride, dst, i_dst_stride, i_height );
                break;
        }
    }
}
/* Like mc_luma_mmx, but avoids the copy when the requested position is an
 * already-interpolated half-pel plane: in that case it returns a pointer
 * into src[] and updates *i_dst_stride, otherwise it averages into dst.
 * Returns the buffer the caller should read from (dst or a src[] plane).
 * Fix: neither branch returned a value, so a caller using the result read
 * an indeterminate pointer (UB for a non-void function). */
uint8_t *get_ref_mmx( uint8_t *src[4], int i_src_stride,
                      uint8_t *dst, int *i_dst_stride,
                      int mvx, int mvy,
                      int i_width, int i_height )
{
    uint8_t *src1, *src2;

    /* select the half-pel plane nearest to the quarter-pel position */
    int correction = (mvx&1) && (mvy&1) && ((mvx&2) ^ (mvy&2));
    int hpel1x = mvx>>1;
    int hpel1y = (mvy+1-correction)>>1;
    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );

    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);

    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
    {
        int hpel2x = (mvx+1)>>1;
        int hpel2y = (mvy+correction)>>1;
        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );

        src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);

        if( i_width == 4 )
        {
            x264_pixel_avg_w4_mmxext( dst, *i_dst_stride, src1, i_src_stride,
                                      src2, i_src_stride, i_height );
        }
        else if( i_width == 8 )
        {
            x264_pixel_avg_w8_mmxext( dst, *i_dst_stride, src1, i_src_stride,
                                      src2, i_src_stride, i_height );
        }
        else if( i_width == 16 )
        {
            x264_pixel_avg_w16_mmxext(dst, *i_dst_stride, src1, i_src_stride,
                                      src2, i_src_stride, i_height );
        }
        return dst;
    }
    else
    {
        /* half/full-pel: hand back the pre-interpolated plane directly */
        *i_dst_stride = i_src_stride;
        return src1;
    }
}
1155 void x264_mc_mmxext_init( x264_mc_functions_t *pf )
1157 pf->mc_luma = mc_luma_mmx;
1158 pf->get_ref = get_ref_mmx;
1160 pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
1161 pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
1162 pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmxext;
1163 pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_mmxext;
1164 pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_mmxext;
1165 pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmxext;
1166 pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
1167 pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
1169 pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
1170 pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_mmxext;
1171 pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_mmxext;
1172 pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_mmxext;
1173 pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_mmxext;
1174 pf->avg_weight[PIXEL_4x4] = x264_pixel_avg_weight_4x4_mmxext;
1175 // avg_weight_4x8 is rare and 4x2 is not used
1177 void x264_mc_sse2_init( x264_mc_functions_t *pf )
1179 /* todo: use sse2 */
1180 pf->mc_luma = mc_luma_mmx;
1181 pf->get_ref = get_ref_mmx;
1185 void get_funcs_mmx(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv)
1189 *int_hv = mc_hc_w16;
1192 void get_funcs_sse2(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv)
1196 *int_hv = mc_hc_w16;