/*****************************************************************************
 * motionmmx.c : MMX motion compensation module for vlc
 *****************************************************************************
 * Copyright (C) 2001 VideoLAN
 * $Id: motionmmx.c,v 1.12 2001/11/28 15:08:05 massiot Exp $
 *
 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *          Michel Lespinasse <walken@zoy.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
#define MODULE_NAME motionmmx
#include "modules_inner.h"
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>                                              /* memcpy() */

#include "config.h"
#include "common.h"                                     /* boolean_t, byte_t */
#include "threads.h"
#include "mtime.h"
#include "tests.h"                                /* TestCPU(), TestMethod() */

#include "video.h"                                             /* yuv_data_t */

#include "mmx.h"

#include "modules.h"
#include "modules_export.h"
/*****************************************************************************
 * Local and extern prototypes.
 *****************************************************************************/
static void motion_getfunctions( function_list_t * p_function_list );
/*****************************************************************************
 * Build configuration tree.
 *****************************************************************************/
MODULE_CONFIG_START
ADD_WINDOW( "Configuration for MMX motion compensation module" )
  ADD_COMMENT( "Ha, ha -- nothing to configure yet" )
MODULE_CONFIG_STOP

MODULE_INIT_START
    p_module->i_capabilities = MODULE_CAPABILITY_NULL
                                | MODULE_CAPABILITY_MOTION;
    p_module->psz_longname = "MMX motion compensation module";
MODULE_INIT_STOP

MODULE_ACTIVATE_START
    motion_getfunctions( &p_module->p_functions->motion );
MODULE_ACTIVATE_STOP

MODULE_DEACTIVATE_START
MODULE_DEACTIVATE_STOP
/*****************************************************************************
 * motion_Probe: probe the CPU and return a score
 *****************************************************************************/
static int motion_Probe( probedata_t *p_data )
{
    if( !TestCPU( CPU_CAPABILITY_MMX ) )
    {
        return( 0 );
    }

    if( TestMethod( MOTION_METHOD_VAR, "motionmmx" )
         || TestMethod( MOTION_METHOD_VAR, "mmx" ) )
    {
        return( 999 );
    }

    return( 150 );
}
/*****************************************************************************
 * Motion compensation in MMX
 *****************************************************************************/

// some rounding constants
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
/*
 * This code should probably be compiled with loop unrolling
 * (i.e., -funroll-loops in gcc), because some of the loops
 * use a small, static number of iterations. It was written
 * on the assumption that the compiler knows best about when
 * unrolling will help.
 */
static __inline__ void mmx_zero_reg ()
{
    // load 0 into mm0
    pxor_r2r (mm0, mm0);
}
static __inline__ void mmx_average_2_U8 (yuv_data_t * dest,
                                         yuv_data_t * src1, yuv_data_t * src2)
{
    // *dest = (*src1 + *src2 + 1)/ 2;

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows to mm1
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs to mm2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
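
/*
 * For reference, a scalar sketch (illustrative helper, not part of the
 * original module) of what mmx_average_2_U8 computes for each of its
 * 8 bytes: adding the round1 bias before the arithmetic shift makes the
 * halving round half-up.
 */
static __inline__ void scalar_average_2_U8 (yuv_data_t * dest,
                                            yuv_data_t * src1, yuv_data_t * src2)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = (src1[i] + src2[i] + 1) >> 1;
}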
static __inline__ void mmx_interp_average_2_U8 (yuv_data_t * dest,
                                                yuv_data_t * src1, yuv_data_t * src2)
{
    // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;

    movq_m2r (*dest, mm1);        // load 8 dest bytes
    movq_r2r (mm1, mm2);          // copy 8 dest bytes

    movq_m2r (*src1, mm3);        // load 8 src1 bytes
    movq_r2r (mm3, mm4);          // copy 8 src1 bytes

    movq_m2r (*src2, mm5);        // load 8 src2 bytes
    movq_r2r (mm5, mm6);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src2 bytes

    paddw_r2r (mm5, mm3);         // add lows
    paddw_m2r (round1, mm3);
    psraw_i2r (1, mm3);           // /2

    paddw_r2r (mm6, mm4);         // add highs
    paddw_m2r (round1, mm4);
    psraw_i2r (1, mm4);           // /2

    paddw_r2r (mm3, mm1);         // add lows
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
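
/*
 * Likewise, an illustrative scalar equivalent (not in the original module)
 * of mmx_interp_average_2_U8: the src1/src2 average is rounded first, then
 * averaged into dest with a second round1 bias, matching the two psraw
 * stages above.
 */
static __inline__ void scalar_interp_average_2_U8 (yuv_data_t * dest,
                                                   yuv_data_t * src1,
                                                   yuv_data_t * src2)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = (dest[i] + ((src1[i] + src2[i] + 1) >> 1) + 1) >> 1;
}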
static __inline__ void mmx_average_4_U8 (yuv_data_t * dest,
                                         yuv_data_t * src1, yuv_data_t * src2,
                                         yuv_data_t * src3, yuv_data_t * src4)
{
    // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
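
/*
 * Illustrative scalar equivalent (not in the original module) of
 * mmx_average_4_U8: the four sources sum to at most 4*255 + 2, which still
 * fits in a 16-bit word, so a single round4 bias and a shift by two give
 * a rounded four-way average.
 */
static __inline__ void scalar_average_4_U8 (yuv_data_t * dest,
                                            yuv_data_t * src1, yuv_data_t * src2,
                                            yuv_data_t * src3, yuv_data_t * src4)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = (src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2;
}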
static __inline__ void mmx_interp_average_4_U8 (yuv_data_t * dest,
                                                yuv_data_t * src1, yuv_data_t * src2,
                                                yuv_data_t * src3, yuv_data_t * src4)
{
    // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r (*dest, mm3);        // load 8 dest bytes
    movq_r2r (mm3, mm4);          // copy 8 dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high dest bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1,*dest);         // store result in dest
}
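
/*
 * Illustrative scalar equivalent (not in the original module) of
 * mmx_interp_average_4_U8: the rounded four-way source average is computed
 * first, then averaged into dest with the round1 bias, exactly as the two
 * shift stages above do eight bytes at a time.
 */
static __inline__ void scalar_interp_average_4_U8 (yuv_data_t * dest,
                                                   yuv_data_t * src1,
                                                   yuv_data_t * src2,
                                                   yuv_data_t * src3,
                                                   yuv_data_t * src4)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = (dest[i]
                    + ((src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2)
                    + 1) >> 1;
}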
//-----------------------------------------------------------------------

static __inline__ void MC_avg_mmx (int width, int height,
                                   yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, dest, ref);

        if (width == 16)
            mmx_average_2_U8 (dest+8, dest+8, ref+8);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}

static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                          int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_put_mmx (int width, int height,
                                   yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    do {
        movq_m2r (* ref, mm1);          // load 8 ref bytes
        movq_r2m (mm1,* dest);          // store 8 bytes at curr

        if (width == 16)
        {
            movq_m2r (* (ref+8), mm1);      // load 8 ref bytes
            movq_r2m (mm1,* (dest+8));      // store 8 bytes at curr
        }

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}

static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                          int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}
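
/*
 * Usage sketch (hypothetical helper and names, not part of the original
 * module): copy one 16x16 block from a reference plane into the current
 * plane at pixel position (i_x, i_y); both planes share the same stride.
 */
static __inline__ void example_copy_16x16 (yuv_data_t * p_current,
                                           yuv_data_t * p_reference,
                                           int i_stride, int i_x, int i_y)
{
    MC_put_16_mmx (p_current + i_y * i_stride + i_x,
                   p_reference + i_y * i_stride + i_x,
                   i_stride, 16);
}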
//-----------------------------------------------------------------------

// Half pixel interpolation in the x direction
static __inline__ void MC_avg_x_mmx (int width, int height,
                                     yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}

static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_put_x_mmx (int width, int height,
                                     yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}

static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_avg_xy_mmx (int width, int height,
                                      yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
                                     ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                             int stride, int height)
{
    MC_avg_xy_mmx (16, height, dest, ref, stride);
}

static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_xy_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_put_xy_mmx (int width, int height,
                                      yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                             int stride, int height)
{
    MC_put_xy_mmx (16, height, dest, ref, stride);
}

static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_xy_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_avg_y_mmx (int width, int height,
                                     yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}

static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static __inline__ void MC_put_y_mmx (int width, int height,
                                     yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}

static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}
/*****************************************************************************
 * Functions exported as capabilities. They are declared as static so that
 * we don't pollute the namespace too much.
 *****************************************************************************/
static void motion_getfunctions( function_list_t * p_function_list )
{
    static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
                                           int, int ) =
    {
        {
            /* Copying functions */
            {
                /* Width == 16 */
                MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
            },
            {
                /* Width == 8 */
                MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
            }
        },
        {
            /* Averaging functions */
            {
                /* Width == 16 */
                MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
            },
            {
                /* Width == 8 */
                MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
            }
        }
    };

    p_function_list->pf_probe = motion_Probe;

#define list p_function_list->functions.motion
    memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );
#undef list
}
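
/*
 * Illustrative dispatch sketch (function and variable names are
 * hypothetical, not from this file): how a caller might index a [2][2][4]
 * table laid out like ppppf_motion above. The first index selects copy (0)
 * vs. average (1), the second selects the 16- (0) vs. 8-pixel-wide (1)
 * variants, and the third encodes the half-pel offsets read off the
 * initializer: 0 = none, 1 = x, 2 = y, 3 = both.
 */
static __inline__ void example_motion_dispatch (
        void (* pf_table[2][2][4])( yuv_data_t *, yuv_data_t *, int, int ),
        yuv_data_t * p_dest, yuv_data_t * p_ref, int i_stride, int i_height,
        int b_average, int b_8_wide, int b_half_x, int b_half_y )
{
    pf_table[b_average][b_8_wide][(b_half_y << 1) | b_half_x]
        ( p_dest, p_ref, i_stride, i_height );
}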