1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.10 2001/08/22 17:21:45 massiot Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define MODULE_NAME motionmmx
26 #include "modules_inner.h"
28 /*****************************************************************************
30 *****************************************************************************/
33 #include <stdlib.h> /* malloc(), free() */
36 #include "common.h" /* boolean_t, byte_t */
44 #include "modules_export.h"
46 /*****************************************************************************
47 * Local and extern prototypes.
48 *****************************************************************************/
49 static void motion_getfunctions( function_list_t * p_function_list );
51 /*****************************************************************************
52 * Build configuration tree.
53 *****************************************************************************/
/* Configuration window shown by the interface: nothing user-tunable yet. */
ADD_WINDOW( "Configuration for MMX motion compensation module" )
ADD_COMMENT( "Ha, ha -- nothing to configure yet" )
/* Declare what this plugin provides to the module bank. */
p_module->i_capabilities = MODULE_CAPABILITY_NULL
| MODULE_CAPABILITY_MOTION;
p_module->psz_longname = "MMX motion compensation module";
/* Fill in the motion-compensation entry points (defined below). */
motion_getfunctions( &p_module->p_functions->motion );
/* No state is allocated at activation, so deactivation is empty. */
MODULE_DEACTIVATE_START
MODULE_DEACTIVATE_STOP
72 /*****************************************************************************
* motion_Probe: probes the CPU and returns a score
74 *****************************************************************************/
/* motion_Probe: decide whether this module can run and how strongly it
 * should be preferred.  Requires MMX support in the CPU; an explicit
 * user request via the MOTION_METHOD_VAR variable wins outright.
 * NOTE(review): the braces and the returned score values are not visible
 * in this chunk — confirm against the full file. */
static int motion_Probe( probedata_t *p_data )
/* Bail out when the CPU has no MMX capability. */
if( !TestCPU( CPU_CAPABILITY_MMX ) )
/* The user explicitly asked for this module (either spelling). */
if( TestMethod( MOTION_METHOD_VAR, "motionmmx" )
|| TestMethod( MOTION_METHOD_VAR, "mmx" ) )
91 /*****************************************************************************
92 * Motion compensation in MMX
93 *****************************************************************************/
// some rounding constants
// round1: four packed 16-bit words of 0x0001 — added before a >>1 so that
// (a + b + 1) / 2 rounds to nearest instead of truncating
mmx_t round1 = {0x0001000100010001LL};
// round4: four packed 16-bit words of 0x0002 — added before a >>2 so that
// (a + b + c + d + 2) / 4 rounds to nearest
mmx_t round4 = {0x0002000200020002LL};
100 * This code should probably be compiled with loop unrolling
* (ie, -funroll-loops in gcc) because some of the loops
102 * use a small static number of iterations. This was written
103 * with the assumption the compiler knows best about when
104 * unrolling will help
107 static __inline__ void mmx_zero_reg ()
/* mmx_average_2_U8: rounded byte-wise average of two 8-byte vectors.
 * For each of the 8 unsigned bytes: *dest = (*src1 + *src2 + 1) / 2.
 * Bytes are widened to 16-bit words (via unpack against mm0) so the sum
 * cannot overflow before the shift.  Assumes mm0 == 0, i.e. that
 * mmx_zero_reg() was called beforehand. */
static __inline__ void mmx_average_2_U8 (yuv_data_t * dest,
yuv_data_t * src1, yuv_data_t * src2)
// *dest = (*src1 + *src2 + 1)/ 2;
movq_m2r (*src1, mm1);        // load 8 src1 bytes
movq_r2r (mm1, mm2);          // copy 8 src1 bytes
movq_m2r (*src2, mm3);        // load 8 src2 bytes
movq_r2r (mm3, mm4);          // copy 8 src2 bytes
punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes
paddw_r2r (mm3, mm1);         // add lows to mm1
paddw_m2r (round1, mm1);      // +1 per word for round-to-nearest
psraw_i2r (1, mm1);           // /2
paddw_r2r (mm4, mm2);         // add highs to mm2
paddw_m2r (round1, mm2);      // +1 per word for round-to-nearest
psraw_i2r (1, mm2);           // /2
packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
movq_r2m (mm1, *dest);        // store result in dest
/* mmx_interp_average_2_U8: average of two sources, then averaged again
 * with the existing dest bytes (used for "avg" prediction on half-pel
 * positions).  Per byte: *dest = (*dest + (*src1 + *src2 + 1)/2 + 1)/2.
 * Assumes mm0 == 0 (mmx_zero_reg() called beforehand). */
static __inline__ void mmx_interp_average_2_U8 (yuv_data_t * dest,
yuv_data_t * src1, yuv_data_t * src2)
// *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
movq_m2r (*dest, mm1);        // load 8 dest bytes
movq_r2r (mm1, mm2);          // copy 8 dest bytes
movq_m2r (*src1, mm3);        // load 8 src1 bytes
movq_r2r (mm3, mm4);          // copy 8 src1 bytes
movq_m2r (*src2, mm5);        // load 8 src2 bytes
movq_r2r (mm5, mm6);          // copy 8 src2 bytes
punpcklbw_r2r (mm0, mm1);     // unpack low dest bytes
punpckhbw_r2r (mm0, mm2);     // unpack high dest bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src1 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src1 bytes
punpcklbw_r2r (mm0, mm5);     // unpack low src2 bytes
punpckhbw_r2r (mm0, mm6);     // unpack high src2 bytes
paddw_r2r (mm5, mm3);         // add lows (src1 + src2)
paddw_m2r (round1, mm3);      // round
psraw_i2r (1, mm3);           // /2
paddw_r2r (mm6, mm4);         // add highs (src1 + src2)
paddw_m2r (round1, mm4);      // round
psraw_i2r (1, mm4);           // /2
paddw_r2r (mm3, mm1);         // add lows (dest + interp)
paddw_m2r (round1, mm1);      // round
psraw_i2r (1, mm1);           // /2
paddw_r2r (mm4, mm2);         // add highs (dest + interp)
paddw_m2r (round1, mm2);      // round
psraw_i2r (1, mm2);           // /2
packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
movq_r2m (mm1, *dest);        // store result in dest
/* mmx_average_4_U8: rounded byte-wise average of four 8-byte vectors
 * (used for x+y half-pel interpolation).  Per byte:
 * *dest = (*src1 + *src2 + *src3 + *src4 + 2) / 4.
 * The 16-bit word accumulators cannot overflow: 4 * 255 + 2 < 65536.
 * Assumes mm0 == 0 (mmx_zero_reg() called beforehand). */
static __inline__ void mmx_average_4_U8 (yuv_data_t * dest,
yuv_data_t * src1, yuv_data_t * src2,
yuv_data_t * src3, yuv_data_t * src4)
// *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
movq_m2r (*src1, mm1);        // load 8 src1 bytes
movq_r2r (mm1, mm2);          // copy 8 src1 bytes
punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes
movq_m2r (*src2, mm3);        // load 8 src2 bytes
movq_r2r (mm3, mm4);          // copy 8 src2 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
// now have partials in mm1 and mm2
movq_m2r (*src3, mm3);        // load 8 src3 bytes
movq_r2r (mm3, mm4);          // copy 8 src3 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
movq_m2r (*src4, mm5);        // load 8 src4 bytes
movq_r2r (mm5, mm6);          // copy 8 src4 bytes
punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes
paddw_r2r (mm5, mm1);         // add lows
paddw_r2r (mm6, mm2);         // add highs
// now have subtotal in mm1 and mm2
paddw_m2r (round4, mm1);      // +2 per word for round-to-nearest
psraw_i2r (2, mm1);           // /4
paddw_m2r (round4, mm2);      // +2 per word for round-to-nearest
psraw_i2r (2, mm2);           // /4
packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
movq_r2m (mm1, *dest);        // store result in dest
/* mmx_interp_average_4_U8: four-way rounded average, then averaged with
 * the existing dest bytes ("avg" prediction on an x+y half-pel position).
 * Per byte: *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2.
 * Assumes mm0 == 0 (mmx_zero_reg() called beforehand). */
static __inline__ void mmx_interp_average_4_U8 (yuv_data_t * dest,
yuv_data_t * src1, yuv_data_t * src2,
yuv_data_t * src3, yuv_data_t * src4)
// *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
movq_m2r (*src1, mm1);        // load 8 src1 bytes
movq_r2r (mm1, mm2);          // copy 8 src1 bytes
punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes
movq_m2r (*src2, mm3);        // load 8 src2 bytes
movq_r2r (mm3, mm4);          // copy 8 src2 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
// now have partials in mm1 and mm2
movq_m2r (*src3, mm3);        // load 8 src3 bytes
movq_r2r (mm3, mm4);          // copy 8 src3 bytes
punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
movq_m2r (*src4, mm5);        // load 8 src4 bytes
movq_r2r (mm5, mm6);          // copy 8 src4 bytes
punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes
paddw_r2r (mm5, mm1);         // add lows
paddw_r2r (mm6, mm2);         // add highs
paddw_m2r (round4, mm1);      // +2 per word for round-to-nearest
psraw_i2r (2, mm1);           // /4
paddw_m2r (round4, mm2);      // +2 per word for round-to-nearest
psraw_i2r (2, mm2);           // /4
// now have subtotal/4 in mm1 and mm2
movq_m2r (*dest, mm3);        // load 8 dest bytes
movq_r2r (mm3, mm4);          // copy 8 dest bytes
punpcklbw_r2r (mm0, mm3);     // unpack low dest bytes
punpckhbw_r2r (mm0, mm4);     // unpack high dest bytes
paddw_r2r (mm3, mm1);         // add lows
paddw_r2r (mm4, mm2);         // add highs
paddw_m2r (round1, mm1);      // +1 per word for round-to-nearest
psraw_i2r (1, mm1);           // /2
paddw_m2r (round1, mm2);      // +1 per word for round-to-nearest
psraw_i2r (1, mm2);           // /2
// now have end value in mm1 and mm2
packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
movq_r2m (mm1,*dest);         // store result in dest
313 //-----------------------------------------------------------------------
/* MC_avg_mmx: "average" prediction for a width x height block —
 * dest bytes become the rounded average of dest and ref, 8 bytes at a
 * time; the second call covers bytes 8..15 of a 16-wide row.
 * NOTE(review): the per-row loop, the width==16 guard and the
 * dest/ref += stride advances are not visible in this chunk — confirm. */
static __inline__ void MC_avg_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
mmx_average_2_U8 (dest, dest, ref);
mmx_average_2_U8 (dest+8, dest+8, ref+8);
/* MC_avg_16_mmx: exported wrapper — average-predict a 16-wide block. */
static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_mmx (16, height, dest, ref, stride);
/* MC_avg_8_mmx: exported wrapper — average-predict an 8-wide block. */
static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_mmx (8, height, dest, ref, stride);
343 //-----------------------------------------------------------------------
/* MC_put_mmx: straight copy prediction — copy a width x height block
 * from ref to dest, 8 bytes per movq; the second pair of instructions
 * covers bytes 8..15 of a 16-wide row.
 * NOTE(review): the row loop, the width==16 guard and the stride
 * advances are not visible in this chunk — confirm. */
static __inline__ void MC_put_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
movq_m2r (* ref, mm1);        // load 8 ref bytes
movq_r2m (mm1,* dest);        // store 8 bytes at curr
movq_m2r (* (ref+8), mm1);    // load 8 ref bytes
movq_r2m (mm1,* (dest+8));    // store 8 bytes at curr
/* MC_put_16_mmx: exported wrapper — copy a 16-wide block. */
static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_mmx (16, height, dest, ref, stride);
/* MC_put_8_mmx: exported wrapper — copy an 8-wide block. */
static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_mmx (8, height, dest, ref, stride);
377 //-----------------------------------------------------------------------
379 // Half pixel interpolation in the x direction
// Half pixel interpolation in the x direction
/* MC_avg_x_mmx: "average" prediction at an x half-pel position —
 * dest is averaged with the mean of each ref byte and its right
 * neighbour (ref+1).  NOTE(review): row loop / width==16 guard not
 * visible in this chunk — confirm. */
static __inline__ void MC_avg_x_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
mmx_interp_average_2_U8 (dest, ref, ref+1);
mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
/* MC_avg_x16_mmx: exported wrapper — x half-pel avg, 16-wide block. */
static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_x_mmx (16, height, dest, ref, stride);
/* MC_avg_x8_mmx: exported wrapper — x half-pel avg, 8-wide block. */
static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_x_mmx (8, height, dest, ref, stride);
408 //-----------------------------------------------------------------------
/* MC_put_x_mmx: copy prediction at an x half-pel position — each dest
 * byte is the rounded mean of a ref byte and its right neighbour.
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_put_x_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
mmx_average_2_U8 (dest, ref, ref+1);
mmx_average_2_U8 (dest+8, ref+8, ref+9);
/* MC_put_x16_mmx: exported wrapper — x half-pel copy, 16-wide block. */
static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_x_mmx (16, height, dest, ref, stride);
/* MC_put_x8_mmx: exported wrapper — x half-pel copy, 8-wide block. */
static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_x_mmx (8, height, dest, ref, stride);
438 //-----------------------------------------------------------------------
/* MC_avg_xy_mmx: "average" prediction at an x+y half-pel position —
 * dest is averaged with the 4-way mean of ref, ref+1 and the two
 * corresponding bytes on the next line (ref_next, ref_next+1).
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_avg_xy_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
yuv_data_t * ref_next = ref+stride;   /* start of the line below ref */
mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
ref_next+8, ref_next+9);
/* MC_avg_xy16_mmx: exported wrapper — xy half-pel avg, 16-wide block. */
static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_xy_mmx (16, height, dest, ref, stride);
/* MC_avg_xy8_mmx: exported wrapper — xy half-pel avg, 8-wide block. */
static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_xy_mmx (8, height, dest, ref, stride);
472 //-----------------------------------------------------------------------
/* MC_put_xy_mmx: copy prediction at an x+y half-pel position — each
 * dest byte is the rounded 4-way mean of ref, ref+1 and the two bytes
 * below them on the next line.
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_put_xy_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
yuv_data_t * ref_next = ref+stride;   /* start of the line below ref */
mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
/* MC_put_xy16_mmx: exported wrapper — xy half-pel copy, 16-wide block. */
static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_xy_mmx (16, height, dest, ref, stride);
/* MC_put_xy8_mmx: exported wrapper — xy half-pel copy, 8-wide block. */
static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_xy_mmx (8, height, dest, ref, stride);
505 //-----------------------------------------------------------------------
/* MC_avg_y_mmx: "average" prediction at a y half-pel position — dest is
 * averaged with the mean of each ref byte and the byte directly below
 * it (ref_next).
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_avg_y_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
yuv_data_t * ref_next = ref+stride;   /* start of the line below ref */
mmx_interp_average_2_U8 (dest, ref, ref_next);
mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
/* MC_avg_y16_mmx: exported wrapper — y half-pel avg, 16-wide block. */
static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_y_mmx (16, height, dest, ref, stride);
/* MC_avg_y8_mmx: exported wrapper — y half-pel avg, 8-wide block. */
static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_avg_y_mmx (8, height, dest, ref, stride);
538 //-----------------------------------------------------------------------
/* MC_put_y_mmx: copy prediction at a y half-pel position — each dest
 * byte is the rounded mean of a ref byte and the byte directly below it.
 * NOTE(review): row loop / width==16 guard not visible in this chunk. */
static __inline__ void MC_put_y_mmx (int width, int height,
yuv_data_t * dest, yuv_data_t * ref, int stride)
yuv_data_t * ref_next = ref+stride;   /* start of the line below ref */
mmx_average_2_U8 (dest, ref, ref_next);
mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
/* MC_put_y16_mmx: exported wrapper — y half-pel copy, 16-wide block. */
static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_y_mmx (16, height, dest, ref, stride);
/* MC_put_y8_mmx: exported wrapper — y half-pel copy, 8-wide block. */
static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
int stride, int height)
MC_put_y_mmx (8, height, dest, ref, stride);
572 /*****************************************************************************
573 * Functions exported as capabilities. They are declared as static so that
574 * we don't pollute the namespace too much.
575 *****************************************************************************/
/* motion_getfunctions: export this module's capabilities — the probe
 * callback and the table of 2*2*4 = 16 motion-compensation routines.
 * Judging from the initializer order, ppppf_motion is indexed as
 * [put/avg][16-wide / 8-wide][full-pel, x, y, xy half-pel] — confirm
 * against the function_list_t declaration. */
static void motion_getfunctions( function_list_t * p_function_list )
static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
/* Copying functions */
MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
/* Averaging functions */
MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
p_function_list->pf_probe = motion_Probe;
#define list p_function_list->functions.motion
/* Copy all 16 function pointers into the exported table. */
memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );