1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.11 2001/09/06 14:02:56 massiot Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define MODULE_NAME motionmmx
26 #include "modules_inner.h"
28 /*****************************************************************************
30 *****************************************************************************/
33 #include <stdlib.h> /* malloc(), free() */
37 #include "common.h" /* boolean_t, byte_t */
45 #include "modules_export.h"
47 /*****************************************************************************
48 * Local and extern prototypes.
49 *****************************************************************************/
/* Forward declaration; the definition is at the bottom of this file. */
50 static void motion_getfunctions( function_list_t * p_function_list );
52 /*****************************************************************************
53 * Build configuration tree.
54 *****************************************************************************/
/* NOTE(review): partial listing -- the MODULE_CONFIG_START/STOP and
 * MODULE_ACTIVATE_START/STOP macro lines that normally bracket these
 * statements are not visible in this excerpt; confirm against the full file. */
/* Configuration window for this module (nothing configurable yet). */
56 ADD_WINDOW( "Configuration for MMX motion compensation module" )
57 ADD_COMMENT( "Ha, ha -- nothing to configure yet" )
/* Declare the capabilities this module provides to the plugin framework. */
61 p_module->i_capabilities = MODULE_CAPABILITY_NULL
62 | MODULE_CAPABILITY_MOTION;
63 p_module->psz_longname = "MMX motion compensation module";
/* Export the motion-compensation function table on activation. */
67 motion_getfunctions( &p_module->p_functions->motion );
/* Nothing to release on deactivation. */
70 MODULE_DEACTIVATE_START
71 MODULE_DEACTIVATE_STOP
73 /*****************************************************************************
74 * motion_Probe: probes the CPU and returns a score
75 *****************************************************************************/
/* motion_Probe: decide whether this module can run on the current CPU.
 * NOTE(review): partial listing -- the function braces and the return
 * statements (score values) are missing from this excerpt. */
76 static int motion_Probe( probedata_t *p_data )
/* Bail out on CPUs without MMX support. */
78 if( !TestCPU( CPU_CAPABILITY_MMX ) )
/* User explicitly asked for this module by name -- presumably returns a
 * higher score here; the return values are not visible. */
83 if( TestMethod( MOTION_METHOD_VAR, "motionmmx" )
84 || TestMethod( MOTION_METHOD_VAR, "mmx" ) )
92 /*****************************************************************************
93 * Motion compensation in MMX
94 *****************************************************************************/
96 // some rounding constants
// round1: four packed 16-bit words of 1; added before >>1 so that the
// 2-point averages below compute (a + b + 1) / 2 (round to nearest).
97 mmx_t round1 = {0x0001000100010001LL};
// round4: four packed 16-bit words of 2; added before >>2 so that the
// 4-point averages below compute (a + b + c + d + 2) / 4.
98 mmx_t round4 = {0x0002000200020002LL};
101 * This code should probably be compiled with loop unrolling
102 * (i.e., -funroll-loops in gcc) because some of the loops
103 * use a small static number of iterations. This was written
104 * with the assumption the compiler knows best about when
105 * unrolling will help
// Zero mm0 so the punpck{l,h}bw unpacks below can widen bytes to words.
// NOTE(review): the body is not visible in this excerpt (presumably a
// single pxor of mm0 with itself) -- confirm against the full file.
108 static __inline__ void mmx_zero_reg ()
/* mmx_average_2_U8: rounded byte-wise average of two 8-byte blocks:
 * for each of the 8 bytes, *dest = (*src1 + *src2 + 1) / 2.
 * Requires mm0 == 0 (see mmx_zero_reg) for the unpacks.
 * NOTE(review): partial listing -- the function braces are missing here. */
114 static __inline__ void mmx_average_2_U8 (yuv_data_t * dest,
115 yuv_data_t * src1, yuv_data_t * src2)
118 // *dest = (*src1 + *src2 + 1)/ 2;
121 movq_m2r (*src1, mm1); // load 8 src1 bytes
122 movq_r2r (mm1, mm2); // copy 8 src1 bytes
124 movq_m2r (*src2, mm3); // load 8 src2 bytes
125 movq_r2r (mm3, mm4); // copy 8 src2 bytes
127 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
128 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
130 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
131 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
133 paddw_r2r (mm3, mm1); // add lows to mm1
// +1 per word for round-to-nearest before the shift
134 paddw_m2r (round1, mm1);
135 psraw_i2r (1, mm1); // /2
137 paddw_r2r (mm4, mm2); // add highs to mm2
138 paddw_m2r (round1, mm2);
139 psraw_i2r (1, mm2); // /2
141 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
142 movq_r2m (mm1, *dest); // store result in dest
/* mmx_interp_average_2_U8: average dest with the rounded average of two
 * sources, byte-wise over an 8-byte block:
 * *dest = (*dest + (*src1 + *src2 + 1) / 2 + 1) / 2.
 * Used for the "avg" (dual-prime / B-picture accumulate) half-pel cases.
 * Requires mm0 == 0 for the unpacks.
 * NOTE(review): partial listing -- the function braces are missing here. */
145 static __inline__ void mmx_interp_average_2_U8 (yuv_data_t * dest,
146 yuv_data_t * src1, yuv_data_t * src2)
149 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
152 movq_m2r (*dest, mm1); // load 8 dest bytes
153 movq_r2r (mm1, mm2); // copy 8 dest bytes
155 movq_m2r (*src1, mm3); // load 8 src1 bytes
156 movq_r2r (mm3, mm4); // copy 8 src1 bytes
158 movq_m2r (*src2, mm5); // load 8 src2 bytes
159 movq_r2r (mm5, mm6); // copy 8 src2 bytes
161 punpcklbw_r2r (mm0, mm1); // unpack low dest bytes
162 punpckhbw_r2r (mm0, mm2); // unpack high dest bytes
164 punpcklbw_r2r (mm0, mm3); // unpack low src1 bytes
165 punpckhbw_r2r (mm0, mm4); // unpack high src1 bytes
167 punpcklbw_r2r (mm0, mm5); // unpack low src2 bytes
168 punpckhbw_r2r (mm0, mm6); // unpack high src2 bytes
// First stage: (src1 + src2 + 1) / 2 in mm3/mm4
170 paddw_r2r (mm5, mm3); // add lows
171 paddw_m2r (round1, mm3);
172 psraw_i2r (1, mm3); // /2
174 paddw_r2r (mm6, mm4); // add highs
175 paddw_m2r (round1, mm4);
176 psraw_i2r (1, mm4); // /2
// Second stage: average the result with dest
178 paddw_r2r (mm3, mm1); // add lows
179 paddw_m2r (round1, mm1);
180 psraw_i2r (1, mm1); // /2
182 paddw_r2r (mm4, mm2); // add highs
183 paddw_m2r (round1, mm2);
184 psraw_i2r (1, mm2); // /2
186 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
187 movq_r2m (mm1, *dest); // store result in dest
/* mmx_average_4_U8: rounded byte-wise average of four 8-byte blocks:
 * *dest = (*src1 + *src2 + *src3 + *src4 + 2) / 4.
 * Used for the x+y (diagonal) half-pel interpolation.
 * Requires mm0 == 0 for the unpacks.
 * NOTE(review): partial listing -- the function braces are missing here. */
190 static __inline__ void mmx_average_4_U8 (yuv_data_t * dest,
191 yuv_data_t * src1, yuv_data_t * src2,
192 yuv_data_t * src3, yuv_data_t * src4)
195 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
198 movq_m2r (*src1, mm1); // load 8 src1 bytes
199 movq_r2r (mm1, mm2); // copy 8 src1 bytes
201 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
202 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
204 movq_m2r (*src2, mm3); // load 8 src2 bytes
205 movq_r2r (mm3, mm4); // copy 8 src2 bytes
207 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
208 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
210 paddw_r2r (mm3, mm1); // add lows
211 paddw_r2r (mm4, mm2); // add highs
213 // now have partials in mm1 and mm2
215 movq_m2r (*src3, mm3); // load 8 src3 bytes
216 movq_r2r (mm3, mm4); // copy 8 src3 bytes
218 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
219 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
221 paddw_r2r (mm3, mm1); // add lows
222 paddw_r2r (mm4, mm2); // add highs
224 movq_m2r (*src4, mm5); // load 8 src4 bytes
225 movq_r2r (mm5, mm6); // copy 8 src4 bytes
227 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
228 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
230 paddw_r2r (mm5, mm1); // add lows
231 paddw_r2r (mm6, mm2); // add highs
233 // now have subtotal in mm1 and mm2
// +2 per word for round-to-nearest before dividing by 4
235 paddw_m2r (round4, mm1);
236 psraw_i2r (2, mm1); // /4
237 paddw_m2r (round4, mm2);
238 psraw_i2r (2, mm2); // /4
240 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
241 movq_r2m (mm1, *dest); // store result in dest
/* mmx_interp_average_4_U8: average dest with the rounded average of four
 * sources, byte-wise over an 8-byte block:
 * *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2) / 4 + 1) / 2.
 * Used for the "avg" variant of the diagonal half-pel interpolation.
 * Requires mm0 == 0 for the unpacks.
 * NOTE(review): partial listing -- the function braces are missing here. */
244 static __inline__ void mmx_interp_average_4_U8 (yuv_data_t * dest,
245 yuv_data_t * src1, yuv_data_t * src2,
246 yuv_data_t * src3, yuv_data_t * src4)
249 // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
252 movq_m2r (*src1, mm1); // load 8 src1 bytes
253 movq_r2r (mm1, mm2); // copy 8 src1 bytes
255 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
256 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
258 movq_m2r (*src2, mm3); // load 8 src2 bytes
259 movq_r2r (mm3, mm4); // copy 8 src2 bytes
261 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
262 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
264 paddw_r2r (mm3, mm1); // add lows
265 paddw_r2r (mm4, mm2); // add highs
267 // now have partials in mm1 and mm2
269 movq_m2r (*src3, mm3); // load 8 src3 bytes
270 movq_r2r (mm3, mm4); // copy 8 src3 bytes
272 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
273 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
275 paddw_r2r (mm3, mm1); // add lows
276 paddw_r2r (mm4, mm2); // add highs
278 movq_m2r (*src4, mm5); // load 8 src4 bytes
279 movq_r2r (mm5, mm6); // copy 8 src4 bytes
281 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
282 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
284 paddw_r2r (mm5, mm1); // add lows
285 paddw_r2r (mm6, mm2); // add highs
// First stage: (src1 + src2 + src3 + src4 + 2) / 4
287 paddw_m2r (round4, mm1);
288 psraw_i2r (2, mm1); // /4
289 paddw_m2r (round4, mm2);
290 psraw_i2r (2, mm2); // /4
292 // now have subtotal/4 in mm1 and mm2
294 movq_m2r (*dest, mm3); // load 8 dest bytes
295 movq_r2r (mm3, mm4); // copy 8 dest bytes
297 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes
298 punpckhbw_r2r (mm0, mm4); // unpack high dest bytes
300 paddw_r2r (mm3, mm1); // add lows
301 paddw_r2r (mm4, mm2); // add highs
// Second stage: average the subtotal with dest
303 paddw_m2r (round1, mm1);
304 psraw_i2r (1, mm1); // /2
305 paddw_m2r (round1, mm2);
306 psraw_i2r (1, mm2); // /2
308 // now have end value in mm1 and mm2
310 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
311 movq_r2m (mm1,*dest); // store result in dest
314 //-----------------------------------------------------------------------
/* MC_avg_mmx: full-pel motion compensation with averaging into dest
 * (dest = avg(dest, ref)), one 8- or 16-byte row per iteration.
 * NOTE(review): partial listing -- the braces, the per-row loop over
 * height, the width==16 test and the dest/ref += stride advance are
 * missing from this excerpt. */
316 static __inline__ void MC_avg_mmx (int width, int height,
317 yuv_data_t * dest, yuv_data_t * ref, int stride)
322 mmx_average_2_U8 (dest, dest, ref);
/* Second 8-byte half of the row -- presumably guarded by a width == 16
 * test that is not visible here. */
325 mmx_average_2_U8 (dest+8, dest+8, ref+8);
/* Width-16 entry point matching the (dest, ref, stride, height) table API. */
332 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
333 int stride, int height)
335 MC_avg_mmx (16, height, dest, ref, stride);
/* Width-8 entry point. */
338 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
339 int stride, int height)
341 MC_avg_mmx (8, height, dest, ref, stride);
344 //-----------------------------------------------------------------------
/* MC_put_mmx: full-pel motion compensation, straight copy of the
 * reference block into dest, 8 bytes at a time via movq.
 * NOTE(review): partial listing -- braces, row loop, width test and
 * stride advance are missing from this excerpt. */
346 static __inline__ void MC_put_mmx (int width, int height,
347 yuv_data_t * dest, yuv_data_t * ref, int stride)
352 movq_m2r (* ref, mm1); // load 8 ref bytes
353 movq_r2m (mm1,* dest); // store 8 bytes at curr
/* Second 8-byte half of the row (16-wide blocks only). */
357 movq_m2r (* (ref+8), mm1); // load 8 ref bytes
358 movq_r2m (mm1,* (dest+8)); // store 8 bytes at curr
/* Width-16 entry point matching the (dest, ref, stride, height) table API. */
366 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
367 int stride, int height)
369 MC_put_mmx (16, height, dest, ref, stride);
/* Width-8 entry point. */
372 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
373 int stride, int height)
375 MC_put_mmx (8, height, dest, ref, stride);
378 //-----------------------------------------------------------------------
380 // Half pixel interpolation in the x direction
/* MC_avg_x_mmx: horizontal half-pel prediction (ref, ref+1) averaged
 * into the existing dest contents.
 * NOTE(review): partial listing -- braces, row loop, width test and
 * stride advance are missing from this excerpt. */
381 static __inline__ void MC_avg_x_mmx (int width, int height,
382 yuv_data_t * dest, yuv_data_t * ref, int stride)
387 mmx_interp_average_2_U8 (dest, ref, ref+1);
/* Second 8-byte half of the row (16-wide blocks only). */
390 mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
/* Width-16 entry point. */
397 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
398 int stride, int height)
400 MC_avg_x_mmx (16, height, dest, ref, stride);
/* Width-8 entry point. */
403 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
404 int stride, int height)
406 MC_avg_x_mmx (8, height, dest, ref, stride);
409 //-----------------------------------------------------------------------
/* MC_put_x_mmx: horizontal half-pel prediction (ref, ref+1), result
 * written directly to dest (no accumulate).
 * NOTE(review): partial listing -- braces, row loop, width test and
 * stride advance are missing from this excerpt. */
411 static __inline__ void MC_put_x_mmx (int width, int height,
412 yuv_data_t * dest, yuv_data_t * ref, int stride)
417 mmx_average_2_U8 (dest, ref, ref+1);
/* Second 8-byte half of the row (16-wide blocks only). */
420 mmx_average_2_U8 (dest+8, ref+8, ref+9);
/* Width-16 entry point. */
427 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
428 int stride, int height)
430 MC_put_x_mmx (16, height, dest, ref, stride);
/* Width-8 entry point. */
433 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
434 int stride, int height)
436 MC_put_x_mmx (8, height, dest, ref, stride);
439 //-----------------------------------------------------------------------
/* MC_avg_xy_mmx: diagonal (x+y) half-pel prediction from the four
 * neighbors (ref, ref+1, ref+stride, ref+stride+1), averaged into the
 * existing dest contents.
 * NOTE(review): partial listing -- braces, row loop, width test and the
 * per-row advance of dest/ref/ref_next are missing from this excerpt. */
441 static __inline__ void MC_avg_xy_mmx (int width, int height,
442 yuv_data_t * dest, yuv_data_t * ref, int stride)
444 yuv_data_t * ref_next = ref+stride;
449 mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
/* Second 8-byte half of the row (16-wide blocks only). */
452 mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
453 ref_next+8, ref_next+9);
/* Width-16 entry point. */
461 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
462 int stride, int height)
464 MC_avg_xy_mmx (16, height, dest, ref, stride);
/* Width-8 entry point. */
467 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
468 int stride, int height)
470 MC_avg_xy_mmx (8, height, dest, ref, stride);
473 //-----------------------------------------------------------------------
/* MC_put_xy_mmx: diagonal (x+y) half-pel prediction from the four
 * neighbors (ref, ref+1, ref+stride, ref+stride+1), written directly
 * to dest (no accumulate).
 * NOTE(review): partial listing -- braces, row loop, width test and the
 * per-row advance of dest/ref/ref_next are missing from this excerpt. */
475 static __inline__ void MC_put_xy_mmx (int width, int height,
476 yuv_data_t * dest, yuv_data_t * ref, int stride)
478 yuv_data_t * ref_next = ref+stride;
483 mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
/* Second 8-byte half of the row (16-wide blocks only). */
486 mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
/* Width-16 entry point. */
494 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
495 int stride, int height)
497 MC_put_xy_mmx (16, height, dest, ref, stride);
/* Width-8 entry point. */
500 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
501 int stride, int height)
503 MC_put_xy_mmx (8, height, dest, ref, stride);
506 //-----------------------------------------------------------------------
/* MC_avg_y_mmx: vertical half-pel prediction (ref, ref+stride) averaged
 * into the existing dest contents.
 * NOTE(review): partial listing -- braces, row loop, width test and the
 * per-row advance of dest/ref/ref_next are missing from this excerpt. */
508 static __inline__ void MC_avg_y_mmx (int width, int height,
509 yuv_data_t * dest, yuv_data_t * ref, int stride)
511 yuv_data_t * ref_next = ref+stride;
516 mmx_interp_average_2_U8 (dest, ref, ref_next);
/* Second 8-byte half of the row (16-wide blocks only). */
519 mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
/* Width-16 entry point. */
527 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
528 int stride, int height)
530 MC_avg_y_mmx (16, height, dest, ref, stride);
/* Width-8 entry point. */
533 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
534 int stride, int height)
536 MC_avg_y_mmx (8, height, dest, ref, stride);
539 //-----------------------------------------------------------------------
/* MC_put_y_mmx: vertical half-pel prediction (ref, ref+stride), written
 * directly to dest (no accumulate).
 * NOTE(review): partial listing -- braces, row loop, width test and the
 * per-row advance of dest/ref/ref_next are missing from this excerpt. */
541 static __inline__ void MC_put_y_mmx (int width, int height,
542 yuv_data_t * dest, yuv_data_t * ref, int stride)
544 yuv_data_t * ref_next = ref+stride;
549 mmx_average_2_U8 (dest, ref, ref_next);
/* Second 8-byte half of the row (16-wide blocks only). */
552 mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
/* Width-16 entry point. */
560 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
561 int stride, int height)
563 MC_put_y_mmx (16, height, dest, ref, stride);
/* Width-8 entry point. */
566 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
567 int stride, int height)
569 MC_put_y_mmx (8, height, dest, ref, stride);
573 /*****************************************************************************
574 * Functions exported as capabilities. They are declared as static so that
575 * we don't pollute the namespace too much.
576 *****************************************************************************/
/* motion_getfunctions: export the 16 MMX motion-compensation entry
 * points through the plugin function list.
 * The table appears to be indexed as [copy/avg][16-wide/8-wide]
 * [full-pel / x half-pel / y half-pel / xy half-pel] -- confirm against
 * the full file; the initializer braces are not visible here.
 * NOTE(review): partial listing -- function braces, the remaining array
 * declarator line(s) and any trailing #undef are missing. */
577 static void motion_getfunctions( function_list_t * p_function_list )
579 static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
583 /* Copying functions */
586 MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
590 MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
594 /* Averaging functions */
597 MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
601 MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
606 p_function_list->pf_probe = motion_Probe;
/* Copy all 2 * 2 * 4 = 16 function pointers into the exported struct. */
608 #define list p_function_list->functions.motion
609 memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );