1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.14 2001/12/30 07:09:55 sam Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>                                      /* memcpy() */

#include <videolan/vlc.h>
35 /*****************************************************************************
36 * Local and extern prototypes.
37 *****************************************************************************/
38 static void motion_getfunctions( function_list_t * p_function_list );
40 /*****************************************************************************
41 * Build configuration tree.
42 *****************************************************************************/
/* Module bootstrap macros (VLC 0.x plugin ABI).
 * NOTE(review): the surrounding MODULE_CONFIG_START/STOP, MODULE_INIT_START/
 * STOP and MODULE_ACTIVATE_START/STOP scaffolding lines are missing from this
 * extract — confirm against the upstream file before building. */
SET_DESCRIPTION( "MMX motion compensation module" )
ADD_CAPABILITY( MOTION, 150 )      /* default capability score, see probe */
ADD_REQUIREMENT( MMX )             /* module only loads on MMX-capable CPUs */
ADD_SHORTCUT( "motionmmx" )
    /* On activation, export this module's motion-compensation table. */
    motion_getfunctions( &p_module->p_functions->motion );
MODULE_DEACTIVATE_START
MODULE_DEACTIVATE_STOP
61 /*****************************************************************************
 * motion_Probe: probes the CPU and returns a score
63 *****************************************************************************/
64 static int motion_Probe( probedata_t *p_data )
69 /*****************************************************************************
70 * Motion compensation in MMX
71 *****************************************************************************/
73 // some rounding constants
74 mmx_t round1 = {0x0001000100010001LL};
75 mmx_t round4 = {0x0002000200020002LL};
/*
 * This code should probably be compiled with loop unrolling
 * (i.e., -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations. This was written
 * with the assumption that the compiler knows best about when
 * unrolling will help.
 */
85 static __inline__ void mmx_zero_reg ()
91 static __inline__ void mmx_average_2_U8 (yuv_data_t * dest,
92 yuv_data_t * src1, yuv_data_t * src2)
95 // *dest = (*src1 + *src2 + 1)/ 2;
98 movq_m2r (*src1, mm1); // load 8 src1 bytes
99 movq_r2r (mm1, mm2); // copy 8 src1 bytes
101 movq_m2r (*src2, mm3); // load 8 src2 bytes
102 movq_r2r (mm3, mm4); // copy 8 src2 bytes
104 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
105 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
107 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
108 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
110 paddw_r2r (mm3, mm1); // add lows to mm1
111 paddw_m2r (round1, mm1);
112 psraw_i2r (1, mm1); // /2
114 paddw_r2r (mm4, mm2); // add highs to mm2
115 paddw_m2r (round1, mm2);
116 psraw_i2r (1, mm2); // /2
118 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
119 movq_r2m (mm1, *dest); // store result in dest
122 static __inline__ void mmx_interp_average_2_U8 (yuv_data_t * dest,
123 yuv_data_t * src1, yuv_data_t * src2)
126 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
129 movq_m2r (*dest, mm1); // load 8 dest bytes
130 movq_r2r (mm1, mm2); // copy 8 dest bytes
132 movq_m2r (*src1, mm3); // load 8 src1 bytes
133 movq_r2r (mm3, mm4); // copy 8 src1 bytes
135 movq_m2r (*src2, mm5); // load 8 src2 bytes
136 movq_r2r (mm5, mm6); // copy 8 src2 bytes
138 punpcklbw_r2r (mm0, mm1); // unpack low dest bytes
139 punpckhbw_r2r (mm0, mm2); // unpack high dest bytes
141 punpcklbw_r2r (mm0, mm3); // unpack low src1 bytes
142 punpckhbw_r2r (mm0, mm4); // unpack high src1 bytes
144 punpcklbw_r2r (mm0, mm5); // unpack low src2 bytes
145 punpckhbw_r2r (mm0, mm6); // unpack high src2 bytes
147 paddw_r2r (mm5, mm3); // add lows
148 paddw_m2r (round1, mm3);
149 psraw_i2r (1, mm3); // /2
151 paddw_r2r (mm6, mm4); // add highs
152 paddw_m2r (round1, mm4);
153 psraw_i2r (1, mm4); // /2
155 paddw_r2r (mm3, mm1); // add lows
156 paddw_m2r (round1, mm1);
157 psraw_i2r (1, mm1); // /2
159 paddw_r2r (mm4, mm2); // add highs
160 paddw_m2r (round1, mm2);
161 psraw_i2r (1, mm2); // /2
163 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
164 movq_r2m (mm1, *dest); // store result in dest
167 static __inline__ void mmx_average_4_U8 (yuv_data_t * dest,
168 yuv_data_t * src1, yuv_data_t * src2,
169 yuv_data_t * src3, yuv_data_t * src4)
172 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
175 movq_m2r (*src1, mm1); // load 8 src1 bytes
176 movq_r2r (mm1, mm2); // copy 8 src1 bytes
178 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
179 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
181 movq_m2r (*src2, mm3); // load 8 src2 bytes
182 movq_r2r (mm3, mm4); // copy 8 src2 bytes
184 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
185 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
187 paddw_r2r (mm3, mm1); // add lows
188 paddw_r2r (mm4, mm2); // add highs
190 // now have partials in mm1 and mm2
192 movq_m2r (*src3, mm3); // load 8 src3 bytes
193 movq_r2r (mm3, mm4); // copy 8 src3 bytes
195 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
196 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
198 paddw_r2r (mm3, mm1); // add lows
199 paddw_r2r (mm4, mm2); // add highs
201 movq_m2r (*src4, mm5); // load 8 src4 bytes
202 movq_r2r (mm5, mm6); // copy 8 src4 bytes
204 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
205 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
207 paddw_r2r (mm5, mm1); // add lows
208 paddw_r2r (mm6, mm2); // add highs
210 // now have subtotal in mm1 and mm2
212 paddw_m2r (round4, mm1);
213 psraw_i2r (2, mm1); // /4
214 paddw_m2r (round4, mm2);
215 psraw_i2r (2, mm2); // /4
217 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
218 movq_r2m (mm1, *dest); // store result in dest
221 static __inline__ void mmx_interp_average_4_U8 (yuv_data_t * dest,
222 yuv_data_t * src1, yuv_data_t * src2,
223 yuv_data_t * src3, yuv_data_t * src4)
226 // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
229 movq_m2r (*src1, mm1); // load 8 src1 bytes
230 movq_r2r (mm1, mm2); // copy 8 src1 bytes
232 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
233 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
235 movq_m2r (*src2, mm3); // load 8 src2 bytes
236 movq_r2r (mm3, mm4); // copy 8 src2 bytes
238 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
239 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
241 paddw_r2r (mm3, mm1); // add lows
242 paddw_r2r (mm4, mm2); // add highs
244 // now have partials in mm1 and mm2
246 movq_m2r (*src3, mm3); // load 8 src3 bytes
247 movq_r2r (mm3, mm4); // copy 8 src3 bytes
249 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
250 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
252 paddw_r2r (mm3, mm1); // add lows
253 paddw_r2r (mm4, mm2); // add highs
255 movq_m2r (*src4, mm5); // load 8 src4 bytes
256 movq_r2r (mm5, mm6); // copy 8 src4 bytes
258 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
259 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
261 paddw_r2r (mm5, mm1); // add lows
262 paddw_r2r (mm6, mm2); // add highs
264 paddw_m2r (round4, mm1);
265 psraw_i2r (2, mm1); // /4
266 paddw_m2r (round4, mm2);
267 psraw_i2r (2, mm2); // /4
269 // now have subtotal/4 in mm1 and mm2
271 movq_m2r (*dest, mm3); // load 8 dest bytes
272 movq_r2r (mm3, mm4); // copy 8 dest bytes
274 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes
275 punpckhbw_r2r (mm0, mm4); // unpack high dest bytes
277 paddw_r2r (mm3, mm1); // add lows
278 paddw_r2r (mm4, mm2); // add highs
280 paddw_m2r (round1, mm1);
281 psraw_i2r (1, mm1); // /2
282 paddw_m2r (round1, mm2);
283 psraw_i2r (1, mm2); // /2
285 // now have end value in mm1 and mm2
287 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
288 movq_r2m (mm1,*dest); // store result in dest
291 //-----------------------------------------------------------------------
293 static __inline__ void MC_avg_mmx (int width, int height,
294 yuv_data_t * dest, yuv_data_t * ref, int stride)
299 mmx_average_2_U8 (dest, dest, ref);
302 mmx_average_2_U8 (dest+8, dest+8, ref+8);
309 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
310 int stride, int height)
312 MC_avg_mmx (16, height, dest, ref, stride);
315 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
316 int stride, int height)
318 MC_avg_mmx (8, height, dest, ref, stride);
321 //-----------------------------------------------------------------------
323 static __inline__ void MC_put_mmx (int width, int height,
324 yuv_data_t * dest, yuv_data_t * ref, int stride)
329 movq_m2r (* ref, mm1); // load 8 ref bytes
330 movq_r2m (mm1,* dest); // store 8 bytes at curr
334 movq_m2r (* (ref+8), mm1); // load 8 ref bytes
335 movq_r2m (mm1,* (dest+8)); // store 8 bytes at curr
343 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
344 int stride, int height)
346 MC_put_mmx (16, height, dest, ref, stride);
349 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
350 int stride, int height)
352 MC_put_mmx (8, height, dest, ref, stride);
355 //-----------------------------------------------------------------------
357 // Half pixel interpolation in the x direction
358 static __inline__ void MC_avg_x_mmx (int width, int height,
359 yuv_data_t * dest, yuv_data_t * ref, int stride)
364 mmx_interp_average_2_U8 (dest, ref, ref+1);
367 mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
374 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
375 int stride, int height)
377 MC_avg_x_mmx (16, height, dest, ref, stride);
380 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
381 int stride, int height)
383 MC_avg_x_mmx (8, height, dest, ref, stride);
386 //-----------------------------------------------------------------------
388 static __inline__ void MC_put_x_mmx (int width, int height,
389 yuv_data_t * dest, yuv_data_t * ref, int stride)
394 mmx_average_2_U8 (dest, ref, ref+1);
397 mmx_average_2_U8 (dest+8, ref+8, ref+9);
404 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
405 int stride, int height)
407 MC_put_x_mmx (16, height, dest, ref, stride);
410 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
411 int stride, int height)
413 MC_put_x_mmx (8, height, dest, ref, stride);
416 //-----------------------------------------------------------------------
418 static __inline__ void MC_avg_xy_mmx (int width, int height,
419 yuv_data_t * dest, yuv_data_t * ref, int stride)
421 yuv_data_t * ref_next = ref+stride;
426 mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
429 mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
430 ref_next+8, ref_next+9);
438 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
439 int stride, int height)
441 MC_avg_xy_mmx (16, height, dest, ref, stride);
444 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
445 int stride, int height)
447 MC_avg_xy_mmx (8, height, dest, ref, stride);
450 //-----------------------------------------------------------------------
452 static __inline__ void MC_put_xy_mmx (int width, int height,
453 yuv_data_t * dest, yuv_data_t * ref, int stride)
455 yuv_data_t * ref_next = ref+stride;
460 mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
463 mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
471 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
472 int stride, int height)
474 MC_put_xy_mmx (16, height, dest, ref, stride);
477 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
478 int stride, int height)
480 MC_put_xy_mmx (8, height, dest, ref, stride);
483 //-----------------------------------------------------------------------
485 static __inline__ void MC_avg_y_mmx (int width, int height,
486 yuv_data_t * dest, yuv_data_t * ref, int stride)
488 yuv_data_t * ref_next = ref+stride;
493 mmx_interp_average_2_U8 (dest, ref, ref_next);
496 mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
504 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
505 int stride, int height)
507 MC_avg_y_mmx (16, height, dest, ref, stride);
510 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
511 int stride, int height)
513 MC_avg_y_mmx (8, height, dest, ref, stride);
516 //-----------------------------------------------------------------------
518 static __inline__ void MC_put_y_mmx (int width, int height,
519 yuv_data_t * dest, yuv_data_t * ref, int stride)
521 yuv_data_t * ref_next = ref+stride;
526 mmx_average_2_U8 (dest, ref, ref_next);
529 mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
537 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
538 int stride, int height)
540 MC_put_y_mmx (16, height, dest, ref, stride);
543 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
544 int stride, int height)
546 MC_put_y_mmx (8, height, dest, ref, stride);
550 /*****************************************************************************
551 * Functions exported as capabilities. They are declared as static so that
552 * we don't pollute the namespace too much.
553 *****************************************************************************/
554 static void motion_getfunctions( function_list_t * p_function_list )
556 static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
560 /* Copying functions */
563 MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
567 MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
571 /* Averaging functions */
574 MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
578 MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
583 p_function_list->pf_probe = motion_Probe;
585 #define list p_function_list->functions.motion
586 memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );