1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.17 2002/05/18 17:47:47 sam Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <stdlib.h> /* malloc(), free() */
31 #include <videolan/vlc.h>
35 /*****************************************************************************
36 * Local and extern prototypes.
37 *****************************************************************************/
38 static void motion_getfunctions( function_list_t * p_function_list );
40 /*****************************************************************************
41 * Build configuration tree.
42 *****************************************************************************/
/* VLC module descriptor: registers this file as a MOTION-capability
 * provider (priority 150) that is only selected when the CPU has MMX,
 * and hooks the function table up on activation.
 * NOTE(review): the MODULE_CONFIG/MODULE_INIT/MODULE_ACTIVATE begin/end
 * macros are not visible in this chunk. */
47 SET_DESCRIPTION( _("MMX motion compensation module") )
48 ADD_CAPABILITY( MOTION, 150 )
49 ADD_REQUIREMENT( MMX )
51 ADD_SHORTCUT( "motionmmx" )
55 motion_getfunctions( &p_module->p_functions->motion );
58 MODULE_DEACTIVATE_START
59 MODULE_DEACTIVATE_STOP
61 /*****************************************************************************
62 * Motion compensation in MMX
63 *****************************************************************************/
65 // some rounding constants
66 mmx_t round1 = {0x0001000100010001LL};
67 mmx_t round4 = {0x0002000200020002LL};
70 * This code should probably be compiled with loop unrolling
71 * (i.e., -funroll-loops in gcc) because some of the loops
72 * use a small static number of iterations. This was written
73 * with the assumption the compiler knows best about when
/* Clear mm0, which the helpers below use as the all-zero operand when
 * unpacking bytes to words (punpcklbw/punpckhbw against mm0).
 * (Function body is not visible in this chunk.) */
77 static inline void mmx_zero_reg ()
/* Byte-wise rounded average of two 8-byte pixel rows:
 *     dest[i] = (src1[i] + src2[i] + 1) / 2   for i in 0..7.
 * Bytes are widened to words against mm0 (must already be zero — see
 * mmx_zero_reg) so the +1 bias and shift cannot overflow, then packed
 * back with unsigned saturation. */
83 static inline void mmx_average_2_U8 (yuv_data_t * dest,
84 yuv_data_t * src1, yuv_data_t * src2)
87 // *dest = (*src1 + *src2 + 1)/ 2;
90 movq_m2r (*src1, mm1); // load 8 src1 bytes
91 movq_r2r (mm1, mm2); // copy 8 src1 bytes
93 movq_m2r (*src2, mm3); // load 8 src2 bytes
94 movq_r2r (mm3, mm4); // copy 8 src2 bytes
96 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
97 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
99 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
100 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
102 paddw_r2r (mm3, mm1); // add lows to mm1
103 paddw_m2r (round1, mm1); // +1 bias so the shift rounds up
104 psraw_i2r (1, mm1); // /2
106 paddw_r2r (mm4, mm2); // add highs to mm2
107 paddw_m2r (round1, mm2); // +1 bias so the shift rounds up
108 psraw_i2r (1, mm2); // /2
110 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
111 movq_r2m (mm1, *dest); // store result in dest
/* Rounded average of two sources, then averaged again with the existing
 * destination (used for the "avg" prediction modes):
 *     dest[i] = (dest[i] + (src1[i] + src2[i] + 1)/2 + 1) / 2.
 * All arithmetic is done on words (mm0 must be zero for the unpacks)
 * and the result is repacked with unsigned saturation. */
114 static inline void mmx_interp_average_2_U8 (yuv_data_t * dest,
115 yuv_data_t * src1, yuv_data_t * src2)
118 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
121 movq_m2r (*dest, mm1); // load 8 dest bytes
122 movq_r2r (mm1, mm2); // copy 8 dest bytes
124 movq_m2r (*src1, mm3); // load 8 src1 bytes
125 movq_r2r (mm3, mm4); // copy 8 src1 bytes
127 movq_m2r (*src2, mm5); // load 8 src2 bytes
128 movq_r2r (mm5, mm6); // copy 8 src2 bytes
130 punpcklbw_r2r (mm0, mm1); // unpack low dest bytes
131 punpckhbw_r2r (mm0, mm2); // unpack high dest bytes
133 punpcklbw_r2r (mm0, mm3); // unpack low src1 bytes
134 punpckhbw_r2r (mm0, mm4); // unpack high src1 bytes
136 punpcklbw_r2r (mm0, mm5); // unpack low src2 bytes
137 punpckhbw_r2r (mm0, mm6); // unpack high src2 bytes
139 paddw_r2r (mm5, mm3); // add lows
140 paddw_m2r (round1, mm3); // rounded /2 of (src1+src2)
141 psraw_i2r (1, mm3); // /2
143 paddw_r2r (mm6, mm4); // add highs
144 paddw_m2r (round1, mm4);
145 psraw_i2r (1, mm4); // /2
147 paddw_r2r (mm3, mm1); // add lows
148 paddw_m2r (round1, mm1); // rounded /2 with dest
149 psraw_i2r (1, mm1); // /2
151 paddw_r2r (mm4, mm2); // add highs
152 paddw_m2r (round1, mm2);
153 psraw_i2r (1, mm2); // /2
155 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
156 movq_r2m (mm1, *dest); // store result in dest
/* Byte-wise rounded average of four 8-byte pixel rows (half-pel x+y
 * interpolation):
 *     dest[i] = (src1[i] + src2[i] + src3[i] + src4[i] + 2) / 4.
 * Sums are accumulated as words (mm0 must be zero for the unpacks);
 * four 8-bit values plus the bias fit comfortably in 16 bits. */
159 static inline void mmx_average_4_U8 (yuv_data_t * dest,
160 yuv_data_t * src1, yuv_data_t * src2,
161 yuv_data_t * src3, yuv_data_t * src4)
164 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
167 movq_m2r (*src1, mm1); // load 8 src1 bytes
168 movq_r2r (mm1, mm2); // copy 8 src1 bytes
170 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
171 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
173 movq_m2r (*src2, mm3); // load 8 src2 bytes
174 movq_r2r (mm3, mm4); // copy 8 src2 bytes
176 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
177 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
179 paddw_r2r (mm3, mm1); // add lows
180 paddw_r2r (mm4, mm2); // add highs
182 // now have partials in mm1 and mm2
184 movq_m2r (*src3, mm3); // load 8 src3 bytes
185 movq_r2r (mm3, mm4); // copy 8 src3 bytes
187 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
188 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
190 paddw_r2r (mm3, mm1); // add lows
191 paddw_r2r (mm4, mm2); // add highs
193 movq_m2r (*src4, mm5); // load 8 src4 bytes
194 movq_r2r (mm5, mm6); // copy 8 src4 bytes
196 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
197 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
199 paddw_r2r (mm5, mm1); // add lows
200 paddw_r2r (mm6, mm2); // add highs
202 // now have subtotal in mm1 and mm2
204 paddw_m2r (round4, mm1); // +2 bias so the shift rounds
205 psraw_i2r (2, mm1); // /4
206 paddw_m2r (round4, mm2);
207 psraw_i2r (2, mm2); // /4
209 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
210 movq_r2m (mm1, *dest); // store result in dest
/* Four-source rounded average, then averaged with the existing
 * destination (half-pel x+y interpolation combined with "avg" mode):
 *     dest[i] = (dest[i] + (src1[i]+src2[i]+src3[i]+src4[i]+2)/4 + 1) / 2.
 * Word-wide arithmetic throughout; mm0 must be zero for the unpacks. */
213 static inline void mmx_interp_average_4_U8 (yuv_data_t * dest,
214 yuv_data_t * src1, yuv_data_t * src2,
215 yuv_data_t * src3, yuv_data_t * src4)
218 // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
221 movq_m2r (*src1, mm1); // load 8 src1 bytes
222 movq_r2r (mm1, mm2); // copy 8 src1 bytes
224 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
225 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
227 movq_m2r (*src2, mm3); // load 8 src2 bytes
228 movq_r2r (mm3, mm4); // copy 8 src2 bytes
230 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
231 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
233 paddw_r2r (mm3, mm1); // add lows
234 paddw_r2r (mm4, mm2); // add highs
236 // now have partials in mm1 and mm2
238 movq_m2r (*src3, mm3); // load 8 src3 bytes
239 movq_r2r (mm3, mm4); // copy 8 src3 bytes
241 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
242 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
244 paddw_r2r (mm3, mm1); // add lows
245 paddw_r2r (mm4, mm2); // add highs
247 movq_m2r (*src4, mm5); // load 8 src4 bytes
248 movq_r2r (mm5, mm6); // copy 8 src4 bytes
250 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
251 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
253 paddw_r2r (mm5, mm1); // add lows
254 paddw_r2r (mm6, mm2); // add highs
256 paddw_m2r (round4, mm1); // +2 bias so the shift rounds
257 psraw_i2r (2, mm1); // /4
258 paddw_m2r (round4, mm2);
259 psraw_i2r (2, mm2); // /4
261 // now have subtotal/4 in mm1 and mm2
263 movq_m2r (*dest, mm3); // load 8 dest bytes
264 movq_r2r (mm3, mm4); // copy 8 dest bytes
266 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes
267 punpckhbw_r2r (mm0, mm4); // unpack high dest bytes
269 paddw_r2r (mm3, mm1); // add lows
270 paddw_r2r (mm4, mm2); // add highs
272 paddw_m2r (round1, mm1); // +1 bias for the final rounded /2
273 psraw_i2r (1, mm1); // /2
274 paddw_m2r (round1, mm2);
275 psraw_i2r (1, mm2); // /2
277 // now have end value in mm1 and mm2
279 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
280 movq_r2m (mm1,*dest); // store result in dest
283 //-----------------------------------------------------------------------
/* Full-pel "average" predictor: dest = rounded avg(dest, ref) for a
 * width x height block.  One 8-byte average per row, plus a second one
 * for the right half when width == 16.
 * NOTE(review): the per-row loop and stride advance are not visible in
 * this chunk. */
285 static inline void MC_avg_mmx (int width, int height,
286 yuv_data_t * dest, yuv_data_t * ref, int stride)
291 mmx_average_2_U8 (dest, dest, ref);
294 mmx_average_2_U8 (dest+8, dest+8, ref+8);
/* Exported-table entry: 16-pixel-wide full-pel average predictor. */
301 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
302 int stride, int height)
304 MC_avg_mmx (16, height, dest, ref, stride);
/* Exported-table entry: 8-pixel-wide full-pel average predictor. */
307 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
308 int stride, int height)
310 MC_avg_mmx (8, height, dest, ref, stride);
313 //-----------------------------------------------------------------------
/* Full-pel "copy" predictor: straight 8-byte (or 16-byte when
 * width == 16) copies from ref to dest, no interpolation or rounding.
 * NOTE(review): the per-row loop and stride advance are not visible in
 * this chunk. */
315 static inline void MC_put_mmx (int width, int height,
316 yuv_data_t * dest, yuv_data_t * ref, int stride)
321 movq_m2r (* ref, mm1); // load 8 ref bytes
322 movq_r2m (mm1,* dest); // store 8 bytes at curr
326 movq_m2r (* (ref+8), mm1); // load 8 ref bytes
327 movq_r2m (mm1,* (dest+8)); // store 8 bytes at curr
/* Exported-table entry: 16-pixel-wide full-pel copy predictor. */
335 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
336 int stride, int height)
338 MC_put_mmx (16, height, dest, ref, stride);
/* Exported-table entry: 8-pixel-wide full-pel copy predictor. */
341 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
342 int stride, int height)
344 MC_put_mmx (8, height, dest, ref, stride);
347 //-----------------------------------------------------------------------
349 // Half pixel interpolation in the x direction
/* Half-pel x interpolation combined with averaging into dest:
 * horizontal neighbours (ref, ref+1) are averaged, then averaged with
 * the existing dest.  NOTE(review): row loop not visible in this chunk. */
350 static inline void MC_avg_x_mmx (int width, int height,
351 yuv_data_t * dest, yuv_data_t * ref, int stride)
356 mmx_interp_average_2_U8 (dest, ref, ref+1);
359 mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
/* Exported-table entry: 16-wide half-pel-x average predictor. */
366 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
367 int stride, int height)
369 MC_avg_x_mmx (16, height, dest, ref, stride);
/* Exported-table entry: 8-wide half-pel-x average predictor. */
372 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
373 int stride, int height)
375 MC_avg_x_mmx (8, height, dest, ref, stride);
378 //-----------------------------------------------------------------------
/* Half-pel x interpolation, copy mode: dest = rounded avg of horizontal
 * neighbours (ref, ref+1).  NOTE(review): row loop not visible in this
 * chunk. */
380 static inline void MC_put_x_mmx (int width, int height,
381 yuv_data_t * dest, yuv_data_t * ref, int stride)
386 mmx_average_2_U8 (dest, ref, ref+1);
389 mmx_average_2_U8 (dest+8, ref+8, ref+9);
/* Exported-table entry: 16-wide half-pel-x copy predictor. */
396 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
397 int stride, int height)
399 MC_put_x_mmx (16, height, dest, ref, stride);
/* Exported-table entry: 8-wide half-pel-x copy predictor. */
402 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
403 int stride, int height)
405 MC_put_x_mmx (8, height, dest, ref, stride);
408 //-----------------------------------------------------------------------
/* Half-pel x+y interpolation combined with averaging into dest: the
 * four neighbours (ref, ref+1, ref+stride, ref+stride+1) are averaged,
 * then averaged with the existing dest.  NOTE(review): row loop and
 * pointer advance not visible in this chunk. */
410 static inline void MC_avg_xy_mmx (int width, int height,
411 yuv_data_t * dest, yuv_data_t * ref, int stride)
413 yuv_data_t * ref_next = ref+stride; // row directly below ref
418 mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
421 mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
422 ref_next+8, ref_next+9);
/* Exported-table entry: 16-wide half-pel-xy average predictor. */
430 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
431 int stride, int height)
433 MC_avg_xy_mmx (16, height, dest, ref, stride);
/* Exported-table entry: 8-wide half-pel-xy average predictor. */
436 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
437 int stride, int height)
439 MC_avg_xy_mmx (8, height, dest, ref, stride);
442 //-----------------------------------------------------------------------
/* Half-pel x+y interpolation, copy mode: dest = rounded average of the
 * four neighbours (ref, ref+1, ref+stride, ref+stride+1).
 * NOTE(review): row loop and pointer advance not visible in this chunk. */
444 static inline void MC_put_xy_mmx (int width, int height,
445 yuv_data_t * dest, yuv_data_t * ref, int stride)
447 yuv_data_t * ref_next = ref+stride; // row directly below ref
452 mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
455 mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
/* Exported-table entry: 16-wide half-pel-xy copy predictor. */
463 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
464 int stride, int height)
466 MC_put_xy_mmx (16, height, dest, ref, stride);
/* Exported-table entry: 8-wide half-pel-xy copy predictor. */
469 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
470 int stride, int height)
472 MC_put_xy_mmx (8, height, dest, ref, stride);
475 //-----------------------------------------------------------------------
/* Half-pel y interpolation combined with averaging into dest: vertical
 * neighbours (ref, ref+stride) are averaged, then averaged with the
 * existing dest.  NOTE(review): row loop not visible in this chunk. */
477 static inline void MC_avg_y_mmx (int width, int height,
478 yuv_data_t * dest, yuv_data_t * ref, int stride)
480 yuv_data_t * ref_next = ref+stride; // row directly below ref
485 mmx_interp_average_2_U8 (dest, ref, ref_next);
488 mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
/* Exported-table entry: 16-wide half-pel-y average predictor. */
496 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
497 int stride, int height)
499 MC_avg_y_mmx (16, height, dest, ref, stride);
/* Exported-table entry: 8-wide half-pel-y average predictor. */
502 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
503 int stride, int height)
505 MC_avg_y_mmx (8, height, dest, ref, stride);
508 //-----------------------------------------------------------------------
/* Half-pel y interpolation, copy mode: dest = rounded avg of vertical
 * neighbours (ref, ref+stride).  NOTE(review): row loop not visible in
 * this chunk. */
510 static inline void MC_put_y_mmx (int width, int height,
511 yuv_data_t * dest, yuv_data_t * ref, int stride)
513 yuv_data_t * ref_next = ref+stride; // row directly below ref
518 mmx_average_2_U8 (dest, ref, ref_next);
521 mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
/* Exported-table entry: 16-wide half-pel-y copy predictor. */
529 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
530 int stride, int height)
532 MC_put_y_mmx (16, height, dest, ref, stride);
/* Exported-table entry: 8-wide half-pel-y copy predictor. */
535 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
536 int stride, int height)
538 MC_put_y_mmx (8, height, dest, ref, stride);
542 /*****************************************************************************
543 * Functions exported as capabilities. They are declared as static so that
544 * we don't pollute the namespace too much.
545 *****************************************************************************/
/* Fill the module's motion-compensation function table.
 * ppppf_motion is a static 2x2x4 pointer table (16 entries, matching the
 * memcpy below): first index selects copy ("put") vs. average functions,
 * the middle index selects the 16- vs. 8-pixel-wide variant, and the last
 * index selects full-pel / half-pel-x / half-pel-y / half-pel-xy.
 * NOTE(review): the initializer's brace structure is not visible in this
 * chunk. */
546 static void motion_getfunctions( function_list_t * p_function_list )
548 static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
552 /* Copying functions */
555 MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
559 MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
563 /* Averaging functions */
566 MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
570 MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
575 #define list p_function_list->functions.motion
576 memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );