1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.17.2.1 2002/06/02 23:17:44 sam Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
9 * Vladimir Chernyshov <greengrass@writeme.com>
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
24 *****************************************************************************/
26 /*****************************************************************************
28 *****************************************************************************/
29 #include <stdlib.h> /* malloc(), free() */
32 #include <videolan/vlc.h>
36 /*****************************************************************************
37 * Local and extern prototypes.
38 *****************************************************************************/
39 static void motion_getfunctions( function_list_t * p_function_list );
41 /*****************************************************************************
42 * Build configuration tree.
43 *****************************************************************************/
/* Module descriptor: registers this plugin as a MOTION capability with
 * priority 150, requires MMX at runtime, selectable via the "motionmmx"
 * shortcut.  NOTE(review): the MODULE_CONFIG_*/ /* MODULE_ACTIVATE_* macro
 * lines that normally bracket these statements are missing from this
 * listing — confirm against the original file. */
48 SET_DESCRIPTION( _("MMX motion compensation module") )
49 ADD_CAPABILITY( MOTION, 150 )
50 ADD_REQUIREMENT( MMX )
52 ADD_SHORTCUT( "motionmmx" )
/* On activation, publish the motion-compensation dispatch table. */
56 motion_getfunctions( &p_module->p_functions->motion );
59 MODULE_DEACTIVATE_START
60 MODULE_DEACTIVATE_STOP
62 /*****************************************************************************
63 * Motion compensation in MMX
64 *****************************************************************************/
66 // some rounding constants
/* Four packed 16-bit words each: round1 adds 1 before a >>1 (average of 2),
 * round4 adds 2 before a >>2 (average of 4) — round-half-up behaviour.
 * NOTE(review): non-static globals; could be 'static' if no other
 * translation unit references them — confirm before changing linkage. */
67 mmx_t round1 = {0x0001000100010001LL};
68 mmx_t round4 = {0x0002000200020002LL};
71 * This code should probably be compiled with loop unrolling
72 * (i.e., -funroll-loops in gcc) because some of the loops
73 * use a small static number of iterations. This was written
74 * with the assumption the compiler knows best about when
/* Presumably clears mm0 so the unpack helpers below can use it as a zero
 * register — the body is not visible in this listing; confirm against the
 * original source. */
78 static inline void mmx_zero_reg ()
/* Byte-wise rounded average of two 8-byte groups:
 *   dest[i] = (src1[i] + src2[i] + 1) / 2   for i = 0..7
 * computed at byte precision (no unpack to words): mask7f keeps the low
 * 7 bits of each half-summed operand, mask1 recovers the rounding carry.
 * NOTE(review): the shift/copy instructions that define mm2 before its
 * first use are missing from this listing — mm2 is consumed at line 104
 * with no visible producer; confirm against the original source. */
84 static inline void mmx_average_2_U8 (yuv_data_t * dest,
85 yuv_data_t * src1, yuv_data_t * src2)
88 // *dest = (*src1 + *src2 + 1)/ 2;
90 static mmx_t mask1 = {0x0101010101010101LL};
91 static mmx_t mask7f = {0x7f7f7f7f7f7f7f7fLL};
93 movq_m2r (*src1, mm1); // load 8 src1 bytes
96 pand_m2r (mask7f, mm1); // drop top bit of each (pre-shifted) byte
98 movq_m2r (*src2, mm3); // load 8 src2 bytes
101 pand_m2r (mask7f, mm3);
103 paddb_r2r (mm1, mm3); // sum the two 7-bit halves
104 pand_m2r (mask1, mm2); // isolate per-byte rounding bit
105 paddb_r2r (mm3, mm2); // add rounding bit to the average
106 movq_r2m (mm2, *dest); // store result in dest
/* Rounded average of dest with the rounded average of src1/src2:
 *   dest[i] = (dest[i] + (src1[i] + src2[i] + 1)/2 + 1) / 2   for i = 0..7
 * Bytes are unpacked to 16-bit words (low halves in odd regs, high halves
 * in even regs) so the sums cannot overflow, then repacked with saturation.
 * Assumes mm0 == 0 — presumably set by mmx_zero_reg() in the (not shown)
 * caller prologue; confirm. */
109 static inline void mmx_interp_average_2_U8 (yuv_data_t * dest,
110 yuv_data_t * src1, yuv_data_t * src2)
113 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
116 movq_m2r (*dest, mm1); // load 8 dest bytes
117 movq_r2r (mm1, mm2); // copy 8 dest bytes
119 movq_m2r (*src1, mm3); // load 8 src1 bytes
120 movq_r2r (mm3, mm4); // copy 8 src1 bytes
122 movq_m2r (*src2, mm5); // load 8 src2 bytes
123 movq_r2r (mm5, mm6); // copy 8 src2 bytes
125 punpcklbw_r2r (mm0, mm1); // unpack low dest bytes
126 punpckhbw_r2r (mm0, mm2); // unpack high dest bytes
128 punpcklbw_r2r (mm0, mm3); // unpack low src1 bytes
129 punpckhbw_r2r (mm0, mm4); // unpack high src1 bytes
131 punpcklbw_r2r (mm0, mm5); // unpack low src2 bytes
132 punpckhbw_r2r (mm0, mm6); // unpack high src2 bytes
134 paddw_r2r (mm5, mm3); // add lows
135 paddw_m2r (round1, mm3); // +1 for round-half-up
136 psraw_i2r (1, mm3); // /2
138 paddw_r2r (mm6, mm4); // add highs
139 paddw_m2r (round1, mm4);
140 psraw_i2r (1, mm4); // /2
142 paddw_r2r (mm3, mm1); // add lows
143 paddw_m2r (round1, mm1);
144 psraw_i2r (1, mm1); // /2
146 paddw_r2r (mm4, mm2); // add highs
147 paddw_m2r (round1, mm2);
148 psraw_i2r (1, mm2); // /2
150 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
151 movq_r2m (mm1, *dest); // store result in dest
/* Rounded average of four 8-byte groups:
 *   dest[i] = (src1[i] + src2[i] + src3[i] + src4[i] + 2) / 4   for i = 0..7
 * Word-precision accumulation in mm1 (low half) / mm2 (high half), then
 * saturating repack.  Assumes mm0 == 0 (see mmx_zero_reg) — confirm. */
154 static inline void mmx_average_4_U8 (yuv_data_t * dest,
155 yuv_data_t * src1, yuv_data_t * src2,
156 yuv_data_t * src3, yuv_data_t * src4)
159 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
162 movq_m2r (*src1, mm1); // load 8 src1 bytes
163 movq_r2r (mm1, mm2); // copy 8 src1 bytes
165 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
166 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
168 movq_m2r (*src2, mm3); // load 8 src2 bytes
169 movq_r2r (mm3, mm4); // copy 8 src2 bytes
171 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
172 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
174 paddw_r2r (mm3, mm1); // add lows
175 paddw_r2r (mm4, mm2); // add highs
177 // now have partials in mm1 and mm2
179 movq_m2r (*src3, mm3); // load 8 src3 bytes
180 movq_r2r (mm3, mm4); // copy 8 src3 bytes
182 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
183 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
185 paddw_r2r (mm3, mm1); // add lows
186 paddw_r2r (mm4, mm2); // add highs
188 movq_m2r (*src4, mm5); // load 8 src4 bytes
189 movq_r2r (mm5, mm6); // copy 8 src4 bytes
191 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
192 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
194 paddw_r2r (mm5, mm1); // add lows
195 paddw_r2r (mm6, mm2); // add highs
197 // now have subtotal in mm1 and mm2
199 paddw_m2r (round4, mm1); // +2 for round-half-up
200 psraw_i2r (2, mm1); // /4
201 paddw_m2r (round4, mm2);
202 psraw_i2r (2, mm2); // /4
204 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
205 movq_r2m (mm1, *dest); // store result in dest
/* Rounded average of dest with the 4-way rounded average of src1..src4:
 *   dest[i] = (dest[i] + (src1[i]+src2[i]+src3[i]+src4[i]+2)/4 + 1) / 2
 * Same word-precision low/high split as mmx_average_4_U8, with a final
 * rounded halving against dest.  Assumes mm0 == 0 — confirm. */
216 movq_m2r (*src1, mm1); // load 8 src1 bytes
217 movq_r2r (mm1, mm2); // copy 8 src1 bytes
219 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
220 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
222 movq_m2r (*src2, mm3); // load 8 src2 bytes
223 movq_r2r (mm3, mm4); // copy 8 src2 bytes
225 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
226 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
228 paddw_r2r (mm3, mm1); // add lows
229 paddw_r2r (mm4, mm2); // add highs
231 // now have partials in mm1 and mm2
233 movq_m2r (*src3, mm3); // load 8 src3 bytes
234 movq_r2r (mm3, mm4); // copy 8 src3 bytes
236 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
237 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
239 paddw_r2r (mm3, mm1); // add lows
240 paddw_r2r (mm4, mm2); // add highs
242 movq_m2r (*src4, mm5); // load 8 src4 bytes
243 movq_r2r (mm5, mm6); // copy 8 src4 bytes
245 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
246 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
248 paddw_r2r (mm5, mm1); // add lows
249 paddw_r2r (mm6, mm2); // add highs
251 paddw_m2r (round4, mm1); // +2 for round-half-up
252 psraw_i2r (2, mm1); // /4
253 paddw_m2r (round4, mm2);
254 psraw_i2r (2, mm2); // /4
256 // now have subtotal/4 in mm1 and mm2
258 movq_m2r (*dest, mm3); // load 8 dest bytes
259 movq_r2r (mm3, mm4); // copy 8 dest bytes
261 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes
262 punpckhbw_r2r (mm0, mm4); // unpack high dest bytes
264 paddw_r2r (mm3, mm1); // add lows
265 paddw_r2r (mm4, mm2); // add highs
267 paddw_m2r (round1, mm1); // +1 for round-half-up
268 psraw_i2r (1, mm1); // /2
269 paddw_m2r (round1, mm2);
270 psraw_i2r (1, mm2); // /2
272 // now have end value in mm1 and mm2
274 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
275 movq_r2m (mm1,*dest); // store result in dest
278 //-----------------------------------------------------------------------
/* Full-pel "avg" predictor: dest = rounded average of dest and ref, one
 * 8-byte group (plus a second group when width == 16).  NOTE(review): the
 * per-row loop, the width test, and the dest/ref += stride advances are
 * missing from this listing — confirm against the original source. */
280 static inline void MC_avg_mmx (int width, int height,
281 yuv_data_t * dest, yuv_data_t * ref, int stride)
286 mmx_average_2_U8 (dest, dest, ref);
289 mmx_average_2_U8 (dest+8, dest+8, ref+8);
/* 16-pixel-wide full-pel averaging predictor (exported via dispatch table). */
296 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
297 int stride, int height)
299 MC_avg_mmx (16, height, dest, ref, stride);
/* 8-pixel-wide full-pel averaging predictor (chroma-sized blocks). */
302 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
303 int stride, int height)
305 MC_avg_mmx (8, height, dest, ref, stride);
308 //-----------------------------------------------------------------------
/* Full-pel "put" predictor: straight 8-byte (or 16-byte) copy from ref to
 * dest via an MMX register.  NOTE(review): loop/width/stride-advance lines
 * are missing from this listing — confirm against the original source. */
310 static inline void MC_put_mmx (int width, int height,
311 yuv_data_t * dest, yuv_data_t * ref, int stride)
316 movq_m2r (* ref, mm1); // load 8 ref bytes
317 movq_r2m (mm1,* dest); // store 8 bytes at curr
321 movq_m2r (* (ref+8), mm1); // load 8 ref bytes
322 movq_r2m (mm1,* (dest+8)); // store 8 bytes at curr
/* 16-pixel-wide full-pel copy predictor. */
330 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
331 int stride, int height)
333 MC_put_mmx (16, height, dest, ref, stride);
/* 8-pixel-wide full-pel copy predictor. */
336 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
337 int stride, int height)
339 MC_put_mmx (8, height, dest, ref, stride);
342 //-----------------------------------------------------------------------
344 // Half pixel interpolation in the x direction
/* Horizontal half-pel "avg" predictor: dest averaged with the x-interpolated
 * reference (ref[i], ref[i+1]).  NOTE(review): loop/width/stride-advance
 * lines are missing from this listing. */
345 static inline void MC_avg_x_mmx (int width, int height,
346 yuv_data_t * dest, yuv_data_t * ref, int stride)
351 mmx_interp_average_2_U8 (dest, ref, ref+1);
354 mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
/* 16-wide horizontal half-pel averaging predictor. */
361 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
362 int stride, int height)
364 MC_avg_x_mmx (16, height, dest, ref, stride);
/* 8-wide horizontal half-pel averaging predictor. */
367 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
368 int stride, int height)
370 MC_avg_x_mmx (8, height, dest, ref, stride);
373 //-----------------------------------------------------------------------
/* Horizontal half-pel "put" predictor: dest = rounded average of ref[i]
 * and ref[i+1].  NOTE(review): loop/width/stride-advance lines are missing
 * from this listing. */
375 static inline void MC_put_x_mmx (int width, int height,
376 yuv_data_t * dest, yuv_data_t * ref, int stride)
381 mmx_average_2_U8 (dest, ref, ref+1);
384 mmx_average_2_U8 (dest+8, ref+8, ref+9);
/* 16-wide horizontal half-pel copy predictor. */
391 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
392 int stride, int height)
394 MC_put_x_mmx (16, height, dest, ref, stride);
/* 8-wide horizontal half-pel copy predictor. */
397 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
398 int stride, int height)
400 MC_put_x_mmx (8, height, dest, ref, stride);
403 //-----------------------------------------------------------------------
/* Diagonal (x+y) half-pel averaging predictor over an 8-byte column:
 * per row, dest = (dest + (ref[0]+ref[1]+ref[stride]+ref[stride+1]+2)/4
 *                  + 1) / 2.
 * The previous row's (ref[0]+ref[1]) word sums are kept in mm1/mm2 across
 * iterations so each source row is loaded only once.  mm7 caches round4;
 * mm0 is assumed zero.  NOTE(review): the loop construct, the register
 * copies that produce the high-half operands (mm2, mm4, mm6) and the
 * ref/dest stride advances are missing from this listing — confirm
 * against the original source. */
405 static inline void MC_avg_xy_8wide_mmx (int height, yuv_data_t * dest,
406 yuv_data_t * ref, int stride)
409 movq_m2r (round4, mm7);
411 movq_m2r (*ref, mm1); // calculate first row ref[0] + ref[1]
414 punpcklbw_r2r (mm0, mm1);
415 punpckhbw_r2r (mm0, mm2);
417 movq_m2r (*(ref+1), mm3);
420 punpcklbw_r2r (mm0, mm3);
421 punpckhbw_r2r (mm0, mm4);
423 paddw_r2r (mm3, mm1);
424 paddw_r2r (mm4, mm2);
430 movq_m2r (*ref, mm5); // calculate next row ref[0] + ref[1]
433 punpcklbw_r2r (mm0, mm5);
434 punpckhbw_r2r (mm0, mm6);
436 movq_m2r (*(ref+1), mm3);
439 punpcklbw_r2r (mm0, mm3);
440 punpckhbw_r2r (mm0, mm4);
442 paddw_r2r (mm3, mm5);
443 paddw_r2r (mm4, mm6);
445 movq_r2r (mm7, mm3); // calculate round4 + previous row + current row
448 paddw_r2r (mm1, mm3);
449 paddw_r2r (mm2, mm4);
451 paddw_r2r (mm5, mm3);
452 paddw_r2r (mm6, mm4);
454 psraw_i2r (2, mm3); // /4
455 psraw_i2r (2, mm4); // /4
457 movq_m2r (*dest, mm1); // calculate (subtotal + dest[0] + round1) / 2
460 punpcklbw_r2r (mm0, mm1);
461 punpckhbw_r2r (mm0, mm2);
463 paddw_r2r (mm1, mm3);
464 paddw_r2r (mm2, mm4);
466 paddw_m2r (round1, mm3);
467 paddw_m2r (round1, mm4);
469 psraw_i2r (1, mm3); // /2
470 psraw_i2r (1, mm4); // /2
472 packuswb_r2r (mm4, mm3); // pack (w/ saturation)
473 movq_r2m (mm3, *dest); // store result in dest
475 movq_r2r (mm5, mm1); // remember current row for the next pass
/* 16-wide diagonal half-pel averaging predictor: two independent 8-wide
 * column passes (left half, then right half at offset +8). */
484 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
485 int stride, int height)
487 MC_avg_xy_8wide_mmx(height, dest, ref, stride);
488 MC_avg_xy_8wide_mmx(height, dest+8, ref+8, stride);
/* 8-wide diagonal half-pel averaging predictor. */
491 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
492 int stride, int height)
494 MC_avg_xy_8wide_mmx(height, dest, ref, stride);
497 //-----------------------------------------------------------------------
/* Diagonal (x+y) half-pel copy predictor over an 8-byte column:
 * per row, dest = (ref[0]+ref[1]+ref[stride]+ref[stride+1]+2) / 4.
 * Same row-caching scheme as MC_avg_xy_8wide_mmx but without the final
 * blend with dest.  mm7 caches round4; mm0 is assumed zero.
 * NOTE(review): the loop construct, the high-half register copies
 * (mm2, mm4, mm6) and the ref/dest stride advances are missing from this
 * listing — confirm against the original source. */
499 static inline void MC_put_xy_8wide_mmx (int height, yuv_data_t * dest,
500 yuv_data_t * ref, int stride)
503 movq_m2r (round4, mm7);
505 movq_m2r (*ref, mm1); // calculate first row ref[0] + ref[1]
508 punpcklbw_r2r (mm0, mm1);
509 punpckhbw_r2r (mm0, mm2);
511 movq_m2r (*(ref+1), mm3);
514 punpcklbw_r2r (mm0, mm3);
515 punpckhbw_r2r (mm0, mm4);
517 paddw_r2r (mm3, mm1);
518 paddw_r2r (mm4, mm2);
524 movq_m2r (*ref, mm5); // calculate next row ref[0] + ref[1]
527 punpcklbw_r2r (mm0, mm5);
528 punpckhbw_r2r (mm0, mm6);
530 movq_m2r (*(ref+1), mm3);
533 punpcklbw_r2r (mm0, mm3);
534 punpckhbw_r2r (mm0, mm4);
536 paddw_r2r (mm3, mm5);
537 paddw_r2r (mm4, mm6);
539 movq_r2r (mm7, mm3); // calculate round4 + previous row + current row
542 paddw_r2r (mm1, mm3);
543 paddw_r2r (mm2, mm4);
545 paddw_r2r (mm5, mm3);
546 paddw_r2r (mm6, mm4);
548 psraw_i2r (2, mm3); // /4
549 psraw_i2r (2, mm4); // /4
551 packuswb_r2r (mm4, mm3); // pack (w/ saturation)
552 movq_r2m (mm3, *dest); // store result in dest
554 movq_r2r (mm5, mm1); // advance to the next row
/* 16-wide diagonal half-pel copy predictor: two 8-wide column passes. */
563 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
564 int stride, int height)
566 MC_put_xy_8wide_mmx(height, dest, ref, stride);
567 MC_put_xy_8wide_mmx(height, dest + 8, ref + 8, stride);
/* 8-wide diagonal half-pel copy predictor. */
570 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
571 int stride, int height)
573 MC_put_xy_8wide_mmx(height, dest, ref, stride);
576 //-----------------------------------------------------------------------
/* Vertical half-pel "avg" predictor: dest averaged with the y-interpolated
 * reference (current row, next row).  NOTE(review): loop/width/stride-
 * advance lines are missing from this listing. */
578 static inline void MC_avg_y_mmx (int width, int height,
579 yuv_data_t * dest, yuv_data_t * ref, int stride)
581 yuv_data_t * ref_next = ref+stride; // row directly below ref
586 mmx_interp_average_2_U8 (dest, ref, ref_next);
589 mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
/* 16-wide vertical half-pel averaging predictor. */
597 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
598 int stride, int height)
600 MC_avg_y_mmx (16, height, dest, ref, stride);
/* 8-wide vertical half-pel averaging predictor. */
603 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
604 int stride, int height)
606 MC_avg_y_mmx (8, height, dest, ref, stride);
609 //-----------------------------------------------------------------------
/* Vertical half-pel "put" predictor: dest = rounded average of the current
 * row and the row below.  NOTE(review): loop/width/stride-advance lines
 * are missing from this listing. */
611 static inline void MC_put_y_mmx (int width, int height,
612 yuv_data_t * dest, yuv_data_t * ref, int stride)
614 yuv_data_t * ref_next = ref+stride; // row directly below ref
619 mmx_average_2_U8 (dest, ref, ref_next);
622 mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
/* 16-wide vertical half-pel copy predictor. */
630 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
631 int stride, int height)
633 MC_put_y_mmx (16, height, dest, ref, stride);
/* 8-wide vertical half-pel copy predictor. */
636 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
637 int stride, int height)
639 MC_put_y_mmx (8, height, dest, ref, stride);
643 /*****************************************************************************
644 * Functions exported as capabilities. They are declared as static so that
645 * we don't pollute the namespace too much.
646 *****************************************************************************/
/* Fills the caller's function table with the 16 predictors above, indexed
 * [put|avg][16|8 wide][full-pel | x | y | xy half-pel].  The 2x2x4 static
 * array is copied wholesale (16 pointers) into the capability struct.
 * NOTE(review): the array braces and trailing signature text are missing
 * from this listing — confirm against the original source. */
647 static void motion_getfunctions( function_list_t * p_function_list )
649 static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
653 /* Copying functions */
656 MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
660 MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
664 /* Averaging functions */
667 MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
671 MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
676 #define list p_function_list->functions.motion
677 memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );