/*****************************************************************************
 * motionmmx.c : MMX motion compensation module for vlc
 *****************************************************************************
 * Copyright (C) 2001 VideoLAN
 * $Id: motionmmx.c,v 1.19 2002/06/02 23:29:29 sam Exp $
 *
 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *          Michel Lespinasse <walken@zoy.org>
 *          Vladimir Chernyshov <greengrass@writeme.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>                                              /* memcpy() */

#include <vlc/vlc.h>

#include "mmx.h"
/*****************************************************************************
 * Local and extern prototypes.
 *****************************************************************************/
static void motion_getfunctions( function_list_t * p_function_list );
/*****************************************************************************
 * Build configuration tree.
 *****************************************************************************/
MODULE_CONFIG_START
MODULE_CONFIG_STOP

MODULE_INIT_START
    SET_DESCRIPTION( _("MMX motion compensation module") )
    ADD_CAPABILITY( MOTION, 150 )
    ADD_REQUIREMENT( MMX )
MODULE_INIT_STOP

MODULE_ACTIVATE_START
    motion_getfunctions( &p_module->p_functions->motion );
MODULE_ACTIVATE_STOP

MODULE_DEACTIVATE_START
MODULE_DEACTIVATE_STOP
/*****************************************************************************
 * Motion compensation in MMX
 *****************************************************************************/

// rounding constants: +1 before a /2, +2 before a /4 give round-to-nearest
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};

/*
 * This code should probably be compiled with loop unrolling
 * (i.e. -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations. This was written
 * with the assumption the compiler knows best about when
 * unrolling will help.
 */
static inline void mmx_zero_reg ()
{
    // load 0 into mm0, used as the zero operand by the unpack instructions
    pxor_r2r (mm0, mm0);
}
static inline void mmx_average_2_U8 (yuv_data_t * dest,
                                     yuv_data_t * src1, yuv_data_t * src2)
{
    // *dest = (*src1 + *src2 + 1) / 2;
    static mmx_t mask1 = {0x0101010101010101LL};
    static mmx_t mask7f = {0x7f7f7f7f7f7f7f7fLL};

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy them for the rounding term
    psrlq_i2r (1, mm1);           // src1 / 2
    pand_m2r (mask7f, mm1);       // clear bits shifted in across byte lanes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    por_r2r (mm3, mm2);           // mm2 = src1 | src2
    psrlq_i2r (1, mm3);           // src2 / 2
    pand_m2r (mask7f, mm3);       // clear bits shifted in across byte lanes

    paddb_r2r (mm1, mm3);         // src1/2 + src2/2
    pand_m2r (mask1, mm2);        // rounding term: (src1 | src2) & 1
    paddb_r2r (mm3, mm2);         // add rounding
    movq_r2m (mm2, *dest);        // store result in dest
}
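
/*
 * A scalar sketch of the trick used above (illustrative only, not part of
 * the original module): averaging without widening to 16 bits relies on
 * the identity (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1).
 */
#if 0
static inline void scalar_average_2_U8 (yuv_data_t * dest,
                                        yuv_data_t * src1, yuv_data_t * src2)
{
    int i;
    for( i = 0; i < 8; i++ )
        dest[i] = (src1[i] >> 1) + (src2[i] >> 1)
                  + ((src1[i] | src2[i]) & 1);
}
#endif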
static inline void mmx_interp_average_2_U8 (yuv_data_t * dest,
                                            yuv_data_t * src1, yuv_data_t * src2)
{
    // *dest = (*dest + (*src1 + *src2 + 1) / 2 + 1) / 2;

    movq_m2r (*dest, mm1);        // load 8 dest bytes
    movq_r2r (mm1, mm2);          // copy 8 dest bytes

    movq_m2r (*src1, mm3);        // load 8 src1 bytes
    movq_r2r (mm3, mm4);          // copy 8 src1 bytes

    movq_m2r (*src2, mm5);        // load 8 src2 bytes
    movq_r2r (mm5, mm6);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src2 bytes

    paddw_r2r (mm5, mm3);         // add lows
    paddw_m2r (round1, mm3);
    psraw_i2r (1, mm3);           // /2

    paddw_r2r (mm6, mm4);         // add highs
    paddw_m2r (round1, mm4);
    psraw_i2r (1, mm4);           // /2

    paddw_r2r (mm3, mm1);         // add lows
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
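
/*
 * Note (illustrative, not in the original source): the two-stage rounding
 * above is deliberate and not equal to a single (*dest + *src1 + *src2 + 2)/4.
 * e.g. dest = 0, src1 = 0, src2 = 1 gives (0 + (0+1+1)/2 + 1)/2 = 1,
 * whereas (0 + 0 + 1 + 2)/4 would give 0.
 */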
static inline void mmx_average_4_U8 (yuv_data_t * dest,
                                     yuv_data_t * src1, yuv_data_t * src2,
                                     yuv_data_t * src3, yuv_data_t * src4)
{
    // *dest = (*src1 + *src2 + *src3 + *src4 + 2) / 4;

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
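
/*
 * Hypothetical scalar reference for the above (illustrative only): the +2
 * is the round4 bias, and the 16-bit lanes easily hold the maximum sum.
 */
#if 0
static inline void scalar_average_4_U8 (yuv_data_t * dest,
                                        yuv_data_t * src1, yuv_data_t * src2,
                                        yuv_data_t * src3, yuv_data_t * src4)
{
    int i;
    for( i = 0; i < 8; i++ )
        dest[i] = (src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2;
}
#endif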
static inline void mmx_interp_average_4_U8 (yuv_data_t * dest,
                                            yuv_data_t * src1, yuv_data_t * src2,
                                            yuv_data_t * src3, yuv_data_t * src4)
{
    // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2) / 4 + 1) / 2;

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r (*dest, mm3);        // load 8 dest bytes
    movq_r2r (mm3, mm4);          // copy 8 dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high dest bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
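
/*
 * Range check (illustrative): the signed 16-bit lanes never overflow.
 * The 4-way sum peaks at 4*255 + 2 = 1022, and after the >>2 the second
 * stage adds at most 255 + 255 + 1 = 511, both well within a 16-bit word.
 */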
//-----------------------------------------------------------------------

static inline void MC_avg_mmx (int width, int height,
                               yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, dest, ref);

        if (width == 16)
            mmx_average_2_U8 (dest+8, dest+8, ref+8);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}

static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                          int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static inline void MC_put_mmx (int width, int height,
                               yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    do {
        movq_m2r (*ref, mm1);             // load 8 ref bytes
        movq_r2m (mm1, *dest);            // store 8 bytes at curr

        if (width == 16)
        {
            movq_m2r (*(ref+8), mm1);     // load 8 ref bytes
            movq_r2m (mm1, *(dest+8));    // store 8 bytes at curr
        }

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}

static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                          int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

// Half pixel interpolation in the x direction
static inline void MC_avg_x_mmx (int width, int height,
                                 yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}

static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static inline void MC_put_x_mmx (int width, int height,
                                 yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}

static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

// Half pixel interpolation in both the x and y directions
static inline void MC_avg_xy_8wide_mmx (int height, yuv_data_t * dest,
                                        yuv_data_t * ref, int stride)
{
    movq_m2r (round4, mm7);
    mmx_zero_reg ();

    movq_m2r (*ref, mm1);         // calculate first row ref[0] + ref[1]
    movq_r2r (mm1, mm2);

    punpcklbw_r2r (mm0, mm1);
    punpckhbw_r2r (mm0, mm2);

    movq_m2r (*(ref+1), mm3);
    movq_r2r (mm3, mm4);

    punpcklbw_r2r (mm0, mm3);
    punpckhbw_r2r (mm0, mm4);

    paddw_r2r (mm3, mm1);
    paddw_r2r (mm4, mm2);

    ref += stride;

    do {
        movq_m2r (*ref, mm5);     // calculate next row ref[0] + ref[1]
        movq_r2r (mm5, mm6);

        punpcklbw_r2r (mm0, mm5);
        punpckhbw_r2r (mm0, mm6);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm3, mm4);

        punpcklbw_r2r (mm0, mm3);
        punpckhbw_r2r (mm0, mm4);

        paddw_r2r (mm3, mm5);
        paddw_r2r (mm4, mm6);

        movq_r2r (mm7, mm3);      // calculate round4 + previous row + current row
        movq_r2r (mm7, mm4);

        paddw_r2r (mm1, mm3);
        paddw_r2r (mm2, mm4);

        paddw_r2r (mm5, mm3);
        paddw_r2r (mm6, mm4);

        psraw_i2r (2, mm3);       // /4
        psraw_i2r (2, mm4);       // /4

        movq_m2r (*dest, mm1);    // calculate (subtotal + dest[0] + round1) / 2
        movq_r2r (mm1, mm2);

        punpcklbw_r2r (mm0, mm1);
        punpckhbw_r2r (mm0, mm2);

        paddw_r2r (mm1, mm3);
        paddw_r2r (mm2, mm4);

        paddw_m2r (round1, mm3);
        paddw_m2r (round1, mm4);

        psraw_i2r (1, mm3);       // /2
        psraw_i2r (1, mm4);       // /2

        packuswb_r2r (mm4, mm3);  // pack (w/ saturation)
        movq_r2m (mm3, *dest);    // store result in dest

        movq_r2r (mm5, mm1);      // remember current row for the next pass
        movq_r2r (mm6, mm2);

        ref += stride;
        dest += stride;
    } while (--height);
}
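
/*
 * Illustrative scalar sketch of the row-reuse pipelining above (hypothetical
 * helper, not in the original): each row of horizontal sums ref[x] + ref[x+1]
 * is computed once and carried into the next pass instead of being reloaded.
 */
#if 0
static void scalar_avg_xy_8wide (int height, yuv_data_t * dest,
                                 yuv_data_t * ref, int stride)
{
    int i, prev[8], cur[8];
    for( i = 0; i < 8; i++ )
        prev[i] = ref[i] + ref[i+1];          // first row of horizontal sums
    ref += stride;
    do
    {
        for( i = 0; i < 8; i++ )
        {
            cur[i] = ref[i] + ref[i+1];       // current row, computed once
            dest[i] = (dest[i] + ((prev[i] + cur[i] + 2) >> 2) + 1) >> 1;
            prev[i] = cur[i];                 // reused on the next pass
        }
        ref += stride;
        dest += stride;
    } while( --height );
}
#endif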
static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                             int stride, int height)
{
    MC_avg_xy_8wide_mmx(height, dest, ref, stride);
    MC_avg_xy_8wide_mmx(height, dest+8, ref+8, stride);
}

static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_xy_8wide_mmx(height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static inline void MC_put_xy_8wide_mmx (int height, yuv_data_t * dest,
                                        yuv_data_t * ref, int stride)
{
    movq_m2r (round4, mm7);
    mmx_zero_reg ();

    movq_m2r (*ref, mm1);         // calculate first row ref[0] + ref[1]
    movq_r2r (mm1, mm2);

    punpcklbw_r2r (mm0, mm1);
    punpckhbw_r2r (mm0, mm2);

    movq_m2r (*(ref+1), mm3);
    movq_r2r (mm3, mm4);

    punpcklbw_r2r (mm0, mm3);
    punpckhbw_r2r (mm0, mm4);

    paddw_r2r (mm3, mm1);
    paddw_r2r (mm4, mm2);

    ref += stride;

    do {
        movq_m2r (*ref, mm5);     // calculate next row ref[0] + ref[1]
        movq_r2r (mm5, mm6);

        punpcklbw_r2r (mm0, mm5);
        punpckhbw_r2r (mm0, mm6);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm3, mm4);

        punpcklbw_r2r (mm0, mm3);
        punpckhbw_r2r (mm0, mm4);

        paddw_r2r (mm3, mm5);
        paddw_r2r (mm4, mm6);

        movq_r2r (mm7, mm3);      // calculate round4 + previous row + current row
        movq_r2r (mm7, mm4);

        paddw_r2r (mm1, mm3);
        paddw_r2r (mm2, mm4);

        paddw_r2r (mm5, mm3);
        paddw_r2r (mm6, mm4);

        psraw_i2r (2, mm3);       // /4
        psraw_i2r (2, mm4);       // /4

        packuswb_r2r (mm4, mm3);  // pack (w/ saturation)
        movq_r2m (mm3, *dest);    // store result in dest

        movq_r2r (mm5, mm1);      // advance to the next row
        movq_r2r (mm6, mm2);

        ref += stride;
        dest += stride;
    } while (--height);
}
static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                             int stride, int height)
{
    MC_put_xy_8wide_mmx(height, dest, ref, stride);
    MC_put_xy_8wide_mmx(height, dest + 8, ref + 8, stride);
}

static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_xy_8wide_mmx(height, dest, ref, stride);
}
//-----------------------------------------------------------------------

// Half pixel interpolation in the y direction
static inline void MC_avg_y_mmx (int width, int height,
                                 yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}

static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}
//-----------------------------------------------------------------------

static inline void MC_put_y_mmx (int width, int height,
                                 yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}

static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}
/*****************************************************************************
 * Functions exported as capabilities. They are declared as static so that
 * we don't pollute the namespace too much.
 *****************************************************************************/
static void motion_getfunctions( function_list_t * p_function_list )
{
    static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
                                           int, int ) =
    {
        {
            /* Copying functions */
            {
                /* Width == 16 */
                MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
            },
            {
                /* Width == 8 */
                MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
            }
        },
        {
            /* Averaging functions */
            {
                /* Width == 16 */
                MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
            },
            {
                /* Width == 8 */
                MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
            }
        }
    };

#define list p_function_list->functions.motion
    memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );
#undef list
}
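
/*
 * Illustrative dispatch through the table (hypothetical caller, not part of
 * this file; the index meanings follow from the initializer above):
 * [b_average][b_8wide][i_select], with i_select = (b_y_half << 1) | b_x_half.
 */
#if 0
    void (* pf_mc)( yuv_data_t *, yuv_data_t *, int, int ) =
        ppppf_motion[1][0][(1 << 1) | 0];  // averaging, width 16, y half-pel
    pf_mc( p_dest, p_ref, i_stride, 16 );  // calls MC_avg_y16_mmx
#endif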