1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.20 2002/07/31 20:56:52 sam Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
9 * Vladimir Chernyshov <greengrass@writeme.com>
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
24 *****************************************************************************/
26 /*****************************************************************************
28 *****************************************************************************/
29 #include <stdlib.h> /* malloc(), free() */
36 /*****************************************************************************
38 *****************************************************************************/
// Module activation callback; defined at the bottom of this file and
// registered via set_callbacks() in the plugin descriptor below.
39 static int Open ( vlc_object_t * );
41 /*****************************************************************************
43 *****************************************************************************/
// VLC plugin descriptor (the vlc_module_begin/vlc_module_end lines are not
// visible in this extraction).  Score 150 presumably outranks the generic C
// motion-compensation module so this one is picked on MMX-capable CPUs.
45 set_description( _("MMX motion compensation module") );
46 set_capability( "motion compensation", 150 );
// Only load when the CPU advertises MMX support.
47 add_requirement( MMX );
48 add_shortcut( "mmx" );
// Open publishes the function table; no deactivation callback is needed.
49 set_callbacks( Open, NULL );
52 /*****************************************************************************
53 * Motion compensation in MMX
54 *****************************************************************************/
56 // some rounding constants
// Four packed 16-bit words each: round1 adds 1 before a >>1 (round-to-nearest
// halving), round4 adds 2 before a >>2 (round-to-nearest quartering).
57 mmx_t round1 = {0x0001000100010001LL};
58 mmx_t round4 = {0x0002000200020002LL};
61 * This code should probably be compiled with loop unrolling
62 * (i.e., -funroll-loops in gcc) because some of the loops
63 * use a small static number of iterations. This was written
64 * with the assumption the compiler knows best about when
// Presumably zeroes mm0, which the helpers below rely on as a packed-zero
// source for the punpck*bw unpacking — the one-line body is not visible in
// this extraction (TODO confirm: pxor mm0,mm0).
68 static inline void mmx_zero_reg ()
// Byte-wise rounded average of two 8-byte rows: *dest = (*src1 + *src2 + 1)/2.
// The 0x7f mask and 0x01 mask suggest the shift-and-carry averaging trick
// (avg = (a>>1) + (b>>1) + carry) that avoids widening to 16-bit words, but
// several instructions (the shifts/copies/add) are missing from this
// extraction, so only the visible lines are annotated — verify against the
// full source before relying on the exact register flow.
74 static inline void mmx_average_2_U8 (yuv_data_t * dest,
75 yuv_data_t * src1, yuv_data_t * src2)
78 // *dest = (*src1 + *src2 + 1)/ 2;
// Masks: 0x01 per byte isolates the rounding carry, 0x7f per byte clears the
// bit shifted in from the neighbouring byte after a 64-bit right shift.
80 static mmx_t mask1 = {0x0101010101010101LL};
81 static mmx_t mask7f = {0x7f7f7f7f7f7f7f7fLL};
83 movq_m2r (*src1, mm1); // load 8 src1 bytes
86 pand_m2r (mask7f, mm1);
88 movq_m2r (*src2, mm3); // load 8 src2 bytes
91 pand_m2r (mask7f, mm3);
94 pand_m2r (mask1, mm2);
96 movq_r2m (mm2, *dest); // store result in dest
// Rounded average of two sources, then averaged into dest ("interp average"):
// *dest = (*dest + (*src1 + *src2 + 1)/2 + 1)/2, per byte, 8 bytes at a time.
// Bytes are widened to 16-bit words (low/high halves in separate registers)
// so the sums cannot overflow, then repacked with unsigned saturation.
// mm0 is assumed to hold zero (see mmx_zero_reg) for the unpacks.
99 static inline void mmx_interp_average_2_U8 (yuv_data_t * dest,
100 yuv_data_t * src1, yuv_data_t * src2)
103 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
106 movq_m2r (*dest, mm1); // load 8 dest bytes
107 movq_r2r (mm1, mm2); // copy 8 dest bytes
109 movq_m2r (*src1, mm3); // load 8 src1 bytes
110 movq_r2r (mm3, mm4); // copy 8 src1 bytes
112 movq_m2r (*src2, mm5); // load 8 src2 bytes
113 movq_r2r (mm5, mm6); // copy 8 src2 bytes
115 punpcklbw_r2r (mm0, mm1); // unpack low dest bytes
116 punpckhbw_r2r (mm0, mm2); // unpack high dest bytes
118 punpcklbw_r2r (mm0, mm3); // unpack low src1 bytes
119 punpckhbw_r2r (mm0, mm4); // unpack high src1 bytes
121 punpcklbw_r2r (mm0, mm5); // unpack low src2 bytes
122 punpckhbw_r2r (mm0, mm6); // unpack high src2 bytes
// First stage: (src1 + src2 + 1) / 2 on low and high word halves.
124 paddw_r2r (mm5, mm3); // add lows
125 paddw_m2r (round1, mm3);
126 psraw_i2r (1, mm3); // /2
128 paddw_r2r (mm6, mm4); // add highs
129 paddw_m2r (round1, mm4);
130 psraw_i2r (1, mm4); // /2
// Second stage: (dest + stage1 + 1) / 2.
132 paddw_r2r (mm3, mm1); // add lows
133 paddw_m2r (round1, mm1);
134 psraw_i2r (1, mm1); // /2
136 paddw_r2r (mm4, mm2); // add highs
137 paddw_m2r (round1, mm2);
138 psraw_i2r (1, mm2); // /2
140 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
141 movq_r2m (mm1, *dest); // store result in dest
// Rounded average of four 8-byte rows:
// *dest = (*src1 + *src2 + *src3 + *src4 + 2)/4, per byte.
// All four operands are widened to 16-bit words (low/high halves) so the
// four-way sum cannot overflow, then the result is repacked with unsigned
// saturation.  mm0 is assumed to hold zero (see mmx_zero_reg).
144 static inline void mmx_average_4_U8 (yuv_data_t * dest,
145 yuv_data_t * src1, yuv_data_t * src2,
146 yuv_data_t * src3, yuv_data_t * src4)
149 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
152 movq_m2r (*src1, mm1); // load 8 src1 bytes
153 movq_r2r (mm1, mm2); // copy 8 src1 bytes
155 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
156 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
158 movq_m2r (*src2, mm3); // load 8 src2 bytes
159 movq_r2r (mm3, mm4); // copy 8 src2 bytes
161 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
162 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
164 paddw_r2r (mm3, mm1); // add lows
165 paddw_r2r (mm4, mm2); // add highs
167 // now have partials in mm1 and mm2
169 movq_m2r (*src3, mm3); // load 8 src3 bytes
170 movq_r2r (mm3, mm4); // copy 8 src3 bytes
172 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
173 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
175 paddw_r2r (mm3, mm1); // add lows
176 paddw_r2r (mm4, mm2); // add highs
178 movq_m2r (*src4, mm5); // load 8 src4 bytes
179 movq_r2r (mm5, mm6); // copy 8 src4 bytes
181 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
182 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
184 paddw_r2r (mm5, mm1); // add lows
185 paddw_r2r (mm6, mm2); // add highs
187 // now have subtotal in mm1 and mm2
// round4 (= 2 per word) then >>2 gives round-to-nearest division by 4.
189 paddw_m2r (round4, mm1);
190 psraw_i2r (2, mm1); // /4
191 paddw_m2r (round4, mm2);
192 psraw_i2r (2, mm2); // /4
194 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
195 movq_r2m (mm1, *dest); // store result in dest
// Four-way rounded average, then averaged into dest:
// *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2, per byte.
// Same widen-to-words / saturating-repack scheme as the helpers above;
// mm0 is assumed to hold zero (see mmx_zero_reg).
198 static inline void mmx_interp_average_4_U8 (yuv_data_t * dest,
199 yuv_data_t * src1, yuv_data_t * src2,
200 yuv_data_t * src3, yuv_data_t * src4)
203 // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
206 movq_m2r (*src1, mm1); // load 8 src1 bytes
207 movq_r2r (mm1, mm2); // copy 8 src1 bytes
209 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
210 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
212 movq_m2r (*src2, mm3); // load 8 src2 bytes
213 movq_r2r (mm3, mm4); // copy 8 src2 bytes
215 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
216 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
218 paddw_r2r (mm3, mm1); // add lows
219 paddw_r2r (mm4, mm2); // add highs
221 // now have partials in mm1 and mm2
223 movq_m2r (*src3, mm3); // load 8 src3 bytes
224 movq_r2r (mm3, mm4); // copy 8 src3 bytes
226 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
227 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
229 paddw_r2r (mm3, mm1); // add lows
230 paddw_r2r (mm4, mm2); // add highs
232 movq_m2r (*src4, mm5); // load 8 src4 bytes
233 movq_r2r (mm5, mm6); // copy 8 src4 bytes
235 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
236 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
238 paddw_r2r (mm5, mm1); // add lows
239 paddw_r2r (mm6, mm2); // add highs
// Divide the four-way sum by 4 with rounding (round4 = 2 per word).
241 paddw_m2r (round4, mm1);
242 psraw_i2r (2, mm1); // /4
243 paddw_m2r (round4, mm2);
244 psraw_i2r (2, mm2); // /4
246 // now have subtotal/4 in mm1 and mm2
248 movq_m2r (*dest, mm3); // load 8 dest bytes
249 movq_r2r (mm3, mm4); // copy 8 dest bytes
251 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes
252 punpckhbw_r2r (mm0, mm4); // unpack high dest bytes
254 paddw_r2r (mm3, mm1); // add lows
255 paddw_r2r (mm4, mm2); // add highs
// Final rounded halving against dest (round1 = 1 per word).
257 paddw_m2r (round1, mm1);
258 psraw_i2r (1, mm1); // /2
259 paddw_m2r (round1, mm2);
260 psraw_i2r (1, mm2); // /2
262 // now have end value in mm1 and mm2
264 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
265 movq_r2m (mm1,*dest); // store result in dest
268 //-----------------------------------------------------------------------
// Full-pel prediction averaged into dest: dest = avg(dest, ref), one 8-byte
// group per call; the second call covers bytes 8..15 when width == 16.
// NOTE(review): the row loop and the per-row pointer advances (presumably
// dest += stride; ref += stride) are missing from this extraction.
270 static inline void MC_avg_mmx (int width, int height,
271 yuv_data_t * dest, yuv_data_t * ref, int stride)
276 mmx_average_2_U8 (dest, dest, ref);
279 mmx_average_2_U8 (dest+8, dest+8, ref+8);
// 16-pixel-wide and 8-pixel-wide entry points; these match the
// (dest, ref, stride, height) signature used by the ppppf_motion table below.
286 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
287 int stride, int height)
289 MC_avg_mmx (16, height, dest, ref, stride);
292 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
293 int stride, int height)
295 MC_avg_mmx (8, height, dest, ref, stride);
298 //-----------------------------------------------------------------------
// Full-pel copy: dest = ref, 8 bytes per movq pair; the second pair handles
// bytes 8..15 when width == 16.  NOTE(review): the row loop and per-row
// pointer advances are missing from this extraction.
300 static inline void MC_put_mmx (int width, int height,
301 yuv_data_t * dest, yuv_data_t * ref, int stride)
306 movq_m2r (* ref, mm1); // load 8 ref bytes
307 movq_r2m (mm1,* dest); // store 8 bytes at curr
311 movq_m2r (* (ref+8), mm1); // load 8 ref bytes
312 movq_r2m (mm1,* (dest+8)); // store 8 bytes at curr
// Table-signature wrappers for full-pel copy, 16 and 8 pixels wide.
320 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
321 int stride, int height)
323 MC_put_mmx (16, height, dest, ref, stride);
326 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
327 int stride, int height)
329 MC_put_mmx (8, height, dest, ref, stride);
332 //-----------------------------------------------------------------------
334 // Half pixel interpolation in the x direction
// Half-pel interpolation in x, averaged into dest:
// dest = avg(dest, avg(ref[i], ref[i+1])).  NOTE(review): the row loop and
// pointer advances are missing from this extraction.
335 static inline void MC_avg_x_mmx (int width, int height,
336 yuv_data_t * dest, yuv_data_t * ref, int stride)
341 mmx_interp_average_2_U8 (dest, ref, ref+1);
344 mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
// Table-signature wrappers for half-pel-x averaging, 16 and 8 pixels wide.
351 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
352 int stride, int height)
354 MC_avg_x_mmx (16, height, dest, ref, stride);
357 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
358 int stride, int height)
360 MC_avg_x_mmx (8, height, dest, ref, stride);
363 //-----------------------------------------------------------------------
// Half-pel interpolation in x, stored to dest: dest = avg(ref[i], ref[i+1]).
// NOTE(review): the row loop and pointer advances are missing from this
// extraction.
365 static inline void MC_put_x_mmx (int width, int height,
366 yuv_data_t * dest, yuv_data_t * ref, int stride)
371 mmx_average_2_U8 (dest, ref, ref+1);
374 mmx_average_2_U8 (dest+8, ref+8, ref+9);
// Table-signature wrappers for half-pel-x copy, 16 and 8 pixels wide.
381 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
382 int stride, int height)
384 MC_put_x_mmx (16, height, dest, ref, stride);
387 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
388 int stride, int height)
390 MC_put_x_mmx (8, height, dest, ref, stride);
393 //-----------------------------------------------------------------------
// Half-pel interpolation in both x and y over an 8-pixel-wide column,
// averaged into dest:
//   dest = (dest + (ref[0] + ref[1] + ref[stride] + ref[stride+1] + 2)/4 + 1)/2
// The word sums ref[0]+ref[1] of the previous row are kept live in mm1/mm2
// across iterations so each row is only computed once.  mm0 is assumed zero.
// NOTE(review): the row loop, the per-row ref/dest advances, and a few
// register copies (e.g. the mm6->mm2 companion of the final movq) are
// missing from this extraction — verify against the full source.
395 static inline void MC_avg_xy_8wide_mmx (int height, yuv_data_t * dest,
396 yuv_data_t * ref, int stride)
// Keep round4 resident in mm7 for the whole loop.
399 movq_m2r (round4, mm7);
401 movq_m2r (*ref, mm1); // calculate first row ref[0] + ref[1]
404 punpcklbw_r2r (mm0, mm1);
405 punpckhbw_r2r (mm0, mm2);
407 movq_m2r (*(ref+1), mm3);
410 punpcklbw_r2r (mm0, mm3);
411 punpckhbw_r2r (mm0, mm4);
413 paddw_r2r (mm3, mm1);
414 paddw_r2r (mm4, mm2);
420 movq_m2r (*ref, mm5); // calculate next row ref[0] + ref[1]
423 punpcklbw_r2r (mm0, mm5);
424 punpckhbw_r2r (mm0, mm6);
426 movq_m2r (*(ref+1), mm3);
429 punpcklbw_r2r (mm0, mm3);
430 punpckhbw_r2r (mm0, mm4);
432 paddw_r2r (mm3, mm5);
433 paddw_r2r (mm4, mm6);
435 movq_r2r (mm7, mm3); // calculate round4 + previous row + current row
438 paddw_r2r (mm1, mm3);
439 paddw_r2r (mm2, mm4);
441 paddw_r2r (mm5, mm3);
442 paddw_r2r (mm6, mm4);
444 psraw_i2r (2, mm3); // /4
445 psraw_i2r (2, mm4); // /4
447 movq_m2r (*dest, mm1); // calculate (subtotal + dest[0] + round1) / 2
450 punpcklbw_r2r (mm0, mm1);
451 punpckhbw_r2r (mm0, mm2);
453 paddw_r2r (mm1, mm3);
454 paddw_r2r (mm2, mm4);
456 paddw_m2r (round1, mm3);
457 paddw_m2r (round1, mm4);
459 psraw_i2r (1, mm3); // /2
460 psraw_i2r (1, mm4); // /2
462 packuswb_r2r (mm4, mm3); // pack (w/ saturation)
463 movq_r2m (mm3, *dest); // store result in dest
465 movq_r2r (mm5, mm1); // remember current row for the next pass
// Table-signature wrappers for half-pel-x+y averaging: the 16-wide variant
// runs the 8-wide kernel twice (left and right halves).
474 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
475 int stride, int height)
477 MC_avg_xy_8wide_mmx(height, dest, ref, stride);
478 MC_avg_xy_8wide_mmx(height, dest+8, ref+8, stride);
481 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
482 int stride, int height)
484 MC_avg_xy_8wide_mmx(height, dest, ref, stride);
487 //-----------------------------------------------------------------------
// Half-pel interpolation in both x and y over an 8-pixel-wide column,
// stored to dest (no averaging with the previous dest contents):
//   dest = (ref[0] + ref[1] + ref[stride] + ref[stride+1] + 2) / 4
// Same row-reuse structure as MC_avg_xy_8wide_mmx: the previous row's
// ref[0]+ref[1] word sums stay in mm1/mm2 between iterations.  mm0 is
// assumed zero.  NOTE(review): loop/brace lines and per-row pointer
// advances are missing from this extraction.
489 static inline void MC_put_xy_8wide_mmx (int height, yuv_data_t * dest,
490 yuv_data_t * ref, int stride)
// Keep round4 resident in mm7 for the whole loop.
493 movq_m2r (round4, mm7);
495 movq_m2r (*ref, mm1); // calculate first row ref[0] + ref[1]
498 punpcklbw_r2r (mm0, mm1);
499 punpckhbw_r2r (mm0, mm2);
501 movq_m2r (*(ref+1), mm3);
504 punpcklbw_r2r (mm0, mm3);
505 punpckhbw_r2r (mm0, mm4);
507 paddw_r2r (mm3, mm1);
508 paddw_r2r (mm4, mm2);
514 movq_m2r (*ref, mm5); // calculate next row ref[0] + ref[1]
517 punpcklbw_r2r (mm0, mm5);
518 punpckhbw_r2r (mm0, mm6);
520 movq_m2r (*(ref+1), mm3);
523 punpcklbw_r2r (mm0, mm3);
524 punpckhbw_r2r (mm0, mm4);
526 paddw_r2r (mm3, mm5);
527 paddw_r2r (mm4, mm6);
529 movq_r2r (mm7, mm3); // calculate round4 + previous row + current row
532 paddw_r2r (mm1, mm3);
533 paddw_r2r (mm2, mm4);
535 paddw_r2r (mm5, mm3);
536 paddw_r2r (mm6, mm4);
538 psraw_i2r (2, mm3); // /4
539 psraw_i2r (2, mm4); // /4
541 packuswb_r2r (mm4, mm3); // pack (w/ saturation)
542 movq_r2m (mm3, *dest); // store result in dest
544 movq_r2r (mm5, mm1); // advance to the next row
// Table-signature wrappers for half-pel-x+y copy: the 16-wide variant runs
// the 8-wide kernel twice (left and right halves).
553 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
554 int stride, int height)
556 MC_put_xy_8wide_mmx(height, dest, ref, stride);
557 MC_put_xy_8wide_mmx(height, dest + 8, ref + 8, stride);
560 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
561 int stride, int height)
563 MC_put_xy_8wide_mmx(height, dest, ref, stride);
566 //-----------------------------------------------------------------------
// Half-pel interpolation in y, averaged into dest:
// dest = avg(dest, avg(ref, ref + stride)).  NOTE(review): the row loop and
// per-row advances of dest/ref/ref_next are missing from this extraction.
568 static inline void MC_avg_y_mmx (int width, int height,
569 yuv_data_t * dest, yuv_data_t * ref, int stride)
// Pointer to the vertically adjacent row.
571 yuv_data_t * ref_next = ref+stride;
576 mmx_interp_average_2_U8 (dest, ref, ref_next);
579 mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
// Table-signature wrappers for half-pel-y averaging, 16 and 8 pixels wide.
587 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
588 int stride, int height)
590 MC_avg_y_mmx (16, height, dest, ref, stride);
593 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
594 int stride, int height)
596 MC_avg_y_mmx (8, height, dest, ref, stride);
599 //-----------------------------------------------------------------------
// Half-pel interpolation in y, stored to dest: dest = avg(ref, ref + stride).
// NOTE(review): the row loop and per-row advances are missing from this
// extraction.
601 static inline void MC_put_y_mmx (int width, int height,
602 yuv_data_t * dest, yuv_data_t * ref, int stride)
// Pointer to the vertically adjacent row.
604 yuv_data_t * ref_next = ref+stride;
609 mmx_average_2_U8 (dest, ref, ref_next);
612 mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
// Table-signature wrappers for half-pel-y copy, 16 and 8 pixels wide.
620 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
621 int stride, int height)
623 MC_put_y_mmx (16, height, dest, ref, stride);
626 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
627 int stride, int height)
629 MC_put_y_mmx (8, height, dest, ref, stride);
633 /*****************************************************************************
634 * Functions exported as capabilities. They are declared as static so that
635 * we don't pollute the namespace too much.
636 *****************************************************************************/
// Function table handed to the decoder core via Open():
//   index 0: 0 = copy ("put"), 1 = average into dest ("avg")
//   index 1: 0 = 16 pixels wide, 1 = 8 pixels wide
//   index 2: 0 = full pel, 1 = half-pel x, 2 = half-pel y, 3 = half-pel x+y
// NOTE(review): the surrounding brace lines of the initializer are missing
// from this extraction.
637 static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *, int, int ) =
639 /* Copying functions */
642 { MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx },
644 { MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx }
646 /* Averaging functions */
649 { MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx },
651 { MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx }
// Module activation: publish the MMX motion-compensation function table to
// the caller through the object's private pointer.  The return statement is
// not visible in this extraction (presumably VLC_SUCCESS — confirm).
655 static int Open ( vlc_object_t *p_this )
657 p_this->p_private = ppppf_motion;