/*****************************************************************************
 * motionmmx.c : MMX motion compensation module for vlc
 *****************************************************************************
 * Copyright (C) 2001 VideoLAN
 * $Id: motionmmx.c,v 1.3 2003/03/30 18:14:37 gbazin Exp $
 *
 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *          Michel Lespinasse <walken@zoy.org>
 *          Vladimir Chernyshov <greengrass@writeme.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include <stdlib.h>                                      /* malloc(), free() */

#include <vlc/vlc.h>

#include "mmx.h"                          /* mmx_t type and movq_*() macros */
/*****************************************************************************
 * Local prototypes.
 *****************************************************************************/
static int Open ( vlc_object_t * );
/*****************************************************************************
 * Module descriptor
 *****************************************************************************/
vlc_module_begin();
    set_description( _("MMX motion compensation") );
    set_capability( "motion compensation", 150 );
    add_requirement( MMX );
    add_shortcut( "mmx" );
    set_callbacks( Open, NULL );
vlc_module_end();
/*****************************************************************************
 * Motion compensation in MMX
 *****************************************************************************/
/* Rounding constants: four packed 16-bit words that are added before the
 * arithmetic right shifts below, so that >>1 and >>2 become rounded
 * divisions by 2 and 4. */
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
/*
 * This code should probably be compiled with loop unrolling
 * (ie, -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations. This was written
 * with the assumption the compiler knows best about when
 * unrolling will help.
 */
static inline void mmx_zero_reg ()
{
    /* load 0 into mm0 */
    pxor_r2r (mm0, mm0);
}
static inline void mmx_average_2_U8 (yuv_data_t * dest,
                                     yuv_data_t * src1, yuv_data_t * src2)
{
    /* *dest = (*src1 + *src2 + 1)/ 2; */
    static mmx_t mask1 = {0x0101010101010101LL};
    static mmx_t mask7f = {0x7f7f7f7f7f7f7f7fLL};

    movq_m2r (*src1, mm1);        /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);          /* copy 8 src1 bytes */
    psrlq_i2r (1, mm1);           /* src1 / 2 */
    pand_m2r (mask7f, mm1);       /* clear bits shifted across byte borders */

    movq_m2r (*src2, mm3);        /* load 8 src2 bytes */
    por_r2r (mm3, mm2);           /* src1 | src2 keeps the rounding bit */
    psrlq_i2r (1, mm3);           /* src2 / 2 */
    pand_m2r (mask7f, mm3);
    paddb_r2r (mm3, mm1);         /* src1/2 + src2/2 */

    pand_m2r (mask1, mm2);        /* rounding bit: (src1 | src2) & 1 */
    paddb_r2r (mm1, mm2);         /* add the rounding bit */
    movq_r2m (mm2, *dest);        /* store result in dest */
}
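
/*
 * Illustrative plain-C equivalent of the trick above (a hypothetical helper,
 * compiled out of the build): per byte, (a + b + 1) / 2 is computed as
 * (a >> 1) + (b >> 1) + ((a | b) & 1).  mask7f clears the bit that psrlq
 * would otherwise leak across byte boundaries, mask1 keeps only the
 * rounding bit of (a | b).
 */
#if 0
static void average_2_U8_c (yuv_data_t * dest,
                            const yuv_data_t * src1, const yuv_data_t * src2)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = (yuv_data_t)((src1[i] >> 1) + (src2[i] >> 1)
                               + ((src1[i] | src2[i]) & 1));
}
#endif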
static inline void mmx_interp_average_2_U8 (yuv_data_t * dest,
                                            yuv_data_t * src1, yuv_data_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);        /* load 8 dest bytes */
    movq_r2r (mm1, mm2);          /* copy 8 dest bytes */

    movq_m2r (*src1, mm3);        /* load 8 src1 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);        /* load 8 src2 bytes */
    movq_r2r (mm5, mm6);          /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm1);     /* unpack low dest bytes */
    punpckhbw_r2r (mm0, mm2);     /* unpack high dest bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src1 bytes */

    punpcklbw_r2r (mm0, mm5);     /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm6);     /* unpack high src2 bytes */

    paddw_r2r (mm5, mm3);         /* add lows */
    paddw_m2r (round1, mm3);
    psraw_i2r (1, mm3);           /* /2 */

    paddw_r2r (mm6, mm4);         /* add highs */
    paddw_m2r (round1, mm4);
    psraw_i2r (1, mm4);           /* /2 */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           /* /2 */

    paddw_r2r (mm4, mm2);         /* add highs */
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           /* /2 */

    packuswb_r2r (mm2, mm1);      /* pack (w/ saturation) */
    movq_r2m (mm1, *dest);        /* store result in dest */
}
static inline void mmx_average_4_U8 (yuv_data_t * dest,
                                     yuv_data_t * src1, yuv_data_t * src2,
                                     yuv_data_t * src3, yuv_data_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);        /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);          /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);     /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);     /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);        /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);        /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    movq_m2r (*src4, mm5);        /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);          /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);     /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);     /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);         /* add lows */
    paddw_r2r (mm6, mm2);         /* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           /* /4 */

    packuswb_r2r (mm2, mm1);      /* pack (w/ saturation) */
    movq_r2m (mm1, *dest);        /* store result in dest */
}
static inline void mmx_interp_average_4_U8 (yuv_data_t * dest,
                                            yuv_data_t * src1, yuv_data_t * src2,
                                            yuv_data_t * src3, yuv_data_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */

    movq_m2r (*src1, mm1);        /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);          /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);     /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);     /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);        /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);        /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    movq_m2r (*src4, mm5);        /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);          /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);     /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);     /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);         /* add lows */
    paddw_r2r (mm6, mm2);         /* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           /* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);        /* load 8 dest bytes */
    movq_r2r (mm3, mm4);          /* copy 8 dest bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low dest bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high dest bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           /* /2 */
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           /* /2 */

    /* now have end value in mm1 and mm2 */

    packuswb_r2r (mm2, mm1);      /* pack (w/ saturation) */
    movq_r2m (mm1,*dest);         /* store result in dest */
}
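
/*
 * Illustrative plain-C equivalent of the two 4-source helpers above
 * (a hypothetical helper, compiled out of the build): widen, sum the four
 * sources, apply round4 (+2, >>2), and for the interpolating variant fold
 * the result into the existing destination with round1 (+1, >>1).
 */
#if 0
static void interp_average_4_U8_c (yuv_data_t * dest,
                                   const yuv_data_t * src1, const yuv_data_t * src2,
                                   const yuv_data_t * src3, const yuv_data_t * src4)
{
    int i;
    for (i = 0; i < 8; i++)
    {
        int avg4 = (src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2;
        dest[i] = (yuv_data_t)((dest[i] + avg4 + 1) >> 1);
    }
}
#endif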
/*-----------------------------------------------------------------------*/

static inline void MC_avg_mmx (int width, int height,
                               yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, dest, ref);
        if (width == 16)
            mmx_average_2_U8 (dest+8, dest+8, ref+8);
        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}

static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                          int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}
/*-----------------------------------------------------------------------*/

static inline void MC_put_mmx (int width, int height,
                               yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        movq_m2r (* ref, mm1);            /* load 8 ref bytes */
        movq_r2m (mm1,* dest);            /* store 8 bytes at curr */

        if (width == 16)
        {
            movq_m2r (* (ref+8), mm1);    /* load 8 ref bytes */
            movq_r2m (mm1,* (dest+8));    /* store 8 bytes at curr */
        }

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}

static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                          int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}
/*-----------------------------------------------------------------------*/

/* Half pixel interpolation in the x direction */
static inline void MC_avg_x_mmx (int width, int height,
                                 yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref+1);
        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}

static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}
/*-----------------------------------------------------------------------*/

static inline void MC_put_x_mmx (int width, int height,
                                 yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref+1);
        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref+9);
        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}

static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}
/*-----------------------------------------------------------------------*/

/* Half pixel interpolation in both x and y directions, 8 pixels wide */
static inline void MC_avg_xy_8wide_mmx (int height, yuv_data_t * dest,
                                        yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();
    movq_m2r (round4, mm7);

    movq_m2r (*ref, mm1);         /* calculate first row ref[0] + ref[1] */
    movq_r2r (mm1, mm2);

    punpcklbw_r2r (mm0, mm1);
    punpckhbw_r2r (mm0, mm2);

    movq_m2r (*(ref+1), mm3);
    movq_r2r (mm3, mm4);

    punpcklbw_r2r (mm0, mm3);
    punpckhbw_r2r (mm0, mm4);

    paddw_r2r (mm3, mm1);
    paddw_r2r (mm4, mm2);

    ref += stride;

    do {
        movq_m2r (*ref, mm5);     /* calculate next row ref[0] + ref[1] */
        movq_r2r (mm5, mm6);

        punpcklbw_r2r (mm0, mm5);
        punpckhbw_r2r (mm0, mm6);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm3, mm4);

        punpcklbw_r2r (mm0, mm3);
        punpckhbw_r2r (mm0, mm4);

        paddw_r2r (mm3, mm5);
        paddw_r2r (mm4, mm6);

        movq_r2r (mm7, mm3);      /* calculate round4 + previous row + current row */
        movq_r2r (mm7, mm4);

        paddw_r2r (mm1, mm3);
        paddw_r2r (mm2, mm4);

        paddw_r2r (mm5, mm3);
        paddw_r2r (mm6, mm4);

        psraw_i2r (2, mm3);       /* /4 */
        psraw_i2r (2, mm4);       /* /4 */

        movq_m2r (*dest, mm1);    /* calculate (subtotal + dest[0] + round1) / 2 */
        movq_r2r (mm1, mm2);

        punpcklbw_r2r (mm0, mm1);
        punpckhbw_r2r (mm0, mm2);

        paddw_r2r (mm1, mm3);
        paddw_r2r (mm2, mm4);

        paddw_m2r (round1, mm3);
        paddw_m2r (round1, mm4);

        psraw_i2r (1, mm3);       /* /2 */
        psraw_i2r (1, mm4);       /* /2 */

        packuswb_r2r (mm4, mm3);  /* pack (w/ saturation) */
        movq_r2m (mm3, *dest);    /* store result in dest */

        movq_r2r (mm5, mm1);      /* remember current row for the next pass */
        movq_r2r (mm6, mm2);

        dest += stride;
        ref += stride;
    } while (--height);
}
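
/*
 * Plain-C sketch of what the 8-wide loop above computes per output row
 * (a hypothetical helper, compiled out of the build): a half-pel
 * interpolation in both x and y, rounded with round4, then averaged into
 * the existing destination with round1.  The MMX version avoids
 * recomputing the previous row's horizontal sums by carrying them in
 * mm1/mm2 between iterations.
 */
#if 0
static void avg_xy_8wide_c (int height, yuv_data_t * dest,
                            const yuv_data_t * ref, int stride)
{
    int i;
    do {
        for (i = 0; i < 8; i++)
        {
            int sum = ref[i] + ref[i+1] + ref[i+stride] + ref[i+stride+1];
            dest[i] = (yuv_data_t)((dest[i] + ((sum + 2) >> 2) + 1) >> 1);
        }
        dest += stride;
        ref += stride;
    } while (--height);
}
#endif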
static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                             int stride, int height)
{
    MC_avg_xy_8wide_mmx(height, dest, ref, stride);
    MC_avg_xy_8wide_mmx(height, dest+8, ref+8, stride);
}

static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_xy_8wide_mmx(height, dest, ref, stride);
}
/*-----------------------------------------------------------------------*/

static inline void MC_put_xy_8wide_mmx (int height, yuv_data_t * dest,
                                        yuv_data_t * ref, int stride)
{
    mmx_zero_reg ();
    movq_m2r (round4, mm7);

    movq_m2r (*ref, mm1);         /* calculate first row ref[0] + ref[1] */
    movq_r2r (mm1, mm2);

    punpcklbw_r2r (mm0, mm1);
    punpckhbw_r2r (mm0, mm2);

    movq_m2r (*(ref+1), mm3);
    movq_r2r (mm3, mm4);

    punpcklbw_r2r (mm0, mm3);
    punpckhbw_r2r (mm0, mm4);

    paddw_r2r (mm3, mm1);
    paddw_r2r (mm4, mm2);

    ref += stride;

    do {
        movq_m2r (*ref, mm5);     /* calculate next row ref[0] + ref[1] */
        movq_r2r (mm5, mm6);

        punpcklbw_r2r (mm0, mm5);
        punpckhbw_r2r (mm0, mm6);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm3, mm4);

        punpcklbw_r2r (mm0, mm3);
        punpckhbw_r2r (mm0, mm4);

        paddw_r2r (mm3, mm5);
        paddw_r2r (mm4, mm6);

        movq_r2r (mm7, mm3);      /* calculate round4 + previous row + current row */
        movq_r2r (mm7, mm4);

        paddw_r2r (mm1, mm3);
        paddw_r2r (mm2, mm4);

        paddw_r2r (mm5, mm3);
        paddw_r2r (mm6, mm4);

        psraw_i2r (2, mm3);       /* /4 */
        psraw_i2r (2, mm4);       /* /4 */

        packuswb_r2r (mm4, mm3);  /* pack (w/ saturation) */
        movq_r2m (mm3, *dest);    /* store result in dest */

        movq_r2r (mm5, mm1);      /* advance to the next row */
        movq_r2r (mm6, mm2);

        dest += stride;
        ref += stride;
    } while (--height);
}
static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                             int stride, int height)
{
    MC_put_xy_8wide_mmx(height, dest, ref, stride);
    MC_put_xy_8wide_mmx(height, dest + 8, ref + 8, stride);
}

static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_xy_8wide_mmx(height, dest, ref, stride);
}
/*-----------------------------------------------------------------------*/

static inline void MC_avg_y_mmx (int width, int height,
                                 yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref_next);
        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}

static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}
/*-----------------------------------------------------------------------*/

static inline void MC_put_y_mmx (int width, int height,
                                 yuv_data_t * dest, yuv_data_t * ref, int stride)
{
    yuv_data_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref_next);
        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
                            int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}

static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
                           int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}
/*****************************************************************************
 * Functions exported as capabilities. They are declared as static so that
 * we don't pollute the namespace too much.
 *****************************************************************************/
static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *, int, int ) =
{
    /* Copying functions */
    {
        /* Width == 16 */
        { MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx },
        /* Width == 8 */
        { MC_put_8_mmx,  MC_put_x8_mmx,  MC_put_y8_mmx,  MC_put_xy8_mmx }
    },
    /* Averaging functions */
    {
        /* Width == 16 */
        { MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx },
        /* Width == 8 */
        { MC_avg_8_mmx,  MC_avg_x8_mmx,  MC_avg_y8_mmx,  MC_avg_xy8_mmx }
    }
};
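
/*
 * Illustrative use of the table (a sketch, not the decoder's actual code):
 * the first index selects copying (0) vs. averaging (1), the second selects
 * 16-wide (0) vs. 8-wide (1) blocks, and the third selects the half-pel
 * case in the order laid out above: full pel, x, y, xy.
 */
#if 0
static void motion_block_example( yuv_data_t *p_dest, yuv_data_t *p_ref,
                                  int i_stride, int i_height )
{
    /* average an 8-wide block with half-pel offsets in both x and y */
    ppppf_motion[1][1][3]( p_dest, p_ref, i_stride, i_height );
}
#endif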
/*****************************************************************************
 * Open: attach the function table to the calling object
 *****************************************************************************/
static int Open ( vlc_object_t *p_this )
{
    p_this->p_private = ppppf_motion;
    return VLC_SUCCESS;
}