1 /*****************************************************************************
2 * motionmmx.c : MMX motion compensation module for vlc
3 *****************************************************************************
4 * Copyright (C) 2001 VideoLAN
5 * $Id: motionmmx.c,v 1.18 2002/06/01 12:32:00 sam Exp $
7 * Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
8 * Michel Lespinasse <walken@zoy.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <stdlib.h> /* malloc(), free() */
35 /*****************************************************************************
36 * Local and extern prototypes.
37 *****************************************************************************/
38 static void motion_getfunctions( function_list_t * p_function_list );
40 /*****************************************************************************
41 * Build configuration tree.
42 *****************************************************************************/
47 SET_DESCRIPTION( _("MMX motion compensation module") )
48 ADD_CAPABILITY( MOTION, 150 )
49 ADD_REQUIREMENT( MMX )
54 motion_getfunctions( &p_module->p_functions->motion );
57 MODULE_DEACTIVATE_START
58 MODULE_DEACTIVATE_STOP
60 /*****************************************************************************
61 * Motion compensation in MMX
62 *****************************************************************************/
64 // some rounding constants
65 mmx_t round1 = {0x0001000100010001LL};
66 mmx_t round4 = {0x0002000200020002LL};
69 * This code should probably be compiled with loop unrolling
70 * (ie, -funroll-loops in gcc) because some of the loops
71 * use a small static number of iterations. This was written
72 * with the assumption the compiler knows best about when
76 static inline void mmx_zero_reg ()
82 static inline void mmx_average_2_U8 (yuv_data_t * dest,
83 yuv_data_t * src1, yuv_data_t * src2)
86 // *dest = (*src1 + *src2 + 1)/ 2;
89 movq_m2r (*src1, mm1); // load 8 src1 bytes
90 movq_r2r (mm1, mm2); // copy 8 src1 bytes
92 movq_m2r (*src2, mm3); // load 8 src2 bytes
93 movq_r2r (mm3, mm4); // copy 8 src2 bytes
95 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
96 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
98 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
99 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
101 paddw_r2r (mm3, mm1); // add lows to mm1
102 paddw_m2r (round1, mm1);
103 psraw_i2r (1, mm1); // /2
105 paddw_r2r (mm4, mm2); // add highs to mm2
106 paddw_m2r (round1, mm2);
107 psraw_i2r (1, mm2); // /2
109 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
110 movq_r2m (mm1, *dest); // store result in dest
113 static inline void mmx_interp_average_2_U8 (yuv_data_t * dest,
114 yuv_data_t * src1, yuv_data_t * src2)
117 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
120 movq_m2r (*dest, mm1); // load 8 dest bytes
121 movq_r2r (mm1, mm2); // copy 8 dest bytes
123 movq_m2r (*src1, mm3); // load 8 src1 bytes
124 movq_r2r (mm3, mm4); // copy 8 src1 bytes
126 movq_m2r (*src2, mm5); // load 8 src2 bytes
127 movq_r2r (mm5, mm6); // copy 8 src2 bytes
129 punpcklbw_r2r (mm0, mm1); // unpack low dest bytes
130 punpckhbw_r2r (mm0, mm2); // unpack high dest bytes
132 punpcklbw_r2r (mm0, mm3); // unpack low src1 bytes
133 punpckhbw_r2r (mm0, mm4); // unpack high src1 bytes
135 punpcklbw_r2r (mm0, mm5); // unpack low src2 bytes
136 punpckhbw_r2r (mm0, mm6); // unpack high src2 bytes
138 paddw_r2r (mm5, mm3); // add lows
139 paddw_m2r (round1, mm3);
140 psraw_i2r (1, mm3); // /2
142 paddw_r2r (mm6, mm4); // add highs
143 paddw_m2r (round1, mm4);
144 psraw_i2r (1, mm4); // /2
146 paddw_r2r (mm3, mm1); // add lows
147 paddw_m2r (round1, mm1);
148 psraw_i2r (1, mm1); // /2
150 paddw_r2r (mm4, mm2); // add highs
151 paddw_m2r (round1, mm2);
152 psraw_i2r (1, mm2); // /2
154 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
155 movq_r2m (mm1, *dest); // store result in dest
158 static inline void mmx_average_4_U8 (yuv_data_t * dest,
159 yuv_data_t * src1, yuv_data_t * src2,
160 yuv_data_t * src3, yuv_data_t * src4)
163 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
166 movq_m2r (*src1, mm1); // load 8 src1 bytes
167 movq_r2r (mm1, mm2); // copy 8 src1 bytes
169 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
170 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
172 movq_m2r (*src2, mm3); // load 8 src2 bytes
173 movq_r2r (mm3, mm4); // copy 8 src2 bytes
175 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
176 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
178 paddw_r2r (mm3, mm1); // add lows
179 paddw_r2r (mm4, mm2); // add highs
181 // now have partials in mm1 and mm2
183 movq_m2r (*src3, mm3); // load 8 src3 bytes
184 movq_r2r (mm3, mm4); // copy 8 src3 bytes
186 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
187 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
189 paddw_r2r (mm3, mm1); // add lows
190 paddw_r2r (mm4, mm2); // add highs
192 movq_m2r (*src4, mm5); // load 8 src4 bytes
193 movq_r2r (mm5, mm6); // copy 8 src4 bytes
195 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
196 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
198 paddw_r2r (mm5, mm1); // add lows
199 paddw_r2r (mm6, mm2); // add highs
201 // now have subtotal in mm1 and mm2
203 paddw_m2r (round4, mm1);
204 psraw_i2r (2, mm1); // /4
205 paddw_m2r (round4, mm2);
206 psraw_i2r (2, mm2); // /4
208 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
209 movq_r2m (mm1, *dest); // store result in dest
212 static inline void mmx_interp_average_4_U8 (yuv_data_t * dest,
213 yuv_data_t * src1, yuv_data_t * src2,
214 yuv_data_t * src3, yuv_data_t * src4)
217 // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
220 movq_m2r (*src1, mm1); // load 8 src1 bytes
221 movq_r2r (mm1, mm2); // copy 8 src1 bytes
223 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
224 punpckhbw_r2r (mm0, mm2); // unpack high src1 bytes
226 movq_m2r (*src2, mm3); // load 8 src2 bytes
227 movq_r2r (mm3, mm4); // copy 8 src2 bytes
229 punpcklbw_r2r (mm0, mm3); // unpack low src2 bytes
230 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
232 paddw_r2r (mm3, mm1); // add lows
233 paddw_r2r (mm4, mm2); // add highs
235 // now have partials in mm1 and mm2
237 movq_m2r (*src3, mm3); // load 8 src3 bytes
238 movq_r2r (mm3, mm4); // copy 8 src3 bytes
240 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
241 punpckhbw_r2r (mm0, mm4); // unpack high src3 bytes
243 paddw_r2r (mm3, mm1); // add lows
244 paddw_r2r (mm4, mm2); // add highs
246 movq_m2r (*src4, mm5); // load 8 src4 bytes
247 movq_r2r (mm5, mm6); // copy 8 src4 bytes
249 punpcklbw_r2r (mm0, mm5); // unpack low src4 bytes
250 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
252 paddw_r2r (mm5, mm1); // add lows
253 paddw_r2r (mm6, mm2); // add highs
255 paddw_m2r (round4, mm1);
256 psraw_i2r (2, mm1); // /4
257 paddw_m2r (round4, mm2);
258 psraw_i2r (2, mm2); // /4
260 // now have subtotal/4 in mm1 and mm2
262 movq_m2r (*dest, mm3); // load 8 dest bytes
263 movq_r2r (mm3, mm4); // copy 8 dest bytes
265 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes
266 punpckhbw_r2r (mm0, mm4); // unpack high dest bytes
268 paddw_r2r (mm3, mm1); // add lows
269 paddw_r2r (mm4, mm2); // add highs
271 paddw_m2r (round1, mm1);
272 psraw_i2r (1, mm1); // /2
273 paddw_m2r (round1, mm2);
274 psraw_i2r (1, mm2); // /2
276 // now have end value in mm1 and mm2
278 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
279 movq_r2m (mm1,*dest); // store result in dest
282 //-----------------------------------------------------------------------
284 static inline void MC_avg_mmx (int width, int height,
285 yuv_data_t * dest, yuv_data_t * ref, int stride)
290 mmx_average_2_U8 (dest, dest, ref);
293 mmx_average_2_U8 (dest+8, dest+8, ref+8);
300 static void MC_avg_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
301 int stride, int height)
303 MC_avg_mmx (16, height, dest, ref, stride);
306 static void MC_avg_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
307 int stride, int height)
309 MC_avg_mmx (8, height, dest, ref, stride);
312 //-----------------------------------------------------------------------
314 static inline void MC_put_mmx (int width, int height,
315 yuv_data_t * dest, yuv_data_t * ref, int stride)
320 movq_m2r (* ref, mm1); // load 8 ref bytes
321 movq_r2m (mm1,* dest); // store 8 bytes at curr
325 movq_m2r (* (ref+8), mm1); // load 8 ref bytes
326 movq_r2m (mm1,* (dest+8)); // store 8 bytes at curr
334 static void MC_put_16_mmx (yuv_data_t * dest, yuv_data_t * ref,
335 int stride, int height)
337 MC_put_mmx (16, height, dest, ref, stride);
340 static void MC_put_8_mmx (yuv_data_t * dest, yuv_data_t * ref,
341 int stride, int height)
343 MC_put_mmx (8, height, dest, ref, stride);
346 //-----------------------------------------------------------------------
348 // Half pixel interpolation in the x direction
349 static inline void MC_avg_x_mmx (int width, int height,
350 yuv_data_t * dest, yuv_data_t * ref, int stride)
355 mmx_interp_average_2_U8 (dest, ref, ref+1);
358 mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
365 static void MC_avg_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
366 int stride, int height)
368 MC_avg_x_mmx (16, height, dest, ref, stride);
371 static void MC_avg_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
372 int stride, int height)
374 MC_avg_x_mmx (8, height, dest, ref, stride);
377 //-----------------------------------------------------------------------
379 static inline void MC_put_x_mmx (int width, int height,
380 yuv_data_t * dest, yuv_data_t * ref, int stride)
385 mmx_average_2_U8 (dest, ref, ref+1);
388 mmx_average_2_U8 (dest+8, ref+8, ref+9);
395 static void MC_put_x16_mmx (yuv_data_t * dest, yuv_data_t * ref,
396 int stride, int height)
398 MC_put_x_mmx (16, height, dest, ref, stride);
401 static void MC_put_x8_mmx (yuv_data_t * dest, yuv_data_t * ref,
402 int stride, int height)
404 MC_put_x_mmx (8, height, dest, ref, stride);
407 //-----------------------------------------------------------------------
409 static inline void MC_avg_xy_mmx (int width, int height,
410 yuv_data_t * dest, yuv_data_t * ref, int stride)
412 yuv_data_t * ref_next = ref+stride;
417 mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
420 mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
421 ref_next+8, ref_next+9);
429 static void MC_avg_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
430 int stride, int height)
432 MC_avg_xy_mmx (16, height, dest, ref, stride);
435 static void MC_avg_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
436 int stride, int height)
438 MC_avg_xy_mmx (8, height, dest, ref, stride);
441 //-----------------------------------------------------------------------
443 static inline void MC_put_xy_mmx (int width, int height,
444 yuv_data_t * dest, yuv_data_t * ref, int stride)
446 yuv_data_t * ref_next = ref+stride;
451 mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
454 mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
462 static void MC_put_xy16_mmx (yuv_data_t * dest, yuv_data_t * ref,
463 int stride, int height)
465 MC_put_xy_mmx (16, height, dest, ref, stride);
468 static void MC_put_xy8_mmx (yuv_data_t * dest, yuv_data_t * ref,
469 int stride, int height)
471 MC_put_xy_mmx (8, height, dest, ref, stride);
474 //-----------------------------------------------------------------------
476 static inline void MC_avg_y_mmx (int width, int height,
477 yuv_data_t * dest, yuv_data_t * ref, int stride)
479 yuv_data_t * ref_next = ref+stride;
484 mmx_interp_average_2_U8 (dest, ref, ref_next);
487 mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
495 static void MC_avg_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
496 int stride, int height)
498 MC_avg_y_mmx (16, height, dest, ref, stride);
501 static void MC_avg_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
502 int stride, int height)
504 MC_avg_y_mmx (8, height, dest, ref, stride);
507 //-----------------------------------------------------------------------
509 static inline void MC_put_y_mmx (int width, int height,
510 yuv_data_t * dest, yuv_data_t * ref, int stride)
512 yuv_data_t * ref_next = ref+stride;
517 mmx_average_2_U8 (dest, ref, ref_next);
520 mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
528 static void MC_put_y16_mmx (yuv_data_t * dest, yuv_data_t * ref,
529 int stride, int height)
531 MC_put_y_mmx (16, height, dest, ref, stride);
534 static void MC_put_y8_mmx (yuv_data_t * dest, yuv_data_t * ref,
535 int stride, int height)
537 MC_put_y_mmx (8, height, dest, ref, stride);
541 /*****************************************************************************
542 * Functions exported as capabilities. They are declared as static so that
543 * we don't pollute the namespace too much.
544 *****************************************************************************/
545 static void motion_getfunctions( function_list_t * p_function_list )
547 static void (* ppppf_motion[2][2][4])( yuv_data_t *, yuv_data_t *,
551 /* Copying functions */
554 MC_put_16_mmx, MC_put_x16_mmx, MC_put_y16_mmx, MC_put_xy16_mmx
558 MC_put_8_mmx, MC_put_x8_mmx, MC_put_y8_mmx, MC_put_xy8_mmx
562 /* Averaging functions */
565 MC_avg_16_mmx, MC_avg_x16_mmx, MC_avg_y16_mmx, MC_avg_xy16_mmx
569 MC_avg_8_mmx, MC_avg_x8_mmx, MC_avg_y8_mmx, MC_avg_xy8_mmx
574 #define list p_function_list->functions.motion
575 memcpy( list.ppppf_motion, ppppf_motion, sizeof( void * ) * 16 );