/*****************************************************************************
 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
 *                           MMX
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 * $Id: vdec_motion_inner_mmx.c,v 1.1 2001/01/18 05:13:22 sam Exp $
 *
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
 *          work done by the livid project <http://www.linuxvideo.org/>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include "common.h"                                      /* u8, basic types */
#include "video.h"                                            /* yuv_data_t */

#include "mmx.h"                      /* movq_m2r() & co. MMX wrapper macros */
#include "attributes.h"
/* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
/* Some rounding constants */
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
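/* round1 biases each 16-bit lane so the ">> 1" below rounds to nearest
 * instead of truncating: (a + b + 1) >> 1, e.g. (3 + 4 + 1) >> 1 = 4.
 * round4 plays the same role for the four-pixel case: (a+b+c+d+2) >> 2. */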
static __inline__ void MMXZeroReg()
{
    /* mm0 = 0 -- mm0 is used as the zero register by the unpack steps */
    pxor_r2r(mm0,mm0);
}
static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows to mm1
    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs to mm2
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
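/* The sources are widened from bytes to 16-bit words before the add so
 * the sums cannot wrap; packuswb saturates them back to u8 on the way
 * out. This routine is the single half-pel interpolation used by the
 * X_y and x_Y copy macros below. */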
static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);

    movq_m2r(*dst,mm1);         // load 8 dst bytes
    movq_r2r(mm1,mm2);          // copy 8 dst bytes

    movq_m2r(*src1,mm3);        // load 8 src1 bytes
    movq_r2r(mm3,mm4);          // copy 8 src1 bytes

    movq_m2r(*src2,mm5);        // load 8 src2 bytes
    movq_r2r(mm5,mm6);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src2 bytes

    paddw_r2r(mm5,mm3);         // add lows (src1 + src2)
    paddw_m2r(round1,mm3);      // round
    psraw_i2r(1,mm3);           // /2

    paddw_r2r(mm6,mm4);         // add highs (src1 + src2)
    paddw_m2r(round1,mm4);      // round
    psraw_i2r(1,mm4);           // /2

    paddw_r2r(mm3,mm1);         // add lows (dst + interp)
    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs (dst + interp)
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
                                    u8 *src4 )
{
    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r(round4,mm1);      // round
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // round
    psraw_i2r(2,mm2);           // /4

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
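/* Four-source average: the two-dimensional half-pel case, where the
 * predicted pixel sits between four reference pixels (used by the X_Y
 * macros below). */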
static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                          u8 *src3, u8 *src4 )
{
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    paddw_m2r(round4,mm1);      // round
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // round
    psraw_i2r(2,mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r(*dst,mm3);         // load 8 dst bytes
    movq_r2r(mm3,mm4);          // copy 8 dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high dst bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
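/* Note the two-step rounding: the four-pixel average is rounded to the
 * nearest integer first, then averaged with *dst and rounded again,
 * exactly as in the scalar expression above. */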
/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/

#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest);
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest);
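/* pavgusb is the AMD 3DNow! packed byte average; these pavg_* wrappers
 * come along with the livid code and are not used by the plain-MMX
 * helpers above.
 *
 * Naming convention for the functions generated below:
 *   x / X  : integer / half-pel horizontal motion vector component
 *   y / Y  : integer / half-pel vertical motion vector component
 *   copy   : write the prediction over the destination
 *   avg    : average the prediction into the destination (second
 *            prediction of a bi-directional macroblock) */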
#define __MotionComponent_x_y_copy(width,height)                            \
void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        movq_m2r( *p_src, mm0 );      /* load 8 ref bytes */                \
        if( width == 16 )                                                   \
            movq_m2r( *(p_src + 8), mm1 );                                  \
        p_src += i_stride;                                                  \
                                                                            \
        movq_r2m( mm0, *p_dest );     /* store 8 bytes at curr */           \
        if( width == 16 )                                                   \
            movq_r2m( mm1, *(p_dest + 8) );                                 \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
#define __MotionComponent_X_y_copy(width,height)                            \
void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_x_Y_copy(width,height)                            \
void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_next_src );                           \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );           \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_X_Y_copy(width,height)                            \
void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src, p_next_src + 1 );\
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_next_src + 8, p_next_src + 9 );                  \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_x_y_avg(width,height)                             \
void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_dest, p_src );                               \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_X_y_avg(width,height)                             \
void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_x_Y_avg(width,height)                             \
void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_next_src );                     \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );     \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_X_Y_avg(width,height)                             \
void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src,            \
                           p_next_src + 1 );                                \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_next_src + 8, p_next_src + 9 );            \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponents(width,height)                                    \
__MotionComponent_x_y_copy(width,height)                                    \
__MotionComponent_X_y_copy(width,height)                                    \
__MotionComponent_x_Y_copy(width,height)                                    \
__MotionComponent_X_Y_copy(width,height)                                    \
__MotionComponent_x_y_avg(width,height)                                     \
__MotionComponent_X_y_avg(width,height)                                     \
__MotionComponent_x_Y_avg(width,height)                                     \
__MotionComponent_X_Y_avg(width,height)
__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */

__MotionComponents (8,16)       /* 422 */
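/* Hypothetical call site (p_ref and p_out are illustrative names): for a
 * 16x16 block whose motion vector has a horizontal half-pel component,
 * the first prediction would be
 *
 *     MotionComponent_X_y_copy_16_16( p_ref, p_out, i_stride );
 *
 * and the second prediction of a bi-directional macroblock would then
 * call the matching MotionComponent_X_y_avg_16_16() on the same
 * destination. */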