/*****************************************************************************
 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
 *                           MMX
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 * $Id: vdec_motion_inner_mmx.c,v 1.2 2001/06/07 15:27:44 sam Exp $
 *
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
 *          work done by the livid project <http://www.linuxvideo.org/>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
#include "modules_inner.h"
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include "common.h"                 /* u8 (assumed, for the types used below) */
#include "video.h"                  /* yuv_data_t (assumed) */

#include "attributes.h"
#include "mmx.h"                    /* mmx_t and the MMX wrapper macros */
/* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */

/* Some rounding constants */
mmx_t round1 = {0x0001000100010001LL};  /* four words of 1: round-to-nearest /2 */
mmx_t round4 = {0x0002000200020002LL};  /* four words of 2: round-to-nearest /4 */
static __inline__ void MMXZeroReg()
{
    /* load 0 into mm0, used as the zero operand by the unpack instructions */
    pxor_r2r(mm0,mm0);
}
static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows to mm1
    paddw_m2r(round1,mm1);      // +1 for round-to-nearest
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs to mm2
    paddw_m2r(round1,mm2);      // +1 for round-to-nearest
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
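/* For reference, a scalar sketch (not in the original file; ScalarAverage2
 * is a hypothetical name) of what MMXAverage2 computes. round1 supplies the
 * "+1" so the right shift rounds to nearest instead of truncating; one movq
 * covers 8 bytes per call. */
static __inline__ void ScalarAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    int i;
    for( i = 0; i < 8; i++ )
    {
        /* 255 + 255 + 1 = 511 still fits in a 16-bit word, so no clipping
         * is needed before the final store */
        dst[i] = (u8)((src1[i] + src2[i] + 1) >> 1);
    }
}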
static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
    //

    movq_m2r(*dst,mm1);         // load 8 dst bytes
    movq_r2r(mm1,mm2);          // copy 8 dst bytes

    movq_m2r(*src1,mm3);        // load 8 src1 bytes
    movq_r2r(mm3,mm4);          // copy 8 src1 bytes

    movq_m2r(*src2,mm5);        // load 8 src2 bytes
    movq_r2r(mm5,mm6);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src2 bytes

    paddw_r2r(mm5,mm3);         // add lows
    paddw_m2r(round1,mm3);      // +1 for round-to-nearest
    psraw_i2r(1,mm3);           // /2

    paddw_r2r(mm6,mm4);         // add highs
    paddw_m2r(round1,mm4);      // +1 for round-to-nearest
    psraw_i2r(1,mm4);           // /2

    paddw_r2r(mm3,mm1);         // add lows
    paddw_m2r(round1,mm1);      // +1 for round-to-nearest
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs
    paddw_m2r(round1,mm2);      // +1 for round-to-nearest
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
                                    u8 *src4 )
{
    //
    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r(round4,mm1);      // +2 for round-to-nearest
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // +2 for round-to-nearest
    psraw_i2r(2,mm2);           // /4

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
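/* Scalar sketch of the above (illustrative only, hypothetical name): the
 * four-way average uses round4's "+2" bias before the divide by 4, and the
 * word-sized intermediate (at most 4 * 255 + 2 = 1022) cannot overflow. */
static __inline__ void ScalarAverage4( u8 *dst, u8 *src1, u8 *src2,
                                       u8 *src3, u8 *src4 )
{
    int i;
    for( i = 0; i < 8; i++ )
    {
        dst[i] = (u8)((src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2);
    }
}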
static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                          u8 *src3, u8 *src4 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    paddw_m2r(round4,mm1);      // +2 for round-to-nearest
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // +2 for round-to-nearest
    psraw_i2r(2,mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r(*dst,mm3);         // load 8 dst bytes
    movq_r2r(mm3,mm4);          // copy 8 dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high dst bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    paddw_m2r(round1,mm1);      // +1 for round-to-nearest
    psraw_i2r(1,mm1);           // /2
    paddw_m2r(round1,mm2);      // +1 for round-to-nearest
    psraw_i2r(1,mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
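/* Scalar sketch of the above (illustrative only, hypothetical name): the
 * rounded four-way average is formed first, then averaged with the existing
 * prediction in dst; MMXInterpAverage2 is the two-source analogue of the
 * same two-stage rounding. */
static __inline__ void ScalarInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                             u8 *src3, u8 *src4 )
{
    int i;
    for( i = 0; i < 8; i++ )
    {
        int i_interp = (src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2;
        dst[i] = (u8)((dst[i] + i_interp + 1) >> 1);
    }
}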
/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/

#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest);
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest);
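/* Note (an aside, not from the original file): pavgusb is the 3DNow!
 * single-instruction unsigned byte average with the same (a + b + 1) >> 1
 * rounding as MMXAverage2, so on CPUs that have it a rounded average
 * collapses to something like:
 *
 *     movq_m2r( *p_src, mm0 );
 *     pavg_m2r( *(p_src + 1), mm0 );
 *     movq_r2m( mm0, *p_dest );
 */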
#define __MotionComponent_x_y_copy(width,height)                           \
void _M(MotionComponent_x_y_copy_##width##_##height)(yuv_data_t * p_src,   \
                                                     yuv_data_t * p_dest,  \
                                                     int i_stride)         \
{                                                                          \
    int i_y;                                                               \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        movq_m2r( *p_src, mm0 );        /* load 8 ref bytes */             \
        if( width == 16 )                                                  \
            movq_m2r( *(p_src + 8), mm1 );                                 \
        p_src += i_stride;                                                 \
        movq_r2m( mm0, *p_dest );       /* store 8 bytes at curr */        \
        if( width == 16 )                                                  \
            movq_r2m( mm1, *(p_dest + 8) );                                \
        p_dest += i_stride;                                                \
    }                                                                      \
}
#define __MotionComponent_X_y_copy(width,height)                           \
void _M(MotionComponent_X_y_copy_##width##_##height)(yuv_data_t * p_src,   \
                                                     yuv_data_t * p_dest,  \
                                                     int i_stride)         \
{                                                                          \
    int i_y;                                                               \
    MMXZeroReg();                       /* mm0 = 0 for MMXAverage2() */    \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXAverage2( p_dest, p_src, p_src + 1 );                           \
        if( width == 16 )                                                  \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );               \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
    }                                                                      \
}
#define __MotionComponent_x_Y_copy(width,height)                           \
void _M(MotionComponent_x_Y_copy_##width##_##height)(yuv_data_t * p_src,   \
                                                     yuv_data_t * p_dest,  \
                                                     int i_stride)         \
{                                                                          \
    int i_y;                                                               \
    yuv_data_t * p_next_src = p_src + i_stride;                            \
    MMXZeroReg();                       /* mm0 = 0 for MMXAverage2() */    \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXAverage2( p_dest, p_src, p_next_src );                          \
        if( width == 16 )                                                  \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );          \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
        p_next_src += i_stride;                                            \
    }                                                                      \
}
#define __MotionComponent_X_Y_copy(width,height)                           \
void _M(MotionComponent_X_Y_copy_##width##_##height)(yuv_data_t * p_src,   \
                                                     yuv_data_t * p_dest,  \
                                                     int i_stride)         \
{                                                                          \
    int i_y;                                                               \
    yuv_data_t * p_next_src = p_src + i_stride;                            \
    MMXZeroReg();                       /* mm0 = 0 for MMXAverage4() */    \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src, p_next_src + 1 );\
        if( width == 16 )                                                  \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                 \
                         p_next_src + 8, p_next_src + 9 );                 \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
        p_next_src += i_stride;                                            \
    }                                                                      \
}
#define __MotionComponent_x_y_avg(width,height)                            \
void _M(MotionComponent_x_y_avg_##width##_##height)(yuv_data_t * p_src,    \
                                                    yuv_data_t * p_dest,   \
                                                    int i_stride)          \
{                                                                          \
    int i_y;                                                               \
    MMXZeroReg();                       /* mm0 = 0 for MMXAverage2() */    \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXAverage2( p_dest, p_dest, p_src );                              \
        if( width == 16 )                                                  \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );              \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
    }                                                                      \
}
#define __MotionComponent_X_y_avg(width,height)                            \
void _M(MotionComponent_X_y_avg_##width##_##height)(yuv_data_t * p_src,    \
                                                    yuv_data_t * p_dest,   \
                                                    int i_stride)          \
{                                                                          \
    int i_y;                                                               \
    MMXZeroReg();                   /* mm0 = 0 for MMXInterpAverage2() */  \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                     \
        if( width == 16 )                                                  \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );         \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
    }                                                                      \
}
#define __MotionComponent_x_Y_avg(width,height)                            \
void _M(MotionComponent_x_Y_avg_##width##_##height)(yuv_data_t * p_src,    \
                                                    yuv_data_t * p_dest,   \
                                                    int i_stride)          \
{                                                                          \
    int i_y;                                                               \
    yuv_data_t * p_next_src = p_src + i_stride;                            \
    MMXZeroReg();                   /* mm0 = 0 for MMXInterpAverage2() */  \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXInterpAverage2( p_dest, p_src, p_next_src );                    \
        if( width == 16 )                                                  \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );    \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
        p_next_src += i_stride;                                            \
    }                                                                      \
}
#define __MotionComponent_X_Y_avg(width,height)                            \
void _M(MotionComponent_X_Y_avg_##width##_##height)(yuv_data_t * p_src,    \
                                                    yuv_data_t * p_dest,   \
                                                    int i_stride)          \
{                                                                          \
    int i_y;                                                               \
    yuv_data_t * p_next_src = p_src + i_stride;                            \
    MMXZeroReg();                   /* mm0 = 0 for MMXInterpAverage4() */  \
    for( i_y = 0; i_y < height; i_y ++ )                                   \
    {                                                                      \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src,           \
                           p_next_src + 1 );                               \
        if( width == 16 )                                                  \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,           \
                               p_next_src + 8, p_next_src + 9 );           \
        p_dest += i_stride;                                                \
        p_src += i_stride;                                                 \
        p_next_src += i_stride;                                            \
    }                                                                      \
}
#define __MotionComponents(width,height)        \
__MotionComponent_x_y_copy(width,height)        \
__MotionComponent_X_y_copy(width,height)        \
__MotionComponent_x_Y_copy(width,height)        \
__MotionComponent_X_Y_copy(width,height)        \
__MotionComponent_x_y_avg(width,height)         \
__MotionComponent_X_y_avg(width,height)         \
__MotionComponent_x_Y_avg(width,height)         \
__MotionComponent_X_Y_avg(width,height)
__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */

__MotionComponents (8,16)       /* 422 */
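/* Usage sketch (illustrative, not in the original file): each expansion of
 * __MotionComponents() above defines eight functions; for instance
 * __MotionComponents(16,16) provides _M(MotionComponent_X_Y_avg_16_16),
 * which a caller in the motion compensation driver might invoke as below
 * (p_ref, p_cur and i_offset are hypothetical names). Remember that MMX
 * code must execute emms() before any floating point code runs again.
 *
 *     _M(MotionComponent_X_Y_avg_16_16)( p_ref + i_offset,
 *                                        p_cur + i_offset,
 *                                        i_stride );
 */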