1 /*****************************************************************************
2 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
4 *****************************************************************************
5 * Copyright (C) 1999, 2000 VideoLAN
6 * $Id: vdec_motion_inner_mmx.c,v 1.8 2001/01/16 17:59:23 massiot Exp $
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
9 * work done by the livid project <http://www.linuxvideo.org/>
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
24 *****************************************************************************/
26 /*****************************************************************************
28 *****************************************************************************/
39 #include "stream_control.h"
40 #include "input_ext-dec.h"
43 #include "video_output.h"
45 #include "vdec_idct.h"
46 #include "video_decoder.h"
47 #include "vdec_motion.h"
49 #include "vpar_blocks.h"
50 #include "vpar_headers.h"
51 #include "vpar_synchro.h"
52 #include "video_parser.h"
53 #include "video_fifo.h"
55 #include "attributes.h"
58 /* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
60 /* Some rounding constants */
/* Four packed 16-bit words of 1: added before a "psraw 1" so that x/2
 * rounds half-values upward (see MMXAverage2 / MMXInterpAverage2). */
mmx_t round1 = {0x0001000100010001LL};
/* Four packed 16-bit words of 2: added before a "psraw 2" so that x/4
 * rounds correctly (see MMXAverage4 / MMXInterpAverage4). */
mmx_t round4 = {0x0002000200020002LL};
68 static __inline__ void MMXZeroReg()
74 static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
77 // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
80 movq_m2r(*src1,mm1); // load 8 src1 bytes
81 movq_r2r(mm1,mm2); // copy 8 src1 bytes
83 movq_m2r(*src2,mm3); // load 8 src2 bytes
84 movq_r2r(mm3,mm4); // copy 8 src2 bytes
86 punpcklbw_r2r(mm0,mm1); // unpack low src1 bytes
87 punpckhbw_r2r(mm0,mm2); // unpack high src1 bytes
89 punpcklbw_r2r(mm0,mm3); // unpack low src2 bytes
90 punpckhbw_r2r(mm0,mm4); // unpack high src2 bytes
92 paddw_r2r(mm3,mm1); // add lows to mm1
93 paddw_m2r(round1,mm1);
94 psraw_i2r(1,mm1); // /2
96 paddw_r2r(mm4,mm2); // add highs to mm2
97 paddw_m2r(round1,mm2);
98 psraw_i2r(1,mm2); // /2
100 packuswb_r2r(mm2,mm1); // pack (w/ saturation)
101 movq_r2m(mm1,*dst); // store result in dst
104 static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
107 // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
110 movq_m2r(*dst,mm1); // load 8 dst bytes
111 movq_r2r(mm1,mm2); // copy 8 dst bytes
113 movq_m2r(*src1,mm3); // load 8 src1 bytes
114 movq_r2r(mm3,mm4); // copy 8 src1 bytes
116 movq_m2r(*src2,mm5); // load 8 src2 bytes
117 movq_r2r(mm5,mm6); // copy 8 src2 bytes
119 punpcklbw_r2r(mm0,mm1); // unpack low dst bytes
120 punpckhbw_r2r(mm0,mm2); // unpack high dst bytes
122 punpcklbw_r2r(mm0,mm3); // unpack low src1 bytes
123 punpckhbw_r2r(mm0,mm4); // unpack high src1 bytes
125 punpcklbw_r2r(mm0,mm5); // unpack low src2 bytes
126 punpckhbw_r2r(mm0,mm6); // unpack high src2 bytes
128 paddw_r2r(mm5,mm3); // add lows
129 paddw_m2r(round1,mm3);
130 psraw_i2r(1,mm3); // /2
132 paddw_r2r(mm6,mm4); // add highs
133 paddw_m2r(round1,mm4);
134 psraw_i2r(1,mm4); // /2
136 paddw_r2r(mm3,mm1); // add lows
137 paddw_m2r(round1,mm1);
138 psraw_i2r(1,mm1); // /2
140 paddw_r2r(mm4,mm2); // add highs
141 paddw_m2r(round1,mm2);
142 psraw_i2r(1,mm2); // /2
144 packuswb_r2r(mm2,mm1); // pack (w/ saturation)
145 movq_r2m(mm1,*dst); // store result in dst
148 static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
152 // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
155 movq_m2r(*src1,mm1); // load 8 src1 bytes
156 movq_r2r(mm1,mm2); // copy 8 src1 bytes
158 punpcklbw_r2r(mm0,mm1); // unpack low src1 bytes
159 punpckhbw_r2r(mm0,mm2); // unpack high src1 bytes
161 movq_m2r(*src2,mm3); // load 8 src2 bytes
162 movq_r2r(mm3,mm4); // copy 8 src2 bytes
164 punpcklbw_r2r(mm0,mm3); // unpack low src2 bytes
165 punpckhbw_r2r(mm0,mm4); // unpack high src2 bytes
167 paddw_r2r(mm3,mm1); // add lows
168 paddw_r2r(mm4,mm2); // add highs
170 // now have partials in mm1 and mm2
172 movq_m2r(*src3,mm3); // load 8 src3 bytes
173 movq_r2r(mm3,mm4); // copy 8 src3 bytes
175 punpcklbw_r2r(mm0,mm3); // unpack low src3 bytes
176 punpckhbw_r2r(mm0,mm4); // unpack high src3 bytes
178 paddw_r2r(mm3,mm1); // add lows
179 paddw_r2r(mm4,mm2); // add highs
181 movq_m2r(*src4,mm5); // load 8 src4 bytes
182 movq_r2r(mm5,mm6); // copy 8 src4 bytes
184 punpcklbw_r2r(mm0,mm5); // unpack low src4 bytes
185 punpckhbw_r2r(mm0,mm6); // unpack high src4 bytes
187 paddw_r2r(mm5,mm1); // add lows
188 paddw_r2r(mm6,mm2); // add highs
190 // now have subtotal in mm1 and mm2
192 paddw_m2r(round4,mm1);
193 psraw_i2r(2,mm1); // /4
194 paddw_m2r(round4,mm2);
195 psraw_i2r(2,mm2); // /4
197 packuswb_r2r(mm2,mm1); // pack (w/ saturation)
198 movq_r2m(mm1,*dst); // store result in dst
201 static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
205 // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
208 movq_m2r(*src1,mm1); // load 8 src1 bytes
209 movq_r2r(mm1,mm2); // copy 8 src1 bytes
211 punpcklbw_r2r(mm0,mm1); // unpack low src1 bytes
212 punpckhbw_r2r(mm0,mm2); // unpack high src1 bytes
214 movq_m2r(*src2,mm3); // load 8 src2 bytes
215 movq_r2r(mm3,mm4); // copy 8 src2 bytes
217 punpcklbw_r2r(mm0,mm3); // unpack low src2 bytes
218 punpckhbw_r2r(mm0,mm4); // unpack high src2 bytes
220 paddw_r2r(mm3,mm1); // add lows
221 paddw_r2r(mm4,mm2); // add highs
223 // now have partials in mm1 and mm2
225 movq_m2r(*src3,mm3); // load 8 src3 bytes
226 movq_r2r(mm3,mm4); // copy 8 src3 bytes
228 punpcklbw_r2r(mm0,mm3); // unpack low src3 bytes
229 punpckhbw_r2r(mm0,mm4); // unpack high src3 bytes
231 paddw_r2r(mm3,mm1); // add lows
232 paddw_r2r(mm4,mm2); // add highs
234 movq_m2r(*src4,mm5); // load 8 src4 bytes
235 movq_r2r(mm5,mm6); // copy 8 src4 bytes
237 punpcklbw_r2r(mm0,mm5); // unpack low src4 bytes
238 punpckhbw_r2r(mm0,mm6); // unpack high src4 bytes
240 paddw_r2r(mm5,mm1); // add lows
241 paddw_r2r(mm6,mm2); // add highs
243 paddw_m2r(round4,mm1);
244 psraw_i2r(2,mm1); // /4
245 paddw_m2r(round4,mm2);
246 psraw_i2r(2,mm2); // /4
248 // now have subtotal/4 in mm1 and mm2
250 movq_m2r(*dst,mm3); // load 8 dst bytes
251 movq_r2r(mm3,mm4); // copy 8 dst bytes
253 punpcklbw_r2r(mm0,mm3); // unpack low dst bytes
254 punpckhbw_r2r(mm0,mm4); // unpack high dst bytes
256 paddw_r2r(mm3,mm1); // add lows
257 paddw_r2r(mm4,mm2); // add highs
259 paddw_m2r(round1,mm1);
260 psraw_i2r(1,mm1); // /2
261 paddw_m2r(round1,mm2);
262 psraw_i2r(1,mm2); // /2
264 // now have end value in mm1 and mm2
266 packuswb_r2r(mm2,mm1); // pack (w/ saturation)
267 movq_r2m(mm1,*dst); // store result in dst
/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/
/* Aliases mapping the generic pavg_* names to the 3DNow! unsigned byte
 * average instruction (pavgusb: (a + b + 1) / 2 on packed bytes).
 * NOTE(review): these aliases are not referenced by the MMX routines
 * visible in this file -- presumably boilerplate shared with the 3DNow!
 * variant of these routines; confirm before removing. */
#define pavg_r2r(src,dest) pavgusb_r2r (src, dest);
#define pavg_m2r(src,dest) pavgusb_m2r (src, dest);
/* MotionComponent_x_y_copy_<w>_<h>: copy a width x height reference block
 * from p_src to p_dest, no half-pel interpolation ("x_y" = integer-pel in
 * both directions).  Both areas use the same line stride i_stride. */
#define __MotionComponent_x_y_copy(width,height)                            \
void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        movq_m2r( *p_src, mm0 );        /* load 8 ref bytes */              \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_m2r( *(p_src + 8), mm1 );                                  \
        }                                                                   \
        p_src += i_stride;                                                  \
                                                                            \
        movq_r2m( mm0, *p_dest );       /* store 8 bytes at curr */         \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_r2m( mm1, *(p_dest + 8) );                                 \
        }                                                                   \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
/* MotionComponent_X_y_copy_<w>_<h>: copy with horizontal half-pel
 * interpolation ("X") -- each output byte is the rounded average of two
 * horizontally adjacent reference bytes. */
#define __MotionComponent_X_y_copy(width,height)                            \
void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
/* MotionComponent_x_Y_copy_<w>_<h>: copy with vertical half-pel
 * interpolation ("Y") -- averages each byte with the byte one line below
 * (p_next_src tracks p_src + i_stride). */
#define __MotionComponent_x_Y_copy(width,height)                            \
void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_next_src );                           \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );           \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
/* MotionComponent_X_Y_copy_<w>_<h>: copy with half-pel interpolation in
 * both directions -- four-tap average of the 2x2 neighbourhood (current
 * byte, right neighbour, and the same pair one line below). */
#define __MotionComponent_X_Y_copy(width,height)                            \
void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src,                  \
                     p_next_src + 1 );                                      \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_next_src + 8, p_next_src + 9 );                  \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
/* MotionComponent_x_y_avg_<w>_<h>: integer-pel prediction averaged into
 * the existing destination ("avg" = bidirectional blend):
 * dest = (dest + src + 1) / 2. */
#define __MotionComponent_x_y_avg(width,height)                             \
void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_dest, p_src );                               \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
/* MotionComponent_X_y_avg_<w>_<h>: horizontal half-pel prediction
 * averaged into the existing destination. */
#define __MotionComponent_X_y_avg(width,height)                             \
void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
/* MotionComponent_x_Y_avg_<w>_<h>: vertical half-pel prediction averaged
 * into the existing destination (p_next_src tracks p_src + i_stride). */
#define __MotionComponent_x_Y_avg(width,height)                             \
void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_next_src );                     \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );     \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
/* MotionComponent_X_Y_avg_<w>_<h>: half-pel prediction in both directions
 * (four-tap 2x2 average), averaged into the existing destination. */
#define __MotionComponent_X_Y_avg(width,height)                             \
void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                  /* mm0 = 0 for the unpack helpers */     \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src,            \
                           p_next_src + 1 );                                \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_next_src + 8, p_next_src + 9 );            \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
/* Expand all eight copy/avg x half-pel variants for one block size:
 * {integer, horizontal, vertical, both} half-pel x {copy, avg}. */
#define __MotionComponents(width,height)        \
__MotionComponent_x_y_copy(width,height)        \
__MotionComponent_X_y_copy(width,height)        \
__MotionComponent_x_Y_copy(width,height)        \
__MotionComponent_X_Y_copy(width,height)        \
__MotionComponent_x_y_avg(width,height)         \
__MotionComponent_X_y_avg(width,height)         \
__MotionComponent_x_Y_avg(width,height)         \
__MotionComponent_X_Y_avg(width,height)
/* Instantiate the motion-compensation routines for every block size
 * required by the supported chroma formats (4:4:4, 4:2:2, 4:2:0). */
__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */
__MotionComponents (8,16)       /* 422 */