/*****************************************************************************
 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
 *                           MMX
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 *
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
 *          work done by the livid project <http://www.linuxvideo.org/>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include <sys/types.h>                        /* on BSD, uio.h needs types.h */
#include <sys/uio.h>                          /* for input.h */

#include "common.h"                           /* u8 */

#include "stream_control.h"
#include "input_ext-dec.h"

#include "video.h"                            /* yuv_data_t */
#include "video_output.h"

#include "vdec_idct.h"
#include "video_decoder.h"
#include "vdec_motion.h"

#include "vpar_blocks.h"
#include "vpar_headers.h"
#include "vpar_synchro.h"
#include "video_parser.h"
#include "video_fifo.h"

#include "mmx.h"                              /* mmx_t, movq_m2r() and friends */
/* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */

/* Rounding constants: four packed 16-bit words, added before the
 * arithmetic right shift so that the divisions round to nearest
 * (+1 before >>1, +2 before >>2) */
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
static __inline__ void MMXZeroReg()
{
    /* load 0 into mm0 */
    pxor_r2r(mm0,mm0);
}
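/* All helpers below assume mm0 has been zeroed with MMXZeroReg():
 * punpcklbw/punpckhbw against mm0 widen the unsigned bytes to 16-bit
 * words, leaving enough headroom for the rounded additions before the
 * results are packed back down. */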
static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows to mm1
    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs to mm2
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
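/* The intermediate sums cannot overflow: 255 + 255 + 1 = 511 fits
 * comfortably in a signed 16-bit word, and packuswb saturates the final
 * words back into the u8 range. */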
static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
    //

    movq_m2r(*dst,mm1);         // load 8 dst bytes
    movq_r2r(mm1,mm2);          // copy 8 dst bytes

    movq_m2r(*src1,mm3);        // load 8 src1 bytes
    movq_r2r(mm3,mm4);          // copy 8 src1 bytes

    movq_m2r(*src2,mm5);        // load 8 src2 bytes
    movq_r2r(mm5,mm6);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src2 bytes

    paddw_r2r(mm5,mm3);         // add lows
    paddw_m2r(round1,mm3);      // round
    psraw_i2r(1,mm3);           // /2

    paddw_r2r(mm6,mm4);         // add highs
    paddw_m2r(round1,mm4);      // round
    psraw_i2r(1,mm4);           // /2

    paddw_r2r(mm3,mm1);         // add lows
    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
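/* The two successive round-and-shift stages match the commented formula:
 * first the half-pel average of src1 and src2, then the average with the
 * existing prediction in *dst, each stage rounding to nearest (upward on
 * ties), as MPEG-2 integer division requires. */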
static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
                                    u8 *src4 )
{
    //
    // *dst = clip_to_u8((*src1 + *src2 + *src3 + *src4 + 2)/4);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r(round4,mm1);      // round
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // round
    psraw_i2r(2,mm2);           // /4

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
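/* Worst case before the shift is 4 * 255 + 2 = 1022, still far below the
 * signed 16-bit limit, so the word-wise additions cannot wrap. */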
static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                          u8 *src3, u8 *src4 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    paddw_m2r(round4,mm1);      // round
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);      // round
    psraw_i2r(2,mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r(*dst,mm3);         // load 8 dst bytes
    movq_r2r(mm3,mm4);          // copy 8 dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high dst bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    paddw_m2r(round1,mm1);      // round
    psraw_i2r(1,mm1);           // /2
    paddw_m2r(round1,mm2);      // round
    psraw_i2r(1,mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/
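/* Naming convention: MotionComponent_<h>_<v>_<mode>_<width>_<height>.
 * A lowercase x or y means the motion vector is full-pel in that
 * dimension; an uppercase X or Y means half-pel, i.e. the prediction is
 * interpolated between two neighbouring pels. "copy" overwrites the
 * destination block, while "avg" averages the new prediction with the
 * data already in place (used e.g. for the second half of a
 * bidirectional prediction). */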
#define __MotionComponent_x_y_copy(width,height)                            \
void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        movq_m2r( *p_src, mm1 );       /* load 8 ref bytes */               \
        movq_r2m( mm1, *p_dest );      /* store 8 bytes at curr */          \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_m2r( *(p_src + 8), mm1 );   /* load 8 ref bytes */         \
            movq_r2m( mm1, *(p_dest + 8) );  /* store 8 bytes at curr */    \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
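/* width is a compile-time constant in every instantiation, so the
 * if( width == 16 ) test above is folded away by the compiler: the
 * 8-pel-wide versions contain no branch at all. The same trick is used
 * in all of the macros below. */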
#define __MotionComponent_X_y_copy(width,height)                            \
void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_x_Y_copy(width,height)                            \
void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_next_src );                           \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 );           \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_X_Y_copy(width,height)                            \
void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src, p_next_src + 1 );\
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_next_src + 8, p_next_src + 9 );                  \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponent_x_y_avg(width,height)                             \
void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXAverage2( p_dest, p_dest, p_src );                               \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_X_y_avg(width,height)                             \
void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
#define __MotionComponent_x_Y_avg(width,height)                             \
void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_x, i_y;                                                           \
    unsigned int i_dummy;                                                   \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        for( i_x = 0; i_x < width; i_x++ )                                  \
        {                                                                   \
            i_dummy =                                                       \
                p_dest[i_x] + ((unsigned int)(p_src[i_x]                    \
                                              + p_src[i_x + i_stride]       \
                                              + 1) >> 1);                   \
            p_dest[i_x] = (i_dummy + 1) >> 1;                               \
        }                                                                   \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
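/* Unlike its siblings, this variant is written in plain C. No explicit
 * saturation is needed: with 8-bit inputs, the half-pel average is at
 * most 255, so the final (255 + 255 + 1) >> 1 = 255 always fits back
 * into a u8. */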
#define __MotionComponent_X_Y_avg(width,height)                             \
void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
    yuv_data_t * p_next_src = p_src + i_stride;                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src,            \
                           p_next_src + 1 );                                \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_next_src + 8, p_next_src + 9 );            \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
        p_next_src += i_stride;                                             \
    }                                                                       \
}
#define __MotionComponents(width,height)                                    \
__MotionComponent_x_y_copy(width,height)                                    \
__MotionComponent_X_y_copy(width,height)                                    \
__MotionComponent_x_Y_copy(width,height)                                    \
__MotionComponent_X_Y_copy(width,height)                                    \
__MotionComponent_x_y_avg(width,height)                                     \
__MotionComponent_X_y_avg(width,height)                                     \
__MotionComponent_x_Y_avg(width,height)                                     \
__MotionComponent_X_Y_avg(width,height)
__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */

__MotionComponents (8,16)       /* 422 */
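/* Hypothetical usage from a motion compensation dispatcher (pointer
 * names are illustrative only): copy the forward full-pel prediction,
 * then average in a backward prediction that is half-pel in both
 * dimensions:
 *
 *     MotionComponent_x_y_copy_16_16( p_fwd_ref, p_dest, i_stride );
 *     MotionComponent_X_Y_avg_16_16( p_bwd_ref, p_dest, i_stride );
 */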