1 /*****************************************************************************
2 * vdec_motion_inner_mmxext.c : motion compensation inner routines optimized in
4 *****************************************************************************
5 * Copyright (C) 1999, 2000 VideoLAN
6 * $Id: vdec_motion_inner_mmxext.c,v 1.1 2001/01/16 17:59:23 massiot Exp $
8 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
9 * work done by the livid project <http://www.linuxvideo.org/>
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
24 *****************************************************************************/
26 /*****************************************************************************
28 *****************************************************************************/
39 #include "stream_control.h"
40 #include "input_ext-dec.h"
43 #include "video_output.h"
45 #include "vdec_idct.h"
46 #include "video_decoder.h"
47 #include "vdec_motion.h"
49 #include "vpar_blocks.h"
50 #include "vpar_headers.h"
51 #include "vpar_synchro.h"
52 #include "video_parser.h"
53 #include "video_fifo.h"
55 #include "attributes.h"
58 /* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
/* 0x01 in each of the eight bytes. The X_Y (four-sample) routines AND it
 * into their correction term so that only bit 0 of every byte is kept. */
60 static mmx_t mask_one = {0x0101010101010101LL};
/* Byte-average shorthands: both simply forward to pavgb_r2r / pavgb_m2r,
 * which are defined outside this view (presumably wrappers around the PAVGB
 * instruction, i.e. a rounded-up unsigned byte average - TODO confirm).
 * Argument order is (source, destination register).
 * NOTE(review): each expansion already ends in ';', so a call written as
 * "pavg_r2r (a, b);" expands to two semicolons. That is harmless in a plain
 * statement sequence but would break an if/else whose if-branch is only
 * this call. */
66 #define pavg_r2r(src,dest) pavgb_r2r (src, dest);
67 #define pavg_m2r(src,dest) pavgb_m2r (src, dest);
/* Generates MotionComponent_x_y_copy_<width>_<height>(): a straight copy.
 * The visible body XORs each of mm0..mm7 with itself (leaving them zero;
 * mm0 and mm1 are overwritten by the loads below), then for each of the
 * `height` rows loads 8 bytes from p_src and 8 from p_src + 8 and stores
 * them unchanged at p_dest and p_dest + 8.
 * NOTE(review): the rest of the parameter list, the braces, any test on
 * `width` and the per-row pointer advances are not visible in the excerpt
 * reviewed - confirm against the full definition. */
69 #define __MotionComponent_x_y_copy(width,height) \
70 void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src, \
71 yuv_data_t * p_dest, \
76 pxor_r2r (mm0, mm0); \
77 pxor_r2r (mm1, mm1); \
78 pxor_r2r (mm2, mm2); \
79 pxor_r2r (mm3, mm3); \
80 pxor_r2r (mm4, mm4); \
81 pxor_r2r (mm5, mm5); \
82 pxor_r2r (mm6, mm6); \
83 pxor_r2r (mm7, mm7); \
85 for( i_y = 0; i_y < height; i_y ++ ) \
87 movq_m2r( *p_src, mm0 ); /* load 8 ref bytes */ \
89 movq_m2r( *(p_src + 8), mm1 ); /* load ref bytes 8..15 */ \
92 movq_r2m( mm0, *p_dest ); /* store 8 bytes at curr */ \
94 movq_r2m( mm1, *(p_dest + 8) ); /* store bytes 8..15 at curr */ \
/* Generates MotionComponent_X_y_copy_<width>_<height>(): every output byte
 * is the pavg of a source byte and its right-hand neighbour (p_src[n] and
 * p_src[n + 1]). Bytes 0..7 go through mm0, bytes 8..15 through mm1, and
 * the result is stored to p_dest; p_dest then moves down one row by
 * i_stride.
 * NOTE(review): the rest of the parameter list, the braces, any test on
 * `width` and the p_src advance are not visible in the excerpt reviewed. */
99 #define __MotionComponent_X_y_copy(width,height) \
100 void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src, \
101 yuv_data_t * p_dest, \
106 for( i_y = 0; i_y < height; i_y ++ ) \
108 movq_m2r (*p_src, mm0); /* bytes 0..7 */ \
110 movq_m2r (*(p_src + 8), mm1); /* bytes 8..15 */ \
111 pavg_m2r (*(p_src + 1), mm0); /* average with bytes 1..8 */ \
113 pavg_m2r (*(p_src + 9), mm1); /* average with bytes 9..16 */ \
114 movq_r2m (mm0, *p_dest); \
117 movq_r2m (mm1, *(p_dest + 8)); \
118 p_dest += i_stride; \
/* Generates MotionComponent_x_Y_copy_<width>_<height>(): every output byte
 * is the pavg of a source byte and the byte directly below it. p_next_src
 * starts one row (i_stride) below p_src and is advanced by i_stride per
 * iteration, as is p_dest.
 * NOTE(review): the rest of the parameter list, the braces, any test on
 * `width` and the p_src advance are not visible in the excerpt reviewed. */
122 #define __MotionComponent_x_Y_copy(width,height) \
123 void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src, \
124 yuv_data_t * p_dest, \
128 yuv_data_t * p_next_src = p_src + i_stride; \
130 for( i_y = 0; i_y < height; i_y ++ ) \
132 movq_m2r (*p_src, mm0); /* current row, bytes 0..7 */ \
134 movq_m2r (*(p_src + 8), mm1); /* current row, bytes 8..15 */ \
135 pavg_m2r (*(p_next_src), mm0); /* average with the row below */ \
137 pavg_m2r (*(p_next_src + 8), mm1); \
138 movq_r2m (mm0, *p_dest); \
140 p_next_src += i_stride; \
142 movq_r2m (mm1, *(p_dest + 8)); \
143 p_dest += i_stride; \
/* Generates MotionComponent_X_Y_copy_<width>_<height>(): every output byte
 * is built from four source bytes,
 *     a = p_src[n]              b = p_src[n + 1]
 *     c = p_src[n + i_stride]   d = p_src[n + i_stride + 1]
 * by averaging two pairs, averaging the two results, and then subtracting a
 * one-bit correction per byte:
 *     ((x1 ^ y1) | (x2 ^ y2)) & (avg(x1,y1) ^ avg(x2,y2)) & 0x01
 * NOTE(review): presumably this undoes the double round-up of the cascaded
 * averages so the result equals (a + b + c + d + 2) >> 2 - confirm against
 * the pavgb definition.
 *
 * Two code paths are visible:
 *  - the first loop pairs the diagonals (a,d) and (b,c) and handles bytes
 *    0..7 and then bytes 8..15 of each row;
 *  - the statements after it handle a single 8-byte column, pairing
 *    horizontally (a,b)/(c,d) and carrying the previous row's average (mm0)
 *    and XOR (mm7) from one iteration to the next.
 * NOTE(review): whatever selects between the two paths (presumably a test on
 * `width`), the rest of the parameter list, the braces and the p_src advances
 * are not visible in the excerpt reviewed. */
147 #define __MotionComponent_X_Y_copy(width,height) \
148 void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src, \
149 yuv_data_t * p_dest, \
156 for( i_y = 0; i_y < height; i_y ++ ) \
158 movq_m2r (*p_src, mm0); /* mm0 = a */ \
159 movq_m2r (*(p_src+i_stride+1), mm1); /* mm1 = d */ \
160 movq_r2r (mm0, mm7); \
161 movq_m2r (*(p_src+1), mm2); /* mm2 = b */ \
162 pxor_r2r (mm1, mm7); /* mm7 = a ^ d */ \
163 movq_m2r (*(p_src + i_stride), mm3); /* mm3 = c */ \
164 movq_r2r (mm2, mm6); \
165 pxor_r2r (mm3, mm6); /* mm6 = b ^ c */ \
166 pavg_r2r (mm1, mm0); /* mm0 = avg(a, d) */ \
167 pavg_r2r (mm3, mm2); /* mm2 = avg(b, c) */ \
168 por_r2r (mm6, mm7); /* mm7 = (a ^ d) | (b ^ c) */ \
169 movq_r2r (mm0, mm6); \
170 pxor_r2r (mm2, mm6); /* mm6 = avg(a, d) ^ avg(b, c) */ \
171 pand_r2r (mm6, mm7); \
172 pand_m2r (mask_one, mm7); /* keep bit 0 of each byte */ \
173 pavg_r2r (mm2, mm0); /* average of the two averages */ \
174 psubusb_r2r (mm7, mm0); /* subtract the correction */ \
175 movq_r2m (mm0, *p_dest); \
177 movq_m2r (*(p_src+8), mm0); /* same computation, bytes 8..15 */ \
178 movq_m2r (*(p_src+i_stride+9), mm1); \
179 movq_r2r (mm0, mm7); \
180 movq_m2r (*(p_src+9), mm2); \
181 pxor_r2r (mm1, mm7); \
182 movq_m2r (*(p_src+i_stride+8), mm3); \
183 movq_r2r (mm2, mm6); \
184 pxor_r2r (mm3, mm6); \
185 pavg_r2r (mm1, mm0); \
186 pavg_r2r (mm3, mm2); \
187 por_r2r (mm6, mm7); \
188 movq_r2r (mm0, mm6); \
189 pxor_r2r (mm2, mm6); \
190 pand_r2r (mm6, mm7); \
191 pand_m2r (mask_one, mm7); \
192 pavg_r2r (mm2, mm0); \
193 psubusb_r2r (mm7, mm0); \
195 movq_r2m (mm0, *(p_dest+8)); \
196 p_dest += i_stride; \
201 movq_m2r (*p_src, mm0); /* single-column path: first row */ \
202 movq_m2r (*(p_src+1), mm1); \
203 movq_r2r (mm0, mm7); \
204 pxor_r2r (mm1, mm7); /* mm7 = a ^ b of the first row */ \
205 pavg_r2r (mm1, mm0); /* mm0 = avg(a, b) of the first row */ \
208 for( i_y = 0; i_y < height; i_y ++ ) \
210 movq_m2r (*p_src, mm2); /* mm2 = c */ \
211 movq_r2r (mm0, mm5); /* mm5 = previous row's average */ \
212 movq_m2r (*(p_src+1), mm3); /* mm3 = d */ \
213 movq_r2r (mm2, mm6); \
214 pxor_r2r (mm3, mm6); /* mm6 = c ^ d */ \
215 pavg_r2r (mm3, mm2); /* mm2 = avg(c, d) */ \
216 por_r2r (mm6, mm7); /* mm7 = (a ^ b) | (c ^ d) */ \
217 pxor_r2r (mm2, mm5); /* mm5 = avg(a, b) ^ avg(c, d) */ \
218 pand_r2r (mm5, mm7); \
219 pavg_r2r (mm2, mm0); /* average of the two row averages */ \
220 pand_m2r (mask_one, mm7); /* keep bit 0 of each byte */ \
221 psubusb_r2r (mm7, mm0); /* subtract the correction */ \
223 movq_r2m (mm0, *p_dest); \
224 p_dest += i_stride; \
225 movq_r2r (mm6, mm7); /* this row's XOR becomes the "previous" */ \
226 movq_r2r (mm2, mm0); /* this row's average becomes the "previous" */ \
/* Generates MotionComponent_x_y_avg_<width>_<height>(): each source byte is
 * pavg-ed with the byte already present in p_dest and the result is written
 * back to p_dest. Bytes 0..7 go through mm0, bytes 8..15 through mm1; p_dest
 * advances by i_stride per iteration.
 * NOTE(review): the rest of the parameter list, the braces, any test on
 * `width` and the p_src advance are not visible in the excerpt reviewed. */
231 #define __MotionComponent_x_y_avg(width,height) \
232 void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src, \
233 yuv_data_t * p_dest, \
238 for( i_y = 0; i_y < height; i_y ++ ) \
240 movq_m2r( *p_src, mm0 ); \
242 movq_m2r( *(p_src + 8), mm1 ); \
243 pavg_m2r( *p_dest, mm0 ); /* blend with the existing destination */ \
245 pavg_m2r( *(p_dest + 8), mm1 ); \
246 movq_r2m( mm0, *p_dest ); \
249 movq_r2m( mm1, *(p_dest + 8) ); \
250 p_dest += i_stride; \
/* Generates MotionComponent_X_y_avg_<width>_<height>(): each source byte is
 * first pavg-ed with its right-hand neighbour (p_src[n + 1]), then that
 * value is pavg-ed with the byte already in p_dest and written back.
 * p_dest advances by i_stride per iteration.
 * NOTE(review): the rest of the parameter list, the braces, any test on
 * `width` and the p_src advance are not visible in the excerpt reviewed. */
254 #define __MotionComponent_X_y_avg(width,height) \
255 void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src, \
256 yuv_data_t * p_dest, \
261 for( i_y = 0; i_y < height; i_y ++ ) \
263 movq_m2r (*p_src, mm0); \
265 movq_m2r (*(p_src + 8), mm1); \
266 pavg_m2r (*(p_src + 1), mm0); /* horizontal average */ \
268 pavg_m2r (*(p_src + 9), mm1); \
269 pavg_m2r (*p_dest, mm0); /* blend with the existing destination */ \
271 pavg_m2r (*(p_dest + 8), mm1); \
273 movq_r2m (mm0, *p_dest); \
275 movq_r2m (mm1, *(p_dest + 8)); \
276 p_dest += i_stride; \
/* Generates MotionComponent_x_Y_avg_<width>_<height>(): each source byte is
 * first pavg-ed with the byte directly below it (read through p_next_src,
 * which starts at p_src + i_stride), then that value is pavg-ed with the
 * byte already in p_dest and written back. p_next_src and p_dest both
 * advance by i_stride per iteration.
 * NOTE(review): the rest of the parameter list, the braces, any test on
 * `width` and the p_src advance are not visible in the excerpt reviewed. */
280 #define __MotionComponent_x_Y_avg(width,height) \
281 void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src, \
282 yuv_data_t * p_dest, \
286 yuv_data_t * p_next_src = p_src + i_stride; \
288 for( i_y = 0; i_y < height; i_y ++ ) \
290 movq_m2r (*p_src, mm0); \
292 movq_m2r (*(p_src + 8), mm1); \
293 pavg_m2r (*(p_next_src), mm0); /* vertical average */ \
295 pavg_m2r (*(p_next_src + 8), mm1); \
296 pavg_m2r (*p_dest, mm0); /* blend with the existing destination */ \
298 pavg_m2r (*(p_dest + 8), mm1); \
300 p_next_src += i_stride; \
301 movq_r2m (mm0, *p_dest); \
303 movq_r2m (mm1, *(p_dest + 8)); \
304 p_dest += i_stride; \
/* Generates MotionComponent_X_Y_avg_<width>_<height>(): every output byte is
 * built from four source bytes,
 *     a = p_src[n]              b = p_src[n + 1]
 *     c = p_src[n + i_stride]   d = p_src[n + i_stride + 1]
 * as avg(avg(a,d), avg(b,c)) minus the one-bit correction
 *     ((a ^ d) | (b ^ c)) & (avg(a,d) ^ avg(b,c)) & 0x01
 * and that value is then pavg-ed with the byte already in p_dest before
 * being written back.
 * NOTE(review): presumably the correction makes the four-sample value equal
 * (a + b + c + d + 2) >> 2 - confirm against the pavgb definition.
 *
 * Two loops are visible: the first handles bytes 0..7 and then 8..15 of each
 * row, the second handles a single 8-byte column with the same per-row
 * computation (no state is carried between iterations here).
 * NOTE(review): whatever selects between the two loops (presumably a test on
 * `width`), the rest of the parameter list, the braces and the p_src advances
 * are not visible in the excerpt reviewed. */
308 #define __MotionComponent_X_Y_avg(width,height) \
309 void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src, \
310 yuv_data_t * p_dest, \
317 for( i_y = 0; i_y < height; i_y ++ ) \
319 movq_m2r (*p_src, mm0); /* mm0 = a */ \
320 movq_m2r (*(p_src+i_stride+1), mm1); /* mm1 = d */ \
321 movq_r2r (mm0, mm7); \
322 movq_m2r (*(p_src+1), mm2); /* mm2 = b */ \
323 pxor_r2r (mm1, mm7); /* mm7 = a ^ d */ \
324 movq_m2r (*(p_src+i_stride), mm3); /* mm3 = c */ \
325 movq_r2r (mm2, mm6); \
326 pxor_r2r (mm3, mm6); /* mm6 = b ^ c */ \
327 pavg_r2r (mm1, mm0); /* mm0 = avg(a, d) */ \
328 pavg_r2r (mm3, mm2); /* mm2 = avg(b, c) */ \
329 por_r2r (mm6, mm7); /* mm7 = (a ^ d) | (b ^ c) */ \
330 movq_r2r (mm0, mm6); \
331 pxor_r2r (mm2, mm6); /* mm6 = avg(a, d) ^ avg(b, c) */ \
332 pand_r2r (mm6, mm7); \
333 pand_m2r (mask_one, mm7); /* keep bit 0 of each byte */ \
334 pavg_r2r (mm2, mm0); /* average of the two averages */ \
335 psubusb_r2r (mm7, mm0); /* subtract the correction */ \
336 movq_m2r (*p_dest, mm1); /* existing destination bytes */ \
337 pavg_r2r (mm1, mm0); /* blend prediction with destination */ \
338 movq_r2m (mm0, *p_dest); \
340 movq_m2r (*(p_src+8), mm0); /* same computation, bytes 8..15 */ \
341 movq_m2r (*(p_src+i_stride+9), mm1); \
342 movq_r2r (mm0, mm7); \
343 movq_m2r (*(p_src+9), mm2); \
344 pxor_r2r (mm1, mm7); \
345 movq_m2r (*(p_src+i_stride+8), mm3); \
346 movq_r2r (mm2, mm6); \
347 pxor_r2r (mm3, mm6); \
348 pavg_r2r (mm1, mm0); \
349 pavg_r2r (mm3, mm2); \
350 por_r2r (mm6, mm7); \
351 movq_r2r (mm0, mm6); \
352 pxor_r2r (mm2, mm6); \
353 pand_r2r (mm6, mm7); \
354 pand_m2r (mask_one, mm7); \
355 pavg_r2r (mm2, mm0); \
356 psubusb_r2r (mm7, mm0); \
357 movq_m2r (*(p_dest+8), mm1); \
358 pavg_r2r (mm1, mm0); \
360 movq_r2m (mm0, *(p_dest+8)); \
361 p_dest += i_stride; \
366 for( i_y = 0; i_y < height; i_y ++ ) \
368 movq_m2r (*p_src, mm0); /* single-column loop: mm0 = a */ \
369 movq_m2r (*(p_src+i_stride+1), mm1); \
370 movq_r2r (mm0, mm7); \
371 movq_m2r (*(p_src+1), mm2); \
372 pxor_r2r (mm1, mm7); \
373 movq_m2r (*(p_src+i_stride), mm3); \
374 movq_r2r (mm2, mm6); \
375 pxor_r2r (mm3, mm6); \
376 pavg_r2r (mm1, mm0); \
377 pavg_r2r (mm3, mm2); \
378 por_r2r (mm6, mm7); \
379 movq_r2r (mm0, mm6); \
380 pxor_r2r (mm2, mm6); \
381 pand_r2r (mm6, mm7); \
382 pand_m2r (mask_one, mm7); \
383 pavg_r2r (mm2, mm0); \
384 psubusb_r2r (mm7, mm0); \
385 movq_m2r (*p_dest, mm1); /* existing destination bytes */ \
386 pavg_r2r (mm1, mm0); /* blend prediction with destination */ \
388 movq_r2m (mm0, *p_dest); \
389 p_dest += i_stride; \
/* Expands to all eight routine definitions for one (width, height) pair:
 * the four *_copy generators followed by the four *_avg generators. */
394 #define __MotionComponents(width,height) \
395 __MotionComponent_x_y_copy(width,height) \
396 __MotionComponent_X_y_copy(width,height) \
397 __MotionComponent_x_Y_copy(width,height) \
398 __MotionComponent_X_Y_copy(width,height) \
399 __MotionComponent_x_y_avg(width,height) \
400 __MotionComponent_X_y_avg(width,height) \
401 __MotionComponent_x_Y_avg(width,height) \
402 __MotionComponent_X_Y_avg(width,height)
/* Emit the eight routines for each (width, height) block size. The trailing
 * comments are the original author's notes (presumably the chroma formats
 * that need each size - TODO confirm). */
404 __MotionComponents (16,16) /* 444, 422, 420 */
405 __MotionComponents (16,8) /* 444, 422, 420 */
406 __MotionComponents (8,8) /* 422, 420 */
407 __MotionComponents (8,4) /* 420 */
409 __MotionComponents (8,16) /* 422 */