1 /*****************************************************************************
2 * vdec_motion_inner_mmxext.c : motion compensation inner routines optimized
4 *****************************************************************************
5 * Copyright (C) 1999, 2000 VideoLAN
6 * $Id: vdec_motion_inner_mmxext.c,v 1.2 2001/06/07 15:27:44 sam Exp $
8 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
9 * work done by the livid project <http://www.linuxvideo.org/>
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
24 *****************************************************************************/
26 #include "modules_inner.h"
28 /*****************************************************************************
30 *****************************************************************************/
40 #include "attributes.h"
43 /* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
/*
 * 64-bit constant holding the value 0x01 in each of its eight bytes.
 * The X_Y routines AND it into mm7 (pand_m2r) right before the psubusb
 * that adjusts the averaged result, so only bit 0 of every byte of the
 * correction term takes part in that subtraction.
 */
45 static mmx_t mask_one = {0x0101010101010101LL};
/*
 * Short aliases for the byte-wise packed-average helpers (pavgb_*).
 * Operand order is (src, dest): the result is left in `dest`.
 *
 * The expansions carry no trailing semicolon: the call sites in this file
 * terminate the statement themselves, so each alias expands to exactly one
 * statement and remains safe inside an unbraced if/else.
 */
51 #define pavg_r2r(src,dest) pavgb_r2r (src, dest)
52 #define pavg_m2r(src,dest) pavgb_m2r (src, dest)
/*
 * Generator for the plain-copy routine
 * _M(MotionComponent_x_y_copy_<width>_<height>).
 *
 * The statements below clear mm0..mm7 (each register XORed with itself),
 * then, once per iteration of the `height` loop, load 8 bytes from p_src
 * and 8 more from p_src + 8 and store them unchanged to p_dest and
 * p_dest + 8.
 * NOTE(review): the +8 accesses are presumably only meaningful for
 * width 16 -- confirm.
 */
54 #define __MotionComponent_x_y_copy(width,height) \
55 void _M(MotionComponent_x_y_copy_##width##_##height)(yuv_data_t * p_src, \
56 yuv_data_t * p_dest, \
61 pxor_r2r (mm0, mm0); \
62 pxor_r2r (mm1, mm1); \
63 pxor_r2r (mm2, mm2); \
64 pxor_r2r (mm3, mm3); \
65 pxor_r2r (mm4, mm4); \
66 pxor_r2r (mm5, mm5); \
67 pxor_r2r (mm6, mm6); \
68 pxor_r2r (mm7, mm7); \
70 for( i_y = 0; i_y < height; i_y ++ ) \
72 movq_m2r( *p_src, mm0 ); /* load 8 ref bytes */ \
74 movq_m2r( *(p_src + 8), mm1 ); \
77 movq_r2m( mm0, *p_dest ); /* store 8 bytes at curr */ \
79 movq_r2m( mm1, *(p_dest + 8) ); \
/*
 * Generator for _M(MotionComponent_X_y_copy_<width>_<height>).
 *
 * Per iteration of the `height` loop: the 8 bytes at p_src are averaged
 * (pavg, i.e. pavgb) with the 8 bytes starting one byte later (p_src + 1),
 * and the same is done for the pair p_src + 8 / p_src + 9. Both results
 * are written to p_dest / p_dest + 8, and p_dest then moves on by i_stride.
 * NOTE(review): presumably the horizontal half-pixel case, with the +8/+9
 * half only relevant for width 16 -- confirm.
 */
84 #define __MotionComponent_X_y_copy(width,height) \
85 void _M(MotionComponent_X_y_copy_##width##_##height)(yuv_data_t * p_src, \
86 yuv_data_t * p_dest, \
91 for( i_y = 0; i_y < height; i_y ++ ) \
93 movq_m2r (*p_src, mm0); \
95 movq_m2r (*(p_src + 8), mm1); \
96 pavg_m2r (*(p_src + 1), mm0); \
98 pavg_m2r (*(p_src + 9), mm1); \
99 movq_r2m (mm0, *p_dest); \
102 movq_r2m (mm1, *(p_dest + 8)); \
103 p_dest += i_stride; \
/*
 * Generator for _M(MotionComponent_x_Y_copy_<width>_<height>).
 *
 * p_next_src starts at p_src + i_stride. Per iteration of the `height`
 * loop the 8 bytes at p_src are averaged (pavgb) with the 8 bytes at
 * p_next_src, likewise for the +8 halves, and the results are stored to
 * p_dest / p_dest + 8. p_next_src and p_dest each advance by i_stride.
 * NOTE(review): presumably the vertical half-pixel case, with the +8
 * half only relevant for width 16 -- confirm.
 */
107 #define __MotionComponent_x_Y_copy(width,height) \
108 void _M(MotionComponent_x_Y_copy_##width##_##height)(yuv_data_t * p_src, \
109 yuv_data_t * p_dest, \
113 yuv_data_t * p_next_src = p_src + i_stride; \
115 for( i_y = 0; i_y < height; i_y ++ ) \
117 movq_m2r (*p_src, mm0); \
119 movq_m2r (*(p_src + 8), mm1); \
120 pavg_m2r (*(p_next_src), mm0); \
122 pavg_m2r (*(p_next_src + 8), mm1); \
123 movq_r2m (mm0, *p_dest); \
125 p_next_src += i_stride; \
127 movq_r2m (mm1, *(p_dest + 8)); \
128 p_dest += i_stride; \
/*
 * Generator for _M(MotionComponent_X_Y_copy_<width>_<height>).
 *
 * First loop: for each 8-byte half (offsets 0 and 8) four inputs are
 * combined -- a = p_src, b = p_src + 1, c = p_src + i_stride and
 * d = p_src + i_stride + 1 -- as avg(avg(a, d), avg(b, c)). Before the
 * store, a per-byte correction is subtracted with unsigned saturation:
 *     ((a ^ d) | (b ^ c)) & (avg(a, d) ^ avg(b, c)) & mask_one
 * NOTE(review): this looks like compensation for the upward rounding of
 * the chained pavgb averages -- confirm against the PAVGB definition.
 *
 * Second sequence: avg(a, b) and a ^ b of one row are computed ahead of
 * the loop; each iteration then computes the same two values for the row
 * read through p_src, combines them with the carried-over pair using the
 * same correction formula, stores 8 bytes, and carries the new pair over
 * (mm6 -> mm7, mm2 -> mm0) for the next iteration.
 * NOTE(review): presumably the two sequences are the width-16 and width-8
 * paths respectively -- confirm.
 */
132 #define __MotionComponent_X_Y_copy(width,height) \
133 void _M(MotionComponent_X_Y_copy_##width##_##height)(yuv_data_t * p_src, \
134 yuv_data_t * p_dest, \
141 for( i_y = 0; i_y < height; i_y ++ ) \
143 movq_m2r (*p_src, mm0); /* a */ \
144 movq_m2r (*(p_src+i_stride+1), mm1); /* d */ \
145 movq_r2r (mm0, mm7); \
146 movq_m2r (*(p_src+1), mm2); /* b */ \
147 pxor_r2r (mm1, mm7); /* mm7 = a ^ d */ \
148 movq_m2r (*(p_src + i_stride), mm3); /* c */ \
149 movq_r2r (mm2, mm6); \
150 pxor_r2r (mm3, mm6); /* mm6 = b ^ c */ \
151 pavg_r2r (mm1, mm0); /* mm0 = avg(a, d) */ \
152 pavg_r2r (mm3, mm2); /* mm2 = avg(b, c) */ \
153 por_r2r (mm6, mm7); /* mm7 = (a ^ d) | (b ^ c) */ \
154 movq_r2r (mm0, mm6); \
155 pxor_r2r (mm2, mm6); /* mm6 = avg(a, d) ^ avg(b, c) */ \
156 pand_r2r (mm6, mm7); \
157 pand_m2r (mask_one, mm7); /* keep bit 0 of each byte */ \
158 pavg_r2r (mm2, mm0); /* mm0 = avg of the two averages */ \
159 psubusb_r2r (mm7, mm0); /* subtract the correction term */ \
160 movq_r2m (mm0, *p_dest); \
162 movq_m2r (*(p_src+8), mm0); \
163 movq_m2r (*(p_src+i_stride+9), mm1); \
164 movq_r2r (mm0, mm7); \
165 movq_m2r (*(p_src+9), mm2); \
166 pxor_r2r (mm1, mm7); \
167 movq_m2r (*(p_src+i_stride+8), mm3); \
168 movq_r2r (mm2, mm6); \
169 pxor_r2r (mm3, mm6); \
170 pavg_r2r (mm1, mm0); \
171 pavg_r2r (mm3, mm2); \
172 por_r2r (mm6, mm7); \
173 movq_r2r (mm0, mm6); \
174 pxor_r2r (mm2, mm6); \
175 pand_r2r (mm6, mm7); \
176 pand_m2r (mask_one, mm7); \
177 pavg_r2r (mm2, mm0); \
178 psubusb_r2r (mm7, mm0); \
180 movq_r2m (mm0, *(p_dest+8)); \
181 p_dest += i_stride; \
186 movq_m2r (*p_src, mm0); \
187 movq_m2r (*(p_src+1), mm1); \
188 movq_r2r (mm0, mm7); \
189 pxor_r2r (mm1, mm7); \
190 pavg_r2r (mm1, mm0); \
193 for( i_y = 0; i_y < height; i_y ++ ) \
195 movq_m2r (*p_src, mm2); \
196 movq_r2r (mm0, mm5); \
197 movq_m2r (*(p_src+1), mm3); \
198 movq_r2r (mm2, mm6); \
199 pxor_r2r (mm3, mm6); \
200 pavg_r2r (mm3, mm2); \
201 por_r2r (mm6, mm7); \
202 pxor_r2r (mm2, mm5); \
203 pand_r2r (mm5, mm7); \
204 pavg_r2r (mm2, mm0); \
205 pand_m2r (mask_one, mm7); \
206 psubusb_r2r (mm7, mm0); \
208 movq_r2m (mm0, *p_dest); \
209 p_dest += i_stride; \
210 movq_r2r (mm6, mm7); \
211 movq_r2r (mm2, mm0); \
/*
 * Generator for _M(MotionComponent_x_y_avg_<width>_<height>).
 *
 * Per iteration of the `height` loop: 8 bytes from p_src and 8 from
 * p_src + 8 are averaged (pavgb) with the bytes already present at
 * p_dest / p_dest + 8, the results are written back to those same
 * destination locations, and p_dest advances by i_stride.
 * NOTE(review): the +8 half is presumably only relevant for width 16 --
 * confirm.
 */
216 #define __MotionComponent_x_y_avg(width,height) \
217 void _M(MotionComponent_x_y_avg_##width##_##height)(yuv_data_t * p_src, \
218 yuv_data_t * p_dest, \
223 for( i_y = 0; i_y < height; i_y ++ ) \
225 movq_m2r( *p_src, mm0 ); \
227 movq_m2r( *(p_src + 8), mm1 ); \
228 pavg_m2r( *p_dest, mm0 ); \
230 pavg_m2r( *(p_dest + 8), mm1 ); \
231 movq_r2m( mm0, *p_dest ); \
234 movq_r2m( mm1, *(p_dest + 8) ); \
235 p_dest += i_stride; \
/*
 * Generator for _M(MotionComponent_X_y_avg_<width>_<height>).
 *
 * Per iteration of the `height` loop: the 8 bytes at p_src are averaged
 * (pavgb) with the 8 bytes starting at p_src + 1, that result is averaged
 * again with the bytes already at p_dest, and the outcome is stored back
 * to p_dest. The same happens for the p_src + 8 / p_src + 9 / p_dest + 8
 * half. p_dest then advances by i_stride.
 * NOTE(review): presumably the horizontal half-pixel case combined with
 * the existing destination; the +8 half presumably applies to width 16
 * only -- confirm.
 */
239 #define __MotionComponent_X_y_avg(width,height) \
240 void _M(MotionComponent_X_y_avg_##width##_##height)(yuv_data_t * p_src, \
241 yuv_data_t * p_dest, \
246 for( i_y = 0; i_y < height; i_y ++ ) \
248 movq_m2r (*p_src, mm0); \
250 movq_m2r (*(p_src + 8), mm1); \
251 pavg_m2r (*(p_src + 1), mm0); \
253 pavg_m2r (*(p_src + 9), mm1); \
254 pavg_m2r (*p_dest, mm0); \
256 pavg_m2r (*(p_dest + 8), mm1); \
258 movq_r2m (mm0, *p_dest); \
260 movq_r2m (mm1, *(p_dest + 8)); \
261 p_dest += i_stride; \
/*
 * Generator for _M(MotionComponent_x_Y_avg_<width>_<height>).
 *
 * p_next_src starts at p_src + i_stride. Per iteration of the `height`
 * loop the 8 bytes at p_src are averaged (pavgb) with the 8 bytes at
 * p_next_src, that result is averaged again with the bytes already at
 * p_dest, and the outcome is stored back to p_dest; the +8 halves are
 * treated the same way. p_next_src and p_dest each advance by i_stride.
 * NOTE(review): presumably the vertical half-pixel case combined with the
 * existing destination; the +8 half presumably applies to width 16 only
 * -- confirm.
 */
265 #define __MotionComponent_x_Y_avg(width,height) \
266 void _M(MotionComponent_x_Y_avg_##width##_##height)(yuv_data_t * p_src, \
267 yuv_data_t * p_dest, \
271 yuv_data_t * p_next_src = p_src + i_stride; \
273 for( i_y = 0; i_y < height; i_y ++ ) \
275 movq_m2r (*p_src, mm0); \
277 movq_m2r (*(p_src + 8), mm1); \
278 pavg_m2r (*(p_next_src), mm0); \
280 pavg_m2r (*(p_next_src + 8), mm1); \
281 pavg_m2r (*p_dest, mm0); \
283 pavg_m2r (*(p_dest + 8), mm1); \
285 p_next_src += i_stride; \
286 movq_r2m (mm0, *p_dest); \
288 movq_r2m (mm1, *(p_dest + 8)); \
289 p_dest += i_stride; \
/*
 * Generator for _M(MotionComponent_X_Y_avg_<width>_<height>).
 *
 * First loop: for each 8-byte half (offsets 0 and 8) four inputs are
 * combined -- a = p_src, b = p_src + 1, c = p_src + i_stride and
 * d = p_src + i_stride + 1 -- as avg(avg(a, d), avg(b, c)), from which
 *     ((a ^ d) | (b ^ c)) & (avg(a, d) ^ avg(b, c)) & mask_one
 * is subtracted with unsigned saturation. The result is then averaged
 * with the bytes already at the destination and stored back there;
 * p_dest advances by i_stride.
 *
 * Second loop: the identical sequence, but only for the 8 bytes at
 * offset 0.
 * NOTE(review): the subtraction looks like compensation for the upward
 * rounding of the chained pavgb averages, and the two loops are presumably
 * the width-16 and width-8 paths -- confirm both.
 */
293 #define __MotionComponent_X_Y_avg(width,height) \
294 void _M(MotionComponent_X_Y_avg_##width##_##height)(yuv_data_t * p_src, \
295 yuv_data_t * p_dest, \
302 for( i_y = 0; i_y < height; i_y ++ ) \
304 movq_m2r (*p_src, mm0); \
305 movq_m2r (*(p_src+i_stride+1), mm1); \
306 movq_r2r (mm0, mm7); \
307 movq_m2r (*(p_src+1), mm2); \
308 pxor_r2r (mm1, mm7); \
309 movq_m2r (*(p_src+i_stride), mm3); \
310 movq_r2r (mm2, mm6); \
311 pxor_r2r (mm3, mm6); \
312 pavg_r2r (mm1, mm0); \
313 pavg_r2r (mm3, mm2); \
314 por_r2r (mm6, mm7); \
315 movq_r2r (mm0, mm6); \
316 pxor_r2r (mm2, mm6); \
317 pand_r2r (mm6, mm7); \
318 pand_m2r (mask_one, mm7); \
319 pavg_r2r (mm2, mm0); \
320 psubusb_r2r (mm7, mm0); \
321 movq_m2r (*p_dest, mm1); /* fetch the existing destination bytes */ \
322 pavg_r2r (mm1, mm0); /* and average them with the new value */ \
323 movq_r2m (mm0, *p_dest); \
325 movq_m2r (*(p_src+8), mm0); \
326 movq_m2r (*(p_src+i_stride+9), mm1); \
327 movq_r2r (mm0, mm7); \
328 movq_m2r (*(p_src+9), mm2); \
329 pxor_r2r (mm1, mm7); \
330 movq_m2r (*(p_src+i_stride+8), mm3); \
331 movq_r2r (mm2, mm6); \
332 pxor_r2r (mm3, mm6); \
333 pavg_r2r (mm1, mm0); \
334 pavg_r2r (mm3, mm2); \
335 por_r2r (mm6, mm7); \
336 movq_r2r (mm0, mm6); \
337 pxor_r2r (mm2, mm6); \
338 pand_r2r (mm6, mm7); \
339 pand_m2r (mask_one, mm7); \
340 pavg_r2r (mm2, mm0); \
341 psubusb_r2r (mm7, mm0); \
342 movq_m2r (*(p_dest+8), mm1); \
343 pavg_r2r (mm1, mm0); \
345 movq_r2m (mm0, *(p_dest+8)); \
346 p_dest += i_stride; \
351 for( i_y = 0; i_y < height; i_y ++ ) \
353 movq_m2r (*p_src, mm0); \
354 movq_m2r (*(p_src+i_stride+1), mm1); \
355 movq_r2r (mm0, mm7); \
356 movq_m2r (*(p_src+1), mm2); \
357 pxor_r2r (mm1, mm7); \
358 movq_m2r (*(p_src+i_stride), mm3); \
359 movq_r2r (mm2, mm6); \
360 pxor_r2r (mm3, mm6); \
361 pavg_r2r (mm1, mm0); \
362 pavg_r2r (mm3, mm2); \
363 por_r2r (mm6, mm7); \
364 movq_r2r (mm0, mm6); \
365 pxor_r2r (mm2, mm6); \
366 pand_r2r (mm6, mm7); \
367 pand_m2r (mask_one, mm7); \
368 pavg_r2r (mm2, mm0); \
369 psubusb_r2r (mm7, mm0); \
370 movq_m2r (*p_dest, mm1); \
371 pavg_r2r (mm1, mm0); \
373 movq_r2m (mm0, *p_dest); \
374 p_dest += i_stride; \
/*
 * Emits the complete family of eight routines for one (width, height)
 * pair by invoking each generator macro in turn: the four *_copy variants
 * (x_y, X_y, x_Y, X_Y) followed by the four matching *_avg variants.
 */
379 #define __MotionComponents(width,height) \
380 __MotionComponent_x_y_copy(width,height) \
381 __MotionComponent_X_y_copy(width,height) \
382 __MotionComponent_x_Y_copy(width,height) \
383 __MotionComponent_X_Y_copy(width,height) \
384 __MotionComponent_x_y_avg(width,height) \
385 __MotionComponent_X_y_avg(width,height) \
386 __MotionComponent_x_Y_avg(width,height) \
387 __MotionComponent_X_Y_avg(width,height)
/*
 * Instantiate the eight routines for every block size used: 16x16, 16x8,
 * 8x8, 8x4 and 8x16. The trailing notes presumably list the chroma
 * formats in which each size occurs -- confirm against the callers.
 */
389 __MotionComponents (16,16) /* 444, 422, 420 */
390 __MotionComponents (16,8) /* 444, 422, 420 */
391 __MotionComponents (8,8) /* 422, 420 */
392 __MotionComponents (8,4) /* 420 */
394 __MotionComponents (8,16) /* 422 */