]> git.sesse.net Git - vlc/blob - plugins/motion/vdec_motion_inner_mmx.c
* SDL compilation fix for FreeBSD.
[vlc] / plugins / motion / vdec_motion_inner_mmx.c
1 /*****************************************************************************
2  * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
3  *                           MMX
4  *****************************************************************************
5  * Copyright (C) 1999, 2000 VideoLAN
6  * $Id: vdec_motion_inner_mmx.c,v 1.3 2001/06/07 22:14:55 sam Exp $
7  *
8  * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
9  *          work done by the livid project <http://www.linuxvideo.org/>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  * 
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
24  *****************************************************************************/
25
26 #define MODULE_NAME motionmmx
27 #include "modules_inner.h"
28
29 /*****************************************************************************
30  * Preamble
31  *****************************************************************************/
32 #include "defs.h"
33
34 #include "config.h"
35 #include "common.h"
36 #include "threads.h"
37 #include "mtime.h"
38
39 #include "video.h"
40
41 #include "attributes.h"
42 #include "mmx.h"
43
44 /* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
45
46 /* Some rounding constants */
/* Rounding bias for the "/2" averages: 1 in each of the four 16-bit words,
 * added with paddw before the shift right by 1. */
47 mmx_t round1 = {0x0001000100010001LL};
/* Rounding bias for the "/4" averages: 2 in each of the four 16-bit words,
 * added with paddw before the shift right by 2. */
48 mmx_t round4 = {0x0002000200020002LL};
49
50 /*
51  * Useful functions
52  */
53
54 static __inline__ void MMXZeroReg()
55 {
56    /* load 0 into mm0 */
57    pxor_r2r(mm0,mm0);
58 }
59
60 static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
61 {
62    //
63    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
64    //
65
66    movq_m2r(*src1,mm1);        // load 8 src1 bytes
67    movq_r2r(mm1,mm2);          // copy 8 src1 bytes
68
69    movq_m2r(*src2,mm3);        // load 8 src2 bytes
70    movq_r2r(mm3,mm4);          // copy 8 src2 bytes
71
72    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
73    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes
74
75    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
76    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes
77
78    paddw_r2r(mm3,mm1);         // add lows to mm1
79    paddw_m2r(round1,mm1);
80    psraw_i2r(1,mm1);           // /2
81
82    paddw_r2r(mm4,mm2);         // add highs to mm2
83    paddw_m2r(round1,mm2);
84    psraw_i2r(1,mm2);           // /2
85
86    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
87    movq_r2m(mm1,*dst);         // store result in dst
88 }
89
90 static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
91 {
92    //
93    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
94    //
95
96    movq_m2r(*dst,mm1);            // load 8 dst bytes
97    movq_r2r(mm1,mm2);             // copy 8 dst bytes
98
99    movq_m2r(*src1,mm3);           // load 8 src1 bytes
100    movq_r2r(mm3,mm4);             // copy 8 src1 bytes
101
102    movq_m2r(*src2,mm5);           // load 8 src2 bytes
103    movq_r2r(mm5,mm6);             // copy 8 src2 bytes
104
105    punpcklbw_r2r(mm0,mm1);        // unpack low dst bytes
106    punpckhbw_r2r(mm0,mm2);        // unpack high dst bytes
107
108    punpcklbw_r2r(mm0,mm3);        // unpack low src1 bytes
109    punpckhbw_r2r(mm0,mm4);        // unpack high src1 bytes
110
111    punpcklbw_r2r(mm0,mm5);        // unpack low src2 bytes
112    punpckhbw_r2r(mm0,mm6);        // unpack high src2 bytes
113
114    paddw_r2r(mm5,mm3);            // add lows
115    paddw_m2r(round1,mm3);
116    psraw_i2r(1,mm3);              // /2
117
118    paddw_r2r(mm6,mm4);            // add highs
119    paddw_m2r(round1,mm4);
120    psraw_i2r(1,mm4);              // /2
121
122    paddw_r2r(mm3,mm1);            // add lows
123    paddw_m2r(round1,mm1);
124    psraw_i2r(1,mm1);              // /2
125
126    paddw_r2r(mm4,mm2);            // add highs
127    paddw_m2r(round1,mm2);
128    psraw_i2r(1,mm2);              // /2
129
130    packuswb_r2r(mm2,mm1);         // pack (w/ saturation)
131    movq_r2m(mm1,*dst);            // store result in dst
132 }
133
134 static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
135                                     u8 *src4 )
136 {
137    //
138    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
139    //
140
141    movq_m2r(*src1,mm1);                // load 8 src1 bytes
142    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
143
144    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
145    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
146
147    movq_m2r(*src2,mm3);                // load 8 src2 bytes
148    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
149
150    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
151    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
152
153    paddw_r2r(mm3,mm1);                 // add lows
154    paddw_r2r(mm4,mm2);                 // add highs
155
156    // now have partials in mm1 and mm2
157
158    movq_m2r(*src3,mm3);                // load 8 src3 bytes
159    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
160
161    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
162    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
163
164    paddw_r2r(mm3,mm1);                 // add lows
165    paddw_r2r(mm4,mm2);                 // add highs
166
167    movq_m2r(*src4,mm5);                // load 8 src4 bytes
168    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
169
170    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
171    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
172
173    paddw_r2r(mm5,mm1);                 // add lows
174    paddw_r2r(mm6,mm2);                 // add highs
175
176    // now have subtotal in mm1 and mm2
177
178    paddw_m2r(round4,mm1);
179    psraw_i2r(2,mm1);                   // /4
180    paddw_m2r(round4,mm2);
181    psraw_i2r(2,mm2);                   // /4
182
183    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
184    movq_r2m(mm1,*dst);                 // store result in dst
185 }
186
187 static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
188                                           u8 *src3, u8 *src4 )
189 {
190    //
191    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
192    //
193
194    movq_m2r(*src1,mm1);                // load 8 src1 bytes
195    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
196
197    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
198    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
199
200    movq_m2r(*src2,mm3);                // load 8 src2 bytes
201    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
202
203    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
204    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
205
206    paddw_r2r(mm3,mm1);                 // add lows
207    paddw_r2r(mm4,mm2);                 // add highs
208
209    // now have partials in mm1 and mm2
210
211    movq_m2r(*src3,mm3);                // load 8 src3 bytes
212    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
213
214    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
215    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
216
217    paddw_r2r(mm3,mm1);                 // add lows
218    paddw_r2r(mm4,mm2);                 // add highs
219
220    movq_m2r(*src4,mm5);                // load 8 src4 bytes
221    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
222
223    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
224    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
225
226    paddw_r2r(mm5,mm1);                 // add lows
227    paddw_r2r(mm6,mm2);                 // add highs
228
229    paddw_m2r(round4,mm1);
230    psraw_i2r(2,mm1);                   // /4
231    paddw_m2r(round4,mm2);
232    psraw_i2r(2,mm2);                   // /4
233
234    // now have subtotal/4 in mm1 and mm2
235
236    movq_m2r(*dst,mm3);                 // load 8 dst bytes
237    movq_r2r(mm3,mm4);                  // copy 8 dst bytes
238
239    punpcklbw_r2r(mm0,mm3);             // unpack low dst bytes
240    punpckhbw_r2r(mm0,mm4);             // unpack high dst bytes
241
242    paddw_r2r(mm3,mm1);                 // add lows
243    paddw_r2r(mm4,mm2);                 // add highs
244
245    paddw_m2r(round1,mm1);
246    psraw_i2r(1,mm1);                   // /2
247    paddw_m2r(round1,mm2);
248    psraw_i2r(1,mm2);                   // /2
249
250    // now have end value in mm1 and mm2
251
252    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
253    movq_r2m(mm1,*dst);                 // store result in dst
254 }
255
256
257 /*
258  * Actual Motion compensation
259  */
260
/*
 * Short aliases for the pavgusb wrappers (presumably provided by mmx.h).
 * None of the routines visible in this file use them.
 *
 * The expansions deliberately carry no trailing semicolon, so that
 * "if( c ) pavg_r2r( a, b ); else ..." stays a single statement.
 */
#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest)
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest)
263
/*
 * Generates MotionComponent_x_y_copy_<width>_<height>: a plain copy of a
 * <width> x <height> byte block from p_src to p_dest, one row per iteration,
 * both pointers advancing by i_stride bytes per row.
 *
 * width is expected to be 8 or 16: 8 bytes are always copied per row, and a
 * second group of 8 only when width == 16.
 *
 * mm0 (and mm1 for 16-wide blocks) are overwritten.
 */
#define __MotionComponent_x_y_copy(width,height) \
void _M(MotionComponent_x_y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                 yuv_data_t * p_dest, \
                                                 int i_stride) \
{ \
    int i_y; \
\
    /* No MMXZeroReg() needed: nothing here unpacks against mm0, and */ \
    /* mm0 is overwritten by the first load below.                   */ \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        movq_m2r( *p_src, mm0 );     /* load 8 ref bytes */ \
        if( width == 16 ) \
        { \
            movq_m2r( *(p_src + 8), mm1 ); \
        } \
        p_src += i_stride; \
\
        movq_r2m( mm0, *p_dest );    /* store 8 bytes at curr */ \
        if( width == 16 ) \
        { \
            movq_r2m( mm1, *(p_dest + 8) ); \
        } \
        p_dest += i_stride; \
    } \
}
286
/*
 * Generates MotionComponent_X_y_copy_<width>_<height>: every destination
 * byte becomes the rounded average of the source byte at the same position
 * and the source byte one position to its right (presumably the horizontal
 * half-sample case, going by the X_y naming).
 *
 * Rows are processed 8 bytes at a time through MMXAverage2(); a second group
 * of 8 is handled only when width == 16. Both pointers advance by i_stride
 * bytes per row.
 */
#define __MotionComponent_X_y_copy(width,height) \
void _M(MotionComponent_X_y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                 yuv_data_t * p_dest, \
                                                 int i_stride) \
{ \
    int i_line = 0; \
\
    /* MMXAverage2() unpacks against mm0, which must hold zero */ \
    MMXZeroReg(); \
\
    while( i_line < height ) \
    { \
        /* first 8 bytes of the row */ \
        MMXAverage2( p_dest, p_src, p_src + 1 ); \
\
        /* next 8 bytes, 16-wide blocks only */ \
        if( width == 16 ) \
        { \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 ); \
        } \
\
        /* move on to the next row */ \
        p_src += i_stride; \
        p_dest += i_stride; \
        i_line++; \
    } \
}
309
/*
 * Generates MotionComponent_x_Y_copy_<width>_<height>: every destination
 * byte becomes the rounded average of the source byte at the same position
 * and the source byte one row (i_stride bytes) further on (presumably the
 * vertical half-sample case, going by the x_Y naming).
 *
 * Rows are processed 8 bytes at a time through MMXAverage2(); a second group
 * of 8 is handled only when width == 16.
 */
#define __MotionComponent_x_Y_copy(width,height) \
void _M(MotionComponent_x_Y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                 yuv_data_t * p_dest, \
                                                 int i_stride) \
{ \
    int i_lines_left; \
\
    /* MMXAverage2() unpacks against mm0, which must hold zero */ \
    MMXZeroReg(); \
\
    for( i_lines_left = height; i_lines_left > 0; i_lines_left-- ) \
    { \
        /* the source row following the current one */ \
        yuv_data_t * p_below = p_src + i_stride; \
\
        MMXAverage2( p_dest, p_src, p_below ); \
\
        if( width == 16 ) \
        { \
            MMXAverage2( p_dest + 8, p_src + 8, p_below + 8 ); \
        } \
\
        p_src = p_below; \
        p_dest += i_stride; \
    } \
}
334
/*
 * Generates MotionComponent_X_Y_copy_<width>_<height>: every destination
 * byte becomes the rounded average of four source bytes - the one at the
 * same position, its right-hand neighbour, and the same two positions one
 * row (i_stride bytes) further on.
 *
 * Rows are processed 8 bytes at a time through MMXAverage4(); a second group
 * of 8 is handled only when width == 16.
 */
#define __MotionComponent_X_Y_copy(width,height) \
void _M(MotionComponent_X_Y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                 yuv_data_t * p_dest, \
                                                 int i_stride) \
{ \
    yuv_data_t * p_lower = p_src + i_stride;  /* row after p_src */ \
    int i_row = 0; \
\
    /* MMXAverage4() unpacks against mm0, which must hold zero */ \
    MMXZeroReg(); \
\
    while( i_row < height ) \
    { \
        MMXAverage4( p_dest, p_src, p_src + 1, p_lower, p_lower + 1 ); \
\
        if( width == 16 ) \
        { \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9, \
                         p_lower + 8, p_lower + 9 ); \
        } \
\
        /* slide all three pointers down one row */ \
        p_lower += i_stride; \
        p_src += i_stride; \
        p_dest += i_stride; \
        i_row++; \
    } \
}
360
/*
 * Generates MotionComponent_x_y_avg_<width>_<height>: every destination byte
 * is replaced by the rounded average of its current value and the source
 * byte at the same position.
 *
 * Rows are processed 8 bytes at a time through MMXAverage2(), with the
 * destination passed both as output and as first operand; a second group of
 * 8 is handled only when width == 16. Both pointers advance by i_stride
 * bytes per row.
 */
#define __MotionComponent_x_y_avg(width,height) \
void _M(MotionComponent_x_y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                yuv_data_t * p_dest, \
                                                int i_stride) \
{ \
    int i_lines_left = height; \
\
    /* MMXAverage2() unpacks against mm0, which must hold zero */ \
    MMXZeroReg(); \
\
    while( i_lines_left > 0 ) \
    { \
        /* dest = avg( dest, src ) on the first 8 bytes */ \
        MMXAverage2( p_dest, p_dest, p_src ); \
\
        /* same on the next 8 bytes, 16-wide blocks only */ \
        if( width == 16 ) \
        { \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 ); \
        } \
\
        p_src += i_stride; \
        p_dest += i_stride; \
        i_lines_left--; \
    } \
}
383
/*
 * Generates MotionComponent_X_y_avg_<width>_<height>: the source byte and
 * its right-hand neighbour are averaged, and that result is in turn averaged
 * with the byte already present in the destination.
 *
 * Rows are processed 8 bytes at a time through MMXInterpAverage2(); a second
 * group of 8 is handled only when width == 16. Both pointers advance by
 * i_stride bytes per row.
 */
#define __MotionComponent_X_y_avg(width,height) \
void _M(MotionComponent_X_y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                yuv_data_t * p_dest, \
                                                int i_stride) \
{ \
    int i_line; \
\
    /* MMXInterpAverage2() unpacks against mm0, which must hold zero */ \
    MMXZeroReg(); \
\
    for( i_line = height; i_line > 0; i_line-- ) \
    { \
        /* first 8 bytes of the row */ \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 ); \
\
        /* next 8 bytes, 16-wide blocks only */ \
        if( width == 16 ) \
        { \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 ); \
        } \
\
        /* move on to the next row */ \
        p_src += i_stride; \
        p_dest += i_stride; \
    } \
}
406
/*
 * Generates MotionComponent_x_Y_avg_<width>_<height>: the source byte and
 * the source byte one row (i_stride bytes) further on are averaged, and that
 * result is in turn averaged with the byte already present in the
 * destination.
 *
 * Rows are processed 8 bytes at a time through MMXInterpAverage2(); a second
 * group of 8 is handled only when width == 16.
 */
#define __MotionComponent_x_Y_avg(width,height) \
void _M(MotionComponent_x_Y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                yuv_data_t * p_dest, \
                                                int i_stride) \
{ \
    int i_row = 0; \
\
    /* MMXInterpAverage2() unpacks against mm0, which must hold zero */ \
    MMXZeroReg(); \
\
    while( i_row < height ) \
    { \
        /* the source row following the current one */ \
        yuv_data_t * p_below = p_src + i_stride; \
\
        MMXInterpAverage2( p_dest, p_src, p_below ); \
\
        if( width == 16 ) \
        { \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_below + 8 ); \
        } \
\
        p_src = p_below; \
        p_dest += i_stride; \
        i_row++; \
    } \
}
430
/*
 * Generates MotionComponent_X_Y_avg_<width>_<height>: four source bytes (the
 * one at the same position, its right-hand neighbour, and the same two
 * positions one row further on) are averaged, and that result is in turn
 * averaged with the byte already present in the destination.
 *
 * Rows are processed 8 bytes at a time through MMXInterpAverage4(); a second
 * group of 8 is handled only when width == 16.
 */
#define __MotionComponent_X_Y_avg(width,height) \
void _M(MotionComponent_X_Y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                yuv_data_t * p_dest, \
                                                int i_stride) \
{ \
    yuv_data_t * p_lower = p_src + i_stride;  /* row after p_src */ \
    int i_lines_left; \
\
    /* MMXInterpAverage4() unpacks against mm0, which must hold zero */ \
    MMXZeroReg(); \
\
    for( i_lines_left = height; i_lines_left > 0; i_lines_left-- ) \
    { \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, \
                           p_lower, p_lower + 1 ); \
\
        if( width == 16 ) \
        { \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9, \
                               p_lower + 8, p_lower + 9 ); \
        } \
\
        /* slide all three pointers down one row */ \
        p_lower += i_stride; \
        p_src += i_stride; \
        p_dest += i_stride; \
    } \
}
457
/*
 * Expands to the definitions of all eight routines for one block size: the
 * four "copy" variants (x_y, X_y, x_Y, X_Y) followed by the four "avg"
 * variants in the same order.
 */
458 #define __MotionComponents(width,height)                                    \
459 __MotionComponent_x_y_copy(width,height)                                    \
460 __MotionComponent_X_y_copy(width,height)                                    \
461 __MotionComponent_x_Y_copy(width,height)                                    \
462 __MotionComponent_X_Y_copy(width,height)                                    \
463 __MotionComponent_x_y_avg(width,height)                                     \
464 __MotionComponent_X_y_avg(width,height)                                     \
465 __MotionComponent_x_Y_avg(width,height)                                     \
466 __MotionComponent_X_Y_avg(width,height)
467
/* Instantiate the eight routines for each (width,height) block size. The
 * trailing comments presumably list the chroma formats that need each size
 * - TODO confirm against the caller. */
468 __MotionComponents (16,16)      /* 444, 422, 420 */
469 __MotionComponents (16,8)       /* 444, 422, 420 */
470 __MotionComponents (8,8)        /* 422, 420 */
471 __MotionComponents (8,4)        /* 420 */
/* The 8x16 routines are compiled out. */
472 #if 0
473 __MotionComponents (8,16)       /* 422 */
474 #endif