]> git.sesse.net Git - vlc/blob - plugins/motion/vdec_motion_inner_mmx.c
* Borrowed LiViD's MMX and MMX EXT IDCT.
[vlc] / plugins / motion / vdec_motion_inner_mmx.c
1 /*****************************************************************************
2  * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
3  *                           MMX
4  *****************************************************************************
5  * Copyright (C) 1999, 2000 VideoLAN
6  * $Id: vdec_motion_inner_mmx.c,v 1.1 2001/01/18 05:13:22 sam Exp $
7  *
8  * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
9  *          work done by the livid project <http://www.linuxvideo.org/>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  * 
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
24  *****************************************************************************/
25
26 /*****************************************************************************
27  * Preamble
28  *****************************************************************************/
29 #include "defs.h"
30
31 #include "config.h"
32 #include "common.h"
33 #include "threads.h"
34 #include "mtime.h"
35
36 #include "video.h"
37
38 #include "attributes.h"
39 #include "mmx.h"
40
41 /* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
42
43 /* Some rounding constants */
44 mmx_t round1 = {0x0001000100010001LL};
45 mmx_t round4 = {0x0002000200020002LL};
46
47 /*
48  * Useful functions
49  */
50
51 static __inline__ void MMXZeroReg()
52 {
53    /* load 0 into mm0 */
54    pxor_r2r(mm0,mm0);
55 }
56
57 static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
58 {
59    //
60    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
61    //
62
63    movq_m2r(*src1,mm1);        // load 8 src1 bytes
64    movq_r2r(mm1,mm2);          // copy 8 src1 bytes
65
66    movq_m2r(*src2,mm3);        // load 8 src2 bytes
67    movq_r2r(mm3,mm4);          // copy 8 src2 bytes
68
69    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
70    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes
71
72    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
73    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes
74
75    paddw_r2r(mm3,mm1);         // add lows to mm1
76    paddw_m2r(round1,mm1);
77    psraw_i2r(1,mm1);           // /2
78
79    paddw_r2r(mm4,mm2);         // add highs to mm2
80    paddw_m2r(round1,mm2);
81    psraw_i2r(1,mm2);           // /2
82
83    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
84    movq_r2m(mm1,*dst);         // store result in dst
85 }
86
87 static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
88 {
89    //
90    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
91    //
92
93    movq_m2r(*dst,mm1);            // load 8 dst bytes
94    movq_r2r(mm1,mm2);             // copy 8 dst bytes
95
96    movq_m2r(*src1,mm3);           // load 8 src1 bytes
97    movq_r2r(mm3,mm4);             // copy 8 src1 bytes
98
99    movq_m2r(*src2,mm5);           // load 8 src2 bytes
100    movq_r2r(mm5,mm6);             // copy 8 src2 bytes
101
102    punpcklbw_r2r(mm0,mm1);        // unpack low dst bytes
103    punpckhbw_r2r(mm0,mm2);        // unpack high dst bytes
104
105    punpcklbw_r2r(mm0,mm3);        // unpack low src1 bytes
106    punpckhbw_r2r(mm0,mm4);        // unpack high src1 bytes
107
108    punpcklbw_r2r(mm0,mm5);        // unpack low src2 bytes
109    punpckhbw_r2r(mm0,mm6);        // unpack high src2 bytes
110
111    paddw_r2r(mm5,mm3);            // add lows
112    paddw_m2r(round1,mm3);
113    psraw_i2r(1,mm3);              // /2
114
115    paddw_r2r(mm6,mm4);            // add highs
116    paddw_m2r(round1,mm4);
117    psraw_i2r(1,mm4);              // /2
118
119    paddw_r2r(mm3,mm1);            // add lows
120    paddw_m2r(round1,mm1);
121    psraw_i2r(1,mm1);              // /2
122
123    paddw_r2r(mm4,mm2);            // add highs
124    paddw_m2r(round1,mm2);
125    psraw_i2r(1,mm2);              // /2
126
127    packuswb_r2r(mm2,mm1);         // pack (w/ saturation)
128    movq_r2m(mm1,*dst);            // store result in dst
129 }
130
131 static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
132                                     u8 *src4 )
133 {
134    //
135    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
136    //
137
138    movq_m2r(*src1,mm1);                // load 8 src1 bytes
139    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
140
141    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
142    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
143
144    movq_m2r(*src2,mm3);                // load 8 src2 bytes
145    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
146
147    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
148    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
149
150    paddw_r2r(mm3,mm1);                 // add lows
151    paddw_r2r(mm4,mm2);                 // add highs
152
153    // now have partials in mm1 and mm2
154
155    movq_m2r(*src3,mm3);                // load 8 src3 bytes
156    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
157
158    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
159    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
160
161    paddw_r2r(mm3,mm1);                 // add lows
162    paddw_r2r(mm4,mm2);                 // add highs
163
164    movq_m2r(*src4,mm5);                // load 8 src4 bytes
165    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
166
167    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
168    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
169
170    paddw_r2r(mm5,mm1);                 // add lows
171    paddw_r2r(mm6,mm2);                 // add highs
172
173    // now have subtotal in mm1 and mm2
174
175    paddw_m2r(round4,mm1);
176    psraw_i2r(2,mm1);                   // /4
177    paddw_m2r(round4,mm2);
178    psraw_i2r(2,mm2);                   // /4
179
180    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
181    movq_r2m(mm1,*dst);                 // store result in dst
182 }
183
184 static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
185                                           u8 *src3, u8 *src4 )
186 {
187    //
188    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
189    //
190
191    movq_m2r(*src1,mm1);                // load 8 src1 bytes
192    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
193
194    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
195    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
196
197    movq_m2r(*src2,mm3);                // load 8 src2 bytes
198    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
199
200    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
201    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
202
203    paddw_r2r(mm3,mm1);                 // add lows
204    paddw_r2r(mm4,mm2);                 // add highs
205
206    // now have partials in mm1 and mm2
207
208    movq_m2r(*src3,mm3);                // load 8 src3 bytes
209    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
210
211    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
212    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
213
214    paddw_r2r(mm3,mm1);                 // add lows
215    paddw_r2r(mm4,mm2);                 // add highs
216
217    movq_m2r(*src4,mm5);                // load 8 src4 bytes
218    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
219
220    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
221    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
222
223    paddw_r2r(mm5,mm1);                 // add lows
224    paddw_r2r(mm6,mm2);                 // add highs
225
226    paddw_m2r(round4,mm1);
227    psraw_i2r(2,mm1);                   // /4
228    paddw_m2r(round4,mm2);
229    psraw_i2r(2,mm2);                   // /4
230
231    // now have subtotal/4 in mm1 and mm2
232
233    movq_m2r(*dst,mm3);                 // load 8 dst bytes
234    movq_r2r(mm3,mm4);                  // copy 8 dst bytes
235
236    punpcklbw_r2r(mm0,mm3);             // unpack low dst bytes
237    punpckhbw_r2r(mm0,mm4);             // unpack high dst bytes
238
239    paddw_r2r(mm3,mm1);                 // add lows
240    paddw_r2r(mm4,mm2);                 // add highs
241
242    paddw_m2r(round1,mm1);
243    psraw_i2r(1,mm1);                   // /2
244    paddw_m2r(round1,mm2);
245    psraw_i2r(1,mm2);                   // /2
246
247    // now have end value in mm1 and mm2
248
249    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
250    movq_r2m(mm1,*dst);                 // store result in dst
251 }
252
253
254 /*
255  * Actual Motion compensation
256  */
257
/*
 * Short aliases for the pavgusb_* macros (presumably supplied by mmx.h --
 * confirm).  Nothing in the visible part of this file uses them.
 *
 * The expansions deliberately carry no trailing ';': the caller writes the
 * semicolon, so the aliases behave like ordinary statements (safe before
 * an 'else') and can also appear inside an expression.
 */
#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest)
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest)
260
/*
 * __MotionComponent_x_y_copy: generates
 * MotionComponent_x_y_copy_<width>_<height>(), a straight block copy.
 *
 * The generated function copies <height> rows from p_src to p_dest,
 * advancing both pointers by i_stride after each row.  Each row moves
 * 8 bytes through mm0, plus 8 more through mm1 when width is 16; no other
 * width is handled.
 */
#define __MotionComponent_x_y_copy(width,height)                            \
void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    /* No MMXZeroReg() here: mm0 is loaded with source bytes below, and */  \
    /* a plain copy never unpacks against a zero register.              */  \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        movq_m2r( *p_src, mm0 );     /* load 8 ref bytes */                 \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_m2r( *(p_src + 8), mm1 );   /* next 8 ref bytes */         \
        }                                                                   \
        p_src += i_stride;                                                  \
                                                                            \
        movq_r2m( mm0, *p_dest );    /* store 8 bytes at curr */            \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_r2m( mm1, *(p_dest + 8) );  /* store the next 8 */         \
        }                                                                   \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
283
/*
 * __MotionComponent_X_y_copy: generates
 * MotionComponent_X_y_copy_<width>_<height>().
 *
 * For each of <height> rows, every destination byte becomes the rounded
 * mean of the source byte and its right-hand neighbour (p_src and
 * p_src + 1).  Rows are 8 bytes wide, or 16 when width is 16.  Both
 * pointers advance by i_stride per row.
 */
#define __MotionComponent_X_y_copy(width,height)                            \
void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_row = 0;                                                          \
                                                                            \
    /* the averaging helper needs mm0 == 0 */                               \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_row < height )                                                 \
    {                                                                       \
        /* first 8 bytes of the row */                                      \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
                                                                            \
        /* second 8 bytes, for 16-wide blocks only */                       \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
        i_row++;                                                            \
    }                                                                       \
}
306
/*
 * __MotionComponent_x_Y_copy: generates
 * MotionComponent_x_Y_copy_<width>_<height>().
 *
 * For each of <height> rows, every destination byte becomes the rounded
 * mean of the source byte and the byte one stride further on (the same
 * column of the next row).  Rows are 8 bytes wide, or 16 when width is 16.
 * Both pointers advance by i_stride per row.
 */
#define __MotionComponent_x_Y_copy(width,height)                            \
void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_row = 0;                                                          \
                                                                            \
    /* the averaging helper needs mm0 == 0 */                               \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_row < height )                                                 \
    {                                                                       \
        /* current row against the row below it */                          \
        MMXAverage2( p_dest, p_src, p_src + i_stride );                     \
                                                                            \
        /* second 8 bytes, for 16-wide blocks only */                       \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + i_stride + 8 );     \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
        i_row++;                                                            \
    }                                                                       \
}
331
/*
 * __MotionComponent_X_Y_copy: generates
 * MotionComponent_X_Y_copy_<width>_<height>().
 *
 * For each of <height> rows, every destination byte becomes the rounded
 * mean of four source bytes: the byte itself, its right-hand neighbour,
 * and the same two columns one stride further on.  Rows are 8 bytes wide,
 * or 16 when width is 16.  Both pointers advance by i_stride per row.
 */
#define __MotionComponent_X_Y_copy(width,height)                            \
void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_row = 0;                                                          \
                                                                            \
    /* the averaging helper needs mm0 == 0 */                               \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_row < height )                                                 \
    {                                                                       \
        /* 2x2 neighbourhood: this row and the one below it */              \
        MMXAverage4( p_dest, p_src, p_src + 1,                              \
                     p_src + i_stride, p_src + i_stride + 1 );              \
                                                                            \
        /* second 8 bytes, for 16-wide blocks only */                       \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_src + i_stride + 8, p_src + i_stride + 9 );      \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
        i_row++;                                                            \
    }                                                                       \
}
357
/*
 * __MotionComponent_x_y_avg: generates
 * MotionComponent_x_y_avg_<width>_<height>().
 *
 * For each of <height> rows, every destination byte is replaced by the
 * rounded mean of its current value and the source byte at the same
 * position.  Rows are 8 bytes wide, or 16 when width is 16.  Both pointers
 * advance by i_stride per row.
 */
#define __MotionComponent_x_y_avg(width,height)                             \
void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_row = 0;                                                          \
                                                                            \
    /* the averaging helper needs mm0 == 0 */                               \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_row < height )                                                 \
    {                                                                       \
        /* p_dest is both an input and the output here */                   \
        MMXAverage2( p_dest, p_dest, p_src );                               \
                                                                            \
        /* second 8 bytes, for 16-wide blocks only */                       \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
        i_row++;                                                            \
    }                                                                       \
}
380
/*
 * __MotionComponent_X_y_avg: generates
 * MotionComponent_X_y_avg_<width>_<height>().
 *
 * For each of <height> rows, the rounded mean of each source byte and its
 * right-hand neighbour is computed, and the destination byte is replaced
 * by the rounded mean of that value and its current content.  Rows are
 * 8 bytes wide, or 16 when width is 16.  Both pointers advance by i_stride
 * per row.
 */
#define __MotionComponent_X_y_avg(width,height)                             \
void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_row = 0;                                                          \
                                                                            \
    /* the averaging helper needs mm0 == 0 */                               \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_row < height )                                                 \
    {                                                                       \
        /* first 8 bytes of the row */                                      \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
                                                                            \
        /* second 8 bytes, for 16-wide blocks only */                       \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
        i_row++;                                                            \
    }                                                                       \
}
403
/*
 * __MotionComponent_x_Y_avg: generates
 * MotionComponent_x_Y_avg_<width>_<height>().
 *
 * For each of <height> rows, the rounded mean of each source byte and the
 * byte one stride further on is computed, and the destination byte is
 * replaced by the rounded mean of that value and its current content.
 * Rows are 8 bytes wide, or 16 when width is 16.  Both pointers advance by
 * i_stride per row.
 */
#define __MotionComponent_x_Y_avg(width,height)                             \
void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_row = 0;                                                          \
                                                                            \
    /* the averaging helper needs mm0 == 0 */                               \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_row < height )                                                 \
    {                                                                       \
        /* current row against the row below it */                          \
        MMXInterpAverage2( p_dest, p_src, p_src + i_stride );               \
                                                                            \
        /* second 8 bytes, for 16-wide blocks only */                       \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8,                       \
                               p_src + i_stride + 8 );                      \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
        i_row++;                                                            \
    }                                                                       \
}
427
/*
 * __MotionComponent_X_Y_avg: generates
 * MotionComponent_X_Y_avg_<width>_<height>().
 *
 * For each of <height> rows, the rounded mean of four source bytes (the
 * byte, its right-hand neighbour, and the same two columns one stride
 * further on) is computed, and the destination byte is replaced by the
 * rounded mean of that value and its current content.  Rows are 8 bytes
 * wide, or 16 when width is 16.  Both pointers advance by i_stride per row.
 */
#define __MotionComponent_X_Y_avg(width,height)                             \
void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_row = 0;                                                          \
                                                                            \
    /* the averaging helper needs mm0 == 0 */                               \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_row < height )                                                 \
    {                                                                       \
        /* 2x2 neighbourhood: this row and the one below it */              \
        MMXInterpAverage4( p_dest, p_src, p_src + 1,                        \
                           p_src + i_stride, p_src + i_stride + 1 );        \
                                                                            \
        /* second 8 bytes, for 16-wide blocks only */                       \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_src + i_stride + 8,                        \
                               p_src + i_stride + 9 );                      \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
        i_row++;                                                            \
    }                                                                       \
}
454
/*
 * __MotionComponents: expands to all eight routines for one block size --
 * the four "copy" variants followed by the four "avg" variants, each in
 * the order x_y, X_y, x_Y, X_Y.
 */
#define __MotionComponents(width,height)                                    \
    __MotionComponent_x_y_copy(width,height)                                \
    __MotionComponent_X_y_copy(width,height)                                \
    __MotionComponent_x_Y_copy(width,height)                                \
    __MotionComponent_X_Y_copy(width,height)                                \
    __MotionComponent_x_y_avg(width,height)                                 \
    __MotionComponent_X_y_avg(width,height)                                 \
    __MotionComponent_x_Y_avg(width,height)                                 \
    __MotionComponent_X_Y_avg(width,height)
464
465 __MotionComponents (16,16)      /* 444, 422, 420 */
466 __MotionComponents (16,8)       /* 444, 422, 420 */
467 __MotionComponents (8,8)        /* 422, 420 */
468 __MotionComponents (8,4)        /* 420 */
469 #if 0
470 __MotionComponents (8,16)       /* 422 */
471 #endif