]> git.sesse.net Git - vlc/blob - src/video_decoder/vdec_motion_inner_mmx.c
0951d3dcf40f3435d4c9f3da7b17e65a5e5f458a
[vlc] / src / video_decoder / vdec_motion_inner_mmx.c
1 /*****************************************************************************
2  * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
3  *                           MMX
4  *****************************************************************************
5  * Copyright (C) 1999, 2000 VideoLAN
6  *
7  * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
8  *          work done by the livid project <http://www.linuxvideo.org/>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  * 
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28 #include "defs.h"
29
30 #include <sys/types.h>                        /* on BSD, uio.h needs types.h */
31 #include <sys/uio.h>                                          /* for input.h */
32
33 #include "config.h"
34 #include "common.h"
35 #include "threads.h"
36 #include "mtime.h"
37 #include "plugins.h"
38
39 #include "intf_msg.h"
40
41 #include "stream_control.h"
42 #include "input_ext-dec.h"
43
44 #include "video.h"
45 #include "video_output.h"
46
47 #include "vdec_idct.h"
48 #include "video_decoder.h"
49 #include "vdec_motion.h"
50
51 #include "vpar_blocks.h"
52 #include "vpar_headers.h"
53 #include "vpar_synchro.h"
54 #include "video_parser.h"
55 #include "video_fifo.h"
56
57 #include "mmx.h"
58
59 /* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
60
61 /* Some rounding constants */
62 mmx_t round1 = {0x0001000100010001LL};
63 mmx_t round4 = {0x0002000200020002LL};
64
65 /*
66  * Useful functions
67  */
68
69 static __inline__ void MMXZeroReg()
70 {
71    /* load 0 into mm0 */
72    pxor_r2r(mm0,mm0);
73 }
74
75 static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
76 {
77    //
78    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
79    //
80
81    //mmx_zero_reg();
82
83    movq_m2r(*src1,mm1);        // load 8 src1 bytes
84    movq_r2r(mm1,mm2);          // copy 8 src1 bytes
85
86    movq_m2r(*src2,mm3);        // load 8 src2 bytes
87    movq_r2r(mm3,mm4);          // copy 8 src2 bytes
88
89    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
90    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes
91
92    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
93    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes
94
95    paddw_r2r(mm3,mm1);         // add lows to mm1
96    paddw_m2r(round1,mm1);
97    psraw_i2r(1,mm1);           // /2
98
99    paddw_r2r(mm4,mm2);         // add highs to mm2
100    paddw_m2r(round1,mm2);
101    psraw_i2r(1,mm2);           // /2
102
103    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
104    movq_r2m(mm1,*dst);         // store result in dst
105 }
106
107 static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
108 {
109    //
110    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
111    //
112
113    //mmx_zero_reg();
114
115    movq_m2r(*dst,mm1);            // load 8 dst bytes
116    movq_r2r(mm1,mm2);             // copy 8 dst bytes
117
118    movq_m2r(*src1,mm3);           // load 8 src1 bytes
119    movq_r2r(mm3,mm4);             // copy 8 src1 bytes
120
121    movq_m2r(*src2,mm5);           // load 8 src2 bytes
122    movq_r2r(mm5,mm6);             // copy 8 src2 bytes
123
124    punpcklbw_r2r(mm0,mm1);        // unpack low dst bytes
125    punpckhbw_r2r(mm0,mm2);        // unpack high dst bytes
126
127    punpcklbw_r2r(mm0,mm3);        // unpack low src1 bytes
128    punpckhbw_r2r(mm0,mm4);        // unpack high src1 bytes
129
130    punpcklbw_r2r(mm0,mm5);        // unpack low src2 bytes
131    punpckhbw_r2r(mm0,mm6);        // unpack high src2 bytes
132
133    paddw_r2r(mm5,mm3);            // add lows
134    paddw_m2r(round1,mm3);
135    psraw_i2r(1,mm3);              // /2
136
137    paddw_r2r(mm6,mm4);            // add highs
138    paddw_m2r(round1,mm4);
139    psraw_i2r(1,mm4);              // /2
140
141    paddw_r2r(mm3,mm1);            // add lows
142    paddw_m2r(round1,mm1);
143    psraw_i2r(1,mm1);              // /2
144
145    paddw_r2r(mm4,mm2);            // add highs
146    paddw_m2r(round1,mm2);
147    psraw_i2r(1,mm2);              // /2
148
149    packuswb_r2r(mm2,mm1);         // pack (w/ saturation)
150    movq_r2m(mm1,*dst);            // store result in dst
151 }
152
153 static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
154                                     u8 *src4 )
155 {
156    //
157    // *dst = clip_to_u8((*src1 + *src2 + *src3 + *src4 + 2)/4);
158    //
159
160    //mmx_zero_reg();
161
162    movq_m2r(*src1,mm1);                // load 8 src1 bytes
163    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
164
165    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
166    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
167
168    movq_m2r(*src2,mm3);                // load 8 src2 bytes
169    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
170
171    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
172    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
173
174    paddw_r2r(mm3,mm1);                 // add lows
175    paddw_r2r(mm4,mm2);                 // add highs
176
177    // now have partials in mm1 and mm2
178
179    movq_m2r(*src3,mm3);                // load 8 src3 bytes
180    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
181
182    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
183    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
184
185    paddw_r2r(mm3,mm1);                 // add lows
186    paddw_r2r(mm4,mm2);                 // add highs
187
188    movq_m2r(*src4,mm5);                // load 8 src4 bytes
189    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
190
191    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
192    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
193
194    paddw_r2r(mm5,mm1);                 // add lows
195    paddw_r2r(mm6,mm2);                 // add highs
196
197    // now have subtotal in mm1 and mm2
198
199    paddw_m2r(round4,mm1);
200    psraw_i2r(2,mm1);                   // /4
201    paddw_m2r(round4,mm2);
202    psraw_i2r(2,mm2);                   // /4
203
204    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
205    movq_r2m(mm1,*dst);                 // store result in dst
206 }
207
208 static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
209                                           u8 *src3, u8 *src4 )
210 {
211    //
212    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
213    //
214
215    //mmx_zero_reg();
216
217    movq_m2r(*src1,mm1);                // load 8 src1 bytes
218    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
219
220    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
221    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
222
223    movq_m2r(*src2,mm3);                // load 8 src2 bytes
224    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
225
226    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
227    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
228
229    paddw_r2r(mm3,mm1);                 // add lows
230    paddw_r2r(mm4,mm2);                 // add highs
231
232    // now have partials in mm1 and mm2
233
234    movq_m2r(*src3,mm3);                // load 8 src3 bytes
235    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
236
237    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
238    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
239
240    paddw_r2r(mm3,mm1);                 // add lows
241    paddw_r2r(mm4,mm2);                 // add highs
242
243    movq_m2r(*src4,mm5);                // load 8 src4 bytes
244    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
245
246    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
247    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
248
249    paddw_r2r(mm5,mm1);                 // add lows
250    paddw_r2r(mm6,mm2);                 // add highs
251
252    paddw_m2r(round4,mm1);
253    psraw_i2r(2,mm1);                   // /4
254    paddw_m2r(round4,mm2);
255    psraw_i2r(2,mm2);                   // /4
256
257    // now have subtotal/4 in mm1 and mm2
258
259    movq_m2r(*dst,mm3);                 // load 8 dst bytes
260    movq_r2r(mm3,mm4);                  // copy 8 dst bytes
261
262    punpcklbw_r2r(mm0,mm3);             // unpack low dst bytes
263    punpckhbw_r2r(mm0,mm4);             // unpack high dst bytes
264
265    paddw_r2r(mm3,mm1);                 // add lows
266    paddw_r2r(mm4,mm2);                 // add highs
267
268    paddw_m2r(round1,mm1);
269    psraw_i2r(1,mm1);                   // /2
270    paddw_m2r(round1,mm2);
271    psraw_i2r(1,mm2);                   // /2
272
273    // now have end value in mm1 and mm2
274
275    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
276    movq_r2m(mm1,*dst);                 // store result in dst
277 }
278
279
280 /*
281  * Actual Motion compensation
282  */
283
/*
 * MotionComponent_x_y_copy_<width>_<height>: plain copy.
 * Copies <height> lines from p_src to p_dest, 8 bytes per line through mm1
 * plus 8 more when <width> is 16; both pointers advance by i_stride after
 * each line. mm0 is cleared on entry although the copy never reads it.
 */
#define __MotionComponent_x_y_copy(width,height)                            \
void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_rows;                                                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_rows = height; i_rows > 0; i_rows-- )                            \
    {                                                                       \
        movq_m2r( p_src[0], mm1 );                                          \
        movq_r2m( mm1, p_dest[0] );                                         \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_m2r( p_src[8], mm1 );                                      \
            movq_r2m( mm1, p_dest[8] );                                     \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
308
/*
 * MotionComponent_X_y_copy_<width>_<height>: horizontal interpolation.
 * For <height> lines, writes to p_dest the rounded average (MMXAverage2)
 * of the source bytes at p_src and at p_src + 1, 8 bytes at a time and a
 * second group of 8 when <width> is 16.
 */
#define __MotionComponent_X_y_copy(width,height)                            \
void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_line = 0;                                                         \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_line < height )                                                \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
        i_line++;                                                           \
    }                                                                       \
}
331
/*
 * MotionComponent_x_Y_copy_<width>_<height>: vertical interpolation.
 * For <height> lines, writes to p_dest the rounded average (MMXAverage2)
 * of the source line at p_src and the line i_stride further, 8 bytes at a
 * time and a second group of 8 when <width> is 16.
 */
#define __MotionComponent_x_Y_copy(width,height)                            \
void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_rows;                                                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_rows = height; i_rows > 0; i_rows-- )                            \
    {                                                                       \
        yuv_data_t * p_below = p_src + i_stride;                            \
                                                                            \
        MMXAverage2( p_dest, p_src, p_below );                              \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_below + 8 );              \
        }                                                                   \
                                                                            \
        p_src = p_below;                                                    \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
356
/*
 * MotionComponent_X_Y_copy_<width>_<height>: interpolation in both
 * directions. For <height> lines, writes to p_dest the rounded average
 * (MMXAverage4) of four source positions: p_src, p_src + 1 and the same
 * two positions on the line i_stride further. 8 bytes are handled at a
 * time, with a second group of 8 when <width> is 16.
 */
#define __MotionComponent_X_Y_copy(width,height)                            \
void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_line = 0;                                                         \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_line < height )                                                \
    {                                                                       \
        yuv_data_t * p_below = p_src + i_stride;                            \
                                                                            \
        MMXAverage4( p_dest, p_src, p_src + 1, p_below, p_below + 1 );      \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_below + 8, p_below + 9 );                        \
        }                                                                   \
                                                                            \
        p_src = p_below;                                                    \
        p_dest += i_stride;                                                 \
        i_line++;                                                           \
    }                                                                       \
}
382
/*
 * MotionComponent_x_y_avg_<width>_<height>: average with the destination.
 * For <height> lines, replaces p_dest by the rounded average (MMXAverage2)
 * of the current p_dest bytes and the p_src bytes, 8 bytes at a time and a
 * second group of 8 when <width> is 16.
 */
#define __MotionComponent_x_y_avg(width,height)                             \
void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_rows;                                                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_rows = height; i_rows > 0; i_rows-- )                            \
    {                                                                       \
        MMXAverage2( p_dest, p_dest, p_src );                               \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
405
/*
 * MotionComponent_X_y_avg_<width>_<height>: horizontal interpolation
 * averaged with the destination.
 * For <height> lines, replaces p_dest by the rounded average of the current
 * p_dest bytes and the rounded average of the source bytes at p_src and at
 * p_src + 1 (MMXInterpAverage2). 8 bytes are handled at a time, with a
 * second group of 8 when <width> is 16.
 *
 * The second group reads its two sources at p_src + 8 and p_src + 9, like
 * the other 16-wide routines of this file; it used to pass p_dest + 8 as
 * first source, which mixed the destination with itself instead of
 * interpolating the source.
 */
#define __MotionComponent_X_y_avg(width,height)                             \
void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src += i_stride;                                                  \
    }                                                                       \
}
428
/*
 * MotionComponent_x_Y_avg_<width>_<height>: vertical interpolation
 * averaged with the destination, written in plain C (no MMX helper).
 * For each of the <width> elements of each of the <height> lines:
 *     pred      = (p_src[x] + p_src[x + i_stride] + 1) >> 1
 *     p_dest[x] = (p_dest[x] + pred + 1) >> 1
 * Both pointers advance by i_stride after each line.
 */
#define __MotionComponent_x_Y_avg(width,height)                             \
void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_col, i_row;                                                       \
    yuv_data_t * p_below = p_src + i_stride;                                \
                                                                            \
    for( i_row = 0; i_row < height; i_row++ )                               \
    {                                                                       \
        for( i_col = 0; i_col < width; i_col++ )                            \
        {                                                                   \
            unsigned int i_pred =                                           \
                (unsigned int)(p_src[i_col] + p_below[i_col] + 1) >> 1;     \
                                                                            \
            p_dest[i_col] = (p_dest[i_col] + i_pred + 1) >> 1;              \
        }                                                                   \
                                                                            \
        p_src = p_below;                                                    \
        p_below += i_stride;                                                \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
451
/*
 * MotionComponent_X_Y_avg_<width>_<height>: interpolation in both
 * directions, averaged with the destination.
 * For <height> lines, replaces p_dest by the rounded average of the current
 * p_dest bytes and the rounded average of four source positions: p_src,
 * p_src + 1 and the same two positions on the line i_stride further
 * (MMXInterpAverage4). 8 bytes are handled at a time, with a second group
 * of 8 when <width> is 16.
 */
#define __MotionComponent_X_Y_avg(width,height)                             \
void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_rows;                                                             \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_rows = height; i_rows > 0; i_rows-- )                            \
    {                                                                       \
        yuv_data_t * p_below = p_src + i_stride;                            \
                                                                            \
        MMXInterpAverage4( p_dest, p_src, p_src + 1,                        \
                           p_below, p_below + 1 );                          \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_below + 8, p_below + 9 );                  \
        }                                                                   \
                                                                            \
        p_src = p_below;                                                    \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
478
/*
 * __MotionComponents(width,height): define the eight motion routines
 * (x_y, X_y, x_Y and X_Y, each in a "copy" and an "avg" flavour) for one
 * block size.
 */
#define __MotionComponents(width,height)                                    \
__MotionComponent_x_y_copy(width,height)                                    \
__MotionComponent_x_y_avg(width,height)                                     \
__MotionComponent_X_y_copy(width,height)                                    \
__MotionComponent_X_y_avg(width,height)                                     \
__MotionComponent_x_Y_copy(width,height)                                    \
__MotionComponent_x_Y_avg(width,height)                                     \
__MotionComponent_X_Y_copy(width,height)                                    \
__MotionComponent_X_Y_avg(width,height)
488
489 __MotionComponents (16,16)        /* 444, 422, 420 */
490 __MotionComponents (16,8)        /* 444, 422, 420 */
491 __MotionComponents (8,8)        /* 422, 420 */
492 __MotionComponents (8,4)        /* 420 */
493 #if 0
494 __MotionComponents (8,16)        /* 422 */
495 #endif