]> git.sesse.net Git - vlc/blob - src/video_decoder/vdec_motion_inner_mmx.c
* Borrowed LiViD's MMX and MMX EXT IDCT.
[vlc] / src / video_decoder / vdec_motion_inner_mmx.c
1 /*****************************************************************************
2  * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
3  *                           MMX
4  *****************************************************************************
5  * Copyright (C) 1999, 2000 VideoLAN
6  * $Id: vdec_motion_inner_mmx.c,v 1.8 2001/01/16 17:59:23 massiot Exp $
7  *
8  * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
9  *          work done by the livid project <http://www.linuxvideo.org/>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  * 
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
24  *****************************************************************************/
25
26 /*****************************************************************************
27  * Preamble
28  *****************************************************************************/
29 #include "defs.h"
30
31 #include "config.h"
32 #include "common.h"
33 #include "threads.h"
34 #include "mtime.h"
35 #include "plugins.h"
36
37 #include "intf_msg.h"
38
39 #include "stream_control.h"
40 #include "input_ext-dec.h"
41
42 #include "video.h"
43 #include "video_output.h"
44
45 #include "vdec_idct.h"
46 #include "video_decoder.h"
47 #include "vdec_motion.h"
48
49 #include "vpar_blocks.h"
50 #include "vpar_headers.h"
51 #include "vpar_synchro.h"
52 #include "video_parser.h"
53 #include "video_fifo.h"
54
55 #include "attributes.h"
56 #include "mmx.h"
57
58 /* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
59
/*
 * Rounding constants, one copy per 16-bit word of the 64-bit value.
 * round1 (1 per word) is added before the right shift by 1 that halves a
 * two-sample sum; round4 (2 per word) is added before the right shift by 2
 * that quarters a four-sample sum.
 */
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
63
64 /*
65  * Useful functions
66  */
67
68 static __inline__ void MMXZeroReg()
69 {
70    /* load 0 into mm0 */
71    pxor_r2r(mm0,mm0);
72 }
73
74 static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
75 {
76    //
77    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
78    //
79
80    movq_m2r(*src1,mm1);        // load 8 src1 bytes
81    movq_r2r(mm1,mm2);          // copy 8 src1 bytes
82
83    movq_m2r(*src2,mm3);        // load 8 src2 bytes
84    movq_r2r(mm3,mm4);          // copy 8 src2 bytes
85
86    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
87    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes
88
89    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
90    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes
91
92    paddw_r2r(mm3,mm1);         // add lows to mm1
93    paddw_m2r(round1,mm1);
94    psraw_i2r(1,mm1);           // /2
95
96    paddw_r2r(mm4,mm2);         // add highs to mm2
97    paddw_m2r(round1,mm2);
98    psraw_i2r(1,mm2);           // /2
99
100    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
101    movq_r2m(mm1,*dst);         // store result in dst
102 }
103
104 static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
105 {
106    //
107    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
108    //
109
110    movq_m2r(*dst,mm1);            // load 8 dst bytes
111    movq_r2r(mm1,mm2);             // copy 8 dst bytes
112
113    movq_m2r(*src1,mm3);           // load 8 src1 bytes
114    movq_r2r(mm3,mm4);             // copy 8 src1 bytes
115
116    movq_m2r(*src2,mm5);           // load 8 src2 bytes
117    movq_r2r(mm5,mm6);             // copy 8 src2 bytes
118
119    punpcklbw_r2r(mm0,mm1);        // unpack low dst bytes
120    punpckhbw_r2r(mm0,mm2);        // unpack high dst bytes
121
122    punpcklbw_r2r(mm0,mm3);        // unpack low src1 bytes
123    punpckhbw_r2r(mm0,mm4);        // unpack high src1 bytes
124
125    punpcklbw_r2r(mm0,mm5);        // unpack low src2 bytes
126    punpckhbw_r2r(mm0,mm6);        // unpack high src2 bytes
127
128    paddw_r2r(mm5,mm3);            // add lows
129    paddw_m2r(round1,mm3);
130    psraw_i2r(1,mm3);              // /2
131
132    paddw_r2r(mm6,mm4);            // add highs
133    paddw_m2r(round1,mm4);
134    psraw_i2r(1,mm4);              // /2
135
136    paddw_r2r(mm3,mm1);            // add lows
137    paddw_m2r(round1,mm1);
138    psraw_i2r(1,mm1);              // /2
139
140    paddw_r2r(mm4,mm2);            // add highs
141    paddw_m2r(round1,mm2);
142    psraw_i2r(1,mm2);              // /2
143
144    packuswb_r2r(mm2,mm1);         // pack (w/ saturation)
145    movq_r2m(mm1,*dst);            // store result in dst
146 }
147
148 static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
149                                     u8 *src4 )
150 {
151    //
152    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
153    //
154
155    movq_m2r(*src1,mm1);                // load 8 src1 bytes
156    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
157
158    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
159    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
160
161    movq_m2r(*src2,mm3);                // load 8 src2 bytes
162    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
163
164    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
165    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
166
167    paddw_r2r(mm3,mm1);                 // add lows
168    paddw_r2r(mm4,mm2);                 // add highs
169
170    // now have partials in mm1 and mm2
171
172    movq_m2r(*src3,mm3);                // load 8 src3 bytes
173    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
174
175    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
176    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
177
178    paddw_r2r(mm3,mm1);                 // add lows
179    paddw_r2r(mm4,mm2);                 // add highs
180
181    movq_m2r(*src4,mm5);                // load 8 src4 bytes
182    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
183
184    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
185    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
186
187    paddw_r2r(mm5,mm1);                 // add lows
188    paddw_r2r(mm6,mm2);                 // add highs
189
190    // now have subtotal in mm1 and mm2
191
192    paddw_m2r(round4,mm1);
193    psraw_i2r(2,mm1);                   // /4
194    paddw_m2r(round4,mm2);
195    psraw_i2r(2,mm2);                   // /4
196
197    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
198    movq_r2m(mm1,*dst);                 // store result in dst
199 }
200
201 static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
202                                           u8 *src3, u8 *src4 )
203 {
204    //
205    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
206    //
207
208    movq_m2r(*src1,mm1);                // load 8 src1 bytes
209    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
210
211    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
212    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
213
214    movq_m2r(*src2,mm3);                // load 8 src2 bytes
215    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
216
217    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
218    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
219
220    paddw_r2r(mm3,mm1);                 // add lows
221    paddw_r2r(mm4,mm2);                 // add highs
222
223    // now have partials in mm1 and mm2
224
225    movq_m2r(*src3,mm3);                // load 8 src3 bytes
226    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
227
228    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
229    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
230
231    paddw_r2r(mm3,mm1);                 // add lows
232    paddw_r2r(mm4,mm2);                 // add highs
233
234    movq_m2r(*src4,mm5);                // load 8 src4 bytes
235    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
236
237    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
238    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
239
240    paddw_r2r(mm5,mm1);                 // add lows
241    paddw_r2r(mm6,mm2);                 // add highs
242
243    paddw_m2r(round4,mm1);
244    psraw_i2r(2,mm1);                   // /4
245    paddw_m2r(round4,mm2);
246    psraw_i2r(2,mm2);                   // /4
247
248    // now have subtotal/4 in mm1 and mm2
249
250    movq_m2r(*dst,mm3);                 // load 8 dst bytes
251    movq_r2r(mm3,mm4);                  // copy 8 dst bytes
252
253    punpcklbw_r2r(mm0,mm3);             // unpack low dst bytes
254    punpckhbw_r2r(mm0,mm4);             // unpack high dst bytes
255
256    paddw_r2r(mm3,mm1);                 // add lows
257    paddw_r2r(mm4,mm2);                 // add highs
258
259    paddw_m2r(round1,mm1);
260    psraw_i2r(1,mm1);                   // /2
261    paddw_m2r(round1,mm2);
262    psraw_i2r(1,mm2);                   // /2
263
264    // now have end value in mm1 and mm2
265
266    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
267    movq_r2m(mm1,*dst);                 // store result in dst
268 }
269
270
271 /*
272  * Actual Motion compensation
273  */
274
/*
 * pavg_r2r / pavg_m2r: shorthands for the pavgusb_r2r / pavgusb_m2r
 * wrappers.  The expansions deliberately carry no trailing semicolon, so
 * that `pavg_r2r(a, b);` is exactly one statement and stays valid as the
 * unbraced body of an if/else.
 */
#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest)
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest)
277
/*
 * Generates MotionComponent_x_y_copy_<width>_<height>(): a straight copy of
 * <height> rows from p_src to p_dest, with no averaging.
 *
 * Each row moves 8 elements, plus 8 more when width is 16 (any other width
 * is handled as 8).  Both pointers step by i_stride after every row.
 * mm0, and mm1 for width 16, hold the row data, so mm0 is not zero on
 * return.
 */
#define __MotionComponent_x_y_copy(width,height)                            \
void MotionComponent_x_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    /* No MMXZeroReg() here: mm0 is loaded before it is ever read. */       \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        movq_m2r( *p_src, mm0 );            /* row bytes 0..7 */            \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_m2r( *(p_src + 8), mm1 );  /* row bytes 8..15 */           \
        }                                                                   \
        p_src += i_stride;                                                  \
                                                                            \
        movq_r2m( mm0, *p_dest );                                           \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_r2m( mm1, *(p_dest + 8) );                                 \
        }                                                                   \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
300
/*
 * Generates MotionComponent_X_y_copy_<width>_<height>(): for <height> rows,
 * stores in p_dest the rounded average of every source element and the one
 * that follows it in the same row:
 *     p_dest[i] = (p_src[i] + p_src[i + 1] + 1) >> 1
 *
 * A row is one 8-element group, or two when width is 16.  Both pointers
 * step by i_stride after every row.
 */
#define __MotionComponent_X_y_copy(width,height)                            \
void MotionComponent_X_y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_row;                                                              \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_row = height; i_row > 0; i_row-- )                               \
    {                                                                       \
        /* elements 0..7 of the row */                                      \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            /* elements 8..15 of the row */                                 \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
323
/*
 * Generates MotionComponent_x_Y_copy_<width>_<height>(): for <height> rows,
 * stores in p_dest the rounded average of every source element and the
 * element one stride further on:
 *     p_dest[i] = (p_src[i] + p_src[i + i_stride] + 1) >> 1
 *
 * A row is one 8-element group, or two when width is 16.  Both pointers
 * step by i_stride after every row.
 */
#define __MotionComponent_x_Y_copy(width,height)                            \
void MotionComponent_x_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    yuv_data_t * p_src_next;                                                \
    int i_row;                                                              \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_row = 0; i_row < height; i_row++ )                               \
    {                                                                       \
        /* p_src advanced by one stride */                                  \
        p_src_next = p_src + i_stride;                                      \
                                                                            \
        MMXAverage2( p_dest, p_src, p_src_next );                           \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src_next + 8 );           \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src = p_src_next;                                                 \
    }                                                                       \
}
348
/*
 * Generates MotionComponent_X_Y_copy_<width>_<height>(): for <height> rows,
 * stores in p_dest the rounded average of four source elements -- the
 * element itself, the one after it, and the same two one stride further on:
 *     p_dest[i] = (p_src[i] + p_src[i + 1]
 *                  + p_src[i + i_stride] + p_src[i + i_stride + 1] + 2) >> 2
 *
 * A row is one 8-element group, or two when width is 16.  Both pointers
 * step by i_stride after every row.
 */
#define __MotionComponent_X_Y_copy(width,height)                            \
void MotionComponent_X_Y_copy_##width##_##height(yuv_data_t * p_src,        \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    yuv_data_t * p_src_next;                                                \
    int i_row;                                                              \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_row = 0; i_row < height; i_row++ )                               \
    {                                                                       \
        /* p_src advanced by one stride */                                  \
        p_src_next = p_src + i_stride;                                      \
                                                                            \
        MMXAverage4( p_dest, p_src, p_src + 1,                              \
                     p_src_next, p_src_next + 1 );                          \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_src_next + 8, p_src_next + 9 );                  \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src = p_src_next;                                                 \
    }                                                                       \
}
374
/*
 * Generates MotionComponent_x_y_avg_<width>_<height>(): for <height> rows,
 * replaces every destination element with its rounded average with the
 * matching source element:
 *     p_dest[i] = (p_dest[i] + p_src[i] + 1) >> 1
 *
 * A row is one 8-element group, or two when width is 16.  Both pointers
 * step by i_stride after every row.
 */
#define __MotionComponent_x_y_avg(width,height)                             \
void MotionComponent_x_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_row;                                                              \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_row = height; i_row > 0; i_row-- )                               \
    {                                                                       \
        /* elements 0..7 of the row; p_dest is both input and output */     \
        MMXAverage2( p_dest, p_dest, p_src );                               \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            /* elements 8..15 of the row */                                 \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
397
/*
 * Generates MotionComponent_X_y_avg_<width>_<height>(): for <height> rows,
 * averages every source element with the one that follows it, then blends
 * that into the destination:
 *     p_dest[i] = (p_dest[i] + ((p_src[i] + p_src[i + 1] + 1) >> 1) + 1) >> 1
 *
 * A row is one 8-element group, or two when width is 16.  Both pointers
 * step by i_stride after every row.
 */
#define __MotionComponent_X_y_avg(width,height)                             \
void MotionComponent_X_y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_row;                                                              \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_row = height; i_row > 0; i_row-- )                               \
    {                                                                       \
        /* elements 0..7 of the row */                                      \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
                                                                            \
        if( width == 16 )                                                   \
        {                                                                   \
            /* elements 8..15 of the row */                                 \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
420
/*
 * Generates MotionComponent_x_Y_avg_<width>_<height>(): for <height> rows,
 * averages every source element with the element one stride further on,
 * then blends that into the destination:
 *     p_dest[i] = (p_dest[i]
 *                  + ((p_src[i] + p_src[i + i_stride] + 1) >> 1) + 1) >> 1
 *
 * A row is one 8-element group, or two when width is 16.  Both pointers
 * step by i_stride after every row.
 */
#define __MotionComponent_x_Y_avg(width,height)                             \
void MotionComponent_x_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    yuv_data_t * p_src_next;                                                \
    int i_row;                                                              \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_row = 0; i_row < height; i_row++ )                               \
    {                                                                       \
        /* p_src advanced by one stride */                                  \
        p_src_next = p_src + i_stride;                                      \
                                                                            \
        MMXInterpAverage2( p_dest, p_src, p_src_next );                     \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src_next + 8 );     \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src = p_src_next;                                                 \
    }                                                                       \
}
444
/*
 * Generates MotionComponent_X_Y_avg_<width>_<height>(): for <height> rows,
 * averages four source elements -- the element itself, the one after it,
 * and the same two one stride further on -- then blends that into the
 * destination:
 *     avg       = (p_src[i] + p_src[i + 1]
 *                  + p_src[i + i_stride] + p_src[i + i_stride + 1] + 2) >> 2
 *     p_dest[i] = (p_dest[i] + avg + 1) >> 1
 *
 * A row is one 8-element group, or two when width is 16.  Both pointers
 * step by i_stride after every row.
 */
#define __MotionComponent_X_Y_avg(width,height)                             \
void MotionComponent_X_Y_avg_##width##_##height(yuv_data_t * p_src,         \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    yuv_data_t * p_src_next;                                                \
    int i_row;                                                              \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    for( i_row = 0; i_row < height; i_row++ )                               \
    {                                                                       \
        /* p_src advanced by one stride */                                  \
        p_src_next = p_src + i_stride;                                      \
                                                                            \
        MMXInterpAverage4( p_dest, p_src, p_src + 1,                        \
                           p_src_next, p_src_next + 1 );                    \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_src_next + 8, p_src_next + 9 );            \
        }                                                                   \
                                                                            \
        p_dest += i_stride;                                                 \
        p_src = p_src_next;                                                 \
    }                                                                       \
}
471
/*
 * __MotionComponents(width,height): emit all eight motion routines for one
 * block size, i.e. the copy and avg flavours of each of the x_y, X_y, x_Y
 * and X_Y generators.  The order of the definitions is not significant.
 */
#define __MotionComponents(width,height)                                    \
__MotionComponent_x_y_copy(width,height)                                    \
__MotionComponent_x_y_avg(width,height)                                     \
__MotionComponent_X_y_copy(width,height)                                    \
__MotionComponent_X_y_avg(width,height)                                     \
__MotionComponent_x_Y_copy(width,height)                                    \
__MotionComponent_x_Y_avg(width,height)                                     \
__MotionComponent_X_Y_copy(width,height)                                    \
__MotionComponent_X_Y_avg(width,height)
481
/*
 * Instantiate the eight routines for each block size (width, height).
 * The trailing notes are presumably the chroma formats (4:4:4, 4:2:2,
 * 4:2:0) that use that size -- confirm against the callers.
 */
__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */
#if 0
/* The 8x16 size is compiled out. */
__MotionComponents (8,16)       /* 422 */
#endif