]> git.sesse.net Git - vlc/blob - plugins/motion/vdec_motion_inner_mmx.c
11db6863be6a304caaa2f14c4368e4eb34e5ef8d
[vlc] / plugins / motion / vdec_motion_inner_mmx.c
1 /*****************************************************************************
2  * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
3  *                           MMX
4  *****************************************************************************
5  * Copyright (C) 1999, 2000 VideoLAN
6  * $Id: vdec_motion_inner_mmx.c,v 1.2 2001/06/07 15:27:44 sam Exp $
7  *
8  * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
9  *          work done by the livid project <http://www.linuxvideo.org/>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  * 
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
24  *****************************************************************************/
25
26 #include "modules_inner.h"
27
28 /*****************************************************************************
29  * Preamble
30  *****************************************************************************/
31 #include "defs.h"
32
33 #include "config.h"
34 #include "common.h"
35 #include "threads.h"
36 #include "mtime.h"
37
38 #include "video.h"
39
40 #include "attributes.h"
41 #include "mmx.h"
42
43 /* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */
44
45 /* Some rounding constants */
46 mmx_t round1 = {0x0001000100010001LL};
47 mmx_t round4 = {0x0002000200020002LL};
48
49 /*
50  * Useful functions
51  */
52
53 static __inline__ void MMXZeroReg()
54 {
55    /* load 0 into mm0 */
56    pxor_r2r(mm0,mm0);
57 }
58
59 static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
60 {
61    //
62    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
63    //
64
65    movq_m2r(*src1,mm1);        // load 8 src1 bytes
66    movq_r2r(mm1,mm2);          // copy 8 src1 bytes
67
68    movq_m2r(*src2,mm3);        // load 8 src2 bytes
69    movq_r2r(mm3,mm4);          // copy 8 src2 bytes
70
71    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
72    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes
73
74    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
75    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes
76
77    paddw_r2r(mm3,mm1);         // add lows to mm1
78    paddw_m2r(round1,mm1);
79    psraw_i2r(1,mm1);           // /2
80
81    paddw_r2r(mm4,mm2);         // add highs to mm2
82    paddw_m2r(round1,mm2);
83    psraw_i2r(1,mm2);           // /2
84
85    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
86    movq_r2m(mm1,*dst);         // store result in dst
87 }
88
89 static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
90 {
91    //
92    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
93    //
94
95    movq_m2r(*dst,mm1);            // load 8 dst bytes
96    movq_r2r(mm1,mm2);             // copy 8 dst bytes
97
98    movq_m2r(*src1,mm3);           // load 8 src1 bytes
99    movq_r2r(mm3,mm4);             // copy 8 src1 bytes
100
101    movq_m2r(*src2,mm5);           // load 8 src2 bytes
102    movq_r2r(mm5,mm6);             // copy 8 src2 bytes
103
104    punpcklbw_r2r(mm0,mm1);        // unpack low dst bytes
105    punpckhbw_r2r(mm0,mm2);        // unpack high dst bytes
106
107    punpcklbw_r2r(mm0,mm3);        // unpack low src1 bytes
108    punpckhbw_r2r(mm0,mm4);        // unpack high src1 bytes
109
110    punpcklbw_r2r(mm0,mm5);        // unpack low src2 bytes
111    punpckhbw_r2r(mm0,mm6);        // unpack high src2 bytes
112
113    paddw_r2r(mm5,mm3);            // add lows
114    paddw_m2r(round1,mm3);
115    psraw_i2r(1,mm3);              // /2
116
117    paddw_r2r(mm6,mm4);            // add highs
118    paddw_m2r(round1,mm4);
119    psraw_i2r(1,mm4);              // /2
120
121    paddw_r2r(mm3,mm1);            // add lows
122    paddw_m2r(round1,mm1);
123    psraw_i2r(1,mm1);              // /2
124
125    paddw_r2r(mm4,mm2);            // add highs
126    paddw_m2r(round1,mm2);
127    psraw_i2r(1,mm2);              // /2
128
129    packuswb_r2r(mm2,mm1);         // pack (w/ saturation)
130    movq_r2m(mm1,*dst);            // store result in dst
131 }
132
133 static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
134                                     u8 *src4 )
135 {
136    //
137    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
138    //
139
140    movq_m2r(*src1,mm1);                // load 8 src1 bytes
141    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
142
143    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
144    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
145
146    movq_m2r(*src2,mm3);                // load 8 src2 bytes
147    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
148
149    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
150    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
151
152    paddw_r2r(mm3,mm1);                 // add lows
153    paddw_r2r(mm4,mm2);                 // add highs
154
155    // now have partials in mm1 and mm2
156
157    movq_m2r(*src3,mm3);                // load 8 src3 bytes
158    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
159
160    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
161    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
162
163    paddw_r2r(mm3,mm1);                 // add lows
164    paddw_r2r(mm4,mm2);                 // add highs
165
166    movq_m2r(*src4,mm5);                // load 8 src4 bytes
167    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
168
169    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
170    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
171
172    paddw_r2r(mm5,mm1);                 // add lows
173    paddw_r2r(mm6,mm2);                 // add highs
174
175    // now have subtotal in mm1 and mm2
176
177    paddw_m2r(round4,mm1);
178    psraw_i2r(2,mm1);                   // /4
179    paddw_m2r(round4,mm2);
180    psraw_i2r(2,mm2);                   // /4
181
182    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
183    movq_r2m(mm1,*dst);                 // store result in dst
184 }
185
186 static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
187                                           u8 *src3, u8 *src4 )
188 {
189    //
190    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
191    //
192
193    movq_m2r(*src1,mm1);                // load 8 src1 bytes
194    movq_r2r(mm1,mm2);                  // copy 8 src1 bytes
195
196    punpcklbw_r2r(mm0,mm1);             // unpack low src1 bytes
197    punpckhbw_r2r(mm0,mm2);             // unpack high src1 bytes
198
199    movq_m2r(*src2,mm3);                // load 8 src2 bytes
200    movq_r2r(mm3,mm4);                  // copy 8 src2 bytes
201
202    punpcklbw_r2r(mm0,mm3);             // unpack low src2 bytes
203    punpckhbw_r2r(mm0,mm4);             // unpack high src2 bytes
204
205    paddw_r2r(mm3,mm1);                 // add lows
206    paddw_r2r(mm4,mm2);                 // add highs
207
208    // now have partials in mm1 and mm2
209
210    movq_m2r(*src3,mm3);                // load 8 src3 bytes
211    movq_r2r(mm3,mm4);                  // copy 8 src3 bytes
212
213    punpcklbw_r2r(mm0,mm3);             // unpack low src3 bytes
214    punpckhbw_r2r(mm0,mm4);             // unpack high src3 bytes
215
216    paddw_r2r(mm3,mm1);                 // add lows
217    paddw_r2r(mm4,mm2);                 // add highs
218
219    movq_m2r(*src4,mm5);                // load 8 src4 bytes
220    movq_r2r(mm5,mm6);                  // copy 8 src4 bytes
221
222    punpcklbw_r2r(mm0,mm5);             // unpack low src4 bytes
223    punpckhbw_r2r(mm0,mm6);             // unpack high src4 bytes
224
225    paddw_r2r(mm5,mm1);                 // add lows
226    paddw_r2r(mm6,mm2);                 // add highs
227
228    paddw_m2r(round4,mm1);
229    psraw_i2r(2,mm1);                   // /4
230    paddw_m2r(round4,mm2);
231    psraw_i2r(2,mm2);                   // /4
232
233    // now have subtotal/4 in mm1 and mm2
234
235    movq_m2r(*dst,mm3);                 // load 8 dst bytes
236    movq_r2r(mm3,mm4);                  // copy 8 dst bytes
237
238    punpcklbw_r2r(mm0,mm3);             // unpack low dst bytes
239    punpckhbw_r2r(mm0,mm4);             // unpack high dst bytes
240
241    paddw_r2r(mm3,mm1);                 // add lows
242    paddw_r2r(mm4,mm2);                 // add highs
243
244    paddw_m2r(round1,mm1);
245    psraw_i2r(1,mm1);                   // /2
246    paddw_m2r(round1,mm2);
247    psraw_i2r(1,mm2);                   // /2
248
249    // now have end value in mm1 and mm2
250
251    packuswb_r2r(mm2,mm1);              // pack (w/ saturation)
252    movq_r2m(mm1,*dst);                 // store result in dst
253 }
254
255
256 /*
257  * Actual Motion compensation
258  */
259
/*
 * Short aliases for the pavgusb wrappers.  Nothing in this file invokes them.
 * The expansions deliberately carry no trailing ';', so "pavg_r2r( a, b );"
 * is exactly one statement and stays valid as the body of an unbraced
 * if/else.
 */
#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest)
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest)
262
/*
 * Full-pel prediction, "copy" flavour.  Generates
 * MotionComponent_x_y_copy_<width>_<height>, which transfers `height' rows
 * from p_src to p_dest unchanged, advancing both pointers by i_stride after
 * each row.  A row is moved 8 bytes at a time through mm0, plus mm1 for
 * bytes 8..15 when width is 16.
 *
 * No MMXZeroReg() here: this routine never unpacks, and mm0 is overwritten
 * by the first load of every row anyway.
 */
#define __MotionComponent_x_y_copy(width,height)                            \
void _M(MotionComponent_x_y_copy_##width##_##height)(yuv_data_t * p_src,    \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_y;                                                                \
                                                                            \
    for( i_y = 0; i_y < height; i_y ++ )                                    \
    {                                                                       \
        movq_m2r( *p_src, mm0 );     /* load 8 ref bytes */                 \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_m2r( *(p_src + 8), mm1 );                                  \
        }                                                                   \
        p_src += i_stride;                                                  \
                                                                            \
        movq_r2m( mm0, *p_dest );    /* store 8 bytes at curr */            \
        if( width == 16 )                                                   \
        {                                                                   \
            movq_r2m( mm1, *(p_dest + 8) );                                 \
        }                                                                   \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
285
/*
 * Horizontal half-pel prediction, "copy" flavour.  Generates
 * MotionComponent_X_y_copy_<width>_<height>: every output byte is the
 * rounded mean of a source byte and the byte to its right (MMXAverage2 on
 * p_src and p_src + 1).  When width is 16 the same is done for bytes 8..15.
 * Both pointers advance by i_stride after each of the `height' rows.
 */
#define __MotionComponent_X_y_copy(width,height)                            \
void _M(MotionComponent_X_y_copy_##width##_##height)(yuv_data_t * p_src,    \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_rows_left = height;                                               \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_rows_left-- > 0 )                                              \
    {                                                                       \
        MMXAverage2( p_dest, p_src, p_src + 1 );                            \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 );                \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
308
/*
 * Vertical half-pel prediction, "copy" flavour.  Generates
 * MotionComponent_x_Y_copy_<width>_<height>: every output byte is the
 * rounded mean of a source byte and the byte one stride further on
 * (MMXAverage2 on the current row and the row at p_src + i_stride).  When
 * width is 16 the same is done for bytes 8..15.  Both pointers advance by
 * i_stride after each of the `height' rows.
 */
#define __MotionComponent_x_Y_copy(width,height)                            \
void _M(MotionComponent_x_Y_copy_##width##_##height)(yuv_data_t * p_src,    \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_rows_left = height;                                               \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_rows_left-- > 0 )                                              \
    {                                                                       \
        yuv_data_t * p_below = p_src + i_stride;                            \
                                                                            \
        MMXAverage2( p_dest, p_src, p_below );                              \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_src + 8, p_below + 8 );              \
        }                                                                   \
                                                                            \
        p_src = p_below;                                                    \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
333
/*
 * Horizontal + vertical half-pel prediction, "copy" flavour.  Generates
 * MotionComponent_X_Y_copy_<width>_<height>: every output byte is the
 * rounded mean of a 2x2 neighbourhood (MMXAverage4 on p_src, p_src + 1 and
 * the same two positions one stride further on).  When width is 16 the same
 * is done for bytes 8..15.  Both pointers advance by i_stride after each of
 * the `height' rows.
 */
#define __MotionComponent_X_Y_copy(width,height)                            \
void _M(MotionComponent_X_Y_copy_##width##_##height)(yuv_data_t * p_src,    \
                                                 yuv_data_t * p_dest,       \
                                                 int i_stride)              \
{                                                                           \
    int i_rows_left = height;                                               \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_rows_left-- > 0 )                                              \
    {                                                                       \
        yuv_data_t * p_below = p_src + i_stride;                            \
                                                                            \
        MMXAverage4( p_dest, p_src, p_src + 1, p_below, p_below + 1 );      \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9,                  \
                         p_below + 8, p_below + 9 );                        \
        }                                                                   \
                                                                            \
        p_src = p_below;                                                    \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
359
/*
 * Full-pel prediction, "avg" flavour.  Generates
 * MotionComponent_x_y_avg_<width>_<height>: every byte of p_dest is replaced
 * by the rounded mean of itself and the matching byte of p_src (MMXAverage2
 * with p_dest as both destination and first source).  When width is 16 the
 * same is done for bytes 8..15.  Both pointers advance by i_stride after
 * each of the `height' rows.
 */
#define __MotionComponent_x_y_avg(width,height)                             \
void _M(MotionComponent_x_y_avg_##width##_##height)(yuv_data_t * p_src,     \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_rows_left = height;                                               \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_rows_left-- > 0 )                                              \
    {                                                                       \
        MMXAverage2( p_dest, p_dest, p_src );                               \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 );               \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
382
/*
 * Horizontal half-pel prediction, "avg" flavour.  Generates
 * MotionComponent_X_y_avg_<width>_<height>: the rounded mean of a source
 * byte and the byte to its right is itself averaged with the byte already in
 * p_dest (MMXInterpAverage2 on p_src and p_src + 1).  When width is 16 the
 * same is done for bytes 8..15.  Both pointers advance by i_stride after
 * each of the `height' rows.
 */
#define __MotionComponent_X_y_avg(width,height)                             \
void _M(MotionComponent_X_y_avg_##width##_##height)(yuv_data_t * p_src,     \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_rows_left = height;                                               \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_rows_left-- > 0 )                                              \
    {                                                                       \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 );                      \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 );          \
        }                                                                   \
                                                                            \
        p_src += i_stride;                                                  \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
405
/*
 * Vertical half-pel prediction, "avg" flavour.  Generates
 * MotionComponent_x_Y_avg_<width>_<height>: the rounded mean of a source
 * byte and the byte one stride further on is itself averaged with the byte
 * already in p_dest (MMXInterpAverage2 on the current row and the row at
 * p_src + i_stride).  When width is 16 the same is done for bytes 8..15.
 * Both pointers advance by i_stride after each of the `height' rows.
 */
#define __MotionComponent_x_Y_avg(width,height)                             \
void _M(MotionComponent_x_Y_avg_##width##_##height)(yuv_data_t * p_src,     \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_rows_left = height;                                               \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_rows_left-- > 0 )                                              \
    {                                                                       \
        yuv_data_t * p_below = p_src + i_stride;                            \
                                                                            \
        MMXInterpAverage2( p_dest, p_src, p_below );                        \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_below + 8 );        \
        }                                                                   \
                                                                            \
        p_src = p_below;                                                    \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
429
/*
 * Horizontal + vertical half-pel prediction, "avg" flavour.  Generates
 * MotionComponent_X_Y_avg_<width>_<height>: the rounded mean of a 2x2
 * neighbourhood (p_src, p_src + 1 and the same two positions one stride
 * further on) is itself averaged with the byte already in p_dest
 * (MMXInterpAverage4).  When width is 16 the same is done for bytes 8..15.
 * Both pointers advance by i_stride after each of the `height' rows.
 */
#define __MotionComponent_X_Y_avg(width,height)                             \
void _M(MotionComponent_X_Y_avg_##width##_##height)(yuv_data_t * p_src,     \
                                                yuv_data_t * p_dest,        \
                                                int i_stride)               \
{                                                                           \
    int i_rows_left = height;                                               \
                                                                            \
    MMXZeroReg();                                                           \
                                                                            \
    while( i_rows_left-- > 0 )                                              \
    {                                                                       \
        yuv_data_t * p_below = p_src + i_stride;                            \
                                                                            \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_below,               \
                           p_below + 1 );                                   \
        if( width == 16 )                                                   \
        {                                                                   \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9,            \
                               p_below + 8, p_below + 9 );                  \
        }                                                                   \
                                                                            \
        p_src = p_below;                                                    \
        p_dest += i_stride;                                                 \
    }                                                                       \
}
456
457 #define __MotionComponents(width,height)                                    \
458 __MotionComponent_x_y_copy(width,height)                                    \
459 __MotionComponent_X_y_copy(width,height)                                    \
460 __MotionComponent_x_Y_copy(width,height)                                    \
461 __MotionComponent_X_Y_copy(width,height)                                    \
462 __MotionComponent_x_y_avg(width,height)                                     \
463 __MotionComponent_X_y_avg(width,height)                                     \
464 __MotionComponent_x_Y_avg(width,height)                                     \
465 __MotionComponent_X_Y_avg(width,height)
466
467 __MotionComponents (16,16)      /* 444, 422, 420 */
468 __MotionComponents (16,8)       /* 444, 422, 420 */
469 __MotionComponents (8,8)        /* 422, 420 */
470 __MotionComponents (8,4)        /* 420 */
471 #if 0
472 __MotionComponents (8,16)       /* 422 */
473 #endif