]> git.sesse.net Git - vlc/blob - modules/video_filter/deinterlace.c
Use var_InheritString for --decklink-video-connection.
[vlc] / modules / video_filter / deinterlace.c
1 /*****************************************************************************
2  * deinterlace.c : deinterlacer plugin for vlc
3  *****************************************************************************
4  * Copyright (C) 2000-2009 the VideoLAN team
5  * $Id$
6  *
7  * Author: Sam Hocevar <sam@zoy.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22  *****************************************************************************/
23
24 /*****************************************************************************
25  * Preamble
26  *****************************************************************************/
27
28 #ifdef HAVE_CONFIG_H
29 # include "config.h"
30 #endif
31
32 #include <assert.h>
33
34 #ifdef HAVE_ALTIVEC_H
35 #   include <altivec.h>
36 #endif
37
38 #include <vlc_common.h>
39 #include <vlc_plugin.h>
40 #include <vlc_filter.h>
41 #include <vlc_cpu.h>
42
43 #ifdef CAN_COMPILE_MMXEXT
44 #   include "mmx.h"
45 #endif
46
47 #define DEINTERLACE_DISCARD 1
48 #define DEINTERLACE_MEAN    2
49 #define DEINTERLACE_BLEND   3
50 #define DEINTERLACE_BOB     4
51 #define DEINTERLACE_LINEAR  5
52 #define DEINTERLACE_X       6
53 #define DEINTERLACE_YADIF   7
54 #define DEINTERLACE_YADIF2X 8
55
56 /*****************************************************************************
57  * Module descriptor
58  *****************************************************************************/
59 static int  Open ( vlc_object_t * );
60 static void Close( vlc_object_t * );
61
62 #define MODE_TEXT N_("Deinterlace mode")
63 #define MODE_LONGTEXT N_("Deinterlace method to use for local playback.")
64
65 #define SOUT_MODE_TEXT N_("Streaming deinterlace mode")
66 #define SOUT_MODE_LONGTEXT N_("Deinterlace method to use for streaming.")
67
68 #define FILTER_CFG_PREFIX "sout-deinterlace-"
69
70 static const char *const mode_list[] = {
71     "discard", "blend", "mean", "bob", "linear", "x", "yadif", "yadif2x" };
72 static const char *const mode_list_text[] = {
73     N_("Discard"), N_("Blend"), N_("Mean"), N_("Bob"), N_("Linear"), "X", "Yadif", "Yadif (2x)" };
74
75 vlc_module_begin ()
76     set_description( N_("Deinterlacing video filter") )
77     set_shortname( N_("Deinterlace" ))
78     set_capability( "video filter", 0 )
79     set_category( CAT_VIDEO )
80     set_subcategory( SUBCAT_VIDEO_VFILTER )
81
82     set_capability( "video filter2", 0 )
83     add_string( FILTER_CFG_PREFIX "mode", "blend", NULL, SOUT_MODE_TEXT,
84                 SOUT_MODE_LONGTEXT, false )
85         change_string_list( mode_list, mode_list_text, 0 )
86         change_safe ()
87     add_shortcut( "deinterlace" )
88     set_callbacks( Open, Close )
89 vlc_module_end ()
90
91
92 /*****************************************************************************
93  * Local prototypes
94  *****************************************************************************/
95 static void RenderDiscard( filter_t *, picture_t *, picture_t *, int );
96 static void RenderBob    ( filter_t *, picture_t *, picture_t *, int );
97 static void RenderMean   ( filter_t *, picture_t *, picture_t * );
98 static void RenderBlend  ( filter_t *, picture_t *, picture_t * );
99 static void RenderLinear ( filter_t *, picture_t *, picture_t *, int );
100 static void RenderX      ( picture_t *, picture_t * );
101 static int  RenderYadif  ( filter_t *, picture_t *, picture_t *, int, int );
102
103 static void MergeGeneric ( void *, const void *, const void *, size_t );
104 #if defined(CAN_COMPILE_C_ALTIVEC)
105 static void MergeAltivec ( void *, const void *, const void *, size_t );
106 #endif
107 #if defined(CAN_COMPILE_MMXEXT)
108 static void MergeMMXEXT  ( void *, const void *, const void *, size_t );
109 #endif
110 #if defined(CAN_COMPILE_3DNOW)
111 static void Merge3DNow   ( void *, const void *, const void *, size_t );
112 #endif
113 #if defined(CAN_COMPILE_SSE)
114 static void MergeSSE2    ( void *, const void *, const void *, size_t );
115 #endif
116 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
117 static void EndMMX       ( void );
118 #endif
119 #if defined(CAN_COMPILE_3DNOW)
120 static void End3DNow     ( void );
121 #endif
122 #if defined __ARM_NEON__
123 static void MergeNEON (void *, const void *, const void *, size_t);
124 #endif
125
126 static const char *const ppsz_filter_options[] = {
127     "mode", NULL
128 };
129
130 #define HISTORY_SIZE (3)
/* Private state of one deinterlace filter instance. */
struct filter_sys_t
{
    int  i_mode;        /* Deinterlace mode (one of the DEINTERLACE_* values) */
    bool b_double_rate; /* Shall we double the framerate? */
    bool b_half_height; /* Shall we divide the height by 2? */

    /* Line-averaging routine: (destination, source 1, source 2, byte count) */
    void (*pf_merge) ( void *, const void *, const void *, size_t );
    /* Optional hook run once after a batch of pf_merge calls; may be NULL
     * (the EndMerge macro checks it before calling). */
    void (*pf_end_merge) ( void );

    /* NOTE(review): presumably the date of the last processed picture; it is
     * not used in the visible part of the file, confirm against the filter
     * callback. */
    mtime_t i_last_date;

    /* Yadif */
    picture_t *pp_history[HISTORY_SIZE];
};
145
146 /*****************************************************************************
147  * SetFilterMethod: setup the deinterlace method to use.
148  *****************************************************************************/
149 static void SetFilterMethod( filter_t *p_filter, const char *psz_method, vlc_fourcc_t i_chroma )
150 {
151     filter_sys_t *p_sys = p_filter->p_sys;
152
153     if( !psz_method )
154         psz_method = "";
155
156     if( !strcmp( psz_method, "mean" ) )
157     {
158         p_sys->i_mode = DEINTERLACE_MEAN;
159         p_sys->b_double_rate = false;
160         p_sys->b_half_height = true;
161     }
162     else if( !strcmp( psz_method, "bob" )
163              || !strcmp( psz_method, "progressive-scan" ) )
164     {
165         p_sys->i_mode = DEINTERLACE_BOB;
166         p_sys->b_double_rate = true;
167         p_sys->b_half_height = false;
168     }
169     else if( !strcmp( psz_method, "linear" ) )
170     {
171         p_sys->i_mode = DEINTERLACE_LINEAR;
172         p_sys->b_double_rate = true;
173         p_sys->b_half_height = false;
174     }
175     else if( !strcmp( psz_method, "x" ) )
176     {
177         p_sys->i_mode = DEINTERLACE_X;
178         p_sys->b_double_rate = false;
179         p_sys->b_half_height = false;
180     }
181     else if( !strcmp( psz_method, "yadif" ) )
182     {
183         p_sys->i_mode = DEINTERLACE_YADIF;
184         p_sys->b_double_rate = false;
185         p_sys->b_half_height = false;
186     }
187     else if( !strcmp( psz_method, "yadif2x" ) )
188     {
189         p_sys->i_mode = DEINTERLACE_YADIF2X;
190         p_sys->b_double_rate = true;
191         p_sys->b_half_height = false;
192     }
193     else if( !strcmp( psz_method, "discard" ) )
194     {
195         const bool b_i422 = i_chroma == VLC_CODEC_I422 ||
196                             i_chroma == VLC_CODEC_J422;
197
198         p_sys->i_mode = DEINTERLACE_DISCARD;
199         p_sys->b_double_rate = false;
200         p_sys->b_half_height = !b_i422;
201     }
202     else
203     {
204         if( strcmp( psz_method, "blend" ) )
205             msg_Err( p_filter,
206                      "no valid deinterlace mode provided, using \"blend\"" );
207
208         p_sys->i_mode = DEINTERLACE_BLEND;
209         p_sys->b_double_rate = false;
210         p_sys->b_half_height = false;
211     }
212
213     msg_Dbg( p_filter, "using %s deinterlace method", psz_method );
214 }
215
216 static void GetOutputFormat( filter_t *p_filter,
217                              video_format_t *p_dst, const video_format_t *p_src )
218 {
219     filter_sys_t *p_sys = p_filter->p_sys;
220     *p_dst = *p_src;
221
222     if( p_sys->b_half_height )
223     {
224         p_dst->i_height /= 2;
225         p_dst->i_visible_height /= 2;
226         p_dst->i_y_offset /= 2;
227         p_dst->i_sar_den *= 2;
228     }
229
230     if( p_src->i_chroma == VLC_CODEC_I422 ||
231         p_src->i_chroma == VLC_CODEC_J422 )
232     {
233         switch( p_sys->i_mode )
234         {
235         case DEINTERLACE_MEAN:
236         case DEINTERLACE_LINEAR:
237         case DEINTERLACE_X:
238         case DEINTERLACE_YADIF:
239         case DEINTERLACE_YADIF2X:
240             p_dst->i_chroma = p_src->i_chroma;
241             break;
242         default:
243             p_dst->i_chroma = p_src->i_chroma == VLC_CODEC_I422 ? VLC_CODEC_I420 :
244                                                                   VLC_CODEC_J420;
245             break;
246         }
247     }
248 }
249
250 static bool IsChromaSupported( vlc_fourcc_t i_chroma )
251 {
252     return i_chroma == VLC_CODEC_I420 ||
253            i_chroma == VLC_CODEC_J420 ||
254            i_chroma == VLC_CODEC_YV12 ||
255            i_chroma == VLC_CODEC_I422 ||
256            i_chroma == VLC_CODEC_J422;
257 }
258
259 /*****************************************************************************
260  * RenderDiscard: only keep TOP or BOTTOM field, discard the other.
261  *****************************************************************************/
262 static void RenderDiscard( filter_t *p_filter,
263                            picture_t *p_outpic, picture_t *p_pic, int i_field )
264 {
265     int i_plane;
266
267     /* Copy image and skip lines */
268     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
269     {
270         uint8_t *p_in, *p_out_end, *p_out;
271         int i_increment;
272
273         p_in = p_pic->p[i_plane].p_pixels
274                    + i_field * p_pic->p[i_plane].i_pitch;
275
276         p_out = p_outpic->p[i_plane].p_pixels;
277         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
278                              * p_outpic->p[i_plane].i_visible_lines;
279
280         switch( p_filter->fmt_in.video.i_chroma )
281         {
282         case VLC_CODEC_I420:
283         case VLC_CODEC_J420:
284         case VLC_CODEC_YV12:
285
286             for( ; p_out < p_out_end ; )
287             {
288                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
289
290                 p_out += p_outpic->p[i_plane].i_pitch;
291                 p_in += 2 * p_pic->p[i_plane].i_pitch;
292             }
293             break;
294
295         case VLC_CODEC_I422:
296         case VLC_CODEC_J422:
297
298             i_increment = 2 * p_pic->p[i_plane].i_pitch;
299
300             if( i_plane == Y_PLANE )
301             {
302                 for( ; p_out < p_out_end ; )
303                 {
304                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
305                     p_out += p_outpic->p[i_plane].i_pitch;
306                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
307                     p_out += p_outpic->p[i_plane].i_pitch;
308                     p_in += i_increment;
309                 }
310             }
311             else
312             {
313                 for( ; p_out < p_out_end ; )
314                 {
315                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
316                     p_out += p_outpic->p[i_plane].i_pitch;
317                     p_in += i_increment;
318                 }
319             }
320             break;
321
322         default:
323             break;
324         }
325     }
326 }
327
328 /*****************************************************************************
329  * RenderBob: renders a BOB picture - simple copy
330  *****************************************************************************/
331 static void RenderBob( filter_t *p_filter,
332                        picture_t *p_outpic, picture_t *p_pic, int i_field )
333 {
334     int i_plane;
335
336     /* Copy image and skip lines */
337     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
338     {
339         uint8_t *p_in, *p_out_end, *p_out;
340
341         p_in = p_pic->p[i_plane].p_pixels;
342         p_out = p_outpic->p[i_plane].p_pixels;
343         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
344                              * p_outpic->p[i_plane].i_visible_lines;
345
346         switch( p_filter->fmt_in.video.i_chroma )
347         {
348             case VLC_CODEC_I420:
349             case VLC_CODEC_J420:
350             case VLC_CODEC_YV12:
351                 /* For BOTTOM field we need to add the first line */
352                 if( i_field == 1 )
353                 {
354                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
355                     p_in += p_pic->p[i_plane].i_pitch;
356                     p_out += p_outpic->p[i_plane].i_pitch;
357                 }
358
359                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
360
361                 for( ; p_out < p_out_end ; )
362                 {
363                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
364
365                     p_out += p_outpic->p[i_plane].i_pitch;
366
367                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
368
369                     p_in += 2 * p_pic->p[i_plane].i_pitch;
370                     p_out += p_outpic->p[i_plane].i_pitch;
371                 }
372
373                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
374
375                 /* For TOP field we need to add the last line */
376                 if( i_field == 0 )
377                 {
378                     p_in += p_pic->p[i_plane].i_pitch;
379                     p_out += p_outpic->p[i_plane].i_pitch;
380                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
381                 }
382                 break;
383
384             case VLC_CODEC_I422:
385             case VLC_CODEC_J422:
386                 /* For BOTTOM field we need to add the first line */
387                 if( i_field == 1 )
388                 {
389                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
390                     p_in += p_pic->p[i_plane].i_pitch;
391                     p_out += p_outpic->p[i_plane].i_pitch;
392                 }
393
394                 p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
395
396                 if( i_plane == Y_PLANE )
397                 {
398                     for( ; p_out < p_out_end ; )
399                     {
400                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
401
402                         p_out += p_outpic->p[i_plane].i_pitch;
403
404                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
405
406                         p_in += 2 * p_pic->p[i_plane].i_pitch;
407                         p_out += p_outpic->p[i_plane].i_pitch;
408                     }
409                 }
410                 else
411                 {
412                     for( ; p_out < p_out_end ; )
413                     {
414                         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
415
416                         p_out += p_outpic->p[i_plane].i_pitch;
417                         p_in += 2 * p_pic->p[i_plane].i_pitch;
418                     }
419                 }
420
421                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
422
423                 /* For TOP field we need to add the last line */
424                 if( i_field == 0 )
425                 {
426                     p_in += p_pic->p[i_plane].i_pitch;
427                     p_out += p_outpic->p[i_plane].i_pitch;
428                     vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
429                 }
430                 break;
431         }
432     }
433 }
434
435 #define Merge p_filter->p_sys->pf_merge
436 #define EndMerge if(p_filter->p_sys->pf_end_merge) p_filter->p_sys->pf_end_merge
437
438 /*****************************************************************************
439  * RenderLinear: BOB with linear interpolation
440  *****************************************************************************/
441 static void RenderLinear( filter_t *p_filter,
442                           picture_t *p_outpic, picture_t *p_pic, int i_field )
443 {
444     int i_plane;
445
446     /* Copy image and skip lines */
447     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
448     {
449         uint8_t *p_in, *p_out_end, *p_out;
450
451         p_in = p_pic->p[i_plane].p_pixels;
452         p_out = p_outpic->p[i_plane].p_pixels;
453         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
454                              * p_outpic->p[i_plane].i_visible_lines;
455
456         /* For BOTTOM field we need to add the first line */
457         if( i_field == 1 )
458         {
459             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
460             p_in += p_pic->p[i_plane].i_pitch;
461             p_out += p_outpic->p[i_plane].i_pitch;
462         }
463
464         p_out_end -= 2 * p_outpic->p[i_plane].i_pitch;
465
466         for( ; p_out < p_out_end ; )
467         {
468             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
469
470             p_out += p_outpic->p[i_plane].i_pitch;
471
472             Merge( p_out, p_in, p_in + 2 * p_pic->p[i_plane].i_pitch,
473                    p_pic->p[i_plane].i_pitch );
474
475             p_in += 2 * p_pic->p[i_plane].i_pitch;
476             p_out += p_outpic->p[i_plane].i_pitch;
477         }
478
479         vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
480
481         /* For TOP field we need to add the last line */
482         if( i_field == 0 )
483         {
484             p_in += p_pic->p[i_plane].i_pitch;
485             p_out += p_outpic->p[i_plane].i_pitch;
486             vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
487         }
488     }
489     EndMerge();
490 }
491
492 static void RenderMean( filter_t *p_filter,
493                         picture_t *p_outpic, picture_t *p_pic )
494 {
495     int i_plane;
496
497     /* Copy image and skip lines */
498     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
499     {
500         uint8_t *p_in, *p_out_end, *p_out;
501
502         p_in = p_pic->p[i_plane].p_pixels;
503
504         p_out = p_outpic->p[i_plane].p_pixels;
505         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
506                              * p_outpic->p[i_plane].i_visible_lines;
507
508         /* All lines: mean value */
509         for( ; p_out < p_out_end ; )
510         {
511             Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
512                    p_pic->p[i_plane].i_pitch );
513
514             p_out += p_outpic->p[i_plane].i_pitch;
515             p_in += 2 * p_pic->p[i_plane].i_pitch;
516         }
517     }
518     EndMerge();
519 }
520
521 static void RenderBlend( filter_t *p_filter,
522                          picture_t *p_outpic, picture_t *p_pic )
523 {
524     int i_plane;
525
526     /* Copy image and skip lines */
527     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
528     {
529         uint8_t *p_in, *p_out_end, *p_out;
530
531         p_in = p_pic->p[i_plane].p_pixels;
532
533         p_out = p_outpic->p[i_plane].p_pixels;
534         p_out_end = p_out + p_outpic->p[i_plane].i_pitch
535                              * p_outpic->p[i_plane].i_visible_lines;
536
537         switch( p_filter->fmt_in.video.i_chroma )
538         {
539             case VLC_CODEC_I420:
540             case VLC_CODEC_J420:
541             case VLC_CODEC_YV12:
542                 /* First line: simple copy */
543                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
544                 p_out += p_outpic->p[i_plane].i_pitch;
545
546                 /* Remaining lines: mean value */
547                 for( ; p_out < p_out_end ; )
548                 {
549                     Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
550                            p_pic->p[i_plane].i_pitch );
551
552                     p_out += p_outpic->p[i_plane].i_pitch;
553                     p_in += p_pic->p[i_plane].i_pitch;
554                 }
555                 break;
556
557             case VLC_CODEC_I422:
558             case VLC_CODEC_J422:
559                 /* First line: simple copy */
560                 vlc_memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
561                 p_out += p_outpic->p[i_plane].i_pitch;
562
563                 /* Remaining lines: mean value */
564                 if( i_plane == Y_PLANE )
565                 {
566                     for( ; p_out < p_out_end ; )
567                     {
568                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
569                                p_pic->p[i_plane].i_pitch );
570
571                         p_out += p_outpic->p[i_plane].i_pitch;
572                         p_in += p_pic->p[i_plane].i_pitch;
573                     }
574                 }
575
576                 else
577                 {
578                     for( ; p_out < p_out_end ; )
579                     {
580                         Merge( p_out, p_in, p_in + p_pic->p[i_plane].i_pitch,
581                                p_pic->p[i_plane].i_pitch );
582
583                         p_out += p_outpic->p[i_plane].i_pitch;
584                         p_in += 2*p_pic->p[i_plane].i_pitch;
585                     }
586                 }
587                 break;
588         }
589     }
590     EndMerge();
591 }
592
593 #undef Merge
594
/*****************************************************************************
 * MergeGeneric: portable byte-wise average of two buffers
 *****************************************************************************
 * Writes ( s1[i] + s2[i] ) >> 1 (truncating) to dest[i] for every i in
 * [0, i_bytes).
 *
 *  _p_dest: destination, at least i_bytes bytes
 *  _p_s1:   first source, at least i_bytes bytes
 *  _p_s2:   second source, at least i_bytes bytes
 *  i_bytes: number of bytes to process; 0 is a no-op
 *
 * The loop is index based so that no pointer outside the buffers is ever
 * formed, whatever the value of i_bytes.
 *****************************************************************************/
static void MergeGeneric( void *_p_dest, const void *_p_s1,
                          const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    for( size_t i = 0; i < i_bytes; i++ )
        p_dest[i] = (uint8_t)( ( p_s1[i] + p_s2[i] ) >> 1 );
}
622
#if defined(CAN_COMPILE_MMXEXT)
/*****************************************************************************
 * MergeMMXEXT: average two buffers, eight bytes at a time with pavgb
 *****************************************************************************
 * Same parameters as MergeGeneric. Blocks of eight bytes go through the MMX
 * register mm1 for as long as strictly more than eight bytes remain; the
 * rest (up to eight bytes) is averaged in plain C.
 *****************************************************************************/
static void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
                         size_t i_bytes )
{
    uint8_t *p_out = (uint8_t *)_p_dest;
    const uint8_t *p_a = (const uint8_t *)_p_s1;
    const uint8_t *p_b = (const uint8_t *)_p_s2;
    uint8_t *const p_stop = p_out + i_bytes;
    uint8_t *const p_vec_stop = p_stop - 8;

    for( ; p_out < p_vec_stop; p_out += 8, p_a += 8, p_b += 8 )
    {
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_out):
                                                 "m" (*p_a),
                                                 "m" (*p_b) );
    }

    /* Scalar tail */
    for( ; p_out < p_stop; p_out++, p_a++, p_b++ )
        *p_out = ( (uint16_t)(*p_a) + (uint16_t)(*p_b) ) >> 1;
}
#endif
651
#if defined(CAN_COMPILE_3DNOW)
/*****************************************************************************
 * Merge3DNow: average two buffers, eight bytes at a time with pavgusb
 *****************************************************************************
 * Same parameters as MergeGeneric. Blocks of eight bytes go through the MMX
 * register mm1 for as long as strictly more than eight bytes remain; the
 * rest (up to eight bytes) is averaged in plain C.
 *****************************************************************************/
static void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
                        size_t i_bytes )
{
    uint8_t *p_out = (uint8_t *)_p_dest;
    const uint8_t *p_a = (const uint8_t *)_p_s1;
    const uint8_t *p_b = (const uint8_t *)_p_s2;
    uint8_t *const p_stop = p_out + i_bytes;
    uint8_t *const p_vec_stop = p_stop - 8;

    for( ; p_out < p_vec_stop; p_out += 8, p_a += 8, p_b += 8 )
    {
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgusb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_out):
                                                 "m" (*p_a),
                                                 "m" (*p_b) );
    }

    /* Scalar tail */
    for( ; p_out < p_stop; p_out++, p_a++, p_b++ )
        *p_out = ( (uint16_t)(*p_a) + (uint16_t)(*p_b) ) >> 1;
}
#endif
680
#if defined(CAN_COMPILE_SSE)
/*****************************************************************************
 * MergeSSE2: average two buffers, sixteen bytes at a time with pavgb
 *****************************************************************************
 * Same parameters as MergeGeneric. Processing happens in three phases:
 *  1. scalar bytes until p_s1 reaches a 16-byte boundary;
 *  2. 16-byte blocks through xmm1 while strictly more than 16 bytes remain;
 *  3. scalar bytes for whatever is left.
 *
 * Every phase decrements i_bytes, so exactly i_bytes bytes are written in
 * total. (Previously the bytes consumed by phase 1 were not accounted for,
 * so a misaligned p_s1 made the function run past the requested length.)
 *****************************************************************************/
static void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                       size_t i_bytes )
{
    uint8_t* p_dest = (uint8_t*)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;

    /* Phase 1: align the first source, never exceeding the byte budget */
    for( ; i_bytes > 0 && ( (uintptr_t)p_s1 % 16 ); i_bytes-- )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    /* Phase 2: vector blocks. The strict comparison keeps the same split
     * between vector and scalar work as before for aligned input. */
    for( ; i_bytes > 16; i_bytes -= 16 )
    {
        __asm__  __volatile__( "movdqu %2,%%xmm1;"
                               "pavgb %1, %%xmm1;"
                               "movdqu %%xmm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2) );
        p_dest += 16;
        p_s1 += 16;
        p_s2 += 16;
    }

    /* Phase 3: scalar tail */
    for( ; i_bytes > 0; i_bytes-- )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
714
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
/* EndMMX: issue the x86 `emms` instruction.
 * Has the signature of filter_sys_t.pf_end_merge.
 * NOTE(review): presumably installed as the end-of-merge hook next to
 * MergeMMXEXT / MergeSSE2; the assignment is outside the visible source. */
static void EndMMX( void )
{
    __asm__ __volatile__( "emms" :: );
}
#endif
721
#if defined(CAN_COMPILE_3DNOW)
/* End3DNow: issue the 3DNow! `femms` instruction.
 * Has the signature of filter_sys_t.pf_end_merge.
 * NOTE(review): presumably installed as the end-of-merge hook next to
 * Merge3DNow; the assignment is outside the visible source. */
static void End3DNow( void )
{
    __asm__ __volatile__( "femms" :: );
}
#endif
728
#ifdef CAN_COMPILE_C_ALTIVEC
/*****************************************************************************
 * MergeAltivec: average two buffers, sixteen bytes at a time with vec_avg
 *****************************************************************************
 * Same parameters as MergeGeneric. Processing happens in three phases:
 *  1. scalar bytes until the destination is 16-byte aligned (bounded by
 *     i_bytes);
 *  2. 16-byte vectors while at least 16 bytes remain, with a permute-based
 *     path when either source is not 16-byte aligned;
 *  3. scalar bytes for whatever is left.
 *
 * Alignment tests go through uintptr_t so that pointers are not truncated
 * on targets where int is narrower than a pointer, and the end of the
 * buffer is tracked as p_dest + i_bytes so that no pointer before the start
 * of the destination is formed for short inputs.
 *****************************************************************************/
static void MergeAltivec( void *_p_dest, const void *_p_s1,
                          const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    uint8_t *p_s1   = (uint8_t *)_p_s1;
    uint8_t *p_s2   = (uint8_t *)_p_s2;
    uint8_t *const p_stop = p_dest + i_bytes;

    /* Use C until the first 16-bytes aligned destination pixel */
    while( p_dest < p_stop && ( (uintptr_t)p_dest & 0xF ) )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    if( ( (uintptr_t)p_s1 | (uintptr_t)p_s2 ) & 0xF )
    {
        /* Unaligned source */
        vector unsigned char s1v, s2v, destv;
        vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
        vector unsigned char perm1v, perm2v;

        perm1v = vec_lvsl( 0, p_s1 );
        perm2v = vec_lvsl( 0, p_s2 );
        s1oldv = vec_ld( 0, p_s1 );
        s2oldv = vec_ld( 0, p_s2 );

        while( p_stop - p_dest >= 16 )
        {
            /* Combine the previous and next aligned loads into the
             * 16 bytes that start at the unaligned source address */
            s1newv = vec_ld( 16, p_s1 );
            s2newv = vec_ld( 16, p_s2 );
            s1v    = vec_perm( s1oldv, s1newv, perm1v );
            s2v    = vec_perm( s2oldv, s2newv, perm2v );
            s1oldv = s1newv;
            s2oldv = s2newv;
            destv  = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1   += 16;
            p_s2   += 16;
            p_dest += 16;
        }
    }
    else
    {
        /* Aligned source */
        vector unsigned char s1v, s2v, destv;

        while( p_stop - p_dest >= 16 )
        {
            s1v   = vec_ld( 0, p_s1 );
            s2v   = vec_ld( 0, p_s2 );
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1   += 16;
            p_s2   += 16;
            p_dest += 16;
        }
    }

    /* Scalar tail */
    while( p_dest < p_stop )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
798
#ifdef __ARM_NEON__
/*****************************************************************************
 * MergeNEON: average two buffers with NEON halving adds (vhadd.u8)
 *****************************************************************************
 *  out: destination, at least n bytes
 *  in1: first source, at least n bytes
 *  in2: second source, at least n bytes
 *  n:   number of bytes to process
 *
 * Processing happens in three phases:
 *  1. MergeGeneric on the (16 - misalignment) leading bytes, capped at n, so
 *     that the destination is 16-byte aligned as the ":128" stores require;
 *  2. the assembly loop, which consumes 64 bytes per iteration (two 32-byte
 *     loads from each source, two 32-byte stores), run over the largest
 *     multiple of 64 bytes that fits; a ":128" hint is used on the loads
 *     only when both sources are 16-byte aligned;
 *  3. MergeGeneric on the remaining (fewer than 64) bytes.
 *****************************************************************************/
static void MergeNEON (void *restrict out, const void *in1,
                       const void *in2, size_t n)
{
    uint8_t *outp = out;
    const uint8_t *in1p = in1;
    const uint8_t *in2p = in2;
    size_t mis = ((uintptr_t)outp) & 15;

    if (mis)
    {
        /* Bytes needed to reach the next 16-byte boundary */
        size_t head = 16 - mis;

        if (head > n)
            head = n;

        MergeGeneric (outp, in1p, in2p, head);
        outp += head;
        in1p += head;
        in2p += head;
        n -= head;
    }

    /* One loop iteration handles exactly 64 bytes */
    uint8_t *end = outp + (n & ~(size_t)63);

    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
        while (outp < end)
            asm volatile (
                "vld1.u8  {q0-q1}, [%[in1]]!\n"
                "vld1.u8  {q2-q3}, [%[in2]]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1]]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2]]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");
    else
         while (outp < end)
            asm volatile (
                "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
                "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");

    /* Whatever did not fill a whole 64-byte block */
    n &= 63;
    if (n)
        MergeGeneric (outp, in1p, in2p, n);
}
#endif
858
859 /*****************************************************************************
860  * RenderX: This algorithm works on an 8x8 block basis: it copies the top
861  * field and applies a process to recreate the bottom field:
862  *  If a 8x8 block is classified as :
863  *   - progressive: it applies a small blend (1,6,1)
864  *   - interlaced:
865  *    * in the MMX version: we do a ME between the 2 fields, if there is a
866  *    good match we use MC to recreate the bottom field (with a small
867  *    blend (1,6,1) )
868  *    * otherwise: it recreates the bottom field by an edge oriented
869  *    interpolation.
870   *****************************************************************************/
871
/* XDeint8x8Detect: detect whether an 8x8 block is interlaced.
 * XXX: It needs access to an 8x10 area.
 * We use more than 8 lines to help with scrolling (text)
 * (and because XDeint8x8Frame uses line 9).
 * XXX: detection on smooth/uniform areas with noise doesn't work well,
 * but that's not really a problem because they don't have much detail anyway.
 */
/* ssd: return the square of a (one term of a sum of squared differences). */
static inline int ssd( int a )
{
    const int i_square = a * a;
    return i_square;
}
/* XDeint8x8DetectC: plain C interlacing detector for one 8-pixel wide block.
 *
 * src points at the top-left pixel of the block and i_src is the distance in
 * bytes between two consecutive lines. Four groups of lines are examined,
 * starting at lines 0, 2, 4 and 6; each group reads 4 consecutive lines, so
 * lines 0..9 are accessed.
 *
 * For every group two sums of squared differences over the 8 columns are
 * built:
 *   - "frame": adjacent lines          (l0-l1) and (l1-l2)
 *   - "field": lines two rows apart    (l0-l2) and (l1-l3)
 * The group is flagged when the frame sum exceeds 32 and the field sum is
 * below 6/8 of it.
 *
 * Returns 1 if at least one group is flagged, 0 otherwise.
 */
static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
{
    int i_hits = 0;

    for( int i_group = 0; i_group < 4; i_group++ )
    {
        const uint8_t *p_l0 = src + 2 * i_group * i_src;
        const uint8_t *p_l1 = p_l0 + i_src;
        const uint8_t *p_l2 = p_l1 + i_src;
        const uint8_t *p_l3 = p_l2 + i_src;
        int i_frame = 0;
        int i_field = 0;

        for( int i_col = 0; i_col < 8; i_col++ )
        {
            i_frame += ssd( p_l0[i_col] - p_l1[i_col] ) +
                       ssd( p_l1[i_col] - p_l2[i_col] );
            i_field += ssd( p_l0[i_col] - p_l2[i_col] ) +
                       ssd( p_l1[i_col] - p_l3[i_col] );
        }

        if( i_frame > 32 && i_field < 6 * i_frame / 8 )
            i_hits++;
    }

    return i_hits >= 1;
}
906 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8DetectMMXEXT: MMX implementation of the 8x8 interlacing detector.
 *
 * src points at the top-left pixel of the block, i_src is the line stride in
 * bytes. Returns the number of line groups flagged as interlaced (so any
 * non-zero value means "interlaced"); unlike the C variant the count itself
 * is returned rather than 0/1.
 *
 * Register usage (operand order of the mmx.h macros is assumed to be
 * "source, destination", as suggested by the m2r/r2m names):
 *   mm7      : constant zero, used to widen bytes to 16-bit words
 *   mm5      : accumulator read back into fr (adjacent-line differences)
 *   mm6      : accumulator read back into ff (differences two lines apart)
 *
 * NOTE(review): the outer loop runs for y = 0,2,4,6,8 (five groups, reading
 * lines 0..11) whereas XDeint8x8DetectC stops at y = 6 (four groups, lines
 * 0..9) -- confirm whether the extra group is intended.
 * The caller is responsible for issuing emms() afterwards (this function
 * does not).
 */
static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
{

    int y, x;
    int32_t ff, fr;
    int fc;

    /* Detect interlacing */
    fc = 0;
    pxor_r2r( mm7, mm7 );   /* mm7 = 0 */
    for( y = 0; y < 9; y += 2 )
    {
        ff = fr = 0;
        pxor_r2r( mm5, mm5 );   /* reset the fr accumulator */
        pxor_r2r( mm6, mm6 );   /* reset the ff accumulator */
        for( x = 0; x < 8; x+=4 )
        {
            /* Load 4 pixels from each of 4 consecutive lines */
            movd_m2r( src[        x], mm0 );
            movd_m2r( src[1*i_src+x], mm1 );
            movd_m2r( src[2*i_src+x], mm2 );
            movd_m2r( src[3*i_src+x], mm3 );

            /* Widen the bytes to 16-bit words (interleave with zero) */
            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            punpcklbw_r2r( mm7, mm3 );

            movq_r2r( mm0, mm4 );   /* keep a copy of line 0 */

            psubw_r2r( mm1, mm0 );  /* mm0 = l0 - l1 */
            psubw_r2r( mm2, mm4 );  /* mm4 = l0 - l2 */

            psubw_r2r( mm1, mm2 );  /* mm2 = l2 - l1 */
            psubw_r2r( mm1, mm3 );  /* mm3 = l3 - l1 */

            /* Multiplying a register by itself with pmaddwd squares each
             * word and adds neighbouring products into 32-bit sums */
            pmaddwd_r2r( mm0, mm0 );
            pmaddwd_r2r( mm4, mm4 );
            pmaddwd_r2r( mm2, mm2 );
            pmaddwd_r2r( mm3, mm3 );
            paddd_r2r( mm0, mm2 );  /* (l0-l1)^2 + (l2-l1)^2 */
            paddd_r2r( mm4, mm3 );  /* (l0-l2)^2 + (l3-l1)^2 */
            paddd_r2r( mm2, mm5 );  /* accumulate into fr */
            paddd_r2r( mm3, mm6 );  /* accumulate into ff */
        }

        /* Horizontal add: fold the upper 32 bits onto the lower 32 bits */
        movq_r2r( mm5, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm5 );
        movd_r2m( mm5, fr );

        movq_r2r( mm6, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm6 );
        movd_r2m( mm6, ff );

        /* Same decision rule as the C variant */
        if( ff < 6*fr/8 && fr > 32 )
            fc++;

        src += 2*i_src;
    }
    return fc;
}
969 #endif
970
/* XDeint8x8MergeC: build an 8x8 progressive block from two line sources.
 *
 * For each of the 4 output line pairs:
 *   - the even output line is a straight copy of the current src1 line;
 *   - the odd output line is the (1,6,1) blend
 *         (src1[line] + 6*src2[line] + src1[line+1] + 4) >> 3
 *
 * dst/i_dst   : destination block and its line stride
 * src1/i_src1 : lines copied verbatim (5 lines are read)
 * src2/i_src2 : lines carrying the weight 6 in the blend (4 lines are read)
 */
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
                                    uint8_t *src1, int i_src1,
                                    uint8_t *src2, int i_src2 )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_top    = src1 + i_pair * i_src1;
        const uint8_t *p_bottom = p_top + i_src1;
        const uint8_t *p_middle = src2 + i_pair * i_src2;
        uint8_t *p_even = dst + 2 * i_pair * i_dst;
        uint8_t *p_odd  = p_even + i_dst;

        /* Even line: untouched copy */
        memcpy( p_even, p_top, 8 );

        /* Odd line: weighted blend with rounding */
        for( int i_col = 0; i_col < 8; i_col++ )
        {
            const int i_sum = p_top[i_col] + 6 * p_middle[i_col]
                            + p_bottom[i_col];
            p_odd[i_col] = ( i_sum + 4 ) >> 3;
        }
    }
}
991
992 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8MergeMMXEXT: MMX version of the progressive 8x8 merge.
 *
 * Same arguments as XDeint8x8MergeC. For each of the 4 output line pairs the
 * even line is copied from src1 and the odd line is computed, 4 pixels at a
 * time, as (src1[line] + 6*src2[line] + src1[line+1] + 4) >> 3.
 *
 * mm7 is kept at zero and used both to widen bytes to words and as the
 * second operand when packing the result back to bytes. The caller is
 * responsible for emms() (not issued here).
 */
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
                                         uint8_t *src1, int i_src1,
                                         uint8_t *src2, int i_src2 )
{
    /* Rounding term: the value 4 replicated in each 16-bit word */
    static const uint64_t m_4 = INT64_C(0x0004000400040004);
    int y, x;

    /* Progressive */
    pxor_r2r( mm7, mm7 );   /* mm7 = 0 */
    for( y = 0; y < 8; y += 2 )
    {
        for( x = 0; x < 8; x +=4 )
        {
            /* Even output line: copy 4 pixels from src1 unchanged */
            movd_m2r( src1[x], mm0 );
            movd_r2m( mm0, dst[x] );

            movd_m2r( src2[x], mm1 );
            movd_m2r( src1[i_src1+x], mm2 );

            /* Widen to 16-bit words */
            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            paddw_r2r( mm1, mm1 );      /* mm1 = 2*src2 */
            movq_r2r( mm1, mm3 );
            paddw_r2r( mm3, mm3 );      /* mm3 = 4*src2 */
            paddw_r2r( mm2, mm0 );      /* mm0 = src1 + next src1 line */
            paddw_r2r( mm3, mm1 );      /* mm1 = 6*src2 */
            paddw_m2r( m_4, mm1 );      /* + 4 (rounding) */
            paddw_r2r( mm1, mm0 );      /* full weighted sum */
            psraw_i2r( 3, mm0 );        /* divide by 8 */
            packuswb_r2r( mm7, mm0 );   /* back to bytes */
            /* Odd output line */
            movd_r2m( mm0, dst[i_dst+x] );
        }
        dst += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
1031
1032 #endif
1033
1034 /* For debug */
/* XDeint8x8Set: fill an 8x8 block with the constant value v.
 * dst is the top-left pixel of the block, i_dst its line stride in bytes.
 * Only the 8 first bytes of each of the 8 lines are written.
 */
static inline void XDeint8x8Set( uint8_t *dst, int i_dst, uint8_t v )
{
    int i_offset = 0;

    for( int i_line = 0; i_line < 8; i_line++, i_offset += i_dst )
        memset( dst + i_offset, v, 8 );
}
1041
/* XDeint8x8FieldE: Simple (1,0,1) deinterlacing for blocks that lack a
 * neighbour
 * (Uses 8x9 pixels)
 * TODO: a better one for the inner part.
 */
/* XDeint8x8FieldEC: C version of the simple field interpolation.
 *
 * Even output lines are copied from the even source lines (0, 2, 4, 6).
 * Each odd output line is the truncated average of the source line above
 * and the source line two rows further down, so source lines 0, 2, 4, 6
 * and 8 are read; odd source lines are never used.
 *
 * dst/i_dst : destination block and line stride
 * src/i_src : source block and line stride
 */
static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
                                     uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_above = src + 2 * i_pair * i_src;
        const uint8_t *p_under = p_above + 2 * i_src;
        uint8_t *p_kept = dst + 2 * i_pair * i_dst;
        uint8_t *p_made = p_kept + i_dst;

        memcpy( p_kept, p_above, 8 );

        for( int i_col = 0; i_col < 8; i_col++ )
            p_made[i_col] = ( p_above[i_col] + p_under[i_col] ) >> 1;
    }
}
1064 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8FieldEMMXEXT: MMX version of the simple field interpolation.
 *
 * Same arguments as XDeint8x8FieldEC. Each iteration handles one output
 * line pair, 8 pixels at once: the even line is copied from src and the
 * odd line is the byte-wise average of that source line and the source line
 * two rows below.
 *
 * NOTE(review): per the x86 instruction definition pavgb rounds up
 * ((a+b+1)>>1) while the C variant truncates ((a+b)>>1), so results may
 * differ by one -- confirm this is acceptable.
 * The caller is responsible for emms() (not issued here).
 */
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
{
    int y;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        /* Even output line: copy 8 pixels */
        movq_m2r( src[0], mm0 );
        movq_r2m( mm0, dst[0] );
        dst += i_dst;

        /* Odd output line: average with the line two rows below */
        movq_m2r( src[2*i_src], mm1 );
        pavgb_r2r( mm1, mm0 );

        movq_r2m( mm0, dst[0] );

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1086 #endif
1087
1088 /* XDeint8x8Field: Edge oriented interpolation
1089  * (Need -4 and +5 pixels H, +1 line)
1090  */
/* XDeint8x8FieldC: C version of the edge oriented field interpolation.
 *
 * Even output lines are copied from the even source lines. For every pixel
 * of an odd output line, three sums of absolute differences over 8 pixels
 * are computed between the source line above ("up") and the source line two
 * rows further down ("down"):
 *   - i_sad_a : up shifted 2 pixels to the left relative to down
 *   - i_sad_v : up and down aligned (vertical)
 *   - i_sad_b : up shifted 2 pixels to the right relative to down
 * When one diagonal is strictly better than the vertical match and the
 * vertical match is not worse than the other diagonal, the pixel is
 * averaged along that diagonal; otherwise it is averaged vertically.
 *
 * The window is 8 pixels wide only to match the MMX version. Pixels from
 * x-4 to x+5 are read on both lines, so the block needs 4 valid pixels on
 * its left and 5 on its right.
 */
static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
                                    uint8_t *src, int i_src )
{
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        const uint8_t *p_up   = src + 2 * i_pair * i_src;
        const uint8_t *p_down = p_up + 2 * i_src;
        uint8_t *p_kept = dst + 2 * i_pair * i_dst;
        uint8_t *p_made = p_kept + i_dst;

        memcpy( p_kept, p_up, 8 );

        for( int i_col = 0; i_col < 8; i_col++ )
        {
            int i_sad_a = 0;
            int i_sad_v = 0;
            int i_sad_b = 0;

            for( int k = 0; k < 8; k++ )
            {
                i_sad_a += abs( p_up[i_col - 4 + k] - p_down[i_col - 2 + k] );
                i_sad_v += abs( p_up[i_col - 3 + k] - p_down[i_col - 3 + k] );
                i_sad_b += abs( p_up[i_col - 2 + k] - p_down[i_col - 4 + k] );
            }

            /* Pick the two taps to average; vertical by default */
            int i_tap_up   = i_col;
            int i_tap_down = i_col;
            if( i_sad_a < i_sad_v && i_sad_v <= i_sad_b )
            {
                i_tap_up   = i_col - 1;
                i_tap_down = i_col + 1;
            }
            else if( i_sad_b < i_sad_v && i_sad_v <= i_sad_a )
            {
                i_tap_up   = i_col + 1;
                i_tap_down = i_col - 1;
            }

            p_made[i_col] = ( p_up[i_tap_up] + p_down[i_tap_down] ) >> 1;
        }
    }
}
1134 #ifdef CAN_COMPILE_MMXEXT
/* XDeint8x8FieldMMXEXT: edge oriented field interpolation using psadbw.
 *
 * Same arguments and same decision rule as XDeint8x8FieldC; only the three
 * 8-pixel sums of absolute differences are computed with MMX instructions.
 * Each movq/psadbw touches 8 bytes, so pixels from x-4 to x+5 are read on
 * both the current line and the line two rows below.
 * The caller is responsible for emms() (not issued here).
 */
static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
                                         uint8_t *src, int i_src )
{
    int y, x;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        /* Even output line: straight copy */
        memcpy( dst, src, 8 );
        dst += i_dst;

        for( x = 0; x < 8; x++ )
        {
            /* Source line two rows below the current one */
            uint8_t *src2 = &src[2*i_src];
            int32_t c0, c1, c2;

            /* 8-pixel windows of the upper line at offsets -2, -3 and -4 */
            movq_m2r( src[x-2], mm0 );
            movq_m2r( src[x-3], mm1 );
            movq_m2r( src[x-4], mm2 );

            /* Sum of absolute differences against the lower line at
             * offsets -4, -3 and -2 respectively */
            psadbw_m2r( src2[x-4], mm0 );
            psadbw_m2r( src2[x-3], mm1 );
            psadbw_m2r( src2[x-2], mm2 );

            movd_r2m( mm0, c2 );    /* upper line shifted right vs lower */
            movd_r2m( mm1, c1 );    /* vertically aligned */
            movd_r2m( mm2, c0 );    /* upper line shifted left vs lower */

            /* Average along the best matching direction */
            if( c0 < c1 && c1 <= c2 )
                dst[x] = (src[x-1] + src2[x+1]) >> 1;
            else if( c2 < c1 && c1 <= c0 )
                dst[x] = (src[x+1] + src2[x-1]) >> 1;
            else
                dst[x] = (src[x+0] + src2[x+0]) >> 1;
        }

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1175 #endif
1176
/* NxN: arbitrary size (and therefore only uses pixels inside the NxN block)
 */
/* XDeintNxNDetect: interlacing detector for a block of arbitrary size.
 *
 * src/i_src : top-left pixel of the block and line stride
 * i_height  : number of lines of the block (note: height comes BEFORE width
 *             in this signature)
 * i_width   : number of columns of the block
 *
 * For every even line y with y < i_height - 2, squared differences are
 * accumulated between that line and the next one ("frame") and between that
 * line and the one two rows below ("field"). Both sums keep accumulating
 * across lines; they are not reset for each line. A line counts as a hit
 * when the field sum is smaller than the frame sum and the frame sum
 * exceeds i_width / 2.
 *
 * Returns 1 when at least two hits were found, 0 otherwise.
 */
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
                                   int i_height, int i_width )
{
    /* FIXME way too simple, need to be more like XDeint8x8Detect */
    int i_frame = 0;
    int i_field = 0;
    int i_hits  = 0;

    for( int i_line = 0; i_line < i_height - 2; i_line += 2 )
    {
        const uint8_t *p_l0 = &src[i_line * i_src];
        const uint8_t *p_l1 = p_l0 + i_src;
        const uint8_t *p_l2 = p_l0 + 2 * i_src;

        for( int i_col = 0; i_col < i_width; i_col++ )
        {
            i_frame += ssd( p_l0[i_col] - p_l1[i_col] );
            i_field += ssd( p_l0[i_col] - p_l2[i_col] );
        }

        if( i_frame > i_width / 2 && i_field < i_frame )
            i_hits++;
    }

    return i_hits >= 2;
}
1205
/* XDeintNxNFrame: progressive rendering of a block of arbitrary size.
 *
 * dst/i_dst : destination block and line stride
 * src/i_src : source block and line stride
 * i_width   : number of columns of the block
 * i_height  : number of lines of the block
 *
 * Even lines are copied. Each odd line is the (1,2,1) blend of the source
 * lines above, at and below it, except for the last odd line which has no
 * line below inside the block and is the average of the two last lines.
 *
 * Only pixels inside the i_width x i_height block are read or written: when
 * i_height is odd the last (even) line is copied and nothing is produced
 * after it, since that line would lie outside of the block.
 */
static inline void XDeintNxNFrame( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int y, x;

    /* Progressive */
    for( y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd height: the copied line was the last one of the block, the
         * following line belongs neither to src nor to dst */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + 2*src[1*i_src+x] + src[2*i_src+x] + 2 ) >> 2;
        }
        else
        {
            /* Blend last line */
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[1*i_src+x] ) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1233
/* XDeintNxNField: interlaced rendering of a block of arbitrary size.
 *
 * dst/i_dst : destination block and line stride
 * src/i_src : source block and line stride
 * i_width   : number of columns of the block
 * i_height  : number of lines of the block
 *
 * Even lines are copied. Each odd line is the average of the even source
 * lines above and below it (the odd source line itself is ignored), except
 * for the last odd line which has no even line below inside the block and
 * is the average of the two last source lines.
 *
 * Only pixels inside the i_width x i_height block are read or written: when
 * i_height is odd the last (even) line is copied and nothing is produced
 * after it, since that line would lie outside of the block.
 */
static inline void XDeintNxNField( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   int i_width, int i_height )
{
    int y, x;

    /* Interlaced */
    for( y = 0; y < i_height; y += 2 )
    {
        memcpy( dst, src, i_width );
        dst += i_dst;

        /* Odd height: the copied line was the last one of the block, the
         * following line belongs neither to src nor to dst */
        if( y + 1 >= i_height )
            break;

        if( y < i_height - 2 )
        {
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[2*i_src+x] ) >> 1;
        }
        else
        {
            /* Blend last line */
            for( x = 0; x < i_width; x++ )
                dst[x] = (src[x] + src[i_src+x]) >> 1;
        }
        dst += 1*i_dst;
        src += 2*i_src;
    }
}
1261
/* XDeintNxN: deinterlace a block of arbitrary size.
 *
 * Runs the NxN detector on the source block and renders it with
 * XDeintNxNField when it is reported as interlaced, with XDeintNxNFrame
 * otherwise.
 *
 * dst/i_dst : destination block and line stride
 * src/i_src : source block and line stride
 * i_width   : number of columns of the block
 * i_height  : number of lines of the block
 */
static inline void XDeintNxN( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
                              int i_width, int i_height )
{
    /* XDeintNxNDetect takes the height BEFORE the width: passing them in
     * (width, height) order made the detector scan a transposed area, which
     * can lie partly outside of the block */
    if( XDeintNxNDetect( src, i_src, i_height, i_width ) )
        XDeintNxNField( dst, i_dst, src, i_src, i_width, i_height );
    else
        XDeintNxNFrame( dst, i_dst, src, i_src, i_width, i_height );
}
1270
1271
/* median: return the middle value of a, b and c.
 * The smallest and the largest of the three are tracked, then both are
 * subtracted from the sum, which leaves the remaining (median) value.
 */
static inline int median( int a, int b, int c )
{
    int i_low;
    int i_high;

    /* Order a and b */
    if( b < a )
    {
        i_low  = b;
        i_high = a;
    }
    else
    {
        i_low  = a;
        i_high = b;
    }

    /* Widen the range with c if it falls outside of it */
    if( c < i_low )
        i_low = c;
    else if( c > i_high )
        i_high = c;

    return a + b + c - i_low - i_high;
}
1287
1288
1289 /* XDeintBand8x8:
1290  */
/* XDeintBand8x8C: process one band of 8 lines with the C primitives.
 *
 * dst/i_dst : start of the band in the destination and its line stride
 * src/i_src : start of the band in the source and its line stride
 * i_mbx     : number of complete 8-pixel wide blocks in the band
 * i_modx    : width of the remaining partial block (0 if none)
 *
 * Each complete block goes through XDeint8x8DetectC:
 *   - not interlaced: XDeint8x8MergeC, using even source lines as first
 *     source and odd source lines as second source;
 *   - interlaced: XDeint8x8FieldEC for the first and last block of the band,
 *     XDeint8x8FieldC for the others.
 * The partial block, if any, is handled by XDeintNxN.
 */
static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src,
                                   const int i_mbx, int i_modx )
{
    for( int i_blk = 0; i_blk < i_mbx; i_blk++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectC( src, i_src ) )
        {
            /* Progressive block */
            XDeint8x8MergeC( dst, i_dst,
                             &src[0*i_src], 2*i_src,
                             &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced block: border blocks use the simple variant */
        const int b_border = ( i_blk == 0 || i_blk == i_mbx - 1 );
        if( b_border )
            XDeint8x8FieldEC( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldC( dst, i_dst, src, i_src );
    }

    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1321 #ifdef CAN_COMPILE_MMXEXT
/* XDeintBand8x8MMXEXT: process one band of 8 lines with the MMX primitives.
 *
 * Same arguments and same structure as XDeintBand8x8C, but the complete
 * blocks go through the *MMXEXT detection, merge and field functions. The
 * partial block, if any, is still handled by the C function XDeintNxN.
 */
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
                                        uint8_t *src, int i_src,
                                        const int i_mbx, int i_modx )
{
    for( int i_blk = 0; i_blk < i_mbx; i_blk++, dst += 8, src += 8 )
    {
        if( !XDeint8x8DetectMMXEXT( src, i_src ) )
        {
            /* Progressive block */
            XDeint8x8MergeMMXEXT( dst, i_dst,
                                  &src[0*i_src], 2*i_src,
                                  &src[1*i_src], 2*i_src );
            continue;
        }

        /* Interlaced block: border blocks use the simple variant */
        const int b_border = ( i_blk == 0 || i_blk == i_mbx - 1 );
        if( b_border )
            XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
        else
            XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
    }

    if( i_modx )
        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
1353 #endif
1354
1355 static void RenderX( picture_t *p_outpic, picture_t *p_pic )
1356 {
1357     int i_plane;
1358
1359     /* Copy image and skip lines */
1360     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
1361     {
1362         const int i_mby = ( p_outpic->p[i_plane].i_visible_lines + 7 )/8 - 1;
1363         const int i_mbx = p_outpic->p[i_plane].i_visible_pitch/8;
1364
1365         const int i_mody = p_outpic->p[i_plane].i_visible_lines - 8*i_mby;
1366         const int i_modx = p_outpic->p[i_plane].i_visible_pitch - 8*i_mbx;
1367
1368         const int i_dst = p_outpic->p[i_plane].i_pitch;
1369         const int i_src = p_pic->p[i_plane].i_pitch;
1370
1371         int y, x;
1372
1373         for( y = 0; y < i_mby; y++ )
1374         {
1375             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1376             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1377
1378 #ifdef CAN_COMPILE_MMXEXT
1379             if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1380                 XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
1381             else
1382 #endif
1383                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
1384         }
1385
1386         /* Last line (C only)*/
1387         if( i_mody )
1388         {
1389             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
1390             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
1391
1392             for( x = 0; x < i_mbx; x++ )
1393             {
1394                 XDeintNxN( dst, i_dst, src, i_src, 8, i_mody );
1395
1396                 dst += 8;
1397                 src += 8;
1398             }
1399
1400             if( i_modx )
1401                 XDeintNxN( dst, i_dst, src, i_src, i_modx, i_mody );
1402         }
1403     }
1404
1405 #ifdef CAN_COMPILE_MMXEXT
1406     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
1407         emms();
1408 #endif
1409 }
1410
1411 /*****************************************************************************
1412  * Yadif (Yet Another DeInterlacing Filter).
1413  *****************************************************************************/
1414 /* */
/* Configuration passed to the yadif line filters (first argument of the
 * filter callback used in RenderYadif). */
struct vf_priv_s {
    /*
     * 0: Output 1 frame for each frame.
     * 1: Output 1 frame for each field.
     * 2: Like 0 but skips spatial interlacing check.
     * 3: Like 1 but skips spatial interlacing check.
     *
     * In vlc, only & 0x02 has meaning, as we do the & 0x01 ourself.
     * (RenderYadif only ever stores 0 or 2 here.)
     */
    int mode;
};
1426
/* Glue definitions declared right before yadif.h is included; presumably
 * they provide the names that header expects -- TODO confirm. */

/* Register-sized integer type.
 * I am unsure it is the right one */
typedef intptr_t x86_reg;

/* Absolute value; note that the argument is evaluated more than once */
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
/* Min/max helpers mapped onto the __MAX/__MIN macros */
#define FFMAX(a,b)      __MAX(a,b)
#define FFMAX3(a,b,c)   FFMAX(FFMAX(a,b),c)
#define FFMIN(a,b)      __MIN(a,b)
#define FFMIN3(a,b,c)   FFMIN(FFMIN(a,b),c)
1435
1436 /* yadif.h comes from vf_yadif.c of mplayer project */
1437 #include "yadif.h"
1438
1439 static int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, int i_order, int i_field )
1440 {
1441     filter_sys_t *p_sys = p_filter->p_sys;
1442
1443     /* */
1444     assert( i_order == 0 || i_order == 1 );
1445     assert( i_field == 0 || i_field == 1 );
1446
1447     if( i_order == 0 )
1448     {
1449         /* Duplicate the picture
1450          * TODO when the vout rework is finished, picture_Hold() might be enough
1451          * but becarefull, the pitches must match */
1452         picture_t *p_dup = picture_NewFromFormat( &p_src->format );
1453         if( p_dup )
1454             picture_Copy( p_dup, p_src );
1455
1456         /* Slide the history */
1457         if( p_sys->pp_history[0] )
1458             picture_Release( p_sys->pp_history[0]  );
1459         for( int i = 1; i < HISTORY_SIZE; i++ )
1460             p_sys->pp_history[i-1] = p_sys->pp_history[i];
1461         p_sys->pp_history[HISTORY_SIZE-1] = p_dup;
1462     }
1463
1464     /* As the pitches must match, use ONLY pictures coming from picture_New()! */
1465     picture_t *p_prev = p_sys->pp_history[0];
1466     picture_t *p_cur  = p_sys->pp_history[1];
1467     picture_t *p_next = p_sys->pp_history[2];
1468
1469     /* Filter if we have all the pictures we need */
1470     if( p_prev && p_cur && p_next )
1471     {
1472         /* */
1473         void (*filter)(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity);
1474 #if defined(HAVE_YADIF_SSE2)
1475         if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
1476             filter = yadif_filter_line_mmx2;
1477         else
1478 #endif
1479             filter = yadif_filter_line_c;
1480
1481         for( int n = 0; n < p_dst->i_planes; n++ )
1482         {
1483             const plane_t *prevp = &p_prev->p[n];
1484             const plane_t *curp  = &p_cur->p[n];
1485             const plane_t *nextp = &p_next->p[n];
1486             plane_t *dstp        = &p_dst->p[n];
1487
1488             for( int y = 1; y < dstp->i_visible_lines - 1; y++ )
1489             {
1490                 if( (y % 2) == i_field )
1491                 {
1492                     vlc_memcpy( &dstp->p_pixels[y * dstp->i_pitch],
1493                                 &curp->p_pixels[y * curp->i_pitch], dstp->i_visible_pitch );
1494                 }
1495                 else
1496                 {
1497                     struct vf_priv_s cfg;
1498                     /* Spatial checks only when enough data */
1499                     cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
1500
1501                     assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
1502                     filter( &cfg,
1503                             &dstp->p_pixels[y * dstp->i_pitch],
1504                             &prevp->p_pixels[y * prevp->i_pitch],
1505                             &curp->p_pixels[y * curp->i_pitch],
1506                             &nextp->p_pixels[y * nextp->i_pitch],
1507                             dstp->i_visible_pitch,
1508                             curp->i_pitch,
1509                             (i_field ^ (i_order == i_field)) & 1 );
1510                 }
1511
1512                 /* We duplicate the first and last lines */
1513                 if( y == 1 )
1514                     vlc_memcpy(&dstp->p_pixels[(y-1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1515                 else if( y == dstp->i_visible_lines - 2 )
1516                     vlc_memcpy(&dstp->p_pixels[(y+1) * dstp->i_pitch], &dstp->p_pixels[y * dstp->i_pitch], dstp->i_pitch);
1517             }
1518         }
1519
1520         /* */
1521         p_dst->date = (p_next->date - p_cur->date) * i_order / 2 + p_cur->date;
1522         return VLC_SUCCESS;
1523     }
1524     else if( !p_prev && !p_cur && p_next )
1525     {
1526         /* FIXME not good as it does not use i_order/i_field */
1527         RenderX( p_dst, p_next );
1528         return VLC_SUCCESS;
1529     }
1530     else
1531     {
1532         return VLC_EGENERIC;
1533     }
1534 }
1535
1536 /*****************************************************************************
1537  * video filter2 functions
1538  *****************************************************************************/
1539 static picture_t *Deinterlace( filter_t *p_filter, picture_t *p_pic )
1540 {
1541     filter_sys_t *p_sys = p_filter->p_sys;
1542     picture_t *p_dst[2];
1543
1544     /* Request output picture */
1545     p_dst[0] = filter_NewPicture( p_filter );
1546     if( p_dst[0] == NULL )
1547     {
1548         picture_Release( p_pic );
1549         return NULL;
1550     }
1551     picture_CopyProperties( p_dst[0], p_pic );
1552
1553     if( p_sys->b_double_rate )
1554     {
1555         p_dst[0]->p_next =
1556         p_dst[1]         = filter_NewPicture( p_filter );
1557         if( p_dst[1] )
1558         {
1559             picture_CopyProperties( p_dst[1], p_pic );
1560             /* XXX it's not really good especially for the first picture, but
1561              * I don't think that delaying by one frame is worth it */
1562             if( p_sys->i_last_date > VLC_TS_INVALID && p_pic->date > VLC_TS_INVALID )
1563                 p_dst[1]->date = p_pic->date + (p_pic->date - p_sys->i_last_date) / 2;
1564         }
1565         p_sys->i_last_date = p_pic->date;
1566     }
1567     else
1568     {
1569         p_dst[1] = NULL;
1570     }
1571
1572     switch( p_sys->i_mode )
1573     {
1574         case DEINTERLACE_DISCARD:
1575             RenderDiscard( p_filter, p_dst[0], p_pic, 0 );
1576             break;
1577
1578         case DEINTERLACE_BOB:
1579             RenderBob( p_filter, p_dst[0], p_pic, !p_pic->b_top_field_first );
1580             if( p_dst[1] )
1581                 RenderBob( p_filter, p_dst[1], p_pic, p_pic->b_top_field_first );
1582             break;;
1583
1584         case DEINTERLACE_LINEAR:
1585             RenderLinear( p_filter, p_dst[0], p_pic, !p_pic->b_top_field_first );
1586             if( p_dst[1] )
1587                 RenderLinear( p_filter, p_dst[1], p_pic, p_pic->b_top_field_first );
1588             break;
1589
1590         case DEINTERLACE_MEAN:
1591             RenderMean( p_filter, p_dst[0], p_pic );
1592             break;
1593
1594         case DEINTERLACE_BLEND:
1595             RenderBlend( p_filter, p_dst[0], p_pic );
1596             break;
1597
1598         case DEINTERLACE_X:
1599             RenderX( p_dst[0], p_pic );
1600             break;
1601
1602         case DEINTERLACE_YADIF:
1603             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, 0 ) )
1604                 goto drop;
1605             break;
1606
1607         case DEINTERLACE_YADIF2X:
1608             if( RenderYadif( p_filter, p_dst[0], p_pic, 0, !p_pic->b_top_field_first ) )
1609                 goto drop;
1610             if( p_dst[1] )
1611                 RenderYadif( p_filter, p_dst[1], p_pic, 1, p_pic->b_top_field_first );
1612             break;
1613     }
1614
1615     p_dst[0]->b_progressive = true;
1616     if( p_dst[1] )
1617         p_dst[1]->b_progressive = true;
1618
1619     picture_Release( p_pic );
1620     return p_dst[0];
1621
1622 drop:
1623     picture_Release( p_dst[0] );
1624     if( p_dst[1] )
1625         picture_Release( p_dst[1] );
1626     picture_Release( p_pic );
1627     return NULL;
1628 }
1629
1630 static void Flush( filter_t *p_filter )
1631 {
1632     filter_sys_t *p_sys = p_filter->p_sys;
1633
1634     p_sys->i_last_date = VLC_TS_INVALID;
1635     for( int i = 0; i < HISTORY_SIZE; i++ )
1636     {
1637         if( p_sys->pp_history[i] )
1638             picture_Release( p_sys->pp_history[i] );
1639         p_sys->pp_history[i] = NULL;
1640     }
1641 }
1642
1643 static int Mouse( filter_t *p_filter,
1644                   vlc_mouse_t *p_mouse, const vlc_mouse_t *p_old, const vlc_mouse_t *p_new )
1645 {
1646     VLC_UNUSED(p_old);
1647     *p_mouse = *p_new;
1648     if( p_filter->p_sys->b_half_height )
1649         p_mouse->i_y *= 2;
1650     return VLC_SUCCESS;
1651 }
1652
1653
1654 /*****************************************************************************
1655  * Open
1656  *****************************************************************************/
/* Open: set up the deinterlace filter.
 *
 * Returns VLC_EGENERIC if the input chroma is not supported or if the output
 * format implied by the selected mode differs (chroma or height) from the
 * input while the filter is not allowed to change its output format,
 * VLC_ENOMEM if the private data cannot be allocated, VLC_SUCCESS otherwise.
 */
static int Open( vlc_object_t *p_this )
{
    filter_t *p_filter = (filter_t*)p_this;
    filter_sys_t *p_sys;

    if( !IsChromaSupported( p_filter->fmt_in.video.i_chroma ) )
        return VLC_EGENERIC;

    /* Allocate the private data */
    p_sys = p_filter->p_sys = malloc( sizeof( *p_sys ) );
    if( !p_sys )
        return VLC_ENOMEM;

    /* Initial state; SetFilterMethod() below is called with the configured
     * mode string (presumably it overrides these -- confirm) */
    p_sys->i_mode = DEINTERLACE_BLEND;
    p_sys->b_double_rate = false;
    p_sys->b_half_height = true;
    p_sys->i_last_date = VLC_TS_INVALID;
    for( int i = 0; i < HISTORY_SIZE; i++ )
        p_sys->pp_history[i] = NULL;

    /* Pick the merge routine: the first variant that is both compiled in
     * and reported by vlc_CPU() wins, MergeGeneric being the fallback.
     * pf_end_merge is only set for the MMX/SSE2/3DNow! variants. */
#if defined(CAN_COMPILE_C_ALTIVEC)
    if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
    {
        p_sys->pf_merge = MergeAltivec;
        p_sys->pf_end_merge = NULL;
    }
    else
#endif
#if defined(CAN_COMPILE_SSE)
    if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
    {
        p_sys->pf_merge = MergeSSE2;
        p_sys->pf_end_merge = EndMMX;
    }
    else
#endif
#if defined(CAN_COMPILE_MMXEXT)
    if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
    {
        p_sys->pf_merge = MergeMMXEXT;
        p_sys->pf_end_merge = EndMMX;
    }
    else
#endif
#if defined(CAN_COMPILE_3DNOW)
    if( vlc_CPU() & CPU_CAPABILITY_3DNOW )
    {
        p_sys->pf_merge = Merge3DNow;
        p_sys->pf_end_merge = End3DNow;
    }
    else
#endif
#if defined __ARM_NEON__
    if( vlc_CPU() & CPU_CAPABILITY_NEON )
    {
        p_sys->pf_merge = MergeNEON;
        p_sys->pf_end_merge = NULL;
    }
    else
#endif
    {
        p_sys->pf_merge = MergeGeneric;
        p_sys->pf_end_merge = NULL;
    }

    /* Parse the filter options */
    config_ChainParse( p_filter, FILTER_CFG_PREFIX, ppsz_filter_options,
                       p_filter->p_cfg );

    /* Apply the requested deinterlacing mode */
    char *psz_mode = var_GetNonEmptyString( p_filter, FILTER_CFG_PREFIX "mode" );
    SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
    free( psz_mode );

    /* Compute the output format and refuse to load if it would require a
     * chroma or height change that is not allowed */
    video_format_t fmt;
    GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );
    if( !p_filter->b_allow_fmt_out_change &&
        ( fmt.i_chroma != p_filter->fmt_in.video.i_chroma ||
          fmt.i_height != p_filter->fmt_in.video.i_height ) )
    {
        /* Close() flushes the history and frees p_sys */
        Close( VLC_OBJECT(p_filter) );
        return VLC_EGENERIC;
    }
    p_filter->fmt_out.video = fmt;
    p_filter->fmt_out.i_codec = fmt.i_chroma;
    p_filter->pf_video_filter = Deinterlace;
    p_filter->pf_video_flush  = Flush;
    p_filter->pf_video_mouse  = Mouse;

    msg_Dbg( p_filter, "deinterlacing" );

    return VLC_SUCCESS;
}
1750
1751 /*****************************************************************************
1752  * Close: clean up the filter
1753  *****************************************************************************/
1754 static void Close( vlc_object_t *p_this )
1755 {
1756     filter_t *p_filter = (filter_t*)p_this;
1757
1758     Flush( p_filter );
1759     free( p_filter->p_sys );
1760 }
1761