]> git.sesse.net Git - mlt/commitdiff
Add YADIF methods in deinterlace filter.
author Dan Dennedy <dan@dennedy.org>
Fri, 5 Feb 2010 04:01:02 +0000 (20:01 -0800)
committer Dan Dennedy <dan@dennedy.org>
Fri, 5 Feb 2010 04:46:53 +0000 (20:46 -0800)
src/modules/xine/Makefile
src/modules/xine/deinterlace.h
src/modules/xine/filter_deinterlace.c
src/modules/xine/vf_yadif_template.h [new file with mode: 0644]
src/modules/xine/yadif.c [new file with mode: 0644]
src/modules/xine/yadif.h [new file with mode: 0644]

index 14619ad5788f227982b0c44c619e19addce9fda2..f18419fd959e334078f944a202e3db885f9a02af 100644 (file)
@@ -8,6 +8,7 @@ TARGET = ../libmltxine$(LIBSUF)
 
 OBJS = factory.o \
           deinterlace.o \
+          yadif.o \
           filter_deinterlace.o
 
 ifdef MMX_FLAGS
index 0cad92a095e4bf60c1d28a8b3e3621c4228fc50b..2d7ff02e165aa388b9c558aec51878bb3a6f7275 100644 (file)
@@ -41,7 +41,9 @@ void deinterlace_yuv( uint8_t *pdst, uint8_t *psrc[],
 #define DEINTERLACE_ONEFIELD    4
 #define DEINTERLACE_ONEFIELDXV  5
 #define DEINTERLACE_LINEARBLEND 6
-
+#define DEINTERLACE_YADIF       7
+#define DEINTERLACE_YADIF_NOSPATIAL 8
+               
 extern const char *deinterlace_methods[];
 
 #endif
index fef3301ff3f3cc95150839ea382c15aed03553b8..c7341afd911dc2f33614b4269fe3c96ec6984d65 100644 (file)
 
 #include <framework/mlt_filter.h>
 #include <framework/mlt_log.h>
+#include <framework/mlt_producer.h>
+#include <framework/mlt_events.h>
 #include "deinterlace.h"
+#include "yadif.h"
 
 #include <framework/mlt_frame.h>
 
 #include <string.h>
 #include <stdlib.h>
 
+/** Release every intermediate plane of a yadif context and clear the pointers.
+ *
+ * Only call this on a context whose planes were assigned by
+ * deinterlace_yadif_alloc_planes(); before that the plane pointers other
+ * than ysrc are not initialized.
+ */
+static void deinterlace_yadif_release_planes( yadif_filter *yadif )
+{
+       void *planes[] = {
+               yadif->ysrc,  yadif->usrc,  yadif->vsrc,
+               yadif->yprev, yadif->uprev, yadif->vprev,
+               yadif->ynext, yadif->unext, yadif->vnext,
+               yadif->ydest, yadif->udest, yadif->vdest
+       };
+       unsigned i;
+
+       for ( i = 0; i < sizeof( planes ) / sizeof( planes[0] ); i++ )
+               if ( planes[i] )
+                       mlt_pool_release( planes[i] );
+
+       yadif->ysrc  = yadif->usrc  = yadif->vsrc  = NULL;
+       yadif->yprev = yadif->uprev = yadif->vprev = NULL;
+       yadif->ynext = yadif->unext = yadif->vnext = NULL;
+       yadif->ydest = yadif->udest = yadif->vdest = NULL;
+}
+
+/** Allocate the intermediate planar buffers for a width x height YUY2 image.
+ *
+ * Pitches are rounded up to a multiple of 16 bytes.
+ *
+ * \return 0 on success; non-zero if any allocation failed, in which case
+ *         nothing stays allocated and yadif->ysrc is NULL
+ */
+static int deinterlace_yadif_alloc_planes( yadif_filter *yadif, int width, int height )
+{
+       yadif->yheight = height;
+       yadif->ywidth  = width;
+       yadif->uvwidth = yadif->ywidth / 2;
+       yadif->ypitch  = ( yadif->ywidth +  15 ) / 16 * 16;
+       yadif->uvpitch = ( yadif->uvwidth + 15 ) / 16 * 16;
+
+       const int ysize  = yadif->yheight * yadif->ypitch;
+       const int uvsize = yadif->yheight * yadif->uvpitch;
+
+       yadif->ysrc  = (unsigned char *) mlt_pool_alloc( ysize );
+       yadif->usrc  = (unsigned char *) mlt_pool_alloc( uvsize );
+       yadif->vsrc  = (unsigned char *) mlt_pool_alloc( uvsize );
+       yadif->yprev = (unsigned char *) mlt_pool_alloc( ysize );
+       yadif->uprev = (unsigned char *) mlt_pool_alloc( uvsize );
+       yadif->vprev = (unsigned char *) mlt_pool_alloc( uvsize );
+       yadif->ynext = (unsigned char *) mlt_pool_alloc( ysize );
+       yadif->unext = (unsigned char *) mlt_pool_alloc( uvsize );
+       yadif->vnext = (unsigned char *) mlt_pool_alloc( uvsize );
+       yadif->ydest = (unsigned char *) mlt_pool_alloc( ysize );
+       yadif->udest = (unsigned char *) mlt_pool_alloc( uvsize );
+       yadif->vdest = (unsigned char *) mlt_pool_alloc( uvsize );
+
+       if ( yadif->ysrc  && yadif->usrc  && yadif->vsrc  &&
+            yadif->yprev && yadif->uprev && yadif->vprev &&
+            yadif->ynext && yadif->unext && yadif->vnext &&
+            yadif->ydest && yadif->udest && yadif->vdest )
+               return 0;
+
+       // Partial failure: give back what we got so ysrc == NULL means "no planes"
+       deinterlace_yadif_release_planes( yadif );
+       return 1;
+}
+
+/** Deinterlace the current frame with YADIF using its previous and next frames.
+ *
+ * The neighbouring frames are read from the frame properties "previous frame"
+ * and "next frame"; the yadif context is read from the filter property "yadif".
+ * On success *image holds the current frame's packed YUY2 image, deinterlaced
+ * in place unless the frame is already flagged "progressive".
+ *
+ * \param frame  the frame to deinterlace
+ * \param filter the deinterlace filter owning the yadif context
+ * \param image  receives the current frame's image
+ * \param format in/out image format; processing requires mlt_image_yuv422
+ * \param width  in/out image width
+ * \param height in/out image height
+ * \param mode   passed through to filter_plane()
+ * \return 0 when *image is valid (deinterlaced or already progressive);
+ *         non-zero when YADIF could not be applied, so that the caller can
+ *         fall back to another method
+ */
+int deinterlace_yadif( mlt_frame frame, mlt_filter filter, uint8_t **image, mlt_image_format *format, int *width, int *height, int mode )
+{
+       mlt_properties properties = MLT_FRAME_PROPERTIES( frame );
+       mlt_frame previous_frame = mlt_properties_get_data( properties, "previous frame", NULL );
+       uint8_t* previous_image = NULL;
+       int previous_width = *width;
+       int previous_height = *height;
+       mlt_frame next_frame = mlt_properties_get_data( properties, "next frame", NULL );
+       uint8_t* next_image = NULL;
+       int next_width = *width;
+       int next_height = *height;
+       yadif_filter *yadif = mlt_properties_get_data( MLT_FILTER_PROPERTIES( filter ), "yadif", NULL );
+
+       mlt_log_debug( MLT_FILTER_SERVICE(filter), "previous %d current %d next %d\n",
+               previous_frame? (int) mlt_frame_get_position(previous_frame) : -1,
+               (int) mlt_frame_get_position(frame),
+               next_frame?  (int) mlt_frame_get_position(next_frame) : -1);
+
+       // Without a context or both neighbours there is nothing YADIF can do
+       if ( !yadif || !previous_frame || !next_frame )
+               return 1;
+
+       // Get the preceding frame's image
+       int error = mlt_frame_get_image( previous_frame, &previous_image, format, &previous_width, &previous_height, 0 );
+       if ( error )
+               return error;
+       if ( !previous_image || *format != mlt_image_yuv422 )
+               return 1;
+
+       // Get the current frame's image; it must be writable because the
+       // deinterlaced result is packed back into this very buffer
+       error = mlt_frame_get_image( frame, image, format, width, height, 1 );
+       if ( error )
+               return error;
+       if ( !*image || *format != mlt_image_yuv422 )
+               return 1;
+
+       // Check that we aren't already progressive
+       if ( mlt_properties_get_int( properties, "progressive" ) )
+               return 0;
+
+       // Get the following frame's image
+       error = mlt_frame_get_image( next_frame, &next_image, format, &next_width, &next_height, 0 );
+       if ( error )
+               return error;
+       if ( !next_image || *format != mlt_image_yuv422 )
+               return 1;
+
+       // All three images are read with the current frame's geometry, so a
+       // neighbour of a different size would be read out of bounds
+       if ( previous_width != *width || previous_height != *height ||
+            next_width != *width || next_height != *height )
+               return 1;
+
+       // Drop planes that were sized for a different image
+       if ( yadif->ysrc && ( yadif->ywidth != *width || yadif->yheight != *height ) )
+               deinterlace_yadif_release_planes( yadif );
+
+       // Create intermediate planar planes
+       if ( !yadif->ysrc && deinterlace_yadif_alloc_planes( yadif, *width, *height ) )
+               return 1;
+
+       const int order = mlt_properties_get_int( properties, "top_field_first" );
+       const int pitch = *width << 1;
+       const int parity = 0;
+
+       // Convert packed to planar
+       YUY2ToPlanes( *image, pitch, *width, *height, yadif->ysrc,
+               yadif->ypitch, yadif->usrc, yadif->vsrc, yadif->uvpitch, yadif->cpu );
+       YUY2ToPlanes( previous_image, pitch, *width, *height, yadif->yprev,
+               yadif->ypitch, yadif->uprev, yadif->vprev, yadif->uvpitch, yadif->cpu );
+       YUY2ToPlanes( next_image, pitch, *width, *height, yadif->ynext,
+               yadif->ypitch, yadif->unext, yadif->vnext, yadif->uvpitch, yadif->cpu );
+
+       // Deinterlace each plane
+       filter_plane( mode, yadif->ydest, yadif->ypitch, yadif->yprev, yadif->ysrc,
+               yadif->ynext, yadif->ypitch, *width, *height, parity, order, yadif->cpu);
+       filter_plane( mode, yadif->udest, yadif->uvpitch,yadif->uprev, yadif->usrc,
+               yadif->unext, yadif->uvpitch, *width >> 1, *height, parity, order, yadif->cpu);
+       filter_plane( mode, yadif->vdest, yadif->uvpitch, yadif->vprev, yadif->vsrc,
+               yadif->vnext, yadif->uvpitch, *width >> 1, *height, parity, order, yadif->cpu);
+
+       // Convert planar to packed
+       YUY2FromPlanes( *image, pitch, *width, *height, yadif->ydest,
+               yadif->ypitch, yadif->udest, yadif->vdest, yadif->uvpitch, yadif->cpu);
+
+       return 0;
+}
+
 /** Do it :-).
 */
 
 static int filter_get_image( mlt_frame this, uint8_t **image, mlt_image_format *format, int *width, int *height, int writable )
 {
        int error = 0;
-       int deinterlace = mlt_properties_get_int( MLT_FRAME_PROPERTIES( this ), "consumer_deinterlace" );
-       int progressive = mlt_properties_get_int( MLT_FRAME_PROPERTIES( this ), "progressive" );
+       mlt_properties properties = MLT_FRAME_PROPERTIES( this );
+       int deinterlace = mlt_properties_get_int( properties, "consumer_deinterlace" );
+       int progressive = mlt_properties_get_int( properties, "progressive" );
        
        // Pop the service off the stack
        mlt_filter filter = mlt_frame_pop_service( this );
 
-       // Determine if we need a writable version or not
-       if ( deinterlace && !writable )
-                writable = !progressive;
-
        // Get the input image
        if ( deinterlace && !progressive )
-               *format = mlt_image_yuv422;
-       error = mlt_frame_get_image( this, image, format, width, height, writable );
-       progressive = mlt_properties_get_int( MLT_FRAME_PROPERTIES( this ), "progressive" );
-       mlt_log_debug( MLT_FILTER_SERVICE( filter ), "xine.deinterlace %d prog %d format %s\n",
-               deinterlace, progressive, mlt_image_format_name( *format ) );
-
-       // Check that we want progressive and we aren't already progressive
-       if ( deinterlace && *format == mlt_image_yuv422 && *image && !progressive )
        {
                // Determine deinterlace method
                char *method_str = mlt_properties_get( MLT_FILTER_PROPERTIES( filter ), "method" );
-               int method = DEINTERLACE_LINEARBLEND;
-               char *frame_method_str = mlt_properties_get( MLT_FRAME_PROPERTIES( this ), "deinterlace_method" );
+               int method = DEINTERLACE_NONE;
+               char *frame_method_str = mlt_properties_get( properties, "deinterlace_method" );
                
-               if ( frame_method_str != NULL )
+               if ( frame_method_str )
                        method_str = frame_method_str;
                
-               if ( method_str == NULL )
+               if ( !method_str || strcmp( method_str, "yadif" ) == 0 )
+                       method = DEINTERLACE_YADIF;
+               else if ( strcmp( method_str, "yadif-nospatial" ) == 0 )
+                       method = DEINTERLACE_YADIF_NOSPATIAL;
+               else if ( strcmp( method_str, "onefield" ) == 0 )
+                       method = DEINTERLACE_ONEFIELD;
+               else if ( strcmp( method_str, "linearblend" ) == 0 )
                        method = DEINTERLACE_LINEARBLEND;
                else if ( strcmp( method_str, "bob" ) == 0 )
                        method = DEINTERLACE_BOB;
@@ -70,16 +158,66 @@ static int filter_get_image( mlt_frame this, uint8_t **image, mlt_image_format *
                        method = DEINTERLACE_BOB;
                else if ( strcmp( method_str, "greedy" ) == 0 )
                        method = DEINTERLACE_GREEDY;
-               else if ( strcmp( method_str, "onefield" ) == 0 )
-                       method = DEINTERLACE_ONEFIELD;
+
+               *format = mlt_image_yuv422;
+
+               if ( method == DEINTERLACE_YADIF )
+               {
+                       int mode = 0;
+                       error = deinterlace_yadif( this, filter, image, format, width, height, mode );
+                       progressive = mlt_properties_get_int( properties, "progressive" );
+               }
+               else if ( method == DEINTERLACE_YADIF_NOSPATIAL )
+               {
+                       int mode = 2;
+                       error = deinterlace_yadif( this, filter, image, format, width, height, mode );
+                       progressive = mlt_properties_get_int( properties, "progressive" );
+               }
+               if ( error || ( method > DEINTERLACE_NONE && method < DEINTERLACE_YADIF ) )
+               {
+                       // Signal that we no longer need previous and next frames
+                       mlt_producer producer = mlt_producer_cut_parent( mlt_frame_get_original_producer(this) );
+                       mlt_properties_set_int( MLT_PRODUCER_PROPERTIES(producer), "_need_previous_next", 0 );
                        
-               // Deinterlace the image
-               deinterlace_yuv( *image, image, *width * 2, *height, method );
+                       if ( error )
+                               method = DEINTERLACE_ONEFIELD;
+                       
+                       // Get the current frame's image
+                       error = mlt_frame_get_image( this, image, format, width, height, writable );
+                       progressive = mlt_properties_get_int( properties, "progressive" );
+
+                       // Check that we aren't already progressive
+                       if ( !progressive && !error && *image && *format == mlt_image_yuv422 )
+                       {
+                               // Deinterlace the image using one of the Xine deinterlacers
+                               int image_size = *width * *height * 2;
+                               uint8_t *new_image = mlt_pool_alloc( image_size );
+
+                               deinterlace_yuv( new_image, image, *width * 2, *height, method );
+                               mlt_properties_set_data( properties, "image", new_image, image_size, mlt_pool_release, NULL );
+                               *image = new_image;
+                       }
+               }
+               else if ( method == DEINTERLACE_NONE )
+               {
+                       error = mlt_frame_get_image( this, image, format, width, height, writable );
+               }
                
-               // Make sure that others know the frame is deinterlaced
-               mlt_properties_set_int( MLT_FRAME_PROPERTIES( this ), "progressive", 1 );
+               mlt_log_debug( MLT_FILTER_SERVICE( filter ), "error %d deint %d prog %d fmt %s method %s\n",
+                       error, deinterlace, progressive, mlt_image_format_name( *format ), method_str ? method_str : "yadif" );
+               
+               if ( !error )
+               {
+                       // Make sure that others know the frame is deinterlaced
+                       mlt_properties_set_int( properties, "progressive", 1 );
+               }
        }
-
+       else
+       {
+               // Pass through
+               error = mlt_frame_get_image( this, image, format, width, height, writable );
+       }
+       
        return error;
 }
 
@@ -97,6 +235,36 @@ static mlt_frame deinterlace_process( mlt_filter this, mlt_frame frame )
        return frame;
 }
 
+/** Destructor callback for the filter.
+ *
+ * Releases the yadif context stored in the filter property "yadif" together
+ * with its twelve intermediate planes (only present once ysrc has been set).
+ */
+static void filter_close( mlt_filter this )
+{
+       yadif_filter *context = mlt_properties_get_data( MLT_FILTER_PROPERTIES( this ), "yadif", NULL );
+
+       if ( !context )
+               return;
+
+       if ( context->ysrc )
+       {
+               // Same release order as the planes are allocated in
+               void *planes[] = {
+                       context->ysrc,  context->usrc,  context->vsrc,
+                       context->yprev, context->uprev, context->vprev,
+                       context->ynext, context->unext, context->vnext,
+                       context->ydest, context->udest, context->vdest
+               };
+               unsigned i;
+
+               for ( i = 0; i < sizeof( planes ) / sizeof( planes[0] ); i++ )
+                       mlt_pool_release( planes[i] );
+       }
+
+       mlt_pool_release( context );
+}
+
+/** Event listener: flag the service stored in the filter's "service" property
+ * as needing previous and next frames.
+ *
+ * \param owner  the event owner (unused)
+ * \param filter the filter, seen as a service
+ */
+static void on_service_changed( mlt_service owner, mlt_service filter )
+{
+       mlt_service connected = mlt_properties_get_data( MLT_SERVICE_PROPERTIES(filter), "service", NULL );
+
+       mlt_properties_set_int( MLT_SERVICE_PROPERTIES(connected), "_need_previous_next", 1 );
+}
+
 /** Constructor for the filter.
 */
 
@@ -105,8 +273,37 @@ mlt_filter filter_deinterlace_init( mlt_profile profile, mlt_service_type type,
        mlt_filter this = mlt_filter_new( );
        if ( this != NULL )
        {
+               yadif_filter *yadif = mlt_pool_alloc( sizeof( *yadif ) );
+
+               yadif->cpu = 0; // Pure C
+#ifdef USE_SSE
+               yadif->cpu |= AVS_CPU_INTEGER_SSE;
+#endif
+#ifdef USE_SSE2
+               yadif->cpu |= AVS_CPU_SSE2;
+#endif
+               yadif->ysrc = NULL;
                this->process = deinterlace_process;
+               this->close = filter_close;
                mlt_properties_set( MLT_FILTER_PROPERTIES( this ), "method", arg );
+               mlt_properties_set_data( MLT_FILTER_PROPERTIES( this ), "yadif", yadif, sizeof(*yadif), NULL, NULL );
+               mlt_events_listen( MLT_FILTER_PROPERTIES( this ), this, "service-changed", (mlt_listener) on_service_changed ); 
+               
+#if defined(__GNUC__) && !defined(PIC)
+               // Set SSSE3 bit to cpu
+               asm (\
+               "mov $1, %%eax \n\t"\
+               "push %%ebx \n\t"\
+               "cpuid \n\t"\
+               "pop %%ebx \n\t"\
+               "mov %%ecx, %%edx \n\t"\
+               "shr $9, %%edx \n\t"\
+               "and $1, %%edx \n\t"\
+               "shl $9, %%edx \n\t"\
+               "and $511, %%ebx \n\t"\
+               "or %%edx, %%ebx \n\t"\
+               : "=b"(yadif->cpu) : "p"(yadif->cpu) : "%eax", "%ecx", "%edx");
+#endif
        }
        return this;
 }
diff --git a/src/modules/xine/vf_yadif_template.h b/src/modules/xine/vf_yadif_template.h
new file mode 100644 (file)
index 0000000..2a57d41
--- /dev/null
@@ -0,0 +1,245 @@
+/*\r
+ * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>\r
+ *\r
+ * SSE2/SSSE3 version (custom optimization) by h.yamagata\r
+ *\r
+ * Small fix by Alexander Balakhnin (fizick@avisynth.org.ru)\r
+ *\r
+ * MPlayer is free software; you can redistribute it and/or modify\r
+ * it under the terms of the GNU General Public License as published by\r
+ * the Free Software Foundation; either version 2 of the License, or\r
+ * (at your option) any later version.\r
+ *\r
+ * MPlayer is distributed in the hope that it will be useful,\r
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
+ * GNU General Public License for more details.\r
+ *\r
+ * You should have received a copy of the GNU General Public License along\r
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,\r
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.\r
+ */\r
+\r
+#define LOAD8(mem,dst) /* asm fragment: load 8 bytes from mem into dst, then interleave them with xmm7 (cleared by pxor at the start of FILTER) to get 8 words */ \\r
+            "movq      "mem", "#dst" \n\t"\\r
+            "punpcklbw %%xmm7, "#dst" \n\t"\r
+\r
+#define CHECK(pj,mj) /* asm fragment: for byte offsets pj (line above) and mj (line below), leave the averaged pixel pair in xmm5 and the summed absolute differences (score) in xmm2 */ \\r
+            "movdqu "#pj"(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1+j] */\\r
+            "movdqu "#mj"(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1-j] */\\r
+            "movdqa      %%xmm2, %%xmm4 \n\t"\\r
+            "movdqa      %%xmm2, %%xmm5 \n\t"\\r
+            "pxor        %%xmm3, %%xmm4 \n\t"\\r
+            "pavgb       %%xmm3, %%xmm5 \n\t"\\r
+            "pand        %[pb1], %%xmm4 \n\t"\\r
+            "psubusb     %%xmm4, %%xmm5 \n\t"\\r
+            "psrldq      $1,    %%xmm5 \n\t"\\r
+            "punpcklbw   %%xmm7, %%xmm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\\r
+            "movdqa      %%xmm2, %%xmm4 \n\t"\\r
+            "psubusb     %%xmm3, %%xmm2 \n\t"\\r
+            "psubusb     %%xmm4, %%xmm3 \n\t"\\r
+            "pmaxub      %%xmm3, %%xmm2 \n\t"\\r
+            "movdqa      %%xmm2, %%xmm3 \n\t"\\r
+            "movdqa      %%xmm2, %%xmm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\\r
+            "psrldq      $1,   %%xmm3 \n\t" /* ABS(cur[x-refs  +j] - cur[x+refs  -j]) */\\r
+            "psrldq      $2,   %%xmm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\\r
+            "punpcklbw   %%xmm7, %%xmm2 \n\t"\\r
+            "punpcklbw   %%xmm7, %%xmm3 \n\t"\\r
+            "punpcklbw   %%xmm7, %%xmm4 \n\t"\\r
+            "paddw       %%xmm3, %%xmm2 \n\t"\\r
+            "paddw       %%xmm4, %%xmm2 \n\t" /* score */\r
+\r
+#define CHECK1 /* asm fragment: in lanes where score (xmm2) < spatial_score (xmm0), take the new score and the prediction from xmm5; the comparison mask is kept in xmm6 for CHECK2 */ \\r
+            "movdqa      %%xmm0, %%xmm3 \n\t"\\r
+            "pcmpgtw     %%xmm2, %%xmm3 \n\t" /* if(score < spatial_score) */\\r
+            "pminsw      %%xmm2, %%xmm0 \n\t" /* spatial_score= score; */\\r
+            "movdqa      %%xmm3, %%xmm6 \n\t"\\r
+            "pand        %%xmm3, %%xmm5 \n\t"\\r
+            "pandn       %%xmm1, %%xmm3 \n\t"\\r
+            "por         %%xmm5, %%xmm3 \n\t"\\r
+            "movdqa      %%xmm3, %%xmm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */\r
+\r
+#define CHECK2 /* asm fragment: same update as CHECK1, but first adds 1<<14 to the score in lanes where the CHECK1 mask in xmm6 is clear, i.e.: pretend not to have checked dir=2 if dir=1 was bad.\\r
+                  hurts both quality and speed, but matches the C version. */\\r
+            "paddw       %[pw1], %%xmm6 \n\t"\\r
+            "psllw       $14,   %%xmm6 \n\t"\\r
+            "paddsw      %%xmm6, %%xmm2 \n\t"\\r
+            "movdqa      %%xmm0, %%xmm3 \n\t"\\r
+            "pcmpgtw     %%xmm2, %%xmm3 \n\t"\\r
+            "pminsw      %%xmm2, %%xmm0 \n\t"\\r
+            "pand        %%xmm3, %%xmm5 \n\t"\\r
+            "pandn       %%xmm1, %%xmm3 \n\t"\\r
+            "por         %%xmm5, %%xmm3 \n\t"\\r
+            "movdqa      %%xmm3, %%xmm1 \n\t"\r
+\r
+/* mode argument mod - Fizick\r
+ *\r
+ * Template body for a line filter written as inline assembly on XMM\r
+ * registers; FILTER_LINE_FUNC_NAME and PABS must be defined by the includer.\r
+ * Each loop iteration of FILTER produces 8 output pixels.\r
+ *\r
+ *   mode   - values below 2 also run the check against prev2/next2 two lines\r
+ *            above and below (the part skipped by "jge 1f")\r
+ *   dst    - output line; 8 bytes are stored per iteration, so up to 7 bytes\r
+ *            past w are written when w is not a multiple of 8\r
+ *   prev, cur, next - input pointers, advanced by 8 per iteration\r
+ *            (presumably the same line of the previous, current and next\r
+ *            pictures - confirm against the caller)\r
+ *   w      - loop bound in pixels\r
+ *   refs   - byte offset between neighbouring lines, used as +refs and -refs\r
+ *   parity - non-zero uses prev/cur as prev2/next2, zero uses cur/next\r
+ */\r
+\r
+/* static  attribute_align_arg void FILTER_LINE_FUNC_NAME(YadifContext *yadctx, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){\r
+     const int mode = yadctx->mode; */\r
+static attribute_align_arg void FILTER_LINE_FUNC_NAME(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity){\r
+    DECLARE_ALIGNED(16, uint8_t, tmp0[16]);\r
+    DECLARE_ALIGNED(16, uint8_t, tmp1[16]);\r
+    DECLARE_ALIGNED(16, uint8_t, tmp2[16]);\r
+    DECLARE_ALIGNED(16, uint8_t, tmp3[16]);\r
+    int x;\r
+    static DECLARE_ALIGNED(16, const unsigned short, pw_1[]) =\r
+    {\r
+        0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001\r
+    };\r
+\r
+    static DECLARE_ALIGNED(16, const unsigned short, pb_1[]) =\r
+    {\r
+        0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101\r
+    };\r
+\r
+\r
+#define FILTER\\r
+    for(x=0; x<w; x+=8){\\r
+        __asm__ volatile(\\r
+            "pxor        %%xmm7, %%xmm7 \n\t"\\r
+            LOAD8("(%[cur],%[mrefs])", %%xmm0) /* c = cur[x-refs] */\\r
+            LOAD8("(%[cur],%[prefs])", %%xmm1) /* e = cur[x+refs] */\\r
+            LOAD8("(%["prev2"])", %%xmm2) /* prev2[x] */\\r
+            LOAD8("(%["next2"])", %%xmm3) /* next2[x] */\\r
+            "movdqa      %%xmm3, %%xmm4 \n\t"\\r
+            "paddw       %%xmm2, %%xmm3 \n\t"\\r
+            "psraw       $1,    %%xmm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\\r
+            "movdqa      %%xmm0, %[tmp0] \n\t" /* c */\\r
+            "movdqa      %%xmm3, %[tmp1] \n\t" /* d */\\r
+            "movdqa      %%xmm1, %[tmp2] \n\t" /* e */\\r
+            "psubw       %%xmm4, %%xmm2 \n\t"\\r
+            PABS(        %%xmm4, %%xmm2) /* temporal_diff0 */\\r
+            LOAD8("(%[prev],%[mrefs])", %%xmm3) /* prev[x-refs] */\\r
+            LOAD8("(%[prev],%[prefs])", %%xmm4) /* prev[x+refs] */\\r
+            "psubw       %%xmm0, %%xmm3 \n\t"\\r
+            "psubw       %%xmm1, %%xmm4 \n\t"\\r
+            PABS(        %%xmm5, %%xmm3)\\r
+            PABS(        %%xmm5, %%xmm4)\\r
+            "paddw       %%xmm4, %%xmm3 \n\t" /* temporal_diff1 */\\r
+            "psrlw       $1,    %%xmm2 \n\t"\\r
+            "psrlw       $1,    %%xmm3 \n\t"\\r
+            "pmaxsw      %%xmm3, %%xmm2 \n\t"\\r
+            LOAD8("(%[next],%[mrefs])", %%xmm3) /* next[x-refs] */\\r
+            LOAD8("(%[next],%[prefs])", %%xmm4) /* next[x+refs] */\\r
+            "psubw       %%xmm0, %%xmm3 \n\t"\\r
+            "psubw       %%xmm1, %%xmm4 \n\t"\\r
+            PABS(        %%xmm5, %%xmm3)\\r
+            PABS(        %%xmm5, %%xmm4)\\r
+            "paddw       %%xmm4, %%xmm3 \n\t" /* temporal_diff2 */\\r
+            "psrlw       $1,    %%xmm3 \n\t"\\r
+            "pmaxsw      %%xmm3, %%xmm2 \n\t"\\r
+            "movdqa      %%xmm2, %[tmp3] \n\t" /* diff */\\r
+\\r
+            "paddw       %%xmm0, %%xmm1 \n\t"\\r
+            "paddw       %%xmm0, %%xmm0 \n\t"\\r
+            "psubw       %%xmm1, %%xmm0 \n\t"\\r
+            "psrlw       $1,    %%xmm1 \n\t" /* spatial_pred */\\r
+            PABS(        %%xmm2, %%xmm0)      /* ABS(c-e) */\\r
+\\r
+            "movdqu      -1(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1] */\\r
+            "movdqu      -1(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1] */\\r
+            "movdqa      %%xmm2, %%xmm4 \n\t"\\r
+            "psubusb     %%xmm3, %%xmm2 \n\t"\\r
+            "psubusb     %%xmm4, %%xmm3 \n\t"\\r
+            "pmaxub      %%xmm3, %%xmm2 \n\t"\\r
+            /*"pshuflw      $9,%%xmm2, %%xmm3 \n\t"*/\\r
+            /*"pshufhw      $9,%%xmm2, %%xmm3 \n\t"*/\\r
+            "movdqa %%xmm2, %%xmm3 \n\t" /* correct replacement (here)  */\
+            "psrldq $2, %%xmm3 \n\t"/* for "pshufw $9,%%mm2, %%mm3" - fix by Fizick */\
+            "punpcklbw   %%xmm7, %%xmm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\\r
+            "punpcklbw   %%xmm7, %%xmm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\\r
+            "paddw       %%xmm2, %%xmm0 \n\t"\\r
+            "paddw       %%xmm3, %%xmm0 \n\t"\\r
+            "psubw       %[pw1], %%xmm0 \n\t" /* spatial_score */\\r
+\\r
+            CHECK(-2,0)\\r
+            CHECK1\\r
+            CHECK(-3,1)\\r
+            CHECK2\\r
+            CHECK(0,-2)\\r
+            CHECK1\\r
+            CHECK(1,-3)\\r
+            CHECK2\\r
+\\r
+            /* if(yadctx->mode<2) ... */\\r
+            "movdqa      %[tmp3], %%xmm6 \n\t" /* diff */\\r
+            "cmp         $2, %[mode] \n\t"\\r
+            "jge         1f \n\t"\\r
+            LOAD8("(%["prev2"],%[mrefs],2)", %%xmm2) /* prev2[x-2*refs] */\\r
+            LOAD8("(%["next2"],%[mrefs],2)", %%xmm4) /* next2[x-2*refs] */\\r
+            LOAD8("(%["prev2"],%[prefs],2)", %%xmm3) /* prev2[x+2*refs] */\\r
+            LOAD8("(%["next2"],%[prefs],2)", %%xmm5) /* next2[x+2*refs] */\\r
+            "paddw       %%xmm4, %%xmm2 \n\t"\\r
+            "paddw       %%xmm5, %%xmm3 \n\t"\\r
+            "psrlw       $1,    %%xmm2 \n\t" /* b */\\r
+            "psrlw       $1,    %%xmm3 \n\t" /* f */\\r
+            "movdqa      %[tmp0], %%xmm4 \n\t" /* c */\\r
+            "movdqa      %[tmp1], %%xmm5 \n\t" /* d */\\r
+            "movdqa      %[tmp2], %%xmm7 \n\t" /* e */\\r
+            "psubw       %%xmm4, %%xmm2 \n\t" /* b-c */\\r
+            "psubw       %%xmm7, %%xmm3 \n\t" /* f-e */\\r
+            "movdqa      %%xmm5, %%xmm0 \n\t"\\r
+            "psubw       %%xmm4, %%xmm5 \n\t" /* d-c */\\r
+            "psubw       %%xmm7, %%xmm0 \n\t" /* d-e */\\r
+            "movdqa      %%xmm2, %%xmm4 \n\t"\\r
+            "pminsw      %%xmm3, %%xmm2 \n\t"\\r
+            "pmaxsw      %%xmm4, %%xmm3 \n\t"\\r
+            "pmaxsw      %%xmm5, %%xmm2 \n\t"\\r
+            "pminsw      %%xmm5, %%xmm3 \n\t"\\r
+            "pmaxsw      %%xmm0, %%xmm2 \n\t" /* max */\\r
+            "pminsw      %%xmm0, %%xmm3 \n\t" /* min */\\r
+            "pxor        %%xmm4, %%xmm4 \n\t"\\r
+            "pmaxsw      %%xmm3, %%xmm6 \n\t"\\r
+            "psubw       %%xmm2, %%xmm4 \n\t" /* -max */\\r
+            "pmaxsw      %%xmm4, %%xmm6 \n\t" /* diff= MAX3(diff, min, -max); */\\r
+            "1: \n\t"\\r
+\\r
+            "movdqa      %[tmp1], %%xmm2 \n\t" /* d */\\r
+            "movdqa      %%xmm2, %%xmm3 \n\t"\\r
+            "psubw       %%xmm6, %%xmm2 \n\t" /* d-diff */\\r
+            "paddw       %%xmm6, %%xmm3 \n\t" /* d+diff */\\r
+            "pmaxsw      %%xmm2, %%xmm1 \n\t"\\r
+            "pminsw      %%xmm3, %%xmm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\\r
+            "packuswb    %%xmm1, %%xmm1 \n\t"\\r
+\\r
+            :[tmp0]"=m"(tmp0),\\r
+             [tmp1]"=m"(tmp1),\\r
+             [tmp2]"=m"(tmp2),\\r
+             [tmp3]"=m"(tmp3)\\r
+            :[prev] "r"(prev),\\r
+             [cur]  "r"(cur),\\r
+             [next] "r"(next),\\r
+             [prefs]"r"((long)refs),\\r
+             [mrefs]"r"((long)-refs),\\r
+             [pw1]  "m"(*pw_1),\\r
+             [pb1]  "m"(*pb_1),\\r
+             [mode] "g"(mode)\\r
+        );\\r
+        __asm__ volatile("movq %%xmm1, %0" :"=m"(*dst));\\r
+        dst += 8;\\r
+        prev+= 8;\\r
+        cur += 8;\\r
+        next+= 8;\\r
+    }\r
+\r
+    if(parity){\r
+#define prev2 "prev"\r
+#define next2 "cur"\r
+        FILTER\r
+#undef prev2\r
+#undef next2\r
+    }else{\r
+#define prev2 "cur"\r
+#define next2 "next"\r
+        FILTER\r
+#undef prev2\r
+#undef next2\r
+    }\r
+}\r
+#undef LOAD8\r
+#undef PABS\r
+#undef CHECK\r
+#undef CHECK1\r
+#undef CHECK2\r
+#undef FILTER\r
+#undef FILTER_LINE_FUNC_NAME\r
diff --git a/src/modules/xine/yadif.c b/src/modules/xine/yadif.c
new file mode 100644 (file)
index 0000000..f1c759d
--- /dev/null
@@ -0,0 +1,553 @@
+/*
+       Yadif C-plugin for Avisynth 2.5 - Yet Another DeInterlacing Filter
+       Copyright (C)2007 Alexander G. Balakhnin aka Fizick  http://avisynth.org.ru
+    Port of YADIF filter from MPlayer
+       Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+
+    This program is free software; you can redistribute it and/or modify
+       it under the terms of the GNU General Public License as published by
+       the Free Software Foundation.
+
+       This program is distributed in the hope that it will be useful,
+       but WITHOUT ANY WARRANTY; without even the implied warranty of
+       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+       GNU General Public License for more details.
+
+       You should have received a copy of the GNU General Public License
+       along with this program; if not, write to the Free Software
+       Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+    Avisynth_C plugin
+       Assembler optimized for GNU C compiler
+
+*/
+#include "yadif.h"
+#include <stdlib.h>
+#include <memory.h>
+
+/* Scalar helpers used by the C line filter. They are plain macros, so each
+   argument may be evaluated more than once: pass side-effect-free expressions. */
+#define MIN(a,b) ((a) > (b) ? (b) : (a))
+#define MAX(a,b) ((a) < (b) ? (b) : (a))
+#define ABS(a) ((a) > 0 ? (a) : (-(a)))
+
+/* Three-operand forms built from the two-operand macros above. */
+#define MIN3(a,b,c) MIN(MIN(a,b),c)
+#define MAX3(a,b,c) MAX(MAX(a,b),c)
+
+/* Line filter in use; filter_plane() assigns it on every call before the row loop. */
+static void (*filter_line)(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity);
+
+#ifdef __GNUC__
+/* Assembly fragments (string literals) pasted together by the FILTER macro in
+   filter_line_mmx2(). */
+
+/* Load 4 bytes from mem into dst and interleave them with the bytes of %mm7.
+   FILTER clears %mm7 at the top of each iteration, which makes the result
+   four zero-extended 16-bit words. */
+#define LOAD4(mem,dst) \
+            "movd      "mem", "#dst" \n\t"\
+            "punpcklbw %%mm7, "#dst" \n\t"
+
+/* dst = max(dst, 0 - dst) on signed 16-bit words, i.e. the absolute value.
+   tmp is overwritten. */
+#define PABS(tmp,dst) \
+            "pxor     "#tmp", "#tmp" \n\t"\
+            "psubw    "#dst", "#tmp" \n\t"\
+            "pmaxsw   "#tmp", "#dst" \n\t"
+
+/* Evaluate one direction: pj is the byte displacement applied to the line at
+   cur+mrefs, mj the one applied to the line at cur+prefs. On exit %mm2 holds
+   the score (sum of three absolute differences, as words) and %mm5 the
+   candidate prediction (average of the two centre samples, as words).
+   %mm3 and %mm4 are overwritten. */
+#define CHECK(pj,mj) \
+            "movq "#pj"(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1+j] */\
+            "movq "#mj"(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1-j] */\
+            "movq      %%mm2, %%mm4 \n\t"\
+            "movq      %%mm2, %%mm5 \n\t"\
+            "pxor      %%mm3, %%mm4 \n\t"\
+            "pavgb     %%mm3, %%mm5 \n\t"\
+            "pand     %[pb1], %%mm4 \n\t"\
+            "psubusb   %%mm4, %%mm5 \n\t"\
+            "psrlq     $8,    %%mm5 \n\t"\
+            "punpcklbw %%mm7, %%mm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
+            "movq      %%mm2, %%mm4 \n\t"\
+            "psubusb   %%mm3, %%mm2 \n\t"\
+            "psubusb   %%mm4, %%mm3 \n\t"\
+            "pmaxub    %%mm3, %%mm2 \n\t"\
+            "movq      %%mm2, %%mm3 \n\t"\
+            "movq      %%mm2, %%mm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
+            "psrlq      $8,   %%mm3 \n\t" /* ABS(cur[x-refs  +j] - cur[x+refs  -j]) */\
+            "psrlq     $16,   %%mm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
+            "punpcklbw %%mm7, %%mm2 \n\t"\
+            "punpcklbw %%mm7, %%mm3 \n\t"\
+            "punpcklbw %%mm7, %%mm4 \n\t"\
+            "paddw     %%mm3, %%mm2 \n\t"\
+            "paddw     %%mm4, %%mm2 \n\t" /* score */
+
+/* Accept the CHECK result where score (%mm2) < spatial_score (%mm0): update
+   %mm0 and select %mm5 into spatial_pred (%mm1) per word. The comparison
+   mask is saved in %mm6 for a following CHECK2. */
+#define CHECK1 \
+            "movq      %%mm0, %%mm3 \n\t"\
+            "pcmpgtw   %%mm2, %%mm3 \n\t" /* if(score < spatial_score) */\
+            "pminsw    %%mm2, %%mm0 \n\t" /* spatial_score= score; */\
+            "movq      %%mm3, %%mm6 \n\t"\
+            "pand      %%mm3, %%mm5 \n\t"\
+            "pandn     %%mm1, %%mm3 \n\t"\
+            "por       %%mm5, %%mm3 \n\t"\
+            "movq      %%mm3, %%mm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
+
+/* Same selection as CHECK1, but first adds (mask+1)<<14 to the score with
+   signed saturation, where mask is the CHECK1 result left in %mm6: words for
+   which CHECK1 failed (mask 0) get 1<<14 added, the others get 0. */
+#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
+                  hurts both quality and speed, but matches the C version. */\
+            "paddw    %[pw1], %%mm6 \n\t"\
+            "psllw     $14,   %%mm6 \n\t"\
+            "paddsw    %%mm6, %%mm2 \n\t"\
+            "movq      %%mm0, %%mm3 \n\t"\
+            "pcmpgtw   %%mm2, %%mm3 \n\t"\
+            "pminsw    %%mm2, %%mm0 \n\t"\
+            "pand      %%mm3, %%mm5 \n\t"\
+            "pandn     %%mm1, %%mm3 \n\t"\
+            "por       %%mm5, %%mm3 \n\t"\
+            "movq      %%mm3, %%mm1 \n\t"
+
+/*
+ * MMX/integer-SSE implementation of the YADIF line filter.
+ *
+ * mode    values below 2 enable the extra check against the lines at
+ *         +/-2*refs; values of 2 and above skip it
+ * dst     output line
+ * prev, cur, next
+ *         start of the line being rebuilt in three source buffers; the
+ *         filter reads the lines at -refs and +refs around each of them
+ *         (and at +/-2*refs around prev2/next2 when mode < 2)
+ * w       number of output bytes requested
+ * refs    byte distance between consecutive lines of the source buffers
+ * parity  non-zero: prev2/next2 are prev/cur; zero: they are cur/next
+ *
+ * Four pixels are produced per iteration and the loop runs while x < w, so
+ * up to 3 bytes past dst+w are written when w is not a multiple of 4. The
+ * 8-byte loads in CHECK also reach a few bytes left and right of the four
+ * pixels being processed.
+ *
+ * The routine leaves the MMX state dirty; the caller issues emms.
+ */
+static void filter_line_mmx2(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity){
+    static const uint64_t pw_1 = 0x0001000100010001ULL;
+    static const uint64_t pb_1 = 0x0101010101010101ULL;
+    uint64_t tmp0, tmp1, tmp2, tmp3;
+    int x;
+
+/* One instance of the loop is expanded per parity below, with prev2/next2
+   defined as the names of the asm operands to read. */
+#define FILTER\
+    for(x=0; x<w; x+=4){\
+        asm volatile(\
+            "pxor      %%mm7, %%mm7 \n\t"\
+            LOAD4("(%[cur],%[mrefs])", %%mm0) /* c = cur[x-refs] */\
+            LOAD4("(%[cur],%[prefs])", %%mm1) /* e = cur[x+refs] */\
+            LOAD4("(%["prev2"])", %%mm2) /* prev2[x] */\
+            LOAD4("(%["next2"])", %%mm3) /* next2[x] */\
+            "movq      %%mm3, %%mm4 \n\t"\
+            "paddw     %%mm2, %%mm3 \n\t"\
+            "psraw     $1,    %%mm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
+            "movq      %%mm0, %[tmp0] \n\t" /* c */\
+            "movq      %%mm3, %[tmp1] \n\t" /* d */\
+            "movq      %%mm1, %[tmp2] \n\t" /* e */\
+            "psubw     %%mm4, %%mm2 \n\t"\
+            PABS(      %%mm4, %%mm2) /* temporal_diff0 */\
+            LOAD4("(%[prev],%[mrefs])", %%mm3) /* prev[x-refs] */\
+            LOAD4("(%[prev],%[prefs])", %%mm4) /* prev[x+refs] */\
+            "psubw     %%mm0, %%mm3 \n\t"\
+            "psubw     %%mm1, %%mm4 \n\t"\
+            PABS(      %%mm5, %%mm3)\
+            PABS(      %%mm5, %%mm4)\
+            "paddw     %%mm4, %%mm3 \n\t" /* temporal_diff1 */\
+            "psrlw     $1,    %%mm2 \n\t"\
+            "psrlw     $1,    %%mm3 \n\t"\
+            "pmaxsw    %%mm3, %%mm2 \n\t"\
+            LOAD4("(%[next],%[mrefs])", %%mm3) /* next[x-refs] */\
+            LOAD4("(%[next],%[prefs])", %%mm4) /* next[x+refs] */\
+            "psubw     %%mm0, %%mm3 \n\t"\
+            "psubw     %%mm1, %%mm4 \n\t"\
+            PABS(      %%mm5, %%mm3)\
+            PABS(      %%mm5, %%mm4)\
+            "paddw     %%mm4, %%mm3 \n\t" /* temporal_diff2 */\
+            "psrlw     $1,    %%mm3 \n\t"\
+            "pmaxsw    %%mm3, %%mm2 \n\t"\
+            "movq      %%mm2, %[tmp3] \n\t" /* diff */\
+\
+            "paddw     %%mm0, %%mm1 \n\t"\
+            "paddw     %%mm0, %%mm0 \n\t"\
+            "psubw     %%mm1, %%mm0 \n\t"\
+            "psrlw     $1,    %%mm1 \n\t" /* spatial_pred */\
+            PABS(      %%mm2, %%mm0)      /* ABS(c-e) */\
+\
+            "movq -1(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1] */\
+            "movq -1(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1] */\
+            "movq      %%mm2, %%mm4 \n\t"\
+            "psubusb   %%mm3, %%mm2 \n\t"\
+            "psubusb   %%mm4, %%mm3 \n\t"\
+            "pmaxub    %%mm3, %%mm2 \n\t"\
+            /*"pshufw $9,%%mm2, %%mm3 \n\t"*/\
+            "movq %%mm2, %%mm3 \n\t" /* replace for "pshufw $9,%%mm2, %%mm3" - Fizick */\
+            "psrlq $16, %%mm3 \n\t"/* replace for "pshufw $9,%%mm2, %%mm3" - Fizick*/\
+            "punpcklbw %%mm7, %%mm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
+            "punpcklbw %%mm7, %%mm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
+            "paddw     %%mm2, %%mm0 \n\t"\
+            "paddw     %%mm3, %%mm0 \n\t"\
+            "psubw    %[pw1], %%mm0 \n\t" /* spatial_score */\
+\
+            CHECK(-2,0)\
+            CHECK1\
+            CHECK(-3,1)\
+            CHECK2\
+            CHECK(0,-2)\
+            CHECK1\
+            CHECK(1,-3)\
+            CHECK2\
+\
+            /* if(mode<2) ... */\
+            "movq    %[tmp3], %%mm6 \n\t" /* diff */\
+            /* explicit 32-bit suffix: %[mode] is an int that the "g" constraint may place in memory */\
+            "cmpl      $2, %[mode] \n\t"\
+            "jge       1f \n\t"\
+            LOAD4("(%["prev2"],%[mrefs],2)", %%mm2) /* prev2[x-2*refs] */\
+            LOAD4("(%["next2"],%[mrefs],2)", %%mm4) /* next2[x-2*refs] */\
+            LOAD4("(%["prev2"],%[prefs],2)", %%mm3) /* prev2[x+2*refs] */\
+            LOAD4("(%["next2"],%[prefs],2)", %%mm5) /* next2[x+2*refs] */\
+            "paddw     %%mm4, %%mm2 \n\t"\
+            "paddw     %%mm5, %%mm3 \n\t"\
+            "psrlw     $1,    %%mm2 \n\t" /* b */\
+            "psrlw     $1,    %%mm3 \n\t" /* f */\
+            "movq    %[tmp0], %%mm4 \n\t" /* c */\
+            "movq    %[tmp1], %%mm5 \n\t" /* d */\
+            "movq    %[tmp2], %%mm7 \n\t" /* e */\
+            "psubw     %%mm4, %%mm2 \n\t" /* b-c */\
+            "psubw     %%mm7, %%mm3 \n\t" /* f-e */\
+            "movq      %%mm5, %%mm0 \n\t"\
+            "psubw     %%mm4, %%mm5 \n\t" /* d-c */\
+            "psubw     %%mm7, %%mm0 \n\t" /* d-e */\
+            "movq      %%mm2, %%mm4 \n\t"\
+            "pminsw    %%mm3, %%mm2 \n\t"\
+            "pmaxsw    %%mm4, %%mm3 \n\t"\
+            "pmaxsw    %%mm5, %%mm2 \n\t"\
+            "pminsw    %%mm5, %%mm3 \n\t"\
+            "pmaxsw    %%mm0, %%mm2 \n\t" /* max */\
+            "pminsw    %%mm0, %%mm3 \n\t" /* min */\
+            "pxor      %%mm4, %%mm4 \n\t"\
+            "pmaxsw    %%mm3, %%mm6 \n\t"\
+            "psubw     %%mm2, %%mm4 \n\t" /* -max */\
+            "pmaxsw    %%mm4, %%mm6 \n\t" /* diff= MAX3(diff, min, -max); */\
+            "1: \n\t"\
+\
+            "movq    %[tmp1], %%mm2 \n\t" /* d */\
+            "movq      %%mm2, %%mm3 \n\t"\
+            "psubw     %%mm6, %%mm2 \n\t" /* d-diff */\
+            "paddw     %%mm6, %%mm3 \n\t" /* d+diff */\
+            "pmaxsw    %%mm2, %%mm1 \n\t"\
+            "pminsw    %%mm3, %%mm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
+            "packuswb  %%mm1, %%mm1 \n\t"\
+\
+            :[tmp0]"=m"(tmp0),\
+             [tmp1]"=m"(tmp1),\
+             [tmp2]"=m"(tmp2),\
+             [tmp3]"=m"(tmp3)\
+            :[prev] "r"(prev),\
+             [cur]  "r"(cur),\
+             [next] "r"(next),\
+             [prefs]"r"((long)refs),\
+             [mrefs]"r"((long)-refs),\
+             [pw1]  "m"(pw_1),\
+             [pb1]  "m"(pb_1),\
+             [mode] "g"(mode)\
+        );\
+        /* the result is still in %mm1 from the statement above */\
+        asm volatile("movd %%mm1, %0" :"=m"(*dst));\
+        dst += 4;\
+        prev+= 4;\
+        cur += 4;\
+        next+= 4;\
+    }
+
+    if(parity){
+#define prev2 "prev"
+#define next2 "cur"
+        FILTER
+#undef prev2
+#undef next2
+    }else{
+#define prev2 "cur"
+#define next2 "next"
+        FILTER
+#undef prev2
+#undef next2
+    }
+}
+#undef LOAD4
+#undef PABS
+#undef CHECK
+#undef CHECK1
+#undef CHECK2
+#undef FILTER
+
+#ifndef attribute_align_arg
+#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
+#    define attribute_align_arg __attribute__((force_align_arg_pointer))
+#else
+#    define attribute_align_arg
+#endif
+#endif
+
+// The SSE2/SSSE3 code needs proper alignment, which requires GCC 4.2 or above
+#if (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
+
+#ifndef DECLARE_ALIGNED
+#define DECLARE_ALIGNED(n,t,v)       t v __attribute__ ((aligned (n)))
+#endif
+
+// ================= SSE2 =================
+#define PABS(tmp,dst) \
+            "pxor     "#tmp", "#tmp" \n\t"\
+            "psubw    "#dst", "#tmp" \n\t"\
+            "pmaxsw   "#tmp", "#dst" \n\t"
+
+#define FILTER_LINE_FUNC_NAME filter_line_sse2
+#include "vf_yadif_template.h"
+
+// ================ SSSE3 =================
+#define PABS(tmp,dst) \
+            "pabsw     "#dst", "#dst" \n\t"
+
+#define FILTER_LINE_FUNC_NAME filter_line_ssse3
+#include "vf_yadif_template.h"
+
+#endif
+
+#endif
+
+/*
+ * Portable C version of the YADIF line filter: writes w bytes to dst.
+ *
+ * prev, cur and next point at the line being rebuilt in three source buffers
+ * whose lines are refs bytes apart. parity selects the pair used as temporal
+ * neighbours (prev/cur when non-zero, cur/next otherwise). With mode < 2 the
+ * lines at +/-2*refs take part in limiting the result as well.
+ */
+static void filter_line_c(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity){
+    const uint8_t *before = parity ? prev : cur;
+    const uint8_t *after  = parity ? cur  : next;
+    int i;
+
+    for(i=0; i<w; i++){
+        const uint8_t *pc = cur + i;
+        const uint8_t *pp = prev + i;
+        const uint8_t *pn = next + i;
+        const uint8_t *pb = before + i;
+        const uint8_t *pa = after + i;
+        int c = pc[-refs];                    /* sample above */
+        int e = pc[+refs];                    /* sample below */
+        int d = (pb[0] + pa[0])>>1;           /* temporal average */
+        int tdiff0 = ABS(pb[0] - pa[0]);
+        int tdiff1 = ( ABS(pp[-refs] - c) + ABS(pp[+refs] - e) )>>1;
+        int tdiff2 = ( ABS(pn[-refs] - c) + ABS(pn[+refs] - e) )>>1;
+        int diff = MAX3(tdiff0>>1, tdiff1, tdiff2);
+        int spatial_pred = (c+e)>>1;
+        int spatial_score = ABS(pc[-refs-1] - pc[+refs-1]) + ABS(c-e)
+                          + ABS(pc[-refs+1] - pc[+refs+1]) - 1;
+        int side, step, lo, hi;
+
+        /* Try the directions -1, -2 and then +1, +2. The second step of a
+           side is only examined when the first one lowered the score. */
+        for(side=-1; side<=1; side+=2){
+            for(step=1; step<=2; step++){
+                int j = side*step;
+                int score = ABS(pc[-refs-1+ j] - pc[+refs-1- j])
+                          + ABS(pc[-refs  + j] - pc[+refs  - j])
+                          + ABS(pc[-refs+1+ j] - pc[+refs+1- j]);
+                if(score >= spatial_score)
+                    break;
+                spatial_score = score;
+                spatial_pred = (pc[-refs  + j] + pc[+refs  - j])>>1;
+            }
+        }
+
+        if(mode<2){
+            int b = (pb[-2*refs] + pa[-2*refs])>>1;
+            int f = (pb[+2*refs] + pa[+2*refs])>>1;
+#if 0
+            int a= pc[-3*refs];
+            int g= pc[+3*refs];
+            int max= MAX3(d-e, d-c, MIN3(MAX(b-c,f-e),MAX(b-c,b-a),MAX(f-g,f-e)) );
+            int min= MIN3(d-e, d-c, MAX3(MIN(b-c,f-e),MIN(b-c,b-a),MIN(f-g,f-e)) );
+#else
+            int max = MAX3(d-e, d-c, MIN(b-c, f-e));
+            int min = MIN3(d-e, d-c, MAX(b-c, f-e));
+#endif
+
+            diff = MAX3(diff, min, -max);
+        }
+
+        /* keep the spatial prediction within diff of the temporal average */
+        lo = d - diff;
+        hi = d + diff;
+        if(spatial_pred > hi)
+            spatial_pred = hi;
+        else if(spatial_pred < lo)
+            spatial_pred = lo;
+
+        dst[i] = spatial_pred;
+    }
+}
+
+/* Write to dst the rounded-up mean of w byte pairs taken from cur0 and cur2. */
+static void interpolate(uint8_t *dst, const uint8_t *cur0,  const uint8_t *cur2, int w)
+{
+    int left;
+    for (left = w; left > 0; left--) {
+        int sum = *cur0++ + *cur2++;
+        *dst++ = (sum + 1)>>1;
+    }
+}
+
+/*
+ * Deinterlace one plane of w x h bytes.
+ *
+ * mode        passed through to the line filter
+ * dst         output plane, rows dst_stride bytes apart
+ * prev0, cur0, next0
+ *             the three source planes handed to the line filter, rows refs
+ *             bytes apart
+ * parity      rows y with ((y ^ parity) & 1) == 0 are copied from cur0, the
+ *             other rows are rebuilt
+ * tff         combined with parity (parity ^ tff) to form the parity argument
+ *             of the line filter
+ * cpu         mask of AVS_CPU_* flags selecting the line filter
+ *
+ * Rebuilt rows 0 and h-1 duplicate the neighbouring row, rebuilt rows 1 and
+ * h-2 are the average of their two neighbours, all others go through the
+ * line filter. Planes with fewer than 3 rows are copied unchanged because
+ * the border handling would otherwise read rows outside the plane.
+ */
+void filter_plane(int mode, uint8_t *dst, int dst_stride, const uint8_t *prev0, const uint8_t *cur0, const uint8_t *next0, int refs, int w, int h, int parity, int tff, int cpu){
+
+    int y;
+
+    if (h < 3) {
+        for (y = 0; y < h; y++)
+            memcpy(dst + y*dst_stride, cur0 + y*refs, w);
+        return;
+    }
+
+    filter_line = filter_line_c;
+#ifdef __GNUC__
+#if (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
+    if (cpu & AVS_CPU_SSSE3)
+        filter_line = filter_line_ssse3;
+    else if (cpu & AVS_CPU_SSE2)
+        filter_line = filter_line_sse2;
+    else
+#endif
+    if (cpu & AVS_CPU_INTEGER_SSE)
+        filter_line = filter_line_mmx2;
+#endif
+
+    y = 0;
+    if ((y ^ parity) & 1) {
+        memcpy(dst, cur0 + refs, w); // duplicate 1
+    } else {
+        memcpy(dst, cur0, w);
+    }
+
+    y = 1;
+    if ((y ^ parity) & 1) {
+        interpolate(dst + dst_stride, cur0, cur0 + refs*2, w); // interpolate 0 and 2
+    } else {
+        memcpy(dst + dst_stride, cur0 + refs, w); // copy original
+    }
+
+    for (y = 2; y < h-2; y++) {
+        if ((y ^ parity) & 1) {
+            const uint8_t *prev = prev0 + y*refs;
+            const uint8_t *cur  = cur0 + y*refs;
+            const uint8_t *next = next0 + y*refs;
+            uint8_t *dst2 = dst + y*dst_stride;
+            filter_line(mode, dst2, prev, cur, next, w, refs, (parity ^ tff));
+        } else {
+            memcpy(dst + y*dst_stride, cur0 + y*refs, w); // copy original
+        }
+    }
+
+    y = h-2;
+    if ((y ^ parity) & 1) {
+        interpolate(dst + (h-2)*dst_stride, cur0 + (h-3)*refs, cur0 + (h-1)*refs, w); // interpolate h-3 and h-1
+    } else {
+        memcpy(dst + (h-2)*dst_stride, cur0 + (h-2)*refs, w); // copy original
+    }
+
+    y = h-1;
+    if ((y ^ parity) & 1) {
+        memcpy(dst + (h-1)*dst_stride, cur0 + (h-2)*refs, w); // duplicate h-2
+    } else {
+        memcpy(dst + (h-1)*dst_stride, cur0 + (h-1)*refs, w); // copy original
+    }
+
+#ifdef __GNUC__
+    /* only the MMX line filter leaves MMX state behind */
+    if (filter_line == filter_line_mmx2)
+        asm volatile("emms");
+#endif
+}
+
+#ifdef __GNUC__
+#ifndef PIC
+/*
+ * Split packed YUY2 rows into separate Y, U and V planes with MMX.
+ *
+ * Per loop iteration 16 source bytes (8 pixels) are read and 8 Y bytes,
+ * 4 U bytes and 4 V bytes are stored. %eax counts chroma samples and is
+ * compared against width/2 at the bottom of the loop.
+ *
+ * NOTE(review): the loop is bottom-tested, so one 8-pixel group is converted
+ * per row even when width is 0; callers must not pass width 0.
+ * NOTE(review): the asm addresses memory through 32-bit register names
+ * (%edi, %ebx, %edx, %esi), which looks x86-32 only - confirm that 64-bit
+ * builds always define PIC so that this code is excluded.
+ * NOTE(review): the asm stores through py/pu/pv and modifies MMX registers
+ * and flags, but only %eax is listed as clobbered - confirm this is safe
+ * with the compilers in use.
+ */
+static attribute_align_arg void  YUY2ToPlanes_mmx(const unsigned char *srcYUY2, int pitch_yuy2, int width, int height,
+                    unsigned char *py, int pitch_y,
+                    unsigned char *pu, unsigned char *pv,  int pitch_uv)
+{ /* process by 16 bytes (8 pixels), so width is assumed mod 8 */
+    int widthdiv2 = width>>1;
+//    static unsigned __int64 Ymask = 0x00FF00FF00FF00FFULL;
+    int h;
+    for (h=0; h<height; h++)
+    {
+        /* operands: edi = source row, ebx = Y row, edx = U row, esi = V row, ecx = width/2 */
+        asm (\
+        "pcmpeqb %%mm5, %%mm5 \n\t"  /* prepare Ymask FFFFFFFFFFFFFFFF */\
+        "psrlw $8, %%mm5 \n\t" /* Ymask = 00FF00FF00FF00FF */\
+        "xor %%eax, %%eax \n\t"\
+        "xloop%= : \n\t"\
+        "prefetchnta 0xc0(%%edi,%%eax,4) \n\t"\
+        "movq (%%edi,%%eax,4), %%mm0 \n\t" /* src VYUYVYUY - 1 */\
+        "movq 8(%%edi,%%eax,4), %%mm1 \n\t" /* src VYUYVYUY - 2 */\
+        "movq %%mm0, %%mm2 \n\t" /* VYUYVYUY - 1 */\
+        "movq %%mm1, %%mm3 \n\t" /* VYUYVYUY - 2 */\
+        "pand %%mm5, %%mm0 \n\t" /* 0Y0Y0Y0Y - 1 */\
+        "psrlw $8, %%mm2 \n\t" /* 0V0U0V0U - 1 */\
+        "pand %%mm5, %%mm1 \n\t" /* 0Y0Y0Y0Y - 2 */\
+        "psrlw $8, %%mm3 \n\t" /* 0V0U0V0U - 2 */\
+        "packuswb %%mm1, %%mm0 \n\t" /* YYYYYYYY */\
+        "packuswb %%mm3, %%mm2 \n\t" /* VUVUVUVU */\
+        "movntq %%mm0, (%%ebx,%%eax,2) \n\t" /* store y */\
+        "movq %%mm2, %%mm4 \n\t" /* VUVUVUVU */\
+        "pand %%mm5, %%mm2 \n\t" /* 0U0U0U0U */\
+        "psrlw $8, %%mm4 \n\t" /* 0V0V0V0V */\
+        "packuswb %%mm2, %%mm2 \n\t" /* xxxxUUUU */\
+        "packuswb %%mm4, %%mm4 \n\t" /* xxxxVVVV */\
+        "movd %%mm2, (%%edx,%%eax) \n\t" /* store u */\
+        "add $4, %%eax \n\t" \
+        "cmp %%ecx, %%eax \n\t" \
+        "movd %%mm4, -4(%%esi,%%eax) \n\t" /* store v */\
+        "jl xloop%= \n\t"\
+        : : "D"(srcYUY2), "b"(py), "d"(pu), "S"(pv), "c"(widthdiv2) : "%eax");
+
+        /* advance every pointer to its next row */
+        srcYUY2 += pitch_yuy2;
+        py += pitch_y;
+        pu += pitch_uv;
+        pv += pitch_uv;
+    }
+    /* order the non-temporal stores and leave MMX state */
+    asm ("sfence \n\t emms");
+}
+
+/*
+ * Interleave separate Y, U and V planes into packed YUY2 rows with MMX.
+ *
+ * Per loop iteration 8 Y bytes, 4 U bytes and 4 V bytes are read and 16
+ * packed bytes (8 pixels) are stored. %eax counts chroma samples and is
+ * compared against width/2 at the bottom of the loop.
+ *
+ * NOTE(review): the loop is bottom-tested, so one 8-pixel group is written
+ * per row even when width is 0; callers must not pass width 0.
+ * NOTE(review): the asm addresses memory through 32-bit register names
+ * (%edi, %ebx, %edx, %esi), which looks x86-32 only - confirm that 64-bit
+ * builds always define PIC so that this code is excluded.
+ * NOTE(review): the asm stores through dstYUY2 and modifies MMX registers
+ * and flags, but only %eax is listed as clobbered - confirm this is safe
+ * with the compilers in use.
+ */
+static attribute_align_arg void YUY2FromPlanes_mmx(unsigned char *dstYUY2, int pitch_yuy2, int width, int height,
+                    const unsigned char *py, int pitch_y,
+                    const unsigned char *pu, const unsigned char *pv,  int pitch_uv)
+{
+    int widthdiv2 = width >> 1;
+    int h;
+    for (h=0; h<height; h++)
+    {
+        /* operands: ebx = Y row, edx = U row, esi = V row, edi = destination row, ecx = width/2 */
+        asm (\
+        "xor %%eax, %%eax \n\t"\
+        "xloop%=: \n\t"\
+        "movd (%%edx,%%eax), %%mm1 \n\t" /* 0000UUUU */\
+        "movd (%%esi,%%eax), %%mm2 \n\t" /* 0000VVVV */\
+        "movq (%%ebx,%%eax,2), %%mm0 \n\t" /* YYYYYYYY */\
+        "punpcklbw %%mm2,%%mm1 \n\t" /* VUVUVUVU */\
+        "movq %%mm0, %%mm3 \n\t" /* YYYYYYYY */\
+        "punpcklbw %%mm1, %%mm0 \n\t" /* VYUYVYUY */\
+        "add $4, %%eax \n\t"\
+        "punpckhbw %%mm1, %%mm3 \n\t" /* VYUYVYUY */\
+        "movntq %%mm0, -16(%%edi,%%eax,4) \n\t" /*store */\
+        "movntq %%mm3, -8(%%edi,%%eax,4) \n\t" /*  store */\
+        "cmp %%ecx, %%eax \n\t"\
+        "jl xloop%= \n\t"\
+        : : "b"(py), "d"(pu), "S"(pv), "D"(dstYUY2), "c"(widthdiv2) : "%eax");
+        /* advance every pointer to its next row */
+        py += pitch_y;
+        pu += pitch_uv;
+        pv += pitch_uv;
+        dstYUY2 += pitch_yuy2;
+    }
+    /* order the non-temporal stores and leave MMX state */
+    asm ("sfence \n\t emms");
+}
+#endif
+#endif
+
+//----------------------------------------------------------------------------------------------
+
+/*
+ * Unpack nHeight rows of packed YUY2 (bytes Y0 U Y1 V per pixel pair) into
+ * separate Y, U and V planes.
+ *
+ * pSrcYUY2 / nSrcPitchYUY2   packed source and its row pitch in bytes
+ * nWidth                     pixels per row; pixels are handled in pairs, so
+ *                            an even value is expected
+ * pSrcY / srcPitchY          destination luma plane and its row pitch
+ * pSrcU, pSrcV / srcPitchUV  destination chroma planes and their row pitch
+ * cpu                        mask of AVS_CPU_* flags
+ *
+ * When the MMX helper is available it converts the first w0 pixels of each
+ * row (w0 = nWidth rounded down to a multiple of 8); the C loop converts the
+ * remaining columns.
+ */
+void YUY2ToPlanes(const unsigned char *pSrcYUY2, int nSrcPitchYUY2, int nWidth, int nHeight,
+                                                          unsigned char * pSrcY, int srcPitchY,
+                                                          unsigned char * pSrcU,  unsigned char * pSrcV, int srcPitchUV, int cpu)
+{
+
+    int h,w;
+    int w0 = 0;
+#if defined(__GNUC__) && !defined(PIC)
+    if (cpu & AVS_CPU_INTEGER_SSE) {
+        w0 = (nWidth/8)*8;
+        /* The helper's loop is bottom-tested and would convert one 8-pixel
+           group per row even for a width of 0, overrunning rows narrower
+           than 8 pixels. */
+        if (w0 > 0)
+            YUY2ToPlanes_mmx(pSrcYUY2, nSrcPitchYUY2, w0, nHeight, pSrcY, srcPitchY, pSrcU, pSrcV, srcPitchUV);
+    }
+#endif
+    for (h=0; h<nHeight; h++)
+    {
+        for (w=w0; w<nWidth; w+=2)
+        {
+            int w2 = w+w;
+            pSrcY[w] = pSrcYUY2[w2];
+            pSrcY[w+1] = pSrcYUY2[w2+2];
+            pSrcU[(w>>1)] = pSrcYUY2[w2+1];
+            pSrcV[(w>>1)] = pSrcYUY2[w2+3];
+        }
+        pSrcY += srcPitchY;
+        pSrcU += srcPitchUV;
+        pSrcV += srcPitchUV;
+        pSrcYUY2 += nSrcPitchYUY2;
+    }
+}
+
+//----------------------------------------------------------------------------------------------
+
+/*
+ * Pack separate Y, U and V planes into nHeight rows of YUY2 (bytes
+ * Y0 U Y1 V per pixel pair).
+ *
+ * pSrcYUY2 / nSrcPitchYUY2   packed destination (despite the name) and its
+ *                            row pitch in bytes
+ * nWidth                     pixels per row; pixels are handled in pairs, so
+ *                            an even value is expected
+ * pSrcY / srcPitchY          source luma plane and its row pitch
+ * pSrcU, pSrcV / srcPitchUV  source chroma planes and their row pitch
+ * cpu                        mask of AVS_CPU_* flags
+ *
+ * When the MMX helper is available it converts the first w0 pixels of each
+ * row (w0 = nWidth rounded down to a multiple of 8); the C loop converts the
+ * remaining columns.
+ */
+void YUY2FromPlanes(unsigned char *pSrcYUY2, int nSrcPitchYUY2, int nWidth, int nHeight,
+                                                         const unsigned char * pSrcY, int srcPitchY,
+                                                         const unsigned char * pSrcU, const unsigned char * pSrcV, int srcPitchUV, int cpu)
+{
+    int h,w;
+    int w0 = 0;
+#if defined(__GNUC__) && !defined(PIC)
+    if (cpu & AVS_CPU_INTEGER_SSE) {
+        w0 = (nWidth/8)*8;
+        /* The helper's loop is bottom-tested and would write one 8-pixel
+           group per row even for a width of 0, overrunning rows narrower
+           than 8 pixels. */
+        if (w0 > 0)
+            YUY2FromPlanes_mmx(pSrcYUY2, nSrcPitchYUY2, w0, nHeight, pSrcY, srcPitchY, pSrcU, pSrcV, srcPitchUV);
+    }
+#endif
+    for (h=0; h<nHeight; h++)
+    {
+        for (w=w0; w<nWidth; w+=2)
+        {
+            int w2 = w+w;
+            pSrcYUY2[w2] = pSrcY[w];
+            pSrcYUY2[w2+1] = pSrcU[(w>>1)];
+            pSrcYUY2[w2+2] = pSrcY[w+1];
+            pSrcYUY2[w2+3] = pSrcV[(w>>1)];
+        }
+        pSrcY += srcPitchY;
+        pSrcU += srcPitchUV;
+        pSrcV += srcPitchUV;
+        pSrcYUY2 += nSrcPitchYUY2;
+    }
+}
diff --git a/src/modules/xine/yadif.h b/src/modules/xine/yadif.h
new file mode 100644 (file)
index 0000000..2f89e50
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+       Yadif C-plugin for Avisynth 2.5 - Yet Another DeInterlacing Filter
+       Copyright (C)2007 Alexander G. Balakhnin aka Fizick  http://avisynth.org.ru
+    Port of YADIF filter from MPlayer
+       Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+
+    This program is free software; you can redistribute it and/or modify
+       it under the terms of the GNU General Public License as published by
+       the Free Software Foundation.
+
+       This program is distributed in the hope that it will be useful,
+       but WITHOUT ANY WARRANTY; without even the implied warranty of
+       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+       GNU General Public License for more details.
+
+       You should have received a copy of the GNU General Public License
+       along with this program; if not, write to the Free Software
+       Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+    Avisynth_C plugin
+       Assembler optimized for GNU C compiler
+
+*/
+
+#ifndef YADIF_H_
+#define YADIF_H_
+
+#include <stdint.h>
+
+/* Bit flags for the cpu argument of filter_plane(), YUY2ToPlanes() and
+   YUY2FromPlanes(); they are tested with '&' and may be combined. */
+#define AVS_CPU_INTEGER_SSE 0x1
+#define AVS_CPU_SSE2 0x2
+#define AVS_CPU_SSSE3 0x4
+
+/*
+ * State shared with the users of the YADIF routines.
+ * NOTE(review): nothing in yadif.c reads this struct; judging by the names it
+ * holds the plane geometry plus planar Y/U/V buffers for the current (src),
+ * previous, next and output (dest) frames - confirm against the caller.
+ */
+typedef struct yadif_filter  {
+       int cpu; // optimization
+       int yheight;
+       int ypitch;
+       int uvpitch;
+       int ywidth;
+       int uvwidth;
+       unsigned char *ysrc;
+       unsigned char *usrc;
+       unsigned char *vsrc;
+       unsigned char *yprev;
+       unsigned char *uprev;
+       unsigned char *vprev;
+       unsigned char *ynext;
+       unsigned char *unext;
+       unsigned char *vnext;
+       unsigned char *ydest;
+       unsigned char *udest;
+       unsigned char *vdest;
+} yadif_filter;
+
+/* Deinterlace one w x h plane: rows y with ((y ^ parity) & 1) == 0 are copied
+   from cur0, the others are rebuilt from prev0/cur0/next0 (row stride refs)
+   into dst (row stride dst_stride). cpu is a mask of AVS_CPU_* flags. */
+void filter_plane(int mode, uint8_t *dst, int dst_stride, const uint8_t *prev0, const uint8_t *cur0, const uint8_t *next0, int refs, int w, int h, int parity, int tff, int cpu);
+/* Unpack packed YUY2 rows (bytes Y0 U Y1 V) into separate Y, U and V planes. */
+void YUY2ToPlanes(const unsigned char *pSrcYUY2, int nSrcPitchYUY2, int nWidth, int nHeight,
+                                                          unsigned char * pSrcY, int srcPitchY,
+                                                          unsigned char * pSrcU,  unsigned char * pSrcV, int srcPitchUV, int cpu);
+/* Pack separate Y, U and V planes into YUY2 rows; pSrcYUY2 is the destination. */
+void YUY2FromPlanes(unsigned char *pSrcYUY2, int nSrcPitchYUY2, int nWidth, int nHeight,
+                                                         const unsigned char * pSrcY, int srcPitchY,
+                                                         const unsigned char * pSrcU, const unsigned char * pSrcV, int srcPitchUV, int cpu);
+
+#endif