]> git.sesse.net Git - ffmpeg/commitdiff
fixed the height%8!=0 bug
authorMichael Niedermayer <michaelni@gmx.at>
Wed, 17 Oct 2001 20:42:07 +0000 (20:42 +0000)
committerMichael Niedermayer <michaelni@gmx.at>
Wed, 17 Oct 2001 20:42:07 +0000 (20:42 +0000)
simplified a few things
removed last row variants of the deinterlace filters, they are not needed anymore
added cubic interpolating deinterlacer

Originally committed as revision 2247 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

postproc/postprocess.c
postproc/postprocess.h
postproc/postprocess_template.c

index 93603e0c3a57609b48060c30896a18b20d3d2247..33ebf42c3449d4601648d0ef3b3f421a16538bda 100644 (file)
@@ -30,14 +30,15 @@ deRing
 Vertical RKAlgo1       E               a       a
 Vertical X1            a               E       E
 Horizontal X1          a               E       E
-LinIpolDeinterlace     a               E       E*
-LinBlendDeinterlace    a               E       E*
+LinIpolDeinterlace     e               E       E*
+CubicIpolDeinterlace   a               e       e*
+LinBlendDeinterlace    e               E       E*
 MedianDeinterlace              Ec      Ec
 
 
 * i dont have a 3dnow CPU -> its untested
 E = Exact implementation
-e = allmost exact implementation
+e = allmost exact implementation (slightly different rounding,...)
 a = alternative / approximate impl
 c = checked against the other implementations (-vo md5)
 */
@@ -63,7 +64,6 @@ noise reduction filters
 
 Notes:
 
-
 */
 
 //Changelog: use the CVS log
@@ -178,12 +178,12 @@ static inline void prefetcht2(void *p)
 
 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
 /**
- * Check if the middle 8x8 Block in the given 8x10 block is flat
+ * Check if the middle 8x8 Block in the given 8x16 block is flat
  */
 static inline int isVertDC(uint8_t src[], int stride){
        int numEq= 0;
        int y;
-       src+= stride; // src points to begin of the 8x8 Block
+       src+= stride*4; // src points to begin of the 8x8 Block
 #ifdef HAVE_MMX
        asm volatile(
                "pushl %1\n\t"
@@ -295,6 +295,7 @@ static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 {
 #ifdef HAVE_MMX
        int isOk;
+       src+= stride*3;
        asm volatile(
 //             "int $3 \n\t"
                "movq (%1, %2), %%mm0                           \n\t"
@@ -320,6 +321,7 @@ static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 
        int isOk2= 1;
        int x;
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
@@ -343,19 +345,16 @@ static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 }
 
 /**
- * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
+ * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
  */
 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 {
-//     QP= 64;
-
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-//#ifdef HAVE_MMX2
+       src+= stride*3;
        asm volatile(   //"movv %0 %1 %2\n\t"
                "pushl %0 \n\t"
                "movq pQPb, %%mm0                               \n\t"  // QP,..., QP
-//             "movq bFF  , %%mm0                              \n\t"  // QP,..., QP
 
                "movq (%0), %%mm6                               \n\t"
                "movq (%0, %1), %%mm5                           \n\t"
@@ -395,20 +394,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
                // 6 4 2 2 1 1
                // 6 4 4 2
                // 6 8 2
-/*
-               "movq %%mm6, %%mm2                              \n\t" //1
-               "movq %%mm6, %%mm3                              \n\t" //1
-               "paddusb b02, %%mm3                             \n\t"
-               "psrlw $2, %%mm3                                \n\t" //1       /4
-               "pand b3F, %%mm3                                \n\t"
-               "psubb %%mm3, %%mm2                             \n\t"
-               "movq (%0, %1), %%mm0                           \n\t" //  1
-               "movq %%mm0, %%mm1                              \n\t" //  1
-               "paddusb b02, %%mm0                             \n\t"
-               "psrlw $2, %%mm0                                \n\t" //  1     /4
-               "pand b3F, %%mm0                                \n\t"
-               "paddusb %%mm2, %%mm0                           \n\t" //3 1     /4
-*/
+
                "movq (%0, %1), %%mm0                           \n\t" //  1
                "movq %%mm0, %%mm1                              \n\t" //  1
                PAVGB(%%mm6, %%mm0)                                   //1 1     /2
@@ -470,7 +456,6 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
                "movq (%%eax, %1, 2), %%mm6                     \n\t" //      1
                PAVGB(%%mm6, %%mm1)                                   //  11  4  2      /8
                PAVGB(%%mm0, %%mm1)                                   //  11224222      /16
-//             "pxor %%mm1, %%mm1 \n\t"
                "movq %%mm1, (%%eax, %1, 2)                     \n\t" //      X
                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
                PAVGB((%%ebx), %%mm2)                                 //   112 4        /8
@@ -478,7 +463,6 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
                PAVGB(%%mm0, %%mm6)                                   //      1 1       /2
                PAVGB(%%mm7, %%mm6)                                   //      1 12      /4
                PAVGB(%%mm2, %%mm6)                                   //   1122424      /4
-//             "pxor %%mm6, %%mm6 \n\t"
                "movq %%mm6, (%%ebx)                            \n\t" //       X
                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
                PAVGB(%%mm7, %%mm5)                                   //    11   2      /4
@@ -486,8 +470,6 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 
                PAVGB(%%mm3, %%mm0)                                   //      112       /4
                PAVGB(%%mm0, %%mm5)                                   //    112246      /16
-//             "pxor %%mm5, %%mm5 \n\t"
-//             "movq pQPb, %%mm5 \n\t"
                "movq %%mm5, (%%eax, %1, 4)                     \n\t" //        X
                "popl %0\n\t"
 
@@ -506,6 +488,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
        const int l8= stride + l7;
        const int l9= stride + l8;
        int x;
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
@@ -551,6 +534,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       src+= stride*3;
 // FIXME rounding
        asm volatile(
                "pxor %%mm7, %%mm7                              \n\t" // 0
@@ -622,6 +606,7 @@ static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
        const int l8= stride + l7;
        const int l9= stride + l8;
        int x;
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                if(ABS(src[l4]-src[l5]) < QP + QP/4)
@@ -650,6 +635,8 @@ static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       src+= stride*3;
+
        asm volatile(
                "pxor %%mm7, %%mm7                              \n\t" // 0
 //             "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
@@ -744,6 +731,8 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
        const int l8= stride + l7;
        const int l9= stride + l8;
        int x;
+
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                int a= src[l3] - src[l4];
@@ -1007,7 +996,7 @@ HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
 {
 #ifdef HAVE_MMX
-       src+= stride;
+       src+= stride*4;
        //FIXME try pmul for *5 stuff
 //     src[0]=0;
        asm volatile(
@@ -1154,7 +1143,6 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
                "psubw %%mm7, %%mm5                             \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
 // 100 opcodes
                "movd %2, %%mm2                                 \n\t" // QP
-//"pcmpeqb %%mm2, %%mm2\n\t"
                "punpcklwd %%mm2, %%mm2                         \n\t"
                "punpcklwd %%mm2, %%mm2                         \n\t"
                "psllw $3, %%mm2                                \n\t" // 8QP
@@ -1232,7 +1220,6 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
                "movq %%mm0, (%%eax, %1, 2)                     \n\t"
                "movq (%0, %1, 4), %%mm0                        \n\t"
                "psubb %%mm4, %%mm0                             \n\t"
-//             "pxor %%mm0, %%mm0 \n\t"
                "movq %%mm0, (%0, %1, 4)                        \n\t"
 
                :
@@ -1250,6 +1237,7 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
        const int l8= stride + l7;
 //     const int l9= stride + l8;
        int x;
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
@@ -1881,7 +1869,7 @@ FIND_MIN_MAX(%%ebx, %1, 2)
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
  */
 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
 {
@@ -1894,16 +1882,16 @@ static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
 
                "movq (%0), %%mm0                               \n\t"
                "movq (%%eax, %1), %%mm1                        \n\t"
-               PAVGB(%%mm1, %%mm0)\
+               PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm0                        \n\t"
-               PAVGB(%%mm0, %%mm1)\
+               PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm1                        \n\t"
-               PAVGB(%%mm1, %%mm0)\
+               PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm0                        \n\t"
-               PAVGB(%%mm0, %%mm1)\
+               PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%ebx, %1, 2)                     \n\t"
 
                : : "r" (src), "r" (stride)
@@ -1924,41 +1912,59 @@ static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
+ * no cliping in C version
  */
-static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride)
+static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
-//     0       1       2       3       4       5       6       7       8       9
-//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
-
-               "movq (%0), %%mm0                               \n\t"
-               "movq (%%eax, %1), %%mm1                        \n\t"
-               PAVGB(%%mm1, %%mm0)\
-               "movq %%mm0, (%%eax)                            \n\t"
-               "movq (%0, %1, 4), %%mm0                        \n\t"
-               PAVGB(%%mm0, %%mm1)\
-               "movq %%mm1, (%%eax, %1, 2)                     \n\t"
-               "movq (%%ebx, %1), %%mm1                        \n\t"
-               PAVGB(%%mm1, %%mm0)\
-               "movq %%mm0, (%%ebx)                            \n\t"
-               "movq %%mm1, (%%ebx, %1, 2)                     \n\t"
+               "leal (%%ebx, %1, 4), %%ecx                     \n\t"
+               "addl %1, %%ecx                                 \n\t"
+               "pxor %%mm7, %%mm7                              \n\t"
+//     0       1       2       3       4       5       6       7       8       9       10
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1 ecx
 
+#define DEINT_CUBIC(a,b,c,d,e)\
+               "movq " #a ", %%mm0                             \n\t"\
+               "movq " #b ", %%mm1                             \n\t"\
+               "movq " #d ", %%mm2                             \n\t"\
+               "movq " #e ", %%mm3                             \n\t"\
+               PAVGB(%%mm2, %%mm1)                                     /* (b+d) /2 */\
+               PAVGB(%%mm3, %%mm0)                                     /* a(a+e) /2 */\
+               "movq %%mm0, %%mm2                              \n\t"\
+               "punpcklbw %%mm7, %%mm0                         \n\t"\
+               "punpckhbw %%mm7, %%mm2                         \n\t"\
+               "movq %%mm1, %%mm3                              \n\t"\
+               "punpcklbw %%mm7, %%mm1                         \n\t"\
+               "punpckhbw %%mm7, %%mm3                         \n\t"\
+               "psubw %%mm1, %%mm0                             \n\t"   /* L(a+e - (b+d))/2 */\
+               "psubw %%mm3, %%mm2                             \n\t"   /* H(a+e - (b+d))/2 */\
+               "psraw $3, %%mm0                                \n\t"   /* L(a+e - (b+d))/16 */\
+               "psraw $3, %%mm2                                \n\t"   /* H(a+e - (b+d))/16 */\
+               "psubw %%mm0, %%mm1                             \n\t"   /* L(9b + 9d - a - e)/16 */\
+               "psubw %%mm2, %%mm3                             \n\t"   /* H(9b + 9d - a - e)/16 */\
+               "packuswb %%mm3, %%mm1                          \n\t"\
+               "movq %%mm1, " #c "                             \n\t"
+
+DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
+DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
+DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
+DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
 
                : : "r" (src), "r" (stride)
-               : "%eax", "%ebx"
+               : "%eax", "%ebx", "ecx"
        );
 #else
        int x;
        for(x=0; x<8; x++)
        {
-               src[stride]   = (src[0]        + src[stride*2])>>1;
-               src[stride*3] = (src[stride*2] + src[stride*4])>>1;
-               src[stride*5] = (src[stride*4] + src[stride*6])>>1;
-               src[stride*7] = src[stride*6];
+               src[stride*3] = (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
+               src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
+               src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
+               src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
                src++;
        }
 #endif
@@ -1966,7 +1972,7 @@ static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
  * will shift the image up by 1 line (FIXME if this is a problem)
  */
 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
@@ -2034,70 +2040,6 @@ static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
 #endif
 }
 
-/**
- * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- * will shift the image up by 1 line (FIXME if this is a problem)
- */
-static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride)
-{
-#if defined (HAVE_MMSX2) || defined (HAVE_3DNOW)
-       asm volatile(
-               "leal (%0, %1), %%eax                           \n\t"
-               "leal (%%eax, %1, 4), %%ebx                     \n\t"
-//     0       1       2       3       4       5       6       7       8       9
-//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
-
-               "movq (%0), %%mm0                               \n\t" // L0
-               "movq (%%eax, %1), %%mm1                        \n\t" // L2
-               PAVGB(%%mm1, %%mm0)                                   // L0+L2
-               "movq (%%eax), %%mm2                            \n\t" // L1
-               PAVGB(%%mm2, %%mm0)
-               "movq %%mm0, (%0)                               \n\t"
-               "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
-               PAVGB(%%mm0, %%mm2)                                   // L1+L3
-               PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
-               "movq %%mm2, (%%eax)                            \n\t"
-               "movq (%0, %1, 4), %%mm2                        \n\t" // L4
-               PAVGB(%%mm2, %%mm1)                                   // L2+L4
-               PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
-               "movq %%mm1, (%%eax, %1)                        \n\t"
-               "movq (%%ebx), %%mm1                            \n\t" // L5
-               PAVGB(%%mm1, %%mm0)                                   // L3+L5
-               PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
-               "movq %%mm0, (%%eax, %1, 2)                     \n\t"
-               "movq (%%ebx, %1), %%mm0                        \n\t" // L6
-               PAVGB(%%mm0, %%mm2)                                   // L4+L6
-               PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
-               "movq %%mm2, (%0, %1, 4)                        \n\t"
-               "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
-               PAVGB(%%mm2, %%mm1)                                   // L5+L7
-               PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
-               "movq %%mm1, (%%ebx)                            \n\t"
-               PAVGB(%%mm2, %%mm0)                                   // L7 + L8
-               "movq %%mm0, (%%ebx, %1)                        \n\t"
-               "movq %%mm0, (%%ebx, %1, 2)                     \n\t"
-
-               : : "r" (src), "r" (stride)
-               : "%eax", "%ebx"
-       );
-#else
-       int x;
-       for(x=0; x<8; x++)
-       {
-               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
-               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
-               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
-               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
-               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
-               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
-               src[stride*6] = (src[stride*6] +   src[stride*7])>>1;
-               src[stride*7] = src[stride*6];
-               src++;
-       }
-#endif
-}
-
 /**
  * Deinterlaces the given block
  * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
@@ -2213,91 +2155,6 @@ MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
 #endif
 }
 
-/**
- * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- */
-static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
-{
-#ifdef HAVE_MMX
-#ifdef HAVE_MMX2
-       asm volatile(
-               "leal (%0, %1), %%eax                           \n\t"
-               "leal (%%eax, %1, 4), %%ebx                     \n\t"
-//     0       1       2       3       4       5       6       7       8       9
-//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
-
-               "movq (%0), %%mm0                               \n\t" //
-               "movq (%%eax, %1), %%mm2                        \n\t" //
-               "movq (%%eax), %%mm1                            \n\t" //
-               "movq %%mm0, %%mm3                              \n\t"
-               "pmaxub %%mm1, %%mm0                            \n\t" //
-               "pminub %%mm3, %%mm1                            \n\t" //
-               "pmaxub %%mm2, %%mm1                            \n\t" //
-               "pminub %%mm1, %%mm0                            \n\t"
-               "movq %%mm0, (%%eax)                            \n\t"
-
-               "movq (%0, %1, 4), %%mm0                        \n\t" //
-               "movq (%%eax, %1, 2), %%mm1                     \n\t" //
-               "movq %%mm2, %%mm3                              \n\t"
-               "pmaxub %%mm1, %%mm2                            \n\t" //
-               "pminub %%mm3, %%mm1                            \n\t" //
-               "pmaxub %%mm0, %%mm1                            \n\t" //
-               "pminub %%mm1, %%mm2                            \n\t"
-               "movq %%mm2, (%%eax, %1, 2)                     \n\t"
-
-               "movq (%%ebx), %%mm2                            \n\t" //
-               "movq (%%ebx, %1), %%mm1                        \n\t" //
-               "movq %%mm2, %%mm3                              \n\t"
-               "pmaxub %%mm0, %%mm2                            \n\t" //
-               "pminub %%mm3, %%mm0                            \n\t" //
-               "pmaxub %%mm1, %%mm0                            \n\t" //
-               "pminub %%mm0, %%mm2                            \n\t"
-               "movq %%mm2, (%%ebx)                            \n\t"
-
-               "movq %%mm1, (%%ebx, %1, 2)                     \n\t"
-
-               : : "r" (src), "r" (stride)
-               : "%eax", "%ebx"
-       );
-#else //MMX & no MMX2
-asm volatile(
-               "leal (%0, %1), %%eax                           \n\t"
-               "leal (%%eax, %1, 4), %%ebx                     \n\t"
-//     0       1       2       3       4       5       6       7       8       9
-//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
-               "pxor %%mm7, %%mm7                              \n\t"
-
-MEDIAN((%0), (%%eax), (%%eax, %1))
-MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
-MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
-
-               "movq (%%ebx, %1), %%mm0                        \n\t"
-               "movq %%mm0, (%%ebx, %1, 2)                     \n\t"
-
-               : : "r" (src), "r" (stride)
-               : "%eax", "%ebx"
-       );
-
-#endif //MMX
-#else
-       //FIXME
-       int x;
-       for(x=0; x<8; x++)
-       {
-               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
-               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
-               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
-               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
-               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
-               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
-               src[stride*6] = (src[stride*6] +   src[stride*7])>>1;
-               src[stride*7] = src[stride*6];
-               src++;
-       }
-#endif
-}
-
 #ifdef HAVE_ODIVX_POSTPROCESS
 #include "../opendivx/postprocess.h"
 int use_old_pp=0;
@@ -2537,11 +2394,21 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
        static uint64_t *yHistogram= NULL;
        int black=0, white=255; // blackest black and whitest white in the picture
 
+       /* Temporary buffers for handling the last row(s) */
+       static uint8_t *tempDst= NULL;
+       static uint8_t *tempSrc= NULL;
+
 #ifdef TIMING
        long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
        sumTime= rdtsc();
 #endif
 
+       if(tempDst==NULL)
+       {
+               tempDst= (uint8_t*)memalign(8, 1024*24);
+               tempSrc= (uint8_t*)memalign(8, 1024*24);
+       }
+
        if(!yHistogram)
        {
                int i;
@@ -2569,7 +2436,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
 //             printf("\n\n");
 
                /* we allways get a completly black picture first */
-
                maxClipped= (uint64_t)(sum * maxClippedThreshold);
 
                clipped= sum;
@@ -2604,16 +2470,40 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                packedYOffset= 0;
        }
 
+       /* copy first row of 8x8 blocks */
        for(x=0; x<width; x+=BLOCK_SIZE)
                blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
 
-       for(y=0; y<height-7; y+=BLOCK_SIZE)
+       for(y=0; y<height; y+=BLOCK_SIZE)
        {
                //1% speedup if these are here instead of the inner loop
                uint8_t *srcBlock= &(src[y*srcStride]);
                uint8_t *dstBlock= &(dst[y*dstStride]);
-               uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
-               uint8_t *vertBlock= &(dstBlock[dstStride*3]);
+
+               /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
+                  than use a temporary buffer */
+               if(y+15 >= height)
+               {
+                       /* copy from line 5 to 12 of src, these will e copied with
+                          blockcopy to dst later */
+                       memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
+                               srcStride*MAX(height-y-5, 0) );
+
+                       /* duplicate last line to fill the void upto line 12 */
+                       if(y+12 >= height)
+                       {
+                               int i;
+                               for(i=height-y; i<=12; i++)
+                                       memcpy(tempSrc + srcStride*i,
+                                               src + srcStride*(height-1), srcStride);
+                       }
+
+
+                       /* copy up to 5 lines of dst */
+                       memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
+                       dstBlock= tempDst;
+                       srcBlock= tempSrc;
+               }
 
                // finish 1 block before the next otherwise we´ll might have a problem
                // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
@@ -2625,53 +2515,54 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                QPs[(y>>4)*QPStride + (x>>4)];
                        if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
 #ifdef HAVE_MMX
-               asm volatile(
-                       "movd %0, %%mm7                                 \n\t"
-                       "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
-                       "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
-                       "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
-                       "movq %%mm7, pQPb                               \n\t"
-                       : : "r" (QP)
-               );
+                       asm volatile(
+                               "movd %0, %%mm7                                 \n\t"
+                               "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
+                               "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
+                               "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
+                               "movq %%mm7, pQPb                               \n\t"
+                               : : "r" (QP)
+                       );
 #endif
 
-
-                       if(y + 12 < height)
-                       {
 #ifdef MORE_TIMING
-                               T0= rdtsc();
+                       T0= rdtsc();
 #endif
 
 #ifdef HAVE_MMX2
-                               prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
-                               prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
-                               prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
-                               prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
+                       prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
+                       prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
+                       prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
+                       prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
 #elif defined(HAVE_3DNOW)
 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
-/*                             prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
-                               prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
-                               prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
-                               prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
+/*                     prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
+                       prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
+                       prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
+                       prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
 */
 #endif
-                               if(!isColor) yHistogram[ srcBlock[0] ]++;
-
-                               blockCopy(vertBlock + dstStride*2, dstStride,
-                                       vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
-
-                               if(mode & LINEAR_IPOL_DEINT_FILTER)
-                                       deInterlaceInterpolateLinear(dstBlock, dstStride);
-                               else if(mode & LINEAR_BLEND_DEINT_FILTER)
-                                       deInterlaceBlendLinear(dstBlock, dstStride);
-                               else if(mode & MEDIAN_DEINT_FILTER)
-                                       deInterlaceMedian(dstBlock, dstStride);
-/*                             else if(mode & CUBIC_IPOL_DEINT_FILTER)
-                                       deInterlaceInterpolateCubic(dstBlock, dstStride);
-                               else if(mode & CUBIC_BLEND_DEINT_FILTER)
-                                       deInterlaceBlendCubic(dstBlock, dstStride);
+
+                       if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
+
+                       blockCopy(dstBlock + dstStride*5, dstStride,
+                               srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
+
+                       if(mode & LINEAR_IPOL_DEINT_FILTER)
+                               deInterlaceInterpolateLinear(dstBlock, dstStride);
+                       else if(mode & LINEAR_BLEND_DEINT_FILTER)
+                               deInterlaceBlendLinear(dstBlock, dstStride);
+                       else if(mode & MEDIAN_DEINT_FILTER)
+                               deInterlaceMedian(dstBlock, dstStride);
+                       else if(mode & CUBIC_IPOL_DEINT_FILTER)
+                               deInterlaceInterpolateCubic(dstBlock, dstStride);
+/*                     else if(mode & CUBIC_BLEND_DEINT_FILTER)
+                               deInterlaceBlendCubic(dstBlock, dstStride);
 */
 
+                       /* only deblock if we have 2 blocks */
+                       if(y + 8 < height)
+                       {
 #ifdef MORE_TIMING
                                T1= rdtsc();
                                memcpyTime+= T1-T0;
@@ -2680,18 +2571,18 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                if(mode & V_DEBLOCK)
                                {
                                        if(mode & V_RK1_FILTER)
-                                               vertRK1Filter(vertBlock, stride, QP);
+                                               vertRK1Filter(dstBlock, stride, QP);
                                        else if(mode & V_X1_FILTER)
-                                               vertX1Filter(vertBlock, stride, QP);
+                                               vertX1Filter(dstBlock, stride, QP);
                                        else
                                        {
-                                               if( isVertDC(vertBlock, stride))
+                                               if( isVertDC(dstBlock, stride))
                                                {
-                                                       if(isVertMinMaxOk(vertBlock, stride, QP))
-                                                               doVertLowPass(vertBlock, stride, QP);
+                                                       if(isVertMinMaxOk(dstBlock, stride, QP))
+                                                               doVertLowPass(dstBlock, stride, QP);
                                                }
                                                else
-                                                       doVertDefFilter(vertBlock, stride, QP);
+                                                       doVertDefFilter(dstBlock, stride, QP);
                                        }
                                }
 #ifdef MORE_TIMING
@@ -2700,24 +2591,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                T0=T1;
 #endif
                        }
-                       else
-                       {
-                               blockCopy(vertBlock + dstStride*1, dstStride,
-                                       vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
-
-                               if(mode & LINEAR_IPOL_DEINT_FILTER)
-                                       deInterlaceInterpolateLinearLastRow(dstBlock, dstStride);
-                               else if(mode & LINEAR_BLEND_DEINT_FILTER)
-                                       deInterlaceBlendLinearLastRow(dstBlock, dstStride);
-                               else if(mode & MEDIAN_DEINT_FILTER)
-                                       deInterlaceMedianLastRow(dstBlock, dstStride);
-/*                             else if(mode & CUBIC_IPOL_DEINT_FILTER)
-                                       deInterlaceInterpolateCubicLastRow(dstBlock, dstStride);
-                               else if(mode & CUBIC_BLEND_DEINT_FILTER)
-                                       deInterlaceBlendCubicLastRow(dstBlock, dstStride);
-*/
-                       }
 
+                       /* check if we have a previous block to deblock it with dstBlock */
                        if(x - 8 >= 0 && x<width)
                        {
 #ifdef MORE_TIMING
@@ -2749,11 +2624,15 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                dering(dstBlock - stride*9 + width-9, stride, QP);
                        //FIXME dering filter will not be applied to last block (bottom right)
 
-
                        dstBlock+=8;
                        srcBlock+=8;
-                       vertBlock+=8;
-                       vertSrcBlock+=8;
+               }
+
+               /* did we use a tmp buffer */
+               if(y+15 > height)
+               {
+                       uint8_t *dstBlock= &(dst[y*dstStride]);
+                       memcpy(dstBlock, tempDst, dstStride*(height-y) );
                }
        }
 #ifdef HAVE_3DNOW
@@ -2772,5 +2651,3 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                        , black, white);
 #endif
 }
-
-
index b8812756ebcb2f11b1636020a7a73f5e74c89441..e7eb248512c6e400f8c0ed27159bcab2547c32d6 100644 (file)
@@ -49,8 +49,8 @@
 //Deinterlacing Filters
 #define        LINEAR_IPOL_DEINT_FILTER        0x10000 // 65536
 #define        LINEAR_BLEND_DEINT_FILTER       0x20000 // 131072
-//#define      CUBIC_BLEND_DEINT_FILTER        0x8000  // (not implemented yet)
-#define        CUBIC_IPOL_DEINT_FILTER         0x40000 // 262144 (not implemented yet)
+#define        CUBIC_BLEND_DEINT_FILTER        0x8000  // (not implemented yet)
+#define        CUBIC_IPOL_DEINT_FILTER         0x40000 // 262144
 #define        MEDIAN_DEINT_FILTER             0x80000 // 524288
 
 
index 93603e0c3a57609b48060c30896a18b20d3d2247..33ebf42c3449d4601648d0ef3b3f421a16538bda 100644 (file)
@@ -30,14 +30,15 @@ deRing
 Vertical RKAlgo1       E               a       a
 Vertical X1            a               E       E
 Horizontal X1          a               E       E
-LinIpolDeinterlace     a               E       E*
-LinBlendDeinterlace    a               E       E*
+LinIpolDeinterlace     e               E       E*
+CubicIpolDeinterlace   a               e       e*
+LinBlendDeinterlace    e               E       E*
 MedianDeinterlace              Ec      Ec
 
 
 * i dont have a 3dnow CPU -> its untested
 E = Exact implementation
-e = allmost exact implementation
+e = allmost exact implementation (slightly different rounding,...)
 a = alternative / approximate impl
 c = checked against the other implementations (-vo md5)
 */
@@ -63,7 +64,6 @@ noise reduction filters
 
 Notes:
 
-
 */
 
 //Changelog: use the CVS log
@@ -178,12 +178,12 @@ static inline void prefetcht2(void *p)
 
 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
 /**
- * Check if the middle 8x8 Block in the given 8x10 block is flat
+ * Check if the middle 8x8 Block in the given 8x16 block is flat
  */
 static inline int isVertDC(uint8_t src[], int stride){
        int numEq= 0;
        int y;
-       src+= stride; // src points to begin of the 8x8 Block
+       src+= stride*4; // src points to begin of the 8x8 Block
 #ifdef HAVE_MMX
        asm volatile(
                "pushl %1\n\t"
@@ -295,6 +295,7 @@ static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 {
 #ifdef HAVE_MMX
        int isOk;
+       src+= stride*3;
        asm volatile(
 //             "int $3 \n\t"
                "movq (%1, %2), %%mm0                           \n\t"
@@ -320,6 +321,7 @@ static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 
        int isOk2= 1;
        int x;
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
@@ -343,19 +345,16 @@ static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 }
 
 /**
- * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
+ * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
  */
 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 {
-//     QP= 64;
-
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-//#ifdef HAVE_MMX2
+       src+= stride*3;
        asm volatile(   //"movv %0 %1 %2\n\t"
                "pushl %0 \n\t"
                "movq pQPb, %%mm0                               \n\t"  // QP,..., QP
-//             "movq bFF  , %%mm0                              \n\t"  // QP,..., QP
 
                "movq (%0), %%mm6                               \n\t"
                "movq (%0, %1), %%mm5                           \n\t"
@@ -395,20 +394,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
                // 6 4 2 2 1 1
                // 6 4 4 2
                // 6 8 2
-/*
-               "movq %%mm6, %%mm2                              \n\t" //1
-               "movq %%mm6, %%mm3                              \n\t" //1
-               "paddusb b02, %%mm3                             \n\t"
-               "psrlw $2, %%mm3                                \n\t" //1       /4
-               "pand b3F, %%mm3                                \n\t"
-               "psubb %%mm3, %%mm2                             \n\t"
-               "movq (%0, %1), %%mm0                           \n\t" //  1
-               "movq %%mm0, %%mm1                              \n\t" //  1
-               "paddusb b02, %%mm0                             \n\t"
-               "psrlw $2, %%mm0                                \n\t" //  1     /4
-               "pand b3F, %%mm0                                \n\t"
-               "paddusb %%mm2, %%mm0                           \n\t" //3 1     /4
-*/
+
                "movq (%0, %1), %%mm0                           \n\t" //  1
                "movq %%mm0, %%mm1                              \n\t" //  1
                PAVGB(%%mm6, %%mm0)                                   //1 1     /2
@@ -470,7 +456,6 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
                "movq (%%eax, %1, 2), %%mm6                     \n\t" //      1
                PAVGB(%%mm6, %%mm1)                                   //  11  4  2      /8
                PAVGB(%%mm0, %%mm1)                                   //  11224222      /16
-//             "pxor %%mm1, %%mm1 \n\t"
                "movq %%mm1, (%%eax, %1, 2)                     \n\t" //      X
                // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
                PAVGB((%%ebx), %%mm2)                                 //   112 4        /8
@@ -478,7 +463,6 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
                PAVGB(%%mm0, %%mm6)                                   //      1 1       /2
                PAVGB(%%mm7, %%mm6)                                   //      1 12      /4
                PAVGB(%%mm2, %%mm6)                                   //   1122424      /4
-//             "pxor %%mm6, %%mm6 \n\t"
                "movq %%mm6, (%%ebx)                            \n\t" //       X
                // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
                PAVGB(%%mm7, %%mm5)                                   //    11   2      /4
@@ -486,8 +470,6 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 
                PAVGB(%%mm3, %%mm0)                                   //      112       /4
                PAVGB(%%mm0, %%mm5)                                   //    112246      /16
-//             "pxor %%mm5, %%mm5 \n\t"
-//             "movq pQPb, %%mm5 \n\t"
                "movq %%mm5, (%%eax, %1, 4)                     \n\t" //        X
                "popl %0\n\t"
 
@@ -506,6 +488,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
        const int l8= stride + l7;
        const int l9= stride + l8;
        int x;
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
@@ -551,6 +534,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       src+= stride*3;
 // FIXME rounding
        asm volatile(
                "pxor %%mm7, %%mm7                              \n\t" // 0
@@ -622,6 +606,7 @@ static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
        const int l8= stride + l7;
        const int l9= stride + l8;
        int x;
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                if(ABS(src[l4]-src[l5]) < QP + QP/4)
@@ -650,6 +635,8 @@ static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       src+= stride*3;
+
        asm volatile(
                "pxor %%mm7, %%mm7                              \n\t" // 0
 //             "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
@@ -744,6 +731,8 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
        const int l8= stride + l7;
        const int l9= stride + l8;
        int x;
+
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                int a= src[l3] - src[l4];
@@ -1007,7 +996,7 @@ HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
 {
 #ifdef HAVE_MMX
-       src+= stride;
+       src+= stride*4;
        //FIXME try pmul for *5 stuff
 //     src[0]=0;
        asm volatile(
@@ -1154,7 +1143,6 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
                "psubw %%mm7, %%mm5                             \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
 // 100 opcodes
                "movd %2, %%mm2                                 \n\t" // QP
-//"pcmpeqb %%mm2, %%mm2\n\t"
                "punpcklwd %%mm2, %%mm2                         \n\t"
                "punpcklwd %%mm2, %%mm2                         \n\t"
                "psllw $3, %%mm2                                \n\t" // 8QP
@@ -1232,7 +1220,6 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
                "movq %%mm0, (%%eax, %1, 2)                     \n\t"
                "movq (%0, %1, 4), %%mm0                        \n\t"
                "psubb %%mm4, %%mm0                             \n\t"
-//             "pxor %%mm0, %%mm0 \n\t"
                "movq %%mm0, (%0, %1, 4)                        \n\t"
 
                :
@@ -1250,6 +1237,7 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
        const int l8= stride + l7;
 //     const int l9= stride + l8;
        int x;
+       src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++)
        {
                const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
@@ -1881,7 +1869,7 @@ FIND_MIN_MAX(%%ebx, %1, 2)
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
  */
 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
 {
@@ -1894,16 +1882,16 @@ static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
 
                "movq (%0), %%mm0                               \n\t"
                "movq (%%eax, %1), %%mm1                        \n\t"
-               PAVGB(%%mm1, %%mm0)\
+               PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm0                        \n\t"
-               PAVGB(%%mm0, %%mm1)\
+               PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm1                        \n\t"
-               PAVGB(%%mm1, %%mm0)\
+               PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm0                        \n\t"
-               PAVGB(%%mm0, %%mm1)\
+               PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%ebx, %1, 2)                     \n\t"
 
                : : "r" (src), "r" (stride)
@@ -1924,41 +1912,59 @@ static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
+ * no cliping in C version
  */
-static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride)
+static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
 {
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
-//     0       1       2       3       4       5       6       7       8       9
-//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
-
-               "movq (%0), %%mm0                               \n\t"
-               "movq (%%eax, %1), %%mm1                        \n\t"
-               PAVGB(%%mm1, %%mm0)\
-               "movq %%mm0, (%%eax)                            \n\t"
-               "movq (%0, %1, 4), %%mm0                        \n\t"
-               PAVGB(%%mm0, %%mm1)\
-               "movq %%mm1, (%%eax, %1, 2)                     \n\t"
-               "movq (%%ebx, %1), %%mm1                        \n\t"
-               PAVGB(%%mm1, %%mm0)\
-               "movq %%mm0, (%%ebx)                            \n\t"
-               "movq %%mm1, (%%ebx, %1, 2)                     \n\t"
+               "leal (%%ebx, %1, 4), %%ecx                     \n\t"
+               "addl %1, %%ecx                                 \n\t"
+               "pxor %%mm7, %%mm7                              \n\t"
+//     0       1       2       3       4       5       6       7       8       9       10
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1 ecx
 
+#define DEINT_CUBIC(a,b,c,d,e)\
+               "movq " #a ", %%mm0                             \n\t"\
+               "movq " #b ", %%mm1                             \n\t"\
+               "movq " #d ", %%mm2                             \n\t"\
+               "movq " #e ", %%mm3                             \n\t"\
+               PAVGB(%%mm2, %%mm1)                                     /* (b+d) /2 */\
+               PAVGB(%%mm3, %%mm0)                                     /* a(a+e) /2 */\
+               "movq %%mm0, %%mm2                              \n\t"\
+               "punpcklbw %%mm7, %%mm0                         \n\t"\
+               "punpckhbw %%mm7, %%mm2                         \n\t"\
+               "movq %%mm1, %%mm3                              \n\t"\
+               "punpcklbw %%mm7, %%mm1                         \n\t"\
+               "punpckhbw %%mm7, %%mm3                         \n\t"\
+               "psubw %%mm1, %%mm0                             \n\t"   /* L(a+e - (b+d))/2 */\
+               "psubw %%mm3, %%mm2                             \n\t"   /* H(a+e - (b+d))/2 */\
+               "psraw $3, %%mm0                                \n\t"   /* L(a+e - (b+d))/16 */\
+               "psraw $3, %%mm2                                \n\t"   /* H(a+e - (b+d))/16 */\
+               "psubw %%mm0, %%mm1                             \n\t"   /* L(9b + 9d - a - e)/16 */\
+               "psubw %%mm2, %%mm3                             \n\t"   /* H(9b + 9d - a - e)/16 */\
+               "packuswb %%mm3, %%mm1                          \n\t"\
+               "movq %%mm1, " #c "                             \n\t"
+
+DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
+DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
+DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
+DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
 
                : : "r" (src), "r" (stride)
-               : "%eax", "%ebx"
+               : "%eax", "%ebx", "ecx"
        );
 #else
        int x;
        for(x=0; x<8; x++)
        {
-               src[stride]   = (src[0]        + src[stride*2])>>1;
-               src[stride*3] = (src[stride*2] + src[stride*4])>>1;
-               src[stride*5] = (src[stride*4] + src[stride*6])>>1;
-               src[stride*7] = src[stride*6];
+               src[stride*3] = (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
+               src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
+               src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
+               src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
                src++;
        }
 #endif
@@ -1966,7 +1972,7 @@ static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride
 
 /**
  * Deinterlaces the given block
- * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * will be called for every 8x8 block, and can read & write into an 8x16 block
  * will shift the image up by 1 line (FIXME if this is a problem)
  */
 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
@@ -2034,70 +2040,6 @@ static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
 #endif
 }
 
-/**
- * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- * will shift the image up by 1 line (FIXME if this is a problem)
- */
-static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride)
-{
-#if defined (HAVE_MMSX2) || defined (HAVE_3DNOW)
-       asm volatile(
-               "leal (%0, %1), %%eax                           \n\t"
-               "leal (%%eax, %1, 4), %%ebx                     \n\t"
-//     0       1       2       3       4       5       6       7       8       9
-//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
-
-               "movq (%0), %%mm0                               \n\t" // L0
-               "movq (%%eax, %1), %%mm1                        \n\t" // L2
-               PAVGB(%%mm1, %%mm0)                                   // L0+L2
-               "movq (%%eax), %%mm2                            \n\t" // L1
-               PAVGB(%%mm2, %%mm0)
-               "movq %%mm0, (%0)                               \n\t"
-               "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
-               PAVGB(%%mm0, %%mm2)                                   // L1+L3
-               PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
-               "movq %%mm2, (%%eax)                            \n\t"
-               "movq (%0, %1, 4), %%mm2                        \n\t" // L4
-               PAVGB(%%mm2, %%mm1)                                   // L2+L4
-               PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
-               "movq %%mm1, (%%eax, %1)                        \n\t"
-               "movq (%%ebx), %%mm1                            \n\t" // L5
-               PAVGB(%%mm1, %%mm0)                                   // L3+L5
-               PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
-               "movq %%mm0, (%%eax, %1, 2)                     \n\t"
-               "movq (%%ebx, %1), %%mm0                        \n\t" // L6
-               PAVGB(%%mm0, %%mm2)                                   // L4+L6
-               PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
-               "movq %%mm2, (%0, %1, 4)                        \n\t"
-               "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
-               PAVGB(%%mm2, %%mm1)                                   // L5+L7
-               PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
-               "movq %%mm1, (%%ebx)                            \n\t"
-               PAVGB(%%mm2, %%mm0)                                   // L7 + L8
-               "movq %%mm0, (%%ebx, %1)                        \n\t"
-               "movq %%mm0, (%%ebx, %1, 2)                     \n\t"
-
-               : : "r" (src), "r" (stride)
-               : "%eax", "%ebx"
-       );
-#else
-       int x;
-       for(x=0; x<8; x++)
-       {
-               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
-               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
-               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
-               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
-               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
-               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
-               src[stride*6] = (src[stride*6] +   src[stride*7])>>1;
-               src[stride*7] = src[stride*6];
-               src++;
-       }
-#endif
-}
-
 /**
  * Deinterlaces the given block
  * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
@@ -2213,91 +2155,6 @@ MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
 #endif
 }
 
-/**
- * Deinterlaces the given block
- * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
- */
-static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
-{
-#ifdef HAVE_MMX
-#ifdef HAVE_MMX2
-       asm volatile(
-               "leal (%0, %1), %%eax                           \n\t"
-               "leal (%%eax, %1, 4), %%ebx                     \n\t"
-//     0       1       2       3       4       5       6       7       8       9
-//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
-
-               "movq (%0), %%mm0                               \n\t" //
-               "movq (%%eax, %1), %%mm2                        \n\t" //
-               "movq (%%eax), %%mm1                            \n\t" //
-               "movq %%mm0, %%mm3                              \n\t"
-               "pmaxub %%mm1, %%mm0                            \n\t" //
-               "pminub %%mm3, %%mm1                            \n\t" //
-               "pmaxub %%mm2, %%mm1                            \n\t" //
-               "pminub %%mm1, %%mm0                            \n\t"
-               "movq %%mm0, (%%eax)                            \n\t"
-
-               "movq (%0, %1, 4), %%mm0                        \n\t" //
-               "movq (%%eax, %1, 2), %%mm1                     \n\t" //
-               "movq %%mm2, %%mm3                              \n\t"
-               "pmaxub %%mm1, %%mm2                            \n\t" //
-               "pminub %%mm3, %%mm1                            \n\t" //
-               "pmaxub %%mm0, %%mm1                            \n\t" //
-               "pminub %%mm1, %%mm2                            \n\t"
-               "movq %%mm2, (%%eax, %1, 2)                     \n\t"
-
-               "movq (%%ebx), %%mm2                            \n\t" //
-               "movq (%%ebx, %1), %%mm1                        \n\t" //
-               "movq %%mm2, %%mm3                              \n\t"
-               "pmaxub %%mm0, %%mm2                            \n\t" //
-               "pminub %%mm3, %%mm0                            \n\t" //
-               "pmaxub %%mm1, %%mm0                            \n\t" //
-               "pminub %%mm0, %%mm2                            \n\t"
-               "movq %%mm2, (%%ebx)                            \n\t"
-
-               "movq %%mm1, (%%ebx, %1, 2)                     \n\t"
-
-               : : "r" (src), "r" (stride)
-               : "%eax", "%ebx"
-       );
-#else //MMX & no MMX2
-asm volatile(
-               "leal (%0, %1), %%eax                           \n\t"
-               "leal (%%eax, %1, 4), %%ebx                     \n\t"
-//     0       1       2       3       4       5       6       7       8       9
-//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
-               "pxor %%mm7, %%mm7                              \n\t"
-
-MEDIAN((%0), (%%eax), (%%eax, %1))
-MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
-MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
-
-               "movq (%%ebx, %1), %%mm0                        \n\t"
-               "movq %%mm0, (%%ebx, %1, 2)                     \n\t"
-
-               : : "r" (src), "r" (stride)
-               : "%eax", "%ebx"
-       );
-
-#endif //MMX
-#else
-       //FIXME
-       int x;
-       for(x=0; x<8; x++)
-       {
-               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
-               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
-               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
-               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
-               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
-               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
-               src[stride*6] = (src[stride*6] +   src[stride*7])>>1;
-               src[stride*7] = src[stride*6];
-               src++;
-       }
-#endif
-}
-
 #ifdef HAVE_ODIVX_POSTPROCESS
 #include "../opendivx/postprocess.h"
 int use_old_pp=0;
@@ -2537,11 +2394,21 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
        static uint64_t *yHistogram= NULL;
        int black=0, white=255; // blackest black and whitest white in the picture
 
+       /* Temporary buffers for handling the last row(s) */
+       static uint8_t *tempDst= NULL;
+       static uint8_t *tempSrc= NULL;
+
 #ifdef TIMING
        long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
        sumTime= rdtsc();
 #endif
 
+       if(tempDst==NULL)
+       {
+               tempDst= (uint8_t*)memalign(8, 1024*24);
+               tempSrc= (uint8_t*)memalign(8, 1024*24);
+       }
+
        if(!yHistogram)
        {
                int i;
@@ -2569,7 +2436,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
 //             printf("\n\n");
 
                /* we allways get a completly black picture first */
-
                maxClipped= (uint64_t)(sum * maxClippedThreshold);
 
                clipped= sum;
@@ -2604,16 +2470,40 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                packedYOffset= 0;
        }
 
+       /* copy first row of 8x8 blocks */
        for(x=0; x<width; x+=BLOCK_SIZE)
                blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
 
-       for(y=0; y<height-7; y+=BLOCK_SIZE)
+       for(y=0; y<height; y+=BLOCK_SIZE)
        {
                //1% speedup if these are here instead of the inner loop
                uint8_t *srcBlock= &(src[y*srcStride]);
                uint8_t *dstBlock= &(dst[y*dstStride]);
-               uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
-               uint8_t *vertBlock= &(dstBlock[dstStride*3]);
+
+               /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
+                  than use a temporary buffer */
+               if(y+15 >= height)
+               {
+                       /* copy from line 5 to 12 of src, these will e copied with
+                          blockcopy to dst later */
+                       memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
+                               srcStride*MAX(height-y-5, 0) );
+
+                       /* duplicate last line to fill the void upto line 12 */
+                       if(y+12 >= height)
+                       {
+                               int i;
+                               for(i=height-y; i<=12; i++)
+                                       memcpy(tempSrc + srcStride*i,
+                                               src + srcStride*(height-1), srcStride);
+                       }
+
+
+                       /* copy up to 5 lines of dst */
+                       memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
+                       dstBlock= tempDst;
+                       srcBlock= tempSrc;
+               }
 
                // finish 1 block before the next otherwise we´ll might have a problem
                // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
@@ -2625,53 +2515,54 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                QPs[(y>>4)*QPStride + (x>>4)];
                        if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
 #ifdef HAVE_MMX
-               asm volatile(
-                       "movd %0, %%mm7                                 \n\t"
-                       "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
-                       "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
-                       "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
-                       "movq %%mm7, pQPb                               \n\t"
-                       : : "r" (QP)
-               );
+                       asm volatile(
+                               "movd %0, %%mm7                                 \n\t"
+                               "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
+                               "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
+                               "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
+                               "movq %%mm7, pQPb                               \n\t"
+                               : : "r" (QP)
+                       );
 #endif
 
-
-                       if(y + 12 < height)
-                       {
 #ifdef MORE_TIMING
-                               T0= rdtsc();
+                       T0= rdtsc();
 #endif
 
 #ifdef HAVE_MMX2
-                               prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
-                               prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
-                               prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
-                               prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
+                       prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
+                       prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
+                       prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
+                       prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
 #elif defined(HAVE_3DNOW)
 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
-/*                             prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
-                               prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
-                               prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
-                               prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
+/*                     prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
+                       prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
+                       prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
+                       prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
 */
 #endif
-                               if(!isColor) yHistogram[ srcBlock[0] ]++;
-
-                               blockCopy(vertBlock + dstStride*2, dstStride,
-                                       vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
-
-                               if(mode & LINEAR_IPOL_DEINT_FILTER)
-                                       deInterlaceInterpolateLinear(dstBlock, dstStride);
-                               else if(mode & LINEAR_BLEND_DEINT_FILTER)
-                                       deInterlaceBlendLinear(dstBlock, dstStride);
-                               else if(mode & MEDIAN_DEINT_FILTER)
-                                       deInterlaceMedian(dstBlock, dstStride);
-/*                             else if(mode & CUBIC_IPOL_DEINT_FILTER)
-                                       deInterlaceInterpolateCubic(dstBlock, dstStride);
-                               else if(mode & CUBIC_BLEND_DEINT_FILTER)
-                                       deInterlaceBlendCubic(dstBlock, dstStride);
+
+                       if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
+
+                       blockCopy(dstBlock + dstStride*5, dstStride,
+                               srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
+
+                       if(mode & LINEAR_IPOL_DEINT_FILTER)
+                               deInterlaceInterpolateLinear(dstBlock, dstStride);
+                       else if(mode & LINEAR_BLEND_DEINT_FILTER)
+                               deInterlaceBlendLinear(dstBlock, dstStride);
+                       else if(mode & MEDIAN_DEINT_FILTER)
+                               deInterlaceMedian(dstBlock, dstStride);
+                       else if(mode & CUBIC_IPOL_DEINT_FILTER)
+                               deInterlaceInterpolateCubic(dstBlock, dstStride);
+/*                     else if(mode & CUBIC_BLEND_DEINT_FILTER)
+                               deInterlaceBlendCubic(dstBlock, dstStride);
 */
 
+                       /* only deblock if we have 2 blocks */
+                       if(y + 8 < height)
+                       {
 #ifdef MORE_TIMING
                                T1= rdtsc();
                                memcpyTime+= T1-T0;
@@ -2680,18 +2571,18 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                if(mode & V_DEBLOCK)
                                {
                                        if(mode & V_RK1_FILTER)
-                                               vertRK1Filter(vertBlock, stride, QP);
+                                               vertRK1Filter(dstBlock, stride, QP);
                                        else if(mode & V_X1_FILTER)
-                                               vertX1Filter(vertBlock, stride, QP);
+                                               vertX1Filter(dstBlock, stride, QP);
                                        else
                                        {
-                                               if( isVertDC(vertBlock, stride))
+                                               if( isVertDC(dstBlock, stride))
                                                {
-                                                       if(isVertMinMaxOk(vertBlock, stride, QP))
-                                                               doVertLowPass(vertBlock, stride, QP);
+                                                       if(isVertMinMaxOk(dstBlock, stride, QP))
+                                                               doVertLowPass(dstBlock, stride, QP);
                                                }
                                                else
-                                                       doVertDefFilter(vertBlock, stride, QP);
+                                                       doVertDefFilter(dstBlock, stride, QP);
                                        }
                                }
 #ifdef MORE_TIMING
@@ -2700,24 +2591,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                T0=T1;
 #endif
                        }
-                       else
-                       {
-                               blockCopy(vertBlock + dstStride*1, dstStride,
-                                       vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
-
-                               if(mode & LINEAR_IPOL_DEINT_FILTER)
-                                       deInterlaceInterpolateLinearLastRow(dstBlock, dstStride);
-                               else if(mode & LINEAR_BLEND_DEINT_FILTER)
-                                       deInterlaceBlendLinearLastRow(dstBlock, dstStride);
-                               else if(mode & MEDIAN_DEINT_FILTER)
-                                       deInterlaceMedianLastRow(dstBlock, dstStride);
-/*                             else if(mode & CUBIC_IPOL_DEINT_FILTER)
-                                       deInterlaceInterpolateCubicLastRow(dstBlock, dstStride);
-                               else if(mode & CUBIC_BLEND_DEINT_FILTER)
-                                       deInterlaceBlendCubicLastRow(dstBlock, dstStride);
-*/
-                       }
 
+                       /* check if we have a previous block to deblock it with dstBlock */
                        if(x - 8 >= 0 && x<width)
                        {
 #ifdef MORE_TIMING
@@ -2749,11 +2624,15 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                dering(dstBlock - stride*9 + width-9, stride, QP);
                        //FIXME dering filter will not be applied to last block (bottom right)
 
-
                        dstBlock+=8;
                        srcBlock+=8;
-                       vertBlock+=8;
-                       vertSrcBlock+=8;
+               }
+
+               /* did we use a tmp buffer */
+               if(y+15 > height)
+               {
+                       uint8_t *dstBlock= &(dst[y*dstStride]);
+                       memcpy(dstBlock, tempDst, dstStride*(height-y) );
                }
        }
 #ifdef HAVE_3DNOW
@@ -2772,5 +2651,3 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                        , black, white);
 #endif
 }
-
-