git.sesse.net Git - ffmpeg/blobdiff - postproc/swscale.c
AltiVec hScale, all size patch by (Romain Dolbeau <dolbeaur at club-internet dot...
[ffmpeg] / postproc / swscale.c
index d46e6167dcbf01301a171f189ea0f21c1d91d679..dd10521582ceaeee8204cc500b519b645a0cd474 100644 (file)
@@ -61,6 +61,9 @@ untested special converters
 #else
 #include <stdlib.h>
 #endif
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
 #include "swscale.h"
 #include "swscale_internal.h"
 #include "../cpudetect.h"
@@ -673,6 +676,12 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
 #define COMPILE_C
 #endif
 
+#ifdef ARCH_POWERPC
+#ifdef HAVE_ALTIVEC
+#define COMPILE_ALTIVEC
+#endif //HAVE_ALTIVEC
+#endif //ARCH_POWERPC
+
 #ifdef ARCH_X86
 
 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
@@ -696,10 +705,20 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
+#undef HAVE_ALTIVEC
 #define RENAME(a) a ## _C
 #include "swscale_template.c"
 #endif
 
+#ifdef ARCH_POWERPC
+#ifdef COMPILE_ALTIVEC
+#undef RENAME
+#define HAVE_ALTIVEC
+#define RENAME(a) a ## _altivec
+#include "swscale_template.c"
+#endif
+#endif //ARCH_POWERPC
+
 #ifdef ARCH_X86
 
 //X86 versions
@@ -1022,6 +1041,21 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
                if(min>minFilterSize) minFilterSize= min;
        }
 
+        if (flags & SWS_CPU_CAPS_ALTIVEC) {
+          // we can handle the special case of size 4,
+          // so we don't want to round up to the full 8
+          if (minFilterSize < 5)
+            filterAlign = 4;
+
+          // We really don't want to waste our time
+          // doing useless computation, so fall back on
+          // the scalar C code for very small filters;
+          // vectorizing is worth it only if you have
+          // a decent-sized vector.
+          if (minFilterSize < 3)
+            filterAlign = 1;
+        }
+
        ASSERT(minFilterSize > 0)
        filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
        ASSERT(filterSize > 0)
@@ -1309,6 +1343,12 @@ static SwsFunc getSwsFunc(int flags){
                return swScale_C;
 
 #else
+#ifdef COMPILE_ALTIVEC //swScale_altivec only exists if the AltiVec template was compiled
+       if(flags & SWS_CPU_CAPS_ALTIVEC)
+         return swScale_altivec;
+       else
+         return swScale_C;
+#endif //COMPILE_ALTIVEC
        return swScale_C;
 #endif
 #else //RUNTIME_CPUDETECT
@@ -1318,6 +1358,8 @@ static SwsFunc getSwsFunc(int flags){
        return swScale_3DNow;
 #elif defined (HAVE_MMX)
        return swScale_MMX;
+#elif defined (HAVE_ALTIVEC)
+       return swScale_altivec;
 #else
        return swScale_C;
 #endif
@@ -1710,7 +1752,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
 
        SwsContext *c;
        int i;
-       int usesFilter;
+       int usesVFilter, usesHFilter;
        int unscaled, needsDither;
        int srcFormat, dstFormat;
        SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
@@ -1720,13 +1762,15 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
 #endif
 
 #ifndef RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off
-       flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW);
+       flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC);
 #ifdef HAVE_MMX2
        flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
 #elif defined (HAVE_3DNOW)
        flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW;
 #elif defined (HAVE_MMX)
        flags |= SWS_CPU_CAPS_MMX;
+#elif defined (HAVE_ALTIVEC)
+       flags |= SWS_CPU_CAPS_ALTIVEC;
 #endif
 #endif
        if(clip_table[512] != 255) globalInit();
@@ -1779,15 +1823,15 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
        c->origSrcFormat= origSrcFormat;
         c->vRounder= 4* 0x0001000100010001ULL;
 
-       usesFilter=0;
-       if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
-       if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
-       if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
-       if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
-       if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
-       if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
-       if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
-       if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
+       usesHFilter= usesVFilter= 0;
+       if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesVFilter=1;
+       if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesHFilter=1;
+       if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesVFilter=1;
+       if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesHFilter=1;
+       if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesVFilter=1;
+       if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesHFilter=1;
+       if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesVFilter=1;
+       if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesHFilter=1;
 
        getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
        getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
@@ -1815,7 +1859,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
        sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], 0, Inverse_Table_6_9[SWS_CS_DEFAULT] /* FIXME*/, 0, 0, 1<<16, 1<<16); 
 
        /* unscaled special Cases */
-       if(unscaled && !usesFilter)
+       if(unscaled && !usesHFilter && !usesVFilter)
        {
                /* yv12_to_nv12 */
                if(srcFormat == IMGFMT_YV12 && dstFormat == IMGFMT_NV12)
@@ -1887,6 +1931,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
                        if(flags&SWS_PRINT_INFO)
                                MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
                }
+               if(usesHFilter) c->canMMX2BeUsed=0;
        }
        else
                c->canMMX2BeUsed=0;
@@ -1917,7 +1962,10 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
 
        /* precalculate horizontal scaler filter coefficients */
        {
-               const int filterAlign= (flags & SWS_CPU_CAPS_MMX) ? 4 : 1;
+               const int filterAlign=
+                 (flags & SWS_CPU_CAPS_MMX) ? 4 :
+                 (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
+                 1;
 
                initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
                                 srcW      ,       dstW, filterAlign, 1<<14,
@@ -1946,14 +1994,20 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
 
 
        /* precalculate vertical scaler filter coefficients */
-       initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
-                       srcH      ,        dstH, 1, (1<<12)-4,
-                       (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
-                       srcFilter->lumV, dstFilter->lumV);
-       initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
-                       c->chrSrcH, c->chrDstH, 1, (1<<12)-4,
-                       (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
-                       srcFilter->chrV, dstFilter->chrV);
+       {
+               const int filterAlign=
+                 (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
+                 1;
+
+               initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
+                               srcH      ,        dstH, filterAlign, (1<<12)-4,
+                               (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
+                               srcFilter->lumV, dstFilter->lumV);
+               initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
+                               c->chrSrcH, c->chrDstH, filterAlign, (1<<12)-4,
+                               (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
+                               srcFilter->chrV, dstFilter->chrV);
+       }
 
        // Calculate Buffer Sizes so that they won't run out while handling these damn slices
        c->vLumBufSize= c->vLumFilterSize;
@@ -1963,10 +2017,9 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
                int chrI= i*c->chrDstH / dstH;
                int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
                                 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample));
-               if(c->chrSrcVSubSample > 1) 
-                   nextSlice&= ~3; // Slices start at boundaries which are divisable through 4
-               else
-                   nextSlice&= ~1; // Slices start at boundaries which are divisable through 2
+
+               nextSlice>>= c->chrSrcVSubSample;
+               nextSlice<<= c->chrSrcVSubSample;
                if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
                        c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
                if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>c->chrSrcVSubSample))
@@ -2033,7 +2086,9 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
                        MSG_INFO("using 3DNOW\n");
                else if(flags & SWS_CPU_CAPS_MMX)
                        MSG_INFO("using MMX\n");
-               else
+               else if(flags & SWS_CPU_CAPS_ALTIVEC)
+                       MSG_INFO("using AltiVec\n");
+               else 
                        MSG_INFO("using C\n");
        }
 
@@ -2119,7 +2174,10 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
  */
 int sws_scale_ordered(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                            int srcSliceH, uint8_t* dst[], int dstStride[]){
-       return c->swScale(c, src, srcStride, srcSliceY, srcSliceH, dst, dstStride);
+       //pass local copies of the strides, so they can safely be modified without touching the caller's arrays
+       int srcStride2[3]= {srcStride[0], srcStride[1], srcStride[2]};
+       int dstStride2[3]= {dstStride[0], dstStride[1], dstStride[2]};
+       return c->swScale(c, src, srcStride2, srcSliceY, srcSliceH, dst, dstStride2);
 }
 
 /**