* You should have received a copy of the GNU General Public License
* along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-*/
+ */
/**
* @file postprocess_template.c
"paddb " #a ", " #b " \n\t"
#endif
-//FIXME? |255-0| = 1 (shouldnt be a problem ...)
+//FIXME? |255-0| = 1 (should not be a problem ...)
#ifdef HAVE_MMX
/**
* Check if the middle 8x8 Block in the given 8x16 block is flat
src+= stride*3;
for(x=0; x<BLOCK_SIZE; x++)
{
- const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
- const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
+ const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
+ const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
int sums[10];
sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
* Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
* values are correctly clipped (MMX2)
* values are wraparound (C)
- * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
+ * Conclusion: It is fast, but introduces ugly horizontal patterns
+ * if there is a continuous gradient.
0 8 16 24
x = 8
x/2 = 4
for(x=0; x<BLOCK_SIZE; x++)
{
const int v = (src[x+l5] - src[x+l4]);
- if(ABS(v) < QP15)
+ if(FFABS(v) < QP15)
{
src[x+l3] +=v>>3;
src[x+l4] +=v>>1;
* Experimental Filter 1
* will not damage linear gradients
* Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
- * can only smooth blocks at the expected locations (it cant smooth them if they did move)
- * MMX2 version does correct clipping C version doesnt
+ * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
+ * MMX2 version does correct clipping C version does not
*/
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
{
int b= src[l4] - src[l5];
int c= src[l5] - src[l6];
- int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
+ int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
d= FFMAX(d, 0);
if(d < co->QP*2)
for(x=0; x<BLOCK_SIZE; x++)
{
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
- if(ABS(middleEnergy)< 8*QP)
+ if(FFABS(middleEnergy)< 8*QP)
{
const int q=(src[l4] - src[l5])/2;
const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
- int d= ABS(middleEnergy) - FFMIN( ABS(leftEnergy), ABS(rightEnergy) );
+ int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
d= FFMAX(d, 0);
d= (5*d + 32) >> 6;
for(y=4; y<6; y++)
{
int d= src[x+y*stride] - tmp[x+(y-4)*8];
- int ad= ABS(d);
+ int ad= FFABS(d);
static int max=0;
static int sum=0;
static int num=0;
for(x=0; x<BLOCK_SIZE; x++)
{
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
- if(ABS(middleEnergy) < 8*c->QP)
+ if(FFABS(middleEnergy) < 8*c->QP)
{
const int q=(src[l4] - src[l5])/2;
const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
- int d= ABS(middleEnergy) - FFMIN( ABS(leftEnergy), ABS(rightEnergy) );
+ int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
d= FFMAX(d, 0);
d= (5*d + 32) >> 6;
static int worstRange=0;
static int worstDiff=0;
int diff= (f - *p);
- int absDiff= ABS(diff);
+ int absDiff= FFABS(diff);
int error= diff*diff;
if(x==1 || x==8 || y==1 || y==8) continue;
int d1=ref - cur;
// if(x==0 || x==7) d1+= d1>>1;
// if(y==0 || y==7) d1+= d1>>1;
-// d+= ABS(d1);
+// d+= FFABS(d1);
d+= d1*d1;
// sysd+= d1;
}
/**
* accurate deblock filter
*/
-static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
+static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
int64_t dc_mask, eq_mask, both_masks;
int64_t sums[10*8*2];
src+= step*3; // src points to begin of the 8x8 Block
}
#endif //HAVE_MMX
-static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
- QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
+static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
+ const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
/**
- * Copies a block from src to dst and fixes the blacklevel
- * levelFix == 0 -> dont touch the brighness & contrast
+ * Copies a block from src to dst and fixes the blacklevel.
+ * levelFix == 0 -> do not touch the brighness & contrast
*/
#undef SCALED_CPY
-static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
+static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
int levelFix, int64_t *packedOffsetAndScale)
{
#ifndef HAVE_MMX
/**
* Filters array of bytes (Y or U or V values)
*/
-static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
- QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
+static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
+ const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
{
- PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
+ DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
int x,y;
#ifdef COMPILE_TIME_MODE
const int mode= COMPILE_TIME_MODE;
sum+= yHistogram[i];
}
- /* we allways get a completly black picture first */
+ /* We always get a completely black picture first. */
maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
clipped= sum;
/* copy & deinterlace first row of blocks */
y=-BLOCK_SIZE;
{
- uint8_t *srcBlock= &(src[y*srcStride]);
+ const uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= tempDst + dstStride;
- // From this point on it is guranteed that we can read and write 16 lines downward
+ // From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
for(x=0; x<width; x+=BLOCK_SIZE)
);
#elif defined(HAVE_3DNOW)
-//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
+//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
dstBlock+=8;
srcBlock+=8;
}
- if(width==ABS(dstStride))
+ if(width==FFABS(dstStride))
linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
else
{
for(y=0; y<height; y+=BLOCK_SIZE)
{
//1% speedup if these are here instead of the inner loop
- uint8_t *srcBlock= &(src[y*srcStride]);
+ const uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
#ifdef HAVE_MMX
uint8_t *tempBlock1= c.tempBlocks;
uint8_t *tempBlock2= c.tempBlocks + 8;
#endif
- int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
- int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*ABS(QPStride)];
+ const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
+ int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
int QP=0;
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
if not than use a temporary buffer */
/* duplicate last line of src to fill the void upto line (copyAhead+7) */
for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
- memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), ABS(srcStride));
+ memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
/* duplicate last line of dst to fill the void upto line (copyAhead) */
for(i=height-y+1; i<=copyAhead; i++)
- memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), ABS(dstStride));
+ memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
dstBlock= tempDst + dstStride;
srcBlock= tempSrc;
}
- // From this point on it is guranteed that we can read and write 16 lines downward
+ // From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
for(x=0; x<width; x+=BLOCK_SIZE)
);
#elif defined(HAVE_3DNOW)
-//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
+//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
else if(mode & H_DEBLOCK)
{
#ifdef HAVE_ALTIVEC
- unsigned char __attribute__ ((aligned(16))) tempBlock[272];
+ DECLARE_ALIGNED(16, unsigned char, tempBlock[272]);
transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
const int t=vertClassify_altivec(tempBlock-48, 16, &c);
if(y+15 >= height)
{
uint8_t *dstBlock= &(dst[y*dstStride]);
- if(width==ABS(dstStride))
+ if(width==FFABS(dstStride))
linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
else
{