#endif
}
+#if ARCH_X86 && TEMPLATE_PP_MMXEXT
+/* Cache-prefetch helpers.  These only hint the CPU to pull the cache line
+ * containing p closer; p is never dereferenced, so any address is harmless.
+ * prefetchnta = non-temporal fetch (minimal cache pollution);
+ * prefetcht0/t1/t2 = fetch into progressively fewer / more distant cache
+ * levels (t0 = all levels, t2 = farthest only). */
+static inline void RENAME(prefetchnta)(const void *p)
+{
+ __asm__ volatile( "prefetchnta (%0)\n\t"
+ : : "r" (p)
+ );
+}
+
+static inline void RENAME(prefetcht0)(const void *p)
+{
+ __asm__ volatile( "prefetcht0 (%0)\n\t"
+ : : "r" (p)
+ );
+}
+
+static inline void RENAME(prefetcht1)(const void *p)
+{
+ __asm__ volatile( "prefetcht1 (%0)\n\t"
+ : : "r" (p)
+ );
+}
+
+static inline void RENAME(prefetcht2)(const void *p)
+{
+ __asm__ volatile( "prefetcht2 (%0)\n\t"
+ : : "r" (p)
+ );
+}
+#elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
+/* Portable fallback: __builtin_prefetch(addr, rw, locality), rw=0 = read.
+ * GCC's locality runs from 0 (no temporal locality, ~prefetchnta) up to 3
+ * (high temporal locality, keep in all cache levels, ~prefetcht0).  The
+ * values are therefore t0->3, t1->2, t2->1 — NOT the other way around
+ * (the original mapping had t0 and t2 swapped). */
+static inline void RENAME(prefetchnta)(const void *p)
+{
+ __builtin_prefetch(p,0,0);
+}
+static inline void RENAME(prefetcht0)(const void *p)
+{
+ __builtin_prefetch(p,0,3);
+}
+static inline void RENAME(prefetcht1)(const void *p)
+{
+ __builtin_prefetch(p,0,2);
+}
+static inline void RENAME(prefetcht2)(const void *p)
+{
+ __builtin_prefetch(p,0,1);
+}
+#else
+/* No prefetch support on this target: prefetching is purely an optimization
+ * hint, so empty stubs are a correct (if slower) implementation. */
+static inline void RENAME(prefetchnta)(const void *p)
+{
+ return;
+}
+static inline void RENAME(prefetcht0)(const void *p)
+{
+ return;
+}
+static inline void RENAME(prefetcht1)(const void *p)
+{
+ return;
+}
+static inline void RENAME(prefetcht2)(const void *p)
+{
+ return;
+}
+#endif
/**
* Filter array of bytes (Y or U or V values)
*/
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE){
-
-#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
-/*
- prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
- prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
- prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
- prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
-*/
-
- __asm__(
- "mov %4, %%"REG_a" \n\t"
- "shr $2, %%"REG_a" \n\t"
- "and $6, %%"REG_a" \n\t"
- "add %5, %%"REG_a" \n\t"
- "mov %%"REG_a", %%"REG_d" \n\t"
- "imul %1, %%"REG_a" \n\t"
- "imul %3, %%"REG_d" \n\t"
- "prefetchnta 32(%%"REG_a", %0) \n\t"
- "prefetcht0 32(%%"REG_d", %2) \n\t"
- "add %1, %%"REG_a" \n\t"
- "add %3, %%"REG_d" \n\t"
- "prefetchnta 32(%%"REG_a", %0) \n\t"
- "prefetcht0 32(%%"REG_d", %2) \n\t"
- :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
- "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
- : "%"REG_a, "%"REG_d
- );
-#endif
+ RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
+ RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
+ RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
+ RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
#endif
const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
- int QP=0;
+ int QP=0, nonBQP=0;
 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
    if not then use a temporary buffer */
if(y+15 >= height){
int endx = FFMIN(width, x+32);
uint8_t *dstBlockStart = dstBlock;
const uint8_t *srcBlockStart = srcBlock;
- for(; x < endx; x+=BLOCK_SIZE){
-#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
-/*
- prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
- prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
- prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
- prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
-*/
-
- __asm__(
- "mov %4, %%"REG_a" \n\t"
- "shr $2, %%"REG_a" \n\t"
- "and $6, %%"REG_a" \n\t"
- "add %5, %%"REG_a" \n\t"
- "mov %%"REG_a", %%"REG_d" \n\t"
- "imul %1, %%"REG_a" \n\t"
- "imul %3, %%"REG_d" \n\t"
- "prefetchnta 32(%%"REG_a", %0) \n\t"
- "prefetcht0 32(%%"REG_d", %2) \n\t"
- "add %1, %%"REG_a" \n\t"
- "add %3, %%"REG_d" \n\t"
- "prefetchnta 32(%%"REG_a", %0) \n\t"
- "prefetcht0 32(%%"REG_d", %2) \n\t"
- :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
- "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
- : "%"REG_a, "%"REG_d
+ int qp_index = 0;
+ for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
+ QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
+ nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
+ if(!isColor){
+ QP= (QP* QPCorrecture + 256*128)>>16;
+ nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
+ yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
+ }
+ c.QP_block[qp_index] = QP;
+ c.nonBQP_block[qp_index] = nonBQP;
+#if TEMPLATE_PP_MMX
+ __asm__ volatile(
+ "movd %1, %%mm7 \n\t"
+ "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
+ "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
+ "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
+ "movq %%mm7, %0 \n\t"
+ : "=m" (c.pQPb_block[qp_index])
+ : "r" (QP)
);
#endif
+ }
+ for(; x < endx; x+=BLOCK_SIZE){
+ RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
+ RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
+ RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
+ RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
dstBlock = dstBlockStart;
srcBlock = srcBlockStart;
- for(x = startx; x < endx; x+=BLOCK_SIZE){
+ for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
const int stride= dstStride;
- QP = QPptr[x>>qpHShift];
- c.nonBQP = nonBQPptr[x>>qpHShift];
- if(!isColor){
- QP= (QP* QPCorrecture + 256*128)>>16;
- c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
- yHistogram[srcBlock[srcStride*12 + 4]]++;
- }
- c.QP= QP;
-#if TEMPLATE_PP_MMX
- __asm__ volatile(
- "movd %1, %%mm7 \n\t"
- "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
- "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
- "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
- "movq %%mm7, %0 \n\t"
- : "=m" (c.pQPb)
- : "r" (QP)
- );
-#endif
+ //temporary while changing QP stuff to make things continue to work
+ //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
+ c.QP = c.QP_block[qp_index];
+ c.nonBQP = c.nonBQP_block[qp_index];
+ c.pQPb = c.pQPb_block[qp_index];
+ c.pQPb2 = c.pQPb2_block[qp_index];
+
/* only deblock if we have 2 blocks */
if(y + 8 < height){
if(mode & V_X1_FILTER)
dstBlock = dstBlockStart;
srcBlock = srcBlockStart;
- for(x = startx; x < endx; x+=BLOCK_SIZE){
+ for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
const int stride= dstStride;
av_unused uint8_t *tmpXchg;
-
- if(isColor){
- QP= QPptr[x>>qpHShift];
- c.nonBQP= nonBQPptr[x>>qpHShift];
- }else{
- QP= QPptr[x>>4];
- QP= (QP* QPCorrecture + 256*128)>>16;
- c.nonBQP= nonBQPptr[x>>4];
- c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
- }
- c.QP= QP;
+ c.QP = c.QP_block[qp_index];
+ c.nonBQP = c.nonBQP_block[qp_index];
+ c.pQPb = c.pQPb_block[qp_index];
+ c.pQPb2 = c.pQPb2_block[qp_index];
#if TEMPLATE_PP_MMX
- __asm__ volatile(
- "movd %1, %%mm7 \n\t"
- "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
- "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
- "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
- "movq %%mm7, %0 \n\t"
- : "=m" (c.pQPb)
- : "r" (QP)
- );
RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
/* check if we have a previous block to deblock it with dstBlock */
#else
if(mode & H_X1_FILTER)
- horizX1Filter(dstBlock-4, stride, QP);
+ horizX1Filter(dstBlock-4, stride, c.QP);
else if(mode & H_DEBLOCK){
#if TEMPLATE_PP_ALTIVEC
DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];