"leal (%1, %2), %%eax \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
- "movq %3, %%mm7 \n\t" // mm7 = 0x7F
- "movq %4, %%mm6 \n\t" // mm6 = 0x7D
+ "movq %3, %%mm7 \n\t"
+ "movq %4, %%mm6 \n\t"
+
"movq (%1), %%mm0 \n\t"
"movq (%%eax), %%mm1 \n\t"
"psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
#endif
"movd %%mm0, %0 \n\t"
: "=r" (numEq)
- : "r" (src), "r" (stride), "m" (c->mmxDcOffset), "m" (c->mmxDcThreshold)
+ : "r" (src), "r" (stride), "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
: "%eax"
);
numEq= (-numEq) &0xFF;
);
return isOk==0;
#else
+#if 1
int x;
const int QP= c->QP;
src+= stride*3;
if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
}
+ return 1;
+#else
+ int x;
+ const int QP= c->QP;
+ src+= stride*4;
+ for(x=0; x<BLOCK_SIZE; x++)
+ {
+ int min=255;
+ int max=0;
+ int y;
+ for(y=0; y<8; y++){
+ int v= src[x + y*stride];
+ if(v>max) max=v;
+ if(v<min) min=v;
+ }
+ if(max-min > 2*QP) return 0;
+ }
return 1;
#endif
+#endif
}
/**
*/
#elif defined (HAVE_MMX)
src+= stride*4;
-
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
- "leal (%0, %1), %%eax \n\t"
- "leal (%%eax, %1, 4), %%edx \n\t"
"leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars
"andl $0xFFFFFFF8, %%ecx \n\t" // align
// 0 1 2 3 4 5 6 7
"punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
"punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
- "movq (%%eax), %%mm2 \n\t"
+ "movq (%0, %1), %%mm2 \n\t"
+ "leal (%0, %1, 2), %%eax \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
"punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
- "movq (%%eax, %1), %%mm4 \n\t"
+ "movq (%%eax), %%mm4 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
"punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
- "movq (%%eax, %1, 2), %%mm2 \n\t"
+ "movq (%%eax, %1), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L3
"punpckhbw %%mm7, %%mm3 \n\t" // H3
"movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
"movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
- "movq (%0, %1, 4), %%mm0 \n\t"
+ "movq (%%eax, %1, 2), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t" // L4
"punpckhbw %%mm7, %%mm1 \n\t" // H4
"psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
+ "leal (%%eax, %1), %0 \n\t"
"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
//50 opcodes so far
- "movq (%%edx), %%mm2 \n\t"
+ "movq (%0, %1, 2), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L5
"punpckhbw %%mm7, %%mm3 \n\t" // H5
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
- "movq (%%edx, %1), %%mm6 \n\t"
+ "movq (%%eax, %1, 4), %%mm6 \n\t"
"punpcklbw %%mm7, %%mm6 \n\t" // L6
"psubw %%mm6, %%mm2 \n\t" // L5 - L6
- "movq (%%edx, %1), %%mm6 \n\t"
+ "movq (%%eax, %1, 4), %%mm6 \n\t"
"punpckhbw %%mm7, %%mm6 \n\t" // H6
"psubw %%mm6, %%mm3 \n\t" // H5 - H6
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
- "movq (%%edx, %1, 2), %%mm2 \n\t"
+ "movq (%0, %1, 4), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L7
"punpckhbw %%mm7, %%mm3 \n\t" // H7
"psubw %%mm6, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"packsswb %%mm5, %%mm4 \n\t"
- "movq (%%eax, %1, 2), %%mm0 \n\t"
+ "movq (%0), %%mm0 \n\t"
"paddb %%mm4, %%mm0 \n\t"
- "movq %%mm0, (%%eax, %1, 2) \n\t"
- "movq (%0, %1, 4), %%mm0 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq (%0, %1), %%mm0 \n\t"
"psubb %%mm4, %%mm0 \n\t"
- "movq %%mm0, (%0, %1, 4) \n\t"
+ "movq %%mm0, (%0, %1) \n\t"
- :
- : "r" (src), "r" (stride), "m" (c->pQPb)
- : "%eax", "%edx", "%ecx"
+ : "+r" (src)
+ : "r" (stride), "m" (c->pQPb)
+ : "%eax", "%ecx"
);
#else
const int l1= stride;
int QPCorrecture= 256*256;
int copyAhead;
+#ifdef HAVE_MMX
+ int i;
+#endif
//FIXME remove
uint64_t * const yHistogram= c.yHistogram;
uint8_t * const tempSrc= c.tempSrc;
uint8_t * const tempDst= c.tempDst;
-
- c.dcOffset= c.ppMode.maxDcDiff;
- c.dcThreshold= c.ppMode.maxDcDiff*2 + 1;
+ const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
#ifdef HAVE_MMX
- c.mmxDcOffset= 0x7F - c.dcOffset;
- c.mmxDcThreshold= 0x7F - c.dcThreshold;
-
- c.mmxDcOffset*= 0x0101010101010101LL;
- c.mmxDcThreshold*= 0x0101010101010101LL;
+ for(i=0; i<32; i++){
+ int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
+ int threshold= offset*2 + 1;
+ c.mmxDcOffset[i]= 0x7F - offset;
+ c.mmxDcThreshold[i]= 0x7F - threshold;
+ c.mmxDcOffset[i]*= 0x0101010101010101LL;
+ c.mmxDcThreshold[i]*= 0x0101010101010101LL;
+ }
#endif
if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
// printf("\n\n");
/* we always get a completely black picture first */
- maxClipped= (uint64_t)(sum * maxClippedThreshold);
+ maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
clipped= sum;
for(black=255; black>0; black--)
uint8_t *tempBlock1= c.tempBlocks;
uint8_t *tempBlock2= c.tempBlocks + 8;
#endif
-#ifdef ARCH_X86
- int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
- int QPDelta= isColor ? (-1) : 1<<31;
- int QPFrac= 1<<30;
-#endif
+ int8_t *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
+ int8_t *nonBQPptr= isColor ? &c.nonBQPTable[(y>>3)*mbWidth] :&c.nonBQPTable[(y>>4)*mbWidth];
int QP=0;
/* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards?
if not, then use a temporary buffer */
#ifdef HAVE_MMX
uint8_t *tmpXchg;
#endif
-#ifdef ARCH_X86
- QP= *QPptr;
- asm volatile(
- "addl %2, %1 \n\t"
- "sbbl %%eax, %%eax \n\t"
- "shll $2, %%eax \n\t"
- "subl %%eax, %0 \n\t"
- : "+r" (QPptr), "+m" (QPFrac)
- : "r" (QPDelta)
- : "%eax"
- );
-#else
- QP= isColor ?
- QPs[(y>>3)*QPStride + (x>>3)]:
- QPs[(y>>4)*QPStride + (x>>4)];
-#endif
- if(!isColor)
+ if(isColor)
+ {
+ QP= QPptr[x>>3];
+ c.nonBQP= nonBQPptr[x>>3];
+ }
+ else
{
+ QP= QPptr[x>>4];
QP= (QP* QPCorrecture + 256*128)>>16;
+ c.nonBQP= nonBQPptr[x>>4];
+ c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
yHistogram[ srcBlock[srcStride*12 + 4] ]++;
}
-//printf("%d ", QP);
c.QP= QP;
#ifdef HAVE_MMX
asm volatile(