}
}
-static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
-{
- unsigned j,i,num_pixels=src_size/3;
- for(i=0,j=0; j<num_pixels; i+=3,j+=3)
- {
- dst[j+0] = src[i+2];
- dst[j+1] = src[i+1];
- dst[j+2] = src[i+0];
- }
-}
-
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
register const uint8_t* s=src;
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
+ mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
+ asm volatile(
+ "movq %3, %%mm5 \n\t"
+ "movq %4, %%mm6 \n\t"
+ "movq %5, %%mm7 \n\t"
+ ".balign 16 \n\t"
+ "1: \n\t"
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 4(%1), %%mm3 \n\t"
+ "punpckldq 8(%1), %%mm0 \n\t"
+ "punpckldq 12(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "pand %%mm6, %%mm0 \n\t"
+ "pand %%mm6, %%mm3 \n\t"
+ "pmaddwd %%mm7, %%mm0 \n\t"
+ "pmaddwd %%mm7, %%mm3 \n\t"
+ "pand %%mm5, %%mm1 \n\t"
+ "pand %%mm5, %%mm4 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "psrld $5, %%mm0 \n\t"
+ "pslld $11, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ "addl $16, %1 \n\t"
+ "addl $8, %0 \n\t"
+ "cmpl %2, %1 \n\t"
+ " jb 1b \n\t"
+ : "+r" (d), "+r"(s)
+ : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
+ );
+#else
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_16mask),"m"(green_16mask));
- mm_end = end - 15;
while(s < mm_end)
{
__asm __volatile(
d += 4;
s += 16;
}
+#endif
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
+ mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
+ asm volatile(
+ "movq %3, %%mm5 \n\t"
+ "movq %4, %%mm6 \n\t"
+ "movq %5, %%mm7 \n\t"
+ ".balign 16 \n\t"
+ "1: \n\t"
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 4(%1), %%mm3 \n\t"
+ "punpckldq 8(%1), %%mm0 \n\t"
+ "punpckldq 12(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "pand %%mm6, %%mm0 \n\t"
+ "pand %%mm6, %%mm3 \n\t"
+ "pmaddwd %%mm7, %%mm0 \n\t"
+ "pmaddwd %%mm7, %%mm3 \n\t"
+ "pand %%mm5, %%mm1 \n\t"
+ "pand %%mm5, %%mm4 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "psrld $6, %%mm0 \n\t"
+ "pslld $10, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ "addl $16, %1 \n\t"
+ "addl $8, %0 \n\t"
+ "cmpl %2, %1 \n\t"
+ " jb 1b \n\t"
+ : "+r" (d), "+r"(s)
+ : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
+ );
+#else
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_15mask),"m"(green_15mask));
- mm_end = end - 15;
while(s < mm_end)
{
__asm __volatile(
d += 4;
s += 16;
}
+#endif
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
unsigned num_pixels = src_size >> 2;
for(i=0; i<num_pixels; i++)
{
- dst[4*i + 0] = src[4*i + 2];
- dst[4*i + 1] = src[4*i + 1];
- dst[4*i + 2] = src[4*i + 0];
+#ifdef WORDS_BIGENDIAN
+ dst[4*i + 1] = src[4*i + 3];
+ dst[4*i + 2] = src[4*i + 2];
+ dst[4*i + 3] = src[4*i + 1];
+#else
+ dst[4*i + 0] = src[4*i + 2];
+ dst[4*i + 1] = src[4*i + 1];
+ dst[4*i + 2] = src[4*i + 0];
+#endif
}
#endif
}
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
+static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ unsigned int width, unsigned int height,
+ int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
+{
+ unsigned y;
+ const unsigned chromWidth= width>>1;
+ for(y=0; y<height; y++)
+ {
+#ifdef HAVE_MMX
+//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
+ asm volatile(
+ "xorl %%eax, %%eax \n\t"
+ ".balign 16 \n\t"
+ "1: \n\t"
+ PREFETCH" 32(%1, %%eax, 2) \n\t"
+ PREFETCH" 32(%2, %%eax) \n\t"
+ PREFETCH" 32(%3, %%eax) \n\t"
+ "movq (%2, %%eax), %%mm0 \n\t" // U(0)
+ "movq %%mm0, %%mm2 \n\t" // U(0)
+ "movq (%3, %%eax), %%mm1 \n\t" // V(0)
+ "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
+ "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
+
+ "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
+ "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
+ "movq %%mm0, %%mm4 \n\t" // Y(0)
+ "movq %%mm2, %%mm6 \n\t" // Y(8)
+ "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
+ "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
+ "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
+ "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
+
+ MOVNTQ" %%mm0, (%0, %%eax, 4) \n\t"
+ MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
+ MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
+ MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
+
+ "addl $8, %%eax \n\t"
+ "cmpl %4, %%eax \n\t"
+ " jb 1b \n\t"
+ ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
+ : "%eax"
+ );
+#else
+//FIXME adapt the alpha asm code from yv12->yuy2
+
+#if __WORDSIZE >= 64
+ int i;
+ uint64_t *ldst = (uint64_t *) dst;
+ const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+ for(i = 0; i < chromWidth; i += 2){
+ uint64_t k, l;
+ k = uc[0] + (yc[0] << 8) +
+ (vc[0] << 16) + (yc[1] << 24);
+ l = uc[1] + (yc[2] << 8) +
+ (vc[1] << 16) + (yc[3] << 24);
+ *ldst++ = k + (l << 32);
+ yc += 4;
+ uc += 2;
+ vc += 2;
+ }
+
+#else
+ int i, *idst = (int32_t *) dst;
+ const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+ for(i = 0; i < chromWidth; i++){
+ *idst++ = uc[0] + (yc[0] << 8) +
+ (vc[0] << 16) + (yc[1] << 24);
+ yc += 2;
+ uc++;
+ vc++;
+ }
+#endif
+#endif
+ if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
+ {
+ usrc += chromStride;
+ vsrc += chromStride;
+ }
+ ysrc += lumStride;
+ dst += dstStride;
+ }
+#ifdef HAVE_MMX
+asm( EMMS" \n\t"
+ SFENCE" \n\t"
+ :::"memory");
+#endif
+}
+
+/**
+ *
+ * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
+ * problem for anyone then tell me, and ill fix it)
+ */
+static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ unsigned int width, unsigned int height,
+ int lumStride, int chromStride, int dstStride)
+{
+ //FIXME interpolate chroma
+ RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
+}
+
/**
*
* width should be a multiple of 16