#undef MOVNTQ
#undef PREFETCH
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
-#define YSCALEYUV2YV12X(offset, dest, end, pos) \
- __asm__ volatile(\
- "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
- "movq %%mm3, %%mm4 \n\t"\
- "lea " offset "(%0), %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- ".p2align 4 \n\t" /* FIXME Unroll? */\
- "1: \n\t"\
- "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
- "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
- "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
- "add $16, %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "test %%"REG_S", %%"REG_S" \n\t"\
- "pmulhw %%mm0, %%mm2 \n\t"\
- "pmulhw %%mm0, %%mm5 \n\t"\
- "paddw %%mm2, %%mm3 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- " jnz 1b \n\t"\
- "psraw $3, %%mm3 \n\t"\
- "psraw $3, %%mm4 \n\t"\
- "packuswb %%mm4, %%mm3 \n\t"\
- MOVNTQ(%%mm3, (%1, %3))\
- "add $8, %3 \n\t"\
- "cmp %2, %3 \n\t"\
- "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
- "movq %%mm3, %%mm4 \n\t"\
- "lea " offset "(%0), %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "jb 1b \n\t"\
- :: "r" (&c->redDither),\
- "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
- : "%"REG_d, "%"REG_S\
- );
-
-static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest[4], int dstW, int chrDstW)
-{
- uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
- *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
-
- if (uDest) {
- x86_reg uv_off = c->uv_offx2 >> 1;
- YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
- YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
- }
- if (CONFIG_SWSCALE_ALPHA && aDest) {
- YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
- }
-
- YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
-}
-
-#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
- __asm__ volatile(\
- "lea " offset "(%0), %%"REG_d" \n\t"\
- "pxor %%mm4, %%mm4 \n\t"\
- "pxor %%mm5, %%mm5 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- ".p2align 4 \n\t"\
- "1: \n\t"\
- "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
- "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
- "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
- "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
- "movq %%mm0, %%mm3 \n\t"\
- "punpcklwd %%mm1, %%mm0 \n\t"\
- "punpckhwd %%mm1, %%mm3 \n\t"\
- "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
- "pmaddwd %%mm1, %%mm0 \n\t"\
- "pmaddwd %%mm1, %%mm3 \n\t"\
- "paddd %%mm0, %%mm4 \n\t"\
- "paddd %%mm3, %%mm5 \n\t"\
- "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
- "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
- "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
- "test %%"REG_S", %%"REG_S" \n\t"\
- "movq %%mm2, %%mm0 \n\t"\
- "punpcklwd %%mm3, %%mm2 \n\t"\
- "punpckhwd %%mm3, %%mm0 \n\t"\
- "pmaddwd %%mm1, %%mm2 \n\t"\
- "pmaddwd %%mm1, %%mm0 \n\t"\
- "paddd %%mm2, %%mm6 \n\t"\
- "paddd %%mm0, %%mm7 \n\t"\
- " jnz 1b \n\t"\
- "psrad $16, %%mm4 \n\t"\
- "psrad $16, %%mm5 \n\t"\
- "psrad $16, %%mm6 \n\t"\
- "psrad $16, %%mm7 \n\t"\
- "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
- "packssdw %%mm5, %%mm4 \n\t"\
- "packssdw %%mm7, %%mm6 \n\t"\
- "paddw %%mm0, %%mm4 \n\t"\
- "paddw %%mm0, %%mm6 \n\t"\
- "psraw $3, %%mm4 \n\t"\
- "psraw $3, %%mm6 \n\t"\
- "packuswb %%mm6, %%mm4 \n\t"\
- MOVNTQ(%%mm4, (%1, %3))\
- "add $8, %3 \n\t"\
- "cmp %2, %3 \n\t"\
- "lea " offset "(%0), %%"REG_d" \n\t"\
- "pxor %%mm4, %%mm4 \n\t"\
- "pxor %%mm5, %%mm5 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "jb 1b \n\t"\
- :: "r" (&c->redDither),\
- "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
- : "%"REG_a, "%"REG_d, "%"REG_S\
- );
-
-static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest[4], int dstW, int chrDstW)
-{
- uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
- *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
-
- if (uDest) {
- x86_reg uv_off = c->uv_offx2 >> 1;
- YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
- YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
- }
- if (CONFIG_SWSCALE_ALPHA && aDest) {
- YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
- }
-
- YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
-}
-
-static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
- const int16_t *chrUSrc, const int16_t *chrVSrc,
- const int16_t *alpSrc,
- uint8_t *dst[4], int dstW, int chrDstW)
-{
- int p= 4;
- const int16_t *src[4]= {
- lumSrc + dstW, chrUSrc + chrDstW,
- chrVSrc + chrDstW, alpSrc + dstW
- };
- x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
-
- while (p--) {
- if (dst[p]) {
- __asm__ volatile(
- "mov %2, %%"REG_a" \n\t"
- ".p2align 4 \n\t" /* FIXME Unroll? */
- "1: \n\t"
- "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
- "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
- "psraw $7, %%mm0 \n\t"
- "psraw $7, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- MOVNTQ(%%mm0, (%1, %%REGa))
- "add $8, %%"REG_a" \n\t"
- "jnc 1b \n\t"
- :: "r" (src[p]), "r" (dst[p] + counter[p]),
- "g" (-counter[p])
- : "%"REG_a
- );
- }
- }
-}
-
-static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
- const int16_t *chrUSrc, const int16_t *chrVSrc,
- const int16_t *alpSrc,
- uint8_t *dst[4], int dstW, int chrDstW)
-{
- int p= 4;
- const int16_t *src[4]= {
- lumSrc + dstW, chrUSrc + chrDstW,
- chrVSrc + chrDstW, alpSrc + dstW
- };
- x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
-
- while (p--) {
- if (dst[p]) {
- __asm__ volatile(
- "mov %2, %%"REG_a" \n\t"
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $15, %%mm7 \n\t"
- "psllw $6, %%mm7 \n\t"
- ".p2align 4 \n\t" /* FIXME Unroll? */
- "1: \n\t"
- "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
- "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
- "paddsw %%mm7, %%mm0 \n\t"
- "paddsw %%mm7, %%mm1 \n\t"
- "psraw $7, %%mm0 \n\t"
- "psraw $7, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- MOVNTQ(%%mm0, (%1, %%REGa))
- "add $8, %%"REG_a" \n\t"
- "jnc 1b \n\t"
- :: "r" (src[p]), "r" (dst[p] + counter[p]),
- "g" (-counter[p])
- : "%"REG_a
- );
- }
- }
-}
-
#define YSCALEYUV2PACKEDX_UV \
__asm__ volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX_ACCURATE
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
YSCALEYUV2PACKEDX_ACCURATE
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
+ x86_reg uv_off = c->uv_off_byte;
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
* YV12 to RGB without scaling or interpolating
*/
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *bguf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
- const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+ const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
);
}
} else {
+ const int16_t *ubuf1 = ubuf[1];
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
}
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *bguf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
- const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+ const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"a" (&c->redDither)
);
} else {
+ const int16_t *ubuf1 = ubuf[1];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
}
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *bguf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
- const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+ const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"a" (&c->redDither)
);
} else {
+ const int16_t *ubuf1 = ubuf[1];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
}
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *bguf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
- const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+ const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"a" (&c->redDither)
);
} else {
+ const int16_t *ubuf1 = ubuf[1];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFFx2"("#c"), "#index" \n\t" \
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFFx2"("#c"), "#index" \n\t" \
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *bguf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
- const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+ const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"a" (&c->redDither)
);
} else {
+ const int16_t *ubuf1 = ubuf[1];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
}
}
-#if !COMPILE_TEMPLATE_MMX2
-//FIXME yuy2* can read up to 7 samples too much
-
-static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
- int width, uint32_t *unused)
-{
- __asm__ volatile(
- "movq "MANGLE(bm01010101)", %%mm2 \n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",2), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
- "pand %%mm2, %%mm0 \n\t"
- "pand %%mm2, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
- : "%"REG_a
- );
-}
-
-static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- __asm__ volatile(
- "movq "MANGLE(bm01010101)", %%mm4 \n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",4), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- "movd %%mm0, (%3, %%"REG_a") \n\t"
- "movd %%mm1, (%2, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
- : "%"REG_a
- );
- assert(src1 == src2);
-}
-
-/* This is almost identical to the previous, end exists only because
- * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
-static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
- int width, uint32_t *unused)
-{
- __asm__ volatile(
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",2), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
- : "%"REG_a
- );
-}
-
-static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- __asm__ volatile(
- "movq "MANGLE(bm01010101)", %%mm4 \n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",4), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
- "pand %%mm4, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- "movd %%mm0, (%3, %%"REG_a") \n\t"
- "movd %%mm1, (%2, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
- : "%"REG_a
- );
- assert(src1 == src2);
-}
-
-static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
- const uint8_t *src, int width)
-{
- __asm__ volatile(
- "movq "MANGLE(bm01010101)", %%mm4 \n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",2), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "pand %%mm4, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "psrlw $8, %%mm2 \n\t"
- "psrlw $8, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "movq %%mm2, (%3, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
- : "%"REG_a
- );
-}
-
-static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- RENAME(nvXXtoUV)(dstU, dstV, src1, width);
-}
-
-static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- RENAME(nvXXtoUV)(dstV, dstU, src1, width);
-}
-#endif /* !COMPILE_TEMPLATE_MMX2 */
-
-static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
- int width, enum PixelFormat srcFormat)
-{
-
- if(srcFormat == PIX_FMT_BGR24) {
- __asm__ volatile(
- "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
- "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
- :
- );
- } else {
- __asm__ volatile(
- "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
- "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
- :
- );
- }
-
- __asm__ volatile(
- "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
- "mov %2, %%"REG_a" \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0) \n\t"
- "movd (%0), %%mm0 \n\t"
- "movd 2(%0), %%mm1 \n\t"
- "movd 6(%0), %%mm2 \n\t"
- "movd 8(%0), %%mm3 \n\t"
- "add $12, %0 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "pmaddwd %%mm5, %%mm0 \n\t"
- "pmaddwd %%mm6, %%mm1 \n\t"
- "pmaddwd %%mm5, %%mm2 \n\t"
- "pmaddwd %%mm6, %%mm3 \n\t"
- "paddd %%mm1, %%mm0 \n\t"
- "paddd %%mm3, %%mm2 \n\t"
- "paddd %%mm4, %%mm0 \n\t"
- "paddd %%mm4, %%mm2 \n\t"
- "psrad $15, %%mm0 \n\t"
- "psrad $15, %%mm2 \n\t"
- "packssdw %%mm2, %%mm0 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%1, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : "+r" (src)
- : "r" (dst+width), "g" ((x86_reg)-width)
- : "%"REG_a
- );
-}
-
-static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src,
- int width, uint32_t *unused)
-{
- RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
-}
-
-static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src,
- int width, uint32_t *unused)
-{
- RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
-}
-
-static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *src, int width,
- enum PixelFormat srcFormat)
-{
- __asm__ volatile(
- "movq 24(%4), %%mm6 \n\t"
- "mov %3, %%"REG_a" \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0) \n\t"
- "movd (%0), %%mm0 \n\t"
- "movd 2(%0), %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "pmaddwd (%4), %%mm0 \n\t"
- "pmaddwd 8(%4), %%mm1 \n\t"
- "pmaddwd 16(%4), %%mm2 \n\t"
- "pmaddwd %%mm6, %%mm3 \n\t"
- "paddd %%mm1, %%mm0 \n\t"
- "paddd %%mm3, %%mm2 \n\t"
-
- "movd 6(%0), %%mm1 \n\t"
- "movd 8(%0), %%mm3 \n\t"
- "add $12, %0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "movq %%mm1, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "pmaddwd (%4), %%mm1 \n\t"
- "pmaddwd 8(%4), %%mm3 \n\t"
- "pmaddwd 16(%4), %%mm4 \n\t"
- "pmaddwd %%mm6, %%mm5 \n\t"
- "paddd %%mm3, %%mm1 \n\t"
- "paddd %%mm5, %%mm4 \n\t"
-
- "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
- "paddd %%mm3, %%mm0 \n\t"
- "paddd %%mm3, %%mm2 \n\t"
- "paddd %%mm3, %%mm1 \n\t"
- "paddd %%mm3, %%mm4 \n\t"
- "psrad $15, %%mm0 \n\t"
- "psrad $15, %%mm2 \n\t"
- "psrad $15, %%mm1 \n\t"
- "psrad $15, %%mm4 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm4, %%mm2 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm2, %%mm2 \n\t"
- "movd %%mm0, (%1, %%"REG_a") \n\t"
- "movd %%mm2, (%2, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : "+r" (src)
- : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
- : "%"REG_a
- );
-}
-
-static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
- assert(src1 == src2);
-}
-
-static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- assert(src1==src2);
- RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
-}
-
-#if !COMPILE_TEMPLATE_MMX2
-// bilinear / bicubic scaling
-static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW,
- const uint8_t *src, const int16_t *filter,
- const int16_t *filterPos, int filterSize)
-{
- assert(filterSize % 4 == 0 && filterSize>0);
- if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
- x86_reg counter= -2*dstW;
- filter-= counter*2;
- filterPos-= counter/2;
- dst-= counter/2;
- __asm__ volatile(
-#if defined(PIC)
- "push %%"REG_b" \n\t"
-#endif
- "pxor %%mm7, %%mm7 \n\t"
- "push %%"REG_BP" \n\t" // we use 7 regs here ...
- "mov %%"REG_a", %%"REG_BP" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movzwl (%2, %%"REG_BP"), %%eax \n\t"
- "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
- "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
- "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
- "movd (%3, %%"REG_a"), %%mm0 \n\t"
- "movd (%3, %%"REG_b"), %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "pmaddwd %%mm1, %%mm0 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "movq %%mm0, %%mm4 \n\t"
- "punpckldq %%mm3, %%mm0 \n\t"
- "punpckhdq %%mm3, %%mm4 \n\t"
- "paddd %%mm4, %%mm0 \n\t"
- "psrad $7, %%mm0 \n\t"
- "packssdw %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%4, %%"REG_BP") \n\t"
- "add $4, %%"REG_BP" \n\t"
- " jnc 1b \n\t"
-
- "pop %%"REG_BP" \n\t"
-#if defined(PIC)
- "pop %%"REG_b" \n\t"
-#endif
- : "+a" (counter)
- : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
-#if !defined(PIC)
- : "%"REG_b
-#endif
- );
- } else if (filterSize==8) {
- x86_reg counter= -2*dstW;
- filter-= counter*4;
- filterPos-= counter/2;
- dst-= counter/2;
- __asm__ volatile(
-#if defined(PIC)
- "push %%"REG_b" \n\t"
-#endif
- "pxor %%mm7, %%mm7 \n\t"
- "push %%"REG_BP" \n\t" // we use 7 regs here ...
- "mov %%"REG_a", %%"REG_BP" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movzwl (%2, %%"REG_BP"), %%eax \n\t"
- "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
- "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
- "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
- "movd (%3, %%"REG_a"), %%mm0 \n\t"
- "movd (%3, %%"REG_b"), %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "pmaddwd %%mm1, %%mm0 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
-
- "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
- "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
- "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
- "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "pmaddwd %%mm1, %%mm4 \n\t"
- "pmaddwd %%mm2, %%mm5 \n\t"
- "paddd %%mm4, %%mm0 \n\t"
- "paddd %%mm5, %%mm3 \n\t"
- "movq %%mm0, %%mm4 \n\t"
- "punpckldq %%mm3, %%mm0 \n\t"
- "punpckhdq %%mm3, %%mm4 \n\t"
- "paddd %%mm4, %%mm0 \n\t"
- "psrad $7, %%mm0 \n\t"
- "packssdw %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%4, %%"REG_BP") \n\t"
- "add $4, %%"REG_BP" \n\t"
- " jnc 1b \n\t"
-
- "pop %%"REG_BP" \n\t"
-#if defined(PIC)
- "pop %%"REG_b" \n\t"
-#endif
- : "+a" (counter)
- : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
-#if !defined(PIC)
- : "%"REG_b
-#endif
- );
- } else {
- const uint8_t *offset = src+filterSize;
- x86_reg counter= -2*dstW;
- //filter-= counter*filterSize/2;
- filterPos-= counter/2;
- dst-= counter/2;
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "mov %2, %%"REG_c" \n\t"
- "movzwl (%%"REG_c", %0), %%eax \n\t"
- "movzwl 2(%%"REG_c", %0), %%edx \n\t"
- "mov %5, %%"REG_c" \n\t"
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t"
- "2: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1, %6), %%mm3 \n\t"
- "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
- "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "pmaddwd %%mm1, %%mm0 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "paddd %%mm3, %%mm5 \n\t"
- "paddd %%mm0, %%mm4 \n\t"
- "add $8, %1 \n\t"
- "add $4, %%"REG_c" \n\t"
- "cmp %4, %%"REG_c" \n\t"
- " jb 2b \n\t"
- "add %6, %1 \n\t"
- "movq %%mm4, %%mm0 \n\t"
- "punpckldq %%mm5, %%mm4 \n\t"
- "punpckhdq %%mm5, %%mm0 \n\t"
- "paddd %%mm0, %%mm4 \n\t"
- "psrad $7, %%mm4 \n\t"
- "packssdw %%mm4, %%mm4 \n\t"
- "mov %3, %%"REG_a" \n\t"
- "movd %%mm4, (%%"REG_a", %0) \n\t"
- "add $4, %0 \n\t"
- " jnc 1b \n\t"
-
- : "+r" (counter), "+r" (filter)
- : "m" (filterPos), "m" (dst), "m"(offset),
- "m" (src), "r" ((x86_reg)filterSize*2)
- : "%"REG_a, "%"REG_c, "%"REG_d
- );
- }
-}
-#endif /* !COMPILE_TEMPLATE_MMX2 */
-
-#if COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMXEXT
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
int dstWidth, const uint8_t *src,
int srcW, int xInc)
{
- int16_t *filterPos = c->hLumFilterPos;
+ int32_t *filterPos = c->hLumFilterPos;
int16_t *filter = c->hLumFilter;
- void *mmx2FilterCode= c->lumMmx2FilterCode;
+ void *mmxextFilterCode = c->lumMmxextFilterCode;
int i;
#if defined(PIC)
- DECLARE_ALIGNED(8, uint64_t, ebxsave);
+ uint64_t ebxsave;
+#endif
+#if ARCH_X86_64
+ uint64_t retsave;
#endif
__asm__ volatile(
#if defined(PIC)
"mov %%"REG_b", %5 \n\t"
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %6 \n\t"
+#endif
+#else
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %5 \n\t"
+#endif
#endif
"pxor %%mm7, %%mm7 \n\t"
"mov %0, %%"REG_c" \n\t"
#if defined(PIC)
"mov %5, %%"REG_b" \n\t"
+#if ARCH_X86_64
+ "mov %6, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#endif
+#else
+#if ARCH_X86_64
+ "mov %5, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#endif
#endif
:: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
- "m" (mmx2FilterCode)
+ "m" (mmxextFilterCode)
#if defined(PIC)
,"m" (ebxsave)
+#endif
+#if ARCH_X86_64
+ ,"m"(retsave)
#endif
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc)
{
- int16_t *filterPos = c->hChrFilterPos;
+ int32_t *filterPos = c->hChrFilterPos;
int16_t *filter = c->hChrFilter;
- void *mmx2FilterCode= c->chrMmx2FilterCode;
+ void *mmxextFilterCode = c->chrMmxextFilterCode;
int i;
#if defined(PIC)
DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
+#if ARCH_X86_64
+ DECLARE_ALIGNED(8, uint64_t, retsave);
+#endif
__asm__ volatile(
#if defined(PIC)
"mov %%"REG_b", %7 \n\t"
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %8 \n\t"
+#endif
+#else
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %7 \n\t"
+#endif
#endif
"pxor %%mm7, %%mm7 \n\t"
"mov %0, %%"REG_c" \n\t"
#if defined(PIC)
"mov %7, %%"REG_b" \n\t"
+#if ARCH_X86_64
+ "mov %8, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#endif
+#else
+#if ARCH_X86_64
+ "mov %7, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#endif
#endif
:: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
- "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
+ "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if defined(PIC)
,"m" (ebxsave)
+#endif
+#if ARCH_X86_64
+ ,"m"(retsave)
#endif
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
dst2[i] = src2[srcW-1]*128;
}
}
-#endif /* COMPILE_TEMPLATE_MMX2 */
+#endif /* COMPILE_TEMPLATE_MMXEXT */
static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
{
- enum PixelFormat srcFormat = c->srcFormat,
- dstFormat = c->dstFormat;
+ enum AVPixelFormat dstFormat = c->dstFormat;
if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
- dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
+ dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) {
if (!(c->flags & SWS_BITEXACT)) {
if (c->flags & SWS_ACCURATE_RND) {
- c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
- c->yuv2yuvX = RENAME(yuv2yuvX_ar );
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
- case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
- case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
- case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
- case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
- case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
+ case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
+ case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
+ case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
+ case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
+ case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
default: break;
}
}
} else {
- c->yuv2yuv1 = RENAME(yuv2yuv1 );
- c->yuv2yuvX = RENAME(yuv2yuvX );
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
- case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
- case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
- case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
- case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
- case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
+ case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
+ case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
+ case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
+ case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
+ case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
default: break;
}
}
}
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
- case PIX_FMT_RGB32:
+ case AV_PIX_FMT_RGB32:
c->yuv2packed1 = RENAME(yuv2rgb32_1);
c->yuv2packed2 = RENAME(yuv2rgb32_2);
break;
- case PIX_FMT_BGR24:
+ case AV_PIX_FMT_BGR24:
c->yuv2packed1 = RENAME(yuv2bgr24_1);
c->yuv2packed2 = RENAME(yuv2bgr24_2);
break;
- case PIX_FMT_RGB555:
+ case AV_PIX_FMT_RGB555:
c->yuv2packed1 = RENAME(yuv2rgb555_1);
c->yuv2packed2 = RENAME(yuv2rgb555_2);
break;
- case PIX_FMT_RGB565:
+ case AV_PIX_FMT_RGB565:
c->yuv2packed1 = RENAME(yuv2rgb565_1);
c->yuv2packed2 = RENAME(yuv2rgb565_2);
break;
- case PIX_FMT_YUYV422:
+ case AV_PIX_FMT_YUYV422:
c->yuv2packed1 = RENAME(yuv2yuyv422_1);
c->yuv2packed2 = RENAME(yuv2yuyv422_2);
break;
}
}
- if (c->scalingBpp == 8) {
-#if !COMPILE_TEMPLATE_MMX2
- c->hScale = RENAME(hScale );
-#endif /* !COMPILE_TEMPLATE_MMX2 */
-
- // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
-#if COMPILE_TEMPLATE_MMX2
- if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
- {
+ if (c->srcBpc == 8 && c->dstBpc <= 10) {
+ // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
+#if COMPILE_TEMPLATE_MMXEXT
+ if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
c->hyscale_fast = RENAME(hyscale_fast);
c->hcscale_fast = RENAME(hcscale_fast);
} else {
-#endif /* COMPILE_TEMPLATE_MMX2 */
+#endif /* COMPILE_TEMPLATE_MMXEXT */
c->hyscale_fast = NULL;
c->hcscale_fast = NULL;
-#if COMPILE_TEMPLATE_MMX2
- }
-#endif /* COMPILE_TEMPLATE_MMX2 */
- }
-
-#if !COMPILE_TEMPLATE_MMX2
- switch(srcFormat) {
- case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
- case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
- case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
- case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
- default: break;
+#if COMPILE_TEMPLATE_MMXEXT
}
-#endif /* !COMPILE_TEMPLATE_MMX2 */
- if (!c->chrSrcHSubSample) {
- switch(srcFormat) {
- case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
- case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
- default: break;
- }
- }
-
- switch (srcFormat) {
-#if !COMPILE_TEMPLATE_MMX2
- case PIX_FMT_YUYV422 :
- case PIX_FMT_Y400A : c->lumToYV12 = RENAME(yuy2ToY); break;
- case PIX_FMT_UYVY422 : c->lumToYV12 = RENAME(uyvyToY); break;
-#endif /* !COMPILE_TEMPLATE_MMX2 */
- case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
- case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
- default: break;
- }
-#if !COMPILE_TEMPLATE_MMX2
- if (c->alpPixBuf) {
- switch (srcFormat) {
- case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
- default: break;
- }
+#endif /* COMPILE_TEMPLATE_MMXEXT */
}
-#endif /* !COMPILE_TEMPLATE_MMX2 */
}