+/*
+ * composite_line_yuv_sse2_simple.c
+ * Copyright (C) 2003-2004 Ushodaya Enterprises Limited
+ * Author: Maksym Veremeyenko <verem@m1stereo.tv>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <inttypes.h>
/*
 * Blend one line of (presumably packed YUV, per the name) video: mixes src
 * into dest using the per-pixel alpha planes (alpha_b, alpha_a) and a global
 * weight, processing 8 pixels per iteration (operand %6 = width / 8).
 *
 * NOTE(review): this text is a patch hunk ('+'/'-' line markers); the main
 * loop body ('loop_start' label, per-pixel loads/stores, the counter update,
 * and the initialization of xmm2/xmm3/xmm6) is elided diff context, so the
 * register roles noted below are partly inferred — confirm against the full
 * source file.
 */
void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight)
{
const static unsigned char const1[] = /* eight little-endian 16-bit words of 0x00FF (255) */
{
0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
};
+ const static unsigned char const2[] = /* eight 16-bit words of 0x0080: +128 rounding bias for the /255 approximation below */
+ {
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00
+ };
__asm__ volatile
(
"pxor %%xmm0, %%xmm0 \n\t" /* clear zero register */
"movdqu (%4), %%xmm9 \n\t" /* xmm9 = const1 (0x00FF per word) */
+ "movdqu (%7), %%xmm10 \n\t" /* xmm10 = const2 (0x0080 per word) */
"movd %0, %%xmm1 \n\t" /* load weight and decompose */
"movlhps %%xmm1, %%xmm1 \n\t" /* duplicate low qword into high qword */
"pshuflw $0, %%xmm1, %%xmm1 \n\t" /* broadcasts word 0 across the LOW qword only; a matching pshufhw is presumably in the elided context — confirm */
"movdqa %%xmm9, %%xmm4 \n\t"
"psubw %%xmm3, %%xmm4 \n\t" /* xmm4 = 255 - xmm3 (xmm3 is loaded in the elided loop body; presumably an alpha term) */
"pmullw %%xmm2, %%xmm4 \n\t" /* scale by xmm2 (set up in elided context — TODO confirm role) */
+ "movdqa %%xmm4, %%xmm5 \n\t" /* xmm4 = ((xmm4 >> 8) + xmm4 + 128) >> 8 — rounded divide by 255 */
+ "psrlw $8, %%xmm4 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "paddw %%xmm10, %%xmm4 \n\t"
"psrlw $8, %%xmm4 \n\t"
"paddw %%xmm4, %%xmm3 \n\t" /* accumulate scaled term into xmm3 */
"packuswb %%xmm0, %%xmm3 \n\t" /* saturate 16-bit words down to unsigned bytes */
"pmullw %%xmm9, %%xmm6 \n\t" /* xmm6 *= 255 (xmm9 = const1); exact purpose depends on elided context */
"paddw %%xmm3, %%xmm4 \n\t" /* dst = dst + src */
"paddw %%xmm5, %%xmm6 \n\t"
- "psrlw $8, %%xmm4 \n\t" /* dst = dst >> 8 */
+ "movdqa %%xmm4, %%xmm3 \n\t" /* dst = ((dst >> 8) + dst + 128) >> 8 */
+ "movdqa %%xmm6, %%xmm5 \n\t" /* same rounded /255 applied to xmm6 in parallel */
+ "psrlw $8, %%xmm4 \n\t"
+ "psrlw $8, %%xmm6 \n\t"
+ "paddw %%xmm3, %%xmm4 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "paddw %%xmm10, %%xmm4 \n\t"
+ "paddw %%xmm10, %%xmm6 \n\t"
+ "psrlw $8, %%xmm4 \n\t"
"psrlw $8, %%xmm6 \n\t"
// "pminsw %%xmm9, %%xmm4 \n\t" /* clamp values */
// "pminsw %%xmm9, %%xmm6 \n\t"
"jnz loop_start \n\t" /* loop while ZF clear — the counter decrement that sets the flags is in the elided context */
:
- : "r" (weight >> 8), "r" (alpha_b), "r" (src), "r" (dest), "r" (const1) , "r" (alpha_a), "r" (width / 8)
+ : "r" (weight >> 8), "r" (alpha_b), "r" (src), "r" (dest), "r" (const1) , "r" (alpha_a), "r" (width / 8), "r" (const2) /* %7 = const2 added by this patch */
//: "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9", "memory"
/* NOTE(review): the clobber list above is commented out, so the compiler is
   not informed that xmm registers and memory are modified by this asm.
   Also, xmm8-xmm10 exist only on x86-64 — verify this file is built for
   64-bit targets only. */
);
};