From 6ad1fa5a49320c101a62d24aa0e7df14c10d7612 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bernhard=20Rosenkr=C3=A4nzer?= Date: Thu, 26 May 2005 14:32:46 +0000 Subject: [PATCH] Better ARM support for mplayer/ffmpeg, ported from atty fork while playing with some new hardware, I found it's running a forked mplayer -- and it looks like they're following the GPL. The maintainer's page is here: http://atty.jp/?Zaurus/mplayer Unfortunately it's mostly in Japanese, so it's hard to figure out any details. Their code looks quite interesting (at least to those of us w/ ARM CPUs). The patches I've attached are the patches from atty.jp with a couple of modifications by myself: - ported to current CVS - reverted their change of removing SNOW support from ffmpeg - cleaned up their bswap mess - removed DOS-style linebreaks from various files patch by (Bernhard Rosenkraenzer: bero, arklinux org) Originally committed as revision 4311 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/Makefile | 11 +- libavcodec/armv4l/dsputil_arm.c | 211 +++++ libavcodec/armv4l/dsputil_arm_s.S | 694 +++++++++++++++ libavcodec/armv4l/dsputil_iwmmxt.c | 168 ++++ libavcodec/armv4l/dsputil_iwmmxt_rnd.h | 1093 ++++++++++++++++++++++++ libavcodec/armv4l/mpegvideo_arm.c | 7 + libavcodec/armv4l/mpegvideo_iwmmxt.c | 97 +++ libavcodec/avcodec.h | 1 + libavcodec/bswap.h | 13 + 9 files changed, 2294 insertions(+), 1 deletion(-) create mode 100644 libavcodec/armv4l/dsputil_arm_s.S create mode 100644 libavcodec/armv4l/dsputil_iwmmxt.c create mode 100644 libavcodec/armv4l/dsputil_iwmmxt_rnd.h create mode 100644 libavcodec/armv4l/mpegvideo_iwmmxt.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index a54cf348bcc..913a222bf40 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -316,8 +316,11 @@ endif # armv4l specific stuff ifeq ($(TARGET_ARCH_ARMV4L),yes) -ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o +ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o armv4l/dsputil_arm_s.o OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o +ifeq ($(TARGET_IWMMXT),yes) +OBJS += armv4l/dsputil_iwmmxt.o armv4l/mpegvideo_iwmmxt.o +endif endif # sun mediaLib specific stuff @@ -327,6 +330,12 @@ OBJS += mlib/dsputil_mlib.o CFLAGS += $(MLIB_INC) endif +# Intel IPP specific stuff +# currently only works when libavcodec is used in mplayer +ifeq ($(HAVE_IPP),yes) +CFLAGS += $(IPP_INC) +endif + # alpha specific stuff ifeq ($(TARGET_ARCH_ALPHA),yes) OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \ diff --git a/libavcodec/armv4l/dsputil_arm.c b/libavcodec/armv4l/dsputil_arm.c index ff61097d73f..8608064ebdd 100644 --- a/libavcodec/armv4l/dsputil_arm.c +++ b/libavcodec/armv4l/dsputil_arm.c @@ -18,6 +18,13 @@ */ #include "../dsputil.h" +#ifdef HAVE_IPP +#include "ipp.h" +#endif + +#ifdef HAVE_IWMMXT +extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); +#endif extern void j_rev_dct_ARM(DCTELEM *data); extern void simple_idct_ARM(DCTELEM *data); @@ -26,6 +33,146 @@ extern void simple_idct_ARM(DCTELEM *data); static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); +void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + put_pixels8_x2_arm(block, pixels, line_size, h); + put_pixels8_x2_arm(block + 8, pixels + 8, line_size, h); +} + +static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + put_pixels8_y2_arm(block, pixels, line_size, h); + put_pixels8_y2_arm(block + 8, pixels + 8, line_size, h); +} + +static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + put_pixels8_xy2_arm(block, pixels, line_size, h); + put_pixels8_xy2_arm(block + 8, pixels + 8, line_size, h); +} + +static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + put_no_rnd_pixels8_x2_arm(block, pixels, line_size, h); + put_no_rnd_pixels8_x2_arm(block + 8, pixels + 8, line_size, h); +} + +static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + put_no_rnd_pixels8_y2_arm(block, pixels, line_size, h); + put_no_rnd_pixels8_y2_arm(block + 8, pixels + 8, line_size, h); +} + +static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + put_no_rnd_pixels8_xy2_arm(block, pixels, line_size, h); + put_no_rnd_pixels8_xy2_arm(block + 8, pixels + 8, line_size, h); +} + +static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size) +{ + asm volatile ( + "mov r10, #8 \n\t" + + "1: \n\t" + + /* load dest */ + "ldr r4, [%1] \n\t" + /* block[0] and block[1]*/ + "ldrsh r5, [%0] \n\t" + "ldrsh r7, [%0, #2] \n\t" + "and r6, r4, #0xFF \n\t" + "and r8, r4, #0xFF00 \n\t" + "add r6, r5, r6 \n\t" + "add r8, r7, r8, lsr #8 \n\t" + "mvn r5, r5 \n\t" + "mvn r7, r7 \n\t" + "tst r6, #0x100 \n\t" + "movne r6, r5, lsr #24 \n\t" + "tst r8, #0x100 \n\t" + "movne r8, r7, lsr #24 \n\t" + "mov r9, r6 \n\t" + "ldrsh r5, [%0, #4] \n\t" /* moved form [A] */ + "orr r9, r9, r8, lsl #8 \n\t" + /* block[2] and block[3] */ + /* [A] */ + "ldrsh r7, [%0, #6] \n\t" + "and r6, r4, #0xFF0000 \n\t" + "and r8, r4, #0xFF000000 \n\t" + "add r6, r5, r6, lsr #16 \n\t" + "add r8, r7, r8, lsr #24 \n\t" + "mvn r5, r5 \n\t" + "mvn r7, r7 \n\t" + "tst r6, #0x100 \n\t" + "movne r6, r5, lsr #24 \n\t" + "tst r8, #0x100 \n\t" + "movne r8, r7, lsr #24 \n\t" + "orr r9, r9, r6, lsl #16 \n\t" + "ldr r4, [%1, #4] \n\t" /* moved form [B] */ + "orr r9, r9, r8, lsl #24 \n\t" + /* store dest */ + "ldrsh r5, [%0, #8] \n\t" /* moved form [C] */ + "str r9, [%1] \n\t" + + /* load dest */ + /* [B] */ + /* block[4] and block[5] */ + /* [C] */ + "ldrsh r7, [%0, #10] \n\t" + "and r6, r4, #0xFF \n\t" + "and r8, r4, #0xFF00 \n\t" + "add r6, r5, r6 \n\t" + "add r8, r7, r8, lsr #8 \n\t" + "mvn r5, r5 \n\t" + "mvn r7, r7 \n\t" + "tst r6, #0x100 \n\t" + "movne r6, r5, lsr #24 \n\t" + "tst r8, #0x100 \n\t" + "movne r8, r7, lsr #24 \n\t" + "mov r9, r6 \n\t" + "ldrsh r5, [%0, #12] \n\t" /* moved from [D] */ + "orr r9, r9, r8, lsl #8 \n\t" + /* block[6] and block[7] */ + /* [D] */ + "ldrsh r7, [%0, #14] \n\t" + "and r6, r4, #0xFF0000 \n\t" + "and r8, r4, #0xFF000000 \n\t" + "add r6, r5, r6, lsr #16 \n\t" + "add r8, r7, r8, lsr #24 \n\t" + "mvn r5, r5 \n\t" + "mvn r7, r7 \n\t" + "tst r6, #0x100 \n\t" + "movne r6, r5, lsr #24 \n\t" + "tst r8, #0x100 \n\t" + "movne r8, r7, lsr #24 \n\t" + "orr r9, r9, r6, lsl #16 \n\t" + "add %0, %0, #16 \n\t" /* moved from [E] */ + "orr r9, r9, r8, lsl #24 \n\t" + "subs r10, r10, #1 \n\t" /* moved from [F] */ + /* store dest */ + "str r9, [%1, #4] \n\t" + + /* [E] */ + /* [F] */ + "add %1, %1, %2 \n\t" + "bne 1b \n\t" + : + : "r"(block), + "r"(dest), + "r"(line_size) + : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" ); +} + /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */ static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) @@ -48,6 +195,34 @@ static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) simple_idct_ARM (block); ff_add_pixels_clamped(block, dest, line_size); } +static void simple_idct_ipp(DCTELEM *block) +{ +#ifdef HAVE_IPP + ippiDCT8x8Inv_Video_16s_C1I(block); +#endif +} +static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block) +{ +#ifdef HAVE_IPP + ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size); +#endif +} + +#ifdef HAVE_IWMMXT +void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size); +#endif + +static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block) +{ +#ifdef HAVE_IPP + ippiDCT8x8Inv_Video_16s_C1I(block); +#ifdef HAVE_IWMMXT + add_pixels_clamped_iwmmxt(block, dest, line_size); +#else + add_pixels_clamped_ARM(block, dest, line_size); +#endif +#endif +} void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) { @@ -56,7 +231,11 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) ff_put_pixels_clamped = c->put_pixels_clamped; ff_add_pixels_clamped = c->add_pixels_clamped; +#ifdef HAVE_IPP + if(idct_algo==FF_IDCT_ARM){ +#else if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){ +#endif c->idct_put= j_rev_dct_ARM_put; c->idct_add= j_rev_dct_ARM_add; c->idct = j_rev_dct_ARM; @@ -66,5 +245,37 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) c->idct_add= simple_idct_ARM_add; c->idct = simple_idct_ARM; c->idct_permutation_type= FF_NO_IDCT_PERM; +#ifdef HAVE_IPP + } else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){ +#else + } else if (idct_algo==FF_IDCT_IPP){ +#endif + c->idct_put= simple_idct_ipp_put; + c->idct_add= simple_idct_ipp_add; + c->idct = simple_idct_ipp; + c->idct_permutation_type= FF_NO_IDCT_PERM; } + +/* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG! + c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK! + c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK! +/* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */ +/* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ?(»È¤ï¤ì¤Ê¤¤) */ + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK +/* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */ + c->put_pixels_tab[1][0] = put_pixels8_arm; //OK + c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK +/* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */ +/* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */ + c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK +/* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */ + +#if 1 +#ifdef HAVE_IWMMXT + dsputil_init_iwmmxt(c, avctx); +#endif +#endif } diff --git a/libavcodec/armv4l/dsputil_arm_s.S b/libavcodec/armv4l/dsputil_arm_s.S new file mode 100644 index 00000000000..827110b9683 --- /dev/null +++ b/libavcodec/armv4l/dsputil_arm_s.S @@ -0,0 +1,694 @@ +@ +@ ARMv4L optimized DSP utils +@ Copyright (c) 2004 AGAWA Koji +@ +@ This library is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2 of the License, or (at your option) any later version. +@ +@ This library is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. +@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with this library; if not, write to the Free Software +@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +@ + +.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 + mov \Rd0, \Rn0, lsr #(\shift * 8) + mov \Rd1, \Rn1, lsr #(\shift * 8) + mov \Rd2, \Rn2, lsr #(\shift * 8) + mov \Rd3, \Rn3, lsr #(\shift * 8) + orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) + orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) + orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) + orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) +.endm +.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2 + mov \R0, \R0, lsr #(\shift * 8) + orr \R0, \R0, \R1, lsl #(32 - \shift * 8) + mov \R1, \R1, lsr #(\shift * 8) + orr \R1, \R1, \R2, lsl #(32 - \shift * 8) +.endm +.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 + mov \Rdst0, \Rsrc0, lsr #(\shift * 8) + mov \Rdst1, \Rsrc1, lsr #(\shift * 8) + orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) + orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) +.endm + +.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + orr \Rn0, \Rn0, \Rm0 + orr \Rn1, \Rn1, \Rm1 + and \Rd0, \Rd0, \Rmask + and \Rd1, \Rd1, \Rmask + sub \Rd0, \Rn0, \Rd0, lsr #1 + sub \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + and \Rn0, \Rn0, \Rm0 + and \Rn1, \Rn1, \Rm1 + and \Rd0, \Rd0, \Rmask + and \Rd1, \Rd1, \Rmask + add \Rd0, \Rn0, \Rd0, lsr #1 + add \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +@ ---------------------------------------------------------------- + .align 8 + .global put_pixels16_arm +put_pixels16_arm: + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11, lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + bic r1, r1, #3 + add r5, r5, r4, lsl #2 + ldrne pc, [r5] +1: + ldmia r1, {r4-r7} + add r1, r1, r2 + stmia r0, {r4-r7} + pld [r1] + subs r3, r3, #1 + add r0, r0, r2 + bne 1b + ldmfd sp!, {r4-r11, pc} + .align 8 +2: + ldmia r1, {r4-r8} + add r1, r1, r2 + ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stmia r0, {r9-r12} + add r0, r0, r2 + bne 2b + ldmfd sp!, {r4-r11, pc} + .align 8 +3: + ldmia r1, {r4-r8} + add r1, r1, r2 + ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stmia r0, {r9-r12} + add r0, r0, r2 + bne 3b + ldmfd sp!, {r4-r11, pc} + .align 8 +4: + ldmia r1, {r4-r8} + add r1, r1, r2 + ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stmia r0, {r9-r12} + add r0, r0, r2 + bne 4b + ldmfd sp!, {r4-r11,pc} + .align 8 +5: + .word 1b + .word 2b + .word 3b + .word 4b + +@ ---------------------------------------------------------------- + .align 8 + .global put_pixels8_arm +put_pixels8_arm: + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r5,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + bic r1, r1, #3 + add r5, r5, r4, lsl #2 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5} + add r1, r1, r2 + subs r3, r3, #1 + pld [r1] + stmia r0, {r4-r5} + add r0, r0, r2 + bne 1b + ldmfd sp!, {r4-r5,pc} + .align 8 +2: + ldmia r1, {r4-r5, r12} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 2b + ldmfd sp!, {r4-r5,pc} + .align 8 +3: + ldmia r1, {r4-r5, r12} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 3b + ldmfd sp!, {r4-r5,pc} + .align 8 +4: + ldmia r1, {r4-r5, r12} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 4b + ldmfd sp!, {r4-r5,pc} + .align 8 +5: + .word 1b + .word 2b + .word 3b + .word 4b + +@ ---------------------------------------------------------------- + .align 8 + .global put_pixels8_x2_arm +put_pixels8_x2_arm: + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r10,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + ldr r12, [r5] + add r5, r5, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 1b + ldmfd sp!, {r4-r10,pc} + .align 8 +2: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 + ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 2b + ldmfd sp!, {r4-r10,pc} + .align 8 +3: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 + ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 3b + ldmfd sp!, {r4-r10,pc} + .align 8 +4: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 4b + ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. + .align 8 +5: + .word 0xFEFEFEFE + .word 2b + .word 3b + .word 4b + + .align 8 + .global put_no_rnd_pixels8_x2_arm +put_no_rnd_pixels8_x2_arm: + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r10,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + ldr r12, [r5] + add r5, r5, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 1b + ldmfd sp!, {r4-r10,pc} + .align 8 +2: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 + ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 2b + ldmfd sp!, {r4-r10,pc} + .align 8 +3: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 + ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 3b + ldmfd sp!, {r4-r10,pc} + .align 8 +4: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 4b + ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. + .align 8 +5: + .word 0xFEFEFEFE + .word 2b + .word 3b + .word 4b + + +@ ---------------------------------------------------------------- + .align 8 + .global put_pixels8_y2_arm +put_pixels8_y2_arm: + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + mov r3, r3, lsr #1 + ldr r12, [r5] + add r5, r5, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5} + add r1, r1, r2 +6: ldmia r1, {r6-r7} + add r1, r1, r2 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldmia r1, {r4-r5} + add r1, r1, r2 + stmia r0, {r8-r9} + add r0, r0, r2 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +2: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +3: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +4: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + + .align 8 +5: + .word 0xFEFEFEFE + .word 2b + .word 3b + .word 4b + + .align 8 + .global put_no_rnd_pixels8_y2_arm +put_no_rnd_pixels8_y2_arm: + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + mov r3, r3, lsr #1 + ldr r12, [r5] + add r5, r5, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5} + add r1, r1, r2 +6: ldmia r1, {r6-r7} + add r1, r1, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldmia r1, {r4-r5} + add r1, r1, r2 + stmia r0, {r8-r9} + add r0, r0, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +2: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +3: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +4: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +5: + .word 0xFEFEFEFE + .word 2b + .word 3b + .word 4b + +@ ---------------------------------------------------------------- +.macro RND_XY2_IT align, rnd + @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) + @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) +.if \align == 0 + ldmia r1, {r6-r8} +.elseif \align == 3 + ldmia r1, {r5-r7} +.else + ldmia r1, {r8-r10} +.endif + add r1, r1, r2 + pld [r1] +.if \align == 0 + ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8 +.elseif \align == 1 + ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10 + ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10 +.elseif \align == 2 + ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10 + ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10 +.elseif \align == 3 + ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7 +.endif + ldr r14, [r12, #0] @ 0x03030303 + tst r3, #1 + and r8, r4, r14 + and r9, r5, r14 + and r10, r6, r14 + and r11, r7, r14 +.if \rnd == 1 + ldreq r14, [r12, #16] @ 0x02020202 +.else + ldreq r14, [r12, #28] @ 0x01010101 +.endif + add r8, r8, r10 + add r9, r9, r11 + addeq r8, r8, r14 + addeq r9, r9, r14 + ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2 + and r4, r14, r4, lsr #2 + and r5, r14, r5, lsr #2 + and r6, r14, r6, lsr #2 + and r7, r14, r7, lsr #2 + add r10, r4, r6 + add r11, r5, r7 +.endm + +.macro RND_XY2_EXPAND align, rnd + RND_XY2_IT \align, \rnd +6: stmfd sp!, {r8-r11} + RND_XY2_IT \align, \rnd + ldmfd sp!, {r4-r7} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + ldr r14, [r12, #24] @ 0x0F0F0F0F + and r4, r14, r4, lsr #2 + and r5, r14, r5, lsr #2 + add r4, r4, r6 + add r5, r5, r7 + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} +.endm + + .align 8 + .global put_pixels8_xy2_arm +put_pixels8_xy2_arm: + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11,lr} @ R14 is also called LR + adrl r12, 5f + ands r4, r1, #3 + add r5, r12, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + RND_XY2_EXPAND 0, 1 + + .align 8 +2: + RND_XY2_EXPAND 1, 1 + + .align 8 +3: + RND_XY2_EXPAND 2, 1 + + .align 8 +4: + RND_XY2_EXPAND 3, 1 + +5: + .word 0x03030303 + .word 2b + .word 3b + .word 4b + .word 0x02020202 + .word 0xFCFCFCFC >> 2 + .word 0x0F0F0F0F + .word 0x01010101 + + .align 8 + .global put_no_rnd_pixels8_xy2_arm +put_no_rnd_pixels8_xy2_arm: + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11,lr} @ R14 is also called LR + adrl r12, 5f + ands r4, r1, #3 + add r5, r12, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + RND_XY2_EXPAND 0, 0 + + .align 8 +2: + RND_XY2_EXPAND 1, 0 + + .align 8 +3: + RND_XY2_EXPAND 2, 0 + + .align 8 +4: + RND_XY2_EXPAND 3, 0 + +5: + .word 0x03030303 + .word 2b + .word 3b + .word 4b + .word 0x02020202 + .word 0xFCFCFCFC >> 2 + .word 0x0F0F0F0F + .word 0x01010101 diff --git a/libavcodec/armv4l/dsputil_iwmmxt.c b/libavcodec/armv4l/dsputil_iwmmxt.c new file mode 100644 index 00000000000..6e2465d7c8e --- /dev/null +++ b/libavcodec/armv4l/dsputil_iwmmxt.c @@ -0,0 +1,168 @@ +/* + * iWMMXt optimized DSP utils + * Copyright (c) 2004 AGAWA Koji + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "../dsputil.h" + +#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt +#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); +#define WAVG2B "wavg2b" +#include "dsputil_iwmmxt_rnd.h" +#undef DEF +#undef SET_RND +#undef WAVG2B + +#define DEF(x, y) x ## _ ## y ##_iwmmxt +#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); +#define WAVG2B "wavg2br" +#include "dsputil_iwmmxt_rnd.h" +#undef DEF +#undef SET_RND +#undef WAVG2BR + +// need scheduling +#define OP(AVG) \ + asm volatile ( \ + /* alignment */ \ + "and r12, %[pixels], #7 \n\t" \ + "bic %[pixels], %[pixels], #7 \n\t" \ + "tmcr wcgr1, r12 \n\t" \ + \ + "wldrd wr0, [%[pixels]] \n\t" \ + "wldrd wr1, [%[pixels], #8] \n\t" \ + "add %[pixels], %[pixels], %[line_size] \n\t" \ + "walignr1 wr4, wr0, wr1 \n\t" \ + \ + "1: \n\t" \ + \ + "wldrd wr2, [%[pixels]] \n\t" \ + "wldrd wr3, [%[pixels], #8] \n\t" \ + "add %[pixels], %[pixels], %[line_size] \n\t" \ + "pld [%[pixels]] \n\t" \ + "walignr1 wr5, wr2, wr3 \n\t" \ + AVG " wr6, wr4, wr5 \n\t" \ + "wstrd wr6, [%[block]] \n\t" \ + "add %[block], %[block], %[line_size] \n\t" \ + \ + "wldrd wr0, [%[pixels]] \n\t" \ + "wldrd wr1, [%[pixels], #8] \n\t" \ + "add %[pixels], %[pixels], %[line_size] \n\t" \ + "walignr1 wr4, wr0, wr1 \n\t" \ + "pld [%[pixels]] \n\t" \ + AVG " wr6, wr4, wr5 \n\t" \ + "wstrd wr6, [%[block]] \n\t" \ + "add %[block], %[block], %[line_size] \n\t" \ + \ + "subs %[h], %[h], #2 \n\t" \ + "bne 1b \n\t" \ + : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \ + : [line_size]"r"(line_size) \ + : "memory", "r12"); +void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + OP("wavg2br"); +} +void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + OP("wavg2b"); +} +#undef OP + +void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size) +{ + uint8_t *pixels2 = pixels + line_size; + + __asm__ __volatile__ ( + "mov r12, #4 \n\t" + "1: \n\t" + "pld [%[pixels], %[line_size2]] \n\t" + "pld [%[pixels2], %[line_size2]] \n\t" + "wldrd wr4, [%[pixels]] \n\t" + "wldrd wr5, [%[pixels2]] \n\t" + "pld [%[block], #32] \n\t" + "wunpckelub wr6, wr4 \n\t" + "wldrd wr0, [%[block]] \n\t" + "wunpckehub wr7, wr4 \n\t" + "wldrd wr1, [%[block], #8] \n\t" + "wunpckelub wr8, wr5 \n\t" + "wldrd wr2, [%[block], #16] \n\t" + "wunpckehub wr9, wr5 \n\t" + "wldrd wr3, [%[block], #24] \n\t" + "add %[block], %[block], #32 \n\t" + "waddhss wr10, wr0, wr6 \n\t" + "waddhss wr11, wr1, wr7 \n\t" + "waddhss wr12, wr2, wr8 \n\t" + "waddhss wr13, wr3, wr9 \n\t" + "wpackhus wr14, wr10, wr11 \n\t" + "wpackhus wr15, wr12, wr13 \n\t" + "wstrd wr14, [%[pixels]] \n\t" + "add %[pixels], %[pixels], %[line_size2] \n\t" + "subs r12, r12, #1 \n\t" + "wstrd wr15, [%[pixels2]] \n\t" + "add %[pixels2], %[pixels2], %[line_size2] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2) + : [line_size2]"r"(line_size << 1) + : "cc", "memory", "r12"); +} + +static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + return; +} + +void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) +{ + c->add_pixels_clamped = add_pixels_clamped_iwmmxt; + + c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; + c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt; + c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt; + c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt; + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt; + + c->put_pixels_tab[1][0] = put_pixels8_iwmmxt; + c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt; + c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt; + c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt; + c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt; + + c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt; + c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt; + c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt; + c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt; + c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt; + c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt; + c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt; + + c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt; + c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt; + c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt; + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt; + c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt; + c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt; + c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt; + c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt; +} diff --git a/libavcodec/armv4l/dsputil_iwmmxt_rnd.h b/libavcodec/armv4l/dsputil_iwmmxt_rnd.h new file mode 100644 index 00000000000..ca49a76bdc6 --- /dev/null +++ b/libavcodec/armv4l/dsputil_iwmmxt_rnd.h @@ -0,0 +1,1093 @@ +void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ __volatile__ ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size] \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr4, [r4, #8] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ __volatile__ ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size] \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr4, [r4, #8] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "wldrd wr0, [%[block]] \n\t" + "wldrd wr2, [r5] \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + WAVG2B" wr8, wr8, wr0 \n\t" + WAVG2B" wr10, wr10, wr2 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ __volatile__ ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size] \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr2, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "wldrd wr4, [r4, #8] \n\t" + "walignr1 wr9, wr1, wr2 \n\t" + "wldrd wr5, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + "wstrd wr8, [%[block]] \n\t" + "walignr1 wr11, wr4, wr5 \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "wstrd wr11, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ __volatile__ ( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr2, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "wldrd wr4, [r4, #8] \n\t" + "walignr1 wr9, wr1, wr2 \n\t" + "wldrd wr5, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "wldrd wr0, [%[block]] \n\t" + "pld [r4] \n\t" + "wldrd wr1, [%[block], #8] \n\t" + "pld [r4, #32] \n\t" + "wldrd wr2, [r5] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + "wldrd wr3, [r5, #8] \n\t" + WAVG2B" wr8, wr8, wr0 \n\t" + WAVG2B" wr9, wr9, wr1 \n\t" + WAVG2B" wr10, wr10, wr2 \n\t" + "wstrd wr8, [%[block]] \n\t" + "walignr1 wr11, wr4, wr5 \n\t" + WAVG2B" wr11, wr11, wr3 \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "wstrd wr11, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ __volatile__( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr6, wr14 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + "wstrd wr0, [%[block]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr2, [r5] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ __volatile__( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr15, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "walignr1 wr3, wr14, wr15 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr5, wr12 \n\t" + "wmoveq wr6, wr14 \n\t" + "wmoveq wr7, wr15 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "walignr2ne wr5, wr11, wr12 \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + "walignr2ne wr7, wr14, wr15 \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + WAVG2B" wr1, wr1, wr5 \n\t" + "wstrd wr0, [%[block]] \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + "wstrd wr1, [%[block], #8] \n\t" + WAVG2B" wr3, wr3, wr7 \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr2, [r5] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr3, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ __volatile__( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr6, wr14 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "wldrd wr10, [%[block]] \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + "wldrd wr12, [r5] \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + WAVG2B" wr0, wr0, wr10 \n\t" + WAVG2B" wr2, wr2, wr12 \n\t" + "wstrd wr0, [%[block]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr2, [r5] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ __volatile__( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr15, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "walignr1 wr3, wr14, wr15 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr5, wr12 \n\t" + "wmoveq wr6, wr14 \n\t" + "wmoveq wr7, wr15 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "walignr2ne wr5, wr11, wr12 \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + "walignr2ne wr7, wr14, wr15 \n\t" + "wldrd wr10, [%[block]] \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + "wldrd wr11, [%[block], #8] \n\t" + WAVG2B" wr1, wr1, wr5 \n\t" + "wldrd wr12, [r5] \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + "wldrd wr13, [r5, #8] \n\t" + WAVG2B" wr3, wr3, wr7 \n\t" + WAVG2B" wr0, wr0, wr10 \n\t" + WAVG2B" wr1, wr1, wr11 \n\t" + WAVG2B" wr2, wr2, wr12 \n\t" + WAVG2B" wr3, wr3, wr13 \n\t" + "wstrd wr0, [%[block]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr1, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr2, [r5] \n\t" + "pld [%[block]] \n\t" + "wstrd wr3, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [%[block], #32] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + :"r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + __asm__ __volatile__( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr4, wr10, wr11 \n\t" + "wldrd wr10, [%[block]] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "wldrd wr10, [%[block]] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "cc", "memory", "r12"); +} + +void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + __asm__ __volatile__( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr4, wr10, wr11 \n\t" + "walignr1 wr5, wr11, wr12 \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + __asm__ __volatile__( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr4, wr10, wr11 \n\t" + "walignr1 wr5, wr11, wr12 \n\t" + "wldrd wr10, [%[block]] \n\t" + "wldrd wr11, [%[block], #8] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + WAVG2B" wr9, wr9, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + "wldrd wr10, [%[block]] \n\t" + "wldrd wr11, [%[block], #8] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + WAVG2B" wr9, wr9, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ __volatile__( + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "add r12, r12, #1 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "tmcr wcgr2, r12 \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "cmp r12, #8 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ __volatile__( + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + /* alignment */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "tmcr wcgr2, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr7, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr6, wr7 \n\t" + "wunpckehub wr7, wr7 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr6, wr6, wr10 \n\t" + "waddhus wr7, wr7, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ __volatile__( + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "add r12, r12, #1 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "tmcr wcgr2, r12 \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "cmp r12, #8 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wldrd wr12, [%[pixels]] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "subs %[h], %[h], #2 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ __volatile__( + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + /* alignment */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "tmcr wcgr2, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr7, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr6, wr7 \n\t" + "wunpckehub wr7, wr7 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr6, wr6, wr10 \n\t" + "waddhus wr7, wr7, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wldrd wr13, [%[block], #8] \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + WAVG2B" wr9, wr9, wr13 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "pld [%[block]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "pld [%[block], #32] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wldrd wr13, [%[block], #8] \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + WAVG2B" wr9, wr9, wr13 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} diff --git a/libavcodec/armv4l/mpegvideo_arm.c b/libavcodec/armv4l/mpegvideo_arm.c index 9c5d8bf8627..e2efd308ed7 100644 --- a/libavcodec/armv4l/mpegvideo_arm.c +++ b/libavcodec/armv4l/mpegvideo_arm.c @@ -21,6 +21,13 @@ #include "../mpegvideo.h" #include "../avcodec.h" +#ifdef HAVE_IWMMXT +extern void MPV_common_init_iwmmxt(MpegEncContext *s); +#endif + void MPV_common_init_armv4l(MpegEncContext *s) { +#ifdef HAVE_IWMMXT + MPV_common_init_iwmmxt(s); +#endif } diff --git a/libavcodec/armv4l/mpegvideo_iwmmxt.c b/libavcodec/armv4l/mpegvideo_iwmmxt.c new file mode 100644 index 00000000000..0f4c99e3ac8 --- /dev/null +++ b/libavcodec/armv4l/mpegvideo_iwmmxt.c @@ -0,0 +1,97 @@ +#include "../dsputil.h" +#include "../mpegvideo.h" +#include "../avcodec.h" + +static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int level, qmul, qadd; + int nCoeffs; + DCTELEM *block_orig = block; + + assert(s->block_last_index[n]>=0); + + qmul = qscale << 1; + + if (!s->h263_aic) { + if (n < 4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale - 1) | 1; + }else{ + qadd = 0; + level = block[0]; + } + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + __asm__ __volatile__ ( +/* "movd %1, %%mm6 \n\t" //qmul */ +/* "packssdw %%mm6, %%mm6 \n\t" */ +/* "packssdw %%mm6, %%mm6 \n\t" */ + "tbcsth wr6, %[qmul] \n\t" +/* "movd %2, %%mm5 \n\t" //qadd */ +/* "packssdw %%mm5, %%mm5 \n\t" */ +/* "packssdw %%mm5, %%mm5 \n\t" */ + "tbcsth wr5, %[qadd] \n\t" + "wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */ + "wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */ + "wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */ + "1: \n\t" + "wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */ + "wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */ + "wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */ + "wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */ +/* "movq (%0, %3), %%mm2 \n\t" */ +/* "movq 8(%0, %3), %%mm3 \n\t" */ + "wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */ + "wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */ + "wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */ + "wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */ + "waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */ + "waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */ + "wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */ + "wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */ + "wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */ + "wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */ + "wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */ + "wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */ + "wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */ + "wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */ + "add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */ + "subs %[i], %[i], #1 \n\t" + "bne 1b \n\t" /* "jng 1b \n\t" */ + :[block]"+r"(block) + :[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd) + :"memory"); + + block_orig[0] = level; +} + +#if 0 +static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int nCoeffs; + + assert(s->block_last_index[n]>=0); + + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale); +} +#endif + +void MPV_common_init_iwmmxt(MpegEncContext *s) +{ + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt; +#if 0 + s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt; +#endif +} diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 2b491cff289..d5e7f46dc7d 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -1180,6 +1180,7 @@ typedef struct AVCodecContext { #define FF_IDCT_SIMPLEARM 10 #define FF_IDCT_H264 11 #define FF_IDCT_VP3 12 +#define FP_IDCT_IPP 13 /** * slice count. diff --git a/libavcodec/bswap.h b/libavcodec/bswap.h index 2e55337f9a8..50fd5717873 100644 --- a/libavcodec/bswap.h +++ b/libavcodec/bswap.h @@ -94,10 +94,23 @@ static always_inline uint16_t bswap_16(uint16_t x){ return (x>>8) | (x<<8); } +#ifdef ARCH_ARM +static always_inline uint32_t bswap_32(uint32_t x){ + uint32_t t; + __asm__ ( + "eor %1, %0, %0, ror #16 \n\t" + "bic %1, %1, #0xFF0000 \n\t" + "mov %0, %0, ror #8 \n\t" + "eor %0, %0, %1, lsr #8 \n\t" + : "+r"(x), "+r"(t)); + return x; +} +#else static always_inline uint32_t bswap_32(uint32_t x){ x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF); return (x>>16) | (x<<16); } +#endif static inline uint64_t bswap_64(uint64_t x) { -- 2.39.5