X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fppc%2Fdsputil_altivec.c;h=8a1cd443769e3f630ed9730b194b2af83cdc7a6a;hb=5137235e0cc19175bc96eecd0762ab3398d5c4a8;hp=20ee382f2976706eaa4770b4562693676bc8ee86;hpb=115329f16062074e11ccf3b89ead6176606c9696;p=ffmpeg diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c index 20ee382f297..8a1cd443769 100644 --- a/libavcodec/ppc/dsputil_altivec.c +++ b/libavcodec/ppc/dsputil_altivec.c @@ -3,59 +3,37 @@ * Copyright (c) 2002 Dieter Shirley * Copyright (c) 2003-2004 Romain Dolbeau * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "../dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -#ifdef CONFIG_DARWIN -#include -#else /* CONFIG_DARWIN */ -#ifdef __AMIGAOS4__ -#include -#include -#include -#else /* __AMIGAOS4__ */ -#include -#include - -static sigjmp_buf jmpbuf; -static volatile sig_atomic_t canjump = 0; - -static void sigill_handler (int sig) -{ - if (!canjump) { - signal (sig, SIG_DFL); - raise (sig); - } - - canjump = 0; - siglongjmp (jmpbuf, 1); -} -#endif /* CONFIG_DARWIN */ -#endif /* __AMIGAOS4__ */ +#include "config.h" +#if HAVE_ALTIVEC_H +#include +#endif +#include "libavcodec/dsputil.h" +#include "dsputil_ppc.h" +#include "util_altivec.h" +#include "types_altivec.h" int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; - int s __attribute__((aligned(16))); - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); + DECLARE_ALIGNED_16(int, s); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; @@ -63,12 +41,10 @@ int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h s = 0; sad = (vector unsigned int)vec_splat_u32(0); - for(i=0;il))) - - ((((*((uint32_t *) (block))) ^ - ((((const struct unaligned_32 *) (pixels))-> - l))) & 0xFEFEFEFEUL) >> 1)); - *((uint32_t *) (block + 4)) = - (((*((uint32_t *) (block + 4))) | - ((((const struct unaligned_32 *) (pixels + 4))->l))) - - ((((*((uint32_t *) (block + 4))) ^ - ((((const struct unaligned_32 *) (pixels + - 4))-> - l))) & 0xFEFEFEFEUL) >> 1)); - pixels += line_size; - block += line_size; - } -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); - -#else /* ALTIVEC_USE_REFERENCE_C_CODE */ register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; int i; POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); for (i = 0; i < h; i++) { - /* - block is 8 bytes-aligned, so we're either in the - left block (16 bytes-aligned) or in the right block (not) - */ - int rightside = ((unsigned long)block & 0x0000000F); - - blockv = vec_ld(0, block); - pixelsv1 = vec_ld(0, (unsigned char*)pixels); - pixelsv2 = vec_ld(16, (unsigned char*)pixels); - pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); - - if (rightside) - { - pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); - } - else - { - pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); - } - - blockv = vec_avg(blockv, pixelsv); - - vec_st(blockv, 0, block); - - pixels += line_size; - block += line_size; + /* block is 8 bytes-aligned, so we're either in the + left block (16 bytes-aligned) or in the right block (not) */ + int rightside = ((unsigned long)block & 0x0000000F); + + blockv = vec_ld(0, block); + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); + + if (rightside) { + pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); + } else { + pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); + } + + blockv = vec_avg(blockv, pixelsv); + + vec_st(blockv, 0, block); + + pixels += line_size; + block += line_size; } POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); - -#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } /* next one assumes that ((line_size % 8) == 0) */ void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); -#ifdef ALTIVEC_USE_REFERENCE_C_CODE - int j; + register int i; + register vector unsigned char pixelsv1, pixelsv2, pixelsavg; + register vector unsigned char blockv, temp1, temp2; + register vector unsigned short pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); - for (j = 0; j < 2; j++) { - int i; - const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - const uint32_t b = - (((const struct unaligned_32 *) (pixels + 1))->l); - uint32_t l0 = - (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; - uint32_t h0 = - ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - uint32_t l1, h1; - pixels += line_size; - for (i = 0; i < h; i += 2) { - uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); - l1 = (a & 0x03030303UL) + (b & 0x03030303UL); - h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = - h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); - pixels += line_size; + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } else { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + block += line_size; - a = (((const struct unaligned_32 *) (pixels))->l); - b = (((const struct unaligned_32 *) (pixels + 1))->l); - l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; - h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = - h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); pixels += line_size; - block += line_size; - } pixels += 4 - line_size * (h + 1); - block += 4 - line_size * h; } POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); - -#else /* ALTIVEC_USE_REFERENCE_C_CODE */ - register int i; - register vector unsigned char - pixelsv1, pixelsv2, - pixelsavg; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - -POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) - { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } - else - { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); -#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } /* next one assumes that ((line_size % 8) == 0) */ void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); -#ifdef ALTIVEC_USE_REFERENCE_C_CODE - int j; + register int i; + register vector unsigned char pixelsv1, pixelsv2, pixelsavg; + register vector unsigned char blockv, temp1, temp2; + register vector unsigned short pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); - for (j = 0; j < 2; j++) { - int i; - const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - const uint32_t b = - (((const struct unaligned_32 *) (pixels + 1))->l); - uint32_t l0 = - (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; - uint32_t h0 = - ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - uint32_t l1, h1; - pixels += line_size; - for (i = 0; i < h; i += 2) { - uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); - l1 = (a & 0x03030303UL) + (b & 0x03030303UL); - h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = - h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); - pixels += line_size; + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vcone); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } else { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + block += line_size; - a = (((const struct unaligned_32 *) (pixels))->l); - b = (((const struct unaligned_32 *) (pixels + 1))->l); - l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; - h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = - h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); pixels += line_size; - block += line_size; - } pixels += 4 - line_size * (h + 1); - block += 4 - line_size * h; } POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); - -#else /* ALTIVEC_USE_REFERENCE_C_CODE */ - register int i; - register vector unsigned char - pixelsv1, pixelsv2, - pixelsavg; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vcone); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) - { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } - else - { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); -#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } /* next one assumes that ((line_size % 16) == 0) */ void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) { POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); -#ifdef ALTIVEC_USE_REFERENCE_C_CODE - int j; + register int i; + register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; + register vector unsigned char blockv, temp1, temp2; + register vector unsigned short temp3, temp4, + pixelssum1, pixelssum2, pixelssum3, pixelssum4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); - for (j = 0; j < 4; j++) { - int i; - const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - const uint32_t b = - (((const struct unaligned_32 *) (pixels + 1))->l); - uint32_t l0 = - (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; - uint32_t h0 = - ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - uint32_t l1, h1; - pixels += line_size; - for (i = 0; i < h; i += 2) { - uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); - l1 = (a & 0x03030303UL) + (b & 0x03030303UL); - h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = - h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); - pixels += line_size; + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vctwo); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + + for (i = 0; i < h ; i++) { + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + + pixelssum3 = vec_add(pixelssum4, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + + blockv = vec_packsu(temp3, temp4); + + vec_st(blockv, 0, block); + block += line_size; - a = (((const struct unaligned_32 *) (pixels))->l); - b = (((const struct unaligned_32 *) (pixels + 1))->l); - l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; - h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = - h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); pixels += line_size; - block += line_size; - } pixels += 4 - line_size * (h + 1); - block += 4 - line_size * h; } POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); - -#else /* ALTIVEC_USE_REFERENCE_C_CODE */ - register int i; - register vector unsigned char - pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3, - pixelssum3, pixelssum4, temp4; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); - -POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vctwo); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); -#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } /* next one assumes that ((line_size % 16) == 0) */ void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) { POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); -#ifdef ALTIVEC_USE_REFERENCE_C_CODE - int j; + register int i; + register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; + register vector unsigned char blockv, temp1, temp2; + register vector unsigned short temp3, temp4, + pixelssum1, pixelssum2, pixelssum3, pixelssum4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); - for (j = 0; j < 4; j++) { - int i; - const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - const uint32_t b = - (((const struct unaligned_32 *) (pixels + 1))->l); - uint32_t l0 = - (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; - uint32_t h0 = - ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - uint32_t l1, h1; - pixels += line_size; - for (i = 0; i < h; i += 2) { - uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); - l1 = (a & 0x03030303UL) + (b & 0x03030303UL); - h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = - h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); - pixels += line_size; + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vcone); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + + for (i = 0; i < h ; i++) { + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + + pixelssum3 = vec_add(pixelssum4, vcone); + pixelssum1 = vec_add(pixelssum2, vcone); + + blockv = vec_packsu(temp3, temp4); + + vec_st(blockv, 0, block); + block += line_size; - a = (((const struct unaligned_32 *) (pixels))->l); - b = (((const struct unaligned_32 *) (pixels + 1))->l); - l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; - h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = - h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); pixels += line_size; - block += line_size; - } pixels += 4 - line_size * (h + 1); - block += 4 - line_size * h; } POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); - -#else /* ALTIVEC_USE_REFERENCE_C_CODE */ - register int i; - register vector unsigned char - pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3, - pixelssum3, pixelssum4, temp4; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); - -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vcone); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vcone); - pixelssum1 = vec_add(pixelssum2, vcone); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); -#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } -#ifdef CONFIG_DARWIN int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); - int sum; + int sum; + register const vector unsigned char vzero = + (const vector unsigned char)vec_splat_u8(0); + register vector signed short temp0, temp1, temp2, temp3, temp4, + temp5, temp6, temp7; POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); - register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0); - register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - { - register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1); - register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1); - register const_vector signed short vprod3 = (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1); - register const_vector unsigned char perm1 = (const_vector unsigned char) - AVV(0x02, 0x03, 0x00, 0x01, - 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, - 0x0E, 0x0F, 0x0C, 0x0D); - register const_vector unsigned char perm2 = (const_vector unsigned char) - AVV(0x04, 0x05, 0x06, 0x07, - 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, - 0x08, 0x09, 0x0A, 0x0B); - register const_vector unsigned char perm3 = (const_vector unsigned char) - AVV(0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07); - -#define ONEITERBUTTERFLY(i, res) \ - { \ - register vector unsigned char src1, src2, srcO; \ - register vector unsigned char dst1, dst2, dstO; \ - src1 = vec_ld(stride * i, src); \ - if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \ - src2 = vec_ld((stride * i) + 16, src); \ - srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \ - dst2 = vec_ld((stride * i) + 16, dst); \ - dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts */ \ - /* we're in the 8x8 function, we only care for the first 8 */ \ - register vector signed short srcV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \ - register vector signed short dstV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ - /* substractions inside the first butterfly */ \ - register vector signed short but0 = vec_sub(srcV, dstV); \ - register vector signed short op1 = vec_perm(but0, but0, perm1); \ - register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ - register vector signed short op2 = vec_perm(but1, but1, perm2); \ - register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ - register vector signed short op3 = vec_perm(but2, but2, perm3); \ - res = vec_mladd(but2, vprod3, op3); \ + { + register const vector signed short vprod1 =(const vector signed short) + { 1,-1, 1,-1, 1,-1, 1,-1 }; + register const vector signed short vprod2 =(const vector signed short) + { 1, 1,-1,-1, 1, 1,-1,-1 }; + register const vector signed short vprod3 =(const vector signed short) + { 1, 1, 1, 1,-1,-1,-1,-1 }; + register const vector unsigned char perm1 = (const vector unsigned char) + {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; + register const vector unsigned char perm2 = (const vector unsigned char) + {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; + register const vector unsigned char perm3 = (const vector unsigned char) + {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; + +#define ONEITERBUTTERFLY(i, res) \ + { \ + register vector unsigned char src1, src2, srcO; \ + register vector unsigned char dst1, dst2, dstO; \ + register vector signed short srcV, dstV; \ + register vector signed short but0, but1, but2, op1, op2, op3; \ + src1 = vec_ld(stride * i, src); \ + src2 = vec_ld((stride * i) + 15, src); \ + srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ + dst1 = vec_ld(stride * i, dst); \ + dst2 = vec_ld((stride * i) + 15, dst); \ + dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ + /* promote the unsigned chars to signed shorts */ \ + /* we're in the 8x8 function, we only care for the first 8 */ \ + srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ + (vector signed char)srcO); \ + dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ + (vector signed char)dstO); \ + /* subtractions inside the first butterfly */ \ + but0 = vec_sub(srcV, dstV); \ + op1 = vec_perm(but0, but0, perm1); \ + but1 = vec_mladd(but0, vprod1, op1); \ + op2 = vec_perm(but1, but1, perm2); \ + but2 = vec_mladd(but1, vprod2, op2); \ + op3 = vec_perm(but2, but2, perm3); \ + res = vec_mladd(but2, vprod3, op3); \ } ONEITERBUTTERFLY(0, temp0); ONEITERBUTTERFLY(1, temp1); @@ -1370,9 +1056,9 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); ONEITERBUTTERFLY(5, temp5); ONEITERBUTTERFLY(6, temp6); ONEITERBUTTERFLY(7, temp7); - } + } #undef ONEITERBUTTERFLY - { + { register vector signed int vsum; register vector signed short line0 = vec_add(temp0, temp1); register vector signed short line1 = vec_sub(temp0, temp1); @@ -1412,108 +1098,128 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); vsum = vec_sums(vsum, (vector signed int)vzero); vsum = vec_splat(vsum, 3); vec_ste(vsum, 0, &sum); - } + } POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); - return sum; + return sum; } /* - 16x8 works with 16 elements ; it allows to avoid replicating - loads, and give the compiler more rooms for scheduling. - It's only used from inside hadamard8_diff16_altivec. - - Unfortunately, it seems gcc-3.3 is a bit dumb, and - the compiled code has a LOT of spill code, it seems - gcc (unlike xlc) cannot keep everything in registers - by itself. The following code include hand-made - registers allocation. It's not clean, but on - a 7450 the resulting code is much faster (best case - fall from 700+ cycles to 550). - - xlc doesn't add spill code, but it doesn't know how to - schedule for the 7450, and its code isn't much faster than - gcc-3.3 on the 7450 (but uses 25% less instructions...) - - On the 970, the hand-made RA is still a win (arount 690 - vs. around 780), but xlc goes to around 660 on the - regular C code... +16x8 works with 16 elements; it allows to avoid replicating loads, and +give the compiler more rooms for scheduling. It's only used from +inside hadamard8_diff16_altivec. + +Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a LOT +of spill code, it seems gcc (unlike xlc) cannot keep everything in registers +by itself. The following code include hand-made registers allocation. It's not +clean, but on a 7450 the resulting code is much faster (best case fall from +700+ cycles to 550). + +xlc doesn't add spill code, but it doesn't know how to schedule for the 7450, +and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% less +instructions...) + +On the 970, the hand-made RA is still a win (around 690 vs. around 780), but +xlc goes to around 660 on the regular C code... */ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { - int sum; - register vector signed short - temp0 asm ("v0"), - temp1 asm ("v1"), - temp2 asm ("v2"), - temp3 asm ("v3"), - temp4 asm ("v4"), - temp5 asm ("v5"), - temp6 asm ("v6"), - temp7 asm ("v7"); - register vector signed short - temp0S asm ("v8"), - temp1S asm ("v9"), - temp2S asm ("v10"), - temp3S asm ("v11"), - temp4S asm ("v12"), - temp5S asm ("v13"), - temp6S asm ("v14"), - temp7S asm ("v15"); - register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0); - { - register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1); - register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1); - register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1); - register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char) - AVV(0x02, 0x03, 0x00, 0x01, - 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, - 0x0E, 0x0F, 0x0C, 0x0D); - register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char) - AVV(0x04, 0x05, 0x06, 0x07, - 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, - 0x08, 0x09, 0x0A, 0x0B); - register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char) - AVV(0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07); - -#define ONEITERBUTTERFLY(i, res1, res2) \ - { \ - register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \ - register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \ - src1 = vec_ld(stride * i, src); \ - src2 = vec_ld((stride * i) + 16, src); \ - register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - dst2 = vec_ld((stride * i) + 16, dst); \ - register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts */ \ - register vector signed short srcV asm ("v24") = \ - (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \ - register vector signed short dstV asm ("v25") = \ - (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ - register vector signed short srcW asm ("v26") = \ - (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \ - register vector signed short dstW asm ("v27") = \ - (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \ - /* substractions inside the first butterfly */ \ - register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \ - register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \ - register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \ - register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \ - register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \ - register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \ - register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \ - register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \ - register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \ - register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \ - register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \ - res1 = vec_mladd(but2, vprod3, op3); \ - register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \ - res2 = vec_mladd(but2S, vprod3, op3S); \ + int sum; + register vector signed short + temp0 __asm__ ("v0"), + temp1 __asm__ ("v1"), + temp2 __asm__ ("v2"), + temp3 __asm__ ("v3"), + temp4 __asm__ ("v4"), + temp5 __asm__ ("v5"), + temp6 __asm__ ("v6"), + temp7 __asm__ ("v7"); + register vector signed short + temp0S __asm__ ("v8"), + temp1S __asm__ ("v9"), + temp2S __asm__ ("v10"), + temp3S __asm__ ("v11"), + temp4S __asm__ ("v12"), + temp5S __asm__ ("v13"), + temp6S __asm__ ("v14"), + temp7S __asm__ ("v15"); + register const vector unsigned char vzero __asm__ ("v31") = + (const vector unsigned char)vec_splat_u8(0); + { + register const vector signed short vprod1 __asm__ ("v16") = + (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 }; + register const vector signed short vprod2 __asm__ ("v17") = + (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 }; + register const vector signed short vprod3 __asm__ ("v18") = + (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 }; + register const vector unsigned char perm1 __asm__ ("v19") = + (const vector unsigned char) + {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; + register const vector unsigned char perm2 __asm__ ("v20") = + (const vector unsigned char) + {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; + register const vector unsigned char perm3 __asm__ ("v21") = + (const vector unsigned char) + {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; + +#define ONEITERBUTTERFLY(i, res1, res2) \ + { \ + register vector unsigned char src1 __asm__ ("v22"), \ + src2 __asm__ ("v23"), \ + dst1 __asm__ ("v24"), \ + dst2 __asm__ ("v25"), \ + srcO __asm__ ("v22"), \ + dstO __asm__ ("v23"); \ + \ + register vector signed short srcV __asm__ ("v24"), \ + dstV __asm__ ("v25"), \ + srcW __asm__ ("v26"), \ + dstW __asm__ ("v27"), \ + but0 __asm__ ("v28"), \ + but0S __asm__ ("v29"), \ + op1 __asm__ ("v30"), \ + but1 __asm__ ("v22"), \ + op1S __asm__ ("v23"), \ + but1S __asm__ ("v24"), \ + op2 __asm__ ("v25"), \ + but2 __asm__ ("v26"), \ + op2S __asm__ ("v27"), \ + but2S __asm__ ("v28"), \ + op3 __asm__ ("v29"), \ + op3S __asm__ ("v30"); \ + \ + src1 = vec_ld(stride * i, src); \ + src2 = vec_ld((stride * i) + 16, src); \ + srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ + dst1 = vec_ld(stride * i, dst); \ + dst2 = vec_ld((stride * i) + 16, dst); \ + dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ + /* promote the unsigned chars to signed shorts */ \ + srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ + (vector signed char)srcO); \ + dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ + (vector signed char)dstO); \ + srcW = (vector signed short)vec_mergel((vector signed char)vzero, \ + (vector signed char)srcO); \ + dstW = (vector signed short)vec_mergel((vector signed char)vzero, \ + (vector signed char)dstO); \ + /* subtractions inside the first butterfly */ \ + but0 = vec_sub(srcV, dstV); \ + but0S = vec_sub(srcW, dstW); \ + op1 = vec_perm(but0, but0, perm1); \ + but1 = vec_mladd(but0, vprod1, op1); \ + op1S = vec_perm(but0S, but0S, perm1); \ + but1S = vec_mladd(but0S, vprod1, op1S); \ + op2 = vec_perm(but1, but1, perm2); \ + but2 = vec_mladd(but1, vprod2, op2); \ + op2S = vec_perm(but1S, but1S, perm2); \ + but2S = vec_mladd(but1S, vprod2, op2S); \ + op3 = vec_perm(but2, but2, perm3); \ + res1 = vec_mladd(but2, vprod3, op3); \ + op3S = vec_perm(but2S, but2S, perm3); \ + res2 = vec_mladd(but2S, vprod3, op3S); \ } ONEITERBUTTERFLY(0, temp0, temp0S); ONEITERBUTTERFLY(1, temp1, temp1S); @@ -1523,10 +1229,16 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, ONEITERBUTTERFLY(5, temp5, temp5S); ONEITERBUTTERFLY(6, temp6, temp6S); ONEITERBUTTERFLY(7, temp7, temp7S); - } + } #undef ONEITERBUTTERFLY - { + { register vector signed int vsum; + register vector signed short line0S, line1S, line2S, line3S, line4S, + line5S, line6S, line7S, line0BS,line2BS, + line1BS,line3BS,line4BS,line6BS,line5BS, + line7BS,line0CS,line4CS,line1CS,line5CS, + line2CS,line6CS,line3CS,line7CS; + register vector signed short line0 = vec_add(temp0, temp1); register vector signed short line1 = vec_sub(temp0, temp1); register vector signed short line2 = vec_add(temp2, temp3); @@ -1563,32 +1275,32 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, vsum = vec_sum4s(vec_abs(line6C), vsum); vsum = vec_sum4s(vec_abs(line7C), vsum); - register vector signed short line0S = vec_add(temp0S, temp1S); - register vector signed short line1S = vec_sub(temp0S, temp1S); - register vector signed short line2S = vec_add(temp2S, temp3S); - register vector signed short line3S = vec_sub(temp2S, temp3S); - register vector signed short line4S = vec_add(temp4S, temp5S); - register vector signed short line5S = vec_sub(temp4S, temp5S); - register vector signed short line6S = vec_add(temp6S, temp7S); - register vector signed short line7S = vec_sub(temp6S, temp7S); - - register vector signed short line0BS = vec_add(line0S, line2S); - register vector signed short line2BS = vec_sub(line0S, line2S); - register vector signed short line1BS = vec_add(line1S, line3S); - register vector signed short line3BS = vec_sub(line1S, line3S); - register vector signed short line4BS = vec_add(line4S, line6S); - register vector signed short line6BS = vec_sub(line4S, line6S); - register vector signed short line5BS = vec_add(line5S, line7S); - register vector signed short line7BS = vec_sub(line5S, line7S); - - register vector signed short line0CS = vec_add(line0BS, line4BS); - register vector signed short line4CS = vec_sub(line0BS, line4BS); - register vector signed short line1CS = vec_add(line1BS, line5BS); - register vector signed short line5CS = vec_sub(line1BS, line5BS); - register vector signed short line2CS = vec_add(line2BS, line6BS); - register vector signed short line6CS = vec_sub(line2BS, line6BS); - register vector signed short line3CS = vec_add(line3BS, line7BS); - register vector signed short line7CS = vec_sub(line3BS, line7BS); + line0S = vec_add(temp0S, temp1S); + line1S = vec_sub(temp0S, temp1S); + line2S = vec_add(temp2S, temp3S); + line3S = vec_sub(temp2S, temp3S); + line4S = vec_add(temp4S, temp5S); + line5S = vec_sub(temp4S, temp5S); + line6S = vec_add(temp6S, temp7S); + line7S = vec_sub(temp6S, temp7S); + + line0BS = vec_add(line0S, line2S); + line2BS = vec_sub(line0S, line2S); + line1BS = vec_add(line1S, line3S); + line3BS = vec_sub(line1S, line3S); + line4BS = vec_add(line4S, line6S); + line6BS = vec_sub(line4S, line6S); + line5BS = vec_add(line5S, line7S); + line7BS = vec_sub(line5S, line7S); + + line0CS = vec_add(line0BS, line4BS); + line4CS = vec_sub(line0BS, line4BS); + line1CS = vec_add(line1BS, line5BS); + line5CS = vec_sub(line1BS, line5BS); + line2CS = vec_add(line2BS, line6BS); + line6CS = vec_sub(line2BS, line6BS); + line3CS = vec_add(line3BS, line7BS); + line7CS = vec_sub(line3BS, line7BS); vsum = vec_sum4s(vec_abs(line0CS), vsum); vsum = vec_sum4s(vec_abs(line1CS), vsum); @@ -1601,175 +1313,144 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, vsum = vec_sums(vsum, (vector signed int)vzero); vsum = vec_splat(vsum, 3); vec_ste(vsum, 0, &sum); - } - return sum; + } + return sum; } int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1); - int score; + int score; POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1); - score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - if (h==16) { - dst += 8*stride; - src += 8*stride; - score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - } + score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); + if (h==16) { + dst += 8*stride; + src += 8*stride; + score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); + } POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1); - return score; + return score; } -#endif //CONFIG_DARWIN -int has_altivec(void) +static void vorbis_inverse_coupling_altivec(float *mag, float *ang, + int blocksize) { -#ifdef __AMIGAOS4__ - ULONG result = 0; - extern struct ExecIFace *IExec; - - IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE); - if (result == VECTORTYPE_ALTIVEC) return 1; - return 0; -#else /* __AMIGAOS4__ */ - -#ifdef CONFIG_DARWIN - int sels[2] = {CTL_HW, HW_VECTORUNIT}; - int has_vu = 0; - size_t len = sizeof(has_vu); - int err; - - err = sysctl(sels, 2, &has_vu, &len, NULL, 0); - - if (err == 0) return (has_vu != 0); -#else /* CONFIG_DARWIN */ -/* no Darwin, do it the brute-force way */ -/* this is borrowed from the libmpeg2 library */ - { - signal (SIGILL, sigill_handler); - if (sigsetjmp (jmpbuf, 1)) { - signal (SIGILL, SIG_DFL); - } else { - canjump = 1; - - asm volatile ("mtspr 256, %0\n\t" - "vand %%v0, %%v0, %%v0" - : - : "r" (-1)); - - signal (SIGILL, SIG_DFL); - return 1; - } + int i; + vector float m, a; + vector bool int t0, t1; + const vector unsigned int v_31 = //XXX + vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); + for (i = 0; i < blocksize; i += 4) { + m = vec_ld(0, mag+i); + a = vec_ld(0, ang+i); + t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); + t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); + a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); + t0 = (vector bool int)vec_and(a, t1); + t1 = (vector bool int)vec_andc(a, t1); + a = vec_sub(m, (vector float)t1); + m = vec_add(m, (vector float)t0); + vec_stl(a, 0, ang+i); + vec_stl(m, 0, mag+i); } -#endif /* CONFIG_DARWIN */ - return 0; -#endif /* __AMIGAOS4__ */ } /* next one assumes that ((line_size % 8) == 0) */ void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); -#ifdef ALTIVEC_USE_REFERENCE_C_CODE + register int i; + register vector unsigned char pixelsv1, pixelsv2, pixelsavg; + register vector unsigned char blockv, temp1, temp2, blocktemp; + register vector unsigned short pixelssum1, pixelssum2, temp3; + + register const vector unsigned char vczero = (const vector unsigned char) + vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short) + vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); - int j; POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); - for (j = 0; j < 2; j++) { - int i; - const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); - uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; - uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - uint32_t l1, h1; - pixels += line_size; - for (i = 0; i < h; i += 2) { - uint32_t a = (((const struct unaligned_32 *) (pixels))->l); - uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); - l1 = (a & 0x03030303UL) + (b & 0x03030303UL); - h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); - pixels += line_size; - block += line_size; - a = (((const struct unaligned_32 *) (pixels))->l); - b = (((const struct unaligned_32 *) (pixels + 1))->l); - l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; - h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); - *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); - pixels += line_size; - block += line_size; - } pixels += 4 - line_size * (h + 1); - block += 4 - line_size * h; - } -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); -#else /* ALTIVEC_USE_REFERENCE_C_CODE */ - register int i; - register vector unsigned char - pixelsv1, pixelsv2, - pixelsavg; - register vector unsigned char - blockv, temp1, temp2, blocktemp; - register vector unsigned short - pixelssum1, pixelssum2, temp3; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { + pixelsv2 = temp2; + } else { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) { + blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } else { + blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + blockv = vec_avg(blocktemp, blockv); + vec_st(blockv, 0, block); -POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) - { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } - else - { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - blockv = vec_avg(blocktemp, blockv); - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } + block += line_size; + pixels += line_size; + } POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); -#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx) +{ + c->pix_abs[0][1] = sad16_x2_altivec; + c->pix_abs[0][2] = sad16_y2_altivec; + c->pix_abs[0][3] = sad16_xy2_altivec; + c->pix_abs[0][0] = sad16_altivec; + c->pix_abs[1][0] = sad8_altivec; + c->sad[0]= sad16_altivec; + c->sad[1]= sad8_altivec; + c->pix_norm1 = pix_norm1_altivec; + c->sse[1]= sse8_altivec; + c->sse[0]= sse16_altivec; + c->pix_sum = pix_sum_altivec; + c->diff_pixels = diff_pixels_altivec; + c->get_pixels = get_pixels_altivec; + c->clear_block = clear_block_altivec; + c->add_bytes= add_bytes_altivec; + c->put_pixels_tab[0][0] = put_pixels16_altivec; + /* the two functions do the same thing, so use the same code */ + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; + c->avg_pixels_tab[0][0] = avg_pixels16_altivec; + c->avg_pixels_tab[1][0] = avg_pixels8_altivec; + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; + c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; + c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; + + c->hadamard8_diff[0] = hadamard8_diff16_altivec; + c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; + if (CONFIG_VORBIS_DECODER) + c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec; }