--- /dev/null
- const uint8_t *restrict pixels, ptrdiff_t line_size)
+/*
+ * SIMD-optimized pixel operations
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/pixblockdsp.h"
+#include "asm.h"
+
+static void get_pixels_mvi(int16_t *restrict block,
- pixels += line_size;
++ const uint8_t *restrict pixels, ptrdiff_t stride)
+{ /*
+   * Fill block with 8 rows read from pixels: each row is fetched as one
+   * 64-bit word and written out as two 64-bit words (block and block + 4).
+   * The source advances by stride bytes per row, the destination by 8
+   * int16_t entries.
+   * NOTE(review): ldq/stq/unpkbw come from asm.h, which is not visible
+   * here; presumably a 64-bit load, a 64-bit store and an unpack of the low
+   * four bytes into four 16-bit words -- confirm there.
+   */
+ int h = 8; /* rows left to process */
+
+ do {
+ uint64_t p;
+
+ p = ldq(pixels); /* one source row */
+ stq(unpkbw(p), block); /* low half of the row */
+ stq(unpkbw(p >> 32), block + 4); /* high half of the row */
+
- int stride) {
++ pixels += stride;
+ block += 8;
+ } while (--h);
+}
+
+static void diff_pixels_mvi(int16_t *block, const uint8_t *s1, const uint8_t *s2,
++ ptrdiff_t stride)
++{ /*
+    * Store s1 - s2 for 8 rows into block, 8 int16_t entries per row.
+    * Each row is read as one 64-bit word per source and subtracted as a
+    * whole word; the steps after the subtraction repair the byte lanes.
+    * NOTE(review): cmpbge/zap/unpkbw/ldq/stq are defined in asm.h (not
+    * visible here). The lane-level comments below assume the usual Alpha
+    * CMPBGE/ZAP/UNPKBW semantics -- confirm there.
+    */
+ int h = 8; /* rows left to process */
+ uint64_t mask = 0x4040;
+
+ mask |= mask << 16;
+ mask |= mask << 32; /* mask is now 0x4040404040404040 */
+ do {
+ uint64_t x, y, c, d, a;
+ uint64_t signs;
+
+ x = ldq(s1);
+ y = ldq(s2);
+ c = cmpbge(x, y); /* presumably one bit per byte lane, set where x >= y */
+ d = x - y; /* 64-bit subtract: borrows leak into the next-higher lane */
+ a = zap(mask, c); /* We use 0x4040404040404040 here... */
+ d += 4 * a; /* ...so we can use s4addq here. (4 * 0x40 = 0x100, i.e. +1 in the next lane) */
+ signs = zap(-1, c); /* presumably 0xff in every lane where x < y */
+
+ stq(unpkbw(d) | (unpkbw(signs) << 8), block); /* low four lanes, signs form the high bytes */
+ stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4); /* high four lanes */
+
+ s1 += stride;
+ s2 += stride;
+ block += 8;
+ } while (--h);
+}
+
+/**
+ * Install the MVI implementations into the pixblock DSP context.
+ *
+ * Nothing is changed unless amask(AMASK_MVI) returns 0.
+ *
+ * @param c              context whose function pointers are updated
+ * @param avctx          not used by this function
+ * @param high_bit_depth when non-zero, get_pixels is left untouched;
+ *                       diff_pixels is replaced either way
+ */
+av_cold void ff_pixblockdsp_init_alpha(PixblockDSPContext *c, AVCodecContext *avctx,
+                                       unsigned high_bit_depth)
+{
+    if (amask(AMASK_MVI) != 0)
+        return;
+
+    if (high_bit_depth == 0) {
+        c->get_pixels = get_pixels_mvi;
+    }
+
+    /* Deliberately outside the bit-depth check: always installed. */
+    c->diff_pixels = diff_pixels_mvi;
+}
--- /dev/null
- const uint8_t *src2, int stride);
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ * Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
+#define AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1,
- ptrdiff_t line_size);
++ const uint8_t *src2, ptrdiff_t stride);
+/* MSA get_pixels variants; both fill dest from 8 source rows that are
+ * stride bytes apart. The qualifier macro and parameter name match the
+ * definitions in pixblockdsp_msa.c and the other prototypes in this header. */
+void ff_get_pixels_16_msa(int16_t *av_restrict dest, const uint8_t *src,
+                          ptrdiff_t stride);
+void ff_get_pixels_8_msa(int16_t *av_restrict dest, const uint8_t *src,
+                         ptrdiff_t stride);
+
+void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels,
- const uint8_t *src2, int stride);
++ ptrdiff_t stride);
+void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
++ const uint8_t *src2, ptrdiff_t stride);
+
+#endif // #ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
--- /dev/null
- ptrdiff_t line_size)
+/*
+ * Loongson SIMD optimized pixblockdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "pixblockdsp_mips.h"
+#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
+
+void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels,
- MMI_LDXC1(%[ftmp2], %[pixels], %[line_size], 0x00)
++ ptrdiff_t stride)
+{ /*
+   * Expand 8 rows of 8 bytes from pixels into 16-bit entries in block
+   * (0x80 bytes written in total).
+   * The asm is fully unrolled into four groups of two rows. Each group
+   * loads one row at pixels and one at pixels + stride, interleaves both
+   * with the zeroed ftmp0 (punpcklbh / punpckhbh), stores the four results
+   * at consecutive 8-byte offsets in block, and then advances pixels by
+   * stride_x2 (= stride << 1); the last group does not advance.
+   * NOTE(review): the MMI_* and PTR_* macros come from mmiutils.h /
+   * asmdefs.h, which are not visible here.
+   */
+ double ftmp[7];
+ DECLARE_VAR_ALL64;
+ DECLARE_VAR_ADDRT;
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" /* ftmp0 = 0, the interleave partner */
+
+ MMI_LDC1(%[ftmp1], %[pixels], 0x00) /* rows 0 and 1 */
- PTR_ADDU "%[pixels], %[pixels], %[line_size_x2] \n\t"
++ MMI_LDXC1(%[ftmp2], %[pixels], %[stride], 0x00)
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ MMI_SDC1(%[ftmp3], %[block], 0x00)
+ MMI_SDC1(%[ftmp4], %[block], 0x08)
+ MMI_SDC1(%[ftmp5], %[block], 0x10)
+ MMI_SDC1(%[ftmp6], %[block], 0x18)
- MMI_LDXC1(%[ftmp2], %[pixels], %[line_size], 0x00)
++ PTR_ADDU "%[pixels], %[pixels], %[stride_x2] \n\t"
+
+ MMI_LDC1(%[ftmp1], %[pixels], 0x00) /* rows 2 and 3 */
- PTR_ADDU "%[pixels], %[pixels], %[line_size_x2] \n\t"
++ MMI_LDXC1(%[ftmp2], %[pixels], %[stride], 0x00)
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ MMI_SDC1(%[ftmp3], %[block], 0x20)
+ MMI_SDC1(%[ftmp4], %[block], 0x28)
+ MMI_SDC1(%[ftmp5], %[block], 0x30)
+ MMI_SDC1(%[ftmp6], %[block], 0x38)
- MMI_LDXC1(%[ftmp2], %[pixels], %[line_size], 0x00)
++ PTR_ADDU "%[pixels], %[pixels], %[stride_x2] \n\t"
+
+ MMI_LDC1(%[ftmp1], %[pixels], 0x00) /* rows 4 and 5 */
- PTR_ADDU "%[pixels], %[pixels], %[line_size_x2] \n\t"
++ MMI_LDXC1(%[ftmp2], %[pixels], %[stride], 0x00)
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ MMI_SDC1(%[ftmp3], %[block], 0x40)
+ MMI_SDC1(%[ftmp4], %[block], 0x48)
+ MMI_SDC1(%[ftmp5], %[block], 0x50)
+ MMI_SDC1(%[ftmp6], %[block], 0x58)
- MMI_LDXC1(%[ftmp2], %[pixels], %[line_size], 0x00)
++ PTR_ADDU "%[pixels], %[pixels], %[stride_x2] \n\t"
+
+ MMI_LDC1(%[ftmp1], %[pixels], 0x00) /* rows 6 and 7 */
- : [block]"r"((mips_reg)block), [line_size]"r"((mips_reg)line_size),
- [line_size_x2]"r"((mips_reg)(line_size<<1))
++ MMI_LDXC1(%[ftmp2], %[pixels], %[stride], 0x00)
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
+ MMI_SDC1(%[ftmp3], %[block], 0x60)
+ MMI_SDC1(%[ftmp4], %[block], 0x68)
+ MMI_SDC1(%[ftmp5], %[block], 0x70)
+ MMI_SDC1(%[ftmp6], %[block], 0x78)
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]),
+ RESTRICT_ASM_ALL64
+ RESTRICT_ASM_ADDRT
+ [pixels]"+&r"(pixels) /* read-write: advanced inside the asm */
- const uint8_t *src2, int stride)
++ : [block]"r"((mips_reg)block), [stride]"r"((mips_reg)stride),
++ [stride_x2]"r"((mips_reg)(stride<<1))
+ : "memory"
+ );
+}
+
+void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
++ const uint8_t *src2, ptrdiff_t stride)
+{ /*
+   * Store src1 - src2 for 8 rows of 8 bytes into block as 16-bit values.
+   * tmp0 counts down from 8. Each pass loads one row from each source,
+   * interleaves both with the zeroed ftmp4 into 16-bit lanes, subtracts
+   * with psubh and stores 0x10 bytes to block. block then advances by
+   * 0x10 and src1/src2 by stride.
+   * NOTE(review): the MMI_* and PTR_* macros come from mmiutils.h /
+   * asmdefs.h, which are not visible here.
+   */
+ double ftmp[5];
+ mips_reg tmp[1];
+ DECLARE_VAR_ALL64;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x08 \n\t" /* row counter */
+ "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t" /* ftmp4 = 0, the interleave partner */
+ "1: \n\t"
+ MMI_LDC1(%[ftmp0], %[src1], 0x00)
+ "or %[ftmp1], %[ftmp0], %[ftmp0] \n\t" /* copy so low and high halves can be unpacked separately */
+ MMI_LDC1(%[ftmp2], %[src2], 0x00)
+ "or %[ftmp3], %[ftmp2], %[ftmp2] \n\t" /* same for the src2 row */
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t" /* low half: src1 - src2 */
+ "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" /* high half: src1 - src2 */
+ MMI_SDC1(%[ftmp0], %[block], 0x00)
+ MMI_SDC1(%[ftmp1], %[block], 0x08)
+ PTR_ADDI "%[tmp0], %[tmp0], -0x01 \n\t"
+ PTR_ADDIU "%[block], %[block], 0x10 \n\t"
+ PTR_ADDU "%[src1], %[src1], %[stride] \n\t"
+ PTR_ADDU "%[src2], %[src2], %[stride] \n\t"
+ "bgtz %[tmp0], 1b \n\t" /* loop while rows remain */
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]),
+ [tmp0]"=&r"(tmp[0]),
+ RESTRICT_ASM_ALL64
+ [block]"+&r"(block), [src1]"+&r"(src1),
+ [src2]"+&r"(src2)
+ : [stride]"r"((mips_reg)stride)
+ : "memory"
+ );
+}
--- /dev/null
- const uint8_t *src2, int stride)
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "pixblockdsp_mips.h"
+
+/**
+ * Write the per-sample difference of two 8-row sources into block.
+ *
+ * Eight rows are loaded from each source, the row pairs are interleaved
+ * and reduced with HSUB_UB4_SH, and the eight results are stored to block
+ * 8 int16_t entries apart.
+ * NOTE(review): LD_UB8 / ILVR_B4_SH / HSUB_UB4_SH / ST_SH8 are defined in
+ * generic_macros_msa.h (not visible here); the operand order that yields
+ * src1 - src2 should be confirmed there.
+ *
+ * @param block  destination
+ * @param src1   first source
+ * @param src2   second source
+ * @param stride distance between source rows; ptrdiff_t so the value the
+ *               public wrapper receives is not narrowed to 32 bits
+ */
+static void diff_pixels_msa(int16_t *block, const uint8_t *src1,
+                            const uint8_t *src2, ptrdiff_t stride)
+{
+    v16u8 in10, in11, in12, in13, in14, in15, in16, in17;
+    v16u8 in20, in21, in22, in23, in24, in25, in26, in27;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+
+    LD_UB8(src1, stride, in10, in11, in12, in13, in14, in15, in16, in17);
+    LD_UB8(src2, stride, in20, in21, in22, in23, in24, in25, in26, in27);
+    ILVR_B4_SH(in10, in20, in11, in21, in12, in22, in13, in23,
+               out0, out1, out2, out3);
+    ILVR_B4_SH(in14, in24, in15, in25, in16, in26, in17, in27,
+               out4, out5, out6, out7);
+    HSUB_UB4_SH(out0, out1, out2, out3, out0, out1, out2, out3);
+    HSUB_UB4_SH(out4, out5, out6, out7, out4, out5, out6, out7);
+    ST_SH8(out0, out1, out2, out3, out4, out5, out6, out7, block, 8);
+}
+
+/**
+ * Expand rows of 8-bit samples into 16-bit samples, four rows per pass.
+ *
+ * Each pass loads four source rows, interleaves them with a zero vector
+ * and stores the four results dst_stride * 2 bytes apart. Only
+ * height >> 2 passes run, so a height that is not a multiple of 4 leaves
+ * the trailing rows untouched.
+ *
+ * @param src        source samples
+ * @param src_stride distance between source rows; ptrdiff_t so the
+ *                   caller's stride is not narrowed to 32 bits
+ * @param dst        destination
+ * @param dst_stride destination row pitch in int16_t entries
+ * @param height     number of rows
+ */
+static void copy_8bit_to_16bit_width8_msa(const uint8_t *src, ptrdiff_t src_stride,
+                                          int16_t *dst, int32_t dst_stride,
+                                          int32_t height)
+{
+    uint8_t *dst_ptr;
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3;
+    v16i8 zero = { 0 };
+
+    /* The destination is addressed in bytes from here on. */
+    dst_ptr = (uint8_t *) dst;
+
+    for (cnt = (height >> 2); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        ILVR_B4_UB(zero, src0, zero, src1, zero, src2, zero, src3,
+                   src0, src1, src2, src3);
+
+        ST_UB4(src0, src1, src2, src3, dst_ptr, (dst_stride * 2));
+        dst_ptr += (4 * 2 * dst_stride);
+    }
+}
+
+/**
+ * Copy a region in 16-byte-wide column strips, eight rows per pass.
+ *
+ * width >> 4 strips are processed, each with height >> 3 passes, so any
+ * remainder in either dimension is not copied.
+ *
+ * @param src        source
+ * @param src_stride distance between source rows; ptrdiff_t so the
+ *                   caller's stride is not narrowed to 32 bits
+ * @param dst        destination
+ * @param dst_stride distance between destination rows
+ * @param height     number of rows
+ * @param width      number of bytes per row
+ */
+static void copy_16multx8mult_msa(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        /* Restart at the top of the current strip. */
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        /* Move on to the next strip. */
+        src += 16;
+        dst += 16;
+    }
+}
+
+/**
+ * Copy height rows of one 16-byte-wide column.
+ *
+ * The row grouping depends on height: multiples of 12 use 8 + 4 rows per
+ * pass, multiples of 8 go through copy_16multx8mult_msa(), and multiples
+ * of 4 use 4 rows per pass. Any other height copies nothing.
+ *
+ * @param src        source
+ * @param src_stride distance between source rows; ptrdiff_t so the
+ *                   caller's stride is not narrowed to 32 bits
+ * @param dst        destination
+ * @param dst_stride distance between destination rows
+ * @param height     number of rows
+ */
+static void copy_width16_msa(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            /* First 8 of the 12 rows. */
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            /* Remaining 4 rows. */
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+/**
+ * get_pixels entry point that hands the work to copy_width16_msa() with a
+ * destination stride of 16 and a height of 8.
+ *
+ * @param dest   destination block, passed on as a byte pointer
+ * @param src    source samples
+ * @param stride distance between source rows
+ */
+void ff_get_pixels_16_msa(int16_t *av_restrict dest, const uint8_t *src,
+                          ptrdiff_t stride)
+{
+    uint8_t *out = (uint8_t *) dest;
+
+    copy_width16_msa(src, stride, out, 16, 8);
+}
+
+/**
+ * get_pixels entry point that hands the work to
+ * copy_8bit_to_16bit_width8_msa() with a destination stride of 8 and a
+ * height of 8.
+ *
+ * @param dest   destination block
+ * @param src    source samples
+ * @param stride distance between source rows
+ */
+void ff_get_pixels_8_msa(int16_t *av_restrict dest, const uint8_t *src,
+                         ptrdiff_t stride)
+{
+    const int32_t dst_stride = 8;
+    const int32_t rows       = 8;
+
+    copy_8bit_to_16bit_width8_msa(src, stride, dest, dst_stride, rows);
+}
+
+void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1,
++ const uint8_t *src2, ptrdiff_t stride)
+{ /* Exported entry point: forwards all four arguments to the static diff_pixels_msa(). */
+ diff_pixels_msa(block, src1, src2, stride);
+}
#include "avcodec.h"
#include "pixblockdsp.h"
-#define BIT_DEPTH 16
-#include "pixblockdsp_template.c"
-#undef BIT_DEPTH
+static void get_pixels_16_c(int16_t *av_restrict block, const uint8_t *pixels,
- ptrdiff_t line_size)
++ ptrdiff_t stride)
+{ /*
+   * Copy 8 rows from pixels into block without converting them. Row k is
+   * read at pixels + k * stride (bytes) and written at block + k * 8
+   * (int16_t entries).
+   * NOTE(review): AV_COPY128U is defined elsewhere; presumably an
+   * unaligned 128-bit (16-byte) copy, i.e. eight 16-bit samples per row.
+   */
- AV_COPY128U(block + 0 * 8, pixels + 0 * line_size);
- AV_COPY128U(block + 1 * 8, pixels + 1 * line_size);
- AV_COPY128U(block + 2 * 8, pixels + 2 * line_size);
- AV_COPY128U(block + 3 * 8, pixels + 3 * line_size);
- AV_COPY128U(block + 4 * 8, pixels + 4 * line_size);
- AV_COPY128U(block + 5 * 8, pixels + 5 * line_size);
- AV_COPY128U(block + 6 * 8, pixels + 6 * line_size);
- AV_COPY128U(block + 7 * 8, pixels + 7 * line_size);
++ AV_COPY128U(block + 0 * 8, pixels + 0 * stride);
++ AV_COPY128U(block + 1 * 8, pixels + 1 * stride);
++ AV_COPY128U(block + 2 * 8, pixels + 2 * stride);
++ AV_COPY128U(block + 3 * 8, pixels + 3 * stride);
++ AV_COPY128U(block + 4 * 8, pixels + 4 * stride);
++ AV_COPY128U(block + 5 * 8, pixels + 5 * stride);
++ AV_COPY128U(block + 6 * 8, pixels + 6 * stride);
++ AV_COPY128U(block + 7 * 8, pixels + 7 * stride);
+}
+
+static void get_pixels_8_c(int16_t *av_restrict block, const uint8_t *pixels,
- ptrdiff_t line_size)
++ ptrdiff_t stride)
+{ /*
+   * Widen an 8x8 block of 8-bit samples into int16_t entries: 8 rows of
+   * 8 samples, with the source advancing by stride bytes and the
+   * destination by 8 entries per row.
+   */
+ int i;
-#define BIT_DEPTH 8
-#include "pixblockdsp_template.c"
+ /* One row per iteration; the eight samples are copied one by one. */
+ for (i = 0; i < 8; i++) {
+ block[0] = pixels[0];
+ block[1] = pixels[1];
+ block[2] = pixels[2];
+ block[3] = pixels[3];
+ block[4] = pixels[4];
+ block[5] = pixels[5];
+ block[6] = pixels[6];
+ block[7] = pixels[7];
- pixels += line_size;
++ pixels += stride;
+ block += 8;
+ }
+}
-static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
+static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
- const uint8_t *s2, int stride)
+ const uint8_t *s2, ptrdiff_t stride)
{
int i;
#include "libavcodec/avcodec.h"
#include "libavcodec/pixblockdsp.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
- ptrdiff_t line_size)
+#if HAVE_VSX
+static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
- pixels += line_size;
++ ptrdiff_t stride)
+{ /*
+   * VSX variant: expand 8 rows of 8 bytes from pixels into 16-bit entries
+   * in block. Row i is stored at byte offset i * 16 from block and the
+   * source advances by stride bytes per row.
+   */
+ int i;
+ vector unsigned char perm =
+ (vector unsigned char) {0x00,0x10, 0x01,0x11,0x02,0x12,0x03,0x13,\
+ 0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17};
+ const vector unsigned char zero =
+ (const vector unsigned char) vec_splat_u8(0);
+
+ for (i = 0; i < 8; i++) {
+ /* Read potentially unaligned pixels.
+ * 16 bytes are loaded although only the first 8 are wanted;
+ * the extra bytes are simply ignored. */
+ vector unsigned char bytes = vec_vsx_ld(0, pixels);
+
+ // Convert the bytes into shorts: perm pairs bytes[k] (indices 0x00-0x07)
+ // with zero[k] (indices 0x10-0x17), so each source byte is followed by a zero byte.
+ vector signed short shorts = (vector signed short) vec_perm(bytes, zero, perm);
+
+ // Save the data to the block; the block is assumed to be 16-byte aligned.
+ vec_vsx_st(shorts, i * 16, (vector signed short *) block);
+
++ pixels += stride;
+ }
+}
+#else
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
- ptrdiff_t line_size)
+ ptrdiff_t stride)
{
int i;
- vec_u8 perm = vec_lvsl(0, pixels);
const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
for (i = 0; i < 8; i++) {
}
}
- const uint8_t *s2, int stride)
+#endif /* HAVE_VSX */
+
+#if HAVE_VSX
+/*
+ * VSX variant: store s1 - s2 for 8 rows of 8 samples into block.
+ *
+ * Every row loads 16 bytes from each source even though only the first 8
+ * are used; the extra bytes are ignored. block is assumed to be 16-byte
+ * aligned. Both sources advance by stride bytes per row and block by
+ * 8 entries.
+ */
+static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
+                                const uint8_t *s2, ptrdiff_t stride)
+{
+    const vector unsigned char zero =
+        (const vector unsigned char) vec_splat_u8(0);
+    int row;
+
+    for (row = 0; row < 8; row++) {
+        /* Potentially unaligned loads of one row from each source. */
+        vector unsigned char px1 = vec_vsx_ld(0, s1);
+        vector unsigned char px2 = vec_vsx_ld(0, s2);
+
+        /* Interleave with zero to turn the bytes into shorts. */
+        vector signed short wide1 = (vector signed short) vec_mergeh(px1, zero);
+        vector signed short wide2 = (vector signed short) vec_mergeh(px2, zero);
+
+        /* Subtract and write the row out. */
+        vec_vsx_st(vec_sub(wide1, wide2), 0, (vector signed short *) block);
+
+        s1    += stride;
+        s2    += stride;
+        block += 8;
+    }
+}
+#else
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
- const uint8_t *s2, int stride)
+ const uint8_t *s2, ptrdiff_t stride)
{
int i;
- vec_u8 perm1 = vec_lvsl(0, s1);
- vec_u8 perm2 = vec_lvsl(0, s2);
+ vec_u8 perm;
const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
vec_s16 shorts1, shorts2;
mova [r0+0x70], m3
RET
-INIT_MMX mmx
; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
- ; int stride);
+ ; ptrdiff_t stride);
-cglobal diff_pixels, 4,5
- pxor m7, m7
+%macro DIFF_PIXELS 0
+cglobal diff_pixels, 4,5,5
- movsxdifnidn r3, r3d
+ pxor m4, m4
add r0, 128
mov r4, -128
.loop:
#include "libavutil/x86/cpu.h"
#include "libavcodec/pixblockdsp.h"
- void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
- void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
+ void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
+ void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
- int stride);
+ ptrdiff_t stride);
+void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
- int stride);
++ ptrdiff_t stride);
av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
AVCodecContext *avctx,
--- /dev/null
- declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *av_restrict block, const uint8_t *s1, const uint8_t *s2, int stride); \
+/*
+ * Copyright (c) 2015 Tiancheng "Timothy" Gu
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/pixblockdsp.h"
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+
+#define BUF_UNITS 8
+#define BUF_SIZE (BUF_UNITS * 128 + 8 * BUF_UNITS)
+
+/* Fill the test buffers with random data, 4 bytes at a time. Each
+ * reference/test pair (src10/src11, src20/src21, dst0_/dst1_) receives the
+ * same value, so both implementations start from identical memory. */
+#define randomize_buffers()                        \
+    do {                                           \
+        int pos = 0;                               \
+        while (pos < BUF_SIZE) {                   \
+            const uint32_t r_src1 = rnd();         \
+            const uint32_t r_src2 = rnd();         \
+            const uint32_t r_dst  = rnd();         \
+            AV_WN32A(src10 + pos, r_src1);         \
+            AV_WN32A(src11 + pos, r_src1);         \
+            AV_WN32A(src20 + pos, r_src2);         \
+            AV_WN32A(src21 + pos, r_src2);         \
+            AV_WN32A(dst0_ + pos, r_dst);          \
+            AV_WN32A(dst1_ + pos, r_dst);          \
+            pos += 4;                              \
+        }                                          \
+    } while (0)
+
+/* Run the reference and the tested get_pixels on identically randomized
+ * buffers at BUF_UNITS different source offsets, fail on any difference in
+ * the source or destination buffers, then benchmark the tested version.
+ * The stride passed to both is 8. */
+#define check_get_pixels(type)                                                             \
+    do {                                                                                   \
+        int i;                                                                             \
+        declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *block, const uint8_t *pixels, ptrdiff_t stride); \
+                                                                                           \
+        for (i = 0; i < BUF_UNITS; i++) {                                                  \
+            int src_offset = i * 64 * sizeof(type) + 8 * i; /* Test various alignments */  \
+            int dst_offset = i * 64; /* dst must be aligned */                             \
+            randomize_buffers();                                                           \
+            call_ref(dst0 + dst_offset, src10 + src_offset, 8);                            \
+            call_new(dst1 + dst_offset, src11 + src_offset, 8);                            \
+            if (memcmp(src10, src11, BUF_SIZE) || memcmp(dst0, dst1, BUF_SIZE))            \
+                fail();                                                                    \
+            bench_new(dst1 + dst_offset, src11 + src_offset, 8);                           \
+        }                                                                                  \
+    } while (0)
+
+/* Run the reference and the tested diff_pixels on identically randomized
+ * buffers at BUF_UNITS different source offsets, fail if either source
+ * pair or the destinations differ afterwards, then benchmark the tested
+ * version. The stride passed to both is 8. */
+#define check_diff_pixels(type)                                                            \
+    do {                                                                                   \
+        int unit;                                                                          \
+        declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *av_restrict block, const uint8_t *s1, const uint8_t *s2, ptrdiff_t stride); \
+                                                                                           \
+        for (unit = 0; unit < BUF_UNITS; unit++) {                                         \
+            int src_offset = unit * 64 * sizeof(type) + 8 * unit; /* Test various alignments */ \
+            int dst_offset = unit * 64; /* dst must be aligned */                          \
+            int mismatch;                                                                  \
+            randomize_buffers();                                                           \
+            call_ref(dst0 + dst_offset, src10 + src_offset, src20 + src_offset, 8);        \
+            call_new(dst1 + dst_offset, src11 + src_offset, src21 + src_offset, 8);        \
+            mismatch = memcmp(src10, src11, BUF_SIZE) ||                                   \
+                       memcmp(src20, src21, BUF_SIZE) ||                                   \
+                       memcmp(dst0, dst1, BUF_SIZE);                                       \
+            if (mismatch)                                                                  \
+                fail();                                                                    \
+            bench_new(dst1 + dst_offset, src11 + src_offset, src21 + src_offset, 8);       \
+        }                                                                                  \
+    } while (0)
+
+/* checkasm entry point for PixblockDSPContext: initializes a context with
+ * bits_per_raw_sample = 8 and checks get_pixels and diff_pixels when
+ * check_func() selects them. */
+void checkasm_check_pixblockdsp(void)
+{
+    LOCAL_ALIGNED_16(uint8_t, src10, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(uint8_t, src11, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(uint8_t, src20, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(uint8_t, src21, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(uint8_t, dst0_, [BUF_SIZE]);
+    LOCAL_ALIGNED_16(uint8_t, dst1_, [BUF_SIZE]);
+    /* int16_t views of the destination buffers: the tested functions are
+     * declared with an int16_t *block parameter, so this avoids a
+     * pointer-sign mismatch at the call sites. */
+    int16_t *dst0 = (int16_t *)dst0_;
+    int16_t *dst1 = (int16_t *)dst1_;
+    PixblockDSPContext h;
+    AVCodecContext avctx = {
+        .bits_per_raw_sample = 8,
+    };
+
+    ff_pixblockdsp_init(&h, &avctx);
+
+    if (check_func(h.get_pixels, "get_pixels"))
+        check_get_pixels(uint8_t);
+
+    report("get_pixels");
+
+    if (check_func(h.diff_pixels, "diff_pixels"))
+        check_diff_pixels(uint8_t);
+
+    report("diff_pixels");
+}