#include "img_format.h"
#include "mp_image.h"
#include "vf.h"
-#include "vd_ffmpeg.h"
+#include "av_helpers.h"
#include "libvo/fastmemcpy.h"
#include "libavutil/internal.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C
}
-static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
-static void row_idct_c(DCTELEM* workspace,
+static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt);
+static void row_idct_c(int16_t* workspace,
int16_t* output_adr, int output_stride, int cnt);
-static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
+static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
//this is rather ugly, but there is no need for function pointers
#define store_slice_s store_slice_c
);
}
-static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
-static void row_idct_mmx(DCTELEM* workspace,
+static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt);
+static void row_idct_mmx(int16_t* workspace,
int16_t* output_adr, int output_stride, int cnt);
-static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
+static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
#define store_slice_s store_slice_mmx
#define store_slice2_s store_slice2_mmx
const int step=6-p->log2_count;
const int qps= 3 + is_luma;
int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
- DCTELEM *block= (DCTELEM *)block_align;
- DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ);
+ int16_t *block= (int16_t *)block_align;
+ int16_t *block3=(int16_t *)(block_align+4*8*BLOCKSZ);
memset(block3, 0, 4*8*BLOCKSZ);
column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT
}
row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
- memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(DCTELEM)); //cycling
- memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(DCTELEM));
+ memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(int16_t)); //cycling
+ memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(int16_t));
}
//
es=width+8-x0; // 8, ...
//#define MANGLE(a) #a
-//typedef int16_t DCTELEM; //! only int16_t
+// DCT coefficient type is int16_t (no separate typedef) //! only int16_t
#define DCTSIZE 8
#define DCTSIZE_S "8"
#if !HAVE_MMX
-static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
+static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
- DCTELEM* dataptr;
- DCTELEM* wsptr;
+ int16_t* dataptr;
+ int16_t* wsptr;
int16_t *threshold;
int ctr;
#else /* HAVE_MMX */
-static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
+static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
{
uint64_t __attribute__((aligned(8))) temps[4];
__asm__ volatile(
#if !HAVE_MMX
-static void row_idct_c(DCTELEM* workspace,
+static void row_idct_c(int16_t* workspace,
int16_t* output_adr, int output_stride, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
int_simd16_t z5, z10, z11, z12, z13;
int16_t* outptr;
- DCTELEM* wsptr;
+ int16_t* wsptr;
cnt*=4;
wsptr = workspace;
#else /* HAVE_MMX */
-static void row_idct_mmx (DCTELEM* workspace,
+static void row_idct_mmx (int16_t* workspace,
int16_t* output_adr, int output_stride, int cnt)
{
uint64_t __attribute__((aligned(8))) temps[4];
#if !HAVE_MMX
-static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
+static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
int_simd16_t z1, z2, z3, z4, z5, z11, z13;
- DCTELEM *dataptr;
+ int16_t *dataptr;
cnt*=4;
// Pass 1: process rows.
#else /* HAVE_MMX */
-static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
+static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
{
uint64_t __attribute__((aligned(8))) temps[4];
__asm__ volatile(