%define GLOBAL
%endif
+%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32
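; the fenc and fdec macroblock caches have compile-time strides (16 and 32
; bytes respectively), so the dct routines below no longer take stride arguments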
ALIGN 16
;-----------------------------------------------------------------------------
-; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
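; note: parmNq names the Nth integer argument register of the host calling
; convention; with only three register arguments, the separate WIN64 argument
; shuffling removed below is no longer needed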
x264_sub4x4_dct_mmx:
- firstpush rbx
- pushreg rbx
- endprolog
-
- mov r10, parm1q ; dct
- mov rax, parm2q ; pix1
-%ifdef WIN64
- mov rcx, parm4q ; pix2
- movsxd rdx, dword [rsp+40+8] ; i_pix2
- movsxd rbx, parm3d ; i_pix1
-%else
- movsxd rbx, parm3d ; i_pix1
- movsxd rdx, parm5d ; i_pix2
-%endif
-
MMX_ZERO mm7
; Load 4 lines
- MMX_LOAD_DIFF_4P mm0, mm6, mm7, [rax ], [rcx]
- MMX_LOAD_DIFF_4P mm1, mm6, mm7, [rax+rbx ], [rcx+rdx]
- MMX_LOAD_DIFF_4P mm2, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
- add rax, rbx
- add rcx, rdx
- MMX_LOAD_DIFF_4P mm3, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
+ MMX_LOAD_DIFF_4P mm0, mm6, mm7, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
+ MMX_LOAD_DIFF_4P mm1, mm6, mm7, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
+ MMX_LOAD_DIFF_4P mm2, mm6, mm7, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
+ MMX_LOAD_DIFF_4P mm3, mm6, mm7, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
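 ; scalar form of the 1-D stage above: y0=s03+s12, y1=2*d03+d12, y2=s03-s12, y3=d03-2*d12,
 ; which is exactly the order the four rows are stored below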
- movq [r10+ 0], mm1 ; dct
- movq [r10+ 8], mm2
- movq [r10+16], mm3
- movq [r10+24], mm0
-
- pop rbx
+ movq [parm1q+ 0], mm1
+ movq [parm1q+ 8], mm2
+ movq [parm1q+16], mm3
+ movq [parm1q+24], mm0
ret
- endfunc
cglobal x264_add4x4_idct_mmx
ALIGN 16
;-----------------------------------------------------------------------------
-; void x264_add4x4_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
x264_add4x4_idct_mmx:
; Load dct coeffs
- movq mm0, [parm3q+ 0] ; dct
- movq mm1, [parm3q+ 8]
- movq mm2, [parm3q+16]
- movq mm3, [parm3q+24]
+ movq mm0, [parm2q+ 0] ; dct
+ movq mm1, [parm2q+ 8]
+ movq mm2, [parm2q+16]
+ movq mm3, [parm2q+24]
- mov rax, parm1q ; p_dst
- movsxd rcx, parm2d ; i_dst
- lea rdx, [rcx+rcx*2]
-
MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
 MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 (s13 = x1 + (x3>>1), d13 = (x1>>1) - x3)
MMX_ZERO mm7
movq mm6, [pw_32 GLOBAL]
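 ; pw_32 is the +32 rounding bias for the final >>6 applied in MMX_STORE_DIFF_4P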
- MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [rax]
- MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [rax+rcx]
- MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [rax+rcx*2]
- MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [rax+rdx]
+ MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [parm1q+0*FDEC_STRIDE]
+ MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [parm1q+1*FDEC_STRIDE]
+ MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [parm1q+2*FDEC_STRIDE]
+ MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [parm1q+3*FDEC_STRIDE]
ret
ALIGN 16
;-----------------------------------------------------------------------------
-; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
x264_sub8x8_dct8_sse2:
-; mov rdi, rdi ; dct
-; mov rsi, rsi ; pix1
- movsxd rdx, edx ; i_pix1
-; mov rcx, rcx ; pix2
- movsxd r8, r8d ; i_pix2
-
MMX_ZERO xmm9
- MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [rsi ], [rcx]
- MMX_LOAD_DIFF_8P xmm1, xmm8, xmm9, [rsi+rdx ], [rcx+r8]
- MMX_LOAD_DIFF_8P xmm2, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
- lea r9, [rdx+rdx*2]
- lea r10, [r8+r8*2]
- add rsi, r9
- add rcx, r10
- MMX_LOAD_DIFF_8P xmm3, xmm8, xmm9, [rsi ], [rcx]
- MMX_LOAD_DIFF_8P xmm4, xmm8, xmm9, [rsi+rdx ], [rcx+r8]
- MMX_LOAD_DIFF_8P xmm5, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
- MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [rsi+r9 ], [rcx+r10]
- MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [rsi+rdx*4], [rcx+r8*4]
+ MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
+ MMX_LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
+ MMX_LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
+ MMX_LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
+ MMX_LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE]
+ MMX_LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE]
+ MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE]
+ MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE]
DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
SSE2_TRANSPOSE8x8 xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9
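 ; row pass, 8x8 transpose, second pass: two 1-D DCT8 stages form the full 2-D transform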
- movdqa [rdi+0x00], xmm4
- movdqa [rdi+0x10], xmm3
- movdqa [rdi+0x20], xmm8
- movdqa [rdi+0x30], xmm2
- movdqa [rdi+0x40], xmm0
- movdqa [rdi+0x50], xmm6
- movdqa [rdi+0x60], xmm1
- movdqa [rdi+0x70], xmm7
+ movdqa [parm1q+0x00], xmm4
+ movdqa [parm1q+0x10], xmm3
+ movdqa [parm1q+0x20], xmm8
+ movdqa [parm1q+0x30], xmm2
+ movdqa [parm1q+0x40], xmm0
+ movdqa [parm1q+0x50], xmm6
+ movdqa [parm1q+0x60], xmm1
+ movdqa [parm1q+0x70], xmm7
ret
ALIGN 16
;-----------------------------------------------------------------------------
-; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int i_dst, int16_t dct[8][8] )
+; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
x264_add8x8_idct8_sse2:
- movsxd rsi, esi ; i_dst
- movdqa xmm0, [rdx+0x00] ; dct
- movdqa xmm1, [rdx+0x10]
- movdqa xmm2, [rdx+0x20]
- movdqa xmm3, [rdx+0x30]
- movdqa xmm4, [rdx+0x40]
- movdqa xmm5, [rdx+0x50]
- movdqa xmm6, [rdx+0x60]
- movdqa xmm7, [rdx+0x70]
+ movdqa xmm0, [parm2q+0x00]
+ movdqa xmm1, [parm2q+0x10]
+ movdqa xmm2, [parm2q+0x20]
+ movdqa xmm3, [parm2q+0x30]
+ movdqa xmm4, [parm2q+0x40]
+ movdqa xmm5, [parm2q+0x50]
+ movdqa xmm6, [parm2q+0x60]
+ movdqa xmm7, [parm2q+0x70]
IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
SSE2_TRANSPOSE8x8 xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2
MMX_ZERO xmm15
- MMX_STORE_DIFF_8P xmm8, xmm14, xmm15, [rdi]
- MMX_STORE_DIFF_8P xmm0, xmm14, xmm15, [rdi+rsi]
- MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [rdi+rsi*2]
- lea rax, [rsi+rsi*2]
- add rdi, rax
- MMX_STORE_DIFF_8P xmm3, xmm14, xmm15, [rdi]
- MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [rdi+rsi]
- MMX_STORE_DIFF_8P xmm9, xmm14, xmm15, [rdi+rsi*2]
- MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [rdi+rax]
- MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [rdi+rsi*4]
+ MMX_STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE]
+ MMX_STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE]
+ MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE]
+ MMX_STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE]
+ MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE]
+ MMX_STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE]
+ MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE]
+ MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
ret
}
}
-static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
{
int16_t d[4][4];
int16_t tmp[4][4];
int i;
- pixel_sub_wxh( (int16_t*)d, 4, pix1, i_pix1, pix2, i_pix2 );
+ pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
for( i = 0; i < 4; i++ )
{
}
}
-static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
{
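    /* the four 4x4 sub-blocks lie at (0,0), (4,0), (0,4) and (4,4) inside the
       8x8 area; the fixed strides are all that is needed to address them */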
- sub4x4_dct( dct[0], &pix1[0], i_pix1, &pix2[0], i_pix2 );
- sub4x4_dct( dct[1], &pix1[4], i_pix1, &pix2[4], i_pix2 );
- sub4x4_dct( dct[2], &pix1[4*i_pix1+0], i_pix1, &pix2[4*i_pix2+0], i_pix2 );
- sub4x4_dct( dct[3], &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 );
+ sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
+ sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
+ sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
+ sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}
-static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
{
- sub8x8_dct( &dct[ 0], pix1, i_pix1, pix2, i_pix2 );
- sub8x8_dct( &dct[ 4], &pix1[8], i_pix1, &pix2[8], i_pix2 );
- sub8x8_dct( &dct[ 8], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 );
- sub8x8_dct( &dct[12], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );
+ sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
+ sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
+ sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
+ sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
-static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
{
int16_t d[4][4];
int16_t tmp[4][4];
{
p_dst[x] = clip_uint8( p_dst[x] + d[y][x] );
}
- p_dst += i_dst;
+ p_dst += FDEC_STRIDE;
}
}
-static void add8x8_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
+static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
{
- add4x4_idct( p_dst, i_dst, dct[0] );
- add4x4_idct( &p_dst[4], i_dst, dct[1] );
- add4x4_idct( &p_dst[4*i_dst+0], i_dst, dct[2] );
- add4x4_idct( &p_dst[4*i_dst+4], i_dst, dct[3] );
+ add4x4_idct( &p_dst[0], dct[0] );
+ add4x4_idct( &p_dst[4], dct[1] );
+ add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
+ add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
-static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
+static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
{
- add8x8_idct( &p_dst[0], i_dst, &dct[0] );
- add8x8_idct( &p_dst[8], i_dst, &dct[4] );
- add8x8_idct( &p_dst[8*i_dst], i_dst, &dct[8] );
- add8x8_idct( &p_dst[8*i_dst+8], i_dst, &dct[12] );
+ add8x8_idct( &p_dst[0], &dct[0] );
+ add8x8_idct( &p_dst[8], &dct[4] );
+ add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
+ add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}
/****************************************************************************
DST(7) = (a4>>2) - a7 ;\
}
-static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
{
int i;
int16_t tmp[8][8];
- pixel_sub_wxh( (int16_t*)tmp, 8, pix1, i_pix1, pix2, i_pix2 );
+ pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
#define SRC(x) tmp[x][i]
#define DST(x) tmp[x][i]
#undef DST
}
-static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
{
- sub8x8_dct8( dct[0], pix1, i_pix1, pix2, i_pix2 );
- sub8x8_dct8( dct[1], &pix1[8], i_pix1, &pix2[8], i_pix2 );
- sub8x8_dct8( dct[2], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 );
- sub8x8_dct8( dct[3], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );
+ sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
+ sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
+ sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
+ sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
#define IDCT8_1D {\
DST(7, b0 - b7);\
}
-static void add8x8_idct8( uint8_t *dst, int i_dst, int16_t dct[8][8] )
+static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
{
int i;
#undef DST
#define SRC(x) dct[i][x]
-#define DST(x,rhs) dst[i + x*i_dst] = clip_uint8( dst[i + x*i_dst] + ((rhs) >> 6) );
+#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
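/* this pass reads row i of the dct and writes column i of dst, stepping one
   FDEC_STRIDE row per x, so the transpose is folded into the addressing */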
for( i = 0; i < 8; i++ )
IDCT8_1D
#undef SRC
#undef DST
}
-static void add16x16_idct8( uint8_t *dst, int i_dst, int16_t dct[4][8][8] )
+static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
{
- add8x8_idct8( &dst[0], i_dst, dct[0] );
- add8x8_idct8( &dst[8], i_dst, dct[1] );
- add8x8_idct8( &dst[8*i_dst], i_dst, dct[2] );
- add8x8_idct8( &dst[8*i_dst+8], i_dst, dct[3] );
+ add8x8_idct8( &dst[0], dct[0] );
+ add8x8_idct8( &dst[8], dct[1] );
+ add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
+ add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
typedef struct
{
- void (*sub4x4_dct) ( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
- void (*add4x4_idct) ( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
+ // pix1 stride = FENC_STRIDE
+ // pix2 stride = FDEC_STRIDE
+ // p_dst stride = FDEC_STRIDE
+ void (*sub4x4_dct) ( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
+ void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[4][4] );
- void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
- void (*add8x8_idct) ( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
+ void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
+ void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][4][4] );
- void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
- void (*add16x16_idct) ( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+ void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+ void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
- void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
- void (*add8x8_idct8) ( uint8_t *p_dst, int i_dst, int16_t dct[8][8] );
+ void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
+ void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] );
- void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
- void (*add16x16_idct8) ( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] );
+ void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+ void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][8][8] );
void (*dct4x4dc) ( int16_t d[4][4] );
void (*idct4x4dc)( int16_t d[4][4] );
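    /* usage sketch (illustrative): callers now pass pointers straight into the
     * fenc/fdec caches, e.g.
     *   h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
     * as in the encoder changes below */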
ALIGN 16
;-----------------------------------------------------------------------------
-; void __cdecl x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+; void __cdecl x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
x264_sub4x4_dct_mmx:
- push ebx
- mov eax, [esp+12] ; pix1
- mov ebx, [esp+16] ; i_pix1
- mov ecx, [esp+20] ; pix2
- mov edx, [esp+24] ; i_pix2
+ mov eax, [esp+ 8] ; pix1
+ mov ecx, [esp+12] ; pix2
MMX_ZERO mm7
; Load 4 lines
- MMX_LOAD_DIFF_4P mm0, mm6, mm7, [eax ], [ecx]
- MMX_LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx ], [ecx+edx]
- MMX_LOAD_DIFF_4P mm2, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
- add eax, ebx
- add ecx, edx
- MMX_LOAD_DIFF_4P mm3, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
+ MMX_LOAD_DIFF_4P mm0, mm6, mm7, [eax+0*FENC_STRIDE], [ecx+0*FDEC_STRIDE]
+ MMX_LOAD_DIFF_4P mm1, mm6, mm7, [eax+1*FENC_STRIDE], [ecx+1*FDEC_STRIDE]
+ MMX_LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*FENC_STRIDE], [ecx+2*FDEC_STRIDE]
+ MMX_LOAD_DIFF_4P mm3, mm6, mm7, [eax+3*FENC_STRIDE], [ecx+3*FDEC_STRIDE]
MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
- mov eax, [esp+ 8] ; dct
+ mov eax, [esp+ 4] ; dct
movq [eax+ 0], mm1
movq [eax+ 8], mm2
movq [eax+16], mm3
movq [eax+24], mm0
- pop ebx
ret
cglobal x264_add4x4_idct_mmx
ALIGN 16
;-----------------------------------------------------------------------------
-; void __cdecl x264_add4x4_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+; void __cdecl x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
x264_add4x4_idct_mmx:
; Load dct coeffs
- mov eax, [esp+12] ; dct
+ mov eax, [esp+ 8] ; dct
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
movq mm3, [eax+24]
mov eax, [esp+ 4] ; p_dst
- mov ecx, [esp+ 8] ; i_dst
- lea edx, [ecx+ecx*2]
picpush ebx
picgetgot ebx
MMX_ZERO mm7
movq mm6, [x264_mmx_32 GOT_ebx]
- MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax]
- MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+ecx]
- MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+ecx*2]
- MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+edx]
+ MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax+0*FDEC_STRIDE]
+ MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+1*FDEC_STRIDE]
+ MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+2*FDEC_STRIDE]
+ MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+3*FDEC_STRIDE]
picpop ebx
ret
ALIGN 16
;-----------------------------------------------------------------------------
-; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
;-----------------------------------------------------------------------------
x264_pixel_sub_8x8_mmx:
- push ebx
- push ebp
- mov ebp, [esp+12] ; diff
- mov eax, [esp+16] ; pix1
- mov ebx, [esp+20] ; i_pix1
- mov ecx, [esp+24] ; pix2
- mov edx, [esp+28] ; i_pix2
+ mov edx, [esp+ 4] ; diff
+ mov eax, [esp+ 8] ; pix1
+ mov ecx, [esp+12] ; pix2
MMX_ZERO mm7
%assign disp 0
%rep 8
MMX_LOAD_DIFF_8P mm0, mm1, mm2, mm3, [eax], [ecx], mm7
- movq [ebp+disp], mm0
- movq [ebp+disp+8], mm1
- add eax, ebx
- add ecx, edx
+ movq [edx+disp], mm0
+ movq [edx+disp+8], mm1
+ add eax, FENC_STRIDE
+ add ecx, FDEC_STRIDE
%assign disp disp+16
%endrep
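 ; with immediate strides no stride registers are needed, so ebx/ebp no longer
 ; have to be preserved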
- pop ebp
- pop ebx
ret
ALIGN 16
;-----------------------------------------------------------------------------
-; void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int i_dst, int16_t src[8][8] );
+; void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
;-----------------------------------------------------------------------------
x264_pixel_add_8x8_mmx:
- mov eax, [esp+04] ; dst
- mov ecx, [esp+08] ; i_dst
- mov edx, [esp+12] ; src
+ mov eax, [esp+4] ; dst
+ mov edx, [esp+8] ; src
MMX_ZERO mm7
paddw mm1, mm3
packuswb mm0, mm1
movq [eax], mm0
- add eax, ecx
+ add eax, FDEC_STRIDE
%assign disp disp+16
%endrep
ret
#include <stdlib.h>
#include <stdarg.h>
-#include "x264.h"
-
#include "dct.h"
+#include "common/common.h"
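/* common/common.h pulls in the FENC_STRIDE/FDEC_STRIDE definitions used below */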
-void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
{
- x264_sub4x4_dct_mmx( dct[0], &pix1[0], i_pix1, &pix2[0], i_pix2 );
- x264_sub4x4_dct_mmx( dct[1], &pix1[4], i_pix1, &pix2[4], i_pix2 );
- x264_sub4x4_dct_mmx( dct[2], &pix1[4*i_pix1+0], i_pix1, &pix2[4*i_pix2+0], i_pix2 );
- x264_sub4x4_dct_mmx( dct[3], &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 );
+ x264_sub4x4_dct_mmx( dct[0], &pix1[0], &pix2[0] );
+ x264_sub4x4_dct_mmx( dct[1], &pix1[4], &pix2[4] );
+ x264_sub4x4_dct_mmx( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
+ x264_sub4x4_dct_mmx( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}
-void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
{
- x264_sub8x8_dct_mmx( &dct[ 0], &pix1[0], i_pix1, &pix2[0], i_pix2 );
- x264_sub8x8_dct_mmx( &dct[ 4], &pix1[8], i_pix1, &pix2[8], i_pix2 );
- x264_sub8x8_dct_mmx( &dct[ 8], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 );
- x264_sub8x8_dct_mmx( &dct[12], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );
+ x264_sub8x8_dct_mmx( &dct[ 0], &pix1[0], &pix2[0] );
+ x264_sub8x8_dct_mmx( &dct[ 4], &pix1[8], &pix2[8] );
+ x264_sub8x8_dct_mmx( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
+ x264_sub8x8_dct_mmx( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
* addXxX_idct:
****************************************************************************/
-void x264_add8x8_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
+void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] )
{
- x264_add4x4_idct_mmx( p_dst, i_dst, dct[0] );
- x264_add4x4_idct_mmx( &p_dst[4], i_dst, dct[1] );
- x264_add4x4_idct_mmx( &p_dst[4*i_dst+0], i_dst, dct[2] );
- x264_add4x4_idct_mmx( &p_dst[4*i_dst+4], i_dst, dct[3] );
+ x264_add4x4_idct_mmx( p_dst, dct[0] );
+ x264_add4x4_idct_mmx( &p_dst[4], dct[1] );
+ x264_add4x4_idct_mmx( &p_dst[4*FDEC_STRIDE+0], dct[2] );
+ x264_add4x4_idct_mmx( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
-void x264_add16x16_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
+void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] )
{
- x264_add8x8_idct_mmx( &p_dst[0], i_dst, &dct[0] );
- x264_add8x8_idct_mmx( &p_dst[8], i_dst, &dct[4] );
- x264_add8x8_idct_mmx( &p_dst[8*i_dst], i_dst, &dct[8] );
- x264_add8x8_idct_mmx( &p_dst[8*i_dst+8], i_dst, &dct[12] );
+ x264_add8x8_idct_mmx( &p_dst[0], &dct[0] );
+ x264_add8x8_idct_mmx( &p_dst[8], &dct[4] );
+ x264_add8x8_idct_mmx( &p_dst[8*FDEC_STRIDE], &dct[8] );
+ x264_add8x8_idct_mmx( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}
/***********************
***********************/
#ifdef ARCH_X86_64
-void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
{
- x264_sub8x8_dct8_sse2( dct[0], pix1, i_pix1, pix2, i_pix2 );
- x264_sub8x8_dct8_sse2( dct[1], pix1+8, i_pix1, pix2+8, i_pix2 );
- x264_sub8x8_dct8_sse2( dct[2], pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 );
- x264_sub8x8_dct8_sse2( dct[3], pix1+8*i_pix1+8, i_pix1, pix2+8*i_pix2+8, i_pix2 );
+ x264_sub8x8_dct8_sse2( dct[0], pix1, pix2 );
+ x264_sub8x8_dct8_sse2( dct[1], pix1+8, pix2+8 );
+ x264_sub8x8_dct8_sse2( dct[2], pix1+8*FENC_STRIDE, pix2+8*FDEC_STRIDE );
+ x264_sub8x8_dct8_sse2( dct[3], pix1+8*FENC_STRIDE+8, pix2+8*FDEC_STRIDE+8 );
}
-void x264_add16x16_idct8_sse2( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] )
+void x264_add16x16_idct8_sse2( uint8_t *p_dst, int16_t dct[4][8][8] )
{
- x264_add8x8_idct8_sse2( p_dst, i_dst, dct[0] );
- x264_add8x8_idct8_sse2( p_dst+8, i_dst, dct[1] );
- x264_add8x8_idct8_sse2( p_dst+8*i_dst, i_dst, dct[2] );
- x264_add8x8_idct8_sse2( p_dst+8*i_dst+8, i_dst, dct[3] );
+ x264_add8x8_idct8_sse2( p_dst, dct[0] );
+ x264_add8x8_idct8_sse2( p_dst+8, dct[1] );
+ x264_add8x8_idct8_sse2( p_dst+8*FDEC_STRIDE, dct[2] );
+ x264_add8x8_idct8_sse2( p_dst+8*FDEC_STRIDE+8, dct[3] );
}
#else // ARCH_X86
-void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-void x264_pixel_add_8x8_mmx( uint8_t *pix, int i_pix, uint16_t *diff );
+void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
+void x264_pixel_add_8x8_mmx( uint8_t *pix, uint16_t *diff );
void x264_transpose_8x8_mmx( int16_t src[8][8] );
void x264_ydct8_mmx( int16_t dct[8][8] );
void x264_yidct8_mmx( int16_t dct[8][8] );
-inline void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+inline void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
{
- x264_pixel_sub_8x8_mmx( (int16_t *)dct, pix1, i_pix1, pix2, i_pix2 );
+ x264_pixel_sub_8x8_mmx( (int16_t *)dct, pix1, pix2 );
x264_ydct8_mmx( dct );
x264_transpose_8x8_mmx( dct );
x264_ydct8_mmx( dct );
}
-void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
{
- x264_sub8x8_dct8_mmx( dct[0], pix1, i_pix1, pix2, i_pix2 );
- x264_sub8x8_dct8_mmx( dct[1], pix1+8, i_pix1, pix2+8, i_pix2 );
- x264_sub8x8_dct8_mmx( dct[2], pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 );
- x264_sub8x8_dct8_mmx( dct[3], pix1+8*i_pix1+8, i_pix1, pix2+8*i_pix2+8, i_pix2 );
+ x264_sub8x8_dct8_mmx( dct[0], pix1, pix2 );
+ x264_sub8x8_dct8_mmx( dct[1], pix1+8, pix2+8 );
+ x264_sub8x8_dct8_mmx( dct[2], pix1+8*FENC_STRIDE, pix2+8*FDEC_STRIDE );
+ x264_sub8x8_dct8_mmx( dct[3], pix1+8*FENC_STRIDE+8, pix2+8*FDEC_STRIDE+8 );
}
-inline void x264_add8x8_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[8][8] )
+inline void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
{
dct[0][0] += 32;
x264_yidct8_mmx( dct );
x264_transpose_8x8_mmx( dct );
x264_yidct8_mmx( dct );
- x264_pixel_add_8x8_mmx( dst, i_dst, (uint16_t *)dct ); // including >>6 at the end
+ x264_pixel_add_8x8_mmx( dst, (uint16_t *)dct ); // including >>6 at the end
}
-void x264_add16x16_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[4][8][8] )
+void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] )
{
- x264_add8x8_idct8_mmx( dst, i_dst, dct[0] );
- x264_add8x8_idct8_mmx( dst+8, i_dst, dct[1] );
- x264_add8x8_idct8_mmx( dst+8*i_dst, i_dst, dct[2] );
- x264_add8x8_idct8_mmx( dst+8*i_dst+8, i_dst, dct[3] );
+ x264_add8x8_idct8_mmx( dst, dct[0] );
+ x264_add8x8_idct8_mmx( dst+8, dct[1] );
+ x264_add8x8_idct8_mmx( dst+8*FDEC_STRIDE, dct[2] );
+ x264_add8x8_idct8_mmx( dst+8*FDEC_STRIDE+8, dct[3] );
}
#endif
#ifndef _I386_DCT_H
#define _I386_DCT_H 1
-void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-void x264_add4x4_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
-void x264_add16x16_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] );
+void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] );
+void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_dct4x4dc_mmx( int16_t d[4][4] );
void x264_idct4x4dc_mmx( int16_t d[4][4] );
-void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_add8x8_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[4][8][8] );
+void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] );
+void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] );
-void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_add8x8_idct8_sse2( uint8_t *dst, int i_dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_sse2( uint8_t *dst, int i_dst, int16_t dct[4][8][8] );
+void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] );
+void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
#endif
%define picesp esp
%endif
+%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
return;
}
- h->dctf.sub4x4_dct( dct4x4, p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
+ h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
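    /* p_src points into the fenc cache and p_dst into the fdec cache, so the
       FENC_STRIDE/FDEC_STRIDE strides are implied by the new interface */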
if( h->mb.b_trellis )
x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
/* output samples to fdec */
- h->dctf.add4x4_idct( p_dst, FDEC_STRIDE, dct4x4 );
+ h->dctf.add4x4_idct( p_dst, dct4x4 );
}
void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
int16_t dct8x8[8][8];
- h->dctf.sub8x8_dct8( dct8x8, p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
+ h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
if( h->mb.b_trellis )
x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 );
h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
- h->dctf.add8x8_idct8( p_dst, FDEC_STRIDE, dct8x8 );
+ h->dctf.add8x8_idct8( p_dst, dct8x8 );
}
static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
return;
}
- h->dctf.sub16x16_dct( &dct4x4[1], p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
+ h->dctf.sub16x16_dct( &dct4x4[1], p_src, p_dst );
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
}
 /* store the reconstructed pixels back to fdec */
- h->dctf.add16x16_idct( p_dst, FDEC_STRIDE, &dct4x4[1] );
+ h->dctf.add16x16_idct( p_dst, &dct4x4[1] );
}
static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
continue;
}
- h->dctf.sub8x8_dct( dct4x4, p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
+ h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
/* calculate dct coeffs */
for( i = 0; i < 4; i++ )
{
/* copy dc coeff */
dct4x4[i][0][0] = dct2x2[0][i];
}
- h->dctf.add8x8_idct( p_dst, FDEC_STRIDE, dct4x4 );
+ h->dctf.add8x8_idct( p_dst, dct4x4 );
}
}
{
int16_t dct8x8[4][8][8];
int nnz8x8[4] = {1,1,1,1};
- h->dctf.sub16x16_dct8( dct8x8,
- h->mb.pic.p_fenc[0], FENC_STRIDE,
- h->mb.pic.p_fdec[0], FDEC_STRIDE );
+ h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
for( idx = 0; idx < 4; idx++ )
{
if( nnz8x8[idx] )
{
h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
- h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], FDEC_STRIDE, dct8x8[idx] );
+ h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
}
}
}
{
int16_t dct4x4[16][4][4];
int nnz8x8[4] = {1,1,1,1};
- h->dctf.sub16x16_dct( dct4x4,
- h->mb.pic.p_fenc[0], FENC_STRIDE,
- h->mb.pic.p_fdec[0], FDEC_STRIDE );
+ h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
for( i8x8 = 0; i8x8 < 4; i8x8++ )
{
{
for( i = 0; i < 4; i++ )
h->quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp );
- h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], FDEC_STRIDE, &dct4x4[i8x8*4] );
+ h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
}
}
}
}
/* get luma diff */
- h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], FENC_STRIDE,
- h->mb.pic.p_fdec[0], FDEC_STRIDE );
+ h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0],
+ h->mb.pic.p_fdec[0] );
for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
{
mvp[0], mvp[1], 8, 8 );
}
- h->dctf.sub8x8_dct( dct4x4, p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
+ h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
/* calculate dct DC */
dct2x2[0][0] = dct4x4[0][0][0];
if( h->mb.b_transform_8x8 )
{
int16_t dct8x8[8][8];
- h->dctf.sub8x8_dct8( dct8x8, p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
+ h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
scan_zigzag_8x8full( h->dct.luma8x8[i8], dct8x8 );
if( nnz8x8 )
{
h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
- h->dctf.add8x8_idct8( p_fdec, FDEC_STRIDE, dct8x8 );
+ h->dctf.add8x8_idct8( p_fdec, dct8x8 );
}
}
else
{
int i4, idx;
int16_t dct4x4[4][4][4];
- h->dctf.sub8x8_dct( dct4x4, p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
+ h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
for( i4 = 0; i4 < 4; i4++ )
{
{
for( i4 = 0; i4 < 4; i4++ )
h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
- h->dctf.add8x8_idct( p_fdec, FDEC_STRIDE, dct4x4 );
+ h->dctf.add8x8_idct( p_fdec, dct4x4 );
}
}
p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
- h->dctf.sub4x4_dct( dct4x4, p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
+ h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
quant_4x4( h, dct4x4, h->quant4_mf[CQM_4PC], i_qp, 0 );
scan_zigzag_4x4( h->dct.block[16+i8+ch*4].residual_ac, dct4x4 );
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
- h->dctf.add4x4_idct( p_fdec, FDEC_STRIDE, dct4x4 );
+ h->dctf.add4x4_idct( p_fdec, dct4x4 );
}
if( nnz8x8 )
if( dct_asm.name != dct_ref.name ) \
{ \
used_asm = 1; \
- dct_c.name( t1, buf1, 32, buf2, 24 ); \
- dct_asm.name( t2, buf1, 32, buf2, 24 ); \
+ dct_c.name( t1, buf1, buf2 ); \
+ dct_asm.name( t2, buf1, buf2 ); \
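        /* the pix arguments are now read with the fixed FENC/FDEC strides, \
           which is why the explicit 32/24 strides were dropped */ \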
if( memcmp( t1, t2, size ) ) \
{ \
ok = 0; \
memcpy( buf4, buf1, 32*32 ); \
memcpy( dct1, buf5, 512 ); \
memcpy( dct2, buf5, 512 ); \
- dct_c.name( buf3, 32, (void*)dct1 ); \
- dct_asm.name( buf4, 32, (void*)dct2 ); \
+ dct_c.name( buf3, (void*)dct1 ); \
+ dct_asm.name( buf4, (void*)dct2 ); \
if( memcmp( buf3, buf4, 32*32 ) ) \
{ \
ok = 0; \