Heavily optimized for Core 2 and Nehalem, but performance should improve on all modern x86 CPUs.
16x16 SATD: +18% speed on K8(64bit), +22% on K10(32bit), +42% on Penryn(64bit), +44% on Nehalem(64bit), +50% on P4(32bit), +98% on Conroe(64bit)
Similar performance boosts in SATD-like functions (SA8D, hadamard_ac), and somewhat smaller gains in DCT/IDCT/SSD.
Overall performance boost is up to ~15% on 64-bit Conroe.
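For readers skimming the patch: SATD here is the usual x264 metric, i.e. the sum of absolute values of a 2-D 4-point Hadamard transform of the pixel differences, halved. Below is a minimal C sketch of that definition (illustrative only, not part of the patch; the function name is made up). Larger block sizes are just sums of 4x4/8x4 sub-blocks, which is why the gains scale across sizes.

#include <stdint.h>
#include <stdlib.h>

static int satd_4x4_ref( const uint8_t *pix1, int i_pix1,
                         const uint8_t *pix2, int i_pix2 )
{
    int d[4][4], t[4][4], sum = 0;
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            d[y][x] = pix1[y*i_pix1+x] - pix2[y*i_pix2+x];
    for( int y = 0; y < 4; y++ )
    {   /* horizontal 4-point Hadamard (two sumsub butterfly stages) */
        int s01 = d[y][0]+d[y][1], d01 = d[y][0]-d[y][1];
        int s23 = d[y][2]+d[y][3], d23 = d[y][2]-d[y][3];
        t[y][0] = s01+s23; t[y][1] = s01-s23;
        t[y][2] = d01+d23; t[y][3] = d01-d23;
    }
    for( int x = 0; x < 4; x++ )
    {   /* vertical pass, then accumulate absolute coefficients */
        int s01 = t[0][x]+t[1][x], d01 = t[0][x]-t[1][x];
        int s23 = t[2][x]+t[3][x], d23 = t[2][x]-t[3][x];
        sum += abs(s01+s23) + abs(s01-s23) + abs(d01+d23) + abs(d01-d23);
    }
    return sum >> 1;
}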
if( cpu&X264_CPU_SSSE3 )
{
+ dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
+ dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
+ dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
}
SATD_X_DECL7( _mmxext )
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
-SATD_X_DECL6( _ssse3_phadd )
+SATD_X_DECL7( _sse4 )
#endif
/****************************************************************************
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
+ if( cpu&X264_CPU_SSE2 )
+ {
+ INIT5( ssd, _sse2slow );
+ INIT2_NAME( sad_aligned, sad, _sse2_aligned );
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
+ pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
+ pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
+ pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
+#ifdef ARCH_X86_64
+ pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
+#endif
+ }
+
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
INIT2( sad, _sse2 );
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
+ INIT6( satd, _sse2 );
+ INIT6( satd_x3, _sse2 );
+ INIT6( satd_x4, _sse2 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _sse2 );
INIT_ADS( _sse2 );
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
-
- if( cpu&X264_CPU_CACHELINE_64 )
+ if( cpu&X264_CPU_CACHELINE_64 )
{
+ INIT2( ssd, _sse2); /* faster for width 16 on p4 */
#ifdef ARCH_X86
INIT2( sad, _cache64_sse2 );
INIT2( sad_x3, _cache64_sse2 );
INIT2( sad_x4, _sse2_misalign );
}
}
- if( cpu&X264_CPU_SSE2 )
- {
- INIT5( ssd, _sse2 );
- if( cpu&X264_CPU_SSE2_IS_FAST )
- {
- INIT6( satd, _sse2 );
- INIT6( satd_x3, _sse2 );
- INIT6( satd_x4, _sse2 );
- }
- else
- {
- INIT5( satd, _sse2 );
- INIT5( satd_x3, _sse2 );
- INIT5( satd_x4, _sse2 );
- }
- INIT2_NAME( sad_aligned, sad, _sse2_aligned );
- pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
- pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
- pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
- pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
- pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
-#ifdef ARCH_X86_64
- pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
-#endif
- }
if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
{
if( cpu&X264_CPU_SSSE3 )
{
+ INIT7( ssd, _ssse3 );
INIT7( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
- if( cpu&X264_CPU_PHADD_IS_FAST )
+ if( !(cpu&X264_CPU_PHADD_IS_FAST) )
{
- INIT6( satd, _ssse3_phadd );
- INIT6( satd_x3, _ssse3_phadd );
- INIT6( satd_x4, _ssse3_phadd );
+ INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
}
if( cpu&X264_CPU_SSE4 )
{
- pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sse4;
- pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sse4;
+ INIT7( satd, _sse4 );
+ INIT7( satd_x3, _sse4 );
+ INIT7( satd_x4, _sse4 );
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _sse4 );
+ }
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
}
#endif //HAVE_MMX
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
-;* Loren Merritt <lorenm@u.washington.edu> (misc)
-;* Min Chen <chenm001.163.com> (converted to nasm)
-;* Christian Heine <sennindemokrit@gmx.net> (dct8/idct8 functions)
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;* Loren Merritt <lorenm@u.washington.edu>
+;* Holger Lubitz <holger@lubitz.org>
+;* Min Chen <chenm001.163.com>
+;* Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
SECTION_RODATA
pw_32: times 8 dw 32
+hsub_mul: times 8 db 1, -1
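; 1,-1 byte pattern for pmaddubsw: with pix1/pix2 bytes interleaved, each result word
; is pix1-pix2 (presumably how the ssse3 LOAD_DIFF8x4 path forms differences without
; a separate unpack-to-words step).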
SECTION .text
ADD_STORE_ROW 7, m7, [r1+0x78]
ret
-
-
INIT_XMM
+%macro DCT_SUB8 1
+cglobal x264_sub8x8_dct_%1, 3,3
+ add r2, 4*FDEC_STRIDE
+global x264_sub8x8_dct_%1.skip_prologue
+.skip_prologue:
+%ifnidn %1, sse2
+ mova m7, [hsub_mul GLOBAL]
+%endif
+ LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
+ SPILL r0, 1,2
+ SWAP 2, 7
+ LOAD_DIFF8x4 4, 5, 6, 7, 1, 2, r1, r2-4*FDEC_STRIDE
+ UNSPILL r0, 1
+ SPILL r0, 7
+ SWAP 2, 7
+ UNSPILL r0, 2
+ DCT4_1D 0, 1, 2, 3, 7
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 7
+ UNSPILL r0, 7
+ SPILL r0, 2
+ DCT4_1D 4, 5, 6, 7, 2
+ TRANSPOSE2x4x4W 4, 5, 6, 7, 2
+ UNSPILL r0, 2
+ SPILL r0, 6
+ DCT4_1D 0, 1, 2, 3, 6
+ UNSPILL r0, 6
+ STORE_DCT 0, 1, 2, 3, r0, 0
+ DCT4_1D 4, 5, 6, 7, 3
+ STORE_DCT 4, 5, 6, 7, r0, 64
+ ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_sse2, 3,3
-global x264_sub8x8_dct8_sse2.skip_prologue
+cglobal x264_sub8x8_dct8_%1, 3,3
+ add r2, 4*FDEC_STRIDE
+global x264_sub8x8_dct8_%1.skip_prologue
.skip_prologue:
- LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
- LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
- LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+%ifidn %1, sse2
+ LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
+ LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE]
+ LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE]
+ LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2-1*FDEC_STRIDE]
+ LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+1*FDEC_STRIDE]
SPILL r0, 0
- LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
- LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+ LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
+%else
+ mova m7, [hsub_mul GLOBAL]
+ LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
+ SPILL r0, 0,1
+ SWAP 1, 7
+ LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
+ UNSPILL r0, 0,1
+%endif
DCT8_1D 0,1,2,3,4,5,6,7,r0
UNSPILL r0, 0,4
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
DCT8_1D 0,1,2,3,4,5,6,7,r0
SPILL r0, 1,2,3,5,7
ret
+%endmacro
+
+%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
+%define movdqa movaps
+%define punpcklqdq movlhps
+DCT_SUB8 sse2
+%undef movdqa
+%undef punpcklqdq
+%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
+DCT_SUB8 ssse3
+
+;-----------------------------------------------------------------------------
+; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_add8x8_idct_sse2, 2,2
+ add r0, 4*FDEC_STRIDE
+global x264_add8x8_idct_sse2.skip_prologue
+.skip_prologue:
+ UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
+ SBUTTERFLY qdq, 0, 1, 4
+ SBUTTERFLY qdq, 2, 3, 4
+ UNSPILL_SHUFFLE r1, 4,6,5,7, 4,5,6,7
+ SPILL r1, 0
+ SBUTTERFLY qdq, 4, 5, 0
+ SBUTTERFLY qdq, 6, 7, 0
+ UNSPILL r1,0
+ IDCT4_1D 0,1,2,3,r1
+ SPILL r1, 4
+ TRANSPOSE2x4x4W 0,1,2,3,4
+ UNSPILL r1, 4
+ IDCT4_1D 4,5,6,7,r1
+ SPILL r1, 0
+ TRANSPOSE2x4x4W 4,5,6,7,0
+ UNSPILL r1, 0
+ paddw m0, [pw_32 GLOBAL]
+ IDCT4_1D 0,1,2,3,r1
+ paddw m4, [pw_32 GLOBAL]
+ IDCT4_1D 4,5,6,7,r1
+ SPILL r1, 6,7
+ pxor m7, m7
+ DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
+ DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
+ UNSPILL_SHUFFLE r1, 0,2, 6,7
+ DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
+ DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
+ STORE_IDCT m1, m3, m5, m2
+ ret
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2
+ add r0, 4*FDEC_STRIDE
global x264_add8x8_idct8_sse2.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6,7
pxor m7, m7
- STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*0]
- STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*1]
- STORE_DIFF m2, m6, m7, [r0+FDEC_STRIDE*2]
- STORE_DIFF m3, m6, m7, [r0+FDEC_STRIDE*3]
- STORE_DIFF m4, m6, m7, [r0+FDEC_STRIDE*4]
- STORE_DIFF m5, m6, m7, [r0+FDEC_STRIDE*5]
- UNSPILL_SHUFFLE r1, 0,1, 6,7
- STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*6]
- STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*7]
+ DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
+ DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
+ UNSPILL_SHUFFLE r1, 0,2, 6,7
+ DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
+ DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
+ STORE_IDCT m1, m3, m5, m2
ret
-
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
-;* Loren Merritt <lorenm@u.washington.edu> (dct8, misc)
-;* Min Chen <chenm001.163.com> (converted to nasm)
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;* Loren Merritt <lorenm@u.washington.edu>
+;* Holger Lubitz <holger@lubitz.org>
+;* Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
SECTION_RODATA
pw_32: times 8 dw 32
+hsub_mul: times 8 db 1, -1
SECTION .text
INIT_XMM
%macro DCT8_1D 10
- SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
- SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
- SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25
SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34
+ SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25
+ SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
+ SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
- SUMSUB_BA m%5, m%8 ; %5=a0, %8=a2
- SUMSUB_BA m%6, m%7 ; %6=a1, %7=a3
+ SUMSUB_BA m%6, m%7, m%10 ; %6=a1, %7=a3
+ SUMSUB_BA m%5, m%8, m%10 ; %5=a0, %8=a2
movdqa m%9, m%1
psraw m%9, 1
psubw m%1, m%3 ; %1=a5
psubw m%4, m%2 ; %4=a6
- SUMSUB_BA m%6, m%5 ; %6=b0, %5=b4
-
movdqa m%2, m%10
psraw m%2, 2
paddw m%2, m%9 ; %2=b1
psraw m%9, 2
psubw m%9, m%10 ; %9=b7
+ SUMSUB_BA m%6, m%5, m%10 ; %6=b0, %5=b4
+
movdqa m%3, m%7
psraw m%3, 1
paddw m%3, m%8 ; %3=b2
SWAP %1, %6, %4, %7, %8, %9
%endmacro
-;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_sse2, 3,3,10
- LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
- LOAD_DIFF m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
- LOAD_DIFF m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
- LOAD_DIFF m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
- LOAD_DIFF m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
-
- DCT8_1D 0,1,2,3,4,5,6,7,8,9
- TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
- DCT8_1D 0,1,2,3,4,5,6,7,8,9
-
- movdqa [r0+0x00], m0
- movdqa [r0+0x10], m1
- movdqa [r0+0x20], m2
- movdqa [r0+0x30], m3
- movdqa [r0+0x40], m4
- movdqa [r0+0x50], m5
- movdqa [r0+0x60], m6
- movdqa [r0+0x70], m7
- RET
-
-
%macro IDCT8_1D 10
- SUMSUB_BA m%5, m%1 ; %5=a0, %1=a2
- movdqa m%10, m%3
- psraw m%3, 1
- psubw m%3, m%7 ; %3=a4
- psraw m%7, 1
- paddw m%7, m%10 ; %7=a6
+ SUMSUB_BA m%5, m%1, m%9 ; %5=a0, %1=a2
movdqa m%9, m%2
psraw m%9, 1
paddw m%9, m%4
paddw m%9, m%6 ; %9=a7
+ movdqa m%10, m%3
+ psraw m%3, 1
+ psubw m%3, m%7 ; %3=a4
+ psraw m%7, 1
+ paddw m%7, m%10 ; %7=a6
+
movdqa m%10, m%6
psraw m%10, 1
paddw m%10, m%6
psubw m%2, m%4 ; %2=a3
psubw m%6, m%8 ; %6=a1
- SUMSUB_BA m%7, m%5 ; %7=b0, %5=b6
- SUMSUB_BA m%3, m%1 ; %3=b2, %1=b4
-
movdqa m%4, m%9
psraw m%4, 2
paddw m%4, m%6 ; %4=b1
psraw m%6, 2
psubw m%9, m%6 ; %9=b7
+ SUMSUB_BA m%7, m%5, m%6 ; %7=b0, %5=b6
+ SUMSUB_BA m%3, m%1, m%6; %3=b2, %1=b4
+
movdqa m%8, m%10
psraw m%8, 2
paddw m%8, m%2 ; %8=b3
psraw m%2, 2
psubw m%2, m%10 ; %2=b5
- SUMSUB_BA m%9, m%7 ; %9=c0, %7=c7
- SUMSUB_BA m%2, m%3 ; %2=c1, %3=c6
- SUMSUB_BA m%8, m%1 ; %8=c2, %1=c5
- SUMSUB_BA m%4, m%5 ; %4=c3, %5=c4
+ SUMSUB_BA m%9, m%7, m%6 ; %9=c0, %7=c7
+ SUMSUB_BA m%2, m%3, m%6 ; %2=c1, %3=c6
+ SUMSUB_BA m%8, m%1, m%6 ; %8=c2, %1=c5
+ SUMSUB_BA m%4, m%5, m%6 ; %4=c3, %5=c4
SWAP %1, %9, %6
SWAP %3, %8, %7
%endmacro
+%macro DCT_SUB8 1
+cglobal x264_sub8x8_dct_%1, 3,3,11
+ add r2, 4*FDEC_STRIDE
+%ifnidn %1, sse2
+ mova m7, [hsub_mul GLOBAL]
+%endif
+%ifdef WIN64
+ call .skip_prologue
+ RET
+%endif
+global x264_sub8x8_dct_%1.skip_prologue
+.skip_prologue:
+ SWAP 7, 9
+ LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
+ LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
+ DCT4_1D 0, 1, 2, 3, 8
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 8
+ DCT4_1D 4, 5, 6, 7, 8
+ TRANSPOSE2x4x4W 4, 5, 6, 7, 8
+ DCT4_1D 0, 1, 2, 3, 8
+ STORE_DCT 0, 1, 2, 3, r0, 0
+ DCT4_1D 4, 5, 6, 7, 8
+ STORE_DCT 4, 5, 6, 7, r0, 64
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+cglobal x264_sub8x8_dct8_%1, 3,3,11
+ add r2, 4*FDEC_STRIDE
+%ifnidn %1, sse2
+ mova m7, [hsub_mul GLOBAL]
+%endif
+%ifdef WIN64
+ call .skip_prologue
+ RET
+%endif
+global x264_sub8x8_dct8_%1.skip_prologue
+.skip_prologue:
+ SWAP 7, 10
+ LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
+ LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
+ DCT8_1D 0,1,2,3,4,5,6,7,8,9
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
+ DCT8_1D 0,1,2,3,4,5,6,7,8,9
+ movdqa [r0+0x00], m0
+ movdqa [r0+0x10], m1
+ movdqa [r0+0x20], m2
+ movdqa [r0+0x30], m3
+ movdqa [r0+0x40], m4
+ movdqa [r0+0x50], m5
+ movdqa [r0+0x60], m6
+ movdqa [r0+0x70], m7
+ ret
+%endmacro
+
+%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
+%define movdqa movaps
+%define punpcklqdq movlhps
+DCT_SUB8 sse2
+%undef movdqa
+%undef punpcklqdq
+%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
+DCT_SUB8 ssse3
+
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2, 2,2,10
+cglobal x264_add8x8_idct8_sse2, 2,2,11
+ add r0, 4*FDEC_STRIDE
+ pxor m7, m7
+%ifdef WIN64
+ call .skip_prologue
+ RET
+%endif
+global x264_add8x8_idct8_sse2.skip_prologue
+.skip_prologue:
+ SWAP 7, 9
movdqa m0, [r1+0x00]
movdqa m1, [r1+0x10]
movdqa m2, [r1+0x20]
movdqa m5, [r1+0x50]
movdqa m6, [r1+0x60]
movdqa m7, [r1+0x70]
-
- IDCT8_1D 0,1,2,3,4,5,6,7,8,9
+ IDCT8_1D 0,1,2,3,4,5,6,7,8,10
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
- IDCT8_1D 0,1,2,3,4,5,6,7,8,9
-
- pxor m9, m9
- STORE_DIFF m0, m8, m9, [r0+0*FDEC_STRIDE]
- STORE_DIFF m1, m8, m9, [r0+1*FDEC_STRIDE]
- STORE_DIFF m2, m8, m9, [r0+2*FDEC_STRIDE]
- STORE_DIFF m3, m8, m9, [r0+3*FDEC_STRIDE]
- STORE_DIFF m4, m8, m9, [r0+4*FDEC_STRIDE]
- STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
- STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
- STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
- RET
-
+ IDCT8_1D 0,1,2,3,4,5,6,7,8,10
+ DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
+ DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
+ DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
+ DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
+ STORE_IDCT m1, m3, m5, m7
+ ret
+;-----------------------------------------------------------------------------
+; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
+;-----------------------------------------------------------------------------
+cglobal x264_add8x8_idct_sse2, 2,2,11
+ add r0, 4*FDEC_STRIDE
+ pxor m7, m7
+%ifdef WIN64
+ call .skip_prologue
+ RET
+%endif
+global x264_add8x8_idct_sse2.skip_prologue
+.skip_prologue:
+ SWAP 7, 9
+ mova m0, [r1+ 0]
+ mova m2, [r1+16]
+ mova m1, [r1+32]
+ mova m3, [r1+48]
+ SBUTTERFLY qdq, 0, 1, 4
+ SBUTTERFLY qdq, 2, 3, 4
+ mova m4, [r1+64]
+ mova m6, [r1+80]
+ mova m5, [r1+96]
+ mova m7, [r1+112]
+ SBUTTERFLY qdq, 4, 5, 8
+ SBUTTERFLY qdq, 6, 7, 8
+ IDCT4_1D 0,1,2,3,8,10
+ TRANSPOSE2x4x4W 0,1,2,3,8
+ IDCT4_1D 4,5,6,7,8,10
+ TRANSPOSE2x4x4W 4,5,6,7,8
+ paddw m0, [pw_32 GLOBAL]
+ IDCT4_1D 0,1,2,3,8,10
+ paddw m4, [pw_32 GLOBAL]
+ IDCT4_1D 4,5,6,7,8,10
+ DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
+ DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
+ DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
+ DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
+ STORE_IDCT m1, m3, m5, m7
+ ret
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;* Authors: Holger Lubitz <holger@lubitz.org>
+;* Laurent Aimar <fenrir@via.ecp.fr>
;* Loren Merritt <lorenm@u.washington.edu>
-;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
SECTION_RODATA
pw_32: times 8 dw 32
pw_8000: times 8 dw 0x8000
+hsub_mul: times 8 db 1, -1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
SECTION .text
-%macro HADAMARD4_1D 4
- SUMSUB_BADC m%2, m%1, m%4, m%3
- SUMSUB_BADC m%4, m%2, m%3, m%1
+%macro WALSH4_1D 5
+ SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
+ SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
SWAP %1, %4, %3
%endmacro
%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
movq m%3, m%4
- paddw m%1, m%4
+ pxor m%1, m%4
psubw m%3, m%2
- paddw m%2, m%4
+ pxor m%2, m%4
pavgw m%3, m%1
pavgw m%2, m%1
- psubw m%3, m%4
- psubw m%2, m%4
+ pxor m%3, m%4
+ pxor m%2, m%4
SWAP %1, %2, %3
%endmacro
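; note on the pxor form above: for 16-bit lanes, xor with 0x8000 flips the sign bit,
; which equals adding (or subtracting) 0x8000 mod 2^16, so it biases the values to
; unsigned for pavgw exactly as the previous paddw/psubw pair did.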
+INIT_MMX
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx, 1,1
- movq m0, [r0+ 0]
- movq m1, [r0+ 8]
- movq m2, [r0+16]
movq m3, [r0+24]
+ movq m2, [r0+16]
+ movq m1, [r0+ 8]
+ movq m0, [r0+ 0]
movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
- HADAMARD4_1D 0,1,2,3
+ WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
- SUMSUB_BADC m1, m0, m3, m2
- SWAP 0,1
- SWAP 2,3
+ SUMSUB_BADC m1, m0, m3, m2, m4
+ SWAP 0, 1
+ SWAP 2, 3
SUMSUB_17BIT 0,2,4,7
SUMSUB_17BIT 1,3,5,7
movq [r0+0], m0
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx, 1,1
- movq m0, [r0+ 0]
- movq m1, [r0+ 8]
- movq m2, [r0+16]
- movq m3, [r0+24]
- HADAMARD4_1D 0,1,2,3
+ movq m3, [r0+24]
+ movq m2, [r0+16]
+ movq m1, [r0+ 8]
+ movq m0, [r0+ 0]
+ WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
- HADAMARD4_1D 0,1,2,3
+ WALSH4_1D 0,1,2,3,4
movq [r0+ 0], m0
movq [r0+ 8], m1
movq [r0+16], m2
movq [r0+24], m3
RET
-%macro DCT4_1D 5
- SUMSUB_BADC m%4, m%1, m%3, m%2
- SUMSUB_BA m%3, m%4
- SUMSUB2_AB m%1, m%2, m%5
- SWAP %1, %3, %4, %5, %2
-%endmacro
-
-%macro IDCT4_1D 6
- SUMSUB_BA m%3, m%1
- SUMSUBD2_AB m%2, m%4, m%6, m%5
- SUMSUB_BADC m%2, m%3, m%5, m%1
- SWAP %1, %2, %5, %4, %3
-%endmacro
-
+%macro SUB_DCT4 1
;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub4x4_dct_mmx, 3,3
+cglobal x264_sub4x4_dct_%1, 3,3
+%ifidn %1, mmx
.skip_prologue:
-%macro SUB_DCT4 1
- LOAD_DIFF m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+%else
+ mova m5, [hsub_mul GLOBAL]
+ LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
+%endif
DCT4_1D 0,1,2,3,4
- TRANSPOSE%1 0,1,2,3,4
+ TRANSPOSE4x4W 0,1,2,3,4
DCT4_1D 0,1,2,3,4
movq [r0+ 0], m0
movq [r0+ 8], m1
movq [r0+16], m2
movq [r0+24], m3
-%endmacro
- SUB_DCT4 4x4W
RET
+%endmacro
+
+SUB_DCT4 mmx
+SUB_DCT4 ssse3
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx, 2,2
+ pxor m7, m7
.skip_prologue:
- movq m0, [r1+ 0]
movq m1, [r1+ 8]
- movq m2, [r1+16]
movq m3, [r1+24]
-%macro ADD_IDCT4 1
+ movq m2, [r1+16]
+ movq m0, [r1+ 0]
IDCT4_1D 0,1,2,3,4,5
- TRANSPOSE%1 0,1,2,3,4
+ TRANSPOSE4x4W 0,1,2,3,4
paddw m0, [pw_32 GLOBAL]
IDCT4_1D 0,1,2,3,4,5
- pxor m7, m7
STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
-%endmacro
- ADD_IDCT4 4x4W
- RET
-
-INIT_XMM
-
-cglobal x264_sub8x8_dct_sse2, 3,3,8
-.skip_prologue:
- call .8x4
- add r0, 64
- add r1, 4*FENC_STRIDE
- add r2, 4*FDEC_STRIDE
-%ifdef WIN64
- call .8x4
- RET
-%endif
-.8x4:
- SUB_DCT4 2x4x4W
- movhps [r0+32], m0
- movhps [r0+40], m1
- movhps [r0+48], m2
- movhps [r0+56], m3
- ret
-
-cglobal x264_add8x8_idct_sse2, 2,2,8
-.skip_prologue:
- call .8x4
- add r1, 64
- add r0, 4*FDEC_STRIDE
-%ifdef WIN64
- call .8x4
RET
-%endif
-.8x4:
- movq m0, [r1+ 0]
- movq m1, [r1+ 8]
- movq m2, [r1+16]
- movq m3, [r1+24]
- movhps m0, [r1+32]
- movhps m1, [r1+40]
- movhps m2, [r1+48]
- movhps m3, [r1+56]
- ADD_IDCT4 2x4x4W
- ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
-cglobal %1, 3,3
+cglobal %1, 3,3,11
+%if mmsize == 8
+ pxor m7, m7
+%else
+ add r2, 4*FDEC_STRIDE
+ mova m7, [hsub_mul GLOBAL]
+%endif
.skip_prologue:
%ifdef WIN64
sub rsp, 8
add r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
add rsp, 8
-%endif
+ call %2
+ RET
+%else
jmp %2
+%endif
%endmacro
;-----------------------------------------------------------------------------
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-%macro ADD_NxN_IDCT 6
-cglobal %1, 2,2
+%macro ADD_NxN_IDCT 6-7
+cglobal %1, 2,2,11
+ pxor m7, m7
+%if mmsize==16
+ add r0, 4*FDEC_STRIDE
+%endif
.skip_prologue:
%ifdef WIN64
sub rsp, 8
add r1, %3
%ifdef WIN64
add rsp, 8
-%endif
+ call %2
+ RET
+%else
jmp %2
+%endif
%endmacro
%ifndef ARCH_X86_64
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
-%define x264_sub8x8_dct_sse2 x264_sub8x8_dct_sse2.skip_prologue
-%define x264_add8x8_idct_sse2 x264_add8x8_idct_sse2.skip_prologue
-%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
-%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif
-SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2, 64, 8, 0, 4
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2, 64, 8, 0, 4
+INIT_XMM
+
+cextern x264_sub8x8_dct_sse2.skip_prologue
+cextern x264_sub8x8_dct_ssse3.skip_prologue
+SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
+cextern x264_add8x8_idct_sse2.skip_prologue
+ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
+
+cextern x264_sub8x8_dct8_sse2.skip_prologue
+cextern x264_add8x8_idct8_sse2.skip_prologue
+SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
+
+cextern x264_sub8x8_dct8_ssse3.skip_prologue
+SUB_NxN_DCT x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
-cextern x264_sub8x8_dct8_sse2
-cextern x264_add8x8_idct8_sse2
-SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
void x264_sub16x16_dct_mmx ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_sse2 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_ssse3 ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_ssse3 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
+
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_sub16x16_dct8_mmx ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_sse2 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_sse2 ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_ssse3 ( int16_t dct[8][8] , uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_ssse3( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][8][8] );
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
LOAD_DIFF_4x8P 0
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
movq [spill], m1
TRANSPOSE4x4W 4, 5, 6, 7, 1
mov r0, [args+4]
mov r2, [args]
LOAD_DIFF_4x8P 4
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq m2, [trans+0x10]
movq m3, [trans+0x18]
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
SUM4x8_MM
movq [trans], m0
movq m6, [trans+0x50]
movq m7, [trans+0x58]
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
SUM4x8_MM
pavgw m0, [trans]
%define trans esp+0 ; +96
%define sum esp+0 ; +32
LOAD_4x8P 0
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
movq [spill], m0
TRANSPOSE4x4W 4, 5, 6, 7, 0
movq [trans+0x38], m3
LOAD_4x8P 4
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq m2, [trans+0x10]
movq m3, [trans+0x18]
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
movq [spill+0], m0
movq [spill+8], m1
movq m6, [trans+0x50]
movq m7, [trans+0x58]
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+ HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
movd [sum+0x10], m0
movd [sum+0x12], m1
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Holger Lubitz <holger@lubitz.org>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Alex Izvorski <aizvorksi@gmail.com>
;* Fiona Glaser <fiona@x264.com>
%include "x86util.asm"
SECTION_RODATA
-pw_1: times 8 dw 1
-ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
-ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
-mask_ff: times 16 db 0xff
- times 16 db 0
-mask_ac4: dw 0,-1,-1,-1, 0,-1,-1,-1
-mask_ac8: dw 0,-1,-1,-1,-1,-1,-1,-1
+pw_1: times 8 dw 1
+pw_00ff: times 8 dw 0xff
+ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
+ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
+mask_ff: times 16 db 0xff
+ times 16 db 0
+mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1
+mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
+mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1
+hsub_mul: times 8 db 1, -1
+hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+hmul_8p: times 8 db 1
+ times 4 db 1, -1
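; hmul_4p/hmul_8p: pmaddubsw multiplier patterns; the all-ones lanes give sums of
; adjacent bytes and the 1,-1 lanes give differences, apparently fusing the first
; horizontal sumsub stage of the transform into the load.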
+mask_10: times 4 dw 0, -1
+mask_1100: times 2 dd 0, -1
SECTION .text
pshuflw %2, %1, 0xE
paddd %1, %2
%else
- mova %2, %1
- psrlq %2, 32
+ pshufw %2, %1, 0xE
paddd %1, %2
%endif
%endmacro
; SSD
;=============================================================================
-%macro SSD_FULL 6
+%macro SSD_LOAD_FULL 5
mova m1, [r0+%1]
mova m2, [r2+%2]
mova m3, [r0+%3]
mova m4, [r2+%4]
-
- mova m5, m2
- mova m6, m4
- psubusb m2, m1
- psubusb m4, m3
- psubusb m1, m5
- psubusb m3, m6
- por m1, m2
- por m3, m4
-
- mova m2, m1
- mova m4, m3
- punpcklbw m1, m7
- punpcklbw m3, m7
- punpckhbw m2, m7
- punpckhbw m4, m7
- pmaddwd m1, m1
- pmaddwd m2, m2
- pmaddwd m3, m3
- pmaddwd m4, m4
-
-%if %6
+%if %5
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
%endif
- paddd m1, m2
- paddd m3, m4
+%endmacro
+
+%macro LOAD 5
+ movh m%1, %3
+ movh m%2, %4
%if %5
- paddd m0, m1
-%else
- SWAP m0, m1
+ lea r0, [r0+2*r1]
%endif
- paddd m0, m3
%endmacro
-%macro SSD_HALF 6
- movh m1, [r0+%1]
- movh m2, [r2+%2]
- movh m3, [r0+%3]
- movh m4, [r2+%4]
+%macro JOIN 7
+ movh m%3, %5
+ movh m%4, %6
+%if %7
+ lea r2, [r2+2*r3]
+%endif
+ punpcklbw m%1, m7
+ punpcklbw m%3, m7
+ psubw m%1, m%3
+ punpcklbw m%2, m7
+ punpcklbw m%4, m7
+ psubw m%2, m%4
+%endmacro
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
- psubw m1, m2
- psubw m3, m4
- pmaddwd m1, m1
- pmaddwd m3, m3
+%macro JOIN_SSE2 7
+ movh m%3, %5
+ movh m%4, %6
+%if %7
+ lea r2, [r2+2*r3]
+%endif
+ punpcklqdq m%1, m%2
+ punpcklqdq m%3, m%4
+ DEINTB %2, %1, %4, %3, 7
+ psubw m%2, m%4
+ psubw m%1, m%3
+%endmacro
-%if %6
- lea r0, [r0+2*r1]
+%macro JOIN_SSSE3 7
+ movh m%3, %5
+ movh m%4, %6
+%if %7
lea r2, [r2+2*r3]
%endif
-%if %5
- paddd m0, m1
-%else
- SWAP m0, m1
+ punpcklbw m%1, m%3
+ punpcklbw m%2, m%4
+%endmacro
+
+%macro SSD_LOAD_HALF 5
+ LOAD 1, 2, [r0+%1], [r0+%3], 1
+ JOIN 1, 2, 3, 4, [r2+%2], [r2+%4], 1
+ LOAD 3, 4, [r0+%1], [r0+%3], %5
+ JOIN 3, 4, 5, 6, [r2+%2], [r2+%4], %5
+%endmacro
+
+%macro SSD_CORE 7-8
+%ifidn %8, FULL
+ mova m%6, m%2
+ mova m%7, m%4
+ psubusb m%2, m%1
+ psubusb m%4, m%3
+ psubusb m%1, m%6
+ psubusb m%3, m%7
+ por m%1, m%2
+ por m%3, m%4
+ mova m%2, m%1
+ mova m%4, m%3
+ punpckhbw m%1, m%5
+ punpckhbw m%3, m%5
+ punpcklbw m%2, m%5
+ punpcklbw m%4, m%5
%endif
- paddd m0, m3
+ pmaddwd m%1, m%1
+ pmaddwd m%2, m%2
+ pmaddwd m%3, m%3
+ pmaddwd m%4, m%4
%endmacro
-%macro SSD_QUARTER 6
- movd m1, [r0+%1]
- movd m2, [r2+%2]
- movd m3, [r0+%3]
- movd m4, [r2+%4]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- pinsrd m1, [r0+%1], 1
- pinsrd m2, [r2+%2], 1
- pinsrd m3, [r0+%3], 1
- pinsrd m4, [r2+%4], 1
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
- psubw m1, m2
- psubw m3, m4
- pmaddwd m1, m1
- pmaddwd m3, m3
+%macro SSD_CORE_SSE2 7-8
+%ifidn %8, FULL
+ DEINTB %6, %1, %7, %2, %5
+ psubw m%6, m%7
+ psubw m%1, m%2
+ SWAP %2, %6
+ DEINTB %6, %3, %7, %4, %5
+ psubw m%6, m%7
+ psubw m%3, m%4
+ SWAP %4, %6
+%endif
+ pmaddwd m%1, m%1
+ pmaddwd m%2, m%2
+ pmaddwd m%3, m%3
+ pmaddwd m%4, m%4
+%endmacro
-%if %6
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+%macro SSD_CORE_SSSE3 7-8
+%ifidn %8, FULL
+ mova m%6, m%1
+ mova m%7, m%3
+ punpcklbw m%1, m%2
+ punpcklbw m%3, m%4
+ punpckhbw m%6, m%2
+ punpckhbw m%7, m%4
+ SWAP %6, %2
+ SWAP %7, %4
%endif
-%if %5
+ pmaddubsw m%1, m%5
+ pmaddubsw m%2, m%5
+ pmaddubsw m%3, m%5
+ pmaddubsw m%4, m%5
+ pmaddwd m%1, m%1
+ pmaddwd m%2, m%2
+ pmaddwd m%3, m%3
+ pmaddwd m%4, m%4
+%endmacro
+
+%macro SSD_END 1
+ paddd m1, m2
+ paddd m3, m4
+%if %1
paddd m0, m1
%else
- SWAP m0, m1
+ SWAP 0, 1
%endif
paddd m0, m3
%endmacro
+%macro SSD_ITER 7
+ SSD_LOAD_%1 %2,%3,%4,%5,%7
+ SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
+ SSD_END %6
+%endmacro
+
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 3-4 0
cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
-%if %1 >= mmsize
+%ifidn %3, ssse3
+ mova m7, [hsub_mul GLOBAL]
+%elifidn %3, sse2
+ mova m7, [pw_00ff GLOBAL]
+%elif %1 >= mmsize
pxor m7, m7
%endif
%assign i 0
-%rep %2/2
+%rep %2/4
%if %1 > mmsize
- SSD_FULL 0, 0, mmsize, mmsize, i, 0
- SSD_FULL r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/2-1
+ SSD_ITER FULL, 0, 0, mmsize, mmsize, i, 0
+ SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, 1
+ SSD_ITER FULL, 0, 0, mmsize, mmsize, 1, 0
+ SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/4-1
%elif %1 == mmsize
- SSD_FULL 0, 0, r1, r3, i, i<%2/2-1
+ SSD_ITER FULL, 0, 0, r1, r3, i, 1
+ SSD_ITER FULL, 0, 0, r1, r3, 1, i<%2/4-1
%else
- SSD_HALF 0, 0, r1, r3, i, i<%2/2-1
+ SSD_ITER HALF, 0, 0, r1, r3, i, i<%2/4-1
%endif
%assign i i+1
%endrep
SSD 4, 8, mmx
SSD 4, 4, mmx
INIT_XMM
+SSD 16, 16, sse2slow, 8
+SSD 16, 8, sse2slow, 8
+SSD 8, 16, sse2slow, 8
+SSD 8, 8, sse2slow, 8
+SSD 8, 4, sse2slow, 8
+%define SSD_CORE SSD_CORE_SSE2
+%define JOIN JOIN_SSE2
SSD 16, 16, sse2, 8
SSD 16, 8, sse2, 8
-SSD 8, 16, sse2, 5
-SSD 8, 8, sse2, 5
-SSD 8, 4, sse2, 5
-
-cglobal x264_pixel_ssd_4x8_sse4, 4,4
- SSD_QUARTER 0, 0, r1, r3, 0, 1
- SSD_QUARTER 0, 0, r1, r3, 1, 0
- HADDD m0, m1
- movd eax, m0
- RET
-
-cglobal x264_pixel_ssd_4x4_sse4, 4,4
- SSD_QUARTER 0, 0, r1, r3, 0, 0
- HADDD m0, m1
- movd eax, m0
- RET
-
+SSD 8, 16, sse2, 8
+SSD 8, 8, sse2, 8
+SSD 8, 4, sse2, 8
+%define SSD_CORE SSD_CORE_SSSE3
+%define JOIN JOIN_SSSE3
+SSD 16, 16, ssse3, 8
+SSD 16, 8, ssse3, 8
+SSD 8, 16, ssse3, 8
+SSD 8, 8, ssse3, 8
+SSD 8, 4, ssse3, 8
+INIT_MMX
+SSD 4, 8, ssse3
+SSD 4, 4, ssse3
;=============================================================================
; variance
; SATD
;=============================================================================
-; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
-; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
-; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
-; whereas phaddw-based transform doesn't care what order the coefs end up in.
+%macro TRANS_SSE2 5-6
+; TRANSPOSE2x2
+; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
+; %2: ord/unord (for compat with sse4, unused)
+; %3/%4: source regs
+; %5/%6: tmp regs
+%ifidn %1, d
+%define mask [mask_10 GLOBAL]
+%define shift 16
+%elifidn %1, q
+%define mask [mask_1100 GLOBAL]
+%define shift 32
+%endif
+%if %0==6 ; less dependency if we have two tmp
+ mova m%5, mask ; ff00
+ mova m%6, m%4 ; x5x4
+ psll%1 m%4, shift ; x4..
+ pand m%6, m%5 ; x5..
+ pandn m%5, m%3 ; ..x0
+ psrl%1 m%3, shift ; ..x1
+ por m%4, m%5 ; x4x0
+ por m%3, m%6 ; x5x1
+%else ; longer dependency chain, one insn fewer. sometimes faster, sometimes not
+ mova m%5, m%4 ; x5x4
+ psll%1 m%4, shift ; x4..
+ pxor m%4, m%3 ; (x4^x1)x0
+ pand m%4, mask ; (x4^x1)..
+ pxor m%3, m%4 ; x4x0
+ psrl%1 m%4, shift ; ..(x1^x4)
+ pxor m%5, m%4 ; x5x1
+ SWAP %4, %3, %5
+%endif
+%endmacro
+
+%define TRANS TRANS_SSE2
-%macro PHSUMSUB 3
- movdqa m%3, m%1
- phaddw m%1, m%2
- phsubw m%3, m%2
- SWAP %2, %3
+%macro TRANS_SSE4 5-6 ; see above
+%ifidn %1, d
+%define mask 10101010b
+%define shift 16
+%elifidn %1, q
+%define mask 11001100b
+%define shift 32
+%endif
+ mova m%5, m%3
+%ifidn %2, ord
+ psrl%1 m%3, shift
+%endif
+ pblendw m%3, m%4, mask
+ psll%1 m%4, shift
+%ifidn %2, ord
+ pblendw m%4, m%5, 255^mask
+%else
+ psrl%1 m%5, shift
+ por m%4, m%5
+%endif
%endmacro
-%macro HADAMARD4_ROW_PHADD 5
- PHSUMSUB %1, %2, %5
- PHSUMSUB %3, %4, %5
- PHSUMSUB %1, %3, %5
- PHSUMSUB %2, %4, %5
- SWAP %3, %4
+%macro JDUP_SSE2 2
+ punpckldq %1, %2
+ ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
%endmacro
-%macro HADAMARD4_1D 4
- SUMSUB_BADC %1, %2, %3, %4
- SUMSUB_BADC %1, %3, %2, %4
+%macro JDUP_CONROE 2
+ ; join 2x 32 bit and duplicate them
+ ; emulating shufps is faster on conroe
+ punpcklqdq %1, %2
+ movsldup %1, %1
%endmacro
-%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
- %xdefine %%n n%1
- HADAMARD4_1D m4, m5, m6, m7
- TRANSPOSE4x4W 4, 5, 6, 7, %%n
- HADAMARD4_1D m4, m5, m6, m7
- ABS2 m4, m5, m3, m %+ %%n
- ABS2 m6, m7, m3, m %+ %%n
- paddw m6, m4
- paddw m7, m5
- pavgw m6, m7
- SWAP %%n, 6
+%macro JDUP_PENRYN 2
+ ; just use shufps on anything post conroe
+ shufps %1, %2, 0
+%endmacro
+
+%macro HSUMSUB 5
+ pmaddubsw m%2, m%5
+ pmaddubsw m%1, m%5
+ pmaddubsw m%4, m%5
+ pmaddubsw m%3, m%5
+%endmacro
+
+%macro DIFF_UNPACK_SSE2 5
+ punpcklbw m%1, m%5
+ punpcklbw m%2, m%5
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ psubw m%1, m%2
+ psubw m%3, m%4
+%endmacro
+
+%macro DIFF_SUMSUB_SSSE3 5
+ HSUMSUB %1, %2, %3, %4, %5
+ psubw m%1, m%2
+ psubw m%3, m%4
+%endmacro
+
+%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
+ movd %1, %3
+ movd %2, %4
+ JDUP %1, %2
+%endmacro
+
+%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
+ movddup m%3, %6
+ movddup m%4, %8
+ movddup m%1, %5
+ movddup m%2, %7
+%endmacro
+
+%macro LOAD_DUP_4x8P_PENRYN 8
+ ; penryn and nehalem run punpcklqdq and movddup in different units
+ movh m%3, %6
+ movh m%4, %8
+ punpcklqdq m%3, m%3
+ movddup m%1, %5
+ punpcklqdq m%4, m%4
+ movddup m%2, %7
+%endmacro
+
+%macro LOAD_SUMSUB_8x2P 9
+ LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
+ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
+%endmacro
+
+%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
+; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
+ LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
+ LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
+%if %10
+ lea %8, [%8+4*r1]
+ lea %9, [%9+4*r3]
+%endif
+%endmacro
+
+%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
+ movddup m%1, [%7]
+ movddup m%2, [%7+8]
+ mova m%4, [%6]
+ movddup m%3, m%4
+ punpckhqdq m%4, m%4
+ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
+%endmacro
+
+%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
+ movu m%4, [%7]
+ mova m%2, [%6]
+ DEINTB %1, %2, %3, %4, %5
+ psubw m%1, m%3
+ psubw m%2, m%4
+ SUMSUB_BA m%1, m%2, m%3
+%endmacro
+
+%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
+; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
+ LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
+ LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
+ LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
+ LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro
; in: r4=3*stride1, r5=3*stride2
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
+ %xdefine %%n n%1
LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2]
LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2]
LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
- HADAMARD4x4_SUM %1
+ HADAMARD4_2D 4, 5, 6, 7, 3, %%n
+ paddw m4, m6
+ SWAP %%n, 4
%endmacro
-%macro SATD_8x4_SSE2 1
- HADAMARD4_1D m0, m1, m2, m3
-%ifidn %1, ssse3_phadd
- HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4
+%macro SATD_8x4_SSE 8-9
+%ifidn %1, sse2
+ HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
+%else
+ HADAMARD4_V m%2, m%3, m%4, m%5, m%6
+ ; doing the abs first is a slight advantage
+ ABS4 m%2, m%4, m%3, m%5, m%6, m%7
+ HADAMARD 1, max, %2, %4, %6, %7
+%endif
+%ifnidn %9, swap
+ paddw m%8, m%2
%else
- TRANSPOSE2x4x4W 0, 1, 2, 3, 4
- HADAMARD4_1D m0, m1, m2, m3
-%endif
- ABS4 m0, m1, m2, m3, m4, m5
- paddusw m0, m1
- paddusw m2, m3
- paddusw m6, m0
- paddusw m6, m2
+ SWAP %8, %2
+%endif
+%ifidn %1, sse2
+ paddw m%8, m%4
+%else
+ HADAMARD 1, max, %3, %5, %6, %7
+ paddw m%8, m%3
+%endif
%endmacro
%macro SATD_START_MMX 0
paddw m0, m1
SATD_END_MMX
-%macro SATD_W4 1
-INIT_MMX
-cglobal x264_pixel_satd_4x4_%1, 4,6
+cglobal x264_pixel_satd_4x4_mmxext, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
-%endmacro
-
-SATD_W4 mmxext
-%macro SATD_START_SSE2 0
- pxor m6, m6
+%macro SATD_START_SSE2 3
+%ifnidn %1, sse2
+ mova %3, [hmul_8p GLOBAL]
+%endif
lea r4, [3*r1]
lea r5, [3*r3]
+ pxor %2, %2
%endmacro
-%macro SATD_END_SSE2 0
- psrlw m6, 1
- HADDW m6, m7
- movd eax, m6
+%macro SATD_END_SSE2 2
+ HADDW %2, m7
+ movd eax, %2
RET
%endmacro
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 1
INIT_XMM
+%ifnidn %1, sse2
+cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
+ SATD_START_MMX
+ mova m4, [hmul_4p GLOBAL]
+ LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
+ LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
+ LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
+ LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
+ HADAMARD 0, sumsub, 0, 1, 2, 3
+ HADAMARD 4, sumsub, 0, 1, 2, 3
+ HADAMARD 1, amax, 0, 1, 2, 3
+ HADDW m0, m1
+ movd eax, m0
+ RET
+%endif
+
+cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
+ SATD_START_MMX
+%ifnidn %1, sse2
+ mova m7, [hmul_4p GLOBAL]
+%endif
+ movd m4, [r2]
+ movd m5, [r2+r3]
+ movd m6, [r2+2*r3]
+ add r2, r5
+ movd m0, [r0]
+ movd m1, [r0+r1]
+ movd m2, [r0+2*r1]
+ add r0, r4
+ movd m3, [r2+r3]
+ JDUP m4, m3
+ movd m3, [r0+r1]
+ JDUP m0, m3
+ movd m3, [r2+2*r3]
+ JDUP m5, m3
+ movd m3, [r0+2*r1]
+ JDUP m1, m3
+ DIFFOP 0, 4, 1, 5, 7
+ movd m5, [r2]
+ add r2, r5
+ movd m3, [r0]
+ add r0, r4
+ movd m4, [r2]
+ JDUP m6, m4
+ movd m4, [r0]
+ JDUP m2, m4
+ movd m4, [r2+r3]
+ JDUP m5, m4
+ movd m4, [r0+r1]
+ JDUP m3, m4
+ DIFFOP 2, 6, 3, 5, 7
+ SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6, swap
+ HADDW m6, m1
+ movd eax, m6
+ RET
+
cglobal x264_pixel_satd_8x8_internal_%1
- LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
- SATD_8x4_SSE2 %1
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
+ LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
+ SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
x264_pixel_satd_8x4_internal_%1:
- LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
-x264_pixel_satd_4x8_internal_%1:
- SAVE_MM_PERMUTATION satd_4x8_internal
- SATD_8x4_SSE2 %1
+ LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
+ SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
ret
-cglobal x264_pixel_satd_16x16_%1, 4,6,8
- SATD_START_SSE2
- BACKUP_POINTERS
- call x264_pixel_satd_8x8_internal_%1
- lea r0, [r0+4*r1]
+%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
+cglobal x264_pixel_satd_16x4_internal_%1
+ LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
+ lea r0, [r0+4*r1]
+ SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
+ SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
+ ret
+
+cglobal x264_pixel_satd_16x8_%1, 4,6,12
+ SATD_START_SSE2 %1, m10, m7
+%ifidn %1, sse2
+ mova m7, [pw_00ff GLOBAL]
+%endif
+ jmp x264_pixel_satd_16x8_internal_%1
+
+cglobal x264_pixel_satd_16x16_%1, 4,6,12
+ SATD_START_SSE2 %1, m10, m7
+%ifidn %1, sse2
+ mova m7, [pw_00ff GLOBAL]
+%endif
+ call x264_pixel_satd_16x4_internal_%1
+ call x264_pixel_satd_16x4_internal_%1
+x264_pixel_satd_16x8_internal_%1:
+ call x264_pixel_satd_16x4_internal_%1
+ call x264_pixel_satd_16x4_internal_%1
+ SATD_END_SSE2 %1, m10
+%else
+cglobal x264_pixel_satd_16x8_%1, 4,6,8
+ SATD_START_SSE2 %1, m6, m7
+ BACKUP_POINTERS
call x264_pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
call x264_pixel_satd_8x8_internal_%1
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- call x264_pixel_satd_8x8_internal_%1
- SATD_END_SSE2
+ SATD_END_SSE2 %1, m6
-cglobal x264_pixel_satd_16x8_%1, 4,6,8
- SATD_START_SSE2
+cglobal x264_pixel_satd_16x16_%1, 4,6,8
+ SATD_START_SSE2 %1, m6, m7
BACKUP_POINTERS
call x264_pixel_satd_8x8_internal_%1
+ call x264_pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
call x264_pixel_satd_8x8_internal_%1
- SATD_END_SSE2
+ call x264_pixel_satd_8x8_internal_%1
+ SATD_END_SSE2 %1, m6
+%endif
cglobal x264_pixel_satd_8x16_%1, 4,6,8
- SATD_START_SSE2
+ SATD_START_SSE2 %1, m6, m7
call x264_pixel_satd_8x8_internal_%1
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
call x264_pixel_satd_8x8_internal_%1
- SATD_END_SSE2
+ SATD_END_SSE2 %1, m6
cglobal x264_pixel_satd_8x8_%1, 4,6,8
- SATD_START_SSE2
+ SATD_START_SSE2 %1, m6, m7
call x264_pixel_satd_8x8_internal_%1
- SATD_END_SSE2
+ SATD_END_SSE2 %1, m6
cglobal x264_pixel_satd_8x4_%1, 4,6,8
- SATD_START_SSE2
+ SATD_START_SSE2 %1, m6, m7
call x264_pixel_satd_8x4_internal_%1
- SATD_END_SSE2
-
-cglobal x264_pixel_satd_4x8_%1, 4,6,8
- INIT_XMM
- LOAD_MM_PERMUTATION satd_4x8_internal
- %define movh movd
- SATD_START_SSE2
- LOAD_DIFF m0, m7, m6, [r0], [r2]
- LOAD_DIFF m1, m7, m6, [r0+r1], [r2+r3]
- LOAD_DIFF m2, m7, m6, [r0+2*r1], [r2+2*r3]
- LOAD_DIFF m3, m7, m6, [r0+r4], [r2+r5]
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- LOAD_DIFF m4, m7, m6, [r0], [r2]
- LOAD_DIFF m5, m7, m6, [r0+r1], [r2+r3]
- punpcklqdq m0, m4
- punpcklqdq m1, m5
- LOAD_DIFF m4, m7, m6, [r0+2*r1], [r2+2*r3]
- LOAD_DIFF m5, m7, m6, [r0+r4], [r2+r5]
- punpcklqdq m2, m4
- punpcklqdq m3, m5
- %define movh movq
- call x264_pixel_satd_4x8_internal_%1
- SATD_END_SSE2
+ SATD_END_SSE2 %1, m6
+%endmacro ; SATDS_SSE2
+%macro SA8D 1
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
cglobal x264_pixel_sa8d_8x8_internal_%1
lea r10, [r0+4*r1]
lea r11, [r2+4*r3]
- LOAD_DIFF_8x4P m0, m1, m2, m3, m8, m9, r0, r2
- LOAD_DIFF_8x4P m4, m5, m6, m7, m8, m9, r10, r11
-
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
-
- ABS4 m0, m1, m2, m3, m8, m9
- ABS4 m4, m5, m6, m7, m8, m9
- paddusw m0, m1
- paddusw m2, m3
- paddusw m4, m5
- paddusw m6, m7
- paddusw m0, m2
- paddusw m4, m6
- pavgw m0, m4
+ LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
+ LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
+%ifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+%else ; non-sse2
+ HADAMARD4_V m0, m1, m2, m8, m6
+ HADAMARD4_V m4, m5, m3, m9, m6
+ SUMSUB_BADC m0, m4, m1, m5, m6
+ HADAMARD 2, sumsub, 0, 4, 6, 11
+ HADAMARD 2, sumsub, 1, 5, 6, 11
+ SUMSUB_BADC m2, m3, m8, m9, m6
+ HADAMARD 2, sumsub, 2, 3, 6, 11
+ HADAMARD 2, sumsub, 8, 9, 6, 11
+ HADAMARD 1, amax, 0, 4, 6, 11
+ HADAMARD 1, amax, 1, 5, 6, 4
+ HADAMARD 1, amax, 2, 3, 6, 4
+ HADAMARD 1, amax, 8, 9, 6, 4
+%endif
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m8
+ SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1
ret
-cglobal x264_pixel_sa8d_8x8_%1, 4,6,10
+cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
lea r4, [3*r1]
lea r5, [3*r3]
+%ifnidn %1, sse2
+ mova m7, [hmul_8p GLOBAL]
+%endif
call x264_pixel_sa8d_8x8_internal_%1
HADDW m0, m1
movd eax, m0
shr eax, 1
RET
-cglobal x264_pixel_sa8d_16x16_%1, 4,6,11
+cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
lea r4, [3*r1]
lea r5, [3*r3]
+%ifnidn %1, sse2
+ mova m7, [hmul_8p GLOBAL]
+%endif
call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
- add r0, 8
add r2, 8
+ add r0, 8
mova m10, m0
call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
- lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
+ lea r0, [r0+8*r1]
paddusw m10, m0
call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
- sub r0, 8
sub r2, 8
+ sub r0, 8
paddusw m10, m0
call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
paddusw m0, m10
RET
%else ; ARCH_X86_32
+%ifnidn %1, mmxext
cglobal x264_pixel_sa8d_8x8_internal_%1
- LOAD_DIFF_8x4P m0, m1, m2, m3, m6, m7
- movdqa [esp+4], m2
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- LOAD_DIFF_8x4P m4, m5, m6, m7, m2, m2
- movdqa m2, [esp+4]
-
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [esp+4], [esp+20]
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
-
-%ifidn %1, sse2
- movdqa [esp+4], m4
- movdqa [esp+20], m2
-%endif
- ABS2 m6, m3, m4, m2
- ABS2 m0, m7, m4, m2
- paddusw m0, m6
- paddusw m7, m3
+ %define spill0 [esp+4]
+ %define spill1 [esp+20]
+ %define spill2 [esp+36]
%ifidn %1, sse2
- movdqa m4, [esp+4]
- movdqa m2, [esp+20]
-%endif
- ABS2 m5, m1, m6, m3
- ABS2 m4, m2, m6, m3
- paddusw m5, m1
- paddusw m4, m2
- paddusw m0, m7
- paddusw m5, m4
- pavgw m0, m5
+ LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
+ HADAMARD4_2D 0, 1, 2, 3, 4
+ movdqa spill0, m3
+ LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
+ HADAMARD4_2D 4, 5, 6, 7, 3
+ HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
+ movdqa m3, spill0
+ paddw m0, m1
+ HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
+%else ; non-sse2
+ mova m7, [hmul_8p GLOBAL]
+ LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
+ ; could do first HADAMARD4_V here to save spilling later
+ ; surprisingly, not a win on conroe or even p4
+ mova spill0, m2
+ mova spill1, m3
+ mova spill2, m1
+ SWAP 1, 7
+ LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
+ HADAMARD4_V m4, m5, m6, m7, m3
+ mova m1, spill2
+ mova m2, spill0
+ mova m3, spill1
+ mova spill0, m6
+ mova spill1, m7
+ HADAMARD4_V m0, m1, m2, m3, m7
+ SUMSUB_BADC m0, m4, m1, m5, m7
+ HADAMARD 2, sumsub, 0, 4, 7, 6
+ HADAMARD 2, sumsub, 1, 5, 7, 6
+ HADAMARD 1, amax, 0, 4, 7, 6
+ HADAMARD 1, amax, 1, 5, 7, 6
+ mova m6, spill0
+ mova m7, spill1
+ paddw m0, m1
+ SUMSUB_BADC m2, m6, m3, m7, m4
+ HADAMARD 2, sumsub, 2, 6, 4, 5
+ HADAMARD 2, sumsub, 3, 7, 4, 5
+ HADAMARD 1, amax, 2, 6, 4, 5
+ HADAMARD 1, amax, 3, 7, 4, 5
+%endif ; sse2/non-sse2
+ paddw m0, m2
+ paddw m0, m3
ret
-%endif ; ARCH
-%endmacro ; SATDS_SSE2
+%endif ; ifndef mmxext
-%macro SA8D_16x16_32 1
-%ifndef ARCH_X86_64
cglobal x264_pixel_sa8d_8x8_%1, 4,7
mov r6, esp
and esp, ~15
- sub esp, 32
+ sub esp, 48
lea r4, [3*r1]
lea r5, [3*r3]
call x264_pixel_sa8d_8x8_internal_%1
cglobal x264_pixel_sa8d_16x16_%1, 4,7
mov r6, esp
and esp, ~15
- sub esp, 48
+ sub esp, 64
lea r4, [3*r1]
lea r5, [3*r3]
call x264_pixel_sa8d_8x8_internal_%1
+%ifidn %1, mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- mova [esp+32], m0
+%endif
+ mova [esp+48], m0
call x264_pixel_sa8d_8x8_internal_%1
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8
add r2, 8
- paddusw m0, [esp+32]
- mova [esp+32], m0
+ paddusw m0, [esp+48]
+ mova [esp+48], m0
call x264_pixel_sa8d_8x8_internal_%1
+%ifidn %1, mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
+%endif
%if mmsize == 16
- paddusw m0, [esp+32]
+ paddusw m0, [esp+48]
%endif
- mova [esp+48-mmsize], m0
+ mova [esp+64-mmsize], m0
call x264_pixel_sa8d_8x8_internal_%1
- paddusw m0, [esp+48-mmsize]
+ paddusw m0, [esp+64-mmsize]
%if mmsize == 16
HADDUW m0, m1
%else
- mova m2, [esp+32]
+ mova m2, [esp+48]
pxor m7, m7
mova m1, m0
mova m3, m2
mov esp, r6
RET
%endif ; !ARCH_X86_64
-%endmacro ; SA8D_16x16_32
-
-
+%endmacro ; SA8D
;=============================================================================
; INTRA SATD
punpcklbw m5, m8
punpcklbw m6, m8
punpcklbw m7, m8
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
- HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
; dc
movzx r0d, word [r1+0]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
- HADAMARD4_1D m0, m1, m2, m3
- TRANSPOSE4x4W 0, 1, 2, 3, 4
- HADAMARD4_1D m0, m1, m2, m3
+ HADAMARD4_2D 0, 1, 2, 3, 4
SAVE_MM_PERMUTATION load_hadamard
ret
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
- HADAMARD4_1D m0, m1, m2, m3
- TRANSPOSE4x4W 0, 1, 2, 3, 4
- HADAMARD4_1D m0, m1, m2, m3
+ HADAMARD4_2D 0, 1, 2, 3, 4
mova [r3], m0
mova [r3+8], m1
mova [r3+16], m2
SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
ret
-cglobal x264_hadamard_ac_2x2_mmxext
+cglobal x264_hadamard_ac_2x2max_mmxext
mova m0, [r3+0x00]
mova m1, [r3+0x20]
mova m2, [r3+0x40]
mova m3, [r3+0x60]
- HADAMARD4_1D m0, m1, m2, m3
- ABS2 m0, m1, m4, m5
- ABS2 m2, m3, m4, m5
- SAVE_MM_PERMUTATION x264_hadamard_ac_2x2_mmxext
+ sub r3, 8
+ SUMSUB_BADC m0, m1, m2, m3, m4
+ ABS4 m0, m2, m1, m3, m4, m5
+ HADAMARD 0, max, 0, 2, 4, 5
+ HADAMARD 0, max, 1, 3, 4, 5
+ paddw m7, m0
+ paddw m7, m1
+ SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext
ret
cglobal x264_hadamard_ac_8x8_mmxext
paddw m5, m0
call x264_hadamard_ac_4x4_mmxext
paddw m5, m0
- sub r3, 64
+ sub r3, 40
mova [rsp+gprsize+8], m5 ; save satd
- call x264_hadamard_ac_2x2_mmxext
- add r3, 8
- pand m6, m0
- mova m7, m1
- paddw m6, m2
- paddw m7, m3
-%rep 2
- call x264_hadamard_ac_2x2_mmxext
- add r3, 8
- paddw m6, m0
- paddw m7, m1
- paddw m6, m2
- paddw m7, m3
+%rep 3
+ call x264_hadamard_ac_2x2max_mmxext
%endrep
- call x264_hadamard_ac_2x2_mmxext
- sub r3, 24
- paddw m6, m0
+ mova m0, [r3+0x00]
+ mova m1, [r3+0x20]
+ mova m2, [r3+0x40]
+ mova m3, [r3+0x60]
+ SUMSUB_BADC m0, m1, m2, m3, m4
+ HADAMARD 0, sumsub, 0, 2, 4, 5
+ ABS4 m1, m3, m0, m2, m4, m5
+ HADAMARD 0, max, 1, 3, 4, 5
+ pand m6, m0
paddw m7, m1
paddw m6, m2
- paddw m7, m3
+ paddw m7, m7
paddw m6, m7
mova [rsp+gprsize], m6 ; save sa8d
SWAP m0, m6
HADAMARD_AC_WXH_MMX 16, 8
HADAMARD_AC_WXH_MMX 8, 8
+%macro LOAD_INC_8x4W_SSE2 5
+ movh m%1, [r0]
+ movh m%2, [r0+r1]
+ movh m%3, [r0+r1*2]
+ movh m%4, [r0+r2]
+%ifidn %1, 0
+ lea r0, [r0+r1*4]
+%endif
+ punpcklbw m%1, m%5
+ punpcklbw m%2, m%5
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+%endmacro
+
+%macro LOAD_INC_8x4W_SSSE3 5
+ LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
+%ifidn %1, 0
+ lea r0, [r0+r1*4]
+%endif
+ HSUMSUB %1, %2, %3, %4, %5
+%endmacro
+
%macro HADAMARD_AC_SSE2 1
INIT_XMM
; in: r0=pix, r1=stride, r2=stride*3
%define spill1 [rsp+gprsize+16]
%define spill2 [rsp+gprsize+32]
%endif
+%ifnidn %1, sse2
+ ;LOAD_INC loads sumsubs
+ mova m7, [hmul_8p GLOBAL]
+%else
+ ;LOAD_INC only unpacks to words
pxor m7, m7
- movh m0, [r0]
- movh m1, [r0+r1]
- movh m2, [r0+r1*2]
- movh m3, [r0+r2]
- lea r0, [r0+r1*4]
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- HADAMARD4_1D m0, m1, m2, m3
- mova spill0, m3
- SWAP m3, m7
- movh m4, [r0]
- movh m5, [r0+r1]
- movh m6, [r0+r1*2]
- movh m7, [r0+r2]
- punpcklbw m4, m3
- punpcklbw m5, m3
- punpcklbw m6, m3
- punpcklbw m7, m3
- HADAMARD4_1D m4, m5, m6, m7
- mova m3, spill0
-%ifdef ARCH_X86_64
- TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
+%endif
+ LOAD_INC_8x4W 0, 1, 2, 3, 7
+%ifidn %1, sse2
+ HADAMARD4_2D_SSE 0, 1, 2, 3, 4
%else
- TRANSPOSE8x8W 0,1,2,3,4,5,6,7,spill0,spill1
+ HADAMARD4_V m0, m1, m2, m3, m4
%endif
- HADAMARD4_1D m0, m1, m2, m3
- HADAMARD4_1D m4, m5, m6, m7
mova spill0, m1
+ SWAP 1, 7
+ LOAD_INC_8x4W 4, 5, 6, 7, 1
+%ifidn %1, sse2
+ HADAMARD4_2D_SSE 4, 5, 6, 7, 1
+%else
+ HADAMARD4_V m4, m5, m6, m7, m1
+%endif
+
+%ifnidn %1, sse2
+ mova m1, spill0
+ mova spill0, m6
+ mova spill1, m7
+ HADAMARD 1, sumsub, 0, 1, 6, 7
+ HADAMARD 1, sumsub, 2, 3, 6, 7
+ mova m6, spill0
+ mova m7, spill1
+ mova spill0, m1
+ mova spill1, m0
+ HADAMARD 1, sumsub, 4, 5, 1, 0
+ HADAMARD 1, sumsub, 6, 7, 1, 0
+ mova m0, spill1
+%endif
+
mova spill1, m2
mova spill2, m3
ABS_MOV m1, m0
ABS_MOV m2, m4
ABS_MOV m3, m5
paddw m1, m2
- SUMSUB_BA m0, m4
+ SUMSUB_BA m0, m4; m2
+%ifnidn %1, sse2
+ pand m1, [mask_ac4b GLOBAL]
+%else
pand m1, [mask_ac4 GLOBAL]
+%endif
ABS_MOV m2, spill0
paddw m1, m3
ABS_MOV m3, spill1
paddw m2, spill1
psubw m5, spill0
paddw m1, spill0
- mova spill1, m7
- SBUTTERFLY qdq, 0, 4, 7
- SBUTTERFLY qdq, 1, 5, 7
- SBUTTERFLY qdq, 2, 6, 7
- SUMSUB_BADC m0, m4, m1, m5
- SUMSUB_BA m2, m6
- ABS1 m0, m7
- ABS1 m1, m7
- pand m0, [mask_ac8 GLOBAL]
- ABS1 m2, m7
+%ifnidn %1, sse2
+ mova spill1, m4
+ HADAMARD 2, amax, 3, 7, 4
+ HADAMARD 2, amax, 2, 6, 7, 4
+ mova m4, spill1
+ HADAMARD 2, amax, 1, 5, 6, 7
+ HADAMARD 2, sumsub, 0, 4, 5, 6
+%else
+ mova spill1, m4
+ HADAMARD 4, amax, 3, 7, 4
+ HADAMARD 4, amax, 2, 6, 7, 4
+ mova m4, spill1
+ HADAMARD 4, amax, 1, 5, 6, 7
+ HADAMARD 4, sumsub, 0, 4, 5, 6
+%endif
+ paddw m2, m3
+ paddw m2, m1
+ paddw m2, m2
ABS1 m4, m7
- ABS1 m5, m7
- ABS1 m6, m7
- mova m7, spill1
- paddw m0, m4
- SBUTTERFLY qdq, 3, 7, 4
- SUMSUB_BA m3, m7
- paddw m1, m5
- ABS1 m3, m4
- ABS1 m7, m4
- paddw m2, m6
- paddw m3, m7
- paddw m0, m1
- paddw m2, m3
- paddw m0, m2
+ pand m0, [mask_ac8 GLOBAL]
+ ABS1 m0, m7
+ paddw m2, m4
+ paddw m0, m2
mova [rsp+gprsize+16], m0 ; save sa8d
SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
ret
%ifndef ARCH_X86_64
cextern x264_pixel_sa8d_8x8_internal_mmxext
-SA8D_16x16_32 mmxext
+SA8D mmxext
%endif
+%define TRANS TRANS_SSE2
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
+%define DIFFOP DIFF_UNPACK_SSE2
+%define JDUP JDUP_SSE2
+%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
+%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
+%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
+%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
+%define movdqu movups
+%define punpcklqdq movlhps
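+; Size optimization: pre-Nehalem cores treat these FP moves/shuffles the same as their
+; integer forms here, and the encodings are shorter. They are %undef'd again before the
+; sse4 (Nehalem-targeted) builds, presumably because of the int/FP domain-crossing
+; penalty ("nehalem doesn't like movaps").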
+INIT_XMM
+SA8D sse2
SATDS_SSE2 sse2
-SA8D_16x16_32 sse2
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
HADAMARD_AC_SSE2 sse2
+
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
%define ABS_MOV ABS_MOV_SSSE3
-SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
+%define DIFFOP DIFF_SUMSUB_SSSE3
+%define JDUP JDUP_CONROE
+%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
+%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
+%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
+%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
SATDS_SSE2 ssse3
-SA8D_16x16_32 ssse3
+SA8D ssse3
+HADAMARD_AC_SSE2 ssse3
+%undef movdqa ; nehalem doesn't like movaps
+%undef movdqu ; movups
+%undef punpcklqdq ; or movlhps
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
-HADAMARD_AC_SSE2 ssse3
-SATDS_SSE2 ssse3_phadd
-
+%define TRANS TRANS_SSE4
+%define JDUP JDUP_PENRYN
+%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
+SATDS_SSE2 sse4
+SA8D sse4
+HADAMARD_AC_SSE2 sse4
;=============================================================================
; SSIM
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
DECL_X1( ssd, mmx )
+DECL_X1( ssd, sse2slow )
DECL_X1( ssd, sse2 )
-DECL_X1( ssd, sse4 )
+DECL_X1( ssd, ssse3 )
DECL_X1( satd, mmxext )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
-DECL_X1( satd, ssse3_phadd )
+DECL_X1( satd, sse4 )
DECL_X1( sa8d, mmxext )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
+DECL_X1( sa8d, sse4 )
DECL_X1( sad, cache32_mmxext );
DECL_X1( sad, cache64_mmxext );
DECL_X1( sad, cache64_sse2 );
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( uint8_t *pix, int i_stride ))
+
void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
;*****************************************************************************
-;* x86inc.asm
+;* x86util.asm
;*****************************************************************************
-;* Copyright (C) 2008 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (C) 2008 x264 project
+;*
+;* Authors: Holger Lubitz <holger@lubitz.org>
+;* Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
SBUTTERFLY qdq, %4, %8, %2
SWAP %2, %5
SWAP %4, %7
-%if 0<11
+%if %0<11
movdqa m%5, %10
%endif
%endif
palignr %1, %2, %3
%endmacro
-%macro SUMSUB_BA 2
+%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
+%ifnum %5
+ mova m%1, m%5
+ mova m%3, m%5
+%else
+ mova m%1, %5
+ mova m%3, m%1
+%endif
+ pand m%1, m%2 ; dst .. y6 .. y4
+ pand m%3, m%4 ; src .. y6 .. y4
+ psrlw m%2, 8 ; dst .. y7 .. y5
+ psrlw m%4, 8 ; src .. y7 .. y5
+%endmacro
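+; DEINTB splits dst/src into even and odd byte lanes: assuming the mask argument holds
+; 0x00ff in every word, %1/%3 end up with the even bytes (the y4/y6 of the notes above)
+; and %2/%4 are shifted down to the odd bytes (y5/y7).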
+
+%macro SUMSUB_BA 2-3
+%if %0==2
paddw %1, %2
paddw %2, %2
psubw %2, %1
+%else
+ mova %3, %1
+ paddw %1, %2
+ psubw %2, %3
+%endif
%endmacro
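+; Both forms of SUMSUB_BA end with %1 = a+b and %2 = b-a: the 2-operand form gets there
+; without a temporary (2b minus a+b), while the new 3-operand form uses %3 as scratch,
+; which is the tmpreg that HADAMARD passes in.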
-%macro SUMSUB_BADC 4
+%macro SUMSUB_BADC 4-5
+%if %0==5
+ SUMSUB_BA %1, %2, %5
+ SUMSUB_BA %3, %4, %5
+%else
paddw %1, %2
paddw %3, %4
paddw %2, %2
paddw %4, %4
psubw %2, %1
psubw %4, %3
+%endif
%endmacro
-%macro HADAMARD8_1D 8
- SUMSUB_BADC %1, %5, %2, %6
- SUMSUB_BADC %3, %7, %4, %8
+%macro HADAMARD4_V 4+
+ SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %1, %3, %2, %4
- SUMSUB_BADC %5, %7, %6, %8
+%endmacro
+
+%macro HADAMARD8_V 8+
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %5, %6, %7, %8
+ SUMSUB_BADC %1, %3, %2, %4
+ SUMSUB_BADC %5, %7, %6, %8
+ SUMSUB_BADC %1, %5, %2, %6
+ SUMSUB_BADC %3, %7, %4, %8
+%endmacro
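+; HADAMARD4_V/HADAMARD8_V are vertical-only butterfly passes (no transposes); the
+; matching horizontal passes are handled separately by the HADAMARD macro below.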
+
+%macro HADAMARD 5-6
+; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
+; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
+; %3/%4: regs
+; %5(%6): tmpregs
+%if %1!=0 ; have to reorder stuff for horizontal op
+ %ifidn %2, sumsub
+ %define ORDER ord
+ ; sumsub needs order because a-b != b-a unless a=b
+ %else
+ %define ORDER unord
+ ; if we just max, order doesn't matter (allows pblendw+or in sse4)
+ %endif
+ %if %1==1
+ TRANS d, ORDER, %3, %4, %5, %6
+ %elif %1==2
+ %if mmsize==8
+ SBUTTERFLY dq, %3, %4, %5
+ %else
+ TRANS q, ORDER, %3, %4, %5, %6
+ %endif
+ %elif %1==4
+ SBUTTERFLY qdq, %3, %4, %5
+ %endif
+%endif
+%ifidn %2, sumsub
+ SUMSUB_BA m%3, m%4, m%5
+%else
+ %ifidn %2, amax
+ %if %0==6
+ ABS2 m%3, m%4, m%5, m%6
+ %else
+ ABS1 m%3, m%5
+ ABS1 m%4, m%5
+ %endif
+ %endif
+ pmaxsw m%3, m%4
+%endif
+%endmacro
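+; Usage sketch: HADAMARD 1, sumsub, 0, 1, 6, 7 (as in hadamard_ac above) pairs up words
+; at distance 1 between m0 and m1 via TRANS and then butterflies them, with m6/m7 as
+; scratch; with amax the butterfly is replaced by max(|a|,|b|), which is how the
+; SATD-style kernels accumulate absolute values.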
+
+
+%macro HADAMARD2_2D 6-7 sumsub
+ HADAMARD 0, sumsub, %1, %2, %5
+ HADAMARD 0, sumsub, %3, %4, %5
+ SBUTTERFLY %6, %1, %2, %5
+%ifnum %7
+ HADAMARD 0, amax, %1, %2, %5, %7
+%else
+ HADAMARD 0, %7, %1, %2, %5
+%endif
+ SBUTTERFLY %6, %3, %4, %5
+%ifnum %7
+ HADAMARD 0, amax, %3, %4, %5, %7
+%else
+ HADAMARD 0, %7, %3, %4, %5
+%endif
+%endmacro
+
+%macro HADAMARD4_2D 5-6 sumsub
+ HADAMARD2_2D %1, %2, %3, %4, %5, wd
+ HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
+ SWAP %2, %3
+%endmacro
+
+%macro HADAMARD4_2D_SSE 5-6 sumsub
+ HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
+ HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
+ SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0
+ SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2
+ HADAMARD2_2D %1, %3, %2, %4, %5, dq
+ SBUTTERFLY qdq, %1, %2, %5
+ HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1
+ SBUTTERFLY qdq, %3, %4, %5
+ HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3
+%endmacro
+
+%macro HADAMARD8_2D 9-10 sumsub
+ HADAMARD2_2D %1, %2, %3, %4, %9, wd
+ HADAMARD2_2D %5, %6, %7, %8, %9, wd
+ HADAMARD2_2D %1, %3, %2, %4, %9, dq
+ HADAMARD2_2D %5, %7, %6, %8, %9, dq
+ HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
+ HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
+%ifnidn %10, amax
+ SWAP %2, %5
+ SWAP %4, %7
+%endif
%endmacro
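+; The *_2D macros build the full 2-D transform from vertical butterflies plus
+; SBUTTERFLY/TRANS steps at word, dword and qword granularity; in amax mode the final
+; SWAPs are skipped, presumably because only absolute maxima are kept and coefficient
+; order no longer matters (same reasoning as the unord note in HADAMARD).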
%macro SUMSUB2_AB 3
psubw %3, %2
%endmacro
+%macro SUMSUB2_BA 3
+ mova m%3, m%1
+ paddw m%1, m%2
+ paddw m%1, m%2
+ psubw m%2, m%3
+ psubw m%2, m%3
+%endmacro
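+; Traced through the adds/subs: SUMSUB2_BA leaves %1 = a+2b and %2 = b-2a (a/b being
+; the incoming m%1/m%2), presumably the mirrored counterpart of SUMSUB2_AB.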
+
%macro SUMSUBD2_AB 4
mova %4, %1
mova %3, %2
psraw %2, 1
- psraw %4, 1
- paddw %1, %2
- psubw %4, %3
+ psraw %1, 1
+ paddw %2, %4
+ psubw %1, %3
+%endmacro
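+; After this change the results land in %1/%2 rather than %4/%1: %2 = a + (b>>1) and
+; %1 = (a>>1) - b, presumably to match the operand order the new IDCT4_1D below expects.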
+
+%macro DCT4_1D 5
+%ifnum %5
+ SUMSUB_BADC m%4, m%1, m%3, m%2; m%5
+ SUMSUB_BA m%3, m%4, m%5
+ SUMSUB2_AB m%1, m%2, m%5
+ SWAP %1, %3, %4, %5, %2
+%else
+ SUMSUB_BADC m%4, m%1, m%3, m%2
+ SUMSUB_BA m%3, m%4
+ mova [%5], m%2
+ SUMSUB2_AB m%1, [%5], m%2
+ SWAP %1, %3, %4, %2
+%endif
+%endmacro
+
+%macro IDCT4_1D 5-6
+%ifnum %5
+ SUMSUBD2_AB m%2, m%4, m%6, m%5
+ SUMSUB_BA m%3, m%1, m%6
+ SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
+%else
+ SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
+ SUMSUB_BA m%3, m%1
+ SUMSUB_BADC m%4, m%3, m%2, m%1
+%endif
+ SWAP %1, %4, %3
%endmacro
+
%macro LOAD_DIFF 5
%ifidn %3, none
movh %1, %4
%endif
%endmacro
-%macro LOAD_DIFF_8x4P 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer
- LOAD_DIFF %1, %5, none, [%7], [%8]
- LOAD_DIFF %2, %6, none, [%7+r1], [%8+r3]
- LOAD_DIFF %3, %5, none, [%7+2*r1], [%8+2*r3]
- LOAD_DIFF %4, %6, none, [%7+r4], [%8+r5]
+%macro LOAD_DIFF8x4_SSE2 8
+ LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
+ LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
+ LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
+ LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
%endmacro
-%macro STORE_DIFF 4
+%macro LOAD_DIFF8x4_SSSE3 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
+ movh m%2, [%8+%1*FDEC_STRIDE]
+ movh m%1, [%7+%1*FENC_STRIDE]
+ punpcklbw m%1, m%2
+ movh m%3, [%8+%2*FDEC_STRIDE]
+ movh m%2, [%7+%2*FENC_STRIDE]
+ punpcklbw m%2, m%3
+ movh m%4, [%8+%3*FDEC_STRIDE]
+ movh m%3, [%7+%3*FENC_STRIDE]
+ punpcklbw m%3, m%4
+ movh m%5, [%8+%4*FDEC_STRIDE]
+ movh m%4, [%7+%4*FENC_STRIDE]
+ punpcklbw m%4, m%5
+ pmaddubsw m%1, m%6
+ pmaddubsw m%2, m%6
+ pmaddubsw m%3, m%6
+ pmaddubsw m%4, m%6
+%endmacro
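+; The SSSE3 loader interleaves each enc/dec row and multiplies with m%6 via pmaddubsw;
+; assuming that register holds alternating +1/-1 bytes, every word lane becomes enc-dec
+; directly, so the separate zero-extension of the SSE2 version is not needed.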
+
+%macro STORE_DCT 6
+ movq [%5+%6+ 0], m%1
+ movq [%5+%6+ 8], m%2
+ movq [%5+%6+16], m%3
+ movq [%5+%6+24], m%4
+ movhps [%5+%6+32], m%1
+ movhps [%5+%6+40], m%2
+ movhps [%5+%6+48], m%3
+ movhps [%5+%6+56], m%4
+%endmacro
+
+%macro STORE_IDCT 4
+ movhps [r0-4*FDEC_STRIDE], %1
+ movh [r0-3*FDEC_STRIDE], %1
+ movhps [r0-2*FDEC_STRIDE], %2
+ movh [r0-1*FDEC_STRIDE], %2
+ movhps [r0+0*FDEC_STRIDE], %3
+ movh [r0+1*FDEC_STRIDE], %3
+ movhps [r0+2*FDEC_STRIDE], %4
+ movh [r0+3*FDEC_STRIDE], %4
+%endmacro
+
+%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
+ LOAD_DIFF m%1, m%5, m%7, [%8], [%9]
+ LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3]
+ LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
+ LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5]
+%if %10
+ lea %8, [%8+4*r1]
+ lea %9, [%9+4*r3]
+%endif
+%endmacro
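+; The extra trailing argument (%10) makes the loader advance both source pointers by
+; four rows itself when nonzero, so callers no longer need separate lea's.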
+
+%macro DIFFx2 6-7
+ movh %3, %5
+ punpcklbw %3, %4
psraw %1, 6
+ paddsw %1, %3
+ movh %3, %6
+ punpcklbw %3, %4
+ psraw %2, 6
+ paddsw %2, %3
+ packuswb %2, %1
+%endmacro
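+; DIFFx2 adds two rows of residual (arithmetically shifted down by 6) to their
+; predicted pixels and packs both rows, saturated, into %2; %4 is assumed to be a
+; zeroed register for the byte-to-word unpack.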
+
+%macro STORE_DIFF 4
movh %2, %4
punpcklbw %2, %3
+ psraw %1, 6
paddsw %1, %2
packuswb %1, %1
movh %4, %1
%endmacro
-
TEST_PIXEL( sad_aligned, 1 );
TEST_PIXEL( ssd, 1 );
TEST_PIXEL( satd, 0 );
- TEST_PIXEL( sa8d, 0 );
+ TEST_PIXEL( sa8d, 1 );
#define TEST_PIXEL_X( N ) \
for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \