void *x264_malloc( int i_size )
{
uint8_t *align_buf = NULL;
-#if SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
- /* Mac OS X and Win x64 always returns 16 byte aligned memory */
- align_buf = malloc( i_size );
-#elif HAVE_MALLOC_H
- align_buf = memalign( 16, i_size );
+#if HAVE_MALLOC_H
+ align_buf = memalign( NATIVE_ALIGN, i_size );
#else
- uint8_t *buf = malloc( i_size + 15 + sizeof(void **) );
+ uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) );
if( buf )
{
- align_buf = buf + 15 + sizeof(void **);
- align_buf -= (intptr_t) align_buf & 15;
+ align_buf = buf + (NATIVE_ALIGN-1) + sizeof(void **);
+ align_buf -= (intptr_t) align_buf & (NATIVE_ALIGN-1);
*( (void **) ( align_buf - sizeof(void **) ) ) = buf;
}
#endif
{
if( p )
{
-#if HAVE_MALLOC_H || SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
+#if HAVE_MALLOC_H
free( p );
#else
free( *( ( ( void **) p ) - 1 ) );
/* Current MB DCT coeffs */
struct
{
- ALIGNED_16( dctcoef luma16x16_dc[3][16] );
+ ALIGNED_N( dctcoef luma16x16_dc[3][16] );
ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
- ALIGNED_16( dctcoef luma8x8[12][64] );
- ALIGNED_16( dctcoef luma4x4[16*3][16] );
+ ALIGNED_N( dctcoef luma8x8[12][64] );
+ ALIGNED_N( dctcoef luma4x4[16*3][16] );
} dct;
/* MB table and cache for current frame/mb */
dctf->sub8x8_dct = x264_sub8x8_dct_xop;
dctf->sub16x16_dct = x264_sub16x16_dct_xop;
}
+
+ if( cpu&X264_CPU_AVX2 )
+ {
+ dctf->add8x8_idct = x264_add8x8_idct_avx2;
+ dctf->add16x16_idct = x264_add16x16_idct_avx2;
+ dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
+ dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
+#if ARCH_X86_64
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
+#endif
+ }
#endif //HAVE_MMX
#if HAVE_ALTIVEC
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
}
+
+ if( cpu&X264_CPU_AVX2 )
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
+ }
#endif // HIGH_BIT_DEPTH
#endif
}
int i_mb_count = h->mb.i_mb_count;
int i_stride, i_width, i_lines, luma_plane_count;
int i_padv = PADV << PARAM_INTERLACED;
- int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
+ int align = 16;
+#if ARCH_X86 || ARCH_X86_64
+ if( h->param.cpu&X264_CPU_CACHELINE_64 )
+ align = 64;
+ else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 )
+ align = 32;
+#endif
#if ARCH_PPC
int disalign = 1<<9;
#else
int scratch_size = 0;
if( !b_lookahead )
{
- int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+ int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t);
int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#endif
+#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
+/* For AVX2 */
+#if ARCH_X86 || ARCH_X86_64
+#define NATIVE_ALIGN 32
+#define ALIGNED_N ALIGNED_32
+#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32
+#else
+#define NATIVE_ALIGN 16
+#define ALIGNED_N ALIGNED_16
+#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
+#endif
+
#define UNINIT(x) x=x
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
INIT6( satd_x3, _sse2 );
INIT6( satd_x4, _sse2 );
- if( !(cpu&X264_CPU_STACK_MOD4) )
- {
- INIT4( hadamard_ac, _sse2 );
- }
+ INIT4( hadamard_ac, _sse2 );
INIT_ADS( _sse2 );
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
if( cpu&X264_CPU_SSSE3 )
{
+ INIT4( hadamard_ac, _ssse3 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
- INIT4( hadamard_ac, _ssse3 );
pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3;
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3;
pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3;
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom;
INIT6( satd_x3, _ssse3_atom );
INIT6( satd_x4, _ssse3_atom );
- if( !(cpu&X264_CPU_STACK_MOD4) )
- {
- INIT4( hadamard_ac, _ssse3_atom );
- }
+ INIT4( hadamard_ac, _ssse3_atom );
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom;
#endif
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
+ else
+ {
+ pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_ssse3;
+ pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_ssse3;
+ pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_ssse3;
+ }
if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
INIT8( satd, _sse4 );
INIT7( satd_x3, _sse4 );
INIT7( satd_x4, _sse4 );
+ INIT4( hadamard_ac, _sse4 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
- INIT4( hadamard_ac, _sse4 );
pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_sse4;
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4;
pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_sse4;
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
INIT_ADS( _avx );
+ INIT4( hadamard_ac, _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
- INIT4( hadamard_ac, _avx );
pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_avx;
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx;
pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx;
INIT7( satd, _xop );
INIT7( satd_x3, _xop );
INIT7( satd_x4, _xop );
+ INIT4( hadamard_ac, _xop );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
- INIT4( hadamard_ac, _xop );
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop;
}
INIT5( ssd, _xop );
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
+#endif
+ }
+
+ if( cpu&X264_CPU_AVX2 )
+ {
+ INIT2( ssd, _avx2 );
+ INIT2( sad_x3, _avx2 );
+ INIT2( sad_x4, _avx2 );
+ INIT4( satd, _avx2 );
+ INIT2( hadamard_ac, _avx2 );
+ INIT_ADS( _avx2 );
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2;
+ pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx2;
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
#endif
}
#endif //HAVE_MMX
pf->denoise_dct = x264_denoise_dct_mmx;
pf->decimate_score15 = x264_decimate_score15_mmx2;
pf->decimate_score16 = x264_decimate_score16_mmx2;
- if( cpu&X264_CPU_SLOW_CTZ )
- {
- pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
- pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
- }
pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->decimate_score64 = x264_decimate_score64_sse2;
- if( cpu&X264_CPU_SLOW_CTZ )
- {
- pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
- pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
- }
pf->coeff_last8 = x264_coeff_last8_sse2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
- if( cpu&X264_CPU_SLOW_CTZ )
- {
- pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
- pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
- }
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
}
pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
pf->decimate_score15 = x264_decimate_score15_mmx2;
pf->decimate_score16 = x264_decimate_score16_mmx2;
- if( cpu&X264_CPU_SLOW_CTZ )
- {
- pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
- pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
- }
pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->decimate_score64 = x264_decimate_score64_sse2;
- if( cpu&X264_CPU_SLOW_CTZ )
- {
- pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
- pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
- }
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
- if( cpu&X264_CPU_SLOW_CTZ )
- {
- pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
- pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
- }
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
}
pf->dequant_8x8 = x264_dequant_8x8_xop;
}
}
+
+ if( cpu&X264_CPU_AVX2 )
+ {
+ pf->quant_4x4 = x264_quant_4x4_avx2;
+ pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
+ pf->quant_8x8 = x264_quant_8x8_avx2;
+ pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
+ if( cpu&X264_CPU_LZCNT )
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
+ pf->dequant_4x4 = x264_dequant_4x4_avx2;
+ pf->dequant_8x8 = x264_dequant_8x8_avx2;
+ if( h->param.i_cqm_preset == X264_CQM_FLAT )
+ {
+ pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2;
+ pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2;
+ }
+ pf->decimate_score64 = x264_decimate_score64_avx2;
+ pf->denoise_dct = x264_denoise_dct_avx2;
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
%include "x86inc.asm"
-SECTION_RODATA
+SECTION_RODATA 32
+
+const pb_1, times 32 db 1
+const hsub_mul, times 16 db 1, -1
+const pw_1, times 16 dw 1
+const pw_16, times 16 dw 16
+const pw_32, times 16 dw 32
+const pw_00ff, times 16 dw 0x00ff
+const pd_1, times 8 dd 1
+const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
const pb_01, times 8 db 0,1
const pb_0, times 16 db 0
const pb_a1, times 16 db 0xa1
-const pb_1, times 16 db 1
const pb_3, times 16 db 3
-const hsub_mul, times 8 db 1, -1
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
-const pw_1, times 8 dw 1
const pw_2, times 8 dw 2
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
-const pw_16, times 8 dw 16
-const pw_32, times 8 dw 32
const pw_64, times 8 dw 64
const pw_32_0, times 4 dw 32,
times 4 dw 0
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
-const pd_1, times 4 dd 1
const pd_32, times 4 dd 32
const pd_1024, times 4 dd 1024
const pd_ffff, times 4 dd 0xffff
-const pw_00ff, times 8 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
const sw_64, dd 64
INIT_XMM xop
DCT_SUB8
+INIT_YMM avx2
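+; two 8x8 DCTs in parallel: each ymm row holds the left 8x8 block in its low
+; 128-bit lane and the right 8x8 block in its high lane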
+cglobal sub16x16_dct8, 3,3,10
+ add r0, 128
+ add r2, 4*FDEC_STRIDE
+ call .sub16x8_dct8
+ add r0, 256
+ add r1, FENC_STRIDE*8
+ add r2, FDEC_STRIDE*8
+ call .sub16x8_dct8
+ RET
+.sub16x8_dct8:
+ LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1
+ LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
+ LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5
+ LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7
+ DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
+ DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
+ mova [r0-0x80+0x00], xm0
+ vextracti128 [r0+0x00], m0, 1
+ mova [r0-0x80+0x10], xm1
+ vextracti128 [r0+0x10], m1, 1
+ mova [r0-0x80+0x20], xm2
+ vextracti128 [r0+0x20], m2, 1
+ mova [r0-0x80+0x30], xm3
+ vextracti128 [r0+0x30], m3, 1
+ mova [r0-0x80+0x40], xm4
+ vextracti128 [r0+0x40], m4, 1
+ mova [r0-0x80+0x50], xm5
+ vextracti128 [r0+0x50], m5, 1
+ mova [r0-0x80+0x60], xm6
+ vextracti128 [r0+0x60], m6, 1
+ mova [r0-0x80+0x70], xm7
+ vextracti128 [r0+0x70], m7, 1
+ ret
+
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
ADD8x8
INIT_XMM avx
ADD8x8
+
%endif ; !HIGH_BIT_DEPTH
cextern pd_32
cextern pw_ppppmmmm
cextern pw_pmpmpmpm
+cextern deinterleave_shufd
%macro WALSH4_1D 6
SUMSUB_BADC %1, %5, %4, %3, %2, %6
ADD4x4
INIT_XMM avx
ADD4x4
+
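+; 2x idct coef regs, 2x tmp, 4x fdec row offset, 1x zero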
+%macro STOREx2_AVX2 9
+ movq xm%3, [r0+%5*FDEC_STRIDE]
+ vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1
+ movq xm%4, [r0+%7*FDEC_STRIDE]
+ vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1
+ punpcklbw m%3, m%9
+ punpcklbw m%4, m%9
+ psraw m%1, 6
+ psraw m%2, 6
+ paddsw m%1, m%3
+ paddsw m%2, m%4
+ packuswb m%1, m%2
+ vextracti128 xm%2, m%1, 1
+ movq [r0+%5*FDEC_STRIDE], xm%1
+ movq [r0+%6*FDEC_STRIDE], xm%2
+ movhps [r0+%7*FDEC_STRIDE], xm%1
+ movhps [r0+%8*FDEC_STRIDE], xm%2
+%endmacro
+
+INIT_YMM avx2
+cglobal add8x8_idct, 2,3,8
+ add r0, 4*FDEC_STRIDE
+ pxor m7, m7
+ TAIL_CALL .skip_prologue, 0
+global current_function %+ .skip_prologue
+.skip_prologue:
+ mova m0, [r1+ 0]
+ mova m1, [r1+ 32]
+ mova m2, [r1+ 64]
+ mova m3, [r1+ 96]
+ ; TRANSPOSE4x4Q
+ SBUTTERFLY qdq, 0, 1, 4
+ SBUTTERFLY qdq, 2, 3, 5
+ SBUTTERFLY dqqq, 0, 2, 4
+ SBUTTERFLY dqqq, 1, 3, 5
+ IDCT4_1D w,0,1,2,3,4,5
+ TRANSPOSE2x4x4W 0,1,2,3,4
+ paddw m0, [pw_32]
+ IDCT4_1D w,0,1,2,3,4,5
+ STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7
+ STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7
+ ret
+
+; 2xdst, 2xtmp, 4xsrcrow, 1xzero
+%macro LOAD_DIFF8x2_AVX2 9
+ movq xm%1, [r1+%5*FENC_STRIDE]
+ movq xm%2, [r1+%6*FENC_STRIDE]
+ vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1
+ vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1
+ punpcklbw m%1, m%9
+ punpcklbw m%2, m%9
+ movq xm%3, [r2+(%5-4)*FDEC_STRIDE]
+ movq xm%4, [r2+(%6-4)*FDEC_STRIDE]
+ vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1
+ vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1
+ punpcklbw m%3, m%9
+ punpcklbw m%4, m%9
+ psubw m%1, m%3
+ psubw m%2, m%4
+%endmacro
+
+; 4x src, 1x tmp
+%macro STORE8_DCT_AVX2 5
+ SBUTTERFLY qdq, %1, %2, %5
+ SBUTTERFLY qdq, %3, %4, %5
+ mova [r0+ 0], xm%1
+ mova [r0+ 16], xm%3
+ mova [r0+ 32], xm%2
+ mova [r0+ 48], xm%4
+ vextracti128 [r0+ 64], m%1, 1
+ vextracti128 [r0+ 80], m%3, 1
+ vextracti128 [r0+ 96], m%2, 1
+ vextracti128 [r0+112], m%4, 1
+%endmacro
+
+%macro STORE16_DCT_AVX2 5
+ SBUTTERFLY qdq, %1, %2, %5
+ SBUTTERFLY qdq, %3, %4, %5
+ mova [r0+ 0-128], xm%1
+ mova [r0+16-128], xm%3
+ mova [r0+32-128], xm%2
+ mova [r0+48-128], xm%4
+ vextracti128 [r0+ 0], m%1, 1
+ vextracti128 [r0+16], m%3, 1
+ vextracti128 [r0+32], m%2, 1
+ vextracti128 [r0+48], m%4, 1
+%endmacro
+
+INIT_YMM avx2
+cglobal sub8x8_dct, 3,3,7
+ pxor m6, m6
+ add r2, 4*FDEC_STRIDE
+ LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6
+ LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6
+ DCT4_1D 0, 1, 2, 3, 4
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+ DCT4_1D 0, 1, 2, 3, 4
+ STORE8_DCT_AVX2 0, 1, 2, 3, 4
+ RET
+
+INIT_YMM avx2
+cglobal sub16x16_dct, 3,3,6
+ add r0, 128
+ add r2, 4*FDEC_STRIDE
+ call .sub16x4_dct
+ add r0, 64
+ add r1, 4*FENC_STRIDE
+ add r2, 4*FDEC_STRIDE
+ call .sub16x4_dct
+ add r0, 256-64
+ add r1, 4*FENC_STRIDE
+ add r2, 4*FDEC_STRIDE
+ call .sub16x4_dct
+ add r0, 64
+ add r1, 4*FENC_STRIDE
+ add r2, 4*FDEC_STRIDE
+ call .sub16x4_dct
+ RET
+.sub16x4_dct:
+ LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1
+ LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
+ DCT4_1D 0, 1, 2, 3, 4
+ TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+ DCT4_1D 0, 1, 2, 3, 4
+ STORE16_DCT_AVX2 0, 1, 2, 3, 4
+ ret
%endif ; HIGH_BIT_DEPTH
INIT_MMX
cglobal %1, 2,2,11
pxor m7, m7
%endif
-%if mmsize==16 && %3!=256
+%if mmsize>=16 && %3!=256
add r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
+
+INIT_YMM
+ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
ZIGZAG_8x8_CAVLC
INIT_XMM avx
ZIGZAG_8x8_CAVLC
+
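+; interleave the 64 coeffs of an 8x8 block into four 4x4 cavlc blocks
+; and set their nnz flags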
+INIT_YMM avx2
+cglobal zigzag_interleave_8x8_cavlc, 3,3,6
+ mova m0, [r1+ 0]
+ mova m1, [r1+32]
+ mova m2, [r1+64]
+ mova m3, [r1+96]
+ mova m5, [deinterleave_shufd]
+ SBUTTERFLY wd, 0, 1, 4
+ SBUTTERFLY wd, 2, 3, 4
+ SBUTTERFLY wd, 0, 1, 4
+ SBUTTERFLY wd, 2, 3, 4
+ vpermd m0, m5, m0
+ vpermd m1, m5, m1
+ vpermd m2, m5, m2
+ vpermd m3, m5, m3
+ mova [r0+ 0], xm0
+ mova [r0+ 16], xm2
+ vextracti128 [r0+ 32], m0, 1
+ vextracti128 [r0+ 48], m2, 1
+ mova [r0+ 64], xm1
+ mova [r0+ 80], xm3
+ vextracti128 [r0+ 96], m1, 1
+ vextracti128 [r0+112], m3, 1
+
+ packsswb m0, m2 ; nnz0, nnz1
+ packsswb m1, m3 ; nnz2, nnz3
+ packsswb m0, m1 ; {nnz0,nnz2}, {nnz1,nnz3}
+ vpermq m0, m0, q3120 ; {nnz0,nnz1}, {nnz2,nnz3}
+ pxor m5, m5
+ pcmpeqq m0, m5
+ pmovmskb r0d, m0
+ not r0d
+ and r0d, 0x01010101
+ mov [r2+0], r0w
+ shr r0d, 16
+ mov [r2+8], r0w
+ RET
%endif ; !HIGH_BIT_DEPTH
void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] );
void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] );
+void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] );
+void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] );
void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] );
void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 );
void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
+void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz );
#endif
SECTION_RODATA 32
-ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
+pw_512: times 16 dw 512
+ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0
times 8 db 2
times 8 db 4
cextern pw_pixel_max
cextern sw_64
cextern pd_32
+cextern deinterleave_shufd
;=============================================================================
; implicit weighted biprediction
movh m1, %2
punpcklbw m0, m1
pmaddubsw m0, m3
- paddw m0, m4
- psraw m0, 6
+ pmulhrsw m0, m4
%endmacro
%macro BIWEIGHT_START_SSSE3 0
sub t7d, t6d
shl t7d, 8
add t6d, t7d
- movd m3, t6d
- mova m4, [pw_32]
+ mova m4, [pw_512]
+ movd xm3, t6d
+%if cpuflag(avx2)
+ vpbroadcastw m3, xm3
+%else
SPLATW m3, m3 ; weight_dst,src
+%endif
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM ssse3
AVG_WEIGHT 8, 7
AVG_WEIGHT 16, 7
+
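+; two rows per iteration, one per 128-bit lane; pmulhrsw against pw_512
+; performs the (x+32)>>6 rounding of the weighted sum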
+INIT_YMM avx2
+cglobal pixel_avg_weight_w16
+ BIWEIGHT_START
+ AVG_START 5
+.height_loop:
+ movu xm0, [t2]
+ movu xm1, [t4]
+ vinserti128 m0, m0, [t2+t3], 1
+ vinserti128 m1, m1, [t4+t5], 1
+ SBUTTERFLY bw, 0, 1, 2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+ mova [t0], xm0
+ vextracti128 [t0+t1], m0, 1
+ AVG_END
%endif ;HIGH_BIT_DEPTH
;=============================================================================
%endmacro
; src, dst, width
-%macro WEIGHT_TWO_ROW 3
+%macro WEIGHT_TWO_ROW 4
%assign x 0
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
%else ; !HIGH_BIT_DEPTH
%macro WEIGHT_START 1
+%if cpuflag(avx2)
+ vbroadcasti128 m3, [r4]
+ vbroadcasti128 m4, [r4+16]
+%else
mova m3, [r4]
mova m4, [r4+16]
%if notcpuflag(ssse3)
movd m5, [r4+32]
+%endif
%endif
pxor m2, m2
%endmacro
-; src1, src2, dst1, dst2
-%macro WEIGHT_ROWx2 4
+; src1, src2, dst1, dst2, fast
+%macro WEIGHT_ROWx2 5
movh m0, [%1 ]
movh m1, [%1+mmsize/2]
movh m6, [%2 ]
punpcklbw m6, m2
punpcklbw m7, m2
%if cpuflag(ssse3)
+%if %5==0
psllw m0, 7
psllw m1, 7
psllw m6, 7
psllw m7, 7
+%endif
pmulhrsw m0, m3
pmulhrsw m1, m3
pmulhrsw m6, m3
mova [%4], m6
%endmacro
-; src1, src2, dst1, dst2, width
-%macro WEIGHT_COL 5
+; src1, src2, dst1, dst2, width, fast
+%macro WEIGHT_COL 6
+%if cpuflag(avx2)
+%if %5==16
+ movu xm0, [%1]
+ vinserti128 m0, m0, [%2], 1
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m0, m2
+%if %6==0
+ psllw m0, 7
+ psllw m1, 7
+%endif
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m0, m4
+ paddw m1, m4
+ packuswb m0, m1
+ mova [%3], xm0
+ vextracti128 [%4], m0, 1
+%else
+ movq xm0, [%1]
+ vinserti128 m0, m0, [%2], 1
+ punpcklbw m0, m2
+%if %6==0
+ psllw m0, 7
+%endif
+ pmulhrsw m0, m3
+ paddw m0, m4
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+%if %5 == 8
+ movq [%3], xm0
+ movq [%4], xm1
+%else
+ movd [%3], xm0
+ movd [%4], xm1
+%endif
+%endif
+%else
movh m0, [%1]
movh m1, [%2]
punpcklbw m0, m2
punpcklbw m1, m2
%if cpuflag(ssse3)
+%if %6==0
psllw m0, 7
psllw m1, 7
+%endif
pmulhrsw m0, m3
pmulhrsw m1, m3
paddw m0, m4
movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
movd [%4], m1
%endif
+%endif
%endmacro
-
; src, dst, width
-%macro WEIGHT_TWO_ROW 3
+%macro WEIGHT_TWO_ROW 4
%assign x 0
%rep %3
%if (%3-x) >= mmsize
- WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x
+ WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
%assign x (x+mmsize)
%else
- WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x
- %exitrep
+ %assign w %3-x
+%if w == 20
+ %assign w 16
+%endif
+ WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
+ %assign x (x+w)
%endif
%if x >= %3
%exitrep
cglobal mc_weight_w%1, 6,6,8
FIX_STRIDES r1, r3
WEIGHT_START %1
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+ ; we can merge the shift step into the scale factor
+ ; if (m3<<7) doesn't overflow an int16_t
+ cmp byte [r4+1], 0
+ jz .fast
+%endif
.loop:
- WEIGHT_TWO_ROW r2, r0, %1
+ WEIGHT_TWO_ROW r2, r0, %1, 0
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
sub r5d, 2
jg .loop
RET
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+.fast:
+ psllw m3, 7
+.fastloop:
+ WEIGHT_TWO_ROW r2, r0, %1, 1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub r5d, 2
+ jg .fastloop
+ RET
+%endif
%endmacro
INIT_MMX mmx2
WEIGHTER 8
WEIGHTER 16
WEIGHTER 20
+INIT_YMM avx2
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
%endif
%macro OFFSET_OP 7
mov eax, %2
cmp dword r6m, 32
jne pixel_avg_weight_w%1 %+ SUFFIX
+%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
+ jmp pixel_avg_w%1_avx2
+%else
%if mmsize == 16 && %1 == 16
test dword r4m, 15
jz pixel_avg_w%1_sse2
%endif
jmp pixel_avg_w%1_mmx2
+%endif
%endmacro
;-----------------------------------------------------------------------------
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
+INIT_XMM avx2
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16, 8
%endif ;HIGH_BIT_DEPTH
AVG2_W20 sse2
AVG2_W20 sse2_misalign
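+; a single 32-byte load covers the full 20-pixel width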
+INIT_YMM avx2
+cglobal pixel_avg2_w20, 6,7
+ sub r2, r4
+ lea r6, [r2+r3]
+.height_loop:
+ movu m0, [r4]
+ movu m1, [r4+r3]
+ pavgb m0, [r4+r2]
+ pavgb m1, [r4+r6]
+ lea r4, [r4+r3*2]
+ mova [r0], m0
+ mova [r0+r1], m1
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
%macro MC_CHROMA_SSSE3 0
cglobal mc_chroma
+%if cpuflag(avx2)
MC_CHROMA_START 9
+%else
+ MC_CHROMA_START 10
+%endif
and r5d, 7
and t2d, 7
mov t0d, r5d
sub r5d, t2d
imul t2d, t0d ; (x*255+8)*y
imul r5d, t0d ; (x*255+8)*(8-y)
- movd m6, t2d
- movd m7, r5d
+ movd xm6, t2d
+ movd xm7, r5d
%if cpuflag(cache64)
mov t0d, r3d
and t0d, 7
%ifdef PIC
lea t1, [ch_shuf_adj]
- movddup m5, [t1 + t0*4]
+ movddup xm5, [t1 + t0*4]
%else
- movddup m5, [ch_shuf_adj + t0*4]
+ movddup xm5, [ch_shuf_adj + t0*4]
%endif
- paddb m5, [ch_shuf]
+ paddb xm5, [ch_shuf]
and r3, ~7
%else
mova m5, [ch_shuf]
movifnidn r1, r1mp
movifnidn r2d, r2m
movifnidn r5d, r8m
+%if cpuflag(avx2)
+ vpbroadcastw m6, xm6
+ vpbroadcastw m7, xm7
+%else
SPLATW m6, m6
SPLATW m7, m7
+%endif
+%if ARCH_X86_64
+ %define shiftround m8
+ mova m8, [pw_512]
+%else
+ %define shiftround [pw_512]
+%endif
cmp dword r7m, 4
jg .width8
- movu m0, [r3]
+
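+; AVX2: two output rows per iteration, one per 128-bit lane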
+%if cpuflag(avx2)
+.loop4:
+ movu xm0, [r3]
+ movu xm1, [r3+r4]
+ vinserti128 m0, m0, [r3+r4], 1
+ vinserti128 m1, m1, [r3+r4*2], 1
pshufb m0, m5
+ pshufb m1, m5
+ pmaddubsw m0, m7
+ pmaddubsw m1, m6
+ paddw m0, m1
+ pmulhrsw m0, shiftround
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [r0], xm0
+ movd [r0+r2], xm1
+ psrldq xm0, 4
+ psrldq xm1, 4
+ movd [r1], xm0
+ movd [r1+r2], xm1
+ lea r3, [r3+r4*2]
+ lea r0, [r0+r2*2]
+ lea r1, [r1+r2*2]
+ sub r5d, 2
+ jg .loop4
+ RET
+.width8:
+ movu xm0, [r3]
+ vinserti128 m0, m0, [r3+8], 1
+ pshufb m0, m5
+.loop8:
+ movu xm3, [r3+r4]
+ vinserti128 m3, m3, [r3+r4+8], 1
+ pshufb m3, m5
+ pmaddubsw m1, m0, m7
+ pmaddubsw m2, m3, m6
+ pmaddubsw m3, m3, m7
+
+ movu xm0, [r3+r4*2]
+ vinserti128 m0, m0, [r3+r4*2+8], 1
+ pshufb m0, m5
+ pmaddubsw m4, m0, m6
+
+ paddw m1, m2
+ paddw m3, m4
+ pmulhrsw m1, shiftround
+ pmulhrsw m3, shiftround
+ packuswb m1, m3
+ mova m2, [deinterleave_shufd]
+ vpermd m1, m2, m1
+ vextracti128 xm2, m1, 1
+ movq [r0], xm1
+ movhps [r1], xm1
+ movq [r0+r2], xm2
+ movhps [r1+r2], xm2
+%else
+ movu m0, [r3]
+ pshufb m0, xm5
.loop4:
movu m1, [r3+r4]
pshufb m1, m5
pmaddubsw m2, m1, m7
pmaddubsw m1, m6
pmaddubsw m3, m6
- paddw m0, [pw_32]
- paddw m2, [pw_32]
paddw m1, m0
paddw m3, m2
+ pmulhrsw m1, shiftround
+ pmulhrsw m3, shiftround
mova m0, m4
- psrlw m1, 6
- psrlw m3, 6
packuswb m1, m3
movhlps m3, m1
- movd [r0], m1
+ movd [r0], xm1
movd [r0+r2], m3
psrldq m1, 4
psrldq m3, 4
sub r5d, 2
jg .loop4
RET
-
.width8:
movu m0, [r3]
pshufb m0, m5
movu m1, [r3+8]
pshufb m1, m5
%if ARCH_X86_64
- SWAP 8, 6
- %define mult1 m8
+ SWAP 9, 6
+ %define mult1 m9
%else
mova r0m, m6
%define mult1 r0m
pmaddubsw m1, m7
pmaddubsw m2, mult1
pmaddubsw m3, mult1
- paddw m0, [pw_32]
- paddw m1, [pw_32]
paddw m0, m2
paddw m1, m3
- psrlw m0, 6
- psrlw m1, 6
+ pmulhrsw m0, shiftround ; (x + 32) >> 6
+ pmulhrsw m1, shiftround
packuswb m0, m1
pshufd m0, m0, q3120
movq [r0], m0
pmaddubsw m6, m7
pmaddubsw m2, mult1
pmaddubsw m3, mult1
- paddw m4, [pw_32]
- paddw m6, [pw_32]
paddw m2, m4
paddw m3, m6
- psrlw m2, 6
- psrlw m3, 6
+ pmulhrsw m2, shiftround
+ pmulhrsw m3, shiftround
packuswb m2, m3
pshufd m2, m2, q3120
movq [r0+r2], m2
movhps [r1+r2], m2
+%endif
lea r3, [r3+r4*2]
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
MC_CHROMA_SSSE3
INIT_XMM avx
MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
+INIT_YMM avx2
+MC_CHROMA_SSSE3
%endif ; HIGH_BIT_DEPTH
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
-filt_mul20: times 16 db 20
-filt_mul15: times 8 db 1, -5
-filt_mul51: times 8 db -5, 1
-hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
-deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
+filt_mul20: times 32 db 20
+filt_mul15: times 16 db 1, -5
+filt_mul51: times 16 db -5, 1
+hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
+deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
FILT_V2 m1, m2, m3, m4, m5, m6
%endif
mova m7, [pw_16]
+%if mmsize==32
+ mova [r2+r4*2], xm1
+ mova [r2+r4*2+mmsize/2], xm4
+ vextracti128 [r2+r4*2+mmsize], m1, 1
+ vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
+%else
mova [r2+r4*2], m1
mova [r2+r4*2+mmsize], m4
+%endif
FILT_PACK m1, m4, 5, m7
movnta [r0+r4], m1
add r1, mmsize
%define tpw_32 [pw_32]
%endif
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
-%if cpuflag(misalign)
+%if cpuflag(misalign) || mmsize==32
.loop:
movu m4, [src-4]
movu m5, [src-2]
- mova m6, [src]
- movu m3, [src+12]
- movu m2, [src+14]
- mova m1, [src+16]
+ mova m6, [src+0]
+ movu m3, [src-4+mmsize]
+ movu m2, [src-2+mmsize]
+ mova m1, [src+0+mmsize]
paddw m4, [src+6]
paddw m5, [src+4]
paddw m6, [src+2]
- paddw m3, [src+22]
- paddw m2, [src+20]
- paddw m1, [src+18]
+ paddw m3, [src+6+mmsize]
+ paddw m2, [src+4+mmsize]
+ paddw m1, [src+2+mmsize]
FILT_H2 m4, m5, m6, m3, m2, m1
%else
mova m0, [src-16]
FILT_H m3, m5, m6
%endif
FILT_PACK m4, m3, 6, tpw_32
- movntps [r0+r2], m4
- add r2, 16
+%if mmsize==32
+ vpermq m4, m4, q3120
+%endif
+ movnta [r0+r2], m4
+ add r2, mmsize
jl .loop
RET
%endmacro
HPEL_V 0
HPEL_H
%endif
+INIT_YMM avx2
+HPEL_V 8
+HPEL_C
+
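+; 6-tap [1,-5,20,20,-5,1] horizontal filter via pmaddubsw against the
+; filt_mul constants; results rounded with FILT_PACK and reordered with pshufb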
+INIT_YMM avx2
+cglobal hpel_filter_h, 3,3,8
+ add r0, r2
+ add r1, r2
+ neg r2
+ %define src r1+r2
+ mova m5, [filt_mul15]
+ mova m6, [filt_mul20]
+ mova m7, [filt_mul51]
+.loop:
+ movu m0, [src-2]
+ movu m1, [src-1]
+ movu m2, [src+2]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m7
+ paddw m0, m2
+
+ mova m2, [src+0]
+ movu m3, [src+1]
+ movu m4, [src+3]
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m4, m7
+ paddw m0, m2
+ paddw m1, m3
+ paddw m1, m4
+
+ mova m2, [pw_16]
+ FILT_PACK m0, m1, 5, m2
+ pshufb m0, [hpel_shuf]
+ movnta [r0+r2], m0
+ add r2, mmsize
+ jl .loop
+ RET
%if ARCH_X86_64
%macro DO_FILT_V 5
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
-INIT_XMM
-cglobal integral_init4h_sse4, 3,4
+%macro INTEGRAL_INIT4H 0
+cglobal integral_init4h, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
pxor m4, m4
.loop:
- movdqa m0, [r1+r2]
- movdqa m1, [r1+r2+16]
+ mova m0, [r1+r2]
+%if mmsize==32
+ movu m1, [r1+r2+8]
+%else
+ mova m1, [r1+r2+16]
palignr m1, m0, 8
+%endif
mpsadbw m0, m4, 0
mpsadbw m1, m4, 0
paddw m0, [r0+r2*2]
- paddw m1, [r0+r2*2+16]
- movdqa [r3+r2*2 ], m0
- movdqa [r3+r2*2+16], m1
- add r2, 16
+ paddw m1, [r0+r2*2+mmsize]
+ mova [r3+r2*2 ], m0
+ mova [r3+r2*2+mmsize], m1
+ add r2, mmsize
jl .loop
RET
+%endmacro
+
+INIT_XMM sse4
+INTEGRAL_INIT4H
+INIT_YMM avx2
+INTEGRAL_INIT4H
%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
neg r2
pxor m4, m4
.loop:
- movdqa m0, [r1+r2]
- movdqa m1, [r1+r2+16]
+ mova m0, [r1+r2]
+%if mmsize==32
+ movu m1, [r1+r2+8]
+ mpsadbw m2, m0, m4, 100100b
+ mpsadbw m3, m1, m4, 100100b
+%else
+ mova m1, [r1+r2+16]
palignr m1, m0, 8
- mpsadbw m2, m0, m4, 4
- mpsadbw m3, m1, m4, 4
+ mpsadbw m2, m0, m4, 100b
+ mpsadbw m3, m1, m4, 100b
+%endif
mpsadbw m0, m4, 0
mpsadbw m1, m4, 0
paddw m0, [r0+r2*2]
- paddw m1, [r0+r2*2+16]
+ paddw m1, [r0+r2*2+mmsize]
paddw m0, m2
paddw m1, m3
- movdqa [r3+r2*2 ], m0
- movdqa [r3+r2*2+16], m1
- add r2, 16
+ mova [r3+r2*2 ], m0
+ mova [r3+r2*2+mmsize], m1
+ add r2, mmsize
jl .loop
RET
%endmacro
INTEGRAL_INIT8H
INIT_XMM avx
INTEGRAL_INIT8H
+INIT_YMM avx2
+INTEGRAL_INIT8H
%endif ; !HIGH_BIT_DEPTH
%macro INTEGRAL_INIT_8V 0
; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init8v, 3,3
- shl r1, 1
+ add r1, r1
add r0, r1
lea r2, [r0+r1*8]
neg r1
INTEGRAL_INIT_8V
INIT_XMM sse2
INTEGRAL_INIT_8V
+INIT_YMM avx2
+INTEGRAL_INIT_8V
;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal integral_init4v_mmx, 3,5
+INIT_MMX mmx
+cglobal integral_init4v, 3,5
shl r2, 1
lea r3, [r0+r2*4]
lea r4, [r0+r2*8]
jge .loop
RET
-INIT_XMM
-cglobal integral_init4v_sse2, 3,5
+INIT_XMM sse2
+cglobal integral_init4v, 3,5
shl r2, 1
add r0, r2
add r1, r2
jl .loop
RET
-cglobal integral_init4v_ssse3, 3,5
+INIT_XMM ssse3
+cglobal integral_init4v, 3,5
shl r2, 1
add r0, r2
add r1, r2
jl .loop
RET
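+; AVX2: same algorithm as the sse2 version, 32 bytes per iteration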
+INIT_YMM avx2
+cglobal integral_init4v, 3,5
+ add r2, r2
+ add r0, r2
+ add r1, r2
+ lea r3, [r0+r2*4]
+ lea r4, [r0+r2*8]
+ neg r2
+.loop:
+ mova m2, [r0+r2]
+ movu m1, [r4+r2+8]
+ paddw m0, m2, [r0+r2+8]
+ paddw m1, [r4+r2]
+ mova m3, [r3+r2]
+ psubw m1, m0
+ psubw m3, m2
+ mova [r0+r2], m1
+ mova [r1+r2], m3
+ add r2, 32
+ jl .loop
+ RET
+
%macro FILT8x4 7
mova %3, [r0+%7]
mova %4, [r0+r5+%7]
%endif
%endmacro
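+; 4x dst; averages rows and adjacent columns of the source and deinterleaves
+; even/odd columns into the four lowres planes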
+%macro FILT32x4U 4
+ mova m1, [r0+r5]
+ pavgb m0, m1, [r0]
+ movu m3, [r0+r5+1]
+ pavgb m2, m3, [r0+1]
+ pavgb m1, [r0+r5*2]
+ pavgb m3, [r0+r5*2+1]
+ pavgb m0, m2
+ pavgb m1, m3
+
+ mova m3, [r0+r5+mmsize]
+ pavgb m2, m3, [r0+mmsize]
+ movu m5, [r0+r5+1+mmsize]
+ pavgb m4, m5, [r0+1+mmsize]
+ pavgb m3, [r0+r5*2+mmsize]
+ pavgb m5, [r0+r5*2+1+mmsize]
+ pavgb m2, m4
+ pavgb m3, m5
+
+ pshufb m0, m7
+ pshufb m1, m7
+ pshufb m2, m7
+ pshufb m3, m7
+ punpckhqdq m4, m0, m2
+ punpcklqdq m0, m0, m2
+ punpckhqdq m5, m1, m3
+ punpcklqdq m2, m1, m3
+ vpermq m0, m0, q3120
+ vpermq m1, m4, q3120
+ vpermq m2, m2, q3120
+ vpermq m3, m5, q3120
+ mova [%1], m0
+ mova [%2], m1
+ mova [%3], m2
+ mova [%4], m3
+%endmacro
+
%macro FILT16x2 4
mova m3, [r0+%4+mmsize]
mova m2, [r0+%4]
shl dword r6m, 1
FIX_STRIDES r5
shl dword r7m, 1
+%endif
+%if mmsize >= 16
+ add dword r7m, mmsize-1
+ and dword r7m, ~(mmsize-1)
%endif
; src += 2*(height-1)*stride + 2*width
mov r6d, r8m
sub r6d, mmsize
jg .hloop
%else ; !HIGH_BIT_DEPTH
-%if mmsize == 16
- ; adjust for the odd end case
- mov r6d, r7m
- and r6d, 8
- sub r1, r6
- sub r2, r6
- sub r3, r6
- sub r4, r6
- add dst_gap, r6d
-%endif ; mmsize
-%if cpuflag(xop)
+%if cpuflag(avx2)
+ mova m7, [deinterleave_shuf]
+%elif cpuflag(xop)
mova m6, [deinterleave_shuf32a]
mova m7, [deinterleave_shuf32b]
%else
.vloop:
mov r6d, r7m
%ifnidn cpuname, mmx2
+%if mmsize <= 16
mova m0, [r0]
mova m1, [r0+r5]
pavgb m0, m1
pavgb m1, [r0+r5*2]
%endif
-%if mmsize == 16
- test r6d, 8
- jz .hloop
- sub r0, 16
- FILT8x4 m0, m1, m2, m3, m4, m5, 0
-%if cpuflag(xop)
- mova m4, m0
- vpperm m0, m4, m1, m6
- vpperm m1, m4, m1, m7
- movq [r1], m0
- movq [r2], m1
- movhps [r3], m0
- movhps [r4], m1
-%else
- packuswb m0, m4
- packuswb m1, m5
- movq [r1], m0
- movhps [r2], m0
- movq [r3], m1
- movhps [r4], m1
%endif
- mova m0, m2
- mova m1, m3
- sub r6d, 8
- jz .skip
-%endif ; mmsize
.hloop:
sub r0, mmsize*2
sub r1, mmsize
sub r2, mmsize
sub r3, mmsize
sub r4, mmsize
-%ifdef m8
+%if mmsize==32
+ FILT32x4U r1, r2, r3, r4
+%elifdef m8
FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
mova m8, m0
mova m9, m1
FRAME_INIT_LOWRES
INIT_XMM xop
FRAME_INIT_LOWRES
+%if HIGH_BIT_DEPTH==0
+INIT_YMM avx2
+FRAME_INIT_LOWRES
+%endif
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
INIT_XMM fma4
MBTREE
-%macro INT16_TO_FLOAT 1
-%if cpuflag(avx2)
- vpmovzxwd ymm%1, xmm%1
-%else
- vpunpckhwd xmm4, xmm%1, xmm7
- vpunpcklwd xmm%1, xmm7
- vinsertf128 ymm%1, ymm%1, xmm4, 1
-%endif
- vcvtdq2ps ymm%1, ymm%1
+%macro INT16_UNPACK 1
+ vpunpckhwd xm4, xm%1, xm7
+ vpunpcklwd xm%1, xm7
+ vinsertf128 m%1, m%1, xm4, 1
%endmacro
; FIXME: align loads/stores to 16 bytes
%macro MBTREE_AVX 0
cglobal mbtree_propagate_cost, 7,7,8
- add r6d, r6d
- lea r0, [r0+r6*2]
- add r1, r6
- add r2, r6
- add r3, r6
- add r4, r6
- neg r6
- vmovdqa xmm5, [pw_3fff]
- vbroadcastss ymm6, [r5]
- vmulps ymm6, ymm6, [pf_inv256]
+ add r6d, r6d
+ lea r0, [r0+r6*2]
+ add r1, r6
+ add r2, r6
+ add r3, r6
+ add r4, r6
+ neg r6
+ mova xm5, [pw_3fff]
+ vbroadcastss m6, [r5]
+ mulps m6, [pf_inv256]
%if notcpuflag(avx2)
- vpxor xmm7, xmm7
+ pxor xm7, xm7
%endif
.loop:
- vmovdqu xmm0, [r2+r6] ; intra
- vmovdqu xmm1, [r4+r6] ; invq
- vmovdqu xmm2, [r1+r6] ; prop
- vpand xmm3, xmm5, [r3+r6] ; inter
- INT16_TO_FLOAT 0
- INT16_TO_FLOAT 1
- INT16_TO_FLOAT 2
- INT16_TO_FLOAT 3
-%if cpuflag(fma3)
- vmulps ymm1, ymm1, ymm0
- vsubps ymm4, ymm0, ymm3
- fmaddps ymm1, ymm1, ymm6, ymm2
- vrcpps ymm3, ymm0
- vmulps ymm2, ymm0, ymm3
- vmulps ymm1, ymm1, ymm4
- vaddps ymm4, ymm3, ymm3
- fnmaddps ymm4, ymm2, ymm3, ymm4
- vmulps ymm1, ymm1, ymm4
+%if cpuflag(avx2)
+ pmovzxwd m0, [r2+r6] ; intra
+ pmovzxwd m1, [r4+r6] ; invq
+ pmovzxwd m2, [r1+r6] ; prop
+ pand xm3, xm5, [r3+r6] ; inter
+ pmovzxwd m3, xm3
+ pmaddwd m1, m0
+ psubd m4, m0, m3
+ cvtdq2ps m0, m0
+ cvtdq2ps m1, m1
+ cvtdq2ps m2, m2
+ cvtdq2ps m4, m4
+ fmaddps m1, m1, m6, m2
+ rcpps m3, m0
+ mulps m2, m0, m3
+ mulps m1, m4
+ addps m4, m3, m3
+ fnmaddps m4, m2, m3, m4
+ mulps m1, m4
%else
- vmulps ymm1, ymm1, ymm0
- vsubps ymm4, ymm0, ymm3
- vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
- vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8)
- vrcpps ymm3, ymm0 ; 1 / intra 1st approximation
- vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx)
- vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2
- vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
- vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
- vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
- vmulps ymm1, ymm1, ymm3 ; / intra
+ movu xm0, [r2+r6]
+ movu xm1, [r4+r6]
+ movu xm2, [r1+r6]
+ pand xm3, xm5, [r3+r6]
+ INT16_UNPACK 0
+ INT16_UNPACK 1
+ INT16_UNPACK 2
+ INT16_UNPACK 3
+ cvtdq2ps m0, m0
+ cvtdq2ps m1, m1
+ cvtdq2ps m2, m2
+ cvtdq2ps m3, m3
+ mulps m1, m0
+ subps m4, m0, m3
+ mulps m1, m6 ; intra*invq*fps_factor>>8
+ addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
+ rcpps m3, m0 ; 1 / intra 1st approximation
+ mulps m2, m0, m3 ; intra * (1/intra 1st approx)
+ mulps m2, m3 ; intra * (1/intra 1st approx)^2
+ mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+ addps m3, m3 ; 2 * (1/intra 1st approx)
+ subps m3, m2 ; 2nd approximation for 1/intra
+ mulps m1, m3 ; / intra
%endif
- vcvtps2dq ymm1, ymm1
- vmovdqu [r0+r6*2], ymm1
- add r6, 16
+ vcvtps2dq m1, m1
+ movu [r0+r6*2], m1
+ add r6, 16
jl .loop
RET
%endmacro
#define DECL_SUF( func, args )\
void func##_mmx2 args;\
void func##_sse2 args;\
- void func##_ssse3 args;
+ void func##_ssse3 args;\
+ void func##_avx2 args;
DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
MC_WEIGHT( 12, ssse3 )
MC_WEIGHT( 16, ssse3 )
MC_WEIGHT( 20, ssse3 )
+MC_WEIGHT( 8, avx2 )
+MC_WEIGHT( 16, avx2 )
+MC_WEIGHT( 20, avx2 )
#undef MC_OFFSET
#undef MC_WEIGHT
void x264_memzero_aligned_mmx( void *dst, size_t n );
void x264_memzero_aligned_sse( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
+void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride );
+void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
+void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
+void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
MC_CHROMA(avx)
-MC_CHROMA(avx_cache64)
+MC_CHROMA(avx2)
#define LOWRES(cpu)\
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
LOWRES(ssse3)
LOWRES(avx)
LOWRES(xop)
+LOWRES(avx2)
#define PIXEL_AVG_W(width,cpu)\
void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t );
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse2_misalign)
PIXEL_AVG_WALL(cache64_ssse3)
+PIXEL_AVG_WALL(avx2)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
+PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2)
#endif // HIGH_BIT_DEPTH
#define MC_COPY_WTAB(instr, name1, name2, name3)\
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
+MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16)
static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
GET_REF(cache64_ssse3_atom)
+GET_REF(avx2)
#endif // !HIGH_BIT_DEPTH
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
width += realign;\
while( height-- )\
{\
- x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\
- x264_hpel_filter_c_##cpuc( dstc, buf+8, width );\
+ x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\
+ x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\
x264_hpel_filter_h_##cpuh( dsth, src, width );\
dsth += stride;\
dstv += stride;\
HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, avx, avx, avx, avx)
#endif
+HPEL(32, avx2, avx2, avx2, avx2)
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
if( cpu&X264_CPU_XOP )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
+
+ if( cpu&X264_CPU_AVX2 )
+ {
+ pf->hpel_filter = x264_hpel_filter_avx2;
+ pf->mc_chroma = x264_mc_chroma_avx2;
+ pf->weight = x264_mc_weight_wtab_avx2;
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx2;
+ pf->get_ref = get_ref_avx2;
+ pf->integral_init8v = x264_integral_init8v_avx2;
+ pf->integral_init4v = x264_integral_init4v_avx2;
+ pf->integral_init8h = x264_integral_init8h_avx2;
+ pf->integral_init4h = x264_integral_init4h_avx2;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
+ }
#endif // HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_AVX) )
%include "x86util.asm"
SECTION_RODATA 32
+hmul_16p: times 16 db 1
+ times 8 db 1, -1
+hmul_8p: times 8 db 1
+ times 4 db 1, -1
+ times 8 db 1
+ times 4 db 1, -1
mask_ff: times 16 db 0xff
times 16 db 0
+mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
+mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
+mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
%if BIT_DEPTH == 10
ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
%endif
-mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1
-mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
-mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1
hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-hmul_8p: times 8 db 1
- times 4 db 1, -1
mask_10: times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
pb_pppm: times 4 db 1,1,1,-1
intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0
+ALIGN 32
intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
punpcklbw m%2, m%4
%endmacro
+%macro LOAD_AVX2 5
+ mova xm%1, %3
+ vinserti128 m%1, m%1, %4, 1
+%if %5
+ lea t0, [t0+2*t1]
+%endif
+%endmacro
+
+%macro JOIN_AVX2 7
+ mova xm%2, %5
+ vinserti128 m%2, m%2, %6, 1
+%if %7
+ lea t2, [t2+2*t3]
+%endif
+ SBUTTERFLY bw, %1, %2, %3
+%endmacro
+
%macro SSD_LOAD_HALF 5
LOAD 1, 2, [t0+%1], [t0+%3], 1
JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
%endif
dec al
jg .loop
+%if mmsize==32
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm1
+ HADDD xm0, xm1
+ movd eax, xm0
+%else
HADDD m0, m1
movd eax, m0
+%endif
RET
%endif
%endmacro
SSD 16, 8
SSD 8, 16
SSD 8, 4
+%define LOAD LOAD_AVX2
+%define JOIN JOIN_AVX2
+INIT_YMM avx2
+SSD 16, 16
+SSD 16, 8
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
%if %1
mova m7, [pw_00ff]
-%else
+%elif mmsize < 32
pxor m7, m7 ; zero
%endif
%endif ; !HIGH_BIT_DEPTH
%else ; !HIGH_BIT_DEPTH
HADDW m5, m2
%endif ; HIGH_BIT_DEPTH
- movd eax, m5
HADDD m6, m1
- movd edx, m6
%if ARCH_X86_64
- shl rdx, 32
- add rax, rdx
+ punpckldq m5, m6
+ movq rax, m5
+%else
+ movd eax, m5
+ movd edx, m6
%endif
RET
%endmacro
VAR
INIT_XMM xop
VAR
+
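+; 16x16 variance: 4 rows per iteration, pixels widened with pmovzxbw;
+; m5 accumulates the sum, m6 the sum of squares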
+INIT_YMM avx2
+cglobal pixel_var_16x16, 2,4,7
+ VAR_START 0
+ mov r2d, 4
+ lea r3, [r1*3]
+.loop:
+ pmovzxbw m0, [r0]
+ pmovzxbw m3, [r0+r1]
+ pmovzxbw m1, [r0+r1*2]
+ pmovzxbw m4, [r0+r3]
+ lea r0, [r0+r1*4]
+ VAR_CORE
+ dec r2d
+ jg .loop
+ vextracti128 xm0, m5, 1
+ vextracti128 xm1, m6, 1
+ paddw xm5, xm0
+ paddd xm6, xm1
+ HADDW xm5, xm2
+ HADDD xm6, xm1
+%if ARCH_X86_64
+ punpckldq xm5, xm6
+ movq rax, xm5
+%else
+ movd eax, xm5
+ movd edx, xm6
+%endif
+ RET
%endif ; !HIGH_BIT_DEPTH
-%macro VAR2_END 1
- HADDW m5, m7
- movd r1d, m5
+%macro VAR2_END 3
+ HADDW %2, xm1
+ movd r1d, %2
imul r1d, r1d
- HADDD m6, m1
+ HADDD %3, xm1
shr r1d, %1
- movd eax, m6
- mov [r4], eax
+ movd eax, %3
+ movd [r4], %3
sub eax, r1d ; sqr - (sum * sum >> shift)
RET
%endmacro
add r2, r3
dec r5d
jg .loop
- VAR2_END %2
+ VAR2_END %2, m5, m6
%endmacro
%if ARCH_X86_64 == 0
lea r2, [r2+r3*2*SIZEOF_PIXEL]
dec r5d
jg .loop
- VAR2_END %2
+ VAR2_END %2, m5, m6
%endmacro
INIT_XMM sse2
lea r2, [r2+r3*2]
dec r5d
jg .loop
- VAR2_END %2
+ VAR2_END %2, m5, m6
%endmacro
INIT_XMM ssse3
VAR2_8x8_SSSE3 8, 6
VAR2_8x8_SSSE3 16, 7
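+; %1 = height, %2 = shift; fenc/fdec differences via pmaddubsw with the
+; +1/-1 hsub_mul constant, four rows per iteration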
+%macro VAR2_8x8_AVX2 2
+cglobal pixel_var2_8x%1, 5,6,6
+ pxor m3, m3 ; sum
+ pxor m4, m4 ; sum squared
+ mova m5, [hsub_mul]
+ mov r5d, %1/4
+.loop:
+ movq xm0, [r0]
+ movq xm1, [r2]
+ vinserti128 m0, m0, [r0+r1], 1
+ vinserti128 m1, m1, [r2+r3], 1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ punpcklbw m0, m1
+ movq xm1, [r0]
+ movq xm2, [r2]
+ vinserti128 m1, m1, [r0+r1], 1
+ vinserti128 m2, m2, [r2+r3], 1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ punpcklbw m1, m2
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ paddw m3, m0
+ paddw m3, m1
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ paddd m4, m0
+ paddd m4, m1
+ dec r5d
+ jg .loop
+ vextracti128 xm0, m3, 1
+ vextracti128 xm1, m4, 1
+ paddw xm3, xm0
+ paddd xm4, xm1
+ VAR2_END %2, xm3, xm4
+%endmacro
+
+INIT_YMM avx2
+VAR2_8x8_AVX2 8, 6
+VAR2_8x8_AVX2 16, 7
+
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro
+%macro LOAD_SUMSUB_16x2P_AVX2 9
+; 2*dst, 2*tmp, mul, 4*ptr
+ vbroadcasti128 m%1, [%6]
+ vbroadcasti128 m%3, [%7]
+ vbroadcasti128 m%2, [%8]
+ vbroadcasti128 m%4, [%9]
+ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
+%endmacro
+
+%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
+; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
+ LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
+ LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
+%if %10
+ lea %8, [%8+4*r1]
+ lea %9, [%9+4*r3]
+%endif
+%endmacro
+
+%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
+ mova xm%3, %6
+ mova xm%4, %8
+ mova xm%1, %5
+ mova xm%2, %7
+ vpermq m%3, m%3, q0011
+ vpermq m%4, m%4, q0011
+ vpermq m%1, m%1, q0011
+ vpermq m%2, m%2, q0011
+%endmacro
+
+%macro LOAD_SUMSUB8_16x2P_AVX2 9
+; 2*dst, 2*tmp, mul, 4*ptr
+ LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
+ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
+%endmacro
+
+%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
+; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
+ LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
+ LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
+%if %10
+ lea %8, [%8+4*r1]
+ lea %9, [%9+4*r3]
+%endif
+%endmacro
+
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
%if HIGH_BIT_DEPTH && %3
pxor %2, %2
%elif cpuflag(ssse3) && notcpuflag(atom)
+%if mmsize==32
+ mova %2, [hmul_16p]
+%else
mova %2, [hmul_8p]
+%endif
%endif
lea r4, [3*r1]
lea r5, [3*r3]
%macro SATD_END_SSE2 1-2
%if HIGH_BIT_DEPTH
- HADDUW %1, m0
+ HADDUW %1, xm0
%if %0 == 2
paddd %1, %2
%endif
%else
- HADDW %1, m7
+ HADDW %1, xm7
%endif
movd eax, %1
RET
HADAMARD 0, amax, 14, 15, 6
paddw m10, m14
%else
- LOAD_SUMSUB_8x4P_SSSE3 %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
+ LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
HADAMARD4_V %2, %3, %4, %5, 6
pabsw m12, m%2 ; doing the abs first is a slight advantage
;-------------------------------------------------------------------------------
; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
;-------------------------------------------------------------------------------
-cglobal pixel_sa8d_satd_16x16, 4,8,16,SIZEOF_PIXEL*mmsize
+cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
%define temp0 [rsp+0*mmsize]
%define temp1 [rsp+1*mmsize]
FIX_STRIDES r1, r3
%endif
lea r4, [3*r1]
lea r5, [3*r3]
+ pxor m10, m10
+
+%if mmsize==32
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 0, 1
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 1, 0
+ vextracti128 xm1, m0, 1
+ vextracti128 xm2, m10, 1
+ paddw xm0, xm1
+ paddw xm10, xm2
+%else
lea r6, [r2+8*SIZEOF_PIXEL]
lea r7, [r0+8*SIZEOF_PIXEL]
- pxor m10, m10
call pixel_sa8d_satd_8x8_internal
SA8D_SATD_ACCUM 0, 1
SA8D_SATD_ACCUM 1, 1
call pixel_sa8d_satd_8x8_internal
SA8D_SATD_ACCUM 1, 0
+%endif
; xop already has fast horizontal sums
%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
- pmaddwd m10, [pw_1]
- HADDUWD m0, m1
- phaddd m0, m10 ; sa8d1 sa8d2 satd1 satd2
- pshufd m1, m0, q2301 ; sa8d2 sa8d1 satd2 satd1
- paddd m0, m1 ; sa8d sa8d satd satd
- movd r0d, m0
- pextrd eax, m0, 2
+ pmaddwd xm10, [pw_1]
+ HADDUWD xm0, xm1
+ phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2
+ pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1
+ paddd xm0, xm1 ; sa8d sa8d satd satd
+ movd r0d, xm0
+ pextrd eax, xm0, 2
%else
%if HIGH_BIT_DEPTH
- HADDD m0, m1
- HADDD m10, m2
+ HADDD xm0, xm1
+ HADDD xm10, xm2
%else
- HADDUW m0, m1
- HADDW m10, m2
+ HADDUW xm0, xm1
+ HADDW xm10, xm2
%endif
- movd r0d, m0
- movd eax, m10
+ movd r0d, xm0
+ movd eax, xm10
%endif
add r0d, 1
shl rax, 32
%define spill2 m10
%else
%define spill0 [rsp+gprsize]
- %define spill1 [rsp+gprsize+16]
- %define spill2 [rsp+gprsize+32]
+ %define spill1 [rsp+gprsize+mmsize]
+ %define spill2 [rsp+gprsize+mmsize*2]
%endif
%if HIGH_BIT_DEPTH
%define vertical 1
AC_PADD m1, m2, [pw_1]
ABSW m2, m7, m7
AC_PADD m1, m3, [pw_1]
- mova m3, m7
AC_PADD m1, m2, [pw_1]
- mova m2, m6
+ paddw m3, m7, spill2
psubw m7, spill2
- paddw m3, spill2
- mova [rsp+gprsize+32], m1 ; save satd
- mova m1, m5
+ mova [rsp+gprsize+mmsize*2], m1 ; save satd
+ paddw m2, m6, spill1
psubw m6, spill1
- paddw m2, spill1
+ paddw m1, m5, spill0
psubw m5, spill0
- paddw m1, spill0
%assign %%x 2
%if vertical
%assign %%x 4
ABSW m0, m0, m7
AC_PADD m2, m4, [pw_1]
AC_PADD m2, m0, [pw_1]
- mova [rsp+gprsize+16], m2 ; save sa8d
+ mova [rsp+gprsize+mmsize], m2 ; save sa8d
SWAP 0, 2
SAVE_MM_PERMUTATION
ret
HADAMARD_AC_WXH_SSE2 16, 16
-HADAMARD_AC_WXH_SSE2 8, 16
HADAMARD_AC_WXH_SSE2 16, 8
+%if mmsize <= 16
+HADAMARD_AC_WXH_SSE2 8, 16
HADAMARD_AC_WXH_SSE2 8, 8
+%endif
%endmacro ; HADAMARD_AC_SSE2
%macro HADAMARD_AC_WXH_SUM_SSE2 2
paddd m1, [rsp+8*mmsize]
psrld m0, 1
%endif
- HADDD m0, m2
- HADDD m1, m3
+ HADDD xm0, xm2
+ HADDD xm1, xm3
%else ; !HIGH_BIT_DEPTH
-%if %1*%2 >= 128
+%if %1*%2*16/mmsize >= 128
paddusw m0, [rsp+3*mmsize]
paddusw m1, [rsp+4*mmsize]
%endif
-%if %1*%2 == 256
+%if %1*%2*16/mmsize == 256
paddusw m0, [rsp+5*mmsize]
paddusw m1, [rsp+6*mmsize]
paddusw m0, [rsp+7*mmsize]
paddusw m1, [rsp+8*mmsize]
psrlw m0, 1
%endif
- HADDUW m0, m2
- HADDW m1, m3
+%if mmsize==32
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ paddusw xm0, xm2
+ paddusw xm1, xm3
+%endif
+ HADDUW xm0, xm2
+ HADDW xm1, xm3
%endif ; HIGH_BIT_DEPTH
%endmacro
; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
%macro HADAMARD_AC_WXH_SSE2 2
-cglobal pixel_hadamard_ac_%1x%2, 2,3,11
- %assign pad 16-gprsize-(stack_offset&15)
+cglobal pixel_hadamard_ac_%1x%2, 2,4,11
%define ysub r1
FIX_STRIDES r1
- sub rsp, 48+pad
- lea r2, [r1*3]
+ mov r3, rsp
+ and rsp, ~(mmsize-1)
+ sub rsp, mmsize*3
+ lea r2, [r1*3]
call hadamard_ac_8x8
%if %2==16
%define ysub r2
- lea r0, [r0+r1*4]
- sub rsp, 32
+ lea r0, [r0+r1*4]
+ sub rsp, mmsize*2
call hadamard_ac_8x8
%endif
-%if %1==16
+%if %1==16 && mmsize <= 16
neg ysub
- sub rsp, 32
- lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
+ sub rsp, mmsize*2
+ lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
neg ysub
call hadamard_ac_8x8
%if %2==16
- lea r0, [r0+r1*4]
- sub rsp, 32
+ lea r0, [r0+r1*4]
+ sub rsp, mmsize*2
call hadamard_ac_8x8
%endif
%endif
HADAMARD_AC_WXH_SUM_SSE2 %1, %2
- movd edx, m0
- movd eax, m1
- shr edx, 2 - (%1*%2 >> 8)
+ movd edx, xm0
+ movd eax, xm1
+ shr edx, 2 - (%1*%2*16/mmsize >> 8)
shr eax, 1
%if ARCH_X86_64
shl rdx, 32
add rax, rdx
%endif
- add rsp, 16+%1*%2/2+pad
+ mov rsp, r3
RET
%endmacro ; HADAMARD_AC_WXH_SSE2
%endif
HADAMARD_AC_SSE2
+
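+; the AVX2 hadamard_ac/sa8d_satd reuse the sse2 macro bodies; each 8x8 call
+; covers a 16x8 area (two 8x8 blocks, one per 128-bit lane), so only the
+; 16x16 and 16x8 sizes are instantiated here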
+%if HIGH_BIT_DEPTH == 0
+%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
+%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
+%define TRANS TRANS_SSE4
+INIT_YMM avx2
+HADAMARD_AC_SSE2
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+
+%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
+ movq xm%1, [r0]
+ movq xm%3, [r2]
+ movq xm%2, [r0+r1]
+ movq xm%4, [r2+r3]
+ vinserti128 m%1, m%1, [r0+4*r1], 1
+ vinserti128 m%3, m%3, [r2+4*r3], 1
+ vinserti128 m%2, m%2, [r0+r4], 1
+ vinserti128 m%4, m%4, [r2+r5], 1
+ punpcklqdq m%1, m%1
+ punpcklqdq m%3, m%3
+ punpcklqdq m%2, m%2
+ punpcklqdq m%4, m%4
+ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+
+ movq xm%3, [r0]
+ movq xm%5, [r2]
+ movq xm%4, [r0+r1]
+ movq xm%6, [r2+r3]
+ vinserti128 m%3, m%3, [r0+4*r1], 1
+ vinserti128 m%5, m%5, [r2+4*r3], 1
+ vinserti128 m%4, m%4, [r0+r4], 1
+ vinserti128 m%6, m%6, [r2+r5], 1
+ punpcklqdq m%3, m%3
+ punpcklqdq m%5, m%5
+ punpcklqdq m%4, m%4
+ punpcklqdq m%6, m%6
+ DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
+%endmacro
+
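+; %1 = satd accumulator, %2 = hmul constant, %3 = 1 for 8-pixel-wide blocks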
+%macro SATD_START_AVX2 2-3 0
+ FIX_STRIDES r1, r3
+%if %3
+ mova %2, [hmul_8p]
+ lea r4, [5*r1]
+ lea r5, [5*r3]
+%else
+ mova %2, [hmul_16p]
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+%endif
+ pxor %1, %1
+%endmacro
+
+%define TRANS TRANS_SSE4
+INIT_YMM avx2
+cglobal pixel_satd_16x8_internal
+ LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
+ LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
+ ret
+
+cglobal pixel_satd_16x16, 4,6,8
+ SATD_START_AVX2 m6, m7
+ call pixel_satd_16x8_internal
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+pixel_satd_16x8_internal:
+ call pixel_satd_16x8_internal
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ SATD_END_SSE2 xm0
+ RET
+
+cglobal pixel_satd_16x8, 4,6,8
+ SATD_START_AVX2 m6, m7
+ jmp pixel_satd_16x8_internal
+
+cglobal pixel_satd_8x8_internal
+ LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
+ ret
+
+cglobal pixel_satd_8x16, 4,6,8
+ SATD_START_AVX2 m6, m7, 1
+ call pixel_satd_8x8_internal
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call pixel_satd_8x8_internal
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ SATD_END_SSE2 xm0
+ RET
+
+cglobal pixel_satd_8x8, 4,6,8
+ SATD_START_AVX2 m6, m7, 1
+ call pixel_satd_8x8_internal
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ SATD_END_SSE2 xm0
+ RET
+
+cglobal pixel_sa8d_8x8_internal
+ LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
+ HADAMARD4_V 0, 1, 2, 3, 4
+ HADAMARD 8, sumsub, 0, 1, 4, 5
+ HADAMARD 8, sumsub, 2, 3, 4, 5
+ HADAMARD 2, sumsub, 0, 1, 4, 5
+ HADAMARD 2, sumsub, 2, 3, 4, 5
+ HADAMARD 1, amax, 0, 1, 4, 5
+ HADAMARD 1, amax, 2, 3, 4, 5
+ paddw m6, m0
+ paddw m6, m2
+ ret
+
+cglobal pixel_sa8d_8x8, 4,6,8
+ SATD_START_AVX2 m6, m7, 1
+ call pixel_sa8d_8x8_internal
+ vextracti128 xm1, m6, 1
+ paddw xm6, xm1
+ HADDW xm6, xm1
+ movd eax, xm6
+ add eax, 1
+ shr eax, 1
+ RET
+
+cglobal intra_sad_x9_8x8, 5,7,8
+ %define pred(i,j) [rsp+i*0x40+j*0x20]
+
+ mov r6, rsp
+ and rsp, ~31
+ SUB rsp, 0x240
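+ ; 0x240 bytes = 9 candidate predictions * 0x40 each (two 32-byte halves, see pred())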
+ movu m5, [r0+0*FENC_STRIDE]
+ movu m6, [r0+4*FENC_STRIDE]
+ punpcklqdq m5, [r0+2*FENC_STRIDE]
+ punpcklqdq m6, [r0+6*FENC_STRIDE]
+
+ ; save instruction size: avoid 4-byte memory offsets
+ lea r0, [intra8x9_h1+128]
+ %define off(m) (r0+m-(intra8x9_h1+128))
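+ ; (tables within +/-128 bytes of intra8x9_h1+128 are then reachable with a 1-byte displacement)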
+
+ vpbroadcastq m0, [r2+16]
+ psadbw m4, m0, m5
+ psadbw m2, m0, m6
+ mova pred(0,0), m0
+ mova pred(0,1), m0
+ paddw m4, m2
+
+ vpbroadcastq m1, [r2+7]
+ pshufb m3, m1, [off(intra8x9_h1)]
+ pshufb m2, m1, [off(intra8x9_h3)]
+ mova pred(1,0), m3
+ mova pred(1,1), m2
+ psadbw m3, m5
+ psadbw m2, m6
+ paddw m3, m2
+
+ lea r5, [rsp+0x100]
+ %define pred(i,j) [r5+i*0x40+j*0x20-0x100]
+
+ ; combine the first two
+ pslldq m3, 2
+ por m4, m3
+
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ psrlw m0, 3
+ pavgw m0, m2
+ pshufb m0, m2
+ mova pred(2,0), m0
+ mova pred(2,1), m0
+ psadbw m3, m0, m5
+ psadbw m2, m0, m6
+ paddw m3, m2
+
+ pslldq m3, 4
+ por m4, m3
+
+ vbroadcasti128 m0, [r2+16]
+ vbroadcasti128 m2, [r2+17]
+ pslldq m1, m0, 1
+ pavgb m3, m0, m2
+ PRED4x4_LOWPASS m0, m1, m2, m0, m7
+ pshufb m1, m0, [off(intra8x9_ddl1)]
+ pshufb m2, m0, [off(intra8x9_ddl3)]
+ mova pred(3,0), m1
+ mova pred(3,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 6
+ por m4, m1
+ vextracti128 xm1, m4, 1
+ paddw xm4, xm1
+ mova [r4], xm4
+
+ ; for later: keep the avg (low lane) / lowpass (high lane) of the top edge for the VL prediction below
+ vinserti128 m7, m3, xm0, 1
+
+ vbroadcasti128 m2, [r2+8]
+ vbroadcasti128 m0, [r2+7]
+ vbroadcasti128 m1, [r2+6]
+ pavgb m3, m2, m0
+ PRED4x4_LOWPASS m0, m1, m2, m0, m4
+ pshufb m1, m0, [off(intra8x9_ddr1)]
+ pshufb m2, m0, [off(intra8x9_ddr3)]
+ mova pred(4,0), m1
+ mova pred(4,1), m2
+ psadbw m4, m1, m5
+ psadbw m2, m6
+ paddw m4, m2
+
+ add r0, 256
+ add r5, 0xC0
+ %define off(m) (r0+m-(intra8x9_h1+256+128))
+ %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0]
+
+ vpblendd m2, m3, m0, 11110011b
+ pshufb m1, m2, [off(intra8x9_vr1)]
+ pshufb m2, m2, [off(intra8x9_vr3)]
+ mova pred(5,0), m1
+ mova pred(5,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 2
+ por m4, m1
+
+ psrldq m2, m3, 4
+ pblendw m2, m0, q3330
+ punpcklbw m0, m3
+ pshufb m1, m2, [off(intra8x9_hd1)]
+ pshufb m2, m0, [off(intra8x9_hd3)]
+ mova pred(6,0), m1
+ mova pred(6,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 4
+ por m4, m1
+
+ pshufb m1, m7, [off(intra8x9_vl1)]
+ pshufb m2, m7, [off(intra8x9_vl3)]
+ mova pred(7,0), m1
+ mova pred(7,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 6
+ por m4, m1
+ vextracti128 xm1, m4, 1
+ paddw xm4, xm1
+ mova xm3, [r4]
+ SBUTTERFLY qdq, 3, 4, 7
+ paddw xm3, xm4
+
+ pslldq m1, m0, 1
+ vpbroadcastd m0, [r2+7]
+ palignr m0, m1, 1
+ pshufb m1, m0, [off(intra8x9_hu1)]
+ pshufb m2, m0, [off(intra8x9_hu3)]
+ mova pred(8,0), m1
+ mova pred(8,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+ vextracti128 xm2, m1, 1
+ paddw xm1, xm2
+ movhlps xm2, xm1
+ paddw xm1, xm2
+ movd r2d, xm1
+
+ paddw xm3, [r3]
+ mova [r4], xm3
+ add r2w, word [r3+16]
+ mov [r4+16], r2w
+
+ phminposuw xm3, xm3
+ movd r3d, xm3
+ add r2d, 8<<16
+ cmp r3w, r2w
+ cmovg r3d, r2d
+
+ mov r2d, r3d
+ shr r3, 16
+ shl r3, 6
+ add r1, 4*FDEC_STRIDE
+ mova xm0, [rsp+r3+0x00]
+ mova xm1, [rsp+r3+0x10]
+ mova xm2, [rsp+r3+0x20]
+ mova xm3, [rsp+r3+0x30]
+ movq [r1+FDEC_STRIDE*-4], xm0
+ movhps [r1+FDEC_STRIDE*-2], xm0
+ movq [r1+FDEC_STRIDE*-3], xm1
+ movhps [r1+FDEC_STRIDE*-1], xm1
+ movq [r1+FDEC_STRIDE* 0], xm2
+ movhps [r1+FDEC_STRIDE* 2], xm2
+ movq [r1+FDEC_STRIDE* 1], xm3
+ movhps [r1+FDEC_STRIDE* 3], xm3
+ mov rsp, r6
+ mov eax, r2d
+ RET
+%endif ; HIGH_BIT_DEPTH
+
;=============================================================================
; SSIM
;=============================================================================
%macro ADS_START 0
%if UNIX64
- movsxd r5, r5d
+ movsxd r5, r5d
%else
- mov r5d, r5m
+ mov r5d, r5m
%endif
- mov r0d, r5d
- lea r6, [r4+r5+15]
- and r6, ~15;
+ mov r0d, r5d
+ lea r6, [r4+r5+(mmsize-1)]
+ and r6, ~(mmsize-1)
shl r2d, 1
%endmacro
add r1, 8*%1
add r3, 8*%1
add r6, 4*%1
- sub r0d, 4*%1
+ sub r0d, 4*%1
jg .loop
WIN64_RESTORE_XMM rsp
+%if mmsize==32
+ vzeroupper
+%endif
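+ ; r6 has been advanced through the loop, so recompute the masks pointer before
+ ; tail-calling ads_mvs (previously ads_mvs recomputed it itself; the alignment
+ ; now depends on mmsize)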
+ lea r6, [r4+r5+(mmsize-1)]
+ and r6, ~(mmsize-1)
jmp ads_mvs
%endmacro
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_ads4, 5,7
- movq mm6, [r0]
- movq mm4, [r0+8]
- pshufw mm7, mm6, 0
- pshufw mm6, mm6, q2222
- pshufw mm5, mm4, 0
- pshufw mm4, mm4, q2222
+ mova m6, [r0]
+ mova m4, [r0+8]
+ pshufw m7, m6, 0
+ pshufw m6, m6, q2222
+ pshufw m5, m4, 0
+ pshufw m4, m4, q2222
ADS_START
.loop:
- movq mm0, [r1]
- movq mm1, [r1+16]
- psubw mm0, mm7
- psubw mm1, mm6
- ABSW mm0, mm0, mm2
- ABSW mm1, mm1, mm3
- movq mm2, [r1+r2]
- movq mm3, [r1+r2+16]
- psubw mm2, mm5
- psubw mm3, mm4
- paddw mm0, mm1
- ABSW mm2, mm2, mm1
- ABSW mm3, mm3, mm1
- paddw mm0, mm2
- paddw mm0, mm3
- pshufw mm1, r6m, 0
- paddusw mm0, [r3]
- psubusw mm1, mm0
- packsswb mm1, mm1
- movd [r6], mm1
+ movu m0, [r1]
+ movu m1, [r1+16]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ movu m2, [r1+r2]
+ movu m3, [r1+r2+16]
+ psubw m2, m5
+ psubw m3, m4
+ paddw m0, m1
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+ pshufw m1, r6m, 0
+ paddusw m0, [r3]
+ psubusw m1, m0
+ packsswb m1, m1
+ movd [r6], m1
ADS_END 1
cglobal pixel_ads2, 5,7
- movq mm6, [r0]
- pshufw mm5, r6m, 0
- pshufw mm7, mm6, 0
- pshufw mm6, mm6, q2222
+ mova m6, [r0]
+ pshufw m5, r6m, 0
+ pshufw m7, m6, 0
+ pshufw m6, m6, q2222
ADS_START
.loop:
- movq mm0, [r1]
- movq mm1, [r1+r2]
- psubw mm0, mm7
- psubw mm1, mm6
- ABSW mm0, mm0, mm2
- ABSW mm1, mm1, mm3
- paddw mm0, mm1
- paddusw mm0, [r3]
- movq mm4, mm5
- psubusw mm4, mm0
- packsswb mm4, mm4
- movd [r6], mm4
+ movu m0, [r1]
+ movu m1, [r1+r2]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ paddw m0, m1
+ paddusw m0, [r3]
+ mova m4, m5
+ psubusw m4, m0
+ packsswb m4, m4
+ movd [r6], m4
ADS_END 1
cglobal pixel_ads1, 5,7
- pshufw mm7, [r0], 0
- pshufw mm6, r6m, 0
+ pshufw m7, [r0], 0
+ pshufw m6, r6m, 0
ADS_START
.loop:
- movq mm0, [r1]
- movq mm1, [r1+8]
- psubw mm0, mm7
- psubw mm1, mm7
- ABSW mm0, mm0, mm2
- ABSW mm1, mm1, mm3
- paddusw mm0, [r3]
- paddusw mm1, [r3+8]
- movq mm4, mm6
- movq mm5, mm6
- psubusw mm4, mm0
- psubusw mm5, mm1
- packsswb mm4, mm5
- movq [r6], mm4
+ movu m0, [r1]
+ movu m1, [r1+8]
+ psubw m0, m7
+ psubw m1, m7
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ paddusw m0, [r3]
+ paddusw m1, [r3+8]
+ mova m4, m6
+ mova m5, m6
+ psubusw m4, m0
+ psubusw m5, m1
+ packsswb m4, m5
+ mova [r6], m4
ADS_END 2
%macro ADS_XMM 0
+%if mmsize==32
+cglobal pixel_ads4, 5,7,8
+ vpbroadcastw m7, [r0+ 0]
+ vpbroadcastw m6, [r0+ 4]
+ vpbroadcastw m5, [r0+ 8]
+ vpbroadcastw m4, [r0+12]
+%else
cglobal pixel_ads4, 5,7,12
- movdqa xmm4, [r0]
- pshuflw xmm7, xmm4, 0
- pshuflw xmm6, xmm4, q2222
- pshufhw xmm5, xmm4, 0
- pshufhw xmm4, xmm4, q2222
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
- punpckhqdq xmm5, xmm5
- punpckhqdq xmm4, xmm4
-%if ARCH_X86_64
- movd xmm8, r6m
- SPLATW xmm8, xmm8
+ mova m4, [r0]
+ pshuflw m7, m4, q0000
+ pshuflw m6, m4, q2222
+ pshufhw m5, m4, q0000
+ pshufhw m4, m4, q2222
+ punpcklqdq m7, m7
+ punpcklqdq m6, m6
+ punpckhqdq m5, m5
+ punpckhqdq m4, m4
+%endif
+%if ARCH_X86_64 && mmsize == 16
+ movd m8, r6m
+ SPLATW m8, m8
ADS_START
- movdqu xmm10, [r1]
- movdqu xmm11, [r1+r2]
+ movu m10, [r1]
+ movu m11, [r1+r2]
.loop:
- psubw xmm0, xmm10, xmm7
- movdqu xmm10, [r1+16]
- psubw xmm1, xmm10, xmm6
- ABSW xmm0, xmm0, xmm2
- ABSW xmm1, xmm1, xmm3
- psubw xmm2, xmm11, xmm5
- movdqu xmm11, [r1+r2+16]
- paddw xmm0, xmm1
- psubw xmm3, xmm11, xmm4
- movdqu xmm9, [r3]
- ABSW xmm2, xmm2, xmm1
- ABSW xmm3, xmm3, xmm1
- paddw xmm0, xmm2
- paddw xmm0, xmm3
- paddusw xmm0, xmm9
- psubusw xmm1, xmm8, xmm0
- packsswb xmm1, xmm1
- movq [r6], xmm1
+ psubw m0, m10, m7
+ movu m10, [r1+16]
+ psubw m1, m10, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ psubw m2, m11, m5
+ movu m11, [r1+r2+16]
+ paddw m0, m1
+ psubw m3, m11, m4
+ movu m9, [r3]
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddusw m0, m9
+ psubusw m1, m8, m0
%else
ADS_START
.loop:
- movdqu xmm0, [r1]
- movdqu xmm1, [r1+16]
- psubw xmm0, xmm7
- psubw xmm1, xmm6
- ABSW xmm0, xmm0, xmm2
- ABSW xmm1, xmm1, xmm3
- movdqu xmm2, [r1+r2]
- movdqu xmm3, [r1+r2+16]
- psubw xmm2, xmm5
- psubw xmm3, xmm4
- paddw xmm0, xmm1
- ABSW xmm2, xmm2, xmm1
- ABSW xmm3, xmm3, xmm1
- paddw xmm0, xmm2
- paddw xmm0, xmm3
- movd xmm1, r6m
- movdqu xmm2, [r3]
- pshuflw xmm1, xmm1, 0
- punpcklqdq xmm1, xmm1
- paddusw xmm0, xmm2
- psubusw xmm1, xmm0
- packsswb xmm1, xmm1
- movq [r6], xmm1
+ movu m0, [r1]
+ movu m1, [r1+16]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ movu m2, [r1+r2]
+ movu m3, [r1+r2+16]
+ psubw m2, m5
+ psubw m3, m4
+ paddw m0, m1
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+ movu m2, [r3]
+%if mmsize==32
+ vpbroadcastw m1, r6m
+%else
+ movd m1, r6m
+ pshuflw m1, m1, 0
+ punpcklqdq m1, m1
+%endif
+ paddusw m0, m2
+ psubusw m1, m0
%endif ; ARCH
- ADS_END 2
+ packsswb m1, m1
+%if mmsize==32
+ vpermq m1, m1, q3120
+ mova [r6], xm1
+%else
+ movh [r6], m1
+%endif
+ ADS_END mmsize/8
cglobal pixel_ads2, 5,7,8
- movq xmm6, [r0]
- movd xmm5, r6m
- pshuflw xmm7, xmm6, 0
- pshuflw xmm6, xmm6, q2222
- pshuflw xmm5, xmm5, 0
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
- punpcklqdq xmm5, xmm5
+%if mmsize==32
+ vpbroadcastw m7, [r0+0]
+ vpbroadcastw m6, [r0+4]
+ vpbroadcastw m5, r6m
+%else
+ movq m6, [r0]
+ movd m5, r6m
+ pshuflw m7, m6, 0
+ pshuflw m6, m6, q2222
+ pshuflw m5, m5, 0
+ punpcklqdq m7, m7
+ punpcklqdq m6, m6
+ punpcklqdq m5, m5
+%endif
ADS_START
.loop:
- movdqu xmm0, [r1]
- movdqu xmm1, [r1+r2]
- psubw xmm0, xmm7
- psubw xmm1, xmm6
- movdqu xmm4, [r3]
- ABSW xmm0, xmm0, xmm2
- ABSW xmm1, xmm1, xmm3
- paddw xmm0, xmm1
- paddusw xmm0, xmm4
- psubusw xmm1, xmm5, xmm0
- packsswb xmm1, xmm1
- movq [r6], xmm1
- ADS_END 2
+ movu m0, [r1]
+ movu m1, [r1+r2]
+ psubw m0, m7
+ psubw m1, m6
+ movu m4, [r3]
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ paddw m0, m1
+ paddusw m0, m4
+ psubusw m1, m5, m0
+ packsswb m1, m1
+%if mmsize==32
+ vpermq m1, m1, q3120
+ mova [r6], xm1
+%else
+ movh [r6], m1
+%endif
+ ADS_END mmsize/8
cglobal pixel_ads1, 5,7,8
- movd xmm7, [r0]
- movd xmm6, r6m
- pshuflw xmm7, xmm7, 0
- pshuflw xmm6, xmm6, 0
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
+%if mmsize==32
+ vpbroadcastw m7, [r0]
+ vpbroadcastw m6, r6m
+%else
+ movd m7, [r0]
+ movd m6, r6m
+ pshuflw m7, m7, 0
+ pshuflw m6, m6, 0
+ punpcklqdq m7, m7
+ punpcklqdq m6, m6
+%endif
ADS_START
.loop:
- movdqu xmm0, [r1]
- movdqu xmm1, [r1+16]
- psubw xmm0, xmm7
- psubw xmm1, xmm7
- movdqu xmm2, [r3]
- movdqu xmm3, [r3+16]
- ABSW xmm0, xmm0, xmm4
- ABSW xmm1, xmm1, xmm5
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
- psubusw xmm4, xmm6, xmm0
- psubusw xmm5, xmm6, xmm1
- packsswb xmm4, xmm5
- movdqa [r6], xmm4
- ADS_END 4
+ movu m0, [r1]
+ movu m1, [r1+mmsize]
+ psubw m0, m7
+ psubw m1, m7
+ movu m2, [r3]
+ movu m3, [r3+mmsize]
+ ABSW m0, m0, m4
+ ABSW m1, m1, m5
+ paddusw m0, m2
+ paddusw m1, m3
+ psubusw m4, m6, m0
+ psubusw m5, m6, m1
+ packsswb m4, m5
+%if mmsize==32
+ vpermq m4, m4, q3120
+%endif
+ mova [r6], m4
+ ADS_END mmsize/4
%endmacro
INIT_XMM sse2
ADS_XMM
INIT_XMM avx
ADS_XMM
+INIT_YMM avx2
+ADS_XMM
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
INIT_MMX
cglobal pixel_ads_mvs, 0,7,0
ads_mvs:
- lea r6, [r4+r5+15]
- and r6, ~15;
; mvs = r4
; masks = r6
; width = r5
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
DECL_X4( sad, ssse3 )
+DECL_X4( sad, avx2 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
DECL_X1( ssd, sse2slow )
DECL_X1( ssd, ssse3 )
DECL_X1( ssd, avx )
DECL_X1( ssd, xop )
+DECL_X1( ssd, avx2 )
DECL_X1( satd, mmx2 )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
DECL_X1( satd, xop )
+DECL_X1( satd, avx2 )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
DECL_X1( sa8d, xop )
+DECL_X1( sa8d, avx2 )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, avx2, ( pixel *pix, intptr_t i_stride ))
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_16x16_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_16x16_avx2 ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
+int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
DECL_ADS( 4, avx )
DECL_ADS( 2, avx )
DECL_ADS( 1, avx )
+DECL_ADS( 4, avx2 )
+DECL_ADS( 2, avx2 )
+DECL_ADS( 1, avx2 )
#undef DECL_PIXELS
#undef DECL_X1
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
dw %1, %4, %5, %4, %1, %4, %5, %4
dw %4, %2, %6, %2, %4, %2, %6, %2
dw %5, %6, %3, %6, %5, %6, %3, %6
- ; last line not used, just padding for power-of-2 stride
- times 8 dw 0
+ dw %4, %2, %6, %2, %4, %2, %6, %2
%endmacro
dequant4_scale:
cextern pd_1
cextern pb_01
cextern pd_1024
-
-%macro QUANT_DC_START 0
- movd m6, r1m ; mf
- movd m7, r2m ; bias
-%if HIGH_BIT_DEPTH
- SPLATD m6, m6
- SPLATD m7, m7
+cextern deinterleave_shufd
+
+%macro QUANT_DC_START 2
+ movd xm%1, r1m ; mf
+ movd xm%2, r2m ; bias
+%if cpuflag(avx2)
+ vpbroadcastdct m%1, xm%1
+ vpbroadcastdct m%2, xm%2
+%elif HIGH_BIT_DEPTH
+ SPLATD m%1, m%1
+ SPLATD m%2, m%2
%elif cpuflag(sse4) ; ssse3, but not faster on conroe
mova m5, [pb_01]
- pshufb m6, m5
- pshufb m7, m5
+ pshufb m%1, m5
+ pshufb m%2, m5
%else
- SPLATW m6, m6
- SPLATW m7, m7
+ SPLATW m%1, m%1
+ SPLATW m%2, m%2
%endif
%endmacro
;-----------------------------------------------------------------------------
%macro QUANT_DC 2
cglobal quant_%1x%2_dc, 3,3,8
- QUANT_DC_START
+ QUANT_DC_START 6,7
%if %1*%2 <= mmsize/4
QUANT_ONE_DC r0, m6, m7, 0
%else
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
-%macro QUANT_ONE 4
+%macro QUANT_ONE 5
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
pmulhuw m0, %2 ; divide
PSIGNW m0, m1 ; restore sign
mova %1, m0 ; store
- ACCUM por, 5, 0, %4
+ ACCUM por, %5, 0, %4
%endmacro
%macro QUANT_TWO 8
;-----------------------------------------------------------------------------
%macro QUANT_DC 2-3 0
cglobal %1, 1,1,%3
- QUANT_DC_START
%if %2==1
- QUANT_ONE [r0], m6, m7, 0
+ QUANT_DC_START 2,3
+ QUANT_ONE [r0], m2, m3, 0, 5
%else
+ QUANT_DC_START 4,6
%assign x 0
%rep %2/2
- QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x, 5
+ QUANT_TWO [r0+x], [r0+x+mmsize], m4, m4, m6, m6, x, 5
%assign x x+mmsize*2
%endrep
%endif
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal %1, 3,3
+%if %2==1
+ QUANT_ONE [r0], [r1], [r2], 0, 5
+%else
%assign x 0
%rep %2/2
QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
%assign x x+mmsize*2
%endrep
+%endif
QUANT_END
RET
%endmacro
%endif
INIT_XMM sse2
-QUANT_DC quant_4x4_dc, 2, 8
+QUANT_DC quant_4x4_dc, 2, 7
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_4x4x4
INIT_XMM ssse3
-QUANT_DC quant_4x4_dc, 2, 8
+QUANT_DC quant_4x4_dc, 2, 7
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_4x4x4
INIT_XMM sse4
;Not faster on Conroe, so only used in SSE4 versions
-QUANT_DC quant_4x4_dc, 2, 8
+QUANT_DC quant_4x4_dc, 2, 7
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
+
+INIT_YMM avx2
+QUANT_AC quant_4x4, 1
+QUANT_AC quant_8x8, 4
+QUANT_DC quant_4x4_dc, 1, 6
+
+INIT_YMM avx2
+cglobal quant_4x4x4, 3,3,7
+ mova m2, [r1]
+ mova m3, [r2]
+ QUANT_ONE [r0+ 0], m2, m3, 0, 4
+ QUANT_ONE [r0+32], m2, m3, 0, 5
+ packssdw m4, m5
+ QUANT_ONE [r0+64], m2, m3, 0, 5
+ QUANT_ONE [r0+96], m2, m3, 0, 6
+ packssdw m5, m6
+ packssdw m4, m5
+ vextracti128 xm5, m4, 1
+ por xm4, xm5
+ packssdw xm4, xm4
+ packsswb xm4, xm4
+ pxor xm3, xm3
+ pcmpeqb xm4, xm3
+ pmovmskb eax, xm4
+ not eax
+ and eax, 0xf
+ RET
%endif ; !HIGH_BIT_DEPTH
; dequant
;=============================================================================
-%macro DEQUANT16_L 3
+%macro DEQUANT16_L 4
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 i_qbits
pslld m0, m2
%else
packssdw m0, %3
+%if mmsize==32
+ vpermq m0, m0, q3120
+%endif
pmullw m0, %1
- psllw m0, m2
+ psllw m0, xm2
%endif
mova %1, m0
%endmacro
-%macro DEQUANT32_R 3
+%macro DEQUANT32_R 4
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 -i_qbits
;;; m3 f
;;; m4 0
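+;;; %4 dct[y][x] second half (only referenced by the 32-byte AVX2 path)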
+%if mmsize==32
+ pmovzxwd m0, %1
+ pmovzxwd m1, %4
+ pmaddwd m0, %2
+ pmaddwd m1, %3
+ paddd m0, m3
+ paddd m1, m3
+ psrad m0, xm2
+ psrad m1, xm2
+ packssdw m0, m1
+ vpermq m0, m0, q3120
+%else
mova m0, %1
%if HIGH_BIT_DEPTH
pmadcswd m0, m0, %2, m3
psrad m0, m2
psrad m1, m2
packssdw m0, m1
+%endif
%endif
mova %1, m0
%endmacro
%macro DEQUANT_LOOP 3
-%if 8*(%2-2*%3)
+%if 8*(%2-2*%3) > 0
mov t0d, 8*(%2-2*%3)
%%loop:
- %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3]
- %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
+ %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL]
+ %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL]
sub t0d, 16*%3
jge %%loop
RET
%else
- %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
- %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
+%if mmsize < 32
+ %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL]
+%endif
+ %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL]
RET
%endif
%endmacro
DEQUANT_START %2+2, %2
.lshift:
- movd m2, t0d
+ movd xm2, t0d
DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
.rshift32:
neg t0d
- movd m2, t0d
mova m3, [pd_1]
+ movd xm2, t0d
+ pslld m3, xm2
pxor m4, m4
- pslld m3, m2
psrld m3, 1
DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
-%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx)
+%if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32)
cglobal dequant_%1x%1_flat16, 0,3
movifnidn t2d, r2m
%if %1 == 8
lea r1, [dequant%1_scale + t2]
%endif
movifnidn r0, r0mp
- movd m4, t0d
+ movd xm4, t0d
%if %1 == 4
%if mmsize == 8
DEQUANT16_FLAT [r1], 0, 16
DEQUANT16_FLAT [r1+8], 8, 24
-%else
+%elif mmsize == 16
DEQUANT16_FLAT [r1], 0, 16
+%else
+ vbroadcasti128 m0, [r1]
+ psllw m0, xm4
+ pmullw m0, [r0]
+ mova [r0], m0
%endif
%elif mmsize == 8
DEQUANT16_FLAT [r1], 0, 8, 64, 72
DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
-%else
+%elif mmsize == 16
DEQUANT16_FLAT [r1], 0, 64
DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
DEQUANT16_FLAT [r1+32], 32, 96
+%else
+ mova m1, [r1+ 0]
+ mova m2, [r1+32]
+ psllw m1, xm4
+ psllw m2, xm4
+ pmullw m0, m1, [r0+ 0]
+ pmullw m3, m2, [r0+32]
+ pmullw m4, m1, [r0+64]
+ pmullw m5, m2, [r0+96]
+ mova [r0+ 0], m0
+ mova [r0+32], m3
+ mova [r0+64], m4
+ mova [r0+96], m5
%endif
RET
%endif ; !HIGH_BIT_DEPTH && !AVX
INIT_XMM xop
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
+INIT_YMM avx2
+DEQUANT 4, 4, 4
+DEQUANT 8, 6, 4
%endif
%macro DEQUANT_DC 2
INIT_XMM avx
DENOISE_DCT
+INIT_YMM avx2
+cglobal denoise_dct, 4,4,4
+ pxor m3, m3
+ movsxdifnidn r3, r3d
+.loop:
+ mova m1, [r0+r3*2-mmsize]
+ pabsw m0, m1
+ psubusw m2, m0, [r2+r3*2-mmsize]
+ vpermq m0, m0, q3120
+ psignw m2, m1
+ mova [r0+r3*2-mmsize], m2
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m1, [r1+r3*4-2*mmsize]
+ paddd m0, [r1+r3*4-1*mmsize]
+ mova [r1+r3*4-2*mmsize], m1
+ mova [r1+r3*4-1*mmsize], m0
+ sub r3, mmsize/2
+ jg .loop
+ RET
+
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
%macro DECIMATE_MASK 5
%if mmsize==16
%if HIGH_BIT_DEPTH
- movdqa xmm0, [%3+ 0]
- movdqa xmm1, [%3+32]
- packssdw xmm0, [%3+16]
- packssdw xmm1, [%3+48]
- ABSW2 xmm0, xmm1, xmm0, xmm1, xmm3, xmm4
+ movdqa m0, [%3+ 0]
+ movdqa m1, [%3+32]
+ packssdw m0, [%3+16]
+ packssdw m1, [%3+48]
+ ABSW2 m0, m1, m0, m1, m3, m4
%else
- ABSW xmm0, [%3+ 0], xmm3
- ABSW xmm1, [%3+16], xmm4
+ ABSW m0, [%3+ 0], m3
+ ABSW m1, [%3+16], m4
%endif
- packsswb xmm0, xmm1
- pxor xmm2, xmm2
- pcmpeqb xmm2, xmm0
- pcmpgtb xmm0, %4
- pmovmskb %1, xmm2
- pmovmskb %2, xmm0
-
+ packsswb m0, m1
+ pxor m2, m2
+ pcmpeqb m2, m0
+ pcmpgtb m0, %4
+ pmovmskb %1, m2
+ pmovmskb %2, m0
%else ; mmsize==8
%if HIGH_BIT_DEPTH
- movq mm0, [%3+ 0]
- movq mm1, [%3+16]
- movq mm2, [%3+32]
- movq mm3, [%3+48]
- packssdw mm0, [%3+ 8]
- packssdw mm1, [%3+24]
- packssdw mm2, [%3+40]
- packssdw mm3, [%3+56]
+ movq m0, [%3+ 0]
+ movq m1, [%3+16]
+ movq m2, [%3+32]
+ movq m3, [%3+48]
+ packssdw m0, [%3+ 8]
+ packssdw m1, [%3+24]
+ packssdw m2, [%3+40]
+ packssdw m3, [%3+56]
%else
- movq mm0, [%3+ 0]
- movq mm1, [%3+ 8]
- movq mm2, [%3+16]
- movq mm3, [%3+24]
-%endif
- ABSW2 mm0, mm1, mm0, mm1, mm6, mm7
- ABSW2 mm2, mm3, mm2, mm3, mm6, mm7
- packsswb mm0, mm1
- packsswb mm2, mm3
- pxor mm4, mm4
- pxor mm6, mm6
- pcmpeqb mm4, mm0
- pcmpeqb mm6, mm2
- pcmpgtb mm0, %4
- pcmpgtb mm2, %4
- pmovmskb %5, mm4
- pmovmskb %1, mm6
- shl %1, 8
- or %1, %5
- pmovmskb %5, mm0
- pmovmskb %2, mm2
- shl %2, 8
- or %2, %5
+ movq m0, [%3+ 0]
+ movq m1, [%3+ 8]
+ movq m2, [%3+16]
+ movq m3, [%3+24]
+%endif
+ ABSW2 m0, m1, m0, m1, m6, m7
+ ABSW2 m2, m3, m2, m3, m6, m7
+ packsswb m0, m1
+ packsswb m2, m3
+ pxor m4, m4
+ pxor m6, m6
+ pcmpeqb m4, m0
+ pcmpeqb m6, m2
+ pcmpgtb m0, %4
+ pcmpgtb m2, %4
+ pmovmskb %5, m4
+ pmovmskb %1, m6
+ shl %1, 8
+ or %1, %5
+ pmovmskb %5, m0
+ pmovmskb %2, m2
+ shl %2, 8
+ or %2, %5
%endif
%endmacro
%macro DECIMATE4x4 1
-;A LUT is faster than bsf on older AMD processors.
-;This is not true for score64.
cglobal decimate_score%1, 1,3
%ifdef PIC
lea r4, [decimate_table4]
%if %1==15
shr edx, 1
%endif
-%if cpuflag(slowctz)
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
cmp edx, ecx
bsr ecx, ecx
shr edx, 1
shr edx, cl
- bsf ecx, edx
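+ ; tzcnt encodes as rep bsf, so it still executes (as bsf) on CPUs without BMI1;
+ ; this presumably makes the separate slowctz variants unnecessary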
+ tzcnt ecx, edx
shr edx, 1
shr edx, cl
add al, byte [table + rcx]
add al, byte [mask_table + rdx]
-%else
-.loop:
- tzcnt ecx, edx
- shr edx, cl
- add al, byte [table + rcx]
- shr edx, 1
- jne .loop
-%endif
.ret:
REP_RET
.ret9:
INIT_MMX mmx2
DECIMATE4x4 15
DECIMATE4x4 16
-INIT_MMX mmx2, slowctz
-DECIMATE4x4 15
-DECIMATE4x4 16
%endif
INIT_XMM sse2
DECIMATE4x4 15
DECIMATE4x4 16
-INIT_XMM sse2, slowctz
-DECIMATE4x4 15
-DECIMATE4x4 16
INIT_XMM ssse3
DECIMATE4x4 15
DECIMATE4x4 16
-INIT_XMM ssse3, slowctz
-DECIMATE4x4 15
-DECIMATE4x4 16
+
+; 2x gt1 output, 2x nz output, 1x mask
+%macro DECIMATE_MASK64_AVX2 5
+ pabsw m0, [r0+ 0]
+ pabsw m2, [r0+32]
+ pabsw m1, [r0+64]
+ pabsw m3, [r0+96]
+ packsswb m0, m2
+ packsswb m1, m3
+ pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so
+ pcmpgtb m3, m1, %5 ; we can save latency by doing them here
+ pmovmskb %1, m2
+ pmovmskb %2, m3
+ or %1, %2
+ jne .ret9
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ pxor m4, m4
+ pcmpeqb m0, m4
+ pcmpeqb m1, m4
+ pmovmskb %3, m0
+ pmovmskb %4, m1
+%endmacro
%macro DECIMATE8x8 0
%define table decimate_table8
%endif
mova m5, [pb_1]
+%if mmsize==32
+ DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5
+ shl r3, 32
+ or r1, r3
+ xor r1, -1
+ je .ret
+%else
DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
- test eax, eax
+ test eax, eax
jne .ret9
DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
- shl r2d, 16
- or r1d, r2d
+ shl r2d, 16
+ or r1d, r2d
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
shl r2, 32
- or eax, r3d
+ or eax, r3d
or r1, r2
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
shl r2, 48
or r1, r2
xor r1, -1
je .ret
- add eax, r3d
+ add eax, r3d
jne .ret9
+%endif
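+ ; bias the score by -6 so that jge can branch to .ret9 as soon as the real
+ ; score reaches the decimate threshold of 6; the bias is undone before .ret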
+ mov al, -6
.loop:
tzcnt rcx, r1
shr r1, cl
add al, byte [table + rcx]
+ jge .ret9
shr r1, 1
jne .loop
+ add al, 6
.ret:
REP_RET
.ret9:
- mov eax, 9
+ mov eax, 9
RET
%else ; ARCH
cglobal decimate_score64, 1,5
%endif
mova m5, [pb_1]
+%if mmsize==32
+ DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5
+ xor r3, -1
+ je .tryret
+ xor r4, -1
+.cont:
+%else
DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
test r2, r2
jne .ret9
xor r4, -1
.cont:
add r0, r2
- jne .ret9 ;r0 is zero at this point, so we don't need to zero it
+ jne .ret9
+%endif
+ mov al, -6
.loop:
tzcnt ecx, r3
test r3, r3
je .largerun
shrd r3, r4, cl
shr r4, cl
- add r0b, byte [decimate_table8 + ecx]
+ add al, byte [decimate_table8 + ecx]
+ jge .ret9
shrd r3, r4, 1
shr r4, 1
- cmp r0, 6 ;score64's threshold is never higher than 6
- jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
test r3, r3
jne .loop
test r4, r4
jne .loop
+ add al, 6
.ret:
REP_RET
.tryret:
shr r3, cl
shr r3, 1
jne .loop
+ add al, 6
RET
%endif ; ARCH
DECIMATE8x8
INIT_XMM ssse3
DECIMATE8x8
+INIT_YMM avx2
+DECIMATE8x8
;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
RET
%if ARCH_X86_64 == 0
-cglobal coeff_last64, 1, 5-mmsize/16
+cglobal coeff_last64, 1, 4-mmsize/16
pxor m2, m2
- LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
- LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d
- shl r3d, 16
- or r2d, r3d
- xor r2d, -1
+ LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d
+ LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d
+ shl r2d, 16
+ or r1d, r2d
+ xor r1d, -1
jne .secondhalf
- LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d
- LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d
- shl r3d, 16
- or r1d, r3d
+ LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d
+ LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d
+ shl r2d, 16
+ or r1d, r2d
not r1d
BSR eax, r1d, 0x1f
RET
.secondhalf:
- BSR eax, r2d, 0x1f
+ BSR eax, r1d, 0x1f
add eax, 32
RET
%else
INIT_XMM sse2, lzcnt
COEFF_LAST
+%macro LAST_MASK_AVX2 2
+%if HIGH_BIT_DEPTH
+ mova m0, [%2+ 0]
+ packssdw m0, [%2+32]
+ mova m1, [%2+64]
+ packssdw m1, [%2+96]
+ packsswb m0, m1
+ mova m1, [deinterleave_shufd]
+ vpermd m0, m1, m0
+%else
+ mova m0, [%2+ 0]
+ packsswb m0, [%2+32]
+ vpermq m0, m0, q3120
+%endif
+ pcmpeqb m0, m2
+ pmovmskb %1, m0
+%endmacro
+
+%if ARCH_X86_64 == 0
+INIT_YMM avx2,lzcnt
+cglobal coeff_last64, 1,2
+ pxor m2, m2
+ LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
+ xor r1d, -1
+ jne .secondhalf
+ LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
+ not r1d
+ BSR eax, r1d, 0x1f
+ RET
+.secondhalf:
+ BSR eax, r1d, 0x1f
+ add eax, 32
+ RET
+%else
+INIT_YMM avx2,lzcnt
+cglobal coeff_last64, 1,3
+ pxor m2, m2
+ LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
+ LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32
+ shl r2, 32
+ or r1, r2
+ not r1
+ BSR rax, r1, 0x3f
+ RET
+%endif
+
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
+int x264_quant_4x4_avx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
+int x264_quant_4x4_dc_avx2( dctcoef dct[16], int mf, int bias );
+int x264_quant_8x8_avx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
+int x264_quant_4x4x4_avx2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
+void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int x264_decimate_score15_mmx2( dctcoef *dct );
int x264_decimate_score15_sse2( dctcoef *dct );
int x264_decimate_score15_ssse3( dctcoef *dct );
int x264_decimate_score16_mmx2( dctcoef *dct );
int x264_decimate_score16_sse2( dctcoef *dct );
int x264_decimate_score16_ssse3( dctcoef *dct );
-int x264_decimate_score15_mmx2_slowctz( dctcoef *dct );
-int x264_decimate_score15_sse2_slowctz( dctcoef *dct );
-int x264_decimate_score15_ssse3_slowctz( dctcoef *dct );
-int x264_decimate_score16_mmx2_slowctz( dctcoef *dct );
-int x264_decimate_score16_sse2_slowctz( dctcoef *dct );
-int x264_decimate_score16_ssse3_slowctz( dctcoef *dct );
int x264_decimate_score64_mmx2( dctcoef *dct );
int x264_decimate_score64_sse2( dctcoef *dct );
int x264_decimate_score64_ssse3( dctcoef *dct );
+int x264_decimate_score64_avx2( int16_t *dct );
int x264_coeff_last4_mmx2( dctcoef *dct );
int x264_coeff_last8_mmx2( dctcoef *dct );
int x264_coeff_last15_mmx2( dctcoef *dct );
int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
+int x264_coeff_last64_avx2_lzcnt( dctcoef *dct );
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
%include "x86inc.asm"
%include "x86util.asm"
+SECTION_RODATA
+
+deinterleave_sadx4: dd 0,4,2,6
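+; vpermd index pattern used by SAD_X4_END_AVX2 to gather the four per-reference
+; SAD totals from the two 128-bit lanes into scores[0..3] order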
+
SECTION .text
cextern pb_3
INIT_XMM ssse3
INTRA_SAD16
-
+INIT_YMM avx2
+cglobal intra_sad_x3_16x16, 3,5,6
+ pxor xm0, xm0
+ psadbw xm0, [r1-FDEC_STRIDE]
+ movhlps xm1, xm0
+ paddw xm0, xm1
+ movd r3d, xm0
+%assign x 0
+%rep 16
+ movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
+%if (x&3)==3 && x!=15
+ add r1, FDEC_STRIDE*4
+%endif
+ add r3d, r4d
+%assign x x+1
+%endrep
+ sub r1, FDEC_STRIDE*12
+ add r3d, 16
+ shr r3d, 5
+ movd xm5, r3d
+ vpbroadcastb xm5, xm5
+ vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction
+
+ pxor m4, m4 ; DC / V accumulator
+ pxor xm3, xm3 ; H accumulator
+ mov r3d, 15*FENC_STRIDE
+.vloop:
+ vpbroadcastb xm2, [r1+r3*2-1]
+ vbroadcasti128 m0, [r0+r3]
+ psadbw m1, m0, m5
+ psadbw xm0, xm2
+ paddw m4, m1
+ paddw xm3, xm0
+ add r3d, -FENC_STRIDE
+ jge .vloop
+ punpckhqdq m5, m4, m4
+ movhlps xm2, xm3
+ paddw m4, m5 ; DC / V
+ paddw xm3, xm2 ; H
+ vextracti128 xm2, m4, 1
+ movd [r2+0], xm2
+ movd [r2+4], xm3
+ movd [r2+8], xm4
+ RET
;=============================================================================
; SAD x3/x4 MMX
%endif
%endmacro
-%macro SAD_X3_2x16P_SSE2 1
-%if %1
+%macro SAD_X3_4x16P_SSE2 2
+%if %1==0
+%if UNIX64
+ mov r6, r5
+%endif
+ lea r5, [r4*3]
SAD_X3_START_1x16P_SSE2
%else
- SAD_X3_1x16P_SSE2 0, 0
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
+%endif
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r5
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r4]
+ lea r2, [r2+4*r4]
+ lea r3, [r3+4*r4]
%endif
- SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
%endmacro
%macro SAD_X3_START_2x8P_SSE2 0
psadbw xmm2, xmm7
%endmacro
-%macro SAD_X3_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm3, [r1]
- movq xmm4, [r2]
- movq xmm5, [r3]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm3, [r1+r4]
- movhps xmm4, [r2+r4]
- movhps xmm5, [r3+r4]
+%macro SAD_X3_2x8P_SSE2 4
+ movq xmm7, [r0+%1]
+ movq xmm3, [r1+%2]
+ movq xmm4, [r2+%2]
+ movq xmm5, [r3+%2]
+ movhps xmm7, [r0+%3]
+ movhps xmm3, [r1+%4]
+ movhps xmm4, [r2+%4]
+ movhps xmm5, [r3+%4]
psadbw xmm3, xmm7
psadbw xmm4, xmm7
psadbw xmm5, xmm7
psadbw xmm3, xmm7
%endmacro
-%macro SAD_X4_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm4, [r1]
- movq xmm5, [r2]
+%macro SAD_X4_2x8P_SSE2 4
+ movq xmm7, [r0+%1]
+ movq xmm4, [r1+%2]
+ movq xmm5, [r2+%2]
%if ARCH_X86_64
- movq xmm6, [r3]
- movq xmm8, [r4]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm4, [r1+r5]
- movhps xmm5, [r2+r5]
- movhps xmm6, [r3+r5]
- movhps xmm8, [r4+r5]
+ movq xmm6, [r3+%2]
+ movq xmm8, [r4+%2]
+ movhps xmm7, [r0+%3]
+ movhps xmm4, [r1+%4]
+ movhps xmm5, [r2+%4]
+ movhps xmm6, [r3+%4]
+ movhps xmm8, [r4+%4]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
psadbw xmm6, xmm7
paddw xmm2, xmm6
paddw xmm3, xmm8
%else
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm4, [r1+r5]
- movhps xmm5, [r2+r5]
+ movhps xmm7, [r0+%3]
+ movhps xmm4, [r1+%4]
+ movhps xmm5, [r2+%4]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
paddw xmm0, xmm4
paddw xmm1, xmm5
- movq xmm6, [r3]
- movq xmm4, [r4]
- movhps xmm6, [r3+r5]
- movhps xmm4, [r4+r5]
+ movq xmm6, [r3+%2]
+ movq xmm4, [r4+%2]
+ movhps xmm6, [r3+%4]
+ movhps xmm4, [r4+%4]
psadbw xmm6, xmm7
psadbw xmm4, xmm7
paddw xmm2, xmm6
%endif
%endmacro
-%macro SAD_X4_2x16P_SSE2 1
-%if %1
+%macro SAD_X4_4x16P_SSE2 2
+%if %1==0
+ lea r6, [r5*3]
SAD_X4_START_1x16P_SSE2
%else
- SAD_X4_1x16P_SSE2 0, 0
+ SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
+%endif
+ SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
+ SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
+ SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
%endif
- SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
%endmacro
-%macro SAD_X3_2x8P_SSE2 1
-%if %1
+%macro SAD_X3_4x8P_SSE2 2
+%if %1==0
+%if UNIX64
+ mov r6, r5
+%endif
+ lea r5, [r4*3]
SAD_X3_START_2x8P_SSE2
%else
- SAD_X3_2x8P_SSE2
+ SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
+%endif
+ SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), r5
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r4]
+ lea r2, [r2+4*r4]
+ lea r3, [r3+4*r4]
%endif
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
%endmacro
-%macro SAD_X4_2x8P_SSE2 1
-%if %1
+%macro SAD_X4_4x8P_SSE2 2
+%if %1==0
+ lea r6, [r5*3]
SAD_X4_START_2x8P_SSE2
%else
- SAD_X4_2x8P_SSE2
+ SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
+%endif
+ SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
%endif
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
%endmacro
%macro SAD_X3_END_SSE2 0
paddw xmm1, xmm5
paddw xmm2, xmm6
%if UNIX64
- movd [r5+0], xmm0
- movd [r5+4], xmm1
- movd [r5+8], xmm2
+ movd [r6+0], xmm0
+ movd [r6+4], xmm1
+ movd [r6+8], xmm2
%else
mov r0, r5mp
movd [r0+0], xmm0
RET
%endmacro
+%macro SAD_X4_START_2x8P_SSSE3 0
+ movddup xmm4, [r0]
+ movq xmm0, [r1]
+ movq xmm1, [r3]
+ movhps xmm0, [r2]
+ movhps xmm1, [r4]
+ movddup xmm5, [r0+FENC_STRIDE]
+ movq xmm2, [r1+r5]
+ movq xmm3, [r3+r5]
+ movhps xmm2, [r2+r5]
+ movhps xmm3, [r4+r5]
+ psadbw xmm0, xmm4
+ psadbw xmm1, xmm4
+ psadbw xmm2, xmm5
+ psadbw xmm3, xmm5
+ paddw xmm0, xmm2
+ paddw xmm1, xmm3
+%endmacro
+
+%macro SAD_X4_2x8P_SSSE3 4
+ movddup xmm6, [r0+%1]
+ movq xmm2, [r1+%2]
+ movq xmm3, [r3+%2]
+ movhps xmm2, [r2+%2]
+ movhps xmm3, [r4+%2]
+ movddup xmm7, [r0+%3]
+ movq xmm4, [r1+%4]
+ movq xmm5, [r3+%4]
+ movhps xmm4, [r2+%4]
+ movhps xmm5, [r4+%4]
+ psadbw xmm2, xmm6
+ psadbw xmm3, xmm6
+ psadbw xmm4, xmm7
+ psadbw xmm5, xmm7
+ paddw xmm0, xmm2
+ paddw xmm1, xmm3
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+%endmacro
+
+%macro SAD_X4_4x8P_SSSE3 2
+%if %1==0
+ lea r6, [r5*3]
+ SAD_X4_START_2x8P_SSSE3
+%else
+ SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
+%endif
+ SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
+%endif
+%endmacro
+
+%macro SAD_X4_END_SSSE3 0
+ mov r0, r6mp
+ packssdw xmm0, xmm1
+ movdqa [r0], xmm0
+ RET
+%endmacro
+
+%macro SAD_X3_START_2x16P_AVX2 0
+ movu m3, [r0] ; assumes FENC_STRIDE == 16
+ movu xm0, [r1]
+ movu xm1, [r2]
+ movu xm2, [r3]
+ vinserti128 m0, m0, [r1+r4], 1
+ vinserti128 m1, m1, [r2+r4], 1
+ vinserti128 m2, m2, [r3+r4], 1
+ psadbw m0, m3
+ psadbw m1, m3
+ psadbw m2, m3
+%endmacro
+
+%macro SAD_X3_2x16P_AVX2 3
+ movu m3, [r0+%1] ; assumes FENC_STRIDE == 16
+ movu xm4, [r1+%2]
+ movu xm5, [r2+%2]
+ movu xm6, [r3+%2]
+ vinserti128 m4, m4, [r1+%3], 1
+ vinserti128 m5, m5, [r2+%3], 1
+ vinserti128 m6, m6, [r3+%3], 1
+ psadbw m4, m3
+ psadbw m5, m3
+ psadbw m6, m3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+%endmacro
+
+%macro SAD_X3_4x16P_AVX2 2
+%if %1==0
+%if UNIX64
+ mov r6, r5
+%endif
+ lea r5, [r4*3]
+ SAD_X3_START_2x16P_AVX2
+%else
+ SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
+%endif
+ SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, r5
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r4]
+ lea r2, [r2+4*r4]
+ lea r3, [r3+4*r4]
+%endif
+%endmacro
+
+%macro SAD_X4_START_2x16P_AVX2 0
+ vbroadcasti128 m4, [r0]
+ vbroadcasti128 m5, [r0+FENC_STRIDE]
+ movu xm0, [r1]
+ movu xm1, [r3]
+ movu xm2, [r1+r5]
+ movu xm3, [r3+r5]
+ vinserti128 m0, m0, [r2], 1
+ vinserti128 m1, m1, [r4], 1
+ vinserti128 m2, m2, [r2+r5], 1
+ vinserti128 m3, m3, [r4+r5], 1
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m5
+ psadbw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+%endmacro
+
+%macro SAD_X4_2x16P_AVX2 4
+ vbroadcasti128 m6, [r0+%1]
+ vbroadcasti128 m7, [r0+%3]
+ movu xm2, [r1+%2]
+ movu xm3, [r3+%2]
+ movu xm4, [r1+%4]
+ movu xm5, [r3+%4]
+ vinserti128 m2, m2, [r2+%2], 1
+ vinserti128 m3, m3, [r4+%2], 1
+ vinserti128 m4, m4, [r2+%4], 1
+ vinserti128 m5, m5, [r4+%4], 1
+ psadbw m2, m6
+ psadbw m3, m6
+ psadbw m4, m7
+ psadbw m5, m7
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+%endmacro
+
+%macro SAD_X4_4x16P_AVX2 2
+%if %1==0
+ lea r6, [r5*3]
+ SAD_X4_START_2x16P_AVX2
+%else
+ SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
+%endif
+ SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
+%endif
+%endmacro
+
+%macro SAD_X3_END_AVX2 0
+ vextracti128 xm4, m0, 1
+ vextracti128 xm5, m1, 1
+ vextracti128 xm6, m2, 1
+ paddw xm0, xm4
+ paddw xm1, xm5
+ paddw xm2, xm6
+ movhlps xm4, xm0
+ movhlps xm5, xm1
+ movhlps xm6, xm2
+ paddw xm0, xm4
+ paddw xm1, xm5
+ paddw xm2, xm6
+%if UNIX64
+ movd [r6+0], xm0
+ movd [r6+4], xm1
+ movd [r6+8], xm2
+%else
+ mov r0, r5mp
+ movd [r0+0], xm0
+ movd [r0+4], xm1
+ movd [r0+8], xm2
+%endif
+ RET
+%endmacro
+
+%macro SAD_X4_END_AVX2 0
+ mov r0, r6mp
+ punpckhqdq m2, m0, m0
+ punpckhqdq m3, m1, m1
+ paddw m0, m2
+ paddw m1, m3
+ packssdw m0, m1
+ mova xm2, [deinterleave_sadx4]
+ vpermd m0, m2, m0
+ mova [r0], xm0
+ RET
+%endmacro
+
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 3
-cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
- SAD_X%1_2x%2P_SSE2 1
-%rep %3/2-1
- SAD_X%1_2x%2P_SSE2 0
+cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,9
+%assign x 0
+%rep %3/4
+ SAD_X%1_4x%2P_SSE2 x, %3/4
+%assign x x+1
%endrep
SAD_X%1_END_SSE2
%endmacro
SAD_X_SSE2 4, 16, 16
SAD_X_SSE2 4, 16, 8
+%macro SAD_X_SSSE3 3
+cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
+%assign x 0
+%rep %3/4
+ SAD_X%1_4x%2P_SSSE3 x, %3/4
+%assign x x+1
+%endrep
+ SAD_X%1_END_SSSE3
+%endmacro
+
+INIT_XMM ssse3
+SAD_X_SSSE3 4, 8, 16
+SAD_X_SSSE3 4, 8, 8
+SAD_X_SSSE3 4, 8, 4
+
+%macro SAD_X_AVX2 4
+cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
+%assign x 0
+%rep %3/4
+ SAD_X%1_4x%2P_AVX2 x, %3/4
+%assign x x+1
+%endrep
+ SAD_X%1_END_AVX2
+%endmacro
+INIT_YMM avx2
+SAD_X_AVX2 3, 16, 16, 7
+SAD_X_AVX2 3, 16, 8, 7
+SAD_X_AVX2 4, 16, 16, 8
+SAD_X_AVX2 4, 16, 8, 8
;=============================================================================
; SAD cacheline split
FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
+
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
+%if ARCH_X86_64 == 0
+%macro vpbroadcastq 2
+%if sizeof%1 == 16
+ movddup %1, %2
+%else
+ vbroadcastsd %1, %2
+%endif
+%endmacro
+%endif
%assign SIZEOF_PIXEL 1
%assign SIZEOF_DCTCOEF 2
%define pixel byte
+%define vpbroadcastdct vpbroadcastw
+%define vpbroadcastpix vpbroadcastb
%if HIGH_BIT_DEPTH
%assign SIZEOF_PIXEL 2
%assign SIZEOF_DCTCOEF 4
%define pixel word
+ %define vpbroadcastdct vpbroadcastd
+ %define vpbroadcastpix vpbroadcastw
%endif
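+; vpbroadcastdct/vpbroadcastpix broadcast a single dctcoef/pixel of the current bit depth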
%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE
%macro SBUTTERFLY 4
-%if avx_enabled && mmsize == 16
+%ifidn %1, dqqq
+ vperm2i128 m%4, m%2, m%3, q0301 ; punpckh
+ vinserti128 m%2, m%2, xm%3, 1 ; punpckl
+%elif avx_enabled && mmsize >= 16
punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
%else
%endmacro
%macro HADDD 2 ; sum junk
-%if mmsize == 16
+%if mmsize >= 16
movhlps %2, %1
paddd %1, %2
%endif
%endmacro
%macro HADDW 2 ; reg, tmp
-%if cpuflag(xop) && mmsize == 16
+%if cpuflag(xop) && mmsize >= 16
vphaddwq %1, %1
movhlps %2, %1
paddd %1, %2
%endmacro
%macro HADDUW 2
-%if cpuflag(xop) && mmsize == 16
+%if cpuflag(xop) && mmsize >= 16
vphadduwq %1, %1
movhlps %2, %1
paddd %1, %2
%endif
%elifidn %1, q
shufps m%5, m%3, m%4, q3131
- shufps m%3, m%4, q2020
+ shufps m%3, m%3, m%4, q2020
SWAP %4, %5
%endif
%endmacro
; %5(%6): tmpregs
%if %1!=0 ; have to reorder stuff for horizontal op
%ifidn %2, sumsub
- %define ORDER ord
- ; sumsub needs order because a-b != b-a unless a=b
+ %define ORDER ord
+ ; sumsub needs order because a-b != b-a unless a=b
%else
- %define ORDER unord
- ; if we just max, order doesn't matter (allows pblendw+or in sse4)
+ %define ORDER unord
+ ; if we just max, order doesn't matter (allows pblendw+or in sse4)
%endif
%if %1==1
- TRANS d, ORDER, %3, %4, %5, %6
+ TRANS d, ORDER, %3, %4, %5, %6
%elif %1==2
- %if mmsize==8
- SBUTTERFLY dq, %3, %4, %5
- %else
- TRANS q, ORDER, %3, %4, %5, %6
- %endif
+ %if mmsize==8
+ SBUTTERFLY dq, %3, %4, %5
+ %else
+ TRANS q, ORDER, %3, %4, %5, %6
+ %endif
%elif %1==4
- SBUTTERFLY qdq, %3, %4, %5
+ SBUTTERFLY qdq, %3, %4, %5
+ %elif %1==8
+ SBUTTERFLY dqqq, %3, %4, %5
%endif
%endif
%ifidn %2, sumsub
%endif
%endmacro
+; 2xdst, 2xtmp, 2xsrcrow
+%macro LOAD_DIFF16x2_AVX2 6
+ pmovzxbw m%1, [r1+%5*FENC_STRIDE]
+ pmovzxbw m%2, [r1+%6*FENC_STRIDE]
+ pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE]
+ pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE]
+ psubw m%1, m%3
+ psubw m%2, m%4
+%endmacro
+
%macro DIFFx2 6-7
movh %3, %5
punpcklbw %3, %4
pixel *p_src = h->mb.pic.p_fenc[p];
pixel *p_dst = h->mb.pic.p_fdec[p];
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
- ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
int nz, block_cbp = 0;
int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
int i_decimate_score = b_decimate ? 0 : 7;
int nz_ac = 0;
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
if( h->mb.b_lossless )
{
}
else if( h->mb.b_transform_8x8 )
{
- ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
+ ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
}
else
{
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
CLEAR_16x16_NNZ( p );
*****************************************************************************/
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
ALIGNED_4( int16_t mvp[2] );
int i_qp = h->mb.i_qp;
int quant_cat = p ? CQM_8PC : CQM_8PY;
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
- ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
+ ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
int i_decimate_8x8 = b_decimate ? 0 : 4;
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
int nnz8x8 = 0;
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
i_qp = h->mb.i_chroma_qp;
for( int ch = 0; ch < 2; ch++ )
{
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
}
else
{
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
if( b_predict )
{
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
- ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
+ ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
if( b_predict )
ALIGNED_ARRAY_16( pixel, pix,[16*16] );
ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
- int costs[16];
+ ALIGNED_ARRAY_16( int, costs,[16] );
int mv_x_min = h->mb.mv_limit_fpel[0][0];
int mv_y_min = h->mb.mv_limit_fpel[0][1];
if( h->mb.i_me_method == X264_ME_TESA )
{
// ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
- mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15) + 4);
+ mvsad_t *mvsads = (mvsad_t *)(xs + ((width+31)&~31) + 4);
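+ /* presumably rounded up to 32 to match the mmsize-aligned masks scratch written by pixel_ads (AVX2) */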
int nmvsad = 0, limit;
int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
int chroma_v_shift = CHROMA_V_SHIFT;
int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
+ ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
+ ALIGNED_ARRAY_16( int, costs,[4] );
int bmx = m->mv[0];
int bmy = m->mv[1];
for( int i = hpel_iters; i > 0; i-- )
{
int omx = bmx, omy = bmy;
- int costs[4];
intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
pixel *src0, *src1, *src2, *src3;
src0 = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
/* Special simplified case for subme=1 */
else if( bmy > h->mb.mv_min_spel[1] && bmy < h->mb.mv_max_spel[1] && bmx > h->mb.mv_min_spel[0] && bmx < h->mb.mv_max_spel[0] )
{
- int costs[4];
int omx = bmx, omy = bmy;
/* We have to use mc_luma because all strides must be the same to use fpelcmp_x4 */
h->mc.mc_luma( pix , 64, m->p_fref, m->i_stride[0], omx, omy-1, bw, bh, &m->weight[0] );
const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
int b_chroma, int dc, int num_coefs, int idx )
{
- ALIGNED_ARRAY_16( dctcoef, orig_coefs, [64] );
- ALIGNED_ARRAY_16( dctcoef, quant_coefs, [64] );
+ ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
+ ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const int b_interlaced = MB_INTERLACED;
{
ALIGNED_16( uint16_t sums[72] );
ALIGNED_16( int dc[4] );
- ALIGNED_16( int16_t mvs_a[32] );
- ALIGNED_16( int16_t mvs_c[32] );
+ ALIGNED_16( int16_t mvs_a[48] );
+ ALIGNED_16( int16_t mvs_c[48] );
int mvn_a, mvn_c;
int thresh = rand() & 0x3fff;
set_func_name( "esa_ads" );
x264_dct_function_t dct_asm;
x264_quant_function_t qf;
int ret = 0, ok, used_asm, interlace = 0;
- ALIGNED_16( dctcoef dct1[16][16] );
- ALIGNED_16( dctcoef dct2[16][16] );
- ALIGNED_16( dctcoef dct4[16][16] );
- ALIGNED_16( dctcoef dct8[4][64] );
+ ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] );
ALIGNED_16( dctcoef dctdc[2][8] );
x264_t h_buf;
x264_t *h = &h_buf;
call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \
if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \
{ \
- ok = 0; \
+ ok = 0; printf("%d: %d %d %d %d\n%d %d %d %d\n\n",memcmp( t1, t2, size*sizeof(dctcoef) ),buf3[0], buf3[1], buf3[8], buf3[9], buf4[0], buf4[1], buf4[8], buf4[9]);break;\
} \
} \
}
fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
break; \
} \
- call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
- call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
+ /* omit unlikely high scales for benchmarking */ \
+ if( (s << (8-d)) < 512 ) \
+ { \
+ call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
+ call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
+ } \
} \
}
pixel *dsta[4] = { pbuf4, pbuf4+1024, pbuf4+2048, pbuf4+3072 };
set_func_name( "lowres_init" );
ok = 1; used_asm = 1;
- for( int w = 40; w <= 48; w += 8 )
+ for( int w = 96; w <= 96+24; w += 8 )
{
- intptr_t stride = (w+8)&~15;
- call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], (intptr_t)w*2, stride, w, 16 );
- call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], (intptr_t)w*2, stride, w, 16 );
- for( int i = 0; i < 16; i++ )
+ intptr_t stride = (w*2+31)&~31;
+ intptr_t stride_lowres = (w+31)&~31;
+ call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], stride, stride_lowres, w, 8 );
+ call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], stride, stride_lowres, w, 8 );
+ for( int i = 0; i < 8; i++ )
{
for( int j = 0; j < 4; j++ )
- if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w * sizeof(pixel) ) )
+ if( memcmp( dstc[j]+i*stride_lowres, dsta[j]+i*stride_lowres, w * sizeof(pixel) ) )
{
ok = 0;
fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
for( int k = 0; k < w; k++ )
- printf( "%d ", dstc[j][k+i*stride] );
+ printf( "%d ", dstc[j][k+i*stride_lowres] );
printf( "\n" );
for( int k = 0; k < w; k++ )
- printf( "%d ", dsta[j][k+i*stride] );
+ printf( "%d ", dsta[j][k+i*stride_lowres] );
printf( "\n" );
break;
}
#define INTEGRAL_INIT( name, size, ... )\
if( mc_a.name != mc_ref.name )\
{\
- intptr_t stride = 80;\
+ intptr_t stride = 96;\
set_func_name( #name );\
used_asm = 1;\
memcpy( buf3, buf1, size*2*stride );\
x264_quant_function_t qf_c;
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
- ALIGNED_16( dctcoef dct1[64] );
- ALIGNED_16( dctcoef dct2[64] );
- ALIGNED_16( dctcoef dct3[8][16] );
- ALIGNED_16( dctcoef dct4[8][16] );
- ALIGNED_16( uint8_t cqm_buf[64] );
+ ALIGNED_ARRAY_N( dctcoef, dct1,[64] );
+ ALIGNED_ARRAY_N( dctcoef, dct2,[64] );
+ ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] );
+ ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] );
+ ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] );
int ret = 0, ok, used_asm;
int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
x264_t h_buf;
fprintf( stderr, "x264: using random seed %u\n", seed );
srand( seed );
- buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 16*BENCH_ALIGNS );
- pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 16*BENCH_ALIGNS );
+ buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS );
+ pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS );
if( !buf1 || !pbuf1 )
{
fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
}
memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
- /* 16-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
+ /* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
if( do_bench )
for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
{
INIT_POINTER_OFFSETS;
- ret |= x264_stack_pagealign( check_all_flags, i*16 );
- buf1 += 16;
- pbuf1 += 16;
+ ret |= x264_stack_pagealign( check_all_flags, i*32 );
+ buf1 += 32;
+ pbuf1 += 32;
quiet = 1;
fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
}