return (sum+2)>>2;
}
-
static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride )
{
sum2_t tmp[32];
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
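+ /* the merged sa8d_satd function uses XMM8-15, so it is only available on x86_64 */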
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2;
#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
+#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4;
+#endif
}
if( cpu&X264_CPU_AVX )
{
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx;
+#endif
}
if( cpu&X264_CPU_XOP )
{
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2;
#endif
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
pixf->asd8 = x264_pixel_asd8_ssse3;
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
+#endif
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4;
+#endif
}
if( cpu&X264_CPU_AVX )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx;
+#endif
}
if( cpu&X264_CPU_XOP )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
+#endif
}
#endif //HAVE_MMX
x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
int (*vsad)( pixel *, intptr_t, int );
int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
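+ /* Merged SA8D+SATD: computes both 16x16 costs in one pass and returns them
+  * packed, sa8d in the low 32 bits and satd in the high 32 bits.
+  * A minimal unpacking sketch (fenc/fdec stand for any source/recon plane pointers):
+  *     uint64_t cost = pixf->sa8d_satd[PIXEL_16x16]( fenc, FENC_STRIDE,
+  *                                                   fdec, FDEC_STRIDE );
+  *     int sa8d = (uint32_t)cost, satd = (uint32_t)(cost >> 32);
+  */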
+ uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t (*var[4])( pixel *pix, intptr_t stride );
int (*var2[4])( pixel *pix1, intptr_t stride1,
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
+cextern pd_1
cextern hsub_mul
;=============================================================================
%endif ; !ARCH_X86_64
%endmacro ; SA8D
+;=============================================================================
+; SA8D_SATD
+;=============================================================================
+
+; %1-%4: sa8d output regs (m0,m1,m2,m3 on the first invocation, m4,m5,m8,m9 on the second)
+; m10: satd result
+; m6, m11-15: tmp regs
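+;
+; The first butterfly stages of the 8x8 transform are the same ones a 4x4
+; SATD needs, so the satd term is accumulated into m10 mid-transform while
+; %1-%4 continue through the remaining stages for sa8d.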
+%macro SA8D_SATD_8x4 4
+%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+ LOAD_SUMSUB_8x4P_SSSE3 %1, %2, %3, %4, 6, 11, 7, r0, r2, 1
+ HADAMARD4_V %1, %2, %3, %4, 6
+
+ pabsw m12, m%1 ; doing the abs first is a slight advantage
+ pabsw m14, m%3
+ pabsw m13, m%2
+ pabsw m15, m%4
+ HADAMARD 1, max, 12, 14, 6, 11
+ paddw m10, m12
+ HADAMARD 1, max, 13, 15, 6, 11
+ paddw m10, m13
+%else
+ LOAD_DIFF_8x4P %1, %2, %3, %4, 6, 11, 7, r0, r2, 1
+ HADAMARD 0, sumsub, %1, %2, 6
+ HADAMARD 0, sumsub, %3, %4, 6
+ SBUTTERFLY wd, %1, %2, 6
+ SBUTTERFLY wd, %3, %4, 6
+ HADAMARD2_2D %1, %3, %2, %4, 6, dq
+
+ mova m12, m%1
+ mova m13, m%2
+ mova m14, m%3
+ mova m15, m%4
+ HADAMARD 0, sumsub, %1, %2, 6
+ HADAMARD 0, sumsub, %3, %4, 6
+ SBUTTERFLY qdq, 12, 13, 6
+ HADAMARD 0, amax, 12, 13, 6
+ SBUTTERFLY qdq, 14, 15, 6
+ paddw m10, m12
+ HADAMARD 0, amax, 14, 15, 6
+ paddw m10, m14
+%endif
+%endmacro ; SA8D_SATD_8x4
+
+; %1: add spilled regs?
+; %2: spill regs?
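+;
+; At high bit depth the running 16-bit word sums would overflow over four
+; 8x8 blocks, so they are widened to dwords (pmaddwd/HADDUWD) and spilled
+; to the stack between calls; at 8-bit only the sa8d sum in m0 is spilled,
+; while the satd sum stays live in m10.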
+%macro SA8D_SATD_ACCUM 2
+%if HIGH_BIT_DEPTH
+ pmaddwd m10, [pw_1]
+ HADDUWD m0, m1
+%if %1
+ paddd m10, temp1
+ paddd m0, temp0
+%endif
+%if %2
+ mova temp1, m10
+ pxor m10, m10
+%endif
+%elif %1
+ paddw m0, temp0
+%endif
+%if %2
+ mova temp0, m0
+%endif
+%endmacro
+
+%macro SA8D_SATD 0
+cglobal pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_8x4 0, 1, 2, 3
+ SA8D_SATD_8x4 4, 5, 8, 9
+
+ ; complete sa8d
+%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+ SUMSUB_BADC w, 0, 4, 1, 5, 12
+ HADAMARD 2, sumsub, 0, 4, 12, 11
+ HADAMARD 2, sumsub, 1, 5, 12, 11
+ SUMSUB_BADC w, 2, 8, 3, 9, 12
+ HADAMARD 2, sumsub, 2, 8, 12, 11
+ HADAMARD 2, sumsub, 3, 9, 12, 11
+ HADAMARD 1, amax, 0, 4, 12, 11
+ HADAMARD 1, amax, 1, 5, 12, 4
+ HADAMARD 1, amax, 2, 8, 12, 4
+ HADAMARD 1, amax, 3, 9, 12, 4
+%else ; sse2
+ HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
+ HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
+%endif
+
+ ; create sa8d sub results
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+
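+ ; record the register permutation at this label so x86inc's call macro
+ ; can restore the correct mN name mapping in the caller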
+ SAVE_MM_PERMUTATION
+ ret
+
+;-------------------------------------------------------------------------------
+; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
+;-------------------------------------------------------------------------------
+cglobal pixel_sa8d_satd_16x16, 4,8,16,SIZEOF_PIXEL*mmsize
+ %define temp0 [rsp+0*mmsize]
+ %define temp1 [rsp+1*mmsize]
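+ ; stack size: SIZEOF_PIXEL*mmsize gives one temp slot at 8-bit depth and
+ ; two at high bit depth, where temp1 is actually used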
+ FIX_STRIDES r1, r3
+%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+ mova m7, [hmul_8p]
+%endif
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ lea r6, [r2+8*SIZEOF_PIXEL]
+ lea r7, [r0+8*SIZEOF_PIXEL]
+ pxor m10, m10
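+ ; process the left 8x16 half first, then the right half via r7/r6 (which
+ ; point 8 pixels in); SA8D_SATD_ACCUM args are (add spilled sums?, spill
+ ; again?): nothing to add on the first call, no need to spill after the last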
+
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 0, 1
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 1, 1
+
+ mov r0, r7
+ mov r2, r6
+
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 1, 1
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 1, 0
+
+; xop already has fast horizontal sums
+%if HIGH_BIT_DEPTH == 0 && cpuflag(sse4) && notcpuflag(xop)
+ pmaddwd m10, [pw_1]
+ HADDUWD m0, m1
+ phaddd m0, m10 ; sa8d1 sa8d2 satd1 satd2
+ pshufd m1, m0, q2301 ; sa8d2 sa8d1 satd2 satd1
+ paddd m0, m1 ; sa8d sa8d satd satd
+ movd r0d, m0
+ pextrd eax, m0, 2
+%else
+%if HIGH_BIT_DEPTH
+ HADDD m0, m1
+ HADDD m10, m2
+%else
+ HADDUW m0, m1
+ HADDW m10, m2
+%endif
+ movd r0d, m0
+ movd eax, m10
+%endif
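+ ; return (satd << 32) | sa8d; sa8d is normalized as (sum+1)>>1 rather than
+ ; the C version's (sum+2)>>2 because the amax stages (via the identity
+ ; max(|a+b|,|a-b|) == |a|+|b|) already fold in one halving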
+ add r0d, 1
+ shl rax, 32
+ shr r0d, 1
+ or rax, r0
+ RET
+%endmacro ; SA8D_SATD
+
;=============================================================================
; INTRA SATD
;=============================================================================
INIT_XMM sse2
SA8D
SATDS_SSE2
+%if ARCH_X86_64
+SA8D_SATD
+%endif
%if HIGH_BIT_DEPTH == 0
INTRA_SA8D_SSE2
%endif
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
+%if ARCH_X86_64
+SA8D_SATD
+%endif
%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
+%if ARCH_X86_64
+SA8D_SATD
+%endif
%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
INIT_XMM avx
SATDS_SSE2
SA8D
+%if ARCH_X86_64
+SA8D_SATD
+%endif
%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
INIT_XMM xop
SATDS_SSE2
SA8D
+%if ARCH_X86_64
+SA8D_SATD
+%endif
%if HIGH_BIT_DEPTH == 0
INTRA_X9
; No xop INTRA8_X9: it's slower than the avx version on Bulldozer, for reasons unknown.
int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
%endif
%endmacro
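+; Horizontally add each pair of unsigned words in %1 into dwords: xop has
+; vphadduwd natively; otherwise split each dword into its high word (psrld)
+; and zero-extended low word (pslld+psrld) and add the two.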
+%macro HADDUWD 2
+%if cpuflag(xop)
+ vphadduwd %1, %1
+%else
+ psrld %2, %1, 16
+ pslld %1, 16
+ psrld %1, 16
+ paddd %1, %2
+%endif
+%endmacro
+
%macro HADDUW 2
%if cpuflag(xop) && mmsize == 16
vphadduwq %1, %1
movhlps %2, %1
paddd %1, %2
%else
- psrld %2, %1, 16
- pslld %1, 16
- psrld %1, 16
- paddd %1, %2
+ HADDUWD %1, %2
HADDD %1, %2
%endif
%endmacro
int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
int i_cost8 = 0, i_cost4 = 0;
- for( int p = 0; p < plane_count; p++ )
+ /* Not all platforms have a merged SATD function */
+ if( h->pixf.sa8d_satd[PIXEL_16x16] )
{
- i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
- h->mb.pic.p_fdec[p], FDEC_STRIDE );
- i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
- h->mb.pic.p_fdec[p], FDEC_STRIDE );
+ uint64_t cost = 0;
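+ /* summing the packed values is safe: per-plane costs are far below 2^32,
+  * so the low (sa8d) halves can't carry into the high (satd) halves */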
+ for( int p = 0; p < plane_count; p++ )
+ {
+ cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
+ h->mb.pic.p_fdec[p], FDEC_STRIDE );
+ }
+ i_cost8 = (uint32_t)cost;
+ i_cost4 = (uint32_t)(cost >> 32);
+ }
+ else
+ {
+ for( int p = 0; p < plane_count; p++ )
+ {
+ i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
+ h->mb.pic.p_fdec[p], FDEC_STRIDE );
+ i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
+ h->mb.pic.p_fdec[p], FDEC_STRIDE );
+ }
}
h->mb.b_transform_8x8 = i_cost8 < i_cost4;
TEST_PIXEL( satd, 0 );
TEST_PIXEL( sa8d, 1 );
+ ok = 1; used_asm = 0;
+ if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] )
+ {
+ set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] );
+ used_asm = 1;
+ for( int j = 0; j < 64; j++ )
+ {
+ uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
+ uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
+ uint64_t res_a = call_a( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 );
+ uint32_t cost8_a = res_a;
+ uint32_t cost4_a = res_a >> 32;
+ if( cost8_a != cost8_c || cost4_a != cost4_c )
+ {
+ ok = 0;
+ fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
+ cost8_c, cost4_c, cost8_a, cost4_a );
+ break;
+ }
+ }
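+ /* a second pass at varying buffer offsets to catch intermediate overflow
+  * in the asm's packed 16-bit accumulation */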
+ for( int j = 0; j < 0x1000 && ok; j += 256 )
+ {
+ uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
+ uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
+ uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
+ uint32_t cost8_a = res_a;
+ uint32_t cost4_a = res_a >> 32;
+ if( cost8_a != cost8_c || cost4_a != cost4_c )
+ {
+ ok = 0;
+ fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
+ cost8_c, cost4_c, cost8_a, cost4_a );
+ }
+ }
+ }
+ report( "pixel sa8d_satd :" );
+
#define TEST_PIXEL_X( N ) \
ok = 1; used_asm = 0; \
for( int i = 0; i < 7; i++ ) \