pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext;
}
if( cpu&X264_CPU_SSE2 )
void (*intra_mbcmp_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sad_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_mbcmp_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sad_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
SAD_END_SSE2
RET
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_4x4 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+; Computes the SAD of the source 4x4 block (fenc) against the three cheap
+; intra predictions — V (vertical), H (horizontal), DC — in one pass.
+; Results: res[0]=V, res[1]=H, res[2]=DC.
+; r0 = fenc (stride FENC_STRIDE), r1 = fdec (stride FDEC_STRIDE), r2 = res.
+
+cglobal x264_intra_sad_x3_4x4_mmxext, 3,3
+    pxor      mm7, mm7                   ; zero, used as psadbw operand for summing
+    movd      mm0, [r1-FDEC_STRIDE]      ; 4 top neighbor pixels
+    movd      mm1, [r0+FENC_STRIDE*0]    ; fenc row 0
+    movd      mm2, [r0+FENC_STRIDE*2]    ; fenc row 2
+    punpckldq mm0, mm0                   ; duplicate top row -> V pred for 2 rows at once
+    punpckldq mm1, [r0+FENC_STRIDE*1]    ; pack fenc rows 0,1 into one qword
+    punpckldq mm2, [r0+FENC_STRIDE*3]    ; pack fenc rows 2,3 into one qword
+    movq      mm6, mm0                   ; save top pixels (both dwords) for DC sum
+    movq      mm3, mm1                   ; copy of fenc rows 0,1 (mm1/mm2 reused later)
+    psadbw    mm3, mm0                   ; SAD(V pred, rows 0,1)
+    psadbw    mm0, mm2                   ; SAD(V pred, rows 2,3)
+    paddw     mm0, mm3
+    movd      [r2], mm0 ;V prediction cost
+    movd      mm3, [r1+FDEC_STRIDE*0-4]  ; left neighbor of row 0 lands in byte 3
+    movd      mm0, [r1+FDEC_STRIDE*1-4]  ; left neighbor of row 1
+    movd      mm4, [r1+FDEC_STRIDE*2-4]  ; left neighbor of row 2
+    movd      mm5, [r1+FDEC_STRIDE*3-4]  ; left neighbor of row 3
+    punpcklbw mm3, mm0                   ; interleave rows 0,1: left pixels now in bytes 6,7
+    punpcklbw mm4, mm5                   ; interleave rows 2,3: left pixels now in bytes 6,7
+    movq      mm5, mm3
+    punpckhwd mm5, mm4                   ; gather the 4 left pixels into the high dword
+    punpckhdq mm5, mm6                   ; mm5 = 4 left pixels + 4 top pixels
+    psadbw    mm5, mm7                   ; sum of all 8 neighbors (for DC)
+    punpckhbw mm3, mm3                   ; broadcast left pixels across rows for H pred:
+    punpckhbw mm4, mm4
+    punpckhwd mm3, mm3                   ; mm3 = H pred for rows 0,1 (each pixel x4)
+    punpckhwd mm4, mm4                   ; mm4 = H pred for rows 2,3
+    psraw     mm5, 2                     ; sum>>2 ...
+    pavgw     mm5, mm7                   ; ... then (x+1)>>1: together (sum+4)>>3 = DC value
+    punpcklbw mm5, mm5                   ; broadcast DC byte ...
+    pshufw    mm5, mm5, 0x0 ;DC prediction (... to all 8 lanes)
+    movq      mm6, mm5
+    psadbw    mm5, mm1                   ; SAD(DC, rows 0,1)
+    psadbw    mm6, mm2                   ; SAD(DC, rows 2,3)
+    psadbw    mm1, mm3                   ; SAD(H, rows 0,1)
+    psadbw    mm2, mm4                   ; SAD(H, rows 2,3)
+    paddw     mm5, mm6
+    paddw     mm1, mm2
+    movd      [r2+8], mm5 ;DC prediction cost
+    movd      [r2+4], mm1 ;H prediction cost
+    RET
+
+
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
int i_cost;
int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
h->mb.i_cbp_luma = 0;
- b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0];
+ b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
if( a->i_mbrd )
i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
+ h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
report( "intra satd_x3 :" );
TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 );
+ TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 );
report( "intra sad_x3 :" );
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||