From 7760f1b2e78360542e31eb55db81e84dcb4f95ac Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Thu, 17 Aug 2006 21:57:59 +0000 Subject: [PATCH] SSIM computation. (default on, disable by --no-ssim) git-svn-id: svn://svn.videolan.org/x264/trunk@554 df754926-b1dd-0310-bc7b-ec298dee348c --- common/amd64/pixel-sse2.asm | 180 +++++++++++++++++++++++++++++------- common/common.c | 3 + common/common.h | 1 + common/i386/pixel-a.asm | 64 +++++++++++++ common/i386/pixel-sse2.asm | 169 ++++++++++++++++++++++++++++++--- common/i386/pixel.h | 6 ++ common/pixel.c | 83 +++++++++++++++++ common/pixel.h | 7 ++ encoder/encoder.c | 76 ++++++++++----- tools/checkasm.c | 16 ++++ x264.c | 3 +- x264.h | 5 +- 12 files changed, 540 insertions(+), 73 deletions(-) diff --git a/common/amd64/pixel-sse2.asm b/common/amd64/pixel-sse2.asm index 42183658..5f17b4ee 100644 --- a/common/amd64/pixel-sse2.asm +++ b/common/amd64/pixel-sse2.asm @@ -30,9 +30,12 @@ BITS 64 SECTION .rodata align=16 -pd_0000ffff: times 4 dd 0x0000ffff -pb_1: times 16 db 1 - +pb_1: times 16 db 1 +pw_1: times 8 dw 1 +ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 +ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 +mask_ff: times 16 db 0xff + times 16 db 0 SECTION .text @@ -49,6 +52,20 @@ cglobal x264_pixel_satd_16x16_sse2 cglobal x264_pixel_sa8d_8x8_sse2 cglobal x264_pixel_sa8d_16x16_sse2 cglobal x264_intra_sa8d_x3_8x8_core_sse2 +cglobal x264_pixel_ssim_4x4x2_core_sse2 +cglobal x264_pixel_ssim_end4_sse2 + +%macro HADDD 2 ; sum junk + movhlps %2, %1 + paddd %1, %2 + pshuflw %2, %1, 0xE + paddd %1, %2 +%endmacro + +%macro HADDW 2 + pmaddwd %1, [pw_1 GLOBAL] + HADDD %1, %2 +%endmacro %macro SAD_INC_4x16P_SSE2 0 movdqu xmm1, [rdx] @@ -217,15 +234,8 @@ x264_pixel_sad_16x8_sse2: %endmacro %macro SSD_END_SSE2 0 - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddd xmm0, xmm1 - - movdqa xmm1, xmm0 - psrldq xmm1, 4 - paddd xmm0, xmm1 - - movd eax, xmm0 + HADDD xmm0, xmm1 + movd eax, xmm0 ret %endmacro @@ -399,20 +409,6 @@ 
x264_pixel_ssd_16x8_sse2: paddusw %7, %4 %endmacro -%macro SUM_MM_SSE2 2 ; sum junk - movdqa %2, %1 - psrldq %1, 2 - paddusw %1, %2 - pand %1, [pd_0000ffff GLOBAL] - movdqa %2, %1 - psrldq %1, 4 - paddd %1, %2 - movdqa %2, %1 - psrldq %1, 8 - paddd %1, %2 - movd eax,%1 -%endmacro - %macro SATD_TWO_SSE2 0 LOAD4x8_DIFF_SSE2 HADAMARD4x4_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3 @@ -430,8 +426,9 @@ x264_pixel_ssd_16x8_sse2: %endmacro %macro SATD_END 0 - psrlw xmm6, 1 - SUM_MM_SSE2 xmm6, xmm7 + psrlw xmm6, 1 + HADDW xmm6, xmm7 + movd eax, xmm6 ret %endmacro @@ -531,6 +528,13 @@ x264_pixel_satd_8x4_sse2: punpckh%2 %5, %4 %endmacro +%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc + SBUTTERFLY dqa, dq, %1, %2, %5 + SBUTTERFLY dqa, dq, %3, %4, %2 + SBUTTERFLY dqa, qdq, %1, %3, %4 + SBUTTERFLY dqa, qdq, %5, %2, %3 +%endmacro + ;----------------------------------------------------------------------------- ; input ABCDEFGH output AFHDTECB ;----------------------------------------------------------------------------- @@ -593,7 +597,8 @@ x264_pixel_sa8d_8x8_sse2: SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10 SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10 psrlw xmm10, 1 - SUM_MM_SSE2 xmm10, xmm0 + HADDW xmm10, xmm0 + movd eax, xmm10 add r8d, eax ; preserve rounding for 16x16 add eax, 1 shr eax, 1 @@ -695,17 +700,128 @@ x264_intra_sa8d_x3_8x8_core_sse2: psubw xmm0, xmm1 ; 8x1 sum SUM1x8_SSE2 xmm0, xmm1, xmm2 - SUM_MM_SSE2 xmm14, xmm3 + HADDW xmm14, xmm3 + movd eax, xmm14 add eax, 2 shr eax, 2 mov [parm3q+4], eax ; i8x8_h sa8d - SUM_MM_SSE2 xmm15, xmm4 + HADDW xmm15, xmm4 + movd eax, xmm15 add eax, 2 shr eax, 2 mov [parm3q+8], eax ; i8x8_dc sa8d - SUM_MM_SSE2 xmm2, xmm5 + HADDW xmm2, xmm5 + movd eax, xmm2 add eax, 2 shr eax, 2 mov [parm3q+0], eax ; i8x8_v sa8d ret + + + +;----------------------------------------------------------------------------- +; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, +; const uint8_t *pix2, int stride2, int 
sums[2][4] ) +;----------------------------------------------------------------------------- +ALIGN 16 +x264_pixel_ssim_4x4x2_core_sse2: + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 + movdqa xmm8, [pw_1 GLOBAL] +%rep 4 + movq xmm5, [parm1q] + movq xmm6, [parm3q] + punpcklbw xmm5, xmm0 + punpcklbw xmm6, xmm0 + paddw xmm1, xmm5 + paddw xmm2, xmm6 + movdqa xmm7, xmm5 + pmaddwd xmm5, xmm5 + pmaddwd xmm7, xmm6 + pmaddwd xmm6, xmm6 + paddd xmm3, xmm5 + paddd xmm4, xmm7 + paddd xmm3, xmm6 + add parm1q, parm2q + add parm3q, parm4q +%endrep + ; PHADDW xmm1, xmm2 + ; PHADDD xmm3, xmm4 + pshufd xmm5, xmm3, 0xB1 + pmaddwd xmm1, xmm8 + pmaddwd xmm2, xmm8 + pshufd xmm6, xmm4, 0xB1 + packssdw xmm1, xmm2 + paddd xmm3, xmm5 + pmaddwd xmm1, xmm8 + paddd xmm4, xmm6 + pshufd xmm1, xmm1, 0xD8 + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + punpckhdq xmm5, xmm4 + movq [parm5q+ 0], xmm1 + movq [parm5q+ 8], xmm3 + psrldq xmm1, 8 + movq [parm5q+16], xmm1 + movq [parm5q+24], xmm5 + ret + +;----------------------------------------------------------------------------- +; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width ) +;----------------------------------------------------------------------------- +ALIGN 16 +x264_pixel_ssim_end4_sse2: + movdqa xmm0, [parm1q+ 0] + movdqa xmm1, [parm1q+16] + movdqa xmm2, [parm1q+32] + movdqa xmm3, [parm1q+48] + movdqa xmm4, [parm1q+64] + paddd xmm0, [parm2q+ 0] + paddd xmm1, [parm2q+16] + paddd xmm2, [parm2q+32] + paddd xmm3, [parm2q+48] + paddd xmm4, [parm2q+64] + paddd xmm0, xmm1 + paddd xmm1, xmm2 + paddd xmm2, xmm3 + paddd xmm3, xmm4 + movdqa xmm5, [ssim_c1 GLOBAL] + movdqa xmm6, [ssim_c2 GLOBAL] + TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4 + +; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2 + movdqa xmm1, xmm3 + pslld xmm3, 16 + pmaddwd xmm1, xmm0 ; s1*s2 + por xmm0, xmm3 + pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2 + pslld xmm1, 1 + pslld xmm2, 7 + pslld xmm4, 6 + psubd xmm2, xmm1 ; covar*2 + psubd xmm4, xmm0
; vars + paddd xmm0, xmm5 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm4, xmm6 + cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1) + cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1) + cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2) + cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2) + mulps xmm1, xmm2 + mulps xmm0, xmm4 + divps xmm1, xmm0 ; ssim + + neg parm3d + movdqu xmm3, [mask_ff + parm3d*4 + 16 GLOBAL] + pand xmm1, xmm3 + movhlps xmm0, xmm1 + addps xmm0, xmm1 + pshuflw xmm1, xmm0, 0xE + addss xmm0, xmm1 + ret + diff --git a/common/common.c b/common/common.c index dd0825ed..e4e7af14 100644 --- a/common/common.c +++ b/common/common.c @@ -123,6 +123,7 @@ void x264_param_default( x264_param_t *param ) param->analyse.b_fast_pskip = 1; param->analyse.b_dct_decimate = 1; param->analyse.b_psnr = 1; + param->analyse.b_ssim = 1; param->i_cqm_preset = X264_CQM_FLAT; memset( param->cqm_4iy, 16, 16 ); @@ -460,6 +461,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) p->rc.psz_zones = strdup(value); OPT("psnr") p->analyse.b_psnr = atobool(value); + OPT("ssim") + p->analyse.b_ssim = atobool(value); OPT("aud") p->b_aud = atobool(value); OPT("sps-id") diff --git a/common/common.h b/common/common.h index 0dbb23b5..ce3c27b5 100644 --- a/common/common.h +++ b/common/common.h @@ -546,6 +546,7 @@ struct x264_t float f_psnr_mean_y[5]; float f_psnr_mean_u[5]; float f_psnr_mean_v[5]; + float f_ssim_mean_y[5]; /* */ int64_t i_mb_count[5][19]; int64_t i_mb_count_8x8dct[2]; diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm index 331b1848..66ee5cd0 100644 --- a/common/i386/pixel-a.asm +++ b/common/i386/pixel-a.asm @@ -490,6 +490,7 @@ cglobal x264_intra_satd_x3_8x8c_mmxext cglobal x264_intra_satd_x3_16x16_mmxext cglobal x264_intra_sa8d_x3_8x8_core_mmxext +cglobal x264_pixel_ssim_4x4x2_core_mmxext %macro SAD_START 0 push ebx @@ -1571,3 +1572,66 @@ x264_intra_sa8d_x3_8x8_core_mmxext: %undef trans %undef sum + + 
+;----------------------------------------------------------------------------- +; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, +; const uint8_t *pix2, int stride2, int sums[2][4] ) +;----------------------------------------------------------------------------- +ALIGN 16 +x264_pixel_ssim_4x4x2_core_mmxext: + push ebx + push edi + mov ebx, [esp+16] + mov edx, [esp+24] + mov edi, 4 + pxor mm0, mm0 +.loop + mov eax, [esp+12] + mov ecx, [esp+20] + add eax, edi + add ecx, edi + pxor mm1, mm1 + pxor mm2, mm2 + pxor mm3, mm3 + pxor mm4, mm4 +%rep 4 + movd mm5, [eax] + movd mm6, [ecx] + punpcklbw mm5, mm0 + punpcklbw mm6, mm0 + paddw mm1, mm5 + paddw mm2, mm6 + movq mm7, mm5 + pmaddwd mm5, mm5 + pmaddwd mm7, mm6 + pmaddwd mm6, mm6 + paddd mm3, mm5 + paddd mm4, mm7 + paddd mm3, mm6 + add eax, ebx + add ecx, edx +%endrep + mov eax, [esp+28] + lea eax, [eax+edi*4] + pshufw mm5, mm1, 0xE + pshufw mm6, mm2, 0xE + paddusw mm1, mm5 + paddusw mm2, mm6 + punpcklwd mm1, mm2 + pshufw mm2, mm1, 0xE + pshufw mm5, mm3, 0xE + pshufw mm6, mm4, 0xE + paddusw mm1, mm2 + paddd mm3, mm5 + paddd mm4, mm6 + punpcklwd mm1, mm0 + punpckldq mm3, mm4 + movq [eax+0], mm1 + movq [eax+8], mm3 + sub edi, 4 + jge .loop + pop edi + pop ebx + emms + ret diff --git a/common/i386/pixel-sse2.asm b/common/i386/pixel-sse2.asm index e054df68..37977b61 100644 --- a/common/i386/pixel-sse2.asm +++ b/common/i386/pixel-sse2.asm @@ -30,7 +30,11 @@ BITS 32 SECTION_RODATA -pd_0000ffff: times 4 dd 0x0000ffff +pw_1: times 8 dw 1 +ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 +ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 +mask_ff: times 16 db 0xff + times 16 db 0 SECTION .text @@ -49,6 +53,23 @@ cglobal x264_pixel_satd_8x8_sse2 cglobal x264_pixel_satd_16x8_sse2 cglobal x264_pixel_satd_8x16_sse2 cglobal x264_pixel_satd_16x16_sse2 +cglobal x264_pixel_ssim_4x4x2_core_sse2 +cglobal x264_pixel_ssim_end4_sse2 + + +%macro SBUTTERFLY 5 + mov%1 %5, %3 + punpckl%2 %3, %4 + punpckh%2 %5, %4 
+%endmacro + +%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc + SBUTTERFLY dqa, dq, %1, %2, %5 + SBUTTERFLY dqa, dq, %3, %4, %2 + SBUTTERFLY dqa, qdq, %1, %3, %4 + SBUTTERFLY dqa, qdq, %5, %2, %3 +%endmacro + %macro SAD_INC_4x16P_SSE2 0 movdqu xmm1, [ecx] @@ -548,22 +569,14 @@ x264_pixel_ssd_16x8_sse2: paddusw %7, %4 %endmacro -%macro SUM_MM_SSE2 2 ; sum junk +%macro HADDW 2 ; sum junk ; ebx is no longer used at this point, so no push needed picgetgot ebx - ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first. - psrlw %1, 1 - movdqa %2, %1 - psrldq %1, 2 - paddusw %1, %2 - pand %1, [pd_0000ffff GOT_ebx] - movdqa %2, %1 - psrldq %1, 4 + pmaddwd %1, [pw_1 GOT_ebx] + movhlps %2, %1 paddd %1, %2 - movdqa %2, %1 - psrldq %1, 8 + pshuflw %2, %1, 0xE paddd %1, %2 - movd eax,%1 %endmacro %macro SATD_TWO_SSE2 0 @@ -586,8 +599,10 @@ x264_pixel_ssd_16x8_sse2: %endmacro %macro SATD_END 0 - SUM_MM_SSE2 xmm6, xmm7 - + ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first. 
+ psrlw xmm6, 1 + HADDW xmm6, xmm7 + movd eax, xmm6 pop ebx ret %endmacro @@ -673,3 +688,127 @@ x264_pixel_satd_8x4_sse2: SATD_END + + +;----------------------------------------------------------------------------- +; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, +; const uint8_t *pix2, int stride2, int sums[2][4] ) +;----------------------------------------------------------------------------- +ALIGN 16 +x264_pixel_ssim_4x4x2_core_sse2: + push ebx + mov eax, [esp+ 8] + mov ebx, [esp+12] + mov ecx, [esp+16] + mov edx, [esp+20] + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 +%rep 4 + movq xmm5, [eax] + movq xmm6, [ecx] + punpcklbw xmm5, xmm0 + punpcklbw xmm6, xmm0 + paddw xmm1, xmm5 + paddw xmm2, xmm6 + movdqa xmm7, xmm5 + pmaddwd xmm5, xmm5 + pmaddwd xmm7, xmm6 + pmaddwd xmm6, xmm6 + paddd xmm3, xmm5 + paddd xmm4, xmm7 + paddd xmm3, xmm6 + add eax, ebx + add ecx, edx +%endrep + ; PHADDW xmm1, xmm2 + ; PHADDD xmm3, xmm4 + mov eax, [esp+24] + picgetgot ebx + movdqa xmm7, [pw_1 GOT_ebx] + pshufd xmm5, xmm3, 0xB1 + pmaddwd xmm1, xmm7 + pmaddwd xmm2, xmm7 + pshufd xmm6, xmm4, 0xB1 + packssdw xmm1, xmm2 + paddd xmm3, xmm5 + pmaddwd xmm1, xmm7 + paddd xmm4, xmm6 + pshufd xmm1, xmm1, 0xD8 + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + punpckhdq xmm5, xmm4 + movq [eax+ 0], xmm1 + movq [eax+ 8], xmm3 + psrldq xmm1, 8 + movq [eax+16], xmm1 + movq [eax+24], xmm5 + pop ebx + ret + +;----------------------------------------------------------------------------- +; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width ) +;----------------------------------------------------------------------------- +ALIGN 16 +x264_pixel_ssim_end4_sse2: + mov eax, [esp+ 4] + mov ecx, [esp+ 8] + mov edx, [esp+12] + picpush ebx + picgetgot ebx + movdqa xmm0, [eax+ 0] + movdqa xmm1, [eax+16] + movdqa xmm2, [eax+32] + movdqa xmm3, [eax+48] + movdqa xmm4, [eax+64] + paddd xmm0, [ecx+ 0] + paddd xmm1, [ecx+16] + paddd
xmm2, [ecx+32] + paddd xmm3, [ecx+48] + paddd xmm4, [ecx+64] + paddd xmm0, xmm1 + paddd xmm1, xmm2 + paddd xmm2, xmm3 + paddd xmm3, xmm4 + movdqa xmm5, [ssim_c1 GOT_ebx] + movdqa xmm6, [ssim_c2 GOT_ebx] + TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4 + +; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2 + movdqa xmm1, xmm3 + pslld xmm3, 16 + pmaddwd xmm1, xmm0 ; s1*s2 + por xmm0, xmm3 + pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2 + pslld xmm1, 1 + pslld xmm2, 7 + pslld xmm4, 6 + psubd xmm2, xmm1 ; covar*2 + psubd xmm4, xmm0 ; vars + paddd xmm0, xmm5 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm4, xmm6 + cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1) + cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1) + cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2) + cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2) + mulps xmm1, xmm2 + mulps xmm0, xmm4 + divps xmm1, xmm0 ; ssim + + neg edx + movdqu xmm3, [mask_ff + edx*4 + 16 GOT_ebx] + pand xmm1, xmm3 + movhlps xmm0, xmm1 + addps xmm0, xmm1 + pshuflw xmm1, xmm0, 0xE + addss xmm0, xmm1 + + movd [picesp+4], xmm0 + fld dword [picesp+4] + picpop ebx + ret + diff --git a/common/i386/pixel.h b/common/i386/pixel.h index b7b8e89f..f33b22d7 100644 --- a/common/i386/pixel.h +++ b/common/i386/pixel.h @@ -98,4 +98,10 @@ void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * ); void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * ); void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * ); +void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, + const uint8_t *pix2, int stride2, int sums[2][4] ); +void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, + const uint8_t *pix2, int stride2, int sums[2][4] ); +float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width ); + #endif diff --git a/common/pixel.c b/common/pixel.c index 54b03553..fd557ff3 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -322,6 +322,84 @@ SAD_X( 8x16_vis ) SAD_X( 8x8_vis ) #endif +static
void ssim_4x4x2_core( const uint8_t *pix1, int stride1, + const uint8_t *pix2, int stride2, + int sums[2][4]) +{ /* For each of two horizontally adjacent 4x4 blocks (index z), store sums[z] = { sum of pix1, sum of pix2, sum of pix1^2 + pix2^2, sum of pix1*pix2 }. */ + int x, y, z; + for(z=0; z<2; z++) + { + uint32_t s1=0, s2=0, ss=0, s12=0; + for(y=0; y<4; y++) + for(x=0; x<4; x++) + { + int a = pix1[x+y*stride1]; + int b = pix2[x+y*stride2]; + s1 += a; + s2 += b; + ss += a*a; + ss += b*b; + s12 += a*b; + } + sums[z][0] = s1; + sums[z][1] = s2; + sums[z][2] = ss; + sums[z][3] = s12; + pix1 += 4; + pix2 += 4; + } +} + /* SSIM of one window from its raw sums: vars = 64*ss - s1^2 - s2^2 and covar = 64*s12 - s1*s2. The factor 64 and the 64 / 64*63 scaling of ssim_c1 / ssim_c2 match the 64 pixels that ssim_end4 combines from four 4x4 blocks. */ +static float ssim_end1( int s1, int s2, int ss, int s12 ) +{ + static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5); + static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5); + int vars = ss*64 - s1*s1 - s2*s2; + int covar = s12*64 - s1*s2; + return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)\ + / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2)); +} + /* Add up ssim_end1 over `width` windows. Window i merges entries i and i+1 of both sum0 and sum1, so neighbouring windows share one column of 4x4 sums. */ +static float ssim_end4( int sum0[5][4], int sum1[5][4], int width ) +{ + int i; + float ssim = 0.0; + for( i = 0; i < width; i++ ) + ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0], + sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1], + sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2], + sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] ); + return ssim; +} + /* Mean SSIM over a width x height pixel area. sums[] holds two rows of per-4x4-block sums (sum0/sum1); the two pointers are swapped before each new row of blocks is filled through pf->ssim_4x4x2_core, and pf->ssim_end4 then scores up to 4 windows per call. NOTE(review): after the >>2, (width-1)*(height-1) is 0 when either dimension was 4..7 pixels, so the final division is 0/0; confirm callers never pass such sizes. NOTE(review): the SSE2 ssim_end4 in this patch loads sum0/sum1 with movdqa, and nothing here visibly forces 16-byte alignment of the sums VLA; TODO confirm. */ +float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, + uint8_t *pix1, int stride1, + uint8_t *pix2, int stride2, + int width, int height ) +{ + int x, y, z; + float ssim = 0.0; + int sums[2][width/4+3][4]; + int (*sum0)[4] = sums[0]; + int (*sum1)[4] = sums[1]; + width >>= 2; + height >>= 2; + z = 0; + for( y = 1; y < height; y++ ) + { + for( ; z <= y; z++ ) + { + XCHG( void*, sum0, sum1 ); + for( x = 0; x < width; x+=2 ) + pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] ); + } + for( x = 0; x < width-1; x += 4 ) + ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) ); + } + return ssim / ((width-1) * (height-1)); +} + +
/**************************************************************************** * x264_pixel_init: ****************************************************************************/ @@ -348,6 +426,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_16x8] = x264_pixel_sa8d_16x8; pixf->sa8d[PIXEL_8x16] = x264_pixel_sa8d_8x16; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; + pixf->ssim_4x4x2_core = ssim_4x4x2_core; + pixf->ssim_end4 = ssim_end4; #ifdef HAVE_MMXEXT if( cpu&X264_CPU_MMX ) @@ -370,6 +450,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext; + pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext; #endif pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext; @@ -403,6 +484,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) { pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_sse2; pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_sse2; + pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; + pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; #ifdef ARCH_X86_64 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; diff --git a/common/pixel.h b/common/pixel.h index db9a5742..d6b014cf 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -67,8 +67,14 @@ typedef struct x264_pixel_cmp_t sad[7]; x264_pixel_cmp_t ssd[7]; x264_pixel_cmp_t satd[7]; + x264_pixel_cmp_t ssim[7]; x264_pixel_cmp_t sa8d[4]; x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */ + x264_pixel_cmp_t rdcmp[7]; /* either ssd or ssim for rate-distortion */ + + void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1, + const uint8_t *pix2, int stride2, int sums[2][4] ); + float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width ); /* partial distortion elimination: * terminate early if 
partial score is worse than a threshold. @@ -89,5 +95,6 @@ typedef struct void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ); +float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ); #endif diff --git a/encoder/encoder.c b/encoder/encoder.c index c503c2a5..cb98ce4a 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -363,6 +363,7 @@ static int x264_validate_parameters( x264_t *h ) h->param.rc.f_pb_factor = 1; h->param.analyse.b_transform_8x8 = 0; h->param.analyse.b_psnr = 0; + h->param.analyse.b_ssim = 0; h->param.analyse.i_chroma_qp_offset = 0; h->param.analyse.i_trellis = 0; h->param.analyse.b_fast_pskip = 0; @@ -447,6 +448,12 @@ static int x264_validate_parameters( x264_t *h ) h->param.i_sps_id &= 31; + if( h->param.i_log_level < X264_LOG_INFO ) + { + h->param.analyse.b_psnr = 0; + h->param.analyse.b_ssim = 0; + } + /* ensure the booleans are 0 or 1 so they can be used in math */ #define BOOLIFY(x) h->param.x = !!h->param.x BOOLIFY( b_cabac ); @@ -462,6 +469,13 @@ static int x264_validate_parameters( x264_t *h ) return 0; } +static void mbcmp_init( x264_t *h ) +{ + memcpy( h->pixf.mbcmp, + ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd, + sizeof(h->pixf.mbcmp) ); +} + /**************************************************************************** * x264_encoder_open: ****************************************************************************/ @@ -603,9 +617,7 @@ x264_t *x264_encoder_open ( x264_param_t *param ) x264_quant_init( h, h->param.cpu, &h->quantf ); x264_deblock_init( h->param.cpu, &h->loopf ); - memcpy( h->pixf.mbcmp, - ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? 
h->pixf.sad : h->pixf.satd, - sizeof(h->pixf.mbcmp) ); + mbcmp_init( h ); /* rate control */ if( x264_ratecontrol_new( h ) < 0 ) @@ -657,9 +669,7 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param ) h->param.analyse.intra = param->analyse.intra; h->param.analyse.inter = param->analyse.inter; - memcpy( h->pixf.mbcmp, - ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd, - sizeof(h->pixf.mbcmp) ); + mbcmp_init( h ); return x264_validate_parameters( h ); } @@ -1565,32 +1575,44 @@ do_encode: } } + psz_message[0] = '\0'; if( h->param.analyse.b_psnr ) { - int64_t i_sqe_y, i_sqe_u, i_sqe_v; + int64_t sqe[3]; - /* PSNR */ - i_sqe_y = x264_pixel_ssd_wxh( &h->pixf, frame_psnr->plane[0], frame_psnr->i_stride[0], h->fenc->plane[0], h->fenc->i_stride[0], h->param.i_width, h->param.i_height ); - i_sqe_u = x264_pixel_ssd_wxh( &h->pixf, frame_psnr->plane[1], frame_psnr->i_stride[1], h->fenc->plane[1], h->fenc->i_stride[1], h->param.i_width/2, h->param.i_height/2); - i_sqe_v = x264_pixel_ssd_wxh( &h->pixf, frame_psnr->plane[2], frame_psnr->i_stride[2], h->fenc->plane[2], h->fenc->i_stride[2], h->param.i_width/2, h->param.i_height/2); + for( i=0; i<3; i++ ) + { + sqe[i] = x264_pixel_ssd_wxh( &h->pixf, + frame_psnr->plane[i], frame_psnr->i_stride[i], + h->fenc->plane[i], h->fenc->i_stride[i], + h->param.i_width >> !!i, h->param.i_height >> !!i ); + } x264_cpu_restore( h->param.cpu ); - h->stat.i_sqe_global[i_slice_type] += i_sqe_y + i_sqe_u + i_sqe_v; - h->stat.f_psnr_average[i_slice_type] += x264_psnr( i_sqe_y + i_sqe_u + i_sqe_v, 3 * h->param.i_width * h->param.i_height / 2 ); - h->stat.f_psnr_mean_y[i_slice_type] += x264_psnr( i_sqe_y, h->param.i_width * h->param.i_height ); - h->stat.f_psnr_mean_u[i_slice_type] += x264_psnr( i_sqe_u, h->param.i_width * h->param.i_height / 4 ); - h->stat.f_psnr_mean_v[i_slice_type] += x264_psnr( i_sqe_v, h->param.i_width * h->param.i_height / 4 ); + h->stat.i_sqe_global[i_slice_type] += sqe[0] + 
sqe[1] + sqe[2]; + h->stat.f_psnr_average[i_slice_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 ); + h->stat.f_psnr_mean_y[i_slice_type] += x264_psnr( sqe[0], h->param.i_width * h->param.i_height ); + h->stat.f_psnr_mean_u[i_slice_type] += x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4 ); + h->stat.f_psnr_mean_v[i_slice_type] += x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4 ); - snprintf( psz_message, 80, " PSNR Y:%2.2f U:%2.2f V:%2.2f", - x264_psnr( i_sqe_y, h->param.i_width * h->param.i_height ), - x264_psnr( i_sqe_u, h->param.i_width * h->param.i_height / 4), - x264_psnr( i_sqe_v, h->param.i_width * h->param.i_height / 4) ); - psz_message[79] = '\0'; + snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f", + x264_psnr( sqe[0], h->param.i_width * h->param.i_height ), + x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4), + x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4) ); } - else + + if( h->param.analyse.b_ssim ) { - psz_message[0] = '\0'; + // offset by 2 pixels to avoid alignment of ssim blocks with dct blocks + float ssim_y = x264_pixel_ssim_wxh( &h->pixf, + frame_psnr->plane[0] + 2+2*frame_psnr->i_stride[0], frame_psnr->i_stride[0], + h->fenc->plane[0] + 2+2*h->fenc->i_stride[0], h->fenc->i_stride[0], + h->param.i_width-2, h->param.i_height-2 ); + h->stat.f_ssim_mean_y[i_slice_type] += ssim_y; + snprintf( psz_message + strlen(psz_message), 80 - strlen(psz_message), + " SSIM Y:%.5f", ssim_y ); } + psz_message[79] = '\0'; x264_log( h, X264_LOG_DEBUG, "frame=%4d QP=%i NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n", @@ -1783,7 +1805,14 @@ void x264_encoder_close ( x264_t *h ) } } + if( h->param.analyse.b_ssim ) + { + x264_log( h, X264_LOG_INFO, + "SSIM Mean Y:%.7f\n", + SUM3( h->stat.f_ssim_mean_y ) / i_count ); + } if( h->param.analyse.b_psnr ) + { x264_log( h, X264_LOG_INFO, "PSNR Mean Y:%6.3f U:%6.3f V:%6.3f Avg:%6.3f Global:%6.3f 
kb/s:%.2f\n", SUM3( h->stat.f_psnr_mean_y ) / i_count, @@ -1792,6 +1821,7 @@ void x264_encoder_close ( x264_t *h ) SUM3( h->stat.f_psnr_average ) / i_count, x264_psnr( SUM3( h->stat.i_sqe_global ), i_count * i_yuv_size ), f_bitrate ); + } else x264_log( h, X264_LOG_INFO, "kb/s:%.1f\n", f_bitrate ); } diff --git a/tools/checkasm.c b/tools/checkasm.c index d410de49..450d04b3 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -131,6 +131,22 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge ); report( "intra satd_x3 :" ); + if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core || + pixel_asm.ssim_end4 != pixel_ref.ssim_end4 ) + { + float res_c, res_a; + ok = 1; + x264_cpu_restore( cpu_new ); + res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28 ); + res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 ); + if( res_c != res_a ) + { + ok = 0; + fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a ); + } + report( "ssim :" ); + } + return ret; } diff --git a/x264.c b/x264.c index badb5237..4d073500 100644 --- a/x264.c +++ b/x264.c @@ -286,6 +286,7 @@ static void Help( x264_param_t *defaults, int b_longhelp ) H0( " --progress Show a progress indicator while encoding\n" ); H0( " --quiet Quiet Mode\n" ); H0( " --no-psnr Disable PSNR computation\n" ); + H0( " --no-ssim Disable SSIM computation\n" ); H0( " --threads Parallel encoding (uses slices)\n" ); H0( " --thread-input Run Avisynth in its own thread\n" ); H1( " --no-asm Disable all CPU optimizations\n" ); @@ -399,6 +400,7 @@ static int Parse( int argc, char **argv, { "threads", required_argument, NULL, 0 }, { "thread-input", no_argument, NULL, OPT_THREAD_INPUT }, { "no-psnr", no_argument, NULL, 0 }, + { "no-ssim", no_argument, NULL, 0 }, { "quiet", no_argument, NULL, OPT_QUIET }, { "verbose", no_argument, NULL, 'v' }, { "progress",no_argument, NULL, OPT_PROGRESS }, @@ -502,7 +504,6 @@ 
static int Parse( int argc, char **argv, break; case OPT_QUIET: param->i_log_level = X264_LOG_NONE; - param->analyse.b_psnr = 0; break; case 'v': param->i_log_level = X264_LOG_DEBUG; diff --git a/x264.h b/x264.h index f1d947d9..5874be69 100644 --- a/x264.h +++ b/x264.h @@ -35,7 +35,7 @@ #include -#define X264_BUILD 49 +#define X264_BUILD 50 /* x264_t: * opaque handler for decoder and encoder */ @@ -216,7 +216,8 @@ typedef struct int b_dct_decimate; /* transform coefficient thresholding on P-frames */ int i_noise_reduction; /* adaptive pseudo-deadzone */ - int b_psnr; /* Do we compute PSNR stats (save a few % of cpu) */ + int b_psnr; /* compute and print PSNR stats */ + int b_ssim; /* compute and print SSIM stats */ } analyse; /* Rate control parameters */ -- 2.39.5