From 7760f1b2e78360542e31eb55db81e84dcb4f95ac Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@videolan.org>
Date: Thu, 17 Aug 2006 21:57:59 +0000
Subject: [PATCH] SSIM computation. (default on, disable by --no-ssim)

git-svn-id: svn://svn.videolan.org/x264/trunk@554 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/amd64/pixel-sse2.asm | 180 +++++++++++++++++++++++++++++-------
 common/common.c             |   3 +
 common/common.h             |   1 +
 common/i386/pixel-a.asm     |  64 +++++++++++++
 common/i386/pixel-sse2.asm  | 169 ++++++++++++++++++++++++++++++---
 common/i386/pixel.h         |   6 ++
 common/pixel.c              |  83 +++++++++++++++++
 common/pixel.h              |   7 ++
 encoder/encoder.c           |  76 ++++++++++-----
 tools/checkasm.c            |  16 ++++
 x264.c                      |   3 +-
 x264.h                      |   5 +-
 12 files changed, 540 insertions(+), 73 deletions(-)

diff --git a/common/amd64/pixel-sse2.asm b/common/amd64/pixel-sse2.asm
index 42183658..5f17b4ee 100644
--- a/common/amd64/pixel-sse2.asm
+++ b/common/amd64/pixel-sse2.asm
@@ -30,9 +30,12 @@ BITS 64
 
 SECTION .rodata align=16
 
-pd_0000ffff: times 4 dd 0x0000ffff
-pb_1: times 16 db 1
-
+pb_1:    times 16 db 1
+pw_1:    times 8 dw 1
+ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
+ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
+mask_ff: times 16 db 0xff
+         times 16 db 0
 
 SECTION .text
 
@@ -49,6 +52,20 @@ cglobal x264_pixel_satd_16x16_sse2
 cglobal x264_pixel_sa8d_8x8_sse2
 cglobal x264_pixel_sa8d_16x16_sse2
 cglobal x264_intra_sa8d_x3_8x8_core_sse2
+cglobal x264_pixel_ssim_4x4x2_core_sse2
+cglobal x264_pixel_ssim_end4_sse2
+
+%macro HADDD 2 ; args: sum (four dwords in, total left in its low dword), junk (scratch, clobbered)
+    movhlps %2, %1          ; junk low qword = high qword of sum
+    paddd   %1, %2          ; low two dwords of sum = d0+d2, d1+d3
+    pshuflw %2, %1, 0xE     ; junk dword 0 = dword 1 of sum
+    paddd   %1, %2          ; sum dword 0 = d0+d1+d2+d3; the upper dwords are not meaningful
+%endmacro
+
+%macro HADDW 2 ; args: sum (eight words in, total left in its low dword), junk (scratch, clobbered)
+    pmaddwd %1, [pw_1 GLOBAL] ; multiply by 1 and add adjacent word pairs -> four dwords (words are read as signed)
+    HADDD   %1, %2            ; fold the four dwords into dword 0
+%endmacro
 
 %macro SAD_INC_4x16P_SSE2 0
     movdqu  xmm1,   [rdx]
@@ -217,15 +234,8 @@ x264_pixel_sad_16x8_sse2:
 %endmacro
 
 %macro SSD_END_SSE2 0
-    movdqa  xmm1,   xmm0
-    psrldq  xmm1,    8
-    paddd   xmm0,   xmm1
-
-    movdqa  xmm1,   xmm0
-    psrldq  xmm1,    4
-    paddd   xmm0,   xmm1
-
-    movd    eax,    xmm0
+    HADDD   xmm0, xmm1
+    movd    eax,  xmm0
     ret
 %endmacro
 
@@ -399,20 +409,6 @@ x264_pixel_ssd_16x8_sse2:
     paddusw %7, %4
 %endmacro
 
-%macro SUM_MM_SSE2 2    ; sum junk
-    movdqa  %2, %1
-    psrldq  %1, 2
-    paddusw %1, %2
-    pand    %1, [pd_0000ffff GLOBAL]
-    movdqa  %2, %1
-    psrldq  %1, 4
-    paddd   %1, %2
-    movdqa  %2, %1
-    psrldq  %1, 8
-    paddd   %1, %2
-    movd    eax,%1
-%endmacro
-
 %macro SATD_TWO_SSE2 0
     LOAD4x8_DIFF_SSE2
     HADAMARD4x4_TWO_SSE2        xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
@@ -430,8 +426,9 @@ x264_pixel_ssd_16x8_sse2:
 %endmacro
 
 %macro SATD_END 0
-    psrlw        xmm6, 1
-    SUM_MM_SSE2  xmm6, xmm7
+    psrlw   xmm6, 1
+    HADDW   xmm6, xmm7
+    movd    eax,  xmm6
     ret
 %endmacro
 
@@ -531,6 +528,13 @@ x264_pixel_satd_8x4_sse2:
     punpckh%2   %5, %4
 %endmacro
 
+%macro TRANSPOSE4x4D 5   ; abcd-t -> adtc : rows a,b,c,d plus temp t in; transposed rows 0..3 come out in a,d,t,c
+    SBUTTERFLY dqa, dq,  %1, %2, %5   ; dword interleave of a,b: low halves -> a, high halves -> t
+    SBUTTERFLY dqa, dq,  %3, %4, %2   ; dword interleave of c,d: low halves -> c, high halves -> b
+    SBUTTERFLY dqa, qdq, %1, %3, %4   ; qword interleave of a,c: low -> a (row 0), high -> d (row 1)
+    SBUTTERFLY dqa, qdq, %5, %2, %3   ; qword interleave of t,b: low -> t (row 2), high -> c (row 3)
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; input ABCDEFGH output AFHDTECB 
 ;-----------------------------------------------------------------------------
@@ -593,7 +597,8 @@ x264_pixel_sa8d_8x8_sse2:
     SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
     SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
     psrlw           xmm10, 1
-    SUM_MM_SSE2     xmm10, xmm0
+    HADDW           xmm10, xmm0
+    movd eax, xmm10
     add r8d, eax ; preserve rounding for 16x16
     add eax, 1
     shr eax, 1
@@ -695,17 +700,128 @@ x264_intra_sa8d_x3_8x8_core_sse2:
     psubw       xmm0, xmm1  ; 8x1 sum
     SUM1x8_SSE2 xmm0, xmm1, xmm2
 
-    SUM_MM_SSE2 xmm14, xmm3
+    HADDW       xmm14, xmm3
+    movd        eax, xmm14
     add         eax, 2
     shr         eax, 2
     mov         [parm3q+4], eax ; i8x8_h sa8d
-    SUM_MM_SSE2 xmm15, xmm4
+    HADDW       xmm15, xmm4
+    movd        eax, xmm15
     add         eax, 2
     shr         eax, 2
     mov         [parm3q+8], eax ; i8x8_dc sa8d
-    SUM_MM_SSE2 xmm2, xmm5
+    HADDW       xmm2, xmm5
+    movd        eax, xmm2
     add         eax, 2
     shr         eax, 2
     mov         [parm3q+0], eax ; i8x8_v sa8d
 
     ret
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
+;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_ssim_4x4x2_core_sse2:
+    movsxd    parm2q, parm2d    ; strides are C ints: the upper 32 bits of the arg registers are
+    movsxd    parm4q, parm4d    ; undefined, so sign-extend before using them in 64-bit pointer math
+    pxor      xmm0, xmm0        ; zero, used to widen bytes to words
+    pxor      xmm1, xmm1        ; per-column sum of pix1 (words)
+    pxor      xmm2, xmm2        ; per-column sum of pix2 (words)
+    pxor      xmm3, xmm3        ; sum of a*a + b*b (dwords, one per column pair)
+    pxor      xmm4, xmm4        ; sum of a*b (dwords, one per column pair)
+    movdqa    xmm8, [pw_1 GLOBAL]
+%rep 4
+    movq      xmm5, [parm1q]    ; 8 pixels = one row of the two adjacent 4x4 blocks
+    movq      xmm6, [parm3q]
+    punpcklbw xmm5, xmm0
+    punpcklbw xmm6, xmm0
+    paddw     xmm1, xmm5
+    paddw     xmm2, xmm6
+    movdqa    xmm7, xmm5
+    pmaddwd   xmm5, xmm5        ; a*a, adjacent pairs added
+    pmaddwd   xmm7, xmm6        ; a*b, adjacent pairs added
+    pmaddwd   xmm6, xmm6        ; b*b, adjacent pairs added
+    paddd     xmm3, xmm5
+    paddd     xmm4, xmm7
+    paddd     xmm3, xmm6
+    add       parm1q, parm2q    ; next row
+    add       parm3q, parm4q
+%endrep
+    ; PHADDW xmm1, xmm2
+    ; PHADDD xmm3, xmm4
+    pshufd    xmm5, xmm3, 0xB1  ; swap adjacent dwords
+    pmaddwd   xmm1, xmm8        ; word pairs -> dwords
+    pmaddwd   xmm2, xmm8
+    pshufd    xmm6, xmm4, 0xB1
+    packssdw  xmm1, xmm2        ; words: four pix1 pair sums, then four pix2 pair sums
+    paddd     xmm3, xmm5        ; dwords: ss blk0, ss blk0, ss blk1, ss blk1
+    pmaddwd   xmm1, xmm8        ; dwords: s1 blk0, s1 blk1, s2 blk0, s2 blk1
+    paddd     xmm4, xmm6        ; dwords: s12 blk0, s12 blk0, s12 blk1, s12 blk1
+    pshufd    xmm1, xmm1, 0xD8  ; reorder to s1 blk0, s2 blk0, s1 blk1, s2 blk1
+    movdqa    xmm5, xmm3
+    punpckldq xmm3, xmm4        ; low qword = ss blk0, s12 blk0
+    punpckhdq xmm5, xmm4        ; low qword = ss blk1, s12 blk1
+    movq      [parm5q+ 0], xmm1 ; sums[0][0..1] = s1, s2
+    movq      [parm5q+ 8], xmm3 ; sums[0][2..3] = ss, s12
+    psrldq    xmm1, 8
+    movq      [parm5q+16], xmm1 ; sums[1][0..1] = s1, s2
+    movq      [parm5q+24], xmm5 ; sums[1][2..3] = ss, s12
+    ret
+
+;-----------------------------------------------------------------------------
+; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_ssim_end4_sse2:
+    movdqa   xmm0, [parm1q+ 0]   ; aligned loads: sum0/sum1 must be 16-byte aligned
+    movdqa   xmm1, [parm1q+16]
+    movdqa   xmm2, [parm1q+32]
+    movdqa   xmm3, [parm1q+48]
+    movdqa   xmm4, [parm1q+64]
+    paddd    xmm0, [parm2q+ 0]   ; xmmN = sum0[N] + sum1[N]  (vertical pair of blocks)
+    paddd    xmm1, [parm2q+16]
+    paddd    xmm2, [parm2q+32]
+    paddd    xmm3, [parm2q+48]
+    paddd    xmm4, [parm2q+64]
+    paddd    xmm0, xmm1          ; xmmN += entry N+1  (horizontal pair) -> 4 windows of 8x8 pixels
+    paddd    xmm1, xmm2
+    paddd    xmm2, xmm3
+    paddd    xmm3, xmm4
+    movdqa   xmm5, [ssim_c1 GLOBAL]
+    movdqa   xmm6, [ssim_c2 GLOBAL]
+    TRANSPOSE4x4D  xmm0, xmm1, xmm2, xmm3, xmm4
+;   s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2   (one lane per window)
+    movdqa   xmm1, xmm3
+    pslld    xmm3, 16
+    pmaddwd  xmm1, xmm0  ; s1*s2  (16-bit multiply: s1,s2 <= 64*255 fit in a signed word)
+    por      xmm0, xmm3  ; low word = s1, high word = s2
+    pmaddwd  xmm0, xmm0  ; s1*s1 + s2*s2
+    pslld    xmm1, 1     ; s1*s2*2
+    pslld    xmm2, 7     ; s12*128
+    pslld    xmm4, 6     ; ss*64
+    psubd    xmm2, xmm1  ; covar*2
+    psubd    xmm4, xmm0  ; vars
+    paddd    xmm0, xmm5
+    paddd    xmm1, xmm5
+    paddd    xmm2, xmm6
+    paddd    xmm4, xmm6
+    cvtdq2ps xmm0, xmm0  ; (float)(s1*s1 + s2*s2 + ssim_c1)
+    cvtdq2ps xmm1, xmm1  ; (float)(s1*s2*2 + ssim_c1)
+    cvtdq2ps xmm2, xmm2  ; (float)(covar*2 + ssim_c2)
+    cvtdq2ps xmm4, xmm4  ; (float)(vars + ssim_c2)
+    mulps    xmm1, xmm2
+    mulps    xmm0, xmm4
+    divps    xmm1, xmm0  ; ssim
+    movsxd   parm3q, parm3d               ; width is a C int: upper register bits are undefined
+    neg      parm3q                       ; index = -width, as a full 64-bit value
+    lea      rax,  [mask_ff + 16 GLOBAL]  ; table base first: an index cannot be combined with a RIP-relative address
+    movdqu   xmm3, [rax + parm3q*4]       ; 4*width bytes of 0xff followed by zeros
+    pand     xmm1, xmm3  ; keep only the first `width` lanes
+    movhlps  xmm0, xmm1
+    addps    xmm0, xmm1  ; lanes 0,1 = l0+l2, l1+l3
+    pshuflw  xmm1, xmm0, 0xE ; lane 0 of xmm1 = lane 1 of xmm0
+    addss    xmm0, xmm1  ; float result in the low lane of xmm0
+    ret
+
diff --git a/common/common.c b/common/common.c
index dd0825ed..e4e7af14 100644
--- a/common/common.c
+++ b/common/common.c
@@ -123,6 +123,7 @@ void    x264_param_default( x264_param_t *param )
     param->analyse.b_fast_pskip = 1;
     param->analyse.b_dct_decimate = 1;
     param->analyse.b_psnr = 1;
+    param->analyse.b_ssim = 1;
 
     param->i_cqm_preset = X264_CQM_FLAT;
     memset( param->cqm_4iy, 16, 16 );
@@ -460,6 +461,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         p->rc.psz_zones = strdup(value);
     OPT("psnr")
         p->analyse.b_psnr = atobool(value);
+    OPT("ssim")
+        p->analyse.b_ssim = atobool(value);
     OPT("aud")
         p->b_aud = atobool(value);
     OPT("sps-id")
diff --git a/common/common.h b/common/common.h
index 0dbb23b5..ce3c27b5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -546,6 +546,7 @@ struct x264_t
         float   f_psnr_mean_y[5];
         float   f_psnr_mean_u[5];
         float   f_psnr_mean_v[5];
+        float   f_ssim_mean_y[5];
         /* */
         int64_t i_mb_count[5][19];
         int64_t i_mb_count_8x8dct[2];
diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm
index 331b1848..66ee5cd0 100644
--- a/common/i386/pixel-a.asm
+++ b/common/i386/pixel-a.asm
@@ -490,6 +490,7 @@ cglobal x264_intra_satd_x3_8x8c_mmxext
 cglobal x264_intra_satd_x3_16x16_mmxext
 cglobal x264_intra_sa8d_x3_8x8_core_mmxext
 
+cglobal x264_pixel_ssim_4x4x2_core_mmxext
 
 %macro SAD_START 0
     push    ebx
@@ -1571,3 +1572,66 @@ x264_intra_sa8d_x3_8x8_core_mmxext:
 %undef trans
 %undef sum
 
+
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
+;                                         const uint8_t *pix2, int stride2, int sums[2][4] )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_ssim_4x4x2_core_mmxext:
+    push      ebx
+    push      edi
+    mov       ebx, [esp+16]     ; stride1 (args start at esp+12 after the two pushes)
+    mov       edx, [esp+24]     ; stride2
+    mov       edi, 4            ; byte offset of the current 4x4 block: 4 first, then 0
+    pxor      mm0, mm0          ; zero, used to widen bytes to words
+.loop:
+    mov       eax, [esp+12]     ; pix1
+    mov       ecx, [esp+20]     ; pix2
+    add       eax, edi
+    add       ecx, edi
+    pxor      mm1, mm1          ; per-column sum of pix1 (words)
+    pxor      mm2, mm2          ; per-column sum of pix2 (words)
+    pxor      mm3, mm3          ; sum of a*a + b*b (two dwords)
+    pxor      mm4, mm4          ; sum of a*b (two dwords)
+%rep 4
+    movd      mm5, [eax]        ; one 4-pixel row of each block
+    movd      mm6, [ecx]
+    punpcklbw mm5, mm0
+    punpcklbw mm6, mm0
+    paddw     mm1, mm5
+    paddw     mm2, mm6
+    movq      mm7, mm5
+    pmaddwd   mm5, mm5          ; a*a, adjacent pairs added
+    pmaddwd   mm7, mm6          ; a*b, adjacent pairs added
+    pmaddwd   mm6, mm6          ; b*b, adjacent pairs added
+    paddd     mm3, mm5
+    paddd     mm4, mm7
+    paddd     mm3, mm6
+    add       eax, ebx          ; next row
+    add       ecx, edx
+%endrep
+    mov       eax, [esp+28]     ; sums
+    lea       eax, [eax+edi*4]  ; &sums[1] when edi=4, &sums[0] when edi=0
+    pshufw    mm5, mm1, 0xE     ; bring the high dword down to the low dword
+    pshufw    mm6, mm2, 0xE
+    paddusw   mm1, mm5          ; low two words = c0+c2, c1+c3
+    paddusw   mm2, mm6
+    punpcklwd mm1, mm2          ; interleave the pix1 / pix2 partial sums
+    pshufw    mm2, mm1, 0xE
+    pshufw    mm5, mm3, 0xE
+    pshufw    mm6, mm4, 0xE
+    paddusw   mm1, mm2          ; word 0 = s1, word 1 = s2
+    paddd     mm3, mm5          ; low dword = ss
+    paddd     mm4, mm6          ; low dword = s12
+    punpcklwd mm1, mm0          ; widen to dwords: s1, s2
+    punpckldq mm3, mm4          ; dwords: ss, s12
+    movq  [eax+0], mm1
+    movq  [eax+8], mm3
+    sub       edi, 4
+    jge       .loop             ; second pass with edi=0, exit once edi goes negative
+    pop       edi
+    pop       ebx
+    emms
+    ret
diff --git a/common/i386/pixel-sse2.asm b/common/i386/pixel-sse2.asm
index e054df68..37977b61 100644
--- a/common/i386/pixel-sse2.asm
+++ b/common/i386/pixel-sse2.asm
@@ -30,7 +30,11 @@ BITS 32
 
 SECTION_RODATA
 
-pd_0000ffff: times 4 dd 0x0000ffff
+pw_1:    times 8 dw 1
+ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
+ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
+mask_ff: times 16 db 0xff
+         times 16 db 0
 
 
 SECTION .text
@@ -49,6 +53,23 @@ cglobal x264_pixel_satd_8x8_sse2
 cglobal x264_pixel_satd_16x8_sse2
 cglobal x264_pixel_satd_8x16_sse2
 cglobal x264_pixel_satd_16x16_sse2
+cglobal x264_pixel_ssim_4x4x2_core_sse2
+cglobal x264_pixel_ssim_end4_sse2
+
+
+%macro SBUTTERFLY 5   ; args: mov suffix, punpck suffix, a (in/out), b (in), t (out)
+    mov%1       %5, %3   ; t = copy of a
+    punpckl%2   %3, %4   ; a = low halves of a and b, interleaved
+    punpckh%2   %5, %4   ; t = high halves of a and b, interleaved
+%endmacro
+
+%macro TRANSPOSE4x4D 5   ; abcd-t -> adtc : rows a,b,c,d plus temp t in; transposed rows 0..3 come out in a,d,t,c
+    SBUTTERFLY dqa, dq,  %1, %2, %5   ; dword interleave of a,b: low halves -> a, high halves -> t
+    SBUTTERFLY dqa, dq,  %3, %4, %2   ; dword interleave of c,d: low halves -> c, high halves -> b
+    SBUTTERFLY dqa, qdq, %1, %3, %4   ; qword interleave of a,c: low -> a (row 0), high -> d (row 1)
+    SBUTTERFLY dqa, qdq, %5, %2, %3   ; qword interleave of t,b: low -> t (row 2), high -> c (row 3)
+%endmacro
+
 
 %macro SAD_INC_4x16P_SSE2 0
     movdqu  xmm1,   [ecx]
@@ -548,22 +569,14 @@ x264_pixel_ssd_16x8_sse2:
     paddusw %7, %4
 %endmacro
 
-%macro SUM_MM_SSE2 2    ; sum junk
+%macro HADDW 2    ; sum junk
     ; ebx is no longer used at this point, so no push needed
     picgetgot ebx
-    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
-    psrlw   %1, 1
-    movdqa  %2, %1
-    psrldq  %1, 2
-    paddusw %1, %2
-    pand    %1, [pd_0000ffff GOT_ebx]
-    movdqa  %2, %1
-    psrldq  %1, 4
+    pmaddwd %1, [pw_1 GOT_ebx]
+    movhlps %2, %1
     paddd   %1, %2
-    movdqa  %2, %1
-    psrldq  %1, 8
+    pshuflw %2, %1, 0xE 
     paddd   %1, %2
-    movd    eax,%1
 %endmacro
 
 %macro SATD_TWO_SSE2 0
@@ -586,8 +599,10 @@ x264_pixel_ssd_16x8_sse2:
 %endmacro
 
 %macro SATD_END 0
-    SUM_MM_SSE2  xmm6, xmm7
-
+    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+    psrlw   xmm6, 1
+    HADDW   xmm6, xmm7
+    movd    eax,  xmm6
     pop     ebx
     ret
 %endmacro
@@ -673,3 +688,127 @@ x264_pixel_satd_8x4_sse2:
 
     SATD_END
 
+
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
+;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_ssim_4x4x2_core_sse2:
+    push      ebx
+    mov       eax,  [esp+ 8]    ; pix1 (args start at esp+8 after the push)
+    mov       ebx,  [esp+12]    ; stride1
+    mov       ecx,  [esp+16]    ; pix2
+    mov       edx,  [esp+20]    ; stride2
+    pxor      xmm0, xmm0        ; zero, used to widen bytes to words
+    pxor      xmm1, xmm1        ; per-column sum of pix1 (words)
+    pxor      xmm2, xmm2        ; per-column sum of pix2 (words)
+    pxor      xmm3, xmm3        ; sum of a*a + b*b (dwords, one per column pair)
+    pxor      xmm4, xmm4        ; sum of a*b (dwords, one per column pair)
+%rep 4
+    movq      xmm5, [eax]       ; 8 pixels = one row of the two adjacent 4x4 blocks
+    movq      xmm6, [ecx]
+    punpcklbw xmm5, xmm0
+    punpcklbw xmm6, xmm0
+    paddw     xmm1, xmm5
+    paddw     xmm2, xmm6
+    movdqa    xmm7, xmm5
+    pmaddwd   xmm5, xmm5        ; a*a, adjacent pairs added
+    pmaddwd   xmm7, xmm6        ; a*b, adjacent pairs added
+    pmaddwd   xmm6, xmm6        ; b*b, adjacent pairs added
+    paddd     xmm3, xmm5
+    paddd     xmm4, xmm7
+    paddd     xmm3, xmm6
+    add       eax,  ebx         ; next row
+    add       ecx,  edx
+%endrep
+    ; PHADDW xmm1, xmm2
+    ; PHADDD xmm3, xmm4
+    mov       eax,  [esp+24]    ; sums
+    picgetgot ebx               ; stride1 is dead now; ebx is reused (presumably as the GOT base for PIC builds)
+    movdqa    xmm7, [pw_1 GOT_ebx]
+    pshufd    xmm5, xmm3, 0xB1  ; swap adjacent dwords
+    pmaddwd   xmm1, xmm7        ; word pairs -> dwords
+    pmaddwd   xmm2, xmm7
+    pshufd    xmm6, xmm4, 0xB1
+    packssdw  xmm1, xmm2        ; words: four pix1 pair sums, then four pix2 pair sums
+    paddd     xmm3, xmm5        ; dwords: ss blk0, ss blk0, ss blk1, ss blk1
+    pmaddwd   xmm1, xmm7        ; dwords: s1 blk0, s1 blk1, s2 blk0, s2 blk1
+    paddd     xmm4, xmm6        ; dwords: s12 blk0, s12 blk0, s12 blk1, s12 blk1
+    pshufd    xmm1, xmm1, 0xD8  ; reorder to s1 blk0, s2 blk0, s1 blk1, s2 blk1
+    movdqa    xmm5, xmm3
+    punpckldq xmm3, xmm4        ; low qword = ss blk0, s12 blk0
+    punpckhdq xmm5, xmm4        ; low qword = ss blk1, s12 blk1
+    movq      [eax+ 0], xmm1    ; sums[0][0..1] = s1, s2
+    movq      [eax+ 8], xmm3    ; sums[0][2..3] = ss, s12
+    psrldq    xmm1, 8
+    movq      [eax+16], xmm1    ; sums[1][0..1] = s1, s2
+    movq      [eax+24], xmm5    ; sums[1][2..3] = ss, s12
+    pop       ebx
+    ret
+
+;-----------------------------------------------------------------------------
+; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_ssim_end4_sse2:
+    mov      eax,  [esp+ 4]      ; sum0 (args are read before picpush, so offsets are from the entry esp)
+    mov      ecx,  [esp+ 8]      ; sum1
+    mov      edx,  [esp+12]      ; width
+    picpush  ebx
+    picgetgot ebx
+    movdqa   xmm0, [eax+ 0]      ; aligned loads - NOTE(review): requires sum0/sum1 to be 16-byte aligned; confirm at the caller
+    movdqa   xmm1, [eax+16]
+    movdqa   xmm2, [eax+32]
+    movdqa   xmm3, [eax+48]
+    movdqa   xmm4, [eax+64]
+    paddd    xmm0, [ecx+ 0]      ; xmmN = sum0[N] + sum1[N]
+    paddd    xmm1, [ecx+16]
+    paddd    xmm2, [ecx+32]
+    paddd    xmm3, [ecx+48]
+    paddd    xmm4, [ecx+64]
+    paddd    xmm0, xmm1          ; xmmN += entry N+1 -> four overlapping windows
+    paddd    xmm1, xmm2
+    paddd    xmm2, xmm3
+    paddd    xmm3, xmm4
+    movdqa   xmm5, [ssim_c1 GOT_ebx]
+    movdqa   xmm6, [ssim_c2 GOT_ebx]
+    TRANSPOSE4x4D  xmm0, xmm1, xmm2, xmm3, xmm4
+
+;   s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2   (one lane per window)
+    movdqa   xmm1, xmm3
+    pslld    xmm3, 16
+    pmaddwd  xmm1, xmm0  ; s1*s2  (16-bit multiply, so s1 and s2 must fit in a signed word)
+    por      xmm0, xmm3  ; low word = s1, high word = s2
+    pmaddwd  xmm0, xmm0  ; s1*s1 + s2*s2
+    pslld    xmm1, 1     ; s1*s2*2
+    pslld    xmm2, 7     ; s12*128
+    pslld    xmm4, 6     ; ss*64
+    psubd    xmm2, xmm1  ; covar*2
+    psubd    xmm4, xmm0  ; vars
+    paddd    xmm0, xmm5
+    paddd    xmm1, xmm5
+    paddd    xmm2, xmm6
+    paddd    xmm4, xmm6
+    cvtdq2ps xmm0, xmm0  ; (float)(s1*s1 + s2*s2 + ssim_c1)
+    cvtdq2ps xmm1, xmm1  ; (float)(s1*s2*2 + ssim_c1)
+    cvtdq2ps xmm2, xmm2  ; (float)(covar*2 + ssim_c2)
+    cvtdq2ps xmm4, xmm4  ; (float)(vars + ssim_c2)
+    mulps    xmm1, xmm2
+    mulps    xmm0, xmm4
+    divps    xmm1, xmm0  ; ssim
+
+    neg      edx         ; index = -width
+    movdqu   xmm3, [mask_ff + edx*4 + 16 GOT_ebx] ; 4*width bytes of 0xff followed by zeros
+    pand     xmm1, xmm3  ; keep only the first `width` lanes
+    movhlps  xmm0, xmm1
+    addps    xmm0, xmm1  ; lanes 0,1 = l0+l2, l1+l3
+    pshuflw  xmm1, xmm0, 0xE ; lane 0 of xmm1 = lane 1 of xmm0
+    addss    xmm0, xmm1  ; total in the low lane
+
+    movd     [picesp+4], xmm0    ; spill over the first argument slot (picesp presumably accounts for the pushed ebx)
+    fld      dword [picesp+4]    ; float results are returned in st0 on i386
+    picpop   ebx
+    ret
+
diff --git a/common/i386/pixel.h b/common/i386/pixel.h
index b7b8e89f..f33b22d7 100644
--- a/common/i386/pixel.h
+++ b/common/i386/pixel.h
@@ -98,4 +98,10 @@ void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * );
 void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * );
 void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
 
+void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
+                                        const uint8_t *pix2, int stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
+                                      const uint8_t *pix2, int stride2, int sums[2][4] );
+float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
+
 #endif
diff --git a/common/pixel.c b/common/pixel.c
index 54b03553..fd557ff3 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -322,6 +322,84 @@ SAD_X( 8x16_vis )
 SAD_X( 8x8_vis )
 #endif
 
+static void ssim_4x4x2_core( const uint8_t *pix1, int stride1, /* C reference: raw SSIM sums for two */
+                             const uint8_t *pix2, int stride2, /* horizontally adjacent 4x4 blocks   */
+                             int sums[2][4])                   /* out: {s1, s2, ss, s12} per block   */
+{
+    int x, y, z;
+    for(z=0; z<2; z++)          /* z selects the left (0) or right (1) 4x4 block */
+    {
+        uint32_t s1=0, s2=0, ss=0, s12=0;
+        for(y=0; y<4; y++)
+            for(x=0; x<4; x++)
+            {
+                int a = pix1[x+y*stride1];
+                int b = pix2[x+y*stride2];
+                s1  += a;       /* sum of pix1 */
+                s2  += b;       /* sum of pix2 */
+                ss  += a*a;     /* sum of squares of both images combined */
+                ss  += b*b;
+                s12 += a*b;     /* sum of cross products */
+            }
+        sums[z][0] = s1;
+        sums[z][1] = s2;
+        sums[z][2] = ss;
+        sums[z][3] = s12;
+        pix1 += 4;              /* advance to the next block, 4 pixels to the right */
+        pix2 += 4;
+    }
+}
+
+static float ssim_end1( int s1, int s2, int ss, int s12 ) /* SSIM of one window from its accumulated sums */
+{
+    static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5);    /* same value as ssim_c1 in the asm (416) */
+    static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5); /* same value as ssim_c2 in the asm (235963) */
+    int vars = ss*64 - s1*s1 - s2*s2;   /* scaled sum of both variances; the 64 presumably is the sample count of the window */
+    int covar = s12*64 - s1*s2;         /* covariance, same scaling */
+    return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)\
+           / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
+}
+
+static float ssim_end4( int sum0[5][4], int sum1[5][4], int width ) /* C reference: sum of SSIM over `width` windows */
+{
+    int i;
+    float ssim = 0.0;
+    for( i = 0; i < width; i++ ) /* window i merges entries i and i+1 of both rows, hence up to 5 entries for 4 windows */
+        ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0],   /* s1  */
+                           sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1],   /* s2  */
+                           sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2],   /* ss  */
+                           sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] ); /* s12 */
+    return ssim;                 /* not averaged: the caller divides by the window count */
+}
+
+float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,  /* mean SSIM over a width x height pixel area, */
+                           uint8_t *pix1, int stride1, /* using pf->ssim_4x4x2_core and pf->ssim_end4 */
+                           uint8_t *pix2, int stride2,
+                           int width, int height )
+{
+    int x, y, z;
+    float ssim = 0.0;
+    int sums[2][width/4+3][4]; /* two rows of per-4x4-block sums; NOTE(review): the SSE2 ssim_end4 reads these with movdqa - confirm this VLA is 16-byte aligned */
+    int (*sum0)[4] = sums[0];
+    int (*sum1)[4] = sums[1];
+    width >>= 2;               /* from here on, width and height are counted in 4x4 blocks */
+    height >>= 2;
+    z = 0;                     /* next block row whose sums have not been computed yet */
+    for( y = 1; y < height; y++ )
+    {
+        for( ; z <= y; z++ )   /* two rows on the first pass, one row on each later pass */
+        {
+            XCHG( void*, sum0, sum1 ); /* sum1 keeps the previous row, sum0 receives row z */
+            for( x = 0; x < width; x+=2 ) /* each core call fills sum0[x] and sum0[x+1] */
+                pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] );
+        }
+        for( x = 0; x < width-1; x += 4 ) /* up to 4 windows per call; the last call may cover fewer */
+            ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
+    }
+    return ssim / ((width-1) * (height-1)); /* average over all windows */
+}
+
+
 /****************************************************************************
  * x264_pixel_init:
  ****************************************************************************/
@@ -348,6 +426,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->sa8d[PIXEL_16x8] = x264_pixel_sa8d_16x8;
     pixf->sa8d[PIXEL_8x16] = x264_pixel_sa8d_8x16;
     pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8;
+    pixf->ssim_4x4x2_core = ssim_4x4x2_core;
+    pixf->ssim_end4 = ssim_end4;
 
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMX )
@@ -370,6 +450,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
+        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_mmxext;
 #endif
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
         pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmxext;
@@ -403,6 +484,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     {
         pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_sse2;
         pixf->ssd[PIXEL_16x8]  = x264_pixel_ssd_16x8_sse2;
+        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
+        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
 
 #ifdef ARCH_X86_64
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
diff --git a/common/pixel.h b/common/pixel.h
index db9a5742..d6b014cf 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -67,8 +67,14 @@ typedef struct
     x264_pixel_cmp_t  sad[7];
     x264_pixel_cmp_t  ssd[7];
     x264_pixel_cmp_t satd[7];
+    x264_pixel_cmp_t ssim[7];
     x264_pixel_cmp_t sa8d[4];
     x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
+    x264_pixel_cmp_t rdcmp[7]; /* either ssd or ssim for rate-distortion */
+
+    void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
+                             const uint8_t *pix2, int stride2, int sums[2][4] );
+    float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width );
 
     /* partial distortion elimination:
      * terminate early if partial score is worse than a threshold.
@@ -89,5 +95,6 @@ typedef struct
 
 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
 int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
+float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
 
 #endif
diff --git a/encoder/encoder.c b/encoder/encoder.c
index c503c2a5..cb98ce4a 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -363,6 +363,7 @@ static int x264_validate_parameters( x264_t *h )
         h->param.rc.f_pb_factor = 1;
         h->param.analyse.b_transform_8x8 = 0;
         h->param.analyse.b_psnr = 0;
+        h->param.analyse.b_ssim = 0;
         h->param.analyse.i_chroma_qp_offset = 0;
         h->param.analyse.i_trellis = 0;
         h->param.analyse.b_fast_pskip = 0;
@@ -447,6 +448,12 @@ static int x264_validate_parameters( x264_t *h )
 
     h->param.i_sps_id &= 31;
 
+    if( h->param.i_log_level < X264_LOG_INFO )
+    {
+        h->param.analyse.b_psnr = 0;
+        h->param.analyse.b_ssim = 0;
+    }
+
     /* ensure the booleans are 0 or 1 so they can be used in math */
 #define BOOLIFY(x) h->param.x = !!h->param.x
     BOOLIFY( b_cabac );
@@ -462,6 +469,13 @@ static int x264_validate_parameters( x264_t *h )
     return 0;
 }
 
+static void mbcmp_init( x264_t *h ) /* fill h->pixf.mbcmp with either the sad or the satd function table */
+{
+    memcpy( h->pixf.mbcmp,          /* sad when lossless or subpel_refine <= 1, satd otherwise */
+            ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd,
+            sizeof(h->pixf.mbcmp) );
+}
+
 /****************************************************************************
  * x264_encoder_open:
  ****************************************************************************/
@@ -603,9 +617,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
     x264_quant_init( h, h->param.cpu, &h->quantf );
     x264_deblock_init( h->param.cpu, &h->loopf );
 
-    memcpy( h->pixf.mbcmp,
-            ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd,
-            sizeof(h->pixf.mbcmp) );
+    mbcmp_init( h );
 
     /* rate control */
     if( x264_ratecontrol_new( h ) < 0 )
@@ -657,9 +669,7 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
     h->param.analyse.intra = param->analyse.intra;
     h->param.analyse.inter = param->analyse.inter;
 
-    memcpy( h->pixf.mbcmp,
-            ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd,
-            sizeof(h->pixf.mbcmp) );
+    mbcmp_init( h );
 
     return x264_validate_parameters( h );
 }
@@ -1565,32 +1575,44 @@ do_encode:
         }
     }
 
+    psz_message[0] = '\0';
     if( h->param.analyse.b_psnr )
     {
-        int64_t i_sqe_y, i_sqe_u, i_sqe_v;
+        int64_t sqe[3];
 
-        /* PSNR */
-        i_sqe_y = x264_pixel_ssd_wxh( &h->pixf, frame_psnr->plane[0], frame_psnr->i_stride[0], h->fenc->plane[0], h->fenc->i_stride[0], h->param.i_width, h->param.i_height );
-        i_sqe_u = x264_pixel_ssd_wxh( &h->pixf, frame_psnr->plane[1], frame_psnr->i_stride[1], h->fenc->plane[1], h->fenc->i_stride[1], h->param.i_width/2, h->param.i_height/2);
-        i_sqe_v = x264_pixel_ssd_wxh( &h->pixf, frame_psnr->plane[2], frame_psnr->i_stride[2], h->fenc->plane[2], h->fenc->i_stride[2], h->param.i_width/2, h->param.i_height/2);
+        for( i=0; i<3; i++ )
+        {
+            sqe[i] = x264_pixel_ssd_wxh( &h->pixf,
+                         frame_psnr->plane[i], frame_psnr->i_stride[i],
+                         h->fenc->plane[i], h->fenc->i_stride[i],
+                         h->param.i_width >> !!i, h->param.i_height >> !!i );
+        }
         x264_cpu_restore( h->param.cpu );
 
-        h->stat.i_sqe_global[i_slice_type] += i_sqe_y + i_sqe_u + i_sqe_v;
-        h->stat.f_psnr_average[i_slice_type] += x264_psnr( i_sqe_y + i_sqe_u + i_sqe_v, 3 * h->param.i_width * h->param.i_height / 2 );
-        h->stat.f_psnr_mean_y[i_slice_type] += x264_psnr( i_sqe_y, h->param.i_width * h->param.i_height );
-        h->stat.f_psnr_mean_u[i_slice_type] += x264_psnr( i_sqe_u, h->param.i_width * h->param.i_height / 4 );
-        h->stat.f_psnr_mean_v[i_slice_type] += x264_psnr( i_sqe_v, h->param.i_width * h->param.i_height / 4 );
+        h->stat.i_sqe_global[i_slice_type] += sqe[0] + sqe[1] + sqe[2];
+        h->stat.f_psnr_average[i_slice_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
+        h->stat.f_psnr_mean_y[i_slice_type] += x264_psnr( sqe[0], h->param.i_width * h->param.i_height );
+        h->stat.f_psnr_mean_u[i_slice_type] += x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4 );
+        h->stat.f_psnr_mean_v[i_slice_type] += x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4 );
 
-        snprintf( psz_message, 80, " PSNR Y:%2.2f U:%2.2f V:%2.2f",
-                  x264_psnr( i_sqe_y, h->param.i_width * h->param.i_height ),
-                  x264_psnr( i_sqe_u, h->param.i_width * h->param.i_height / 4),
-                  x264_psnr( i_sqe_v, h->param.i_width * h->param.i_height / 4) );
-        psz_message[79] = '\0';
+        snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f",
+                  x264_psnr( sqe[0], h->param.i_width * h->param.i_height ),
+                  x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4),
+                  x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4) );
     }
-    else
+
+    if( h->param.analyse.b_ssim )
     {
-        psz_message[0] = '\0';
+        // offset by 2 pixels to avoid alignment of ssim blocks with dct blocks
+        float ssim_y = x264_pixel_ssim_wxh( &h->pixf,
+                         frame_psnr->plane[0] + 2+2*frame_psnr->i_stride[0], frame_psnr->i_stride[0],
+                         h->fenc->plane[0] + 2+2*h->fenc->i_stride[0], h->fenc->i_stride[0],
+                         h->param.i_width-2, h->param.i_height-2 );
+        h->stat.f_ssim_mean_y[i_slice_type] += ssim_y;
+        snprintf( psz_message + strlen(psz_message), 80 - strlen(psz_message),
+                  " SSIM Y:%.5f", ssim_y );
     }
+    psz_message[79] = '\0';
     
     x264_log( h, X264_LOG_DEBUG,
                   "frame=%4d QP=%i NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",
@@ -1783,7 +1805,14 @@ void    x264_encoder_close  ( x264_t *h )
             }
         }
 
+        if( h->param.analyse.b_ssim )
+        {
+            x264_log( h, X264_LOG_INFO,
+                      "SSIM Mean Y:%.7f\n",
+                      SUM3( h->stat.f_ssim_mean_y ) / i_count );
+        }
         if( h->param.analyse.b_psnr )
+        {
             x264_log( h, X264_LOG_INFO,
                       "PSNR Mean Y:%6.3f U:%6.3f V:%6.3f Avg:%6.3f Global:%6.3f kb/s:%.2f\n",
                       SUM3( h->stat.f_psnr_mean_y ) / i_count,
@@ -1792,6 +1821,7 @@ void    x264_encoder_close  ( x264_t *h )
                       SUM3( h->stat.f_psnr_average ) / i_count,
                       x264_psnr( SUM3( h->stat.i_sqe_global ), i_count * i_yuv_size ),
                       f_bitrate );
+        }
         else
             x264_log( h, X264_LOG_INFO, "kb/s:%.1f\n", f_bitrate );
     }
diff --git a/tools/checkasm.c b/tools/checkasm.c
index d410de49..450d04b3 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -131,6 +131,22 @@ static int check_pixel( int cpu_ref, int cpu_new )
     TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
     report( "intra satd_x3 :" );
 
+    if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
+        pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
+    {
+        float res_c, res_a;
+        ok = 1;
+        x264_cpu_restore( cpu_new );
+        res_c = x264_pixel_ssim_wxh( &pixel_c,   buf1+2, 32, buf2+2, 32, 32, 28 );
+        res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
+        if( res_c != res_a )
+        {
+            ok = 0;
+            fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
+        }
+        report( "ssim :" );
+    }
+
     return ret;
 }
 
diff --git a/x264.c b/x264.c
index badb5237..4d073500 100644
--- a/x264.c
+++ b/x264.c
@@ -286,6 +286,7 @@ static void Help( x264_param_t *defaults, int b_longhelp )
     H0( "      --progress              Show a progress indicator while encoding\n" );
     H0( "      --quiet                 Quiet Mode\n" );
     H0( "      --no-psnr               Disable PSNR computation\n" );
+    H0( "      --no-ssim               Disable SSIM computation\n" );
     H0( "      --threads <integer>     Parallel encoding (uses slices)\n" );
     H0( "      --thread-input          Run Avisynth in its own thread\n" );
     H1( "      --no-asm                Disable all CPU optimizations\n" );
@@ -399,6 +400,7 @@ static int  Parse( int argc, char **argv,
             { "threads", required_argument, NULL, 0 },
             { "thread-input", no_argument,  NULL, OPT_THREAD_INPUT },
             { "no-psnr", no_argument,       NULL, 0 },
+            { "no-ssim", no_argument,       NULL, 0 },
             { "quiet",   no_argument,       NULL, OPT_QUIET },
             { "verbose", no_argument,       NULL, 'v' },
             { "progress",no_argument,       NULL, OPT_PROGRESS },
@@ -502,7 +504,6 @@ static int  Parse( int argc, char **argv,
                 break;
             case OPT_QUIET:
                 param->i_log_level = X264_LOG_NONE;
-                param->analyse.b_psnr = 0;
                 break;
             case 'v':
                 param->i_log_level = X264_LOG_DEBUG;
diff --git a/x264.h b/x264.h
index f1d947d9..5874be69 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 49
+#define X264_BUILD 50
 
 /* x264_t:
  *      opaque handler for decoder and encoder */
@@ -216,7 +216,8 @@ typedef struct
         int          b_dct_decimate; /* transform coefficient thresholding on P-frames */
         int          i_noise_reduction; /* adaptive pseudo-deadzone */
 
-        int          b_psnr;    /* Do we compute PSNR stats (save a few % of cpu) */
+        int          b_psnr;    /* compute and print PSNR stats */
+        int          b_ssim;    /* compute and print SSIM stats */
     } analyse;
 
     /* Rate control parameters */
-- 
2.39.5