;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;* Authors: Alex Izvorski <aizvorski@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"
SECTION .rodata align=16

pw_1:    times 8 dw 1        ; assumed declaration: referenced by HADDW and the
                             ; ssim core below, but missing from this table
ssim_c1: times 4 dd 416      ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963   ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
         times 16 db 0       ; assumed zero tail: ssim_end4 reads a 16-byte
                             ; window straddling the ff/00 boundary

SECTION .text
cglobal x264_pixel_sad_16x16_sse2
cglobal x264_pixel_sad_16x8_sse2
cglobal x264_pixel_ssd_16x16_sse2
cglobal x264_pixel_ssd_16x8_sse2
cglobal x264_pixel_satd_8x4_sse2
cglobal x264_pixel_satd_8x8_sse2
cglobal x264_pixel_satd_16x8_sse2
cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2
cglobal x264_pixel_sa8d_8x8_sse2
cglobal x264_pixel_sa8d_16x16_sse2
cglobal x264_intra_sa8d_x3_8x8_core_sse2
cglobal x264_pixel_ssim_4x4x2_core_sse2
cglobal x264_pixel_ssim_end4_sse2
; horizontal add of a whole register; the HADDD body is the standard SSE2
; reduction, assumed here (of the original lines only the pw_1 fold survives,
; which belongs to HADDW)
%macro HADDD 2 ; sum junk
    movhlps %2, %1
    paddd   %1, %2
    pshuflw %2, %1, 0xE
    paddd   %1, %2            ; low dword of %1 = sum of all four dwords
%endmacro

%macro HADDW 2 ; sum junk
    pmaddwd %1, [pw_1 GLOBAL] ; fold adjacent words into dwords (w0*1 + w1*1)
    HADDD   %1, %2
%endmacro
%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
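; psadbw is the workhorse here: it takes the absolute differences of 16
; unsigned bytes against memory in a single instruction and sums them into
; two 16-bit partial sums (one per 64-bit half), so each 16-pixel row of
; SAD costs one op plus its load.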
%macro SAD_START_SSE2 0
    movsxd  rsi, esi        ; stride1
    movsxd  rcx, ecx        ; stride2
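; the stride arguments arrive as 32-bit ints, and the SysV ABI leaves the
; upper halves of their registers undefined, so they must be sign-extended
; before use in 64-bit address arithmetic.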
;-----------------------------------------------------------------------------
;   int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x16_sse2:
    movdqu  xmm1, [rdx+rcx]
    movdqu  xmm3, [rdx+rcx]
    psadbw  xmm1, [rdi+rsi]
    psadbw  xmm3, [rdi+rsi]
    movdqu  xmm5, [rdx+rcx]
    movdqu  xmm7, [rdx+rcx]
    psadbw  xmm5, [rdi+rsi]
    psadbw  xmm7, [rdi+rsi]
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
    movdqu  xmm6, [rdx+rcx]
    movdqu  xmm1, [rdx+rcx]
    psadbw  xmm6, [rdi+rsi]
    psadbw  xmm1, [rdi+rsi]
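; the 16 rows are handled without a loop: successive row pairs go to
; different xmm accumulators so the unaligned loads and psadbw ops can
; overlap in flight, with the partial sums only merged at the end.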
;-----------------------------------------------------------------------------
;   int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x8_sse2:
%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm3, [rdi+rsi]
    movdqu  xmm4, [rdx+rcx]

%macro SSD_INC_8x16P_SSE2 0
%macro SSD_START_SSE2 0
;   mov     rdi, rdi        ; pix1
    movsxd  rsi, esi        ; stride1
;   mov     rdx, rdx        ; pix2
    movsxd  rcx, ecx        ; stride2
    pxor    xmm7, xmm7      ; zero
    pxor    xmm0, xmm0      ; xmm0 holds the sum
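; sketch of the elided inner step: SSE2 has no widening byte subtract, so
; each 16-pixel row is unpacked to words against the zero register (xmm7),
; differenced with psubw, then squared-and-accumulated with pmaddwd into
; the dword sums kept in xmm0.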
%macro SSD_END_SSE2 0

;-----------------------------------------------------------------------------
;   int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x16_sse2:

;-----------------------------------------------------------------------------
;   int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x8_sse2:
; %1=(row2, row0) %2=(row3, row1) %3=junk
; output in %1=(row3, row0) and %3=(row2, row1)
%macro HADAMARD4x4_SSE2 3
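; in sketch: with two rows packed per register, one paddw/psubw pair on the
; (row2,row0)/(row3,row1) inputs yields (r2+r3, r0+r1) and (r2-r3, r0-r1)
; at once; a quadword swap plus a second add/sub pair completes the 4-point
; Hadamard butterfly, leaving rows in the swapped order noted above.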
;;; two HADAMARD4x4_SSE2 running side-by-side
%macro HADAMARD4x4_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
%macro TRANSPOSE4x4_TWIST_SSE2 3 ; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
    punpckhwd %2, %3        ; backwards because the high quadwords are already swapped

;;; two TRANSPOSE4x4_TWIST_SSE2 running side-by-side
%macro TRANSPOSE4x4_TWIST_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
;;; the value in xmm7 doesn't matter: it's only subtracted from itself
%macro LOAD4x8_DIFF_SSE2 0
    punpcklqdq xmm0, xmm2   ; rows 0 and 2
    punpckhqdq xmm4, xmm2   ; next 4x4 rows 0 and 2
    punpcklqdq xmm1, xmm3   ; rows 1 and 3
    punpckhqdq xmm5, xmm3   ; next 4x4 rows 1 and 3
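; an 8-pixel-wide difference covers two horizontally adjacent 4x4 blocks, so
; each register ends up holding one row pair from each block; packing rows
; as (0,2) and (1,3) matches exactly what HADAMARD4x4_TWO_SSE2 expects, so
; every 128-bit op below processes both blocks at once.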
%macro SUM1x8_SSE2 3 ; 01 junk sum

%macro SUM4x4_SSE2 4 ; 02 13 junk sum

;;; two SUM4x4_SSE2 running side-by-side
%macro SUM4x4_TWO_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
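; SSE2 has no packed absolute value (pabsw only arrives with SSSE3), so
; these sums presumably use the usual abs(x) = pmaxsw(x, 0-x) trick on the
; transformed coefficients before paddw-ing them into the running total.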
%macro SATD_TWO_SSE2 0
    HADAMARD4x4_TWO_SSE2        xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
    TRANSPOSE4x4_TWIST_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    HADAMARD4x4_TWO_SSE2        xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    SUM4x4_TWO_SSE2             xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
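; SATD in a nutshell: Hadamard-transform the residual rows, transpose,
; transform again (completing the 2D transform), then sum the absolute
; coefficients into xmm6. Each SATD_TWO_SSE2 invocation disposes of two
; 4x4 blocks, i.e. one 8x4 strip of the partition.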
;   mov     rdi, rdi        ; pix1
    movsxd  rsi, esi        ; stride1
;   mov     rdx, rdx        ; pix2
    movsxd  rcx, ecx        ; stride2
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x16_sse2:

;-----------------------------------------------------------------------------
;   int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x16_sse2:

;-----------------------------------------------------------------------------
;   int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x8_sse2:

;-----------------------------------------------------------------------------
;   int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x8_sse2:

;-----------------------------------------------------------------------------
;   int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x4_sse2:
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
    SBUTTERFLY dqa, dq,  %1, %2, %5
    SBUTTERFLY dqa, dq,  %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3
%endmacro
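; SBUTTERFLY (from amd64inc.asm) saves one operand into the spare register,
; then interleaves the pair with punpckl/punpckh; two rounds of dword
; interleave followed by two of qword interleave give a full transpose of
; the dword lanes, with the rows landing in the permuted registers the
; header notes (abcd -> adtc).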
;-----------------------------------------------------------------------------
; input ABCDEFGH output AFHDTECB
;-----------------------------------------------------------------------------
%macro TRANSPOSE8x8 9
    SBUTTERFLY dqa, wd,  %1, %2, %9
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %9, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro
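; letting the transposed rows land wherever the butterfly network drops
; them (the AFHDTECB permutation above) avoids a round of register-to-
; register moves; callers simply name the registers in that permuted order
; on the next pass, as the sa8d functions below do.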
%macro HADAMARD1x8 8 ; assumed header: the call sites pass eight registers
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%endmacro
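; each SUMSUB_BADC (from amd64inc.asm) replaces its two register pairs with
; their sum and difference; three rounds at pair distances 4, 2 and 1 make
; up the 1D 8-point Hadamard butterfly, applied to all eight lanes at once.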
;-----------------------------------------------------------------------------
;   int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sa8d_8x8_sse2:
    LOAD_DIFF_8P xmm0, xmm8, [parm1q],          [parm3q]  ; r10/r11 = 3*stride (set in the elided prologue)
    LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
    LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10],      [parm3q+r11]
    lea  parm1q, [parm1q+4*parm2q]
    lea  parm3q, [parm3q+4*parm4q]
    LOAD_DIFF_8P xmm4, xmm8, [parm1q],          [parm3q]
    LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
    LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10],      [parm3q+r11]
    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    HADAMARD1x8  xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1 ; args follow TRANSPOSE8x8's AFHDTECB output order
    SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
    SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
    add  r8d, eax           ; preserve rounding for 16x16
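; sa8d is normalized as (sum+2)>>2; the raw sum is also accumulated in r8d
; so the 16x16 wrapper can round the grand total once at the end instead
; of rounding each 8x8 quadrant separately.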
;-----------------------------------------------------------------------------
;   int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
x264_pixel_sa8d_16x16_sse2:
    call x264_pixel_sa8d_8x8_sse2 ; pix[0]
    lea  parm1q, [parm1q+4*parm2q]
    lea  parm3q, [parm3q+4*parm4q]
    call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride]
    lea  r10, [3*parm2q-2]
    lea  r11, [3*parm4q-2]
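; each 8x8 call leaves the pointers 12 rows ahead (8 advanced here, 4 inside
; the callee), so rewinding to pix[8] takes 12*stride-8 bytes: (3*stride-2)
; presumably shifted left by 2 and subtracted in the lines elided here.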
    call x264_pixel_sa8d_8x8_sse2 ; pix[8]
    lea  parm1q, [parm1q+4*parm2q]
    lea  parm3q, [parm3q+4*parm4q]
    call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8]
;-----------------------------------------------------------------------------
;   void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
x264_intra_sa8d_x3_8x8_core_sse2:
    movq xmm0, [parm1q+0*FENC_STRIDE]
    movq xmm7, [parm1q+1*FENC_STRIDE]
    movq xmm6, [parm1q+2*FENC_STRIDE]
    movq xmm3, [parm1q+3*FENC_STRIDE]
    movq xmm5, [parm1q+4*FENC_STRIDE]
    movq xmm1, [parm1q+5*FENC_STRIDE]
    movq xmm8, [parm1q+6*FENC_STRIDE]
    movq xmm2, [parm1q+7*FENC_STRIDE]
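; the destination registers look scrambled, but they are chosen so that
; after TRANSPOSE8x8's AFHDTECB permutation the eight rows come out in
; plain xmm0..xmm7 order for the second HADAMARD1x8 below.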
    HADAMARD1x8  xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
    TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
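; key idea: the Hadamard transform is linear, and the V/H/DC predictions
; are constant along columns, rows, or everywhere, so their transforms are
; nonzero only in the first row, first column, or DC coefficient. One
; transform of fenc therefore serves all three sa8d scores; only the
; affected coefficients are corrected with the edge terms below.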
    movzx edi, word [parm2q+0]
    add   di,  word [parm2q+16]
    SUM4x4_TWO_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
    SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
    SUM1x8_SSE2 xmm8, xmm10, xmm15
    movdqa xmm14, xmm15     ; 7x8 sum
    movdqa xmm8, [parm2q+0] ; left edge
    SUM1x8_SSE2 xmm8, xmm10, xmm14
    SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum
    punpcklqdq xmm0, xmm4   ; transpose
    movdqa xmm1, [parm2q+16] ; top edge
    psrldq xmm2, 2          ; 8x7 sum
    psubw  xmm0, xmm1       ; 8x1 sum
    SUM1x8_SSE2 xmm0, xmm1, xmm2
    mov [parm3q+4], eax     ; i8x8_h sa8d
    mov [parm3q+8], eax     ; i8x8_dc sa8d
    mov [parm3q+0], eax     ; i8x8_v sa8d
;-----------------------------------------------------------------------------
;   void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                         const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
x264_pixel_ssim_4x4x2_core_sse2:
    movdqa xmm8, [pw_1 GLOBAL]
    pshufd xmm5, xmm3, 0xB1
    pshufd xmm6, xmm4, 0xB1
    pshufd xmm1, xmm1, 0xD8
    movq [parm5q+ 0], xmm1
    movq [parm5q+ 8], xmm3
    movq [parm5q+16], xmm1
    movq [parm5q+24], xmm5
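; for each of the two adjacent 4x4 blocks this writes the four statistics
; the SSIM formula needs, {s1, s2, ss, s12} = {sum(a), sum(b),
; sum(a^2)+sum(b^2), sum(a*b)}, into sums[0][] and sums[1][]; ssim_end4
; then consumes them four blocks at a time.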
;-----------------------------------------------------------------------------
;   float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
x264_pixel_ssim_end4_sse2:
    movdqa xmm0, [parm1q+ 0]
    movdqa xmm1, [parm1q+16]
    movdqa xmm2, [parm1q+32]
    movdqa xmm3, [parm1q+48]
    movdqa xmm4, [parm1q+64]
    paddd  xmm0, [parm2q+ 0]
    paddd  xmm1, [parm2q+16]
    paddd  xmm2, [parm2q+32]
    paddd  xmm3, [parm2q+48]
    paddd  xmm4, [parm2q+64]
    movdqa xmm5, [ssim_c1 GLOBAL]
    movdqa xmm6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
;   s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2
    pmaddwd xmm1, xmm0      ; s1*s2
    pmaddwd xmm0, xmm0      ; s1*s1 + s2*s2
    psubd   xmm2, xmm1      ; covar*2
    psubd   xmm4, xmm0      ; vars
    cvtdq2ps xmm0, xmm0     ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps xmm1, xmm1     ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps xmm2, xmm2     ; (float)(covar*2 + ssim_c2)
    cvtdq2ps xmm4, xmm4     ; (float)(vars + ssim_c2)
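; four blocks' scores are computed in parallel, one per dword lane:
;   ssim = (2*s1*s2 + ssim_c1)*(covar*2 + ssim_c2)
;        / ((s1*s1 + s2*s2 + ssim_c1)*(vars + ssim_c2))
; with vars = ss*64 - s1*s1 - s2*s2 and covar = s12*64 - s1*s2. pmaddwd
; works as the 32-bit multiply here because s1 and s2 (sums of 16 8-bit
; pixels) fit in the low 16 bits of their lanes.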
    divps    xmm1, xmm0     ; ssim
    movdqu xmm3, [mask_ff + parm3q*4 + 16 GLOBAL]
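; tail handling without a per-lane branch: indexing into the ff/00 table
; with the (presumably negated) width selects a 16-byte window whose first
; `width` dwords are all-ones and the rest zero, so a pand can discard the
; lanes past `width` before the final horizontal add.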
    pshuflw xmm1, xmm0, 0xE