;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*          Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

SECTION .rodata align=16
ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
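; worked out: ssim_c1 = (.01*255)^2 * 64      = 416.16    -> 416
;             ssim_c2 = (.03*255)^2 * 64 * 63 = 235962.69 -> 235963
; (the standard SSIM constants (K*L)^2, prescaled by the sample-count
; factors used in the end4 routine so all divisions happen at the end)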
%macro HADDD 2 ; sum junk

    pmaddwd %1, [pw_1 GLOBAL]
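    ; pmaddwd against a vector of ones multiplies each word by 1 and adds
    ; adjacent pairs, widening eight words into four dwords: the first step
    ; of a horizontal sum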
%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
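; As a reference, a minimal C sketch of what this routine computes (the
; function name here is illustrative, not x264's C API). The asm below
; unrolls the loop and lets psadbw sum 16 absolute byte differences per
; instruction:
;
;     #include <stdint.h>
;     static int sad_16x16( uint8_t *pix1, int i_stride1,
;                           uint8_t *pix2, int i_stride2 )
;     {
;         int i_sum = 0;
;         for( int y = 0; y < 16; y++, pix1 += i_stride1, pix2 += i_stride2 )
;             for( int x = 0; x < 16; x++ )
;             {
;                 int d = pix1[x] - pix2[x];
;                 i_sum += d < 0 ? -d : d;
;             }
;         return i_sum;
;     }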
cglobal x264_pixel_sad_16x16_sse2
    movdqu  xmm1, [rdx+rcx]
    movdqu  xmm3, [rdx+rcx]
    psadbw  xmm1, [rdi+rsi]
    psadbw  xmm3, [rdi+rsi]
    movdqu  xmm5, [rdx+rcx]
    movdqu  xmm7, [rdx+rcx]
    psadbw  xmm5, [rdi+rsi]
    psadbw  xmm7, [rdi+rsi]
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
    movdqu  xmm6, [rdx+rcx]
    movdqu  xmm1, [rdx+rcx]
    psadbw  xmm6, [rdi+rsi]
    psadbw  xmm1, [rdi+rsi]
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x8_sse2
%macro SAD_X3_START_1x16P 0
    movdqa  xmm3, [parm1q]
    movdqu  xmm0, [parm2q]
    movdqu  xmm1, [parm3q]
    movdqu  xmm2, [parm4q]
%macro SAD_X3_1x16P 2
    movdqa  xmm3, [parm1q+%1]
    movdqu  xmm4, [parm2q+%2]
    movdqu  xmm5, [parm3q+%2]
    movdqu  xmm6, [parm4q+%2]
%macro SAD_X3_2x16P 1
    SAD_X3_1x16P FENC_STRIDE, parm5q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm5q]
    lea     parm3q, [parm3q+2*parm5q]
    lea     parm4q, [parm4q+2*parm5q]
%macro SAD_X4_START_1x16P 0
    movdqa  xmm7, [parm1q]
    movdqu  xmm0, [parm2q]
    movdqu  xmm1, [parm3q]
    movdqu  xmm2, [parm4q]
    movdqu  xmm3, [parm5q]
%macro SAD_X4_1x16P 2
    movdqa  xmm7, [parm1q+%1]
    movdqu  xmm4, [parm2q+%2]
    movdqu  xmm5, [parm3q+%2]
    movdqu  xmm6, [parm4q+%2]
    movdqu  xmm8, [parm5q+%2]
%macro SAD_X4_2x16P 1
    SAD_X4_1x16P FENC_STRIDE, parm6q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm6q]
    lea     parm3q, [parm3q+2*parm6q]
    lea     parm4q, [parm4q+2*parm6q]
    lea     parm5q, [parm5q+2*parm6q]
    movd    [parm6q+0], xmm0
    movd    [parm6q+4], xmm1
    movd    [parm6q+8], xmm2
;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                    uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
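; Semantically, x3 evaluates three candidate blocks against the same fenc,
; amortizing the fenc loads. A hedged C sketch in terms of the sad_16x16
; sketch above; FENC_STRIDE is x264's constant encode-buffer stride, which
; is why fenc is read with aligned movdqa while the candidates use movdqu:
;
;     static void sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                               uint8_t *pix2, int i_stride, int scores[3] )
;     {
;         scores[0] = sad_16x16( fenc, FENC_STRIDE, pix0, i_stride );
;         scores[1] = sad_16x16( fenc, FENC_STRIDE, pix1, i_stride );
;         scores[2] = sad_16x16( fenc, FENC_STRIDE, pix2, i_stride );
;     }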
cglobal x264_pixel_sad_x%1_%2x%3_sse2
%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm3, [rdi+rsi]
    movdqu  xmm4, [rdx+rcx]
%macro SSD_START_SSE2 0
    pxor    xmm7, xmm7 ; zero
    pxor    xmm0, xmm0 ; xmm0 holds the sum

%macro SSD_END_SSE2 0
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
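; Reference C sketch (name illustrative): sum of squared differences. The
; SIMD version processes 16 pixels per iteration after widening bytes to
; words:
;
;     static int ssd_16x16( uint8_t *pix1, int i_stride1,
;                           uint8_t *pix2, int i_stride2 )
;     {
;         int i_sum = 0;
;         for( int y = 0; y < 16; y++, pix1 += i_stride1, pix2 += i_stride2 )
;             for( int x = 0; x < 16; x++ )
;             {
;                 int d = pix1[x] - pix2[x];
;                 i_sum += d*d;
;             }
;         return i_sum;
;     }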
cglobal x264_pixel_ssd_16x16_sse2
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x8_sse2
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4

    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
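; SUMSUB_BADC performs two butterflies: one register of each pair gets the
; sum, the other the difference (its sign is irrelevant under the later
; absolute sum). Chaining log2(n) rounds of butterflies gives an n-point
; Hadamard transform; a scalar sketch of the 4-point stage built above:
;
;     static void hadamard1x4( int16_t d[4] )
;     {
;         int16_t s01 = d[0]+d[1], t01 = d[0]-d[1];
;         int16_t s23 = d[2]+d[3], t23 = d[2]-d[3];
;         d[0] = s01+s23; d[2] = s01-s23;
;         d[1] = t01+t23; d[3] = t01-t23;
;     }
;
; (math only; the macros above run this on 8 words per register, and the
; 8-point version adds a third butterfly round)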
;;; row transform not used, because phaddw is much slower than paddw on a Conroe

;%macro HADAMARD4x1_SSSE3 5 ; ABCD-T -> ADTC
;    PHSUMSUB %1, %2, %5
;    PHSUMSUB %3, %4, %2
;    PHSUMSUB %1, %3, %4
;    PHSUMSUB %5, %2, %3
%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers
%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
    SBUTTERFLY dqa, dq,  %1, %2, %5
    SBUTTERFLY dqa, dq,  %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3
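; SBUTTERFLY is the interleave step of these transposes: effectively a
; punpckl/punpckh pair at the named granularity. E.g. for dword rows
; A=(a0 a1 a2 a3), B=(b0 b1 b2 b3):
;     punpckldq A, B -> (a0 b0 a1 b1)
;     punpckhdq A, B -> (a2 b2 a3 b3)
; and a second pass at qword granularity completes the 4x4 transpose, which
; is why results land in permuted registers (ABCD-T -> ADTC) rather than in
; place.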
%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
    SBUTTERFLY  dqa, wd,  %1, %2, %5
    SBUTTERFLY  dqa, wd,  %3, %4, %2
    SBUTTERFLY  dqa, dq,  %1, %3, %4
    SBUTTERFLY2 dqa, dq,  %5, %2, %3
    SBUTTERFLY  dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3
%macro TRANSPOSE8x8 9 ; ABCDEFGH-T -> AFHDTECB
    SBUTTERFLY dqa, wd,  %1, %2, %9
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %9, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]

%macro LOAD_DIFF_4x8P 6 ; 4x dest, 2x temp
    LOAD_DIFF_8P %1, %5, [parm1q],          [parm3q]
    LOAD_DIFF_8P %2, %6, [parm1q+parm2q],   [parm3q+parm4q]
    LOAD_DIFF_8P %3, %5, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P %4, %6, [parm1q+r10],      [parm3q+r11]
%macro SUM1x8_SSE2 3 ; 01 junk sum

%macro SUM4x4_SSE2 4 ; 02 13 junk sum

%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum

%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . sum
%macro SATD_TWO_SSE2 0
    LOAD_DIFF_4x8P  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    lea             parm1q, [parm1q+4*parm2q]
    lea             parm3q, [parm3q+4*parm4q]
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    SUM8x4          xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
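; Reference C for SATD (a hedged sketch): the sum of absolute values of the
; 2D Hadamard transform of the residual, halved. The asm mirrors this per
; pass: diff, 1D Hadamard, transpose, 1D Hadamard, absolute sum.
;
;     static int satd_4x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;     {
;         int16_t d[4][4], t[4];
;         int i_satd = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;                 d[y][x] = pix1[x+y*i_pix1] - pix2[x+y*i_pix2];
;         for( int y = 0; y < 4; y++ )
;             hadamard1x4( d[y] );                      // rows (sketch above)
;         for( int x = 0; x < 4; x++ )
;         {
;             for( int y = 0; y < 4; y++ ) t[y] = d[y][x];
;             hadamard1x4( t );                         // columns
;             for( int y = 0; y < 4; y++ ) i_satd += t[y] < 0 ? -t[y] : t[y];
;         }
;         return i_satd / 2;
;     }
;
; larger sizes sum satd_4x4 over their 4x4 blocks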
cglobal x264_pixel_satd_16x16_%1
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x16_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x8_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x8_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x4_%1
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
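; sa8d is the 8x8 analog of satd: an 8x8 Hadamard of the residual, then a
; sum of absolute coefficients with a final normalization. A scalar sketch
; of the 1D 8-point transform (output order differs from HADAMARD1x8, which
; is harmless under the absolute sum):
;
;     static void hadamard1x8( int16_t d[8] )
;     {
;         for( int stride = 1; stride < 8; stride *= 2 ) // 3 butterfly rounds
;             for( int i = 0; i < 8; i += 2*stride )
;                 for( int j = i; j < i+stride; j++ )
;                 {
;                     int16_t a = d[j], b = d[j+stride];
;                     d[j]        = a+b;
;                     d[j+stride] = a-b;
;                 }
;     }
;
; The 16x16 version below accumulates the four unnormalized 8x8 sums in r8d
; and rounds only once at the end; that is what the "preserve rounding"
; comment refers to.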
cglobal x264_pixel_sa8d_8x8_%1
    LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    LOAD_DIFF_4x8P xmm4, xmm5, xmm6, xmm7, xmm8, xmm8

    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    HADAMARD1x8  xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1

    SUM8x4 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
    SUM8x4 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10

    add     r8d, eax ; preserve rounding for 16x16
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
cglobal x264_pixel_sa8d_16x16_%1
    call    x264_pixel_sa8d_8x8_%1 ; pix[0]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    call    x264_pixel_sa8d_8x8_%1 ; pix[8*stride]
    lea     r10, [3*parm2q-2]
    lea     r11, [3*parm4q-2]
    call    x264_pixel_sa8d_8x8_%1 ; pix[8]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    call    x264_pixel_sa8d_8x8_%1 ; pix[8*stride+8]
%define SUM8x4 SUM8x4_SSE2

%define SUM8x4 SUM8x4_SSSE3
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
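; How this works (as reconstructed from the code below): the Hadamard
; transform H is linear, so sa8d(fenc - pred) = sum |H(fenc) - H(pred)|,
; and for the three candidate predictions H(pred) is sparse. A V prediction
; is constant down each column, so H(pred) is nonzero only in the first row;
; an H prediction is nonzero only in the first column; DC only in the single
; DC coefficient. The core therefore transforms fenc once and patches those
; few coefficients using the pretransformed edge sums passed in edges[2][8].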
cglobal x264_intra_sa8d_x3_8x8_core_sse2
    movq    xmm0, [parm1q+0*FENC_STRIDE]
    movq    xmm7, [parm1q+1*FENC_STRIDE]
    movq    xmm6, [parm1q+2*FENC_STRIDE]
    movq    xmm3, [parm1q+3*FENC_STRIDE]
    movq    xmm5, [parm1q+4*FENC_STRIDE]
    movq    xmm1, [parm1q+5*FENC_STRIDE]
    movq    xmm8, [parm1q+6*FENC_STRIDE]
    movq    xmm2, [parm1q+7*FENC_STRIDE]

    HADAMARD1x8  xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
    TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    movzx   edi, word [parm2q+0]
    add     di,  word [parm2q+16]

    SUM8x4_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
    SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
    SUM1x8_SSE2 xmm8, xmm10, xmm15
    movdqa  xmm14, xmm15 ; 7x8 sum

    movdqa  xmm8, [parm2q+0] ; left edge
    SUM1x8_SSE2 xmm8, xmm10, xmm14
    SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum

    punpcklqdq xmm0, xmm4 ; transpose
    movdqa  xmm1, [parm2q+16] ; top edge
    psrldq  xmm2, 2 ; 8x7 sum
    psubw   xmm0, xmm1 ; 8x1 sum
    SUM1x8_SSE2 xmm0, xmm1, xmm2

    mov     [parm3q+4], eax ; i8x8_h sa8d
    mov     [parm3q+8], eax ; i8x8_dc sa8d
    mov     [parm3q+0], eax ; i8x8_v sa8d
;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
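; Reference C (a hedged sketch matching the prototype above): for two
; horizontally adjacent 4x4 blocks, accumulate the four SSIM statistics
; s1 = sum(a), s2 = sum(b), ss = sum(a*a)+sum(b*b), s12 = sum(a*b):
;
;     static void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                                  const uint8_t *pix2, int stride2, int sums[2][4] )
;     {
;         for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;         {
;             int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;             for( int y = 0; y < 4; y++ )
;                 for( int x = 0; x < 4; x++ )
;                 {
;                     int a = pix1[x+y*stride1];
;                     int b = pix2[x+y*stride2];
;                     s1  += a;   s2 += b;
;                     ss  += a*a + b*b;
;                     s12 += a*b;
;                 }
;             sums[z][0] = s1; sums[z][1] = s2;
;             sums[z][2] = ss; sums[z][3] = s12;
;         }
;     }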
cglobal x264_pixel_ssim_4x4x2_core_sse2
    movdqa  xmm8, [pw_1 GLOBAL]
    pshufd  xmm5, xmm3, 0xB1
    pshufd  xmm6, xmm4, 0xB1
    pshufd  xmm1, xmm1, 0xD8
    movq    [parm5q+ 0], xmm1
    movq    [parm5q+ 8], xmm3
    movq    [parm5q+16], xmm1
    movq    [parm5q+24], xmm5
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
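; Per-block reference C (a hedged sketch; the asm below computes four blocks
; at once, matching its inline comments). With statistics over 64 samples
; and the prescaled constants from .rodata:
;
;     #define ssim_c1 416     // (.01*255)^2 * 64
;     #define ssim_c2 235963  // (.03*255)^2 * 64*63
;     static float ssim_end1( int s1, int s2, int ss, int s12 )
;     {
;         int vars  = ss*64  - s1*s1 - s2*s2;
;         int covar = s12*64 - s1*s2;
;         return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
;              / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
;     }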
cglobal x264_pixel_ssim_end4_sse2
    movdqa  xmm0, [parm1q+ 0]
    movdqa  xmm1, [parm1q+16]
    movdqa  xmm2, [parm1q+32]
    movdqa  xmm3, [parm1q+48]
    movdqa  xmm4, [parm1q+64]
    paddd   xmm0, [parm2q+ 0]
    paddd   xmm1, [parm2q+16]
    paddd   xmm2, [parm2q+32]
    paddd   xmm3, [parm2q+48]
    paddd   xmm4, [parm2q+64]
    movdqa  xmm5, [ssim_c1 GLOBAL]
    movdqa  xmm6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4

    ; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2
    pmaddwd xmm1, xmm0 ; s1*s2
    pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2

    psubd   xmm2, xmm1 ; covar*2
    psubd   xmm4, xmm0 ; vars

    cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)

    divps   xmm1, xmm0 ; ssim
    lea     rax,  [mask_ff + 16 GLOBAL]
    movdqu  xmm3, [rax + parm3q*4]
    movdqu  xmm3, [mask_ff + parm3q*4 + 16]

    pshuflw xmm1, xmm0, 0xE