;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*          Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

SECTION .rodata align=16

ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
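
; The two SSIM constants above are the standard stabilizers C1 = (0.01*255)^2
; and C2 = (0.03*255)^2, pre-scaled to match sums over 64 pixels rather than
; means (a sketch of the arithmetic, not from the original comments):
;   C1: 0.01*0.01*255*255*64    = 416.16    -> 416
;   C2: 0.03*0.03*255*255*64*63 = 235962.72 -> 235963
; The extra factor of 63 on C2 matches the scale of the variance/covariance
; terms computed in x264_pixel_ssim_end4_sse2 below.
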
%macro HADDD 2 ; sum junk
    pmaddwd %1, [pw_1 GLOBAL]
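
; pw_1 (defined elsewhere) is a vector of 16-bit ones; pmaddwd against it
; pairwise-adds packed words into dwords, widening partial sums so they can
; be reduced to a single scalar without overflowing 16 bits.
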
%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
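
; Scalar reference for the SAD functions below (a sketch only, not part of
; the build; argument names are illustrative):
;
;   int sad_16x16( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
;   {
;       int sum = 0;
;       for( int y = 0; y < 16; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < 16; x++ )
;               sum += abs( pix1[x] - pix2[x] );
;       return sum;
;   }
;
; psadbw does the abs-diff-and-accumulate for 8 bytes at a time (one 16-bit
; sum per qword), so a 16-byte row costs one instruction per register pair.
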
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x16_sse2
    movdqu  xmm1, [rdx+rcx]
    movdqu  xmm3, [rdx+rcx]
    psadbw  xmm1, [rdi+rsi]
    psadbw  xmm3, [rdi+rsi]
    movdqu  xmm5, [rdx+rcx]
    movdqu  xmm7, [rdx+rcx]
    psadbw  xmm5, [rdi+rsi]
    psadbw  xmm7, [rdi+rsi]
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
    movdqu  xmm6, [rdx+rcx]
    movdqu  xmm1, [rdx+rcx]
    psadbw  xmm6, [rdi+rsi]
    psadbw  xmm1, [rdi+rsi]

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x8_sse2

%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm3, [rdi+rsi]
    movdqu  xmm4, [rdx+rcx]

%macro SSD_START_SSE2 0
    pxor    xmm7, xmm7      ; zero
    pxor    xmm0, xmm0      ; xmm0 holds the sum
%macro SSD_END_SSE2 0
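
; Scalar reference for the SSD functions below (a sketch only, not part of
; the build; argument names are illustrative):
;
;   int ssd_16x16( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
;   {
;       int sum = 0;
;       for( int y = 0; y < 16; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < 16; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               sum += d * d;
;           }
;       return sum;
;   }
;
; The SSE2 macros follow the same shape: unpack bytes to words, subtract,
; then pmaddwd squares and pairwise-accumulates the differences into dwords.
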
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x16_sse2

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x8_sse2
%macro HADAMARD1x4 4
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4
%endmacro

%macro HADAMARD1x8 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%endmacro
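
; These passes are built from add/sub butterflies: SUMSUB_BADC (defined
; earlier in the full source) maps register pairs (a,b),(c,d) to roughly
; (a+b, a-b),(c+d, c-d) with packed word adds/subs.  Chaining the passes as
; above is, up to sign and output ordering, a multiplication by the 4x4
; (resp. 8x8) Hadamard matrix; for 4 points:
;
;   [ y0 ]   [ 1  1  1  1 ] [ x0 ]
;   [ y1 ]   [ 1 -1  1 -1 ] [ x1 ]
;   [ y2 ] = [ 1  1 -1 -1 ] [ x2 ]
;   [ y3 ]   [ 1 -1 -1  1 ] [ x3 ]
;
; The sign/order differences do not matter because SATD only needs the sum
; of absolute values of the transformed coefficients.
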
;;; row transform not used, because phaddw is much slower than paddw on a Conroe

;%macro HADAMARD4x1_SSSE3 5  ; ABCD-T -> ADTC
;    PHSUMSUB %1, %2, %5
;    PHSUMSUB %3, %4, %2
;    PHSUMSUB %1, %3, %4
;    PHSUMSUB %5, %2, %3

%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers
%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
    SBUTTERFLY dqa, dq,  %1, %2, %5
    SBUTTERFLY dqa, dq,  %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3
%endmacro

%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
    SBUTTERFLY  dqa, wd,  %1, %2, %5
    SBUTTERFLY  dqa, wd,  %3, %4, %2
    SBUTTERFLY  dqa, dq,  %1, %3, %4
    SBUTTERFLY2 dqa, dq,  %5, %2, %3
    SBUTTERFLY  dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3
%endmacro

%macro TRANSPOSE8x8 9 ; ABCDEFGH-T -> AFHDTECB
    SBUTTERFLY dqa, wd,  %1, %2, %9
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %9, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro
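
; SBUTTERFLY (defined in the full source) is a punpckl/punpckh interleave
; step; applying it at word, dword and then qword granularity performs an
; in-register matrix transpose.  The trailing comments ("ABCD-T -> ADTC",
; "ABCDEFGH-T -> AFHDTECB") record which register ends up holding which row
; of the result, so callers pass the rows of the second Hadamard pass in
; that permuted order instead of shuffling registers back.
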
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]

%macro LOAD_DIFF_4x8P 6 ; 4x dest, 2x temp
    LOAD_DIFF_8P %1, %5, [parm1q],          [parm3q]
    LOAD_DIFF_8P %2, %6, [parm1q+parm2q],   [parm3q+parm4q]
    LOAD_DIFF_8P %3, %5, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P %4, %6, [parm1q+r10],      [parm3q+r11]
%endmacro
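
; LOAD_DIFF_8P (body not shown) loads 8 pixels from each source, zero-extends
; them to 16-bit words and leaves pix1 - pix2 in the destination register;
; LOAD_DIFF_4x8P does that for 4 consecutive rows.  r10 and r11 are expected
; to hold 3*stride1 and 3*stride2, set up by the callers, so the four rows
; sit at 0, 1, 2 and 3 strides from the current pointers.
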
%macro SUM1x8_SSE2 3 ; 01 junk sum

%macro SUM4x4_SSE2 4 ; 02 13 junk sum

%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum

%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . sum
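
; These helpers (bodies not shown) take registers of Hadamard-transformed
; differences, replace them with their absolute values and accumulate into
; the sum register passed as the last argument.  The SSE2 variants typically
; form |x| as pmaxsw(x, -x); the SSSE3 variant can use pabsw directly, which
; appears to be why the satd/sa8d functions below are generated from a macro
; parameterized by the cpu name (%1).
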
%macro SATD_TWO_SSE2 0
    LOAD_DIFF_4x8P  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    lea             parm1q, [parm1q+4*parm2q]
    lea             parm3q, [parm3q+4*parm4q]
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    SUM8x4          xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
%endmacro
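
; SATD_TWO_SSE2 scores two 8x4 half-blocks per invocation: transform the
; rows, transpose, transform again (a full 2-D 4x4 Hadamard on the pixel
; differences), then accumulate absolute coefficients.  Scalar reference for
; a single 4x4 block (a sketch only, not part of the build; names are
; illustrative):
;
;   static void hadamard4( int d[4] )
;   {
;       int s01 = d[0] + d[1], d01 = d[0] - d[1];
;       int s23 = d[2] + d[3], d23 = d[2] - d[3];
;       d[0] = s01 + s23; d[2] = s01 - s23;
;       d[1] = d01 + d23; d[3] = d01 - d23;
;   }
;
;   int satd_4x4( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
;   {
;       int m[4][4], sum = 0;
;       for( int y = 0; y < 4; y++ )
;           for( int x = 0; x < 4; x++ )
;               m[y][x] = pix1[y*stride1+x] - pix2[y*stride2+x];
;       for( int y = 0; y < 4; y++ )            /* rows */
;           hadamard4( m[y] );
;       for( int x = 0; x < 4; x++ )            /* columns */
;       {
;           int col[4] = { m[0][x], m[1][x], m[2][x], m[3][x] };
;           hadamard4( col );
;           for( int i = 0; i < 4; i++ )
;               sum += abs( col[i] );
;       }
;       return sum >> 1; /* x264 halves the unnormalized sum; the asm does
;                           this in its epilogue (not shown) */
;   }
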
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x16_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x8_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x8_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x4_%1

;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sa8d_8x8_%1
    LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8
    lea  parm1q, [parm1q+4*parm2q]
    lea  parm3q, [parm3q+4*parm4q]
    LOAD_DIFF_4x8P xmm4, xmm5, xmm6, xmm7, xmm8, xmm8

    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    HADAMARD1x8  xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1

    SUM8x4 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
    SUM8x4 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10

    add  r8d, eax ; preserve rounding for 16x16
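
; sa8d is the 8x8 analogue of satd: an 8x8 Hadamard of the differences
; followed by a sum of absolute coefficients, scaled down at the end
; ((sum + 2) >> 2 in the full source).  "preserve rounding for 16x16" refers
; to accumulating the four unscaled 8x8 sums in r8d so that the 16x16
; wrapper below can apply that rounding once instead of four times.
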
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
cglobal x264_pixel_sa8d_16x16_%1
    call x264_pixel_sa8d_8x8_%1 ; pix[0]
    lea  parm1q, [parm1q+4*parm2q]
    lea  parm3q, [parm3q+4*parm4q]
    call x264_pixel_sa8d_8x8_%1 ; pix[8*stride]
    lea  r10, [3*parm2q-2]
    lea  r11, [3*parm4q-2]
    call x264_pixel_sa8d_8x8_%1 ; pix[8]
    lea  parm1q, [parm1q+4*parm2q]
    lea  parm3q, [parm3q+4*parm4q]
    call x264_pixel_sa8d_8x8_%1 ; pix[8*stride+8]

%define SUM8x4 SUM8x4_SSE2
%define SUM8x4 SUM8x4_SSSE3

;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
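
; Scores the three 8x8 intra prediction modes (V, H, DC) against one encoded
; block in a single pass: fenc is Hadamard-transformed once, and the edge
; sums supplied in edges[][] are used to patch the DC row/column terms for
; each mode instead of transforming three full predictions.  Results are
; stored as res[0] = V, res[1] = H, res[2] = DC (see the stores at the end
; of the function).
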
cglobal x264_intra_sa8d_x3_8x8_core_sse2
    movq  xmm0, [parm1q+0*FENC_STRIDE]
    movq  xmm7, [parm1q+1*FENC_STRIDE]
    movq  xmm6, [parm1q+2*FENC_STRIDE]
    movq  xmm3, [parm1q+3*FENC_STRIDE]
    movq  xmm5, [parm1q+4*FENC_STRIDE]
    movq  xmm1, [parm1q+5*FENC_STRIDE]
    movq  xmm8, [parm1q+6*FENC_STRIDE]
    movq  xmm2, [parm1q+7*FENC_STRIDE]

    HADAMARD1x8  xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
    TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

    movzx edi, word [parm2q+0]
    add   di,  word [parm2q+16]

    SUM8x4_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
    SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
    SUM1x8_SSE2 xmm8, xmm10, xmm15
    movdqa xmm14, xmm15        ; 7x8 sum
    movdqa xmm8, [parm2q+0]    ; left edge
    SUM1x8_SSE2 xmm8, xmm10, xmm14
    SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum
    punpcklqdq xmm0, xmm4      ; transpose
    movdqa xmm1, [parm2q+16]   ; top edge
    psrldq xmm2, 2             ; 8x7 sum
    psubw  xmm0, xmm1          ; 8x1 sum
    SUM1x8_SSE2 xmm0, xmm1, xmm2
    mov [parm3q+4], eax        ; i8x8_h sa8d
    mov [parm3q+8], eax        ; i8x8_dc sa8d
    mov [parm3q+0], eax        ; i8x8_v sa8d

;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
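
; For each of two horizontally adjacent 4x4 blocks, this computes the four
; integer sums SSIM needs and writes them to sums[block][0..3] in the order
; s1, s2, ss, s12 (matching the C reference).  A scalar sketch, names
; illustrative:
;
;   for( int b = 0; b < 2; b++ )
;   {
;       int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;       for( int y = 0; y < 4; y++ )
;           for( int x = 0; x < 4; x++ )
;           {
;               int a = pix1[y*stride1 + 4*b + x];
;               int c = pix2[y*stride2 + 4*b + x];
;               s1  += a;  s2  += c;
;               ss  += a*a + c*c;
;               s12 += a*c;
;           }
;       sums[b][0] = s1;  sums[b][1] = s2;
;       sums[b][2] = ss;  sums[b][3] = s12;
;   }
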
cglobal x264_pixel_ssim_4x4x2_core_sse2
    movdqa xmm8, [pw_1 GLOBAL]
    pshufd xmm5, xmm3, 0xB1
    pshufd xmm6, xmm4, 0xB1
    pshufd xmm1, xmm1, 0xD8
    movq   [parm5q+ 0], xmm1
    movq   [parm5q+ 8], xmm3
    movq   [parm5q+16], xmm1
    movq   [parm5q+24], xmm5

;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
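
; Combines the per-block sums produced by the core above into SSIM values.
; With s1, s2, ss, s12 as defined there and the constants from .rodata, each
; block contributes (per the standard SSIM formula, kept in integer sums so
; the means never need to be divided out):
;
;   vars  = ss*64  - s1*s1 - s2*s2
;   covar = s12*64 - s1*s2
;   ssim  = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
;         / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
;
; which matches the intermediate-value comments below.
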
cglobal x264_pixel_ssim_end4_sse2
    movdqa  xmm0, [parm1q+ 0]
    movdqa  xmm1, [parm1q+16]
    movdqa  xmm2, [parm1q+32]
    movdqa  xmm3, [parm1q+48]
    movdqa  xmm4, [parm1q+64]
    paddd   xmm0, [parm2q+ 0]
    paddd   xmm1, [parm2q+16]
    paddd   xmm2, [parm2q+32]
    paddd   xmm3, [parm2q+48]
    paddd   xmm4, [parm2q+64]
    movdqa  xmm5, [ssim_c1 GLOBAL]
    movdqa  xmm6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D  xmm0, xmm1, xmm2, xmm3, xmm4
    ; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2
    pmaddwd   xmm1, xmm0  ; s1*s2
    pmaddwd   xmm0, xmm0  ; s1*s1 + s2*s2
    psubd     xmm2, xmm1  ; covar*2
    psubd     xmm4, xmm0  ; vars
    cvtdq2ps  xmm0, xmm0  ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps  xmm1, xmm1  ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps  xmm2, xmm2  ; (float)(covar*2 + ssim_c2)
    cvtdq2ps  xmm4, xmm4  ; (float)(vars + ssim_c2)
    divps     xmm1, xmm0  ; ssim
    lea       rax,  [mask_ff + 16 GLOBAL]
    movdqu    xmm3, [rax + parm3q*4]
    movdqu    xmm3, [mask_ff + parm3q*4 + 16]
    pshuflw   xmm1, xmm0, 0xE