;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*          Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

SECTION .rodata align=16

ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
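; ssim_c1/ssim_c2 are the SSIM stabilizing constants C1 = (0.01*255)^2 and
; C2 = (0.03*255)^2, pre-scaled to match the integer sums taken over the
; 64-pixel windows (the variance term carries an extra factor of 63).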

cglobal x264_pixel_sad_16x16_sse2
cglobal x264_pixel_sad_16x8_sse2
cglobal x264_pixel_ssd_16x16_sse2
cglobal x264_pixel_ssd_16x8_sse2
cglobal x264_pixel_satd_8x4_sse2
cglobal x264_pixel_satd_8x8_sse2
cglobal x264_pixel_satd_16x8_sse2
cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2
cglobal x264_pixel_sa8d_8x8_sse2
cglobal x264_pixel_sa8d_16x16_sse2
cglobal x264_intra_sa8d_x3_8x8_core_sse2
cglobal x264_pixel_ssim_4x4x2_core_sse2
cglobal x264_pixel_ssim_end4_sse2

%macro HADDD 2 ; sum junk
    pmaddwd %1, [pw_1 GLOBAL]
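; pmaddwd against a vector of ones sums adjacent 16-bit words into 32-bit
; lanes, i.e. it performs a horizontal add of word-sized partial sums.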

%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
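; psadbw computes the sum of absolute byte differences against the memory
; operand, leaving two 16-bit partial SADs (one per 8-byte half of the xmm
; register) that are then accumulated into the running total.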

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
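; Here rdi/rsi hold pix1/stride1 and rdx/rcx hold pix2/stride2.  A scalar
; reference of what this routine computes (argument names are illustrative
; only):
;
;   int sad_16x16( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
;   {
;       int sum = 0;
;       for( int y = 0; y < 16; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < 16; x++ )
;               sum += abs( pix1[x] - pix2[x] );
;       return sum;
;   }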
x264_pixel_sad_16x16_sse2:
    movdqu  xmm1, [rdx+rcx]
    movdqu  xmm3, [rdx+rcx]
    psadbw  xmm1, [rdi+rsi]
    psadbw  xmm3, [rdi+rsi]
    movdqu  xmm5, [rdx+rcx]
    movdqu  xmm7, [rdx+rcx]
    psadbw  xmm5, [rdi+rsi]
    psadbw  xmm7, [rdi+rsi]
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
    movdqu  xmm6, [rdx+rcx]
    movdqu  xmm1, [rdx+rcx]
    psadbw  xmm6, [rdi+rsi]
    psadbw  xmm1, [rdi+rsi]

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x8_sse2:

%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm3, [rdi+rsi]
    movdqu  xmm4, [rdx+rcx]
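; A sketch of the technique (assumed from the surrounding comments): absolute
; differences of the two rows are widened to 16-bit words against the zero
; register xmm7, squared and pair-summed with pmaddwd, and added into the
; running sum in xmm0.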

%macro SSD_START_SSE2 0
    pxor    xmm7, xmm7 ; zero
    pxor    xmm0, xmm0 ; xmm0 holds the sum

%macro SSD_END_SSE2 0

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x16_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x8_sse2:

    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4

    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
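; Each SUMSUB_BADC performs an add/sub butterfly on two register pairs;
; chaining two of them gives the 1-D 4-point Hadamard transform and chaining
; six gives the 1-D 8-point transform used by SATD and SA8D below.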

%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers

%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
    SBUTTERFLY dqa, dq, %1, %2, %5
    SBUTTERFLY dqa, dq, %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3

%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
    SBUTTERFLY dqa, wd, %1, %2, %5
    SBUTTERFLY dqa, wd, %3, %4, %2
    SBUTTERFLY dqa, dq, %1, %3, %4
    SBUTTERFLY2 dqa, dq, %5, %2, %3
    SBUTTERFLY dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3

%macro TRANSPOSE8x8 9 ; ABCDEFGH-T -> AFHDTECB
    SBUTTERFLY dqa, wd, %1, %2, %9
    SBUTTERFLY dqa, wd, %3, %4, %2
    SBUTTERFLY dqa, wd, %5, %6, %4
    SBUTTERFLY dqa, wd, %7, %8, %6
    SBUTTERFLY dqa, dq, %1, %3, %8
    SBUTTERFLY dqa, dq, %9, %2, %3
    SBUTTERFLY dqa, dq, %5, %7, %2
    SBUTTERFLY dqa, dq, %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2

%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
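; Loads 8 pixels from each of the two addresses and leaves their difference
; as eight signed 16-bit words in the first register; the second register is
; a temporary.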

%macro SUM1x8_SSE2 3 ; 01 junk sum

%macro SUM4x4_SSE2 4 ; 02 13 junk sum

;;; two SUM4x4_SSE2 running side-by-side
%macro SUM4x4_TWO_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum

%macro SATD_TWO_SSE2 0
    LOAD_DIFF_8P xmm0, xmm4, [parm1q],          [parm3q]
    LOAD_DIFF_8P xmm1, xmm5, [parm1q+parm2q],   [parm3q+parm4q]
    LOAD_DIFF_8P xmm2, xmm4, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P xmm3, xmm5, [parm1q+r10],      [parm3q+r11]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    SUM4x4_TWO_SSE2 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
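; One SATD_TWO_SSE2 covers an 8x4 block of differences: a 4-point Hadamard
; transform along one axis, a transpose of both 4x4 halves at once, a second
; transform along the other axis, then a sum of absolute values.  r10/r11
; are expected to hold the 3*stride offsets for the fourth row.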

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x16_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x16_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x8_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x8_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x4_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sa8d_8x8_sse2:
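; SA8D is SATD with an 8x8 instead of 4x4 Hadamard transform: eight rows of
; differences, an 8-point transform, an 8x8 transpose, a second transform,
; then a sum of absolute values.  r10/r11 are again expected to hold 3*stride
; offsets.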
    LOAD_DIFF_8P xmm0, xmm8, [parm1q],          [parm3q]
    LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
    LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10],      [parm3q+r11]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    LOAD_DIFF_8P xmm4, xmm8, [parm1q],          [parm3q]
    LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
    LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10],      [parm3q+r11]

    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    HADAMARD1x8  xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1

    SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
    SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
    add     r8d, eax ; preserve rounding for 16x16
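; The unrounded sum is also accumulated in r8d so that sa8d_16x16 can add its
; four 8x8 blocks first and apply the final rounding shift only once.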

;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
x264_pixel_sa8d_16x16_sse2:
    call    x264_pixel_sa8d_8x8_sse2 ; pix[0]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    call    x264_pixel_sa8d_8x8_sse2 ; pix[8*stride]
    lea     r10, [3*parm2q-2]
    lea     r11, [3*parm4q-2]
    call    x264_pixel_sa8d_8x8_sse2 ; pix[8]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    call    x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8]

;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
x264_intra_sa8d_x3_8x8_core_sse2:
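; Computes the sa8d costs of the V, H and DC intra 8x8 predictions in one
; pass: the fenc block is Hadamard-transformed once, and the flat predictions
; are accounted for by correcting the relevant transform terms with the edge
; pixels in edges[2][8] (edges[0] = left, edges[1] = top).  Results are
; stored as res[0] = V, res[1] = H, res[2] = DC.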
    movq    xmm0, [parm1q+0*FENC_STRIDE]
    movq    xmm7, [parm1q+1*FENC_STRIDE]
    movq    xmm6, [parm1q+2*FENC_STRIDE]
    movq    xmm3, [parm1q+3*FENC_STRIDE]
    movq    xmm5, [parm1q+4*FENC_STRIDE]
    movq    xmm1, [parm1q+5*FENC_STRIDE]
    movq    xmm8, [parm1q+6*FENC_STRIDE]
    movq    xmm2, [parm1q+7*FENC_STRIDE]

    HADAMARD1x8  xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
    TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

    movzx   edi, word [parm2q+0]
    add     di, word [parm2q+16]

    SUM4x4_TWO_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
    SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
    SUM1x8_SSE2 xmm8, xmm10, xmm15
    movdqa  xmm14, xmm15 ; 7x8 sum

    movdqa  xmm8, [parm2q+0] ; left edge
    SUM1x8_SSE2 xmm8, xmm10, xmm14
    SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum

    punpcklqdq xmm0, xmm4 ; transpose
    movdqa  xmm1, [parm2q+16] ; top edge
    psrldq  xmm2, 2 ; 8x7 sum
    psubw   xmm0, xmm1 ; 8x1 sum
    SUM1x8_SSE2 xmm0, xmm1, xmm2

    mov     [parm3q+4], eax ; i8x8_h sa8d

    mov     [parm3q+8], eax ; i8x8_dc sa8d

    mov     [parm3q+0], eax ; i8x8_v sa8d

;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
x264_pixel_ssim_4x4x2_core_sse2:
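; For two adjacent 4x4 blocks this accumulates, per block, s1 = sum(pix1),
; s2 = sum(pix2), ss = sum(pix1^2) + sum(pix2^2) and s12 = sum(pix1*pix2),
; and stores the four values for each block to sums[0][] and sums[1][]
; (parm5q).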
    movdqa  xmm8, [pw_1 GLOBAL]

    pshufd  xmm5, xmm3, 0xB1
    pshufd  xmm6, xmm4, 0xB1
    pshufd  xmm1, xmm1, 0xD8

    movq    [parm5q+ 0], xmm1
    movq    [parm5q+ 8], xmm3
    movq    [parm5q+16], xmm1
    movq    [parm5q+24], xmm5

;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
x264_pixel_ssim_end4_sse2:
    movdqa  xmm0, [parm1q+ 0]
    movdqa  xmm1, [parm1q+16]
    movdqa  xmm2, [parm1q+32]
    movdqa  xmm3, [parm1q+48]
    movdqa  xmm4, [parm1q+64]
    paddd   xmm0, [parm2q+ 0]
    paddd   xmm1, [parm2q+16]
    paddd   xmm2, [parm2q+32]
    paddd   xmm3, [parm2q+48]
    paddd   xmm4, [parm2q+64]
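; sum0 and sum1 hold the per-4x4-block sums of two vertically adjacent block
; rows; adding them is the first step in building the statistics of the
; overlapping 8x8 windows whose SSIM is evaluated below.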
    movdqa  xmm5, [ssim_c1 GLOBAL]
    movdqa  xmm6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
    ; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2
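; From here the scalar SSIM formula is evaluated for four windows at once
; (the sums carry a fixed-point scale; ssim_c1/ssim_c2 above are pre-scaled
; to match):
;   vars  = ss - s1*s1 - s2*s2
;   covar = s12 - s1*s2
;   ssim  = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
;         / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))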
    pmaddwd xmm1, xmm0  ; s1*s2
    pmaddwd xmm0, xmm0  ; s1*s1 + s2*s2
    psubd   xmm2, xmm1  ; covar*2
    psubd   xmm4, xmm0  ; vars
    cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
    divps   xmm1, xmm0 ; ssim

    lea     rax,  [mask_ff + 16 GLOBAL]
    movdqu  xmm3, [rax + parm3q*4]
    movdqu  xmm3, [mask_ff + parm3q*4 + 16]

    pshuflw xmm1, xmm0, 0xE