;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

SECTION .rodata align=16

pd_0000ffff: times 4 dd 0x0000ffff
SECTION .text

cglobal x264_pixel_sad_16x16_sse2
cglobal x264_pixel_sad_16x8_sse2
cglobal x264_pixel_ssd_16x16_sse2
cglobal x264_pixel_ssd_16x8_sse2
cglobal x264_pixel_satd_8x4_sse2
cglobal x264_pixel_satd_8x8_sse2
cglobal x264_pixel_satd_16x8_sse2
cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2
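
;;; accumulate the SAD of four rows of 16 pixels into xmm0: each movdqu/psadbw
;;; pair below compares one row of pix2 against the matching row of pix1; the
;;; remaining instructions presumably fold the per-row partial sums into the
;;; xmm0 accumulator and step rdi/rdx down by one stride per row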
%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm2,   [rdx+rcx]
    movdqu  xmm4,   [rdx+rcx]
    psadbw  xmm2,   [rdi+rsi]
    psadbw  xmm4,   [rdi+rsi]
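
;;; shared SAD prologue: pix1/stride1 are expected in rdi/esi and pix2/stride2
;;; in rdx/ecx; the 32-bit strides are sign-extended to 64 bits so they can be
;;; used directly in addressing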
%macro SAD_START_SSE2 0
    movsxd  rsi,    esi         ; stride1
    movsxd  rcx,    ecx         ; stride2

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2( uint8_t *pix1, int i_stride_pix1,
;                                uint8_t *pix2, int i_stride_pix2 )
;-----------------------------------------------------------------------------
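; Illustrative scalar equivalent of the value this routine returns (a sketch
; for reference only, not code from this file):
;
;   int sad = 0;
;   for( int y = 0; y < 16; y++, pix1 += i_stride_pix1, pix2 += i_stride_pix2 )
;       for( int x = 0; x < 16; x++ )
;           sad += abs( pix1[x] - pix2[x] );
;   return sad;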
x264_pixel_sad_16x16_sse2:
    movdqu  xmm1,   [rdx+rcx]
    movdqu  xmm3,   [rdx+rcx]
    psadbw  xmm1,   [rdi+rsi]
    psadbw  xmm3,   [rdi+rsi]
    movdqu  xmm5,   [rdx+rcx]
    movdqu  xmm7,   [rdx+rcx]
    psadbw  xmm5,   [rdi+rsi]
    psadbw  xmm7,   [rdi+rsi]
    movdqu  xmm2,   [rdx+rcx]
    movdqu  xmm4,   [rdx+rcx]
    psadbw  xmm2,   [rdi+rsi]
    psadbw  xmm4,   [rdi+rsi]
    movdqu  xmm6,   [rdx+rcx]
    movdqu  xmm1,   [rdx+rcx]
    psadbw  xmm6,   [rdi+rsi]
    psadbw  xmm1,   [rdi+rsi]

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2( uint8_t *pix1, int i_stride_pix1,
;                               uint8_t *pix2, int i_stride_pix2 )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x8_sse2:
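
;;; accumulate squared differences for two rows of 16 pixels: per
;;; SSD_START_SSE2 below, xmm7 holds zero for unpacking bytes to words and
;;; xmm0 holds the running sum of 32-bit partial results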
%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm3,   [rdi+rsi]
    movdqu  xmm4,   [rdx+rcx]
%macro SSD_INC_8x16P_SSE2 0

%macro SSD_START_SSE2 0
;   mov     rdi,    rdi         ; pix1
    movsxd  rsi,    esi         ; stride1
;   mov     rdx,    rdx         ; pix2
    movsxd  rcx,    ecx         ; stride2
    pxor    xmm7,   xmm7        ; zero
    pxor    xmm0,   xmm0        ; xmm0 holds the sum

%macro SSD_END_SSE2 0

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2( uint8_t *pix1, int i_stride_pix1,
;                                uint8_t *pix2, int i_stride_pix2 )
;-----------------------------------------------------------------------------
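; Illustrative scalar equivalent (reference sketch only):
;
;   int ssd = 0;
;   for( int y = 0; y < 16; y++, pix1 += i_stride_pix1, pix2 += i_stride_pix2 )
;       for( int x = 0; x < 16; x++ )
;       {
;           int d = pix1[x] - pix2[x];
;           ssd += d * d;
;       }
;   return ssd;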
x264_pixel_ssd_16x16_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x8_sse2( uint8_t *pix1, int i_stride_pix1,
;                               uint8_t *pix2, int i_stride_pix2 )
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x8_sse2:

; %1=(row2, row0) %2=(row3, row1) %3=junk
; output in %1=(row3, row0) and %3=(row2, row1)
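;;; one 1-D pass of the 4x4 Hadamard transform on rows packed two per xmm
;;; register (8 words each); applied once before and once after the twist
;;; transpose below, it yields the 2-D transform whose absolute coefficients
;;; SATD sums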
%macro HADAMARD4x4_SSE2 3

;;; two HADAMARD4x4_SSE2 running side-by-side
%macro HADAMARD4x4_TWO_SSE2 6   ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
%macro TRANSPOSE4x4_TWIST_SSE2 3 ; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
    punpckhwd   %2, %3          ; backwards because the high quadwords are already swapped

;;; two TRANSPOSE4x4_TWIST_SSE2 running side-by-side
%macro TRANSPOSE4x4_TWIST_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)

;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
;;; the value in xmm7 doesn't matter: it's only subtracted from itself
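;;; (both inputs are unpacked against the same xmm7 contents, so whatever byte
;;; lands in the high half of each word is identical on both sides and cancels
;;; in the psubw; xmm7 therefore never needs to be zeroed here)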
%macro LOAD4x8_DIFF_SSE2 0
    punpcklqdq  xmm0, xmm2      ; rows 0 and 2
    punpckhqdq  xmm4, xmm2      ; next 4x4 rows 0 and 2
    punpcklqdq  xmm1, xmm3      ; rows 1 and 3
    punpckhqdq  xmm5, xmm3      ; next 4x4 rows 1 and 3
%macro SUM4x4_SSE2 4            ; 02 13 junk sum

;;; two SUM4x4_SSE2 running side-by-side
%macro SUM4x4_TWO_SSE2 7        ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum

%macro SUM_MM_SSE2 2            ; sum junk
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    pand    %1, [pd_0000ffff GLOBAL]
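
;;; SATD of two 4x4 blocks at once: the differences loaded by LOAD4x8_DIFF_SSE2
;;; go through Hadamard, twist transpose, Hadamard again, and SUM4x4_TWO_SSE2
;;; accumulates the absolute coefficients into xmm6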
%macro SATD_TWO_SSE2 0
    HADAMARD4x4_TWO_SSE2        xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
    TRANSPOSE4x4_TWIST_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    HADAMARD4x4_TWO_SSE2        xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    SUM4x4_TWO_SSE2             xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6

;   mov     rdi,    rdi         ; pix1
    movsxd  rsi,    esi         ; stride1
;   mov     rdx,    rdx         ; pix2
    movsxd  rcx,    ecx         ; stride2

    SUM_MM_SSE2                 xmm6, xmm7

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_sse2( uint8_t *pix1, int i_stride_pix1,
;                                 uint8_t *pix2, int i_stride_pix2 )
;-----------------------------------------------------------------------------
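; The SATD routines sum, over every 4x4 sub-block, the absolute values of the
; 2-D Hadamard transform of the pixel differences. An illustrative scalar
; sketch of the per-4x4 term (hypothetical helper, for reference only):
;
;   static int satd_4x4( uint8_t *p1, int s1, uint8_t *p2, int s2 )
;   {
;       int t[4][4], sum = 0;
;       for( int i = 0; i < 4; i++, p1 += s1, p2 += s2 )
;       {
;           int a = p1[0]-p2[0], b = p1[1]-p2[1], c = p1[2]-p2[2], d = p1[3]-p2[3];
;           t[i][0] = a+b+c+d;  t[i][1] = a+b-c-d;
;           t[i][2] = a-b-c+d;  t[i][3] = a-b+c-d;
;       }
;       for( int i = 0; i < 4; i++ )
;       {
;           int a = t[0][i], b = t[1][i], c = t[2][i], d = t[3][i];
;           sum += abs(a+b+c+d) + abs(a+b-c-d) + abs(a-b-c+d) + abs(a-b+c-d);
;       }
;       return sum / 2;  // see the note in SUM_MM_SSE2: the sums are even
;   }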
x264_pixel_satd_16x16_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x16_sse2( uint8_t *pix1, int i_stride_pix1,
;                                uint8_t *pix2, int i_stride_pix2 )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x16_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x8_sse2( uint8_t *pix1, int i_stride_pix1,
;                                uint8_t *pix2, int i_stride_pix2 )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x8_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x8_sse2( uint8_t *pix1, int i_stride_pix1,
;                               uint8_t *pix2, int i_stride_pix2 )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x8_sse2:

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2( uint8_t *pix1, int i_stride_pix1,
;                               uint8_t *pix2, int i_stride_pix2 )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x4_sse2: