;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

SECTION .rodata data align=16
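
; Mask used when reducing the packed SATD sums (see SUM_MM_SSE2 below):
; keeps only the low 16 bits of each doubleword.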
pd_0000ffff: times 4 dd 0x0000ffff

SECTION .text

cglobal x264_pixel_sad_16x16_sse2
cglobal x264_pixel_sad_16x8_sse2
cglobal x264_pixel_ssd_16x16_sse2
cglobal x264_pixel_ssd_16x8_sse2
cglobal x264_pixel_satd_8x4_sse2
cglobal x264_pixel_satd_8x8_sse2
cglobal x264_pixel_satd_16x8_sse2
cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2
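
; Accumulates the SAD of four rows of 16 pixels. Unaligned rows from pix2 are
; loaded with movdqu and psadbw'd against pix1 (which must be 16-byte aligned
; for the memory operand); the per-row results are assumed to be summed into
; xmm0 by the rest of the macro.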
%macro SAD_INC_4x16P_SSE2 0
    movdqu  xmm2, [ecx+edx]
    movdqu  xmm4, [ecx+edx]
    psadbw  xmm2, [eax+ebx]
    psadbw  xmm4, [eax+ebx]

%macro SAD_START_SSE2 0
    mov     eax, [esp+ 8]   ; pix1
    mov     ebx, [esp+12]   ; stride1
    mov     ecx, [esp+16]   ; pix2
    mov     edx, [esp+20]   ; stride2
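    ; NOTE: these offsets assume the return address plus one saved register
    ; (ebx, which is callee-saved under cdecl and clobbered here as stride1)
    ; are already on the stack when the arguments are read.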

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
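; Sum of absolute differences over a 16x16 block:
;   SAD = sum_{y=0..15, x=0..15} |pix1[y*stride1+x] - pix2[y*stride2+x]|
; computed 16 bytes per row with psadbw.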
x264_pixel_sad_16x16_sse2:
    movdqu  xmm1, [ecx+edx]
    movdqu  xmm3, [ecx+edx]
    psadbw  xmm1, [eax+ebx]
    psadbw  xmm3, [eax+ebx]
    movdqu  xmm5, [ecx+edx]
    movdqu  xmm7, [ecx+edx]
    psadbw  xmm5, [eax+ebx]
    psadbw  xmm7, [eax+ebx]
    movdqu  xmm2, [ecx+edx]
    movdqu  xmm4, [ecx+edx]
    psadbw  xmm2, [eax+ebx]
    psadbw  xmm4, [eax+ebx]
    movdqu  xmm6, [ecx+edx]
    movdqu  xmm1, [ecx+edx]
    psadbw  xmm6, [eax+ebx]
    psadbw  xmm1, [eax+ebx]

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x8_sse2:

%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm3, [eax+ebx]
    movdqu  xmm4, [ecx+edx]

%macro SSD_INC_8x16P_SSE2 0

%macro SSD_START_SSE2 0
    mov     eax, [esp+ 8]   ; pix1
    mov     ebx, [esp+12]   ; stride1
    mov     ecx, [esp+16]   ; pix2
    mov     edx, [esp+20]   ; stride2
    pxor    xmm7, xmm7      ; zero
    pxor    xmm0, xmm0      ; xmm0 holds the sum

%macro SSD_END_SSE2 0

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
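; Sum of squared differences over a 16x16 block:
;   SSD = sum_{y,x} (pix1[y*stride1+x] - pix2[y*stride2+x])^2
; The bytes are widened to 16-bit words against the zero register xmm7 so the
; differences can be squared and accumulated (presumably with psubw/pmaddwd).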
x264_pixel_ssd_16x16_sse2:

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_ssd_16x8_sse2:

; %1=(row2, row0) %2=(row3, row1) %3=junk
; output in %1=(row3, row0) and %3=(row2, row1)
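; 1-D 4-point Hadamard butterfly applied to four 4x4 rows held two per xmm
; register: with input rows d0..d3, the outputs are the four combinations
; (d0+d1)+(d2+d3), (d0-d1)+(d2-d3), (d0+d1)-(d2+d3), (d0-d1)-(d2-d3),
; repacked in the twisted order noted above.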
%macro HADAMARD4x4_SSE2 3

;;; two HADAMARD4x4_SSE2 running side-by-side
%macro HADAMARD4x4_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)

%macro TRANSPOSE4x4_TWIST_SSE2 3 ; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
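    ; Transposes the 4x4 matrix of 16-bit coefficients held in the
    ; (row3,row0)/(row2,row1) packing, so the second HADAMARD4x4_SSE2 pass
    ; operates on columns of the original block.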
    punpckhwd %2, %3 ; backwards because the high quadwords are already swapped

;;; two TRANSPOSE4x4_TWIST_SSE2 running side-by-side
%macro TRANSPOSE4x4_TWIST_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)

;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
;;; the value in xmm7 doesn't matter: it's only subtracted from itself
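;;; (both operands are presumably interleaved with the same xmm7 contents when
;;; the bytes are widened to words, so whatever junk it holds cancels in the
;;; subsequent psubw)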
%macro LOAD4x8_DIFF_SSE2 0
    punpcklqdq xmm0, xmm2 ; rows 0 and 2
    punpckhqdq xmm4, xmm2 ; next 4x4 rows 0 and 2
    punpcklqdq xmm1, xmm3 ; rows 1 and 3
    punpckhqdq xmm5, xmm3 ; next 4x4 rows 1 and 3

%macro SUM4x4_SSE2 4 ; 02 13 junk sum
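    ; Adds the absolute values of the packed transform coefficients in %1/%2
    ; to the running SATD sum in %4, using %3 as scratch (a common approach is
    ; pmaxsw of each value against its negation).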

;;; two SUM4x4_SSE2 running side-by-side
%macro SUM4x4_TWO_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum

%macro SUM_MM_SSE2 2 ; sum junk
    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    pand    %1, [pd_0000ffff]
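    ; the mask keeps only the low 16 bits of each dword, so the remaining
    ; horizontal reduction can add the lanes as 32-bit values without picking
    ; up bits from neighbouring word sums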

%macro SATD_TWO_SSE2 0
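    ; One 8x4 strip of SATD: with the difference rows of two 4x4 blocks in
    ; xmm0,xmm1 and xmm4,xmm5 (see LOAD4x8_DIFF_SSE2), transform the rows,
    ; transpose, transform the columns, then accumulate absolute values into
    ; xmm6.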
    HADAMARD4x4_TWO_SSE2        xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
    TRANSPOSE4x4_TWIST_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    HADAMARD4x4_TWO_SSE2        xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
    SUM4x4_TWO_SSE2             xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6

    mov     eax, [esp+ 8]   ; pix1
    mov     ebx, [esp+12]   ; stride1
    mov     ecx, [esp+16]   ; pix2
    mov     edx, [esp+20]   ; stride2

    SUM_MM_SSE2 xmm6, xmm7
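    ; the reduced sum is assumed to end up in eax, the cdecl return register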

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
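; Sum of absolute transformed differences: each 4x4 block of pix1-pix2 is run
; through a 2-D Hadamard transform, the absolute coefficients are summed, and
; (per the shift-first note in SUM_MM_SSE2) the total is halved.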
x264_pixel_satd_16x16_sse2:

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x16_sse2:

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x8_sse2:

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x8_sse2:

;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x4_sse2: