;*****************************************************************************
;* pixel-32.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
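
; (restored) The %include and the two helper macros below are not present in
; this excerpt; they are reconstructed in the style of x264's x86 helpers so
; that the rest of the file assembles, and may differ from the originals.
%include "x86inc.asm" ; assumed to provide cglobal and FENC_STRIDE

SECTION .text

%macro SBUTTERFLY 5 ; instr suffix, pack size, a, b, tmp
    mov%1      %5, %3
    punpckl%2  %3, %4 ; a   <- low  halves of a,b interleaved
    punpckh%2  %5, %4 ; tmp <- high halves of a,b interleaved
%endmacro

; paired add/sub butterfly: a += b, c += d, b = b - old a, d = d - old c.
; SATD/SA8D only use absolute values, so the sign flip on the difference
; terms is harmless.
%macro SUMSUB_BADC 4
    paddw  %1, %2
    paddw  %3, %4
    paddw  %2, %2
    paddw  %4, %4
    psubw  %2, %1
    psubw  %4, %3
%endmacro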
%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
    SBUTTERFLY q, wd, %1, %2, %5
    SBUTTERFLY q, wd, %3, %4, %2
    SBUTTERFLY q, dq, %1, %3, %4
    SBUTTERFLY q, dq, %5, %2, %3
%endmacro

%macro ABS1 2 ; mma, tmp
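    ; body restored (the standard mmxext abs): |x| = max(x, -x)
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
%endmacro
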
%macro ABS2 4 ; mma, mmb, tmp0, tmp1
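    ; body restored: two ABS1 operations interleaved to hide latency
    pxor    %3, %3
    pxor    %4, %4
    psubw   %3, %1
    psubw   %4, %2
    pmaxsw  %1, %3
    pmaxsw  %2, %4
%endmacro
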
%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy
    movd  %1, [eax+ebx*%4+%3] ; 4 pixels from pix1
    movd  %2, [ecx+edx*%4+%3] ; 4 pixels from pix2
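    ; rest of the body restored: interleaving a with b and b with itself puts
    ; a_i and b_i in matching byte positions of each word lane, so a single
    ; psubw yields the four 16-bit differences a-b with no separate
    ; zero-extension step.
    punpcklbw  %1, %2
    punpcklbw  %2, %2
    psubw      %1, %2
%endmacro
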
%macro LOAD_DIFF_4x8P 1 ; dx
    LOAD_DIFF_4P  mm0, mm7, %1, 0
    LOAD_DIFF_4P  mm1, mm7, %1, 1
    lea  eax, [eax+2*ebx] ; advance both sources two rows at a time
    lea  ecx, [ecx+2*edx]
    LOAD_DIFF_4P  mm2, mm7, %1, 0
    LOAD_DIFF_4P  mm3, mm7, %1, 1
    lea  eax, [eax+2*ebx]
    lea  ecx, [ecx+2*edx]
    LOAD_DIFF_4P  mm4, mm7, %1, 0
    LOAD_DIFF_4P  mm5, mm7, %1, 1
    lea  eax, [eax+2*ebx]
    lea  ecx, [ecx+2*edx]
    LOAD_DIFF_4P  mm6, mm7, %1, 0
    movq [spill], mm6 ; all 7 other regs are live, so park row 6
    LOAD_DIFF_4P  mm7, mm6, %1, 1
    movq mm6, [spill]
%endmacro

; 8-point Walsh-Hadamard transform across 8 registers:
; one butterfly stage per pair distance (4, then 2, then 1)
%macro HADAMARD8_1D 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%endmacro

; sum of absolute values of all 8 registers, accumulated into mm0
; (mm6/mm7 are parked in [spill] while they serve as ABS2 temps)
%macro SUM4x8_MM 0
    movq [spill],   mm6
    movq [spill+8], mm7
    ABS2  mm0, mm1, mm6, mm7
    ABS2  mm2, mm3, mm6, mm7
    paddw mm0, mm2
    paddw mm1, mm3
    movq  mm6, [spill]
    movq  mm7, [spill+8]
    ABS2  mm4, mm5, mm2, mm3
    ABS2  mm6, mm7, mm2, mm3
    paddw mm4, mm6
    paddw mm5, mm7
    paddw mm0, mm4
    paddw mm1, mm5
    paddw mm0, mm1
%endmacro
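
; As a reference for the routine below, a rough scalar model (a hedged
; sketch, not x264's exact C code) of what sa8d_8x8 computes. hadamard8() is
; an unnormalized 8-point Walsh-Hadamard transform; the final rounding
; matches the pavgw + (eax+1)>>1 sequence at the end of the asm.
;
;   static void hadamard8( int16_t d[8] )
;   {
;       for( int i = 1; i < 8; i <<= 1 )        // butterfly stages 1, 2, 4
;           for( int j = 0; j < 8; j += 2*i )
;               for( int k = j; k < j+i; k++ )
;               {
;                   int a = d[k], b = d[k+i];
;                   d[k]   = a + b;
;                   d[k+i] = a - b;
;               }
;   }
;
;   static int sa8d_8x8_ref( const uint8_t *pix1, int stride1,
;                            const uint8_t *pix2, int stride2 )
;   {
;       int16_t d[8][8];
;       int sum = 0;
;       for( int y = 0; y < 8; y++ )
;           for( int x = 0; x < 8; x++ )
;               d[y][x] = pix1[y*stride1+x] - pix2[y*stride2+x];
;       for( int y = 0; y < 8; y++ )            // transform the rows...
;           hadamard8( d[y] );
;       for( int x = 0; x < 8; x++ )            // ...then the columns
;       {
;           int16_t col[8];
;           for( int y = 0; y < 8; y++ ) col[y] = d[y][x];
;           hadamard8( col );
;           for( int y = 0; y < 8; y++ ) sum += abs( col[y] );
;       }
;       return ( sum + 2 ) >> 2;
;   }
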
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sa8d_8x8_mmxext
    push ebx
    mov  eax, [esp+ 8] ; pix1
    mov  ebx, [esp+12] ; stride1
    mov  ecx, [esp+16] ; pix2
    mov  edx, [esp+20] ; stride2
    sub  esp, 0x70
%define args  esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
    LOAD_DIFF_4x8P 0
    HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7

    movq [spill], mm0 ; TRANSPOSE4x4W clobbers its temp register
    TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
    movq [trans+0x00], mm4
    movq [trans+0x08], mm7
    movq [trans+0x10], mm0
    movq [trans+0x18], mm6
    movq mm0, [spill]
    TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
    movq [trans+0x20], mm0
    movq [trans+0x28], mm3
    movq [trans+0x30], mm4
    movq [trans+0x38], mm2

    mov  eax, [args+4]  ; rewind pix1/pix2 to the top of the block
    mov  ecx, [args+12] ; (LOAD_DIFF_4x8P advanced them by 6 rows)
    LOAD_DIFF_4x8P 4
    HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7

    movq [spill], mm7
    TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
    movq [trans+0x40], mm0
    movq [trans+0x48], mm3
    movq [trans+0x50], mm7
    movq [trans+0x58], mm2
    movq mm7, [spill]
    TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
    movq mm5, [trans+0x00]
    movq mm1, [trans+0x08]
    movq mm2, [trans+0x10]
    movq mm3, [trans+0x18]

    HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
    SUM4x8_MM
    movq [trans], mm0 ; stash the first half's |coeff| sums

    movq mm0, [trans+0x20]
    movq mm1, [trans+0x28]
    movq mm2, [trans+0x30]
    movq mm3, [trans+0x38]
    movq mm4, [trans+0x40]
    movq mm5, [trans+0x48]
    movq mm6, [trans+0x50]
    movq mm7, [trans+0x58]

    HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
    SUM4x8_MM

    pavgw  mm0, [trans]        ; halve the combined sums, rounding up
    pshufw mm1, mm0, 01001110b ; add the 4 word lanes together:
    paddw  mm0, mm1            ; swap qword halves, then adjacent words
    pshufw mm1, mm0, 10110001b
    paddw  mm0, mm1
    movd   eax, mm0
    and    eax, 0xffff
    mov    ecx, eax ; preserve rounding for 16x16
    add    eax, 1   ; overall rounding is (sum+2)>>2:
    shr    eax, 1   ; half from pavgw above, half here
    add    esp, 0x70
    pop    ebx
    ret
%undef args
%undef spill
%undef trans

; horizontal sum of three mmx registers at once: fold the qword halves
; together, widen the words to dwords against a zero register, then fold
; again with the caller-supplied op (paddd below)
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pxor      %7, %7
    pshufw    %4, %1, 01001110b
    pshufw    %5, %2, 01001110b
    pshufw    %6, %3, 01001110b
    paddw     %1, %4
    paddw     %2, %5
    paddw     %3, %6
    punpcklwd %1, %7
    punpcklwd %2, %7
    punpcklwd %3, %7
    pshufw    %4, %1, 01001110b
    pshufw    %5, %2, 01001110b
    pshufw    %6, %3, 01001110b
    %8        %1, %4
    %8        %2, %5
    %8        %3, %6
%endmacro

%macro LOAD_4x8P 1 ; dx
    pxor      mm7, mm7 ; zero register for byte->word unpacking
    movd      mm6, [eax+%1+7*FENC_STRIDE]
    movd      mm0, [eax+%1+0*FENC_STRIDE]
    movd      mm1, [eax+%1+1*FENC_STRIDE]
    movd      mm2, [eax+%1+2*FENC_STRIDE]
    movd      mm3, [eax+%1+3*FENC_STRIDE]
    movd      mm4, [eax+%1+4*FENC_STRIDE]
    movd      mm5, [eax+%1+5*FENC_STRIDE]
    punpcklbw mm6, mm7
    punpcklbw mm0, mm7
    punpcklbw mm1, mm7
    movq      [spill], mm6 ; park row 7 so mm6 can take row 6
    punpcklbw mm2, mm7
    punpcklbw mm3, mm7
    movd      mm6, [eax+%1+6*FENC_STRIDE]
    punpcklbw mm4, mm7
    punpcklbw mm5, mm7
    punpcklbw mm6, mm7
    movq      mm7, [spill] ; mm7 = row 7, replacing the zero register
%endmacro

;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
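; Shared-work helper for the 8x8 intra analysis: fenc is the source block
; (rows FENC_STRIDE apart), edges[2][8] holds terms precomputed by the caller
; from the left and top neighbor pixels, and the three SA8D prediction costs
; are written to res[0..2] (v, h, dc; see the stores at the end). The flat
; V/H/DC predictions only have transform energy in row/column 0, so the 8x8
; Hadamard of fenc is computed once and only those coefficients are adjusted
; per mode.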
cglobal x264_intra_sa8d_x3_8x8_core_mmxext
    mov  eax, [esp+4] ; fenc
    mov  ecx, [esp+8] ; edges
    sub  esp, 0x70
%define args  esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
%define sum   esp+0    ; +32
    LOAD_4x8P 0
    HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7

    movq [spill], mm0
    TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
    movq [trans+0x00], mm4
    movq [trans+0x08], mm7
    movq [trans+0x10], mm0
    movq [trans+0x18], mm6
    movq mm0, [spill]
    TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
    movq [trans+0x20], mm0
    movq [trans+0x28], mm3
    movq [trans+0x30], mm4
    movq [trans+0x38], mm2

    LOAD_4x8P 4
    HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7

    movq [spill], mm7
    TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
    movq [trans+0x40], mm0
    movq [trans+0x48], mm3
    movq [trans+0x50], mm7
    movq [trans+0x58], mm2
    movq mm7, [spill]
    TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
    movq mm5, [trans+0x00]
    movq mm1, [trans+0x08]
    movq mm2, [trans+0x10]
    movq mm3, [trans+0x18]

    HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6

    ; (the spill/accumulate glue in this section is restored from the register
    ; flow; exact instruction order may differ from the original)
    movq  [spill+0], mm5 ; row 0 is handled separately per prediction mode
    movq  [spill+8], mm7
    ABS2  mm0, mm1, mm5, mm7
    ABS2  mm2, mm3, mm5, mm7
    paddw mm0, mm2
    paddw mm1, mm3
    paddw mm0, mm1
    ABS2  mm4, mm6, mm2, mm3
    movq  mm7, [spill+8]
    paddw mm0, mm4
    paddw mm0, mm6
    ABS1  mm7, mm1
    movq  mm5, [spill+0]
    paddw mm0, mm7 ; 7x4 sum
    movq  mm6, mm5
    movq  mm7, [ecx+8] ; left bottom
    psllw mm7, 3
    psubw mm6, mm7 ; remove the H-prediction term from row 0
    ABS2  mm5, mm6, mm2, mm3
    paddw mm5, mm0
    paddw mm6, mm0
    movq  [sum+0], mm5 ; dc
    movq  [sum+8], mm6 ; left

    movq mm0, [trans+0x20]
    movq mm1, [trans+0x28]
    movq mm2, [trans+0x30]
    movq mm3, [trans+0x38]
    movq mm4, [trans+0x40]
    movq mm5, [trans+0x48]
    movq mm6, [trans+0x50]
    movq mm7, [trans+0x58]

    HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7

    ABS2  mm2, mm3, mm0, mm1
    ABS2  mm4, mm5, mm0, mm1

    ABS2  mm6, mm7, mm4, mm5

    paddw mm2, mm1 ; 7x4 sum

    psllw mm7, 3 ; left top

    movzx edx, word [ecx+0]

    ABS2  mm0, mm1, mm5, mm6
    movq  mm3, [sum+0] ; dc

    paddw mm1, [sum+8] ; h

    movq  mm3, [ecx+16] ; top left
    movq  mm4, [ecx+24] ; top right

    ABS2  mm3, mm4, mm5, mm6

    SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd

    mov  eax, [args+8] ; res
    mov  [eax+0], ecx ; i8x8_v satd
    mov  [eax+4], edx ; i8x8_h satd

    mov  [eax+8], ecx ; i8x8_dc satd

    add  esp, 0x70
    ret

;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
;                                         const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_4x4x2_core_mmxext
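; The body is not part of this excerpt. For reference, a hedged C model of
; what the 4x4x2 SSIM core computes (matching the scalar version in x264's
; pixel.c): for two horizontally adjacent 4x4 blocks it gathers the four
; sums that the SSIM formula is later built from.
;
;   static void ssim_4x4x2_core_ref( const uint8_t *pix1, int stride1,
;                                    const uint8_t *pix2, int stride2,
;                                    int sums[2][4] )
;   {
;       for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;       {
;           int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;           for( int y = 0; y < 4; y++ )
;               for( int x = 0; x < 4; x++ )
;               {
;                   int a = pix1[x+y*stride1];
;                   int b = pix2[x+y*stride2];
;                   s1  += a;         // sum of pix1
;                   s2  += b;         // sum of pix2
;                   ss  += a*a + b*b; // sum of squares
;                   s12 += a*b;       // cross term
;               }
;           sums[z][0] = s1;
;           sums[z][1] = s2;
;           sums[z][2] = ss;
;           sums[z][3] = s12;
;       }
;   }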