1 ;*****************************************************************************
2 ;* pixel-32.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 ;* Loren Merritt <lorenm@u.washington.edu>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;*****************************************************************************
25 %include "x86util.asm"
; Transpose a 4x4 matrix of 16-bit words held in MMX registers %1-%4,
; using %5 as scratch.  As the original tag says, the result registers
; come out permuted: inputs a,b,c,d (scratch t) end up ordered a,d,t,c,
; so callers must read the transposed rows from that shuffled order.
; Built from SBUTTERFLY (defined in x86util.asm): first interleave at
; word granularity, then at dword granularity.
; NOTE(review): the matching %endmacro (orig. line 40) is elided from
; this excerpt.
35 %macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
36 SBUTTERFLY q, wd, %1, %2, %5
37 SBUTTERFLY q, wd, %3, %4, %2
38 SBUTTERFLY q, dq, %1, %3, %4
39 SBUTTERFLY q, dq, %5, %2, %3
; Load one 4-pixel row from each of the two pixel buffers:
;   %1 <- 4 bytes of pix1 at [eax + ebx*%4 + %3]  (eax=pix1, ebx=stride1)
;   %2 <- 4 bytes of pix2 at [ecx + edx*%4 + %3]  (ecx=pix2, edx=stride2)
; %3 = column offset (dx), %4 = row index (dy).
; NOTE(review): the unpack-to-words and psubw steps that turn these two
; loads into a difference row (orig. lines 45-48, incl. %endmacro) are
; elided from this excerpt.
42 %macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy
43 movd %1, [eax+ebx*%4+%3]
44 movd %2, [ecx+edx*%4+%3]
; Load a 4x8 block of pixel differences into mm0-mm7 (as words),
; two rows per LOAD_DIFF_4P pair; %1 is the column offset (dx).
; The last row reuses mm6 as the scratch for mm7 because all eight
; data registers are occupied by then.
; NOTE(review): the instructions between each pair (orig. lines
; 53-54, 57-58, 61-62, 64) — presumably advancing eax/ecx by two
; strides — are elided from this excerpt; verify against the full file.
50 %macro LOAD_DIFF_4x8P 1 ; dx
51 LOAD_DIFF_4P mm0, mm7, %1, 0
52 LOAD_DIFF_4P mm1, mm7, %1, 1
55 LOAD_DIFF_4P mm2, mm7, %1, 0
56 LOAD_DIFF_4P mm3, mm7, %1, 1
59 LOAD_DIFF_4P mm4, mm7, %1, 0
60 LOAD_DIFF_4P mm5, mm7, %1, 1
63 LOAD_DIFF_4P mm6, mm7, %1, 0
65 LOAD_DIFF_4P mm7, mm6, %1, 1
; Take absolute values of all eight word-accumulators pairwise;
; ABS2 a, b, t0, t1 (from x86util.asm) replaces a and b with |a|, |b|
; using t0/t1 as scratch.  mm6/mm7 serve as scratch for the first two
; pairs, then mm2/mm3 (already absolute by that point) for the rest.
; NOTE(review): the enclosing %macro header (orig. lines 66-71) and the
; paddw accumulation lines interleaved here (orig. 74-77) are elided
; from this excerpt — this run is a fragment of a sum-of-abs macro.
72 ABS2 mm0, mm1, mm6, mm7
73 ABS2 mm2, mm3, mm6, mm7
78 ABS2 mm4, mm5, mm2, mm3
79 ABS2 mm6, mm7, mm2, mm3
87 ;-----------------------------------------------------------------------------
88 ; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
89 ;-----------------------------------------------------------------------------
; SA8D = sum of absolute values of an 8x8 Hadamard transform of the
; pixel differences between two 8x8 blocks.  32-bit cdecl: all four
; args on the stack.  The [esp+8] offsets imply one register push in
; the (elided) prologue — presumably "push ebx"; TODO confirm against
; the full file.  Many lines of this body (loads, spills, the %define'd
; scratch usage, the epilogue) are elided from this excerpt.
90 cglobal x264_pixel_sa8d_8x8_mmxext
92 mov eax, [esp+ 8] ; pix1
93 mov ebx, [esp+12] ; stride1
94 mov ecx, [esp+16] ; pix2
95 mov edx, [esp+20] ; stride2
; Scratch areas carved out of the (elided) stack frame:
; "spill" = 16 bytes for register spills, "trans" = 96 bytes holding
; the transposed intermediate rows between the two 1-D Hadamard passes.
98 %define spill esp+0x60 ; +16
99 %define trans esp+0 ; +96
; --- Pass 1: 1-D Hadamard down the columns of the difference block
; (the LOAD_DIFF_4x8P that fills mm0-mm7, orig. line 100, is elided).
101 HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
; Transpose and store both 4x4 quadrants of the first half.  The store
; order (mm4,mm7,mm0,mm6 / mm0,mm3,mm4,mm2) follows TRANSPOSE4x4W's
; permuted output order (abcd-t -> adtc).
104 TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
105 movq [trans+0x00], mm4
106 movq [trans+0x08], mm7
107 movq [trans+0x10], mm0
108 movq [trans+0x18], mm6
110 TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
111 movq [trans+0x20], mm0
112 movq [trans+0x28], mm3
113 movq [trans+0x30], mm4
114 movq [trans+0x38], mm2
; --- Same vertical pass for the second 4-column half of the block
; (its LOAD_DIFF_4x8P, orig. lines 115-118, is elided).
119 HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
122 TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
123 movq [trans+0x40], mm0
124 movq [trans+0x48], mm3
125 movq [trans+0x50], mm7
126 movq [trans+0x58], mm2
; --- Pass 2: reload transposed rows and run the horizontal Hadamard.
; mm4/mm7/mm0/mm6 still hold the second quadrant's transpose output;
; the register list below routes them into their row positions.
128 TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
129 movq mm5, [trans+0x00]
130 movq mm1, [trans+0x08]
131 movq mm2, [trans+0x10]
132 movq mm3, [trans+0x18]
134 HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
; (abs-accumulation of this half, orig. lines 135-137, elided)
138 movq mm0, [trans+0x20]
139 movq mm1, [trans+0x28]
140 movq mm2, [trans+0x30]
141 movq mm3, [trans+0x38]
142 movq mm4, [trans+0x40]
143 movq mm5, [trans+0x48]
144 movq mm6, [trans+0x50]
145 movq mm7, [trans+0x58]
147 HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
; --- Final horizontal reduction of the word sums in mm0:
; 01001110b swaps the two dwords, 10110001b swaps words within dwords;
; the paddw lines pairing these shuffles (orig. 152, 154-156) are
; elided.  Result lands in eax.
151 pshufw mm1, mm0, 01001110b
153 pshufw mm1, mm0, 10110001b
; Keep the unrounded sum in ecx so the 16x16 caller can accumulate it
; before the final rounding (done in the elided tail).
157 mov ecx, eax ; preserve rounding for 16x16
; Horizontally reduce three packed accumulators (%1-%3) in parallel,
; using %4-%7 as scratch and %8 as the combining op (e.g. paddd).
; Each round pshufw's the dword-swapped copy (01001110b swaps the two
; 32-bit halves) next to the original so %8 can fold them together.
; NOTE(review): the %8 fold instructions between the two shuffle
; rounds (orig. lines 172-177, 181-185, incl. %endmacro) are elided
; from this excerpt.
167 %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
169 pshufw %4, %1, 01001110b
170 pshufw %5, %2, 01001110b
171 pshufw %6, %3, 01001110b
178 pshufw %4, %1, 01001110b
179 pshufw %5, %2, 01001110b
180 pshufw %6, %3, 01001110b
; Load a 4x8 block of encoder-frame pixels (eax = fenc, compile-time
; constant FENC_STRIDE) at column offset %1 into mm0-mm7 as words.
; Row 7 is loaded first into mm6, and mm6 is reloaded with row 6 at
; the end, because the (elided) unpack/widening steps in between
; consume mm6 as scratch.
; NOTE(review): the punpcklbw/widening instructions interleaved here
; (orig. lines 187, 195-200, 202-207, incl. %endmacro) are elided from
; this excerpt.
186 %macro LOAD_4x8P 1 ; dx
188 movd mm6, [eax+%1+7*FENC_STRIDE]
189 movd mm0, [eax+%1+0*FENC_STRIDE]
190 movd mm1, [eax+%1+1*FENC_STRIDE]
191 movd mm2, [eax+%1+2*FENC_STRIDE]
192 movd mm3, [eax+%1+3*FENC_STRIDE]
193 movd mm4, [eax+%1+4*FENC_STRIDE]
194 movd mm5, [eax+%1+5*FENC_STRIDE]
201 movd mm6, [eax+%1+6*FENC_STRIDE]
208 ;-----------------------------------------------------------------------------
209 ; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
210 ;-----------------------------------------------------------------------------
; Computes SA8D scores of the 8x8 fenc block against three intra
; prediction modes at once (V, H, DC), exploiting the fact that the
; Hadamard transform of the residual differs from the transform of the
; source only in the first row/column.  Results written to res[0..2].
; 32-bit cdecl; the prologue that sets up eax/ecx and the 0x74-byte
; stack frame (orig. lines 212-214) is elided from this excerpt, as
; are many body lines — offsets below are relative to that frame.
211 cglobal x264_intra_sa8d_x3_8x8_core_mmxext
; Stack-frame scratch layout (areas overlap deliberately: "sum" reuses
; the low 32 bytes of "trans" after the first pass is consumed).
215 %define args esp+0x74
216 %define spill esp+0x60 ; +16
217 %define trans esp+0 ; +96
218 %define sum esp+0 ; +32
; --- Pass 1: vertical Hadamard of the fenc pixels (the LOAD_4x8P that
; fills mm0-mm7, orig. line 219, is elided).
220 HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
; Transpose both 4x4 quadrants to the "trans" buffer; the store order
; follows TRANSPOSE4x4W's permuted output (abcd-t -> adtc).
223 TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
224 movq [trans+0x00], mm4
225 movq [trans+0x08], mm7
226 movq [trans+0x10], mm0
227 movq [trans+0x18], mm6
229 TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
230 movq [trans+0x20], mm0
231 movq [trans+0x28], mm3
232 movq [trans+0x30], mm4
233 movq [trans+0x38], mm2
; --- Vertical pass on the second 4-column half (its LOAD_4x8P,
; orig. lines 234-235, elided).
236 HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
239 TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
240 movq [trans+0x40], mm0
241 movq [trans+0x48], mm3
242 movq [trans+0x50], mm7
243 movq [trans+0x58], mm2
; --- Pass 2: horizontal Hadamard on the reloaded transposed rows.
245 TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
246 movq mm5, [trans+0x00]
247 movq mm1, [trans+0x08]
248 movq mm2, [trans+0x10]
249 movq mm3, [trans+0x18]
251 HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
; Absolute values of the transformed rows; first-row terms (DC and
; the row carrying the horizontal-prediction correction) are handled
; separately below.  Some interleaved spill/accumulate lines are
; elided throughout this tail.
255 ABS2 mm0, mm1, mm5, mm7
256 ABS2 mm2, mm3, mm5, mm7
260 ABS2 mm4, mm6, mm2, mm3
266 paddw mm0, mm7 ; 7x4 sum
; edges[] layout (ecx): [+0]=left-top words, [+8]=left-bottom,
; [+16]=top-left, [+24]=top-right (per the original annotations here
; and below).
268 movq mm7, [ecx+8] ; left bottom
271 ABS2 mm5, mm6, mm2, mm3
; Stash the DC term and the left-edge (H-mode) correction for the
; final combine.
274 movq [sum+0], mm5 ; dc
275 movq [sum+8], mm6 ; left
; --- Second half: reload and transform the remaining rows.
277 movq mm0, [trans+0x20]
278 movq mm1, [trans+0x28]
279 movq mm2, [trans+0x30]
280 movq mm3, [trans+0x38]
281 movq mm4, [trans+0x40]
282 movq mm5, [trans+0x48]
283 movq mm6, [trans+0x50]
284 movq mm7, [trans+0x58]
286 HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
299 ABS2 mm2, mm3, mm0, mm1
300 ABS2 mm4, mm5, mm0, mm1
306 ABS2 mm6, mm7, mm4, mm5
310 paddw mm2, mm1 ; 7x4 sum
; x8 scaling of the left-top edge term to match the 8x8 transform's
; gain on the DC-containing row.
314 psllw mm7, 3 ; left top
; edx accumulates the scalar part of the H-mode score from the first
; left-edge word.
316 movzx edx, word [ecx+0]
324 ABS2 mm0, mm1, mm5, mm6
325 movq mm3, [sum+0] ; dc
; Fold the stored left-edge correction into the H accumulator.
330 paddw mm1, [sum+8] ; h
334 movq mm3, [ecx+16] ; top left
335 movq mm4, [ecx+24] ; top right
340 ABS2 mm3, mm4, mm5, mm6
; Reduce the three packed accumulators (V, H, DC) to scalars with
; dword adds; the transfers into ecx/edx between here and the stores
; (orig. lines 345-351, 354-356) are elided.
344 SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
; eax = res pointer at this point (loaded in an elided line —
; TODO confirm against the full file).
352 mov [eax+0], ecx ; i8x8_v satd
353 mov [eax+4], edx ; i8x8_h satd
357 mov [eax+8], ecx ; i8x8_dc satd
368 ;-----------------------------------------------------------------------------
369 ; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
370 ; const uint8_t *pix2, int stride2, int sums[2][4] )
371 ;-----------------------------------------------------------------------------
372 cglobal x264_pixel_ssim_4x4x2_core_mmxext