1 ;*****************************************************************************
2 ;* pixel-32.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;*****************************************************************************
25 %include "x86util.asm"
30 %macro LOAD_DIFF_4x8P 1 ; dx
; Load an 8-row column of pixel differences into m0..m7 via the LOAD_DIFF
; helper (from x86util.asm). %1 = horizontal byte offset applied to both
; sources. r0/r2 are the two pixel pointers with strides r1/r3; r4/r5
; presumably hold 3*stride for each source -- TODO confirm in the caller.
; The second operand of each LOAD_DIFF is a scratch register; "none" skips
; the zero/bias operand.
; NOTE(review): this excerpt is missing the pointer-advance instructions
; between the two 4-row groups and the closing %endmacro -- the original
; file advances r0/r2 by 4 rows before loading m4..m7. Verify against the
; full source before editing.
31 LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1]
32 LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3]
33 LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
34 LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5]
37 LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1]
38 LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3]
39 LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
; m7's scratch is m5 here because m0..m6 already hold live difference rows.
41 LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5]
63 ;-----------------------------------------------------------------------------
64 ; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
65 ;-----------------------------------------------------------------------------
66 cglobal x264_pixel_sa8d_8x8_internal_mmxext
; Internal body of the 8x8 SA8D (sum of absolute 8x8 Hadamard-transformed
; differences): vertical Hadamard pass, 4x4-word transposes through a stack
; buffer (only 64-bit MMX registers are available, so the 8x8 transpose is
; done in 4x4 quarters), then a second Hadamard pass on the transposed data.
; Stack layout: `trans` = 96-byte transpose buffer at esp+0, `spill` =
; 16-byte spill area at esp+0x60. The caller is expected to have set up
; esp and the r0..r5 pointer/stride registers -- confirm in the full file.
; NOTE(review): this excerpt omits the LOAD_DIFF_4x8P invocation, the
; stores into `trans`, the absolute-value/summing tail and the ret;
; the lines below are only the transform/transpose skeleton.
71 %define spill esp+0x60 ; +16
72 %define trans esp+0 ; +96
74 HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
77 TRANSPOSE4x4W 4, 5, 6, 7, 1
83 TRANSPOSE4x4W 0, 1, 2, 3, 4
92 HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
95 TRANSPOSE4x4W 0, 1, 2, 3, 7
101 TRANSPOSE4x4W 4, 5, 6, 7, 1
; Reload the first transposed 4x4 quarter from the stack buffer.
102 movq m0, [trans+0x00]
103 movq m1, [trans+0x08]
104 movq m2, [trans+0x10]
105 movq m3, [trans+0x18]
107 HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
; Reload the remaining transposed rows for the second-half transform.
111 movq m0, [trans+0x20]
112 movq m1, [trans+0x28]
113 movq m2, [trans+0x30]
114 movq m3, [trans+0x38]
115 movq m4, [trans+0x40]
116 movq m5, [trans+0x48]
117 movq m6, [trans+0x50]
118 movq m7, [trans+0x58]
120 HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
130 %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
; Horizontally reduce three MMX accumulators (%1..%3) in parallel using
; temporaries %4..%7 and the combining instruction passed as %8 (e.g.
; paddw/paddd). 01001110b selects words {2,3,0,1}, i.e. it swaps the high
; and low 32-bit halves of the register so %8 can fold the halves together.
; NOTE(review): the %8 applications between/after the shuffle groups and
; the %endmacro are missing from this excerpt -- only the shuffle skeleton
; of the two reduction steps is visible.
132 pshufw %4, %1, 01001110b
133 pshufw %5, %2, 01001110b
134 pshufw %6, %3, 01001110b
; Second reduction step: swap halves again to fold the remaining pairs.
141 pshufw %4, %1, 01001110b
142 pshufw %5, %2, 01001110b
143 pshufw %6, %3, 01001110b
149 %macro LOAD_4x8P 1 ; dx
; Load eight 4-byte rows of the encode block (eax = fenc base, fixed
; FENC_STRIDE row pitch) at horizontal offset %1 into mm registers.
; Row 7 is loaded first into m6, presumably so it can be spilled/unpacked
; before m6 is reused for row 6 at the end -- TODO confirm against the
; missing lines.
; NOTE(review): lines between rows 5 and 6 (spill/unpack of the loaded
; dwords) and the %endmacro are absent from this excerpt.
151 movd m6, [eax+%1+7*FENC_STRIDE]
152 movd m0, [eax+%1+0*FENC_STRIDE]
153 movd m1, [eax+%1+1*FENC_STRIDE]
154 movd m2, [eax+%1+2*FENC_STRIDE]
155 movd m3, [eax+%1+3*FENC_STRIDE]
156 movd m4, [eax+%1+4*FENC_STRIDE]
157 movd m5, [eax+%1+5*FENC_STRIDE]
164 movd m6, [eax+%1+6*FENC_STRIDE]
171 ;-----------------------------------------------------------------------------
172 ; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
173 ;-----------------------------------------------------------------------------
174 cglobal x264_intra_sa8d_x3_8x8_core_mmxext
; Computes the SA8D costs of the three 8x8 intra prediction modes
; (vertical, horizontal, DC) in one pass: Hadamard-transform the fenc
; block, then adjust the transform's first row/column with the edge sums
; so each mode's cost is obtained without explicitly building predictions.
; Per the C prototype above: fenc pointer, edges[2][8] (left/top sums,
; addressed via ecx below), and the int *res output written at the end
; (via eax). Register assignments presumably come from the missing
; prologue -- TODO confirm.
; Stack layout: `trans` 96-byte transpose buffer (reused as the 32-byte
; `sum` scratch), `spill` at +0x60, `args` at +0x74.
; NOTE(review): the prologue, the LOAD_4x8P calls, most of the cost
; arithmetic and the epilogue/ret are missing from this excerpt.
178 %define args esp+0x74
179 %define spill esp+0x60 ; +16
180 %define trans esp+0 ; +96
181 %define sum esp+0 ; +32
183 HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
; First 4x4 quarter transpose; stash the results in the stack buffer.
186 TRANSPOSE4x4W 4, 5, 6, 7, 0
187 movq [trans+0x00], m4
188 movq [trans+0x08], m5
189 movq [trans+0x10], m6
190 movq [trans+0x18], m7
192 TRANSPOSE4x4W 0, 1, 2, 3, 4
193 movq [trans+0x20], m0
194 movq [trans+0x28], m1
195 movq [trans+0x30], m2
196 movq [trans+0x38], m3
199 HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
202 TRANSPOSE4x4W 0, 1, 2, 3, 7
203 movq [trans+0x40], m0
204 movq [trans+0x48], m1
205 movq [trans+0x50], m2
206 movq [trans+0x58], m3
208 TRANSPOSE4x4W 4, 5, 6, 7, 0
209 movq m0, [trans+0x00]
210 movq m1, [trans+0x08]
211 movq m2, [trans+0x10]
212 movq m3, [trans+0x18]
214 HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
229 paddw m2, m1 ; 7x4 sum
; ecx appears to point at the edges[] array: [ecx+8] = bottom half of the
; left-edge sums -- verify layout against the C caller.
231 movq m1, [ecx+8] ; left bottom
; Save partial DC/left costs; `sum` aliases the now-free trans buffer.
237 movq [sum+0], m0 ; dc
238 movq [sum+8], m7 ; left
240 movq m0, [trans+0x20]
241 movq m1, [trans+0x28]
242 movq m2, [trans+0x30]
243 movq m3, [trans+0x38]
244 movq m4, [trans+0x40]
245 movq m5, [trans+0x48]
246 movq m6, [trans+0x50]
247 movq m7, [trans+0x58]
249 HADAMARD8_V m0, m1, m2, m3, m4, m5, m6, m7
273 paddw m2, m1 ; 7x4 sum
; *8 scaling of the edge term (transform gain) -- confirm derivation.
277 psllw m7, 3 ; left top
279 movzx edx, word [ecx+0]
288 movq m3, [sum+0] ; dc
293 paddw m1, [sum+8] ; h
297 movq m3, [ecx+16] ; top left
298 movq m4, [ecx+24] ; top right
; Final horizontal reduction of the three mode costs with dword adds.
307 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
; eax now holds the res pointer; store the three SATD-style costs.
315 mov [eax+0], ecx ; i8x8_v satd
316 mov [eax+4], edx ; i8x8_h satd
320 mov [eax+8], ecx ; i8x8_dc satd
331 ;-----------------------------------------------------------------------------
332 ; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
333 ; const uint8_t *pix2, int stride2, int sums[2][4] )
334 ;-----------------------------------------------------------------------------
335 cglobal x264_pixel_ssim_4x4x2_core_mmxext