;*****************************************************************************
;* pixel-32.asm: x86_32 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"
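; LOAD_DIFF_4x8P: load a 4-wide, 8-row block of word-sized differences
; between the two pixel arrays (r0 with stride r1, r2 with stride r3;
; r4 and r5 are expected to hold 3*r1 and 3*r3) into m0-m7. The last row
; goes through the spill slot because no free mm register remains for the
; LOAD_DIFF temporary.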
%macro LOAD_DIFF_4x8P 1 ; dx
    LOAD_DIFF m0, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF m1, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    LOAD_DIFF m3, m6, none, [r0+%1+r4],   [r2+%1+r5]
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
    LOAD_DIFF m4, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF m5, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    movq [spill], m5
    LOAD_DIFF m7, m5, none, [r0+%1+r4],   [r2+%1+r5]
    movq m5, [spill]
%endmacro
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
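; SA8D is the sum of absolute coefficients of an 8x8 Hadamard transform
; applied to the difference between the two blocks; the final rounding and
; scaling are applied by the wrapper that calls this internal label.
; Rough C sketch of the metric (illustrative only; hadamard8_1d() is a
; hypothetical helper applying an 8-point 1-D Hadamard transform in place
; with the given stride):
;
;   int sa8d_8x8( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;   {
;       int16_t d[8][8];
;       int sum = 0;
;       for( int y = 0; y < 8; y++ )
;           for( int x = 0; x < 8; x++ )
;               d[y][x] = pix1[x+y*i_pix1] - pix2[x+y*i_pix2];
;       for( int i = 0; i < 8; i++ ) hadamard8_1d( &d[i][0], 1 ); // rows
;       for( int i = 0; i < 8; i++ ) hadamard8_1d( &d[0][i], 8 ); // columns
;       for( int y = 0; y < 8; y++ )
;           for( int x = 0; x < 8; x++ )
;               sum += abs( d[y][x] );
;       return ( sum + 2 ) >> 2;
;   }
;
; With only eight 64-bit MMX registers, the 8x8 transpose between the two
; HADAMARD8_V passes is done in 4x4 quadrants via the trans buffer.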
cglobal pixel_sa8d_8x8_internal_mmx2
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
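; spill (16 bytes) is scratch space for register juggling; trans (96 bytes)
; holds transposed 4x4 quadrants of the block between the two Hadamard
; passes. The esp-relative layout assumes the stack space has already been
; reserved.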
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq [trans+0x48], m1
    movq [trans+0x50], m2
    movq [trans+0x58], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    movq m0, [trans+0x00]
    movq m1, [trans+0x08]
    movq m2, [trans+0x10]
    movq m3, [trans+0x18]
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    movq m0, [trans+0x20]
    movq m1, [trans+0x28]
    movq m2, [trans+0x30]
    movq m3, [trans+0x38]
    movq m4, [trans+0x40]
    movq m5, [trans+0x48]
    movq m6, [trans+0x50]
    movq m7, [trans+0x58]
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
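; SUM_MM_X3: horizontal reduction of three mm accumulators at once; pshufw
; swaps each register's halves so they can be added pairwise, and the final
; combine uses the op passed as the last argument (paddd at the call site
; below).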
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
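; LOAD_4x8P: load a 4-wide, 8-row block of fenc pixels (FENC_STRIDE apart)
; into m0-m7, zero-extended from bytes to words. Rows 6 and 7 take a detour
; through m6 and the spill slot to leave room for the zero register needed
; by the byte-to-word unpack.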
%macro LOAD_4x8P 1 ; dx
    movd m6, [eax+%1+7*FENC_STRIDE]
    movd m0, [eax+%1+0*FENC_STRIDE]
    movd m1, [eax+%1+1*FENC_STRIDE]
    movd m2, [eax+%1+2*FENC_STRIDE]
    movd m3, [eax+%1+3*FENC_STRIDE]
    movd m4, [eax+%1+4*FENC_STRIDE]
    movd m5, [eax+%1+5*FENC_STRIDE]
    movd m6, [eax+%1+6*FENC_STRIDE]
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
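; Computes SA8D-style costs for the V, H and DC 8x8 intra predictions in a
; single pass: fenc is Hadamard transformed once, and the cost of each
; prediction is obtained by correcting the affected first-row/first-column
; coefficients with the edge terms passed in edges[] (left edge in edges[0],
; top edge in edges[1], both prepared by the C caller). Results are stored
; as res[0] = V, res[1] = H, res[2] = DC; the "_core" suffix reflects that
; edge preparation and any final rounding happen in the wrapper.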
cglobal intra_sa8d_x3_8x8_core_mmx2
%define args  esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
%define sum   esp+0    ; +32
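; sum aliases the start of trans; this is safe because the low quadwords of
; the transpose buffer have already been reloaded by the time the dc/left
; partial sums are stored there.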
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq [trans+0x00], m4
    movq [trans+0x08], m5
    movq [trans+0x10], m6
    movq [trans+0x18], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq [trans+0x20], m0
    movq [trans+0x28], m1
    movq [trans+0x30], m2
    movq [trans+0x38], m3
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq [trans+0x40], m0
    movq [trans+0x48], m1
    movq [trans+0x50], m2
    movq [trans+0x58], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq m0, [trans+0x00]
    movq m1, [trans+0x08]
    movq m2, [trans+0x10]
    movq m3, [trans+0x18]
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    paddw m2, m1 ; 7x4 sum
    movq m1, [ecx+8] ; left bottom
    movq [sum+0], m0 ; dc
    movq [sum+8], m7 ; left
    movq m0, [trans+0x20]
    movq m1, [trans+0x28]
    movq m2, [trans+0x30]
    movq m3, [trans+0x38]
    movq m4, [trans+0x40]
    movq m5, [trans+0x48]
    movq m6, [trans+0x50]
    movq m7, [trans+0x58]
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    paddw m2, m1 ; 7x4 sum
    psllw m7, 3 ; left top
    movzx edx, word [ecx+0]
    movq m3, [sum+0] ; dc
    paddw m1, [sum+8] ; h
    movq m3, [ecx+16] ; top left
    movq m4, [ecx+24] ; top right
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    mov [eax+0], ecx ; i8x8_v satd
    mov [eax+4], edx ; i8x8_h satd
    mov [eax+8], ecx ; i8x8_dc satd
;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                             const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
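; For two horizontally adjacent 4x4 blocks this accumulates the sums SSIM
; needs: sum of pix1, sum of pix2, sum of both squares, and sum of the
; products. A rough C equivalent of what gets stored (illustrative sketch,
; not the reference implementation):
;
;   void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                         const uint8_t *pix2, int stride2, int sums[2][4] )
;   {
;       for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;       {
;           int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;           for( int y = 0; y < 4; y++ )
;               for( int x = 0; x < 4; x++ )
;               {
;                   int a = pix1[x+y*stride1];
;                   int b = pix2[x+y*stride2];
;                   s1  += a;
;                   s2  += b;
;                   ss  += a*a + b*b;
;                   s12 += a*b;
;               }
;           sums[z][0] = s1;
;           sums[z][1] = s2;
;           sums[z][2] = ss;
;           sums[z][3] = s12;
;       }
;   }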
cglobal pixel_ssim_4x4x2_core_mmx2