;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2011 x264 project
;* Authors: Oskar Arvidsson <oskar@irock.se>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
27 %include "x86util.asm"
;=============================================================================
;=============================================================================
; Accumulate SAD over one 16-pixel row, MMX path (fragment: the loads and
; psubw that fill m1-m4, and the accumulation, are elided in this excerpt).
; Assumes m1-m4 hold packed-word fenc-ref differences -- confirm against
; the elided lines.
%macro SAD_INC_1x16P_MMX 0
    ABSW2     m1, m2, m1, m2, m5, m6   ; |diff| of first two word groups (m5/m6 scratch)
    ABSW2     m3, m4, m3, m4, m7, m5   ; |diff| of last two (m5 is free again)
; Accumulate SAD over two 8-pixel rows, MMX path (fragment: loads/psubw and
; accumulation elided in this excerpt).
; Assumes m1-m4 hold packed-word differences for the two rows -- TODO confirm.
%macro SAD_INC_2x8P_MMX 0
    ABSW2     m1, m2, m1, m2, m5, m6   ; absolute values, m5/m6 as temporaries
    ABSW2     m3, m4, m3, m4, m7, m5   ; ABSW2 is an x86util helper macro
; Accumulate SAD over two 4-pixel rows, MMX path (fragment: only the abs step
; is visible here; loads/psubw/accumulation elided).
%macro SAD_INC_2x4P_MMX 0
    ABSW2     m1, m2, m1, m2, m3, m4   ; |diff| of both rows; m3/m4 scratch
;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
; Entry point for the MMX SAD, instantiated from a width/height macro
; (%1 = width, %2 = height -- enclosing %macro not visible in this excerpt).
; 4 args, 4 GPRs, no xmm clobber declaration (MMX only).
cglobal pixel_sad_%1x%2, 4,4
;=============================================================================
;=============================================================================
; Accumulate SAD over two 16-pixel rows, SSE2/XMM path.
; r0 = fenc pointer, r1 = fenc stride, r2 = ref pointer, r3 = ref stride
; (per the pixel_sad signature); strides are scaled by 2 because pixels are
; uint16_t. Fragment: the row-0 loads into m1/m2 and the accumulation into
; the sum register are elided in this excerpt.
%macro SAD_INC_2x16P_XMM 0
    movu      m3, [r2+2*r3+ 0]         ; ref row 1, pixels 0-7 (unaligned load)
    movu      m4, [r2+2*r3+16]         ; ref row 1, pixels 8-15
    psubw     m3, [r0+2*r1+ 0]         ; m3 = ref - fenc (packed words)
    psubw     m4, [r0+2*r1+16]
    ABSW2     m1, m2, m1, m2, m5, m6   ; |diff| of row 0 (m5/m6 scratch)
    ABSW2     m3, m4, m3, m4, m7, m5   ; |diff| of row 1
; Accumulate SAD over two 8-pixel rows, SSE2/XMM path (fragment: loads/psubw
; and accumulation elided in this excerpt).
%macro SAD_INC_2x8P_XMM 0
    ABSW2     m1, m2, m1, m2, m3, m4   ; |diff| of both rows; m3/m4 scratch
;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
; XMM entry point, instantiated per width/height (%1 x %2) by an enclosing
; macro not visible in this excerpt. 4 args, 4 GPRs, 8 xmm registers
; (the ",8" lets x86inc save xmm6-7 on Win64 where they are callee-saved).
cglobal pixel_sad_%1x%2, 4,4,8
; Select the instruction set + "aligned" variant for the SAD instantiations
; that follow each directive (the SAD_* invocations themselves are elided
; from this excerpt).
INIT_XMM sse2, aligned
INIT_XMM ssse3, aligned
;=============================================================================
;=============================================================================
; Advance all pointers by 4 rows for the 3-way SAD (fragment: only the fenc
; pointer bump is visible; the r1-r3 increments are elided in this excerpt).
; FENC_STRIDE is in bytes per row of the fixed-stride fenc buffer.
%macro SAD_X3_INC_P 0
    add       r0, 4*FENC_STRIDE        ; fenc += 4 rows
; First step of a 3-way SAD column (fragment: loads/psubw elided in this
; excerpt; only the abs steps are visible).
%macro SAD_X3_ONE_START 0
    ABSW2     m0, m1, m0, m1, m4, m5   ; |diff| vs pix0/pix1 (m4/m5 scratch)
    ABSW2     m3, m4, m3, m4, m7, m6   ; |diff| of the next pair
; NOTE(review): the %if below references %1/%2, so it belongs to a later
; width/height-parameterized macro, not to SAD_X3_ONE_START (intervening
; lines are elided in this excerpt). Presumably it guards a wider
; accumulation for 16x16 on MMX (mmsize==8) -- confirm against upstream.
%if mmsize == 8 && %1*%2 == 256
; Advance all pointers by 4 rows for the 4-way SAD (fragment: only the fenc
; pointer bump is visible; the reference-pointer increments are elided).
%macro SAD_X4_INC_P 0
    add       r0, 4*FENC_STRIDE        ; fenc += 4 rows
; First step of a 4-way SAD column (fragment: loads/psubw elided in this
; excerpt). The m5-m8/m9-m10 pair below uses registers >= m8, so it comes
; from a num_mmregs>8 (x86-64) path of a later macro -- intervening lines
; are elided.
%macro SAD_X4_ONE_START 0
    ABSW2     m0, m1, m0, m1, m5, m6   ; |diff| vs pix0/pix1 (m5/m6 scratch)
    ABSW2     m2, m3, m2, m3, m4, m7   ; |diff| vs pix2/pix3
    ABSW2     m5, m6, m5, m6, m9, m10  ; 16-register path: m9/m10 scratch
    ABSW2     m7, m8, m7, m8, m9, m10
; Fallback path when only 8 mmregs are available and SSSE3 pabsw is absent
; (fragment: most of the branch body is elided in this excerpt).
%else ; num_mmregs == 8 && !ssse3
    ABSW2     m5, m6, m5, m6, m7, m4   ; reuse m7/m4 as scratch in the 8-reg case
; NOTE(review): %if below references %1/%2 of the enclosing width/height
; macro; presumably the 16x16-on-MMX overflow guard -- confirm upstream.
%if mmsize == 8 && %1*%2 == 256
; Inner loop body of SAD_X_2xNP (fragment: the enclosing %macro header and
; the %assign x loop driving these lines are elided in this excerpt).
; Processes one mmsize-wide column of two rows: row 0 at offset x*mmsize,
; row 1 one fenc stride (2*FENC_STRIDE bytes) / one pixel row (2*%2) below.
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
; int pixel_vsad( uint16_t *src, int stride, int height ) -- vertical SAD
; between successive rows. 3 args, 3 GPRs, 8 xmm regs. Fragment: loop
; structure and most loads are elided in this excerpt; strides are scaled
; by 2 because pixels are uint16_t.
cglobal pixel_vsad, 3,3,8
    mova      m3, [r0+2*r1+16]         ; next row, pixels 8-15 (aligned load)
    ABSW2     m0, m1, m0, m1, m4, m5   ; |row diff| (m4/m5 scratch)
    mova      m7, [r0+2*r1+16]         ; reload of next row's high half (later iteration)
; The two horizontal sums below are alternative tails for different bit
; depths (selection %if elided): HADDW when word sums cannot overflow,
; HADDUW otherwise.
    HADDW     m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
    HADDUW    m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
;-----------------------------------------------------------------------------
; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
; Multi-reference SAD entry, instantiated per K (%1) and block size (%2x%3).
; 6 args, 7 GPRs, XMM_REGS xmm registers. Fragment: the macro header and
; most of the body are elided in this excerpt.
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
%xdefine STRIDE r %+ regnum            ; alias the stride's register (regnum set by elided code)
    movsxd    STRIDE, STRIDE %+ d      ; sign-extend the 32-bit int stride arg to 64-bit
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE        ; second row of a 2-row step
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1  ; first 2-row group (skip col 0)
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)    ; remaining 2-row groups, all columns