1 ;*****************************************************************************
2 ;* pixel.asm: x86 pixel metrics
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2011 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Alex Izvorski <aizvorksi@gmail.com>
10 ;* Fiona Glaser <fiona@x264.com>
11 ;* Oskar Arvidsson <oskar@irock.se>
13 ;* This program is free software; you can redistribute it and/or modify
14 ;* it under the terms of the GNU General Public License as published by
15 ;* the Free Software Foundation; either version 2 of the License, or
16 ;* (at your option) any later version.
18 ;* This program is distributed in the hope that it will be useful,
19 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;* GNU General Public License for more details.
23 ;* You should have received a copy of the GNU General Public License
24 ;* along with this program; if not, write to the Free Software
25 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27 ;* This program is also available under a commercial proprietary license.
28 ;* For more information, contact us at licensing@x264.com.
29 ;*****************************************************************************
32 %include "x86util.asm"
35 mask_ff: times 16 db 0xff
38 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
39 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
40 pf_64: times 4 dd 64.0
41 pf_128: times 4 dd 128.0
43 ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64
44 ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63
46 ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
47 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
49 mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1
50 mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
51 mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1
52 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
55 mask_10: times 4 dw 0, -1
56 mask_1100: times 2 dd 0, -1
57 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
59 pd_f0: times 4 dd 0xffff0000
60 sq_0f: times 1 dq 0xffffffff
69 ;=============================================================================
71 ;=============================================================================
74 ;-----------------------------------------------------------------------------
75 ; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
76 ;-----------------------------------------------------------------------------
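; For reference, a rough C sketch of what the SSD functions compute (an
; illustrative sketch, not the canonical C; strides here are in pixels):
;
;   static int pixel_ssd_MxN( pixel *pix1, int stride1, pixel *pix2, int stride2 )
;   {
;       int ssd = 0;
;       for( int y = 0; y < N; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < M; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               ssd += d*d;
;           }
;       return ssd;
;   }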
78 cglobal pixel_ssd_%1x%2_%3, 4,5,6*(mmsize/16)
97 lea r0, [r0+r1*2*num_rows]
98 lea r2, [r2+r3*2*num_rows]
108 cglobal pixel_ssd_%1x%2_mmxext, 4,5
109 mov r4, %1*%2/mmsize/2
116 mova m5, [r0+mmsize*2]
117 mova m6, [r2+mmsize*2]
118 mova m7, [r0+mmsize*3]
121 mova m2, [r2+mmsize*3]
146 SSD_ONE 8, 16, mmxext
155 %endif ; HIGH_BIT_DEPTH
157 %ifndef HIGH_BIT_DEPTH
158 %macro SSD_LOAD_FULL 5
202 DEINTB %2, %1, %4, %3, 7
217 %macro SSD_LOAD_HALF 5
218 LOAD 1, 2, [t0+%1], [t0+%3], 1
219 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
220 LOAD 3, 4, [t0+%1], [t0+%3], %5
221 JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
247 %macro SSD_CORE_SSE2 7-8
249 DEINTB %6, %1, %7, %2, %5
253 DEINTB %6, %3, %7, %4, %5
264 %macro SSD_CORE_SSSE3 7-8
286 SSD_LOAD_%1 %2,%3,%4,%5,%6
287 SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
294 ;-----------------------------------------------------------------------------
295 ; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
296 ;-----------------------------------------------------------------------------
299 %assign function_align 8
301 %assign function_align 16
303 cglobal pixel_ssd_%1x%2_%3, 0,0,0
304 mov al, %1*%2/mmsize/2
307 jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop)
312 DECLARE_REG_TMP 0,1,2,3
318 DECLARE_REG_TMP 1,2,3,4
337 SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
339 SSD_ITER FULL, 0, 0, t1, t3, 2
341 SSD_ITER HALF, 0, 0, t1, t3, 2
360 SSD 16, 16, sse2slow, 8
361 SSD 8, 8, sse2slow, 8
362 SSD 16, 8, sse2slow, 8
363 SSD 8, 16, sse2slow, 8
364 SSD 8, 4, sse2slow, 8
365 %define SSD_CORE SSD_CORE_SSE2
366 %define JOIN JOIN_SSE2
372 %define SSD_CORE SSD_CORE_SSSE3
373 %define JOIN JOIN_SSSE3
382 %assign function_align 16
383 %endif ; !HIGH_BIT_DEPTH
385 ;-----------------------------------------------------------------------------
386 ; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2,
387 ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
389 ; The maximum width this function can handle without risk of overflow is given
390 ; in the following equation (where mmsize is in bits):
392 ; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
394 ; For 10-bit MMX this limit works out to 16416 pixels, and for XMM to 32832.
395 ; At sane distortion levels it will take much more than that though.
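; (Plugging in the formula: mmsize=64 gives 2 * 64/32 * (2^32-1) / 1023^2 ~= 16416;
;  mmsize=128 doubles that to ~32832.)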
396 ;-----------------------------------------------------------------------------
397 %ifdef HIGH_BIT_DEPTH
398 %macro SSD_NV12 1-2 0
399 cglobal pixel_ssd_nv12_core_%1, 6,7,7*(mmsize/16)
415 mova m1, [r0+r6+mmsize]
417 psubw m1, [r2+r6+mmsize]
419 pshufw m0, m0, 11011000b
420 pshufw m1, m1, 11011000b
422 pshuflw m0, m0, 11011000b
423 pshuflw m1, m1, 11011000b
424 pshufhw m0, m0, 11011000b
425 pshufhw m1, m1, 11011000b
433 %if mmsize==16 ; using HADDD would remove the mmsize/32 part from the
434 ; equation above, putting the width limit at 8208
445 %else ; unfortunately paddq is sse2
446 ; emulate 48 bit precision for mmxext instead
467 %else ; fixup for mmxext
468 SBUTTERFLY dq, 4, 5, 0
473 SBUTTERFLY dq, 0, 5, 4
481 %endif ; HIGH_BIT_DEPTH
483 %ifndef HIGH_BIT_DEPTH
484 ;-----------------------------------------------------------------------------
485 ; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2,
486 ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
488 ; This implementation can potentially overflow on image widths >= 11008 (or
489 ; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
490 ; 20). At sane distortion levels it will take much more than that though.
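; (Roughly: each plane gets a 32-bit accumulator, so trouble starts once
;  width/2 * 255^2 * height approaches 2^32; this back-of-the-envelope check
;  with height 12 reproduces the ~11008 figure above.)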
491 ;-----------------------------------------------------------------------------
492 %macro SSD_NV12 1-2 0
493 cglobal pixel_ssd_nv12_core_%1, 6,7
533 %endif ; !HIGH_BIT_DEPTH
540 ;=============================================================================
542 ;=============================================================================
546 pxor m6, m6 ; sum squared
547 %ifndef HIGH_BIT_DEPTH
553 %endif ; !HIGH_BIT_DEPTH
557 %ifdef HIGH_BIT_DEPTH
558 %if mmsize == 8 && %1*%2 == 256
563 %else ; !HIGH_BIT_DEPTH
565 %endif ; HIGH_BIT_DEPTH
594 %ifdef HIGH_BIT_DEPTH
598 mova m4, [r0+%1+mmsize]
599 %else ; !HIGH_BIT_DEPTH
606 %endif ; HIGH_BIT_DEPTH
612 %ifndef HIGH_BIT_DEPTH
615 %endif ; !HIGH_BIT_DEPTH
621 ;-----------------------------------------------------------------------------
622 ; int pixel_var_wxh( uint8_t *, int )
623 ;-----------------------------------------------------------------------------
625 cglobal pixel_var_16x16_mmxext, 2,3
628 VAR_2ROW 8*SIZEOF_PIXEL, 16
631 cglobal pixel_var_8x8_mmxext, 2,3
638 %ifdef HIGH_BIT_DEPTH
639 cglobal pixel_var_16x16_sse2, 2,3,8
645 cglobal pixel_var_8x8_sse2, 2,3,8
660 %endif ; HIGH_BIT_DEPTH
662 %ifndef HIGH_BIT_DEPTH
663 cglobal pixel_var_16x16_sse2, 2,3,8
676 cglobal pixel_var_8x8_sse2, 2,4,8
691 %endif ; !HIGH_BIT_DEPTH
701 sub eax, r1d ; sqr - (sum * sum >> shift)
705 ;-----------------------------------------------------------------------------
706 ; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
707 ;-----------------------------------------------------------------------------
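; For reference, a rough C sketch of what var2_8x8 computes (an illustrative
; sketch; cf. the "sqr - (sum * sum >> shift)" comment above):
;
;   static int pixel_var2_8x8( pixel *pix1, int stride1, pixel *pix2, int stride2, int *ssd )
;   {
;       int sum = 0, sqr = 0;
;       for( int y = 0; y < 8; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < 8; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               sum += d;
;               sqr += d*d;
;           }
;       *ssd = sqr;
;       return sqr - (sum * sum >> 6);   // 6 = log2(8*8)
;   }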
709 cglobal pixel_var2_8x8_mmxext, 5,6
714 %ifdef HIGH_BIT_DEPTH
718 psubw m1, [r2+mmsize]
719 %else ; !HIGH_BIT_DEPTH
730 %endif ; HIGH_BIT_DEPTH
745 cglobal pixel_var2_8x8_sse2, 5,6,8
749 %ifdef HIGH_BIT_DEPTH
754 %else ; !HIGH_BIT_DEPTH
760 %endif ; HIGH_BIT_DEPTH
769 lea r0, [r0+r1*2*SIZEOF_PIXEL]
770 lea r2, [r2+r3*2*SIZEOF_PIXEL]
776 %ifndef HIGH_BIT_DEPTH
777 cglobal pixel_var2_8x8_ssse3, 5,6,8
779 pxor m6, m6 ; sum squared
819 %endif ; !HIGH_BIT_DEPTH
821 ;=============================================================================
823 ;=============================================================================
825 %define TRANS TRANS_SSE2
829 ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
833 ; join 2x 32 bit and duplicate them
834 ; emulating shufps is faster on conroe
840 ; just use shufps on anything post conroe
851 %macro DIFF_UNPACK_SSE2 5
860 %macro DIFF_SUMSUB_SSSE3 5
861 HSUMSUB %1, %2, %3, %4, %5
866 %macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
872 %macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
879 %macro LOAD_DUP_4x8P_PENRYN 8
880 ; penryn and nehalem run punpcklqdq and movddup in different units
889 %macro LOAD_SUMSUB_8x2P 9
890 LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
891 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
894 %macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
895 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
896 LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
897 LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
904 %macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
910 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
913 %macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
916 DEINTB %1, %2, %3, %4, %5
919 SUMSUB_BA w, m%1, m%2, m%3
922 %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
923 ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
924 LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
925 LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
926 LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
927 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
930 ; in: r4=3*stride1, r5=3*stride2
931 ; in: %2 = horizontal offset
932 ; in: %3 = whether we need to increment pix1 and pix2
935 %macro SATD_4x4_MMX 3
937 %assign offset %2*SIZEOF_PIXEL
938 LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
939 LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
940 LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
941 LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
946 HADAMARD4_2D 4, 5, 6, 7, 3, %%n
951 %macro SATD_8x4_SSE 8-9
953 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
955 HADAMARD4_V m%2, m%3, m%4, m%5, m%6
956 ; doing the abs first is a slight advantage
957 ABS4 m%2, m%4, m%3, m%5, m%6, m%7
958 HADAMARD 1, max, %2, %4, %6, %7
968 HADAMARD 1, max, %3, %5, %6, %7
973 %macro SATD_START_MMX 0
975 lea r4, [3*r1] ; 3*stride1
976 lea r5, [3*r3] ; 3*stride2
979 %macro SATD_END_MMX 0
980 %ifdef HIGH_BIT_DEPTH
983 %else ; !HIGH_BIT_DEPTH
984 pshufw m1, m0, 01001110b
986 pshufw m1, m0, 10110001b
990 %endif ; HIGH_BIT_DEPTH
994 ; FIXME avoid the spilling of regs to hold 3*stride.
995 ; for small blocks on x86_32, modify pixel pointer instead.
997 ;-----------------------------------------------------------------------------
998 ; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
999 ;-----------------------------------------------------------------------------
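; satd = sum of absolute values of the 4x4 Hadamard-transformed difference
; between the two blocks, computed 4x4 (or 8x4) at a time and accumulated over
; the whole partition.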
1001 cglobal pixel_satd_16x4_internal_mmxext
1002 SATD_4x4_MMX m2, 0, 0
1003 SATD_4x4_MMX m1, 4, 0
1005 SATD_4x4_MMX m2, 8, 0
1007 SATD_4x4_MMX m1, 12, 0
1012 cglobal pixel_satd_8x8_internal_mmxext
1013 SATD_4x4_MMX m2, 0, 0
1014 SATD_4x4_MMX m1, 4, 1
1017 pixel_satd_8x4_internal_mmxext:
1018 SATD_4x4_MMX m2, 0, 0
1019 SATD_4x4_MMX m1, 4, 0
1024 %ifdef HIGH_BIT_DEPTH
1025 %macro SATD_MxN_MMX 3
1026 cglobal pixel_satd_%1x%2_mmxext, 4,7
1029 call pixel_satd_%1x%3_internal_mmxext
1036 call pixel_satd_%1x%3_internal_mmxext
1047 SATD_MxN_MMX 16, 16, 4
1048 SATD_MxN_MMX 16, 8, 4
1049 SATD_MxN_MMX 8, 16, 8
1050 %endif ; HIGH_BIT_DEPTH
1052 %ifndef HIGH_BIT_DEPTH
1053 cglobal pixel_satd_16x16_mmxext, 4,6
1057 call pixel_satd_16x4_internal_mmxext
1061 call pixel_satd_16x4_internal_mmxext
1066 cglobal pixel_satd_16x8_mmxext, 4,6
1069 call pixel_satd_16x4_internal_mmxext
1072 call pixel_satd_16x4_internal_mmxext
1075 cglobal pixel_satd_8x16_mmxext, 4,6
1078 call pixel_satd_8x8_internal_mmxext
1081 call pixel_satd_8x8_internal_mmxext
1083 %endif ; !HIGH_BIT_DEPTH
1085 cglobal pixel_satd_8x8_mmxext, 4,6
1088 call pixel_satd_8x8_internal_mmxext
1091 cglobal pixel_satd_8x4_mmxext, 4,6
1094 call pixel_satd_8x4_internal_mmxext
1097 cglobal pixel_satd_4x8_mmxext, 4,6
1099 SATD_4x4_MMX m0, 0, 1
1100 SATD_4x4_MMX m1, 0, 0
1104 cglobal pixel_satd_4x4_mmxext, 4,6
1106 SATD_4x4_MMX m0, 0, 0
1109 %macro SATD_START_SSE2 3
1118 %macro SATD_END_SSE2 2
1124 %macro BACKUP_POINTERS 0
1131 %macro RESTORE_AND_INC_POINTERS 0
1143 ;-----------------------------------------------------------------------------
1144 ; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
1145 ;-----------------------------------------------------------------------------
1149 cglobal pixel_satd_4x4_%1, 4, 6, 6
1152 LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
1153 LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
1154 LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
1155 LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
1156 DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
1157 HADAMARD 0, sumsub, 0, 1, 2, 3
1158 HADAMARD 4, sumsub, 0, 1, 2, 3
1159 HADAMARD 1, amax, 0, 1, 2, 3
1165 cglobal pixel_satd_4x8_%1, 4, 6, 8
1186 DIFFOP 0, 4, 1, 5, 7
1199 DIFFOP 2, 6, 3, 5, 7
1200 SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6, swap
1205 cglobal pixel_satd_8x8_internal_%1
1206 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
1207 SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
1208 pixel_satd_8x4_internal_%1:
1209 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
1210 SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
1213 %ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
1214 cglobal pixel_satd_16x4_internal_%1
1215 LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
1218 SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
1219 SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
1222 cglobal pixel_satd_16x8_%1, 4,6,12
1223 SATD_START_SSE2 %1, m10, m7
1227 jmp pixel_satd_16x8_internal_%1
1229 cglobal pixel_satd_16x16_%1, 4,6,12
1230 SATD_START_SSE2 %1, m10, m7
1234 call pixel_satd_16x4_internal_%1
1235 call pixel_satd_16x4_internal_%1
1236 pixel_satd_16x8_internal_%1:
1237 call pixel_satd_16x4_internal_%1
1238 call pixel_satd_16x4_internal_%1
1239 SATD_END_SSE2 %1, m10
1241 cglobal pixel_satd_16x8_%1, 4,6,8
1242 SATD_START_SSE2 %1, m6, m7
1244 call pixel_satd_8x8_internal_%1
1245 RESTORE_AND_INC_POINTERS
1246 call pixel_satd_8x8_internal_%1
1247 SATD_END_SSE2 %1, m6
1249 cglobal pixel_satd_16x16_%1, 4,6,8
1250 SATD_START_SSE2 %1, m6, m7
1252 call pixel_satd_8x8_internal_%1
1253 call pixel_satd_8x8_internal_%1
1254 RESTORE_AND_INC_POINTERS
1255 call pixel_satd_8x8_internal_%1
1256 call pixel_satd_8x8_internal_%1
1257 SATD_END_SSE2 %1, m6
1260 cglobal pixel_satd_8x16_%1, 4,6,8
1261 SATD_START_SSE2 %1, m6, m7
1262 call pixel_satd_8x8_internal_%1
1263 call pixel_satd_8x8_internal_%1
1264 SATD_END_SSE2 %1, m6
1266 cglobal pixel_satd_8x8_%1, 4,6,8
1267 SATD_START_SSE2 %1, m6, m7
1268 call pixel_satd_8x8_internal_%1
1269 SATD_END_SSE2 %1, m6
1271 cglobal pixel_satd_8x4_%1, 4,6,8
1272 SATD_START_SSE2 %1, m6, m7
1273 call pixel_satd_8x4_internal_%1
1274 SATD_END_SSE2 %1, m6
1275 %endmacro ; SATDS_SSE2
1285 %ifdef HIGH_BIT_DEPTH
1290 %endif ; HIGH_BIT_DEPTH
1294 %ifdef HIGH_BIT_DEPTH
1296 %elifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things
1303 ;-----------------------------------------------------------------------------
1304 ; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
1305 ;-----------------------------------------------------------------------------
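; sa8d is the same idea as satd, but with an 8x8 Hadamard transform in place of
; the 4x4 one.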
1306 cglobal pixel_sa8d_8x8_internal_%1
1309 LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
1310 LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
1312 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
1314 HADAMARD4_V m0, m1, m2, m8, m6
1315 HADAMARD4_V m4, m5, m3, m9, m6
1316 SUMSUB_BADC w, m0, m4, m1, m5, m6
1317 HADAMARD 2, sumsub, 0, 4, 6, 11
1318 HADAMARD 2, sumsub, 1, 5, 6, 11
1319 SUMSUB_BADC w, m2, m3, m8, m9, m6
1320 HADAMARD 2, sumsub, 2, 3, 6, 11
1321 HADAMARD 2, sumsub, 8, 9, 6, 11
1322 HADAMARD 1, amax, 0, 4, 6, 11
1323 HADAMARD 1, amax, 1, 5, 6, 4
1324 HADAMARD 1, amax, 2, 3, 6, 4
1325 HADAMARD 1, amax, 8, 9, 6, 4
1330 SAVE_MM_PERMUTATION pixel_sa8d_8x8_internal_%1
1333 cglobal pixel_sa8d_8x8_%1, 4,6,12
1340 call pixel_sa8d_8x8_internal_%1
1341 %ifdef HIGH_BIT_DEPTH
1345 %endif ; HIGH_BIT_DEPTH
1351 cglobal pixel_sa8d_16x16_%1, 4,6,12
1358 call pixel_sa8d_8x8_internal_%1 ; pix[0]
1359 add r2, 8*SIZEOF_PIXEL
1360 add r0, 8*SIZEOF_PIXEL
1361 %ifdef HIGH_BIT_DEPTH
1365 call pixel_sa8d_8x8_internal_%1 ; pix[8]
1369 call pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
1370 sub r2, 8*SIZEOF_PIXEL
1371 sub r0, 8*SIZEOF_PIXEL
1373 call pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
1376 %ifndef HIGH_BIT_DEPTH
1386 cglobal pixel_sa8d_8x8_internal_%1
1387 %define spill0 [esp+4]
1388 %define spill1 [esp+20]
1389 %define spill2 [esp+36]
1391 LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
1392 HADAMARD4_2D 0, 1, 2, 3, 4
1394 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
1395 HADAMARD4_2D 4, 5, 6, 7, 3
1396 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
1399 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
1402 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
1403 ; could do first HADAMARD4_V here to save spilling later
1404 ; surprisingly, not a win on conroe or even p4
1409 LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
1410 HADAMARD4_V m4, m5, m6, m7, m3
1416 HADAMARD4_V m0, m1, m2, m3, m7
1417 SUMSUB_BADC w, m0, m4, m1, m5, m7
1418 HADAMARD 2, sumsub, 0, 4, 7, 6
1419 HADAMARD 2, sumsub, 1, 5, 7, 6
1420 HADAMARD 1, amax, 0, 4, 7, 6
1421 HADAMARD 1, amax, 1, 5, 7, 6
1425 SUMSUB_BADC w, m2, m6, m3, m7, m4
1426 HADAMARD 2, sumsub, 2, 6, 4, 5
1427 HADAMARD 2, sumsub, 3, 7, 4, 5
1428 HADAMARD 1, amax, 2, 6, 4, 5
1429 HADAMARD 1, amax, 3, 7, 4, 5
1430 %endif ; sse2/non-sse2
1434 %endif ; ifndef mmxext
1436 cglobal pixel_sa8d_8x8_%1, 4,7
1443 call pixel_sa8d_8x8_internal_%1
1444 %ifdef HIGH_BIT_DEPTH
1448 %endif ; HIGH_BIT_DEPTH
1455 cglobal pixel_sa8d_16x16_%1, 4,7
1462 call pixel_sa8d_8x8_internal_%1
1467 %ifdef HIGH_BIT_DEPTH
1471 call pixel_sa8d_8x8_internal_%1
1474 add r0, 8*SIZEOF_PIXEL
1475 add r2, 8*SIZEOF_PIXEL
1478 call pixel_sa8d_8x8_internal_%1
1486 mova [esp+64-mmsize], m0
1487 call pixel_sa8d_8x8_internal_%1
1488 %ifdef HIGH_BIT_DEPTH
1490 %else ; !HIGH_BIT_DEPTH
1491 paddusw m0, [esp+64-mmsize]
1508 %endif ; HIGH_BIT_DEPTH
1514 %endif ; !ARCH_X86_64
1517 ;=============================================================================
1519 ;=============================================================================
1521 %macro INTRA_SA8D_SSE2 1
1524 ;-----------------------------------------------------------------------------
1525 ; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
1526 ;-----------------------------------------------------------------------------
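; The _x3 functions score three intra prediction modes at once; here the V, H
; and DC 8x8 modes, writing the three sa8d-style costs to res[] (see the stores
; near the end).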
1527 cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
1530 movq m0, [r0+0*FENC_STRIDE]
1531 movq m1, [r0+1*FENC_STRIDE]
1532 movq m2, [r0+2*FENC_STRIDE]
1533 movq m3, [r0+3*FENC_STRIDE]
1534 movq m4, [r0+4*FENC_STRIDE]
1535 movq m5, [r0+5*FENC_STRIDE]
1536 movq m6, [r0+6*FENC_STRIDE]
1537 movq m7, [r0+7*FENC_STRIDE]
1547 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
1550 movzx r0d, word [r1+0]
1551 add r0w, word [r1+16]
1561 ABS4 m8, m9, m10, m11, m12, m13
1572 ABS2 m10, m11, m13, m14
1579 movdqa m14, m15 ; 7x8 sum
1581 movdqa m8, [r1+0] ; left edge
1587 ABS1 m9, m11 ; 1x8 sum
1596 punpcklqdq m0, m4 ; transpose
1597 movdqa m1, [r1+16] ; top edge
1600 psrldq m2, 2 ; 8x7 sum
1601 psubw m0, m1 ; 8x1 sum
1613 pshufd m5, m15, 0xf5
1622 movq [r2], m3 ; i8x8_v, i8x8_h
1624 movd [r2+8], m3 ; i8x8_dc
1626 %endif ; ARCH_X86_64
1627 %endmacro ; INTRA_SA8D_SSE2
1630 ; out: m0..m3 = hadamard coefs
1632 cglobal hadamard_load
1633 ; not really a global, but otherwise cycles get attributed to the wrong function in profiling
1635 movd m0, [r0+0*FENC_STRIDE]
1636 movd m1, [r0+1*FENC_STRIDE]
1637 movd m2, [r0+2*FENC_STRIDE]
1638 movd m3, [r0+3*FENC_STRIDE]
1643 HADAMARD4_2D 0, 1, 2, 3, 4
1644 SAVE_MM_PERMUTATION hadamard_load
1647 %macro SCALAR_SUMSUB 4
1656 %macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
1658 shl %1d, 5 ; log2(FDEC_STRIDE)
1660 movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
1661 movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
1662 movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
1663 movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
1667 SCALAR_SUMSUB %2d, %3d, %4d, %5d
1668 SCALAR_SUMSUB %2d, %4d, %3d, %5d
1669 mov [left_1d+2*%1+0], %2w
1670 mov [left_1d+2*%1+2], %3w
1671 mov [left_1d+2*%1+4], %4w
1672 mov [left_1d+2*%1+6], %5w
1675 %macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
1676 movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
1677 movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
1678 movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
1679 movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
1680 SCALAR_SUMSUB %2d, %3d, %4d, %5d
1681 SCALAR_SUMSUB %2d, %4d, %3d, %5d
1682 mov [top_1d+2*%1+0], %2w
1683 mov [top_1d+2*%1+2], %3w
1684 mov [top_1d+2*%1+4], %4w
1685 mov [top_1d+2*%1+6], %5w
1688 %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
1690 pshufw %4, %1, 01001110b
1691 pshufw %5, %2, 01001110b
1692 pshufw %6, %3, 01001110b
1699 pshufw %4, %1, 01001110b
1700 pshufw %5, %2, 01001110b
1701 pshufw %6, %3, 01001110b
1709 mov qword [sums+0], 0
1710 mov qword [sums+8], 0
1711 mov qword [sums+16], 0
1740 ; in: m0..m3 (4x4), m7 (3x4)
1741 ; out: m0 v, m4 h, m5 dc
1743 %macro SUM4x3 3 ; dc, left, top
1751 punpckldq m0, m2 ; transpose
1755 ABS2 m4, m5, m2, m3 ; 1x4 sum
1756 ABS1 m0, m1 ; 4x1 sum
1759 %macro INTRA_SATDS_MMX 1
1761 ;-----------------------------------------------------------------------------
1762 ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
1763 ;-----------------------------------------------------------------------------
1764 cglobal intra_satd_x3_4x4_%1, 2,6
1766 ; stack is 16-byte aligned because the ABI says so
1767 %define top_1d rsp-8 ; size 8
1768 %define left_1d rsp-16 ; size 8
1771 ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
1773 %define top_1d esp+8
1779 SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
1781 SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
1782 lea t0d, [t0d + r0d + 4]
1787 SUM4x3 t0d, [left_1d], [top_1d]
1791 psrlq m1, 16 ; 4x3 sum
1794 SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
1798 movd [r2+0], m0 ; i4x4_v satd
1799 movd [r2+4], m4 ; i4x4_h satd
1800 movd [r2+8], m5 ; i4x4_dc satd
1814 ;-----------------------------------------------------------------------------
1815 ; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
1816 ;-----------------------------------------------------------------------------
1817 cglobal intra_satd_x3_16x16_%1, 0,7
1819 %assign stack_pad 88
1821 %assign stack_pad 88 + ((stack_offset+88+4)&15)
1823 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
1825 %define sums rsp+64 ; size 24
1826 %define top_1d rsp+32 ; size 32
1827 %define left_1d rsp ; size 32
1835 SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
1837 SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6
1854 SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
1857 paddw m0, [sums+0] ; i16x16_v satd
1858 paddw m4, [sums+8] ; i16x16_h satd
1859 paddw m5, [sums+16] ; i16x16_dc satd
1868 add r0, 4*FENC_STRIDE-16
1879 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
1885 movd [r2+8], m2 ; i16x16_dc satd
1886 movd [r2+4], m1 ; i16x16_h satd
1887 movd [r2+0], m0 ; i16x16_v satd
1891 ;-----------------------------------------------------------------------------
1892 ; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
1893 ;-----------------------------------------------------------------------------
1894 cglobal intra_satd_x3_8x8c_%1, 0,6
1895 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
1897 %define sums rsp+48 ; size 24
1898 %define dc_1d rsp+32 ; size 16
1899 %define top_1d rsp+16 ; size 16
1900 %define left_1d rsp ; size 16
1907 SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
1908 SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5
1913 movzx t2d, word [left_1d+0]
1914 movzx r3d, word [top_1d+0]
1915 movzx r4d, word [left_1d+8]
1916 movzx r5d, word [top_1d+8]
1927 mov [dc_1d+ 0], t2d ; tl
1928 mov [dc_1d+ 4], r5d ; tr
1929 mov [dc_1d+ 8], r4d ; bl
1930 mov [dc_1d+12], r3d ; br
1943 SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
1946 paddw m0, [sums+16] ; i4x4_v satd
1947 paddw m4, [sums+8] ; i4x4_h satd
1948 paddw m5, [sums+0] ; i4x4_dc satd
1957 add r0, 4*FENC_STRIDE-8
1970 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
1972 movd [r2+0], m0 ; i8x8c_dc satd
1973 movd [r2+4], m1 ; i8x8c_h satd
1974 movd [r2+8], m2 ; i8x8c_v satd
1977 %endmacro ; INTRA_SATDS_MMX
1980 %macro ABS_MOV_SSSE3 2
1984 %macro ABS_MOV_MMX 2
1990 %define ABS_MOV ABS_MOV_MMX
1992 ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
1993 ; out: [tmp]=hadamard4, m0=satd
1994 cglobal hadamard_ac_4x4_mmxext
1995 %ifdef HIGH_BIT_DEPTH
2000 %else ; !HIGH_BIT_DEPTH
2009 %endif ; HIGH_BIT_DEPTH
2010 HADAMARD4_2D 0, 1, 2, 3, 4
2023 SAVE_MM_PERMUTATION hadamard_ac_4x4_mmxext
2026 cglobal hadamard_ac_2x2max_mmxext
2032 SUMSUB_BADC w, m0, m1, m2, m3, m4
2033 ABS4 m0, m2, m1, m3, m4, m5
2034 HADAMARD 0, max, 0, 2, 4, 5
2035 HADAMARD 0, max, 1, 3, 4, 5
2036 %ifdef HIGH_BIT_DEPTH
2041 %else ; !HIGH_BIT_DEPTH
2044 %endif ; HIGH_BIT_DEPTH
2045 SAVE_MM_PERMUTATION hadamard_ac_2x2max_mmxext
2049 %ifdef HIGH_BIT_DEPTH
2055 %ifdef HIGH_BIT_DEPTH
2060 %endif ; HIGH_BIT_DEPTH
2063 cglobal hadamard_ac_8x8_mmxext
2065 %ifdef HIGH_BIT_DEPTH
2069 %endif ; HIGH_BIT_DEPTH
2070 call hadamard_ac_4x4_mmxext
2071 add r0, 4*SIZEOF_PIXEL
2075 call hadamard_ac_4x4_mmxext
2079 call hadamard_ac_4x4_mmxext
2080 sub r0, 4*SIZEOF_PIXEL
2083 call hadamard_ac_4x4_mmxext
2086 mova [rsp+gprsize+8], m5 ; save satd
2087 %ifdef HIGH_BIT_DEPTH
2091 call hadamard_ac_2x2max_mmxext
2097 SUMSUB_BADC w, m0, m1, m2, m3, m4
2098 HADAMARD 0, sumsub, 0, 2, 4, 5
2099 ABS4 m1, m3, m0, m2, m4, m5
2100 HADAMARD 0, max, 1, 3, 4, 5
2101 %ifdef HIGH_BIT_DEPTH
2111 %else ; !HIGH_BIT_DEPTH
2117 %endif ; HIGH_BIT_DEPTH
2118 mova [rsp+gprsize], m6 ; save sa8d
2120 SAVE_MM_PERMUTATION hadamard_ac_8x8_mmxext
2123 %macro HADAMARD_AC_WXH_SUM_MMXEXT 2
2124 mova m1, [rsp+1*mmsize]
2125 %ifdef HIGH_BIT_DEPTH
2127 paddd m0, [rsp+2*mmsize]
2128 paddd m1, [rsp+3*mmsize]
2131 mova m2, [rsp+4*mmsize]
2132 paddd m1, [rsp+5*mmsize]
2133 paddd m2, [rsp+6*mmsize]
2135 paddd m1, [rsp+7*mmsize]
2142 %else ; !HIGH_BIT_DEPTH
2144 paddusw m0, [rsp+2*mmsize]
2145 paddusw m1, [rsp+3*mmsize]
2148 mova m2, [rsp+4*mmsize]
2149 paddusw m1, [rsp+5*mmsize]
2150 paddusw m2, [rsp+6*mmsize]
2152 paddusw m1, [rsp+7*mmsize]
2164 %endif ; HIGH_BIT_DEPTH
2167 %macro HADAMARD_AC_WXH_MMX 2
2168 cglobal pixel_hadamard_ac_%1x%2_mmxext, 2,4
2169 %assign pad 16-gprsize-(stack_offset&15)
2175 call hadamard_ac_8x8_mmxext
2180 call hadamard_ac_8x8_mmxext
2185 lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
2187 call hadamard_ac_8x8_mmxext
2191 call hadamard_ac_8x8_mmxext
2194 HADAMARD_AC_WXH_SUM_MMXEXT %1, %2
2202 add rsp, 128+%1*%2/4+pad
2204 %endmacro ; HADAMARD_AC_WXH_MMX
2206 HADAMARD_AC_WXH_MMX 16, 16
2207 HADAMARD_AC_WXH_MMX 8, 16
2208 HADAMARD_AC_WXH_MMX 16, 8
2209 HADAMARD_AC_WXH_MMX 8, 8
2211 %macro LOAD_INC_8x4W_SSE2 5
2212 %ifdef HIGH_BIT_DEPTH
2220 %else ; !HIGH_BIT_DEPTH
2232 %endif ; HIGH_BIT_DEPTH
2235 %macro LOAD_INC_8x4W_SSSE3 5
2236 LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
2240 HSUMSUB %1, %2, %3, %4, %5
2243 %macro HADAMARD_AC_SSE2 1
2245 ; in: r0=pix, r1=stride, r2=stride*3
2246 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
2247 cglobal hadamard_ac_8x8_%1
2253 %define spill0 [rsp+gprsize]
2254 %define spill1 [rsp+gprsize+16]
2255 %define spill2 [rsp+gprsize+32]
2257 %ifdef HIGH_BIT_DEPTH
2261 ;LOAD_INC only unpacks to words
2265 ;LOAD_INC loads sumsubs
2268 LOAD_INC_8x4W 0, 1, 2, 3, 7
2270 HADAMARD4_2D_SSE 0, 1, 2, 3, 4
2272 HADAMARD4_V m0, m1, m2, m3, m4
2276 LOAD_INC_8x4W 4, 5, 6, 7, 1
2278 HADAMARD4_2D_SSE 4, 5, 6, 7, 1
2280 HADAMARD4_V m4, m5, m6, m7, m1
2284 HADAMARD 1, sumsub, 0, 1, 6, 7
2285 HADAMARD 1, sumsub, 2, 3, 6, 7
2290 HADAMARD 1, sumsub, 4, 5, 1, 0
2291 HADAMARD 1, sumsub, 6, 7, 1, 0
2304 pand m1, [mask_ac4b]
2308 AC_PADD m1, m3, [pw_1]
2310 AC_PADD m1, m2, [pw_1]
2312 AC_PADD m1, m3, [pw_1]
2314 AC_PADD m1, m2, [pw_1]
2316 AC_PADD m1, m3, [pw_1]
2318 AC_PADD m1, m2, [pw_1]
2322 mova [rsp+gprsize+32], m1 ; save satd
2333 HADAMARD %%x, amax, 3, 7, 4
2334 HADAMARD %%x, amax, 2, 6, 7, 4
2336 HADAMARD %%x, amax, 1, 5, 6, 7
2337 HADAMARD %%x, sumsub, 0, 4, 5, 6
2339 AC_PADD m2, m3, [pw_1]
2340 AC_PADD m2, m1, [pw_1]
2341 %ifdef HIGH_BIT_DEPTH
2345 %endif ; HIGH_BIT_DEPTH
2349 AC_PADD m2, m4, [pw_1]
2350 AC_PADD m2, m0, [pw_1]
2351 mova [rsp+gprsize+16], m2 ; save sa8d
2353 SAVE_MM_PERMUTATION hadamard_ac_8x8_%1
2356 HADAMARD_AC_WXH_SSE2 16, 16, %1
2357 HADAMARD_AC_WXH_SSE2 8, 16, %1
2358 HADAMARD_AC_WXH_SSE2 16, 8, %1
2359 HADAMARD_AC_WXH_SSE2 8, 8, %1
2360 %endmacro ; HADAMARD_AC_SSE2
2362 %macro HADAMARD_AC_WXH_SUM_SSE2 2
2363 mova m1, [rsp+2*mmsize]
2364 %ifdef HIGH_BIT_DEPTH
2366 paddd m0, [rsp+3*mmsize]
2367 paddd m1, [rsp+4*mmsize]
2370 paddd m0, [rsp+5*mmsize]
2371 paddd m1, [rsp+6*mmsize]
2372 paddd m0, [rsp+7*mmsize]
2373 paddd m1, [rsp+8*mmsize]
2378 %else ; !HIGH_BIT_DEPTH
2380 paddusw m0, [rsp+3*mmsize]
2381 paddusw m1, [rsp+4*mmsize]
2384 paddusw m0, [rsp+5*mmsize]
2385 paddusw m1, [rsp+6*mmsize]
2386 paddusw m0, [rsp+7*mmsize]
2387 paddusw m1, [rsp+8*mmsize]
2392 %endif ; HIGH_BIT_DEPTH
2395 ; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
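; Broadly: hadamard_ac sums the absolute Hadamard coefficients of the source
; block itself (no prediction), with the DC terms masked out (mask_ac4/mask_ac8
; above), and returns the 4x4-transform (satd) and 8x8-transform (sa8d) sums
; packed as described by the struct comment above.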
2396 %macro HADAMARD_AC_WXH_SSE2 3
2397 cglobal pixel_hadamard_ac_%1x%2_%3, 2,3,11
2398 %assign pad 16-gprsize-(stack_offset&15)
2403 call hadamard_ac_8x8_%3
2408 call hadamard_ac_8x8_%3
2413 lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
2415 call hadamard_ac_8x8_%3
2419 call hadamard_ac_8x8_%3
2422 HADAMARD_AC_WXH_SUM_SSE2 %1, %2
2425 shr edx, 2 - (%1*%2 >> 8)
2431 add rsp, 16+%1*%2/2+pad
2433 %endmacro ; HADAMARD_AC_WXH_SSE2
2438 cextern pixel_sa8d_8x8_internal_mmxext
2442 %define TRANS TRANS_SSE2
2443 %define ABS1 ABS1_MMX
2444 %define ABS2 ABS2_MMX
2445 %define DIFFOP DIFF_UNPACK_SSE2
2446 %define JDUP JDUP_SSE2
2447 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
2448 %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
2449 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
2450 %define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
2451 %define movdqu movups
2452 %define punpcklqdq movlhps
2456 INTRA_SA8D_SSE2 sse2
2457 %ifndef HIGH_BIT_DEPTH
2458 INTRA_SATDS_MMX mmxext
2460 HADAMARD_AC_SSE2 sse2
2462 %define ABS1 ABS1_SSSE3
2463 %define ABS2 ABS2_SSSE3
2464 %define ABS_MOV ABS_MOV_SSSE3
2465 %define DIFFOP DIFF_SUMSUB_SSSE3
2466 %define JDUP JDUP_CONROE
2467 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
2468 %ifndef HIGH_BIT_DEPTH
2469 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
2470 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
2471 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
2475 HADAMARD_AC_SSE2 ssse3
2476 %undef movdqa ; nehalem doesn't like movaps
2477 %undef movdqu ; movups
2478 %undef punpcklqdq ; or movlhps
2479 INTRA_SA8D_SSE2 ssse3
2480 INTRA_SATDS_MMX ssse3
2482 %define TRANS TRANS_SSE4
2483 %define JDUP JDUP_PENRYN
2484 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
2487 HADAMARD_AC_SSE2 sse4
2489 ;=============================================================================
2491 ;=============================================================================
2493 ;-----------------------------------------------------------------------------
2494 ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
2495 ; const uint8_t *pix2, int stride2, int sums[2][4] )
2496 ;-----------------------------------------------------------------------------
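; This accumulates the per-block sums consumed by ssim_end below, for two
; horizontally adjacent 4x4 blocks at a time (hence "x2" and sums[2][4]):
; s1 = sum of pix1, s2 = sum of pix2, ss = sum of squares of both, and
; s12 = sum of pix1*pix2 (cf. the "s1=m0, s2=m1, ss=m2, s12=m3" note below).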
2499 %ifdef HIGH_BIT_DEPTH
2500 movdqu m5, [r0+(%1&1)*r1]
2501 movdqu m6, [r2+(%1&1)*r3]
2503 movq m5, [r0+(%1&1)*r1]
2504 movq m6, [r2+(%1&1)*r3]
2533 cglobal pixel_ssim_4x4x2_core_sse2, 4,4,8
2569 ;-----------------------------------------------------------------------------
2570 ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
2571 ;-----------------------------------------------------------------------------
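; Roughly, following the instruction comments below, each block contributes
;   vars    = ss*64   - s1*s1 - s2*s2
;   covar*2 = s12*128 - 2*s1*s2
;   ssim   += (2*s1*s2 + ssim_c1) * (covar*2 + ssim_c2)
;           / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
; and the accumulated total is returned as a float.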
2572 cglobal pixel_ssim_end4_sse2, 3,3,7
2587 movdqa m5, [ssim_c1]
2588 movdqa m6, [ssim_c2]
2589 TRANSPOSE4x4D 0, 1, 2, 3, 4
2591 ; s1=m0, s2=m1, ss=m2, s12=m3
2597 mulps m2, [pf_64] ; ss*64
2598 mulps m3, [pf_128] ; s12*128
2600 mulps m4, m0 ; s1*s2
2601 mulps m1, m1 ; s2*s2
2602 mulps m0, m0 ; s1*s1
2603 addps m4, m4 ; s1*s2*2
2604 addps m0, m1 ; s1*s1 + s2*s2
2606 subps m3, m4 ; covar*2
2607 addps m4, m5 ; s1*s2*2 + ssim_c1
2608 addps m0, m5 ; s1*s1 + s2*s2 + ssim_c1
2609 addps m2, m6 ; vars + ssim_c2
2610 addps m3, m6 ; covar*2 + ssim_c2
2614 pmaddwd m4, m0 ; s1*s2
2616 pmaddwd m0, m0 ; s1*s1 + s2*s2
2620 psubd m3, m4 ; covar*2
2626 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
2627 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
2628 cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
2629 cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
2636 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
2639 lea r3, [mask_ff + 16]
2640 movdqu m1, [r3 + r2*4]
2642 movdqu m1, [mask_ff + r2*4 + 16]
2658 ;=============================================================================
2659 ; Successive Elimination ADS
2660 ;=============================================================================
2672 %macro ADS_END 1 ; unroll_size
2678 WIN64_RESTORE_XMM rsp
2682 %define ABS1 ABS1_MMX
2684 ;-----------------------------------------------------------------------------
2685 ; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
2686 ; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
2687 ;-----------------------------------------------------------------------------
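; Roughly: for each candidate position this builds a cheap lower bound on the
; SAD from |enc_dc[k] - sums[...]| over the (4, 2 or 1) precomputed sub-block
; sums, adds cost_mvx[i], and keeps only candidates whose bound is below
; thresh. The asm writes one mask byte per position; pixel_ads_mvs (see the C
; pseudocode further below) compacts the surviving positions into mvs and
; returns their count.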
2688 cglobal pixel_ads4_mmxext, 6,7
2692 pshufw mm6, mm6, 0xAA
2694 pshufw mm4, mm4, 0xAA
2704 movq mm3, [r1+r2+16]
2719 cglobal pixel_ads2_mmxext, 6,7
2723 pshufw mm6, mm6, 0xAA
2740 cglobal pixel_ads1_mmxext, 6,7
2762 cglobal pixel_ads4_%1, 6,7,12
2764 pshuflw xmm7, xmm4, 0
2765 pshuflw xmm6, xmm4, 0xAA
2766 pshufhw xmm5, xmm4, 0
2767 pshufhw xmm4, xmm4, 0xAA
2768 punpcklqdq xmm7, xmm7
2769 punpcklqdq xmm6, xmm6
2770 punpckhqdq xmm5, xmm5
2771 punpckhqdq xmm4, xmm4
2773 pshuflw xmm8, r6m, 0
2774 punpcklqdq xmm8, xmm8
2777 movdqu xmm11, [r1+r2]
2780 movdqu xmm1, [r1+16]
2787 movdqu xmm3, [r1+r2+16]
2806 movdqu xmm1, [r1+16]
2811 movdqu xmm2, [r1+r2]
2812 movdqu xmm3, [r1+r2+16]
2822 pshuflw xmm1, xmm1, 0
2823 punpcklqdq xmm1, xmm1
2831 cglobal pixel_ads2_%1, 6,7,8
2834 pshuflw xmm7, xmm6, 0
2835 pshuflw xmm6, xmm6, 0xAA
2836 pshuflw xmm5, xmm5, 0
2837 punpcklqdq xmm7, xmm7
2838 punpcklqdq xmm6, xmm6
2839 punpcklqdq xmm5, xmm5
2843 movdqu xmm1, [r1+r2]
2857 cglobal pixel_ads1_%1, 6,7,8
2860 pshuflw xmm7, xmm7, 0
2861 pshuflw xmm6, xmm6, 0
2862 punpcklqdq xmm7, xmm7
2863 punpcklqdq xmm6, xmm6
2867 movdqu xmm1, [r1+16]
2871 movdqu xmm3, [r3+16]
2886 %define ABS1 ABS1_SSSE3
2889 ; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
2892 ; *(uint32_t*)(masks+width) = 0;
2893 ; for( i=0; i<width; i+=8 )
2895 ; uint64_t mask = *(uint64_t*)(masks+i);
2896 ; if( !mask ) continue;
2897 ; for( j=0; j<8; j++ )
2898 ; if( mask & (255<<j*8) )
2906 test r2d, 0xff<<(%1*8)
2912 cglobal pixel_ads_mvs, 0,7,0
2919 ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)