1 ;*****************************************************************************
2 ;* pixel.asm: x86 pixel metrics
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2013 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Alex Izvorski <aizvorksi@gmail.com>
10 ;* Fiona Glaser <fiona@x264.com>
11 ;* Oskar Arvidsson <oskar@irock.se>
13 ;* This program is free software; you can redistribute it and/or modify
14 ;* it under the terms of the GNU General Public License as published by
15 ;* the Free Software Foundation; either version 2 of the License, or
16 ;* (at your option) any later version.
18 ;* This program is distributed in the hope that it will be useful,
19 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;* GNU General Public License for more details.
23 ;* You should have received a copy of the GNU General Public License
24 ;* along with this program; if not, write to the Free Software
25 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27 ;* This program is also available under a commercial proprietary license.
28 ;* For more information, contact us at licensing@x264.com.
29 ;*****************************************************************************
32 %include "x86util.asm"
35 hmul_16p: times 16 db 1
41 mask_ff: times 16 db 0xff
43 mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
44 mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
45 mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
47 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
48 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
49 pf_64: times 4 dd 64.0
50 pf_128: times 4 dd 128.0
52 ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64
53 ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63
55 ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
56 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
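; The ssim_c1/ssim_c2 pairs above are the standard SSIM stabilizing constants
; C1 = (K1*L)^2 and C2 = (K2*L)^2 with K1 = .01, K2 = .03 and L = 2^BIT_DEPTH-1,
; pre-multiplied by the fixed-point scaling factors noted in the comments
; (64, and 64*63 for C2). Worked example for 8-bit:
;   .01*.01*255*255*64    = 416.16    -> 416
;   .03*.03*255*255*64*63 = 235962.72 -> 235963
; The 10-bit pair stays in floating point, matching the float constants
; pf_64/pf_128 used by that path.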
58 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
59 mask_10: times 4 dw 0, -1
60 mask_1100: times 2 dd 0, -1
61 pb_pppm: times 4 db 1,1,1,-1
62 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
63 intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0
65 intrax9a_ddlr1: db 6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6
66 intrax9a_ddlr2: db 8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4
67 intrax9a_hdu1: db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0
68 intrax9a_hdu2: db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11
69 intrax9a_vrl1: db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8
70 intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
71 intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
72 intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
73 intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
74 intrax9a_lut: db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0
75 pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
76 pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
77 intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15
79 intrax9b_ddlr1: db 6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6
80 intrax9b_ddlr2: db 8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4
81 intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
82 intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
83 intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
84 intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
85 intrax9b_vh1: db 6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3
86 intrax9b_vh2: db 6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1
87 intrax9b_edge2: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
88 intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
89 intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
90 intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0
93 intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
94 intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
95 intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
96 intra8x9_h4: db 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0
97 intra8x9_ddl1: db 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10
98 intra8x9_ddl2: db 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11
99 intra8x9_ddl3: db 5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14
100 intra8x9_ddl4: db 6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15
101 intra8x9_vl1: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
102 intra8x9_vl2: db 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9
103 intra8x9_vl3: db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10
104 intra8x9_vl4: db 3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11
105 intra8x9_ddr1: db 8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13
106 intra8x9_ddr2: db 7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12
107 intra8x9_ddr3: db 4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9
108 intra8x9_ddr4: db 3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8
109 intra8x9_vr1: db 8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14
110 intra8x9_vr2: db 8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14
111 intra8x9_vr3: db 5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12
112 intra8x9_vr4: db 4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12
113 intra8x9_hd1: db 3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10
114 intra8x9_hd2: db 2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8
115 intra8x9_hd3: db 7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10
116 intra8x9_hd4: db 5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8
117 intra8x9_hu1: db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2
118 intra8x9_hu2: db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0
119 intra8x9_hu3: db 5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15
120 intra8x9_hu4: db 3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15
121 pw_s00112233: dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003
122 pw_s00001111: dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001
124 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
125 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
128 pd_f0: times 4 dd 0xffff0000
130 pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
133 %macro ADS_MVS_SHUFFLE 8
138 %assign y y>>((~y)&1)
147 ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7
168 ;=============================================================================
170 ;=============================================================================
173 ;-----------------------------------------------------------------------------
174 ; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
175 ;-----------------------------------------------------------------------------
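;
; This is a plain sum of squared differences over a WxH block. A minimal scalar
; sketch for reference (hypothetical helper, not part of the build; pixel type
; as in the prototype above):
;
;   #include <stdint.h>
;   static int ssd_wxh( const uint16_t *pix1, intptr_t stride1,
;                       const uint16_t *pix2, intptr_t stride2, int w, int h )
;   {
;       int ssd = 0;
;       for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < w; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               ssd += d * d;
;           }
;       return ssd;
;   }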
177 cglobal pixel_ssd_%1x%2, 4,7,6
181 %define offset0_2 r1*2
184 %define offset1_2 r3*2
189 %define offset0_1 mmsize
191 %define offset0_3 r1+mmsize
192 %define offset1_1 mmsize
194 %define offset1_3 r3+mmsize
196 %define offset0_1 mmsize
197 %define offset0_2 mmsize*2
198 %define offset0_3 mmsize*3
199 %define offset1_1 mmsize
200 %define offset1_2 mmsize*2
201 %define offset1_3 mmsize*3
203 %assign %%n %2/(2*mmsize/%1)
210 mova m2, [r0+offset0_1]
211 mova m3, [r0+offset0_2]
212 mova m4, [r0+offset0_3]
214 psubw m2, [r2+offset1_1]
215 psubw m3, [r2+offset1_2]
216 psubw m4, [r2+offset1_3]
218 lea r0, [r0+r1*(%2/%%n)]
219 lea r2, [r2+r3*(%2/%%n)]
256 %endif ; HIGH_BIT_DEPTH
258 %if HIGH_BIT_DEPTH == 0
259 %macro SSD_LOAD_FULL 5
303 DEINTB %2, %1, %4, %3, 7
320 vinserti128 m%1, m%1, %4, 1
328 vinserti128 m%2, m%2, %6, 1
332 SBUTTERFLY bw, %1, %2, %3
335 %macro SSD_LOAD_HALF 5
336 LOAD 1, 2, [t0+%1], [t0+%3], 1
337 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
338 LOAD 3, 4, [t0+%1], [t0+%3], %5
339 JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
352 punpcklbw m%2, m%1, m%5
354 punpcklbw m%4, m%3, m%5
363 %macro SSD_CORE_SSE2 7-8
365 DEINTB %6, %1, %7, %2, %5
369 DEINTB %6, %3, %7, %4, %5
380 %macro SSD_CORE_SSSE3 7-8
382 punpckhbw m%6, m%1, m%2
383 punpckhbw m%7, m%3, m%4
400 SSD_LOAD_%1 %2,%3,%4,%5,%6
401 SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
408 ;-----------------------------------------------------------------------------
409 ; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
410 ;-----------------------------------------------------------------------------
413 %assign function_align 8
415 %assign function_align 16
417 cglobal pixel_ssd_%1x%2, 0,0,0
418 mov al, %1*%2/mmsize/2
421 jmp mangle(x264_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
426 DECLARE_REG_TMP 0,1,2,3
430 DECLARE_REG_TMP 1,2,3,4
439 %elifidn cpuname, sse2
449 SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
451 SSD_ITER FULL, 0, 0, t1, t3, 2
453 SSD_ITER HALF, 0, 0, t1, t3, 2
458 vextracti128 xm1, m0, 1
486 %define SSD_CORE SSD_CORE_SSE2
487 %define JOIN JOIN_SSE2
494 %define SSD_CORE SSD_CORE_SSSE3
495 %define JOIN JOIN_SSSE3
517 %define LOAD LOAD_AVX2
518 %define JOIN JOIN_AVX2
522 %assign function_align 16
523 %endif ; !HIGH_BIT_DEPTH
525 ;-----------------------------------------------------------------------------
526 ; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
527 ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
529 ; The maximum width this function can handle without risk of overflow is given
530 ; in the following equation: (mmsize in bits)
532 ; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
534 ; For 10-bit, overflow first becomes possible at widths >= 16416 with MMX and
535 ; >= 32832 with XMM. At sane distortion levels it will take much more than that though.
536 ;-----------------------------------------------------------------------------
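;
; Plugging the 10-bit numbers into the equation above: MMX (mmsize = 64 bits)
; gives 2*64/32 * (2^32-1) / 1023^2 = 4*4294967295/1046529 ~= 16416, and XMM
; (mmsize = 128 bits) gives 8*4294967295/1046529 ~= 32832.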
539 cglobal pixel_ssd_nv12_core, 6,7,7
555 mova m1, [r0+r6+mmsize]
557 psubw m1, [r2+r6+mmsize]
558 PSHUFLW m0, m0, q3120
559 PSHUFLW m1, m1, q3120
561 pshufhw m0, m0, q3120
562 pshufhw m1, m1, q3120
570 %if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
575 %if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
576 ; equation above, putting the width limit at 8208
585 %else ; unfortunately paddq is sse2
586 ; emulate 48 bit precision for mmx2 instead
605 vextracti128 xm0, m4, 1
611 %else ; fixup for mmx2
612 SBUTTERFLY dq, 4, 5, 0
617 SBUTTERFLY dq, 0, 5, 4
625 %endif ; HIGH_BIT_DEPTH
627 %if HIGH_BIT_DEPTH == 0
628 ;-----------------------------------------------------------------------------
629 ; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
630 ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
632 ; This implementation can potentially overflow on image widths >= 11008 (or
633 ; 6604 if interlaced), since it is called on blocks of height up to 12 (or 20
634 ; when interlaced). At sane distortion levels it will take much more than that though.
635 ;-----------------------------------------------------------------------------
637 cglobal pixel_ssd_nv12_core, 6,7
648 %if mmsize == 32 ; only 16-byte alignment is guaranteed
666 %if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
669 pandn m0, m1, m0 ; zero the lower half
690 %endif ; !HIGH_BIT_DEPTH
701 ;=============================================================================
703 ;=============================================================================
707 pxor m6, m6 ; sum squared
708 %if HIGH_BIT_DEPTH == 0
714 %endif ; !HIGH_BIT_DEPTH
719 %if mmsize == 8 && %1*%2 == 256
724 %else ; !HIGH_BIT_DEPTH
726 %endif ; HIGH_BIT_DEPTH
760 mova m4, [r0+%1+mmsize]
761 %else ; !HIGH_BIT_DEPTH
767 %endif ; HIGH_BIT_DEPTH
773 %if HIGH_BIT_DEPTH == 0
776 %endif ; !HIGH_BIT_DEPTH
782 ;-----------------------------------------------------------------------------
783 ; int pixel_var_wxh( uint8_t *, intptr_t )
784 ;-----------------------------------------------------------------------------
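;
; pixel_var accumulates the pixel sum and sum of squares of a single block and
; hands both back to the caller, which forms the variance (treat the exact
; return packing as an implementation detail; the formula is the same
; "sqr - (sum * sum >> shift)" noted in the var2 code below). Scalar sketch with
; a hypothetical helper:
;
;   #include <stdint.h>
;   static void var_wxh( const uint8_t *pix, intptr_t stride, int w, int h,
;                        uint32_t *sum, uint32_t *sqr )
;   {
;       *sum = *sqr = 0;
;       for( int y = 0; y < h; y++, pix += stride )
;           for( int x = 0; x < w; x++ )
;           {
;               *sum += pix[x];
;               *sqr += pix[x] * pix[x];
;           }
;       /* caller: variance = *sqr - ((uint64_t)*sum * *sum >> log2(w*h)) */
;   }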
786 cglobal pixel_var_16x16, 2,3
789 VAR_2ROW 8*SIZEOF_PIXEL, 16
792 cglobal pixel_var_8x16, 2,3
798 cglobal pixel_var_8x8, 2,3
806 cglobal pixel_var_16x16, 2,3,8
812 cglobal pixel_var_8x8, 2,3,8
835 %endif ; HIGH_BIT_DEPTH
837 %if HIGH_BIT_DEPTH == 0
839 cglobal pixel_var_16x16, 2,3,8
852 cglobal pixel_var_8x8, 2,4,8
868 cglobal pixel_var_8x16, 2,4,8
893 cglobal pixel_var_16x16, 2,4,7
900 pmovzxbw m1, [r0+r1*2]
906 vextracti128 xm0, m5, 1
907 vextracti128 xm1, m6, 1
920 %endif ; !HIGH_BIT_DEPTH
930 sub eax, r1d ; sqr - (sum * sum >> shift)
934 ;-----------------------------------------------------------------------------
935 ; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
936 ;-----------------------------------------------------------------------------
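;
; var2 applies the same idea to the difference of two blocks: it returns the
; variance of (pix1 - pix2) and writes the raw sum of squared differences
; through the final int pointer. Scalar sketch (hypothetical helper; shift is
; log2(8*h), i.e. 6 for 8x8 and 7 for 8x16, matching the
; "sqr - (sum * sum >> shift)" comment above):
;
;   #include <stdint.h>
;   static int var2_8xh( const uint8_t *pix1, intptr_t stride1,
;                        const uint8_t *pix2, intptr_t stride2, int h, int *ssd )
;   {
;       int sum = 0, sqr = 0;
;       int shift = (h == 16) ? 7 : 6;
;       for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < 8; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               sum += d;
;               sqr += d * d;
;           }
;       *ssd = sqr;
;       return sqr - ((sum * sum) >> shift);
;   }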
937 %macro VAR2_8x8_MMX 2
938 cglobal pixel_var2_8x%1, 5,6
947 psubw m1, [r2+mmsize]
948 %else ; !HIGH_BIT_DEPTH
959 %endif ; HIGH_BIT_DEPTH
979 %macro VAR2_8x8_SSE2 2
980 cglobal pixel_var2_8x%1, 5,6,8
989 %else ; !HIGH_BIT_DEPTH
995 %endif ; HIGH_BIT_DEPTH
1004 lea r0, [r0+r1*2*SIZEOF_PIXEL]
1005 lea r2, [r2+r3*2*SIZEOF_PIXEL]
1015 %if HIGH_BIT_DEPTH == 0
1016 %macro VAR2_8x8_SSSE3 2
1017 cglobal pixel_var2_8x%1, 5,6,8
1019 pxor m6, m6 ; sum squared
1062 VAR2_8x8_SSSE3 16, 7
1065 VAR2_8x8_SSSE3 16, 7
1067 %macro VAR2_8x8_AVX2 2
1068 cglobal pixel_var2_8x%1, 5,6,6
1070 pxor m4, m4 ; sum squared
1076 vinserti128 m0, m0, [r0+r1], 1
1077 vinserti128 m1, m1, [r2+r3], 1
1083 vinserti128 m1, m1, [r0+r1], 1
1084 vinserti128 m2, m2, [r2+r3], 1
1098 vextracti128 xm0, m3, 1
1099 vextracti128 xm1, m4, 1
1102 VAR2_END %2, xm3, xm4
1109 %endif ; !HIGH_BIT_DEPTH
1111 ;=============================================================================
1113 ;=============================================================================
1117 ; just use shufps on anything post conroe
1119 %elif cpuflag(ssse3) && notcpuflag(atom)
1120 ; join 2x 32 bit and duplicate them
1121 ; emulating shufps is faster on conroe
1125 ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
1137 %macro DIFF_UNPACK_SSE2 5
1146 %macro DIFF_SUMSUB_SSSE3 5
1147 HSUMSUB %1, %2, %3, %4, %5
1152 %macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
1158 %macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
1165 %macro LOAD_DUP_4x8P_PENRYN 8
1166 ; penryn and nehalem run punpcklqdq and movddup in different units
1175 %macro LOAD_SUMSUB_8x2P 9
1176 LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
1177 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1180 %macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
1181 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
1182 LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
1183 LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
1190 %macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
1196 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1199 %macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
1202 DEINTB %1, %2, %3, %4, %5
1205 SUMSUB_BA w, %1, %2, %3
1208 %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
1209 ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
1210 LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
1211 LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
1212 LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
1213 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
1216 %macro LOAD_SUMSUB_16x2P_AVX2 9
1217 ; 2*dst, 2*tmp, mul, 4*ptr
1218 vbroadcasti128 m%1, [%6]
1219 vbroadcasti128 m%3, [%7]
1220 vbroadcasti128 m%2, [%8]
1221 vbroadcasti128 m%4, [%9]
1222 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1225 %macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
1226 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
1227 LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
1228 LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
1235 %macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
1240 vpermq m%3, m%3, q0011
1241 vpermq m%4, m%4, q0011
1242 vpermq m%1, m%1, q0011
1243 vpermq m%2, m%2, q0011
1246 %macro LOAD_SUMSUB8_16x2P_AVX2 9
1247 ; 2*dst, 2*tmp, mul, 4*ptr
1248 LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
1249 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1252 %macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
1253 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
1254 LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
1255 LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
1262 ; in: r4=3*stride1, r5=3*stride2
1263 ; in: %2 = horizontal offset
1264 ; in: %3 = whether we need to increment pix1 and pix2
1267 %macro SATD_4x4_MMX 3
1269 %assign offset %2*SIZEOF_PIXEL
1270 LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
1271 LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
1272 LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
1273 LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
1278 HADAMARD4_2D 4, 5, 6, 7, 3, %%n
1283 ; in: %1 = horizontal if 0, vertical if 1
1284 %macro SATD_8x4_SSE 8-9
1286 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
1288 HADAMARD4_V %2, %3, %4, %5, %6
1289 ; doing the abs first is a slight advantage
1290 ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
1291 ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
1292 HADAMARD 1, max, %2, %4, %6, %7
1302 HADAMARD 1, max, %3, %5, %6, %7
1307 %macro SATD_START_MMX 0
1309 lea r4, [3*r1] ; 3*stride1
1310 lea r5, [3*r3] ; 3*stride2
1313 %macro SATD_END_MMX 0
1317 %else ; !HIGH_BIT_DEPTH
1318 pshufw m1, m0, q1032
1320 pshufw m1, m0, q2301
1324 %endif ; HIGH_BIT_DEPTH
1328 ; FIXME avoid the spilling of regs to hold 3*stride.
1329 ; for small blocks on x86_32, modify pixel pointer instead.
1331 ;-----------------------------------------------------------------------------
1332 ; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1333 ;-----------------------------------------------------------------------------
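;
; SATD is the sum of absolute coefficients of the 4x4 Hadamard transform of the
; difference block, halved; larger sizes accumulate this over 4x4 sub-blocks
; (processed 8x4 at a time below). A plain scalar sketch of the 4x4 primitive
; (hypothetical helper; x264's actual C reference packs two sums per integer):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static int satd_4x4_ref( const uint8_t *p1, intptr_t s1,
;                            const uint8_t *p2, intptr_t s2 )
;   {
;       int d[4][4], sum = 0;
;       for( int i = 0; i < 4; i++ )
;           for( int j = 0; j < 4; j++ )
;               d[i][j] = p1[i*s1+j] - p2[i*s2+j];
;       for( int i = 0; i < 4; i++ )    /* horizontal 4-point Hadamard per row */
;       {
;           int a = d[i][0]+d[i][1], b = d[i][0]-d[i][1];
;           int c = d[i][2]+d[i][3], e = d[i][2]-d[i][3];
;           d[i][0] = a+c; d[i][1] = a-c; d[i][2] = b+e; d[i][3] = b-e;
;       }
;       for( int j = 0; j < 4; j++ )    /* vertical pass + sum of magnitudes */
;       {
;           int a = d[0][j]+d[1][j], b = d[0][j]-d[1][j];
;           int c = d[2][j]+d[3][j], e = d[2][j]-d[3][j];
;           sum += abs(a+c) + abs(a-c) + abs(b+e) + abs(b-e);
;       }
;       return sum >> 1;
;   }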
1335 cglobal pixel_satd_16x4_internal
1336 SATD_4x4_MMX m2, 0, 0
1337 SATD_4x4_MMX m1, 4, 0
1339 SATD_4x4_MMX m2, 8, 0
1341 SATD_4x4_MMX m1, 12, 0
1346 cglobal pixel_satd_8x8_internal
1347 SATD_4x4_MMX m2, 0, 0
1348 SATD_4x4_MMX m1, 4, 1
1351 pixel_satd_8x4_internal_mmx2:
1352 SATD_4x4_MMX m2, 0, 0
1353 SATD_4x4_MMX m1, 4, 0
1359 %macro SATD_MxN_MMX 3
1360 cglobal pixel_satd_%1x%2, 4,7
1363 call pixel_satd_%1x%3_internal_mmx2
1370 call pixel_satd_%1x%3_internal_mmx2
1381 SATD_MxN_MMX 16, 16, 4
1382 SATD_MxN_MMX 16, 8, 4
1383 SATD_MxN_MMX 8, 16, 8
1384 %endif ; HIGH_BIT_DEPTH
1386 %if HIGH_BIT_DEPTH == 0
1387 cglobal pixel_satd_16x16, 4,6
1391 call pixel_satd_16x4_internal_mmx2
1395 call pixel_satd_16x4_internal_mmx2
1400 cglobal pixel_satd_16x8, 4,6
1403 call pixel_satd_16x4_internal_mmx2
1406 call pixel_satd_16x4_internal_mmx2
1409 cglobal pixel_satd_8x16, 4,6
1412 call pixel_satd_8x8_internal_mmx2
1415 call pixel_satd_8x8_internal_mmx2
1417 %endif ; !HIGH_BIT_DEPTH
1419 cglobal pixel_satd_8x8, 4,6
1422 call pixel_satd_8x8_internal_mmx2
1425 cglobal pixel_satd_8x4, 4,6
1428 call pixel_satd_8x4_internal_mmx2
1431 cglobal pixel_satd_4x16, 4,6
1433 SATD_4x4_MMX m0, 0, 1
1434 SATD_4x4_MMX m1, 0, 1
1436 SATD_4x4_MMX m1, 0, 1
1438 SATD_4x4_MMX m1, 0, 0
1442 cglobal pixel_satd_4x8, 4,6
1444 SATD_4x4_MMX m0, 0, 1
1445 SATD_4x4_MMX m1, 0, 0
1449 cglobal pixel_satd_4x4, 4,6
1451 SATD_4x4_MMX m0, 0, 0
1454 %macro SATD_START_SSE2 2-3 0
1456 %if HIGH_BIT_DEPTH && %3
1458 %elif cpuflag(ssse3) && notcpuflag(atom)
1470 %macro SATD_END_SSE2 1-2
1491 %macro BACKUP_POINTERS 0
1501 %macro RESTORE_AND_INC_POINTERS 0
1503 lea r0, [r6+8*SIZEOF_PIXEL]
1504 lea r2, [r7+8*SIZEOF_PIXEL]
1511 add r0, 8*SIZEOF_PIXEL
1512 add r2, 8*SIZEOF_PIXEL
1516 %macro SATD_4x8_SSE 3
1522 movhps m0, [r0+4*r1]
1523 movhps m4, [r2+4*r3]
1531 movhps m1, [r0+1*r1]
1532 movhps m5, [r2+1*r3]
1533 movhps m2, [r0+2*r1]
1534 movhps m6, [r2+2*r3]
1540 %else ; !HIGH_BIT_DEPTH
1559 DIFFOP 0, 4, 1, 5, 3
1561 DIFFOP 0, 4, 1, 5, 7
1577 DIFFOP 2, 6, 3, 5, 4
1579 DIFFOP 2, 6, 3, 5, 7
1581 %endif ; HIGH_BIT_DEPTH
1582 SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
1585 ;-----------------------------------------------------------------------------
1586 ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
1587 ;-----------------------------------------------------------------------------
1589 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
1591 %if vertical==0 || HIGH_BIT_DEPTH
1592 cglobal pixel_satd_4x4, 4, 6, 6
1595 LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
1596 LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
1597 LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
1598 LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
1599 DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
1600 HADAMARD 0, sumsub, 0, 1, 2, 3
1601 HADAMARD 4, sumsub, 0, 1, 2, 3
1602 HADAMARD 1, amax, 0, 1, 2, 3
1608 cglobal pixel_satd_4x8, 4, 6, 8
1613 SATD_4x8_SSE vertical, 0, swap
1618 cglobal pixel_satd_4x16, 4, 6, 8
1623 SATD_4x8_SSE vertical, 0, swap
1624 lea r0, [r0+r1*2*SIZEOF_PIXEL]
1625 lea r2, [r2+r3*2*SIZEOF_PIXEL]
1626 SATD_4x8_SSE vertical, 1, add
1631 cglobal pixel_satd_8x8_internal
1632 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
1633 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
1634 %%pixel_satd_8x4_internal:
1635 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
1636 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
1639 ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
1640 ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
1641 %if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
1642 cglobal pixel_satd_16x4_internal
1643 LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
1646 ; always use horizontal mode here
1647 SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
1648 SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
1651 cglobal pixel_satd_16x8, 4,6,12
1652 SATD_START_SSE2 m10, m7
1656 jmp %%pixel_satd_16x8_internal
1658 cglobal pixel_satd_16x16, 4,6,12
1659 SATD_START_SSE2 m10, m7
1663 call pixel_satd_16x4_internal
1664 call pixel_satd_16x4_internal
1665 %%pixel_satd_16x8_internal:
1666 call pixel_satd_16x4_internal
1667 call pixel_satd_16x4_internal
1670 cglobal pixel_satd_16x8, 4,6,8
1671 SATD_START_SSE2 m6, m7
1673 call pixel_satd_8x8_internal
1674 RESTORE_AND_INC_POINTERS
1675 call pixel_satd_8x8_internal
1678 cglobal pixel_satd_16x16, 4,6,8
1679 SATD_START_SSE2 m6, m7, 1
1681 call pixel_satd_8x8_internal
1682 call pixel_satd_8x8_internal
1683 SATD_ACCUM m6, m0, m7
1684 RESTORE_AND_INC_POINTERS
1685 call pixel_satd_8x8_internal
1686 call pixel_satd_8x8_internal
1687 SATD_END_SSE2 m6, m7
1690 cglobal pixel_satd_8x16, 4,6,8
1691 SATD_START_SSE2 m6, m7
1692 call pixel_satd_8x8_internal
1693 call pixel_satd_8x8_internal
1696 cglobal pixel_satd_8x8, 4,6,8
1697 SATD_START_SSE2 m6, m7
1698 call pixel_satd_8x8_internal
1701 cglobal pixel_satd_8x4, 4,6,8
1702 SATD_START_SSE2 m6, m7
1703 call %%pixel_satd_8x4_internal
1705 %endmacro ; SATDS_SSE2
1720 %endif ; HIGH_BIT_DEPTH
1724 ; sse2 doesn't seem to like the horizontal way of doing things
1725 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
1728 ;-----------------------------------------------------------------------------
1729 ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
1730 ;-----------------------------------------------------------------------------
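;
; sa8d is the 8x8 analogue of satd: the sum of absolute coefficients of a full
; 8x8 Hadamard transform of the difference block. Scalar sketch (hypothetical
; helpers; the final normalization mirrors the C reference and is illustrative):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static void hadamard8( int v[8] )   /* unnormalized 8-point Hadamard */
;   {
;       for( int s = 1; s < 8; s <<= 1 )
;           for( int i = 0; i < 8; i += 2*s )
;               for( int j = i; j < i+s; j++ )
;               {
;                   int a = v[j], b = v[j+s];
;                   v[j] = a+b; v[j+s] = a-b;
;               }
;   }
;   static int sa8d_8x8_ref( const uint8_t *p1, intptr_t s1,
;                            const uint8_t *p2, intptr_t s2 )
;   {
;       int d[8][8], c[8], sum = 0;
;       for( int i = 0; i < 8; i++ )
;           for( int j = 0; j < 8; j++ )
;               d[i][j] = p1[i*s1+j] - p2[i*s2+j];
;       for( int i = 0; i < 8; i++ )
;           hadamard8( d[i] );                        /* rows */
;       for( int j = 0; j < 8; j++ )                  /* columns */
;       {
;           for( int i = 0; i < 8; i++ ) c[i] = d[i][j];
;           hadamard8( c );
;           for( int i = 0; i < 8; i++ ) sum += abs( c[i] );
;       }
;       return ( sum + 2 ) >> 2;
;   }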
1731 cglobal pixel_sa8d_8x8_internal
1734 LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
1735 LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
1737 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
1739 HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
1747 cglobal pixel_sa8d_8x8, 4,8,12
1754 call pixel_sa8d_8x8_internal
1759 %endif ; HIGH_BIT_DEPTH
1765 cglobal pixel_sa8d_16x16, 4,8,12
1772 call pixel_sa8d_8x8_internal ; pix[0]
1773 add r2, 8*SIZEOF_PIXEL
1774 add r0, 8*SIZEOF_PIXEL
1779 call pixel_sa8d_8x8_internal ; pix[8]
1783 call pixel_sa8d_8x8_internal ; pix[8*stride+8]
1784 sub r2, 8*SIZEOF_PIXEL
1785 sub r0, 8*SIZEOF_PIXEL
1787 call pixel_sa8d_8x8_internal ; pix[8*stride]
1790 %if HIGH_BIT_DEPTH == 0
1800 cglobal pixel_sa8d_8x8_internal
1801 %define spill0 [esp+4]
1802 %define spill1 [esp+20]
1803 %define spill2 [esp+36]
1805 LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
1806 HADAMARD4_2D 0, 1, 2, 3, 4
1808 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
1809 HADAMARD4_2D 4, 5, 6, 7, 3
1810 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
1813 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
1816 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
1817 ; could do first HADAMARD4_V here to save spilling later
1818 ; surprisingly, not a win on conroe or even p4
1823 LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
1824 HADAMARD4_V 4, 5, 6, 7, 3
1830 HADAMARD4_V 0, 1, 2, 3, 7
1831 SUMSUB_BADC w, 0, 4, 1, 5, 7
1832 HADAMARD 2, sumsub, 0, 4, 7, 6
1833 HADAMARD 2, sumsub, 1, 5, 7, 6
1834 HADAMARD 1, amax, 0, 4, 7, 6
1835 HADAMARD 1, amax, 1, 5, 7, 6
1839 SUMSUB_BADC w, 2, 6, 3, 7, 4
1840 HADAMARD 2, sumsub, 2, 6, 4, 5
1841 HADAMARD 2, sumsub, 3, 7, 4, 5
1842 HADAMARD 1, amax, 2, 6, 4, 5
1843 HADAMARD 1, amax, 3, 7, 4, 5
1844 %endif ; sse2/non-sse2
1849 %endif ; ifndef mmx2
1851 cglobal pixel_sa8d_8x8, 4,7
1858 call pixel_sa8d_8x8_internal
1863 %endif ; HIGH_BIT_DEPTH
1870 cglobal pixel_sa8d_16x16, 4,7
1877 call pixel_sa8d_8x8_internal
1886 call pixel_sa8d_8x8_internal
1889 add r0, 8*SIZEOF_PIXEL
1890 add r2, 8*SIZEOF_PIXEL
1893 call pixel_sa8d_8x8_internal
1900 mova [esp+64-mmsize], m0
1901 call pixel_sa8d_8x8_internal
1904 %else ; !HIGH_BIT_DEPTH
1905 paddusw m0, [esp+64-mmsize]
1922 %endif ; HIGH_BIT_DEPTH
1928 %endif ; !ARCH_X86_64
1931 ;=============================================================================
1933 ;=============================================================================
1935 ; %1: vertical/horizontal mode
1936 ; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
1938 ; m6, m11-15: tmp regs
1939 %macro SA8D_SATD_8x4 5
1941 LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
1942 HADAMARD 0, sumsub, %2, %3, 6
1943 HADAMARD 0, sumsub, %4, %5, 6
1944 SBUTTERFLY wd, %2, %3, 6
1945 SBUTTERFLY wd, %4, %5, 6
1946 HADAMARD2_2D %2, %4, %3, %5, 6, dq
1952 HADAMARD 0, sumsub, %2, %3, 6
1953 HADAMARD 0, sumsub, %4, %5, 6
1954 SBUTTERFLY qdq, 12, 13, 6
1955 HADAMARD 0, amax, 12, 13, 6
1956 SBUTTERFLY qdq, 14, 15, 6
1958 HADAMARD 0, amax, 14, 15, 6
1961 LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
1962 HADAMARD4_V %2, %3, %4, %5, 6
1964 pabsw m12, m%2 ; doing the abs first is a slight advantage
1968 HADAMARD 1, max, 12, 14, 6, 11
1970 HADAMARD 1, max, 13, 15, 6, 11
1973 %endmacro ; SA8D_SATD_8x4
1975 ; %1: add spilled regs?
1977 %macro SA8D_SATD_ACCUM 2
1998 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
1999 cglobal pixel_sa8d_satd_8x8_internal
2000 SA8D_SATD_8x4 vertical, 0, 1, 2, 3
2001 SA8D_SATD_8x4 vertical, 4, 5, 8, 9
2003 %if vertical ; sse2-style
2004 HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
2005 HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
2006 %else ; complete sa8d
2007 SUMSUB_BADC w, 0, 4, 1, 5, 12
2008 HADAMARD 2, sumsub, 0, 4, 12, 11
2009 HADAMARD 2, sumsub, 1, 5, 12, 11
2010 SUMSUB_BADC w, 2, 8, 3, 9, 12
2011 HADAMARD 2, sumsub, 2, 8, 12, 11
2012 HADAMARD 2, sumsub, 3, 9, 12, 11
2013 HADAMARD 1, amax, 0, 4, 12, 11
2014 HADAMARD 1, amax, 1, 5, 12, 4
2015 HADAMARD 1, amax, 2, 8, 12, 4
2016 HADAMARD 1, amax, 3, 9, 12, 4
2019 ; create sa8d sub results
2027 ;-------------------------------------------------------------------------------
2028 ; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
2029 ;-------------------------------------------------------------------------------
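;
; This fused kernel exists because satd and sa8d share most of their work: both
; metrics are derived from one set of transformed differences per 8x8 block
; rather than two separate passes, and the two 32-bit sums are packed into the
; uint64_t return value.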
2030 cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
2031 %define temp0 [rsp+0*mmsize]
2032 %define temp1 [rsp+1*mmsize]
2042 call pixel_sa8d_satd_8x8_internal
2043 SA8D_SATD_ACCUM 0, 1
2044 call pixel_sa8d_satd_8x8_internal
2045 SA8D_SATD_ACCUM 1, 0
2046 vextracti128 xm1, m0, 1
2047 vextracti128 xm2, m10, 1
2051 lea r6, [r2+8*SIZEOF_PIXEL]
2052 lea r7, [r0+8*SIZEOF_PIXEL]
2054 call pixel_sa8d_satd_8x8_internal
2055 SA8D_SATD_ACCUM 0, 1
2056 call pixel_sa8d_satd_8x8_internal
2057 SA8D_SATD_ACCUM 1, 1
2062 call pixel_sa8d_satd_8x8_internal
2063 SA8D_SATD_ACCUM 1, 1
2064 call pixel_sa8d_satd_8x8_internal
2065 SA8D_SATD_ACCUM 1, 0
2068 ; xop already has fast horizontal sums
2069 %if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
2070 pmaddwd xm10, [pw_1]
2072 phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2
2073 pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1
2074 paddd xm0, xm1 ; sa8d sa8d satd satd
2093 %endmacro ; SA8D_SATD
2095 ;=============================================================================
2097 ;=============================================================================
2108 ; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
2109 ; and are only retained for old cpus.
2110 %macro INTRA_SA8D_SSE2 0
2112 ;-----------------------------------------------------------------------------
2113 ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
2114 ;-----------------------------------------------------------------------------
2115 cglobal intra_sa8d_x3_8x8, 3,3,14
2118 movq m0, [r0+0*FENC_STRIDE]
2119 movq m1, [r0+1*FENC_STRIDE]
2120 movq m2, [r0+2*FENC_STRIDE]
2121 movq m3, [r0+3*FENC_STRIDE]
2122 movq m4, [r0+4*FENC_STRIDE]
2123 movq m5, [r0+5*FENC_STRIDE]
2124 movq m6, [r0+6*FENC_STRIDE]
2125 movq m7, [r0+7*FENC_STRIDE]
2135 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
2137 ABSW2 m8, m9, m2, m3, m2, m3
2138 ABSW2 m10, m11, m4, m5, m4, m5
2141 ABSW2 m10, m11, m6, m7, m6, m7
2148 ; 1D hadamard of edges
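; (the V/H/DC predictions are constant along columns/rows, so their 2D Hadamard
;  is nonzero only along one edge or at DC; transforming the 1D edges, scaled by
;  8, is enough to turn the fenc transform above into all three prediction costs
;  without materializing any predicted block)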
2154 HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
2155 HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
2156 pshuflw m10, m8, q2301
2157 pshuflw m11, m9, q2301
2158 pshufhw m10, m10, q2301
2159 pshufhw m11, m11, q2301
2160 pmullw m8, [pw_pmpmpmpm]
2161 pmullw m11, [pw_pmpmpmpm]
2171 psllw m8, 3 ; left edge
2174 ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum
2183 punpcklqdq m0, m4 ; transpose
2184 psllw m9, 3 ; top edge
2185 psrldq m2, m13, 2 ; 8x7 sum
2186 psubw m0, m9 ; 8x1 sum
2195 punpckhdq m3, m2, m8
2197 pshufd m5, m13, q3311
2200 punpckhqdq m0, m2, m5
2205 movq [r2], m0 ; i8x8_v, i8x8_h
2207 movd [r2+8], m0 ; i8x8_dc
2209 %endif ; ARCH_X86_64
2210 %endmacro ; INTRA_SA8D_SSE2
2213 ; out: m0..m3 = hadamard coefs
2215 cglobal hadamard_load
2216 ; not really a global, but otherwise cycles get attributed to the wrong function in profiling
2218 mova m0, [r0+0*FENC_STRIDEB]
2219 mova m1, [r0+1*FENC_STRIDEB]
2220 mova m2, [r0+2*FENC_STRIDEB]
2221 mova m3, [r0+3*FENC_STRIDEB]
2224 movd m0, [r0+0*FENC_STRIDE]
2225 movd m1, [r0+1*FENC_STRIDE]
2226 movd m2, [r0+2*FENC_STRIDE]
2227 movd m3, [r0+3*FENC_STRIDE]
2233 HADAMARD4_2D 0, 1, 2, 3, 4
2237 %macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
2240 mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
2242 movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
2248 shl %2d, 5 ; log(FDEC_STRIDEB)
2250 movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
2251 pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
2252 pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
2253 pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
2254 %if HIGH_BIT_DEPTH == 0
2262 %define %%sign psignw
2264 %define %%sign pmullw
2266 pshufw %4, %3, q1032
2267 %%sign %4, [pw_ppmmppmm]
2269 pshufw %4, %3, q2301
2270 %%sign %4, [pw_pmpmpmpm]
2273 mova [%1_1d+2*%2], %3
2276 %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
2278 pshufw %4, %1, q1032
2279 pshufw %5, %2, q1032
2280 pshufw %6, %3, q1032
2287 pshufw %4, %1, q1032
2288 pshufw %5, %2, q1032
2289 pshufw %6, %3, q1032
2299 ABSW2 m4, m5, m1, m2, m1, m2
2306 ; out: m0 v, m4 h, m5 dc
2308 %macro SUM4x3 3 ; dc, left, top
2319 punpckldq m0, m2 ; transpose
2321 ABSW2 m4, m5, m4, m5, m2, m3 ; 1x4 sum
2322 ABSW m0, m0, m1 ; 4x1 sum
2325 %macro INTRA_X3_MMX 0
2326 ;-----------------------------------------------------------------------------
2327 ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
2328 ;-----------------------------------------------------------------------------
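;
; Results are stored as res[0] = V, res[1] = H, res[2] = DC (see the movd stores
; at the end). Conceptually this is equivalent to the sketch below, with
; predict_4x4_*() and satd_4x4() as hypothetical helpers; the code below instead
; reuses a single Hadamard of fenc plus 1D Hadamards of the fdec edges:
;
;   static void intra_satd_x3_4x4_ref( const uint8_t *fenc, const uint8_t *fdec, int *res )
;   {
;       uint8_t pred[16];   /* 4x4 prediction from the fdec neighbours */
;       predict_4x4_v ( pred, fdec ); res[0] = satd_4x4( fenc, pred );
;       predict_4x4_h ( pred, fdec ); res[1] = satd_4x4( fenc, pred );
;       predict_4x4_dc( pred, fdec ); res[2] = satd_4x4( fenc, pred );
;   }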
2329 cglobal intra_satd_x3_4x4, 3,3
2331 ; stack is 16 byte aligned because abi says so
2332 %define top_1d rsp-8 ; size 8
2333 %define left_1d rsp-16 ; size 8
2335 ; WIN64: stack is 16 byte aligned because abi says so
2336 ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
2338 %define top_1d rsp+8
2343 SCALAR_HADAMARD left, 0, m4, m5
2344 SCALAR_HADAMARD top, 0, m6, m5, m7
2347 pand m6, [sw_f0] ; dc
2350 SUM4x3 m6, [left_1d], [top_1d]
2354 psrlq m1, 16 ; 4x3 sum
2357 SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
2358 movd [r2+0], m0 ; i4x4_v satd
2359 movd [r2+4], m4 ; i4x4_h satd
2360 movd [r2+8], m5 ; i4x4_dc satd
2366 ;-----------------------------------------------------------------------------
2367 ; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
2368 ;-----------------------------------------------------------------------------
2369 cglobal intra_satd_x3_16x16, 0,5
2370 %assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
2371 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
2373 %define sums rsp+64 ; size 56
2374 %define top_1d rsp+32 ; size 32
2375 %define left_1d rsp ; size 32
2393 SCALAR_HADAMARD left, r3, m0, m1
2394 SCALAR_HADAMARD top, r3, m1, m2, m3
2400 pand m6, [sw_f0] ; dc
2411 SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
2414 paddw m0, [sums+ 0] ; i16x16_v satd
2415 paddw m4, [sums+ 8] ; i16x16_h satd
2416 paddw m5, [sums+16] ; i16x16_dc satd
2421 add r0, 4*SIZEOF_PIXEL
2438 punpckhwd m3, m5, m7
2448 add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
2457 HADDD m5, m7 ; DC satd
2458 HADDD m4, m7 ; H satd
2459 HADDD m0, m7 ; the part of V satd that doesn't overlap with DC
2461 psrlq m1, 32 ; DC[1]
2462 paddd m0, m3 ; DC[2]
2463 psrlq m3, 32 ; DC[3]
2468 SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
2475 movd [r2+8], m5 ; i16x16_dc satd
2476 movd [r2+4], m4 ; i16x16_h satd
2477 movd [r2+0], m0 ; i16x16_v satd
2487 ;-----------------------------------------------------------------------------
2488 ; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
2489 ;-----------------------------------------------------------------------------
2490 cglobal intra_satd_x3_8x8c, 0,6
2491 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
2493 %define sums rsp+48 ; size 24
2494 %define dc_1d rsp+32 ; size 16
2495 %define top_1d rsp+16 ; size 16
2496 %define left_1d rsp ; size 16
2506 SCALAR_HADAMARD left, r3, m0, m1
2507 SCALAR_HADAMARD top, r3, m0, m1, m2
2512 movzx t0d, word [left_1d+0]
2513 movzx r3d, word [top_1d+0]
2514 movzx r4d, word [left_1d+8]
2515 movzx r5d, word [top_1d+8]
2516 lea t0d, [t0 + r3 + 16]
2517 lea r3d, [r4 + r5 + 16]
2526 mov [dc_1d+ 0], t0d ; tl
2527 mov [dc_1d+ 4], r5d ; tr
2528 mov [dc_1d+ 8], r4d ; bl
2529 mov [dc_1d+12], r3d ; br
2542 SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)]
2545 paddw m0, [sums+16] ; i4x4_v satd
2546 paddw m4, [sums+8] ; i4x4_h satd
2547 paddw m5, [sums+0] ; i4x4_dc satd
2552 add r0, 4*SIZEOF_PIXEL
2555 add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
2568 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
2574 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
2577 movd [r2+0], m0 ; i8x8c_dc satd
2578 movd [r2+4], m1 ; i8x8c_h satd
2579 movd [r2+8], m2 ; i8x8c_v satd
2582 %endmacro ; INTRA_X3_MMX
2586 %macro PRED4x4_LOWPASS 5
2603 %macro INTRA_X9_PRED 2
2605 movu m1, [r1-1*FDEC_STRIDE-8]
2606 pinsrb m1, [r1+3*FDEC_STRIDE-1], 0
2607 pinsrb m1, [r1+2*FDEC_STRIDE-1], 1
2608 pinsrb m1, [r1+1*FDEC_STRIDE-1], 2
2609 pinsrb m1, [r1+0*FDEC_STRIDE-1], 3
2611 movd mm0, [r1+3*FDEC_STRIDE-4]
2612 punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
2613 movd mm1, [r1+1*FDEC_STRIDE-4]
2614 punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
2618 movu m1, [r1-1*FDEC_STRIDE-8]
2619 movss m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
2621 pshufb m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
2622 psrldq m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
2623 psrldq m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
2624 pavgb m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __
2626 PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
2628 ; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2
2629 ; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1
2630 ; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0
2631 ; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt
2632 pshufb m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
2633 pshufb m3, m0, [%1_ddlr2] ; rows 2,3
2635 ; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2
2636 ; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3
2637 ; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3
2638 ; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3
2639 pslldq m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
2640 palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
2641 pshufb m6, m7, [%1_hdu1]
2642 pshufb m7, m7, [%1_hdu2]
2644 ; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4
2645 ; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4
2646 ; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5
2647 ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
2648 psrldq m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
2649 palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
2650 pshufb m4, m5, [%1_vrl1]
2651 pshufb m5, m5, [%1_vrl2]
2652 %endmacro ; INTRA_X9_PRED
2654 %macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
2655 pshufb m2, m%1, [intrax9b_vh1]
2656 pshufb m3, m%1, [intrax9b_vh2]
2657 mova [pred_buf+0x60], m2
2658 mova [pred_buf+0x70], m3
2659 pshufb m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
2660 pmaddubsw m%1, [hmul_4p]
2661 pshufhw m0, m%1, q2301
2662 pshuflw m0, m0, q2301
2663 psignw m%1, [pw_pmpmpmpm]
2665 psllw m0, 2 ; hadamard(top), hadamard(left)
2667 pshufb m1, m0, [intrax9b_v1]
2668 pshufb m2, m0, [intrax9b_v2]
2670 psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
2672 pand m0, [sw_f0] ; dc
2673 ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
2674 ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs.
2675 ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef.
2676 HADAMARD 0, sumsub, %2, %3, %4, %5
2677 HADAMARD 1, sumsub, %2, %3, %4, %5
2680 imul r3d, 0x01010101
2681 mov [pred_buf+0x80], r3d
2682 mov [pred_buf+0x88], r3d
2683 mov [pred_buf+0x90], r3d
2684 mov [pred_buf+0x98], r3d
2700 SBUTTERFLY qdq, 3, 0, 2
2711 pmaddwd m1, [pw_1] ; v, _, h, dc
2713 %endmacro ; INTRA_X9_VHDC
2715 %macro INTRA_X9_END 2
2717 phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
2724 ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
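; (worst case 4x4 sad = 16*255 = 4080 < 2^12, so sad plus a bitcost still fits in
;  13 bits and the top 3 bits of each word are free for the mode index)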
2726 paddw m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
2728 ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
2731 paddw m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
2735 pshuflw m1, m0, q0032
2737 pshuflw m1, m0, q0001
2744 ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
2745 ; 1<<12: undo sign manipulation
2746 lea eax, [rax+r2+(1<<16)+(1<<12)]
2751 ; output the predicted samples
2756 movzx r2d, byte [r2+r3]
2758 movzx r2d, byte [%2_lut+r3]
2761 movq mm0, [pred_buf+r2]
2762 movq mm1, [pred_buf+r2+16]
2763 movd [r1+0*FDEC_STRIDE], mm0
2764 movd [r1+2*FDEC_STRIDE], mm1
2767 movd [r1+1*FDEC_STRIDE], mm0
2768 movd [r1+3*FDEC_STRIDE], mm1
2772 mov r3d, [pred_buf+r2+8*i]
2773 mov [r1+i*FDEC_STRIDE], r3d
2777 %endmacro ; INTRA_X9_END
2780 ;-----------------------------------------------------------------------------
2781 ; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
2782 ;-----------------------------------------------------------------------------
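;
; The x9 functions score all 9 intra prediction modes in one call: each mode's
; SAD is added to the caller-supplied per-mode bitcost, the winning prediction
; is written back into fdec, and the winning mode and cost are returned packed
; in one int (mode in the upper half, cost in the lower, per the packing
; comments in INTRA_X9_END). Equivalent sketch with hypothetical helpers:
;
;   static int intra_sad_x9_4x4_ref( const uint8_t *fenc, uint8_t *fdec,
;                                    const uint16_t *bitcosts )
;   {
;       int best = 1<<30, best_mode = 0;
;       for( int mode = 0; mode < 9; mode++ )
;       {
;           uint8_t pred[16];
;           predict_4x4[mode]( pred, fdec );              /* hypothetical */
;           int cost = sad_4x4( fenc, pred ) + bitcosts[mode];
;           if( cost < best ) { best = cost; best_mode = mode; }
;       }
;       store_pred_4x4( fdec, best_mode );                /* hypothetical */
;       return (best_mode << 16) + best;
;   }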
2784 cglobal intra_sad_x9_4x4, 3,4,9
2785 %assign pad 0xc0-gprsize-(stack_offset&15)
2786 %define pred_buf rsp
2789 INTRA_X9_PRED intrax9a, m8
2791 INTRA_X9_PRED intrax9a, [rsp+0xa0]
2800 movd m0, [r0+0*FENC_STRIDE]
2801 pinsrd m0, [r0+1*FENC_STRIDE], 1
2802 movd m1, [r0+2*FENC_STRIDE]
2803 pinsrd m1, [r0+3*FENC_STRIDE], 1
2805 movd mm0, [r0+0*FENC_STRIDE]
2806 punpckldq mm0, [r0+1*FENC_STRIDE]
2807 movd mm1, [r0+2*FENC_STRIDE]
2808 punpckldq mm1, [r0+3*FENC_STRIDE]
2829 %define %%zero [pb_0]
2831 pshufb m3, m7, [intrax9a_vh1]
2832 pshufb m5, m7, [intrax9a_vh2]
2833 pshufb m7, [intrax9a_dc]
2848 movzx r3d, word [r2]
2851 punpckhqdq m3, m0 ; h, dc
2852 shufps m3, m2, q2020
2858 INTRA_X9_END 1, intrax9a
2864 ;-----------------------------------------------------------------------------
2865 ; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
2866 ;-----------------------------------------------------------------------------
2867 cglobal intra_satd_x9_4x4, 3,4,16
2868 %assign pad 0xb0-gprsize-(stack_offset&15)
2869 %define pred_buf rsp
2871 INTRA_X9_PRED intrax9b, m15
2878 movd m8, [r0+0*FENC_STRIDE]
2879 movd m9, [r0+1*FENC_STRIDE]
2880 movd m10, [r0+2*FENC_STRIDE]
2881 movd m11, [r0+3*FENC_STRIDE]
2892 pshufd m1, m2, q3232
2895 call .satd_8x4 ; ddr, ddl
2897 pshufd m3, m5, q3232
2900 pshufd m1, m4, q3232
2901 call .satd_8x4 ; vr, vl
2903 pshufd m3, m7, q3232
2906 pshufd m1, m6, q3232
2907 call .satd_8x4 ; hd, hu
2911 punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't
2913 mova m1, [pw_ppmmppmm]
2918 INTRA_X9_VHDC 15, 8, 10, 6, 7
2923 %if notcpuflag(sse4)
2924 pshufhw m0, m0, q3120 ; compensate for different order in unpack
2928 movzx r0d, word [r2]
2930 INTRA_X9_END 0, intrax9b
2933 RESET_MM_PERMUTATION
2944 SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
2947 pshufd m1, m0, q0032
2951 paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
2954 %else ; !ARCH_X86_64
2955 cglobal intra_satd_x9_4x4, 3,4,8
2956 %assign pad 0x120-gprsize-(stack_offset&15)
2957 %define fenc_buf rsp
2958 %define pred_buf rsp+0x40
2959 %define spill rsp+0xe0
2961 INTRA_X9_PRED intrax9b, [spill+0x20]
2962 mova [pred_buf+0x00], m2
2963 mova [pred_buf+0x10], m3
2964 mova [pred_buf+0x20], m4
2965 mova [pred_buf+0x30], m5
2966 mova [pred_buf+0x40], m6
2967 mova [pred_buf+0x50], m7
2968 movd m4, [r0+0*FENC_STRIDE]
2969 movd m5, [r0+1*FENC_STRIDE]
2970 movd m6, [r0+2*FENC_STRIDE]
2971 movd m0, [r0+3*FENC_STRIDE]
2981 mova [fenc_buf+0x00], m4
2982 mova [fenc_buf+0x10], m5
2983 mova [fenc_buf+0x20], m6
2984 mova [fenc_buf+0x30], m0
2986 pshufd m1, m2, q3232
2996 call .satd_8x4b ; ddr, ddl
2997 mova m3, [pred_buf+0x30]
2998 mova m1, [pred_buf+0x20]
3001 movq [spill+0x08], m0
3004 call .satd_8x4 ; vr, vl
3005 mova m3, [pred_buf+0x50]
3006 mova m1, [pred_buf+0x40]
3009 movq [spill+0x10], m0
3012 call .satd_8x4 ; hd, hu
3013 movq [spill+0x18], m0
3014 mova m1, [spill+0x20]
3015 mova m4, [fenc_buf+0x00]
3016 mova m5, [fenc_buf+0x20]
3017 mova m2, [pw_ppmmppmm]
3020 paddw m4, [fenc_buf+0x10]
3021 paddw m5, [fenc_buf+0x30]
3022 INTRA_X9_VHDC 1, 4, 5, 6, 7
3026 punpckhqdq m1, [spill+0x00]
3027 packssdw m1, [spill+0x10]
3029 pshufhw m1, m1, q3120
3031 pshufhw m0, m0, q3120
3034 movzx r0d, word [r2]
3036 INTRA_X9_END 0, intrax9b
3039 RESET_MM_PERMUTATION
3046 %xdefine fenc_buf fenc_buf+gprsize
3047 psubw m0, [fenc_buf+0x00]
3048 psubw m1, [fenc_buf+0x10]
3049 psubw m2, [fenc_buf+0x20]
3051 psubw m3, [fenc_buf+0x30]
3052 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
3055 pshufd m1, m0, q0032
3062 %endmacro ; INTRA_X9
3067 ;-----------------------------------------------------------------------------
3068 ; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
3069 ;-----------------------------------------------------------------------------
3070 cglobal intra_sad_x9_8x8, 5,6,9
3080 %assign padbase 0x10
3082 %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
3083 %define pred(i,j) [rsp+i*0x40+j*0x10+padbase]
3086 movq fenc02, [r0+FENC_STRIDE* 0]
3087 movq fenc13, [r0+FENC_STRIDE* 1]
3088 movq fenc46, [r0+FENC_STRIDE* 4]
3089 movq fenc57, [r0+FENC_STRIDE* 5]
3090 movhps fenc02, [r0+FENC_STRIDE* 2]
3091 movhps fenc13, [r0+FENC_STRIDE* 3]
3092 movhps fenc46, [r0+FENC_STRIDE* 6]
3093 movhps fenc57, [r0+FENC_STRIDE* 7]
3095 ; save instruction size: avoid 4-byte memory offsets
3096 lea r0, [intra8x9_h1+128]
3097 %define off(m) (r0+m-(intra8x9_h1+128))
3102 psadbw m1, m0, fenc02
3104 psadbw m2, m0, fenc13
3106 psadbw m3, m0, fenc46
3108 psadbw m0, m0, fenc57
3118 pshufb m1, m0, [off(intra8x9_h1)]
3119 pshufb m2, m0, [off(intra8x9_h2)]
3125 pshufb m3, m0, [off(intra8x9_h3)]
3126 pshufb m2, m0, [off(intra8x9_h4)]
3137 lea r5, [rsp+padbase+0x100]
3138 %define pred(i,j) [r5+i*0x40+j*0x10-0x100]
3150 psadbw m1, m0, fenc02
3152 psadbw m2, m0, fenc13
3154 psadbw m3, m0, fenc46
3156 psadbw m0, m0, fenc57
3165 ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
3166 ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
3167 ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
3168 ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
3169 ; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
3170 ; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
3171 ; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
3172 ; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
3176 pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
3177 PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
3178 pshufb m1, m0, [off(intra8x9_ddl1)]
3179 pshufb m2, m0, [off(intra8x9_ddl2)]
3185 pshufb m2, m0, [off(intra8x9_ddl3)]
3189 pshufb m2, m0, [off(intra8x9_ddl4)]
3198 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
3199 ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
3200 ; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
3201 ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
3202 ; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
3203 ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
3204 ; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
3205 ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
3206 pshufb m1, m3, [off(intra8x9_vl1)]
3207 pshufb m2, m0, [off(intra8x9_vl2)]
3208 pshufb m3, m3, [off(intra8x9_vl3)]
3209 pshufb m0, m0, [off(intra8x9_vl4)]
3224 pextrw [r4+14], m0, 0
3228 lea r5, [rsp+padbase+0x100]
3232 ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
3233 ; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
3234 ; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
3235 ; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
3236 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
3237 ; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
3238 ; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
3239 ; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
3243 pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
3244 PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
3245 pshufb m1, m0, [off(intra8x9_ddr1)]
3246 pshufb m2, m0, [off(intra8x9_ddr2)]
3252 pshufb m2, m0, [off(intra8x9_ddr3)]
3256 pshufb m2, m0, [off(intra8x9_ddr4)]
3266 %define off(m) (r0+m-(intra8x9_h1+256+128))
3267 %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]
3270 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
3271 ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
3272 ; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
3273 ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
3274 ; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
3275 ; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
3276 ; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
3277 ; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
3278 movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
3279 pshufb m1, m2, [off(intra8x9_vr1)]
3280 pshufb m2, m2, [off(intra8x9_vr3)]
3286 pshufb m2, m0, [off(intra8x9_vr2)]
3290 pshufb m2, m0, [off(intra8x9_vr4)]
3299 ; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
3300 ; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
3301 ; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
3302 ; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
3303 ; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
3304 ; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
3305 ; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
3306 ; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
3307 pshufd m2, m3, q0001
3309 pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
3314 punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
3315 pshufb m1, m2, [off(intra8x9_hd1)]
3316 pshufb m2, m2, [off(intra8x9_hd2)]
3322 pshufb m2, m0, [off(intra8x9_hd3)]
3323 pshufb m3, m0, [off(intra8x9_hd4)]
3332 ; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
3337 ; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
3338 ; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
3339 ; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6
3340 ; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
3341 ; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
3342 ; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
3343 ; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
3344 ; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
3346 pinsrb m0, [r2+7], 15 ; Gl7
3353 pshufb m1, m0, [off(intra8x9_hu1)]
3354 pshufb m2, m0, [off(intra8x9_hu2)]
3360 pshufb m2, m0, [off(intra8x9_hu3)]
3361 pshufb m0, m0, [off(intra8x9_hu4)]
3376 movzx r5d, word [r3+16]
3381 phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl
3384 ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index
3387 paddw m0, [off(pw_s00112233)]
3390 pshuflw m1, m0, q0032
3393 ; repack with 3 bit index
3401 ; reverse to phminposuw order
3415 add r1, 4*FDEC_STRIDE
3416 mova m0, [rsp+padbase+r2+0x00]
3417 mova m1, [rsp+padbase+r2+0x10]
3418 mova m2, [rsp+padbase+r2+0x20]
3419 mova m3, [rsp+padbase+r2+0x30]
3420 movq [r1+FDEC_STRIDE*-4], m0
3421 movhps [r1+FDEC_STRIDE*-2], m0
3422 movq [r1+FDEC_STRIDE*-3], m1
3423 movhps [r1+FDEC_STRIDE*-1], m1
3424 movq [r1+FDEC_STRIDE* 0], m2
3425 movhps [r1+FDEC_STRIDE* 2], m2
3426 movq [r1+FDEC_STRIDE* 1], m3
3427 movhps [r1+FDEC_STRIDE* 3], m3
3432 ;-----------------------------------------------------------------------------
3433 ; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
3434 ;-----------------------------------------------------------------------------
3435 cglobal intra_sa8d_x9_8x8, 5,6,16
3436 %assign pad 0x2c0+0x10-gprsize-(stack_offset&15)
3437 %define fenc_buf rsp
3438 %define pred_buf rsp+0x80
3444 movddup m %+ %%i, [r0+%%i*FENC_STRIDE]
3445 pmaddubsw m9, m %+ %%i, m15
3446 punpcklbw m %+ %%i, m8
3447 mova [fenc_buf+%%i*0x10], m9
3451 ; save instruction size: avoid 4-byte memory offsets
3452 lea r0, [intra8x9_h1+0x80]
3453 %define off(m) (r0+m-(intra8x9_h1+0x80))
3454 lea r5, [pred_buf+0x80]
3457 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
3466 ; 1D hadamard of edges
3474 pshufb m9, [intrax3_shuf]
3475 pmaddubsw m8, [pb_pppm]
3476 pmaddubsw m9, [pb_pppm]
3477 HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm]
3478 HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm]
3494 psllw m8, 3 ; left edge
3497 pabsw m8, m8 ; 1x8 sum
3507 punpcklqdq m0, m4 ; transpose
3508 psllw m9, 3 ; top edge
3509 psrldq m10, m11, 2 ; 8x7 sum
3510 psubw m0, m9 ; 8x1 sum
3514 phaddd m10, m8 ; logically phaddw, but this is faster and it won't overflow
3520 pshufb m0, m3, [off(intra8x9_h1)]
3521 pshufb m1, m3, [off(intra8x9_h2)]
3522 pshufb m2, m3, [off(intra8x9_h3)]
3523 pshufb m3, m3, [off(intra8x9_h4)]
3534 PRED4x4_LOWPASS m8, m1, m2, m8, m3
3535 pshufb m0, m8, [off(intra8x9_ddl1)]
3536 pshufb m1, m8, [off(intra8x9_ddl2)]
3537 pshufb m2, m8, [off(intra8x9_ddl3)]
3538 pshufb m3, m8, [off(intra8x9_ddl4)]
3544 pshufb m0, m9, [off(intra8x9_vl1)]
3545 pshufb m1, m8, [off(intra8x9_vl2)]
3546 pshufb m2, m9, [off(intra8x9_vl3)]
3547 pshufb m3, m8, [off(intra8x9_vl4)]
3558 PRED4x4_LOWPASS m8, m1, m2, m8, m3
3559 pshufb m0, m8, [off(intra8x9_ddr1)]
3560 pshufb m1, m8, [off(intra8x9_ddr2)]
3561 pshufb m2, m8, [off(intra8x9_ddr3)]
3562 pshufb m3, m8, [off(intra8x9_ddr4)]
3568 %define off(m) (r0+m-(intra8x9_h1+0x180))
3572 pshufb m0, m2, [off(intra8x9_vr1)]
3573 pshufb m1, m8, [off(intra8x9_vr2)]
3574 pshufb m2, m2, [off(intra8x9_vr3)]
3575 pshufb m3, m8, [off(intra8x9_vr4)]
3582 pshufd m1, m9, q0001
3583 pblendw m1, m8, q3330
3585 pshufd m2, m9, q0001
3589 pshufb m0, m1, [off(intra8x9_hd1)]
3590 pshufb m1, m1, [off(intra8x9_hd2)]
3591 pshufb m2, m8, [off(intra8x9_hd3)]
3592 pshufb m3, m8, [off(intra8x9_hd4)]
3600 pinsrb m8, [r2+7], 15
3607 pshufb m0, m8, [off(intra8x9_hu1)]
3608 pshufb m1, m8, [off(intra8x9_hu2)]
3609 pshufb m2, m8, [off(intra8x9_hu3)]
3610 pshufb m3, m8, [off(intra8x9_hu4)]
3618 pshuflw m1, m0, q0032
3627 movzx r5d, word [r3+16]
3635 ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
3637 paddw m0, [off(pw_s00001111)]
3640 pshuflw m1, m0, q0032
3643 pcmpgtw m2, m1 ; 2nd index bit
3646 ; repack with 3 bit index
3654 lea r3d, [ r3*4+r4+1]
3657 ; reverse to phminposuw order
3671 add r1, 4*FDEC_STRIDE
3672 mova m0, [pred_buf+r2+0x00]
3673 mova m1, [pred_buf+r2+0x10]
3674 mova m2, [pred_buf+r2+0x20]
3675 mova m3, [pred_buf+r2+0x30]
3676 movq [r1+FDEC_STRIDE*-4], m0
3677 movhps [r1+FDEC_STRIDE*-2], m0
3678 movq [r1+FDEC_STRIDE*-3], m1
3679 movhps [r1+FDEC_STRIDE*-1], m1
3680 movq [r1+FDEC_STRIDE* 0], m2
3681 movhps [r1+FDEC_STRIDE* 2], m2
3682 movq [r1+FDEC_STRIDE* 1], m3
3683 movhps [r1+FDEC_STRIDE* 3], m3
3690 %xdefine fenc_buf fenc_buf+gprsize
3703 PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3
3706 psubw m0, [fenc_buf+0x00]
3707 psubw m1, [fenc_buf+0x10]
3710 psubw m2, [fenc_buf+0x20]
3711 psubw m3, [fenc_buf+0x30]
3714 psubw m4, [fenc_buf+0x40]
3715 psubw m5, [fenc_buf+0x50]
3718 psubw m6, [fenc_buf+0x60]
3719 psubw m7, [fenc_buf+0x70]
3720 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14
3725 %endif ; ARCH_X86_64
3726 %endmacro ; INTRA8_X9
3728 ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
3729 ; out: [tmp]=hadamard4, m0=satd
3731 cglobal hadamard_ac_4x4
3737 %else ; !HIGH_BIT_DEPTH
3746 %endif ; HIGH_BIT_DEPTH
3747 HADAMARD4_2D 0, 1, 2, 3, 4
3763 cglobal hadamard_ac_2x2max
3769 SUMSUB_BADC w, 0, 1, 2, 3, 4
3770 ABSW2 m0, m2, m0, m2, m4, m5
3771 ABSW2 m1, m3, m1, m3, m4, m5
3772 HADAMARD 0, max, 0, 2, 4, 5
3773 HADAMARD 0, max, 1, 3, 4, 5
3779 %else ; !HIGH_BIT_DEPTH
3782 %endif ; HIGH_BIT_DEPTH
3798 %endif ; HIGH_BIT_DEPTH
3801 cglobal hadamard_ac_8x8
3807 %endif ; HIGH_BIT_DEPTH
3808 call hadamard_ac_4x4_mmx2
3809 add r0, 4*SIZEOF_PIXEL
3813 call hadamard_ac_4x4_mmx2
3817 call hadamard_ac_4x4_mmx2
3818 sub r0, 4*SIZEOF_PIXEL
3821 call hadamard_ac_4x4_mmx2
3824 mova [rsp+gprsize+8], m5 ; save satd
3829 call hadamard_ac_2x2max_mmx2
3835 SUMSUB_BADC w, 0, 1, 2, 3, 4
3836 HADAMARD 0, sumsub, 0, 2, 4, 5
3837 ABSW2 m1, m3, m1, m3, m4, m5
3838 ABSW2 m0, m2, m0, m2, m4, m5
3839 HADAMARD 0, max, 1, 3, 4, 5
3850 %else ; !HIGH_BIT_DEPTH
3856 %endif ; HIGH_BIT_DEPTH
3857 mova [rsp+gprsize], m6 ; save sa8d
3862 %macro HADAMARD_AC_WXH_SUM_MMX 2
3863 mova m1, [rsp+1*mmsize]
3866 paddd m0, [rsp+2*mmsize]
3867 paddd m1, [rsp+3*mmsize]
3870 mova m2, [rsp+4*mmsize]
3871 paddd m1, [rsp+5*mmsize]
3872 paddd m2, [rsp+6*mmsize]
3874 paddd m1, [rsp+7*mmsize]
3881 %else ; !HIGH_BIT_DEPTH
3883 paddusw m0, [rsp+2*mmsize]
3884 paddusw m1, [rsp+3*mmsize]
3887 mova m2, [rsp+4*mmsize]
3888 paddusw m1, [rsp+5*mmsize]
3889 paddusw m2, [rsp+6*mmsize]
3891 paddusw m1, [rsp+7*mmsize]
3903 %endif ; HIGH_BIT_DEPTH
3906 %macro HADAMARD_AC_WXH_MMX 2
3907 cglobal pixel_hadamard_ac_%1x%2, 2,4
3908 %assign pad 16-gprsize-(stack_offset&15)
3914 call hadamard_ac_8x8_mmx2
3919 call hadamard_ac_8x8_mmx2
3924 lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
3926 call hadamard_ac_8x8_mmx2
3930 call hadamard_ac_8x8_mmx2
3933 HADAMARD_AC_WXH_SUM_MMX %1, %2
3941 add rsp, 128+%1*%2/4+pad
3943 %endmacro ; HADAMARD_AC_WXH_MMX
3945 HADAMARD_AC_WXH_MMX 16, 16
3946 HADAMARD_AC_WXH_MMX 8, 16
3947 HADAMARD_AC_WXH_MMX 16, 8
3948 HADAMARD_AC_WXH_MMX 8, 8
3950 %macro LOAD_INC_8x4W_SSE2 5
3959 %else ; !HIGH_BIT_DEPTH
3971 %endif ; HIGH_BIT_DEPTH
3974 %macro LOAD_INC_8x4W_SSSE3 5
3975 LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
3979 HSUMSUB %1, %2, %3, %4, %5
3982 %macro HADAMARD_AC_SSE2 0
3983 ; in: r0=pix, r1=stride, r2=stride*3
3984 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
3985 cglobal hadamard_ac_8x8
3991 %define spill0 [rsp+gprsize]
3992 %define spill1 [rsp+gprsize+mmsize]
3993 %define spill2 [rsp+gprsize+mmsize*2]
3997 %elif cpuflag(ssse3) && notcpuflag(atom)
3999 ;LOAD_INC loads sumsubs
4003 ;LOAD_INC only unpacks to words
4006 LOAD_INC_8x4W 0, 1, 2, 3, 7
4008 HADAMARD4_2D_SSE 0, 1, 2, 3, 4
4010 HADAMARD4_V 0, 1, 2, 3, 4
4014 LOAD_INC_8x4W 4, 5, 6, 7, 1
4016 HADAMARD4_2D_SSE 4, 5, 6, 7, 1
4018 HADAMARD4_V 4, 5, 6, 7, 1
4023 HADAMARD 1, sumsub, 0, 1, 6, 7
4024 HADAMARD 1, sumsub, 2, 3, 6, 7
4029 HADAMARD 1, sumsub, 4, 5, 1, 0
4030 HADAMARD 1, sumsub, 6, 7, 1, 0
4043 pand m1, [mask_ac4b]
4047 AC_PADD m1, m3, [pw_1]
4049 AC_PADD m1, m2, [pw_1]
4051 AC_PADD m1, m3, [pw_1]
4053 AC_PADD m1, m2, [pw_1]
4055 AC_PADD m1, m3, [pw_1]
4056 AC_PADD m1, m2, [pw_1]
4057 paddw m3, m7, spill2
4059 mova [rsp+gprsize+mmsize*2], m1 ; save satd
4060 paddw m2, m6, spill1
4062 paddw m1, m5, spill0
4069 HADAMARD %%x, amax, 3, 7, 4
4070 HADAMARD %%x, amax, 2, 6, 7, 4
4072 HADAMARD %%x, amax, 1, 5, 6, 7
4073 HADAMARD %%x, sumsub, 0, 4, 5, 6
4075 AC_PADD m2, m3, [pw_1]
4076 AC_PADD m2, m1, [pw_1]
4081 %endif ; HIGH_BIT_DEPTH
4085 AC_PADD m2, m4, [pw_1]
4086 AC_PADD m2, m0, [pw_1]
4087 mova [rsp+gprsize+mmsize], m2 ; save sa8d
4092 HADAMARD_AC_WXH_SSE2 16, 16
4093 HADAMARD_AC_WXH_SSE2 16, 8
4095 HADAMARD_AC_WXH_SSE2 8, 16
4096 HADAMARD_AC_WXH_SSE2 8, 8
4098 %endmacro ; HADAMARD_AC_SSE2
4100 %macro HADAMARD_AC_WXH_SUM_SSE2 2
4101 mova m1, [rsp+2*mmsize]
4104 paddd m0, [rsp+3*mmsize]
4105 paddd m1, [rsp+4*mmsize]
4108 paddd m0, [rsp+5*mmsize]
4109 paddd m1, [rsp+6*mmsize]
4110 paddd m0, [rsp+7*mmsize]
4111 paddd m1, [rsp+8*mmsize]
4116 %else ; !HIGH_BIT_DEPTH
4117 %if %1*%2*16/mmsize >= 128
4118 paddusw m0, [rsp+3*mmsize]
4119 paddusw m1, [rsp+4*mmsize]
4121 %if %1*%2*16/mmsize == 256
4122 paddusw m0, [rsp+5*mmsize]
4123 paddusw m1, [rsp+6*mmsize]
4124 paddusw m0, [rsp+7*mmsize]
4125 paddusw m1, [rsp+8*mmsize]
4129 vextracti128 xm2, m0, 1
4130 vextracti128 xm3, m1, 1
4136 %endif ; HIGH_BIT_DEPTH
4139 ; struct { int satd, sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
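; How the WxH wrappers below combine per-8x8 results, as a hedged C sketch
; (hadamard_ac_8x8_ref is a hypothetical helper assumed to return the 8x8 AC
; sum in its high 32 bits and the 4x4 AC sums in its low 32 bits; the asm
; instead keeps the two partial sums in separate stack slots): each 8x8
; sub-block is measured independently, then satd is halved and sa8d is
; quartered for normalization.
;
;   static uint64_t hadamard_ac_wxh_ref( const uint8_t *pix, int stride,
;                                        int w, int h )
;   {
;       uint64_t sum = 0;
;       for( int y = 0; y < h; y += 8 )
;           for( int x = 0; x < w; x += 8 )
;               sum += hadamard_ac_8x8_ref( pix + y*stride + x, stride );
;       return ((sum>>34)<<32) + ((uint32_t)sum>>1);  /* {sa8d>>2, satd>>1} */
;   }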
4140 %macro HADAMARD_AC_WXH_SSE2 2
4141 cglobal pixel_hadamard_ac_%1x%2, 2,4,11
4145 and rsp, ~(mmsize-1)
4148 call hadamard_ac_8x8
4153 call hadamard_ac_8x8
4155 %if %1==16 && mmsize <= 16
4158 lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
4160 call hadamard_ac_8x8
4164 call hadamard_ac_8x8
4167 HADAMARD_AC_WXH_SUM_SSE2 %1, %2
4170 shr edx, 2 - (%1*%2*16/mmsize >> 8)
4178 %endmacro ; HADAMARD_AC_WXH_SSE2
4182 %if ARCH_X86_64 == 0
4183 cextern pixel_sa8d_8x8_internal_mmx2
4188 %define TRANS TRANS_SSE2
4189 %define DIFFOP DIFF_UNPACK_SSE2
4190 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
4191 %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
4192 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
4193 %define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
4194 %define movdqu movups
4195 %define punpcklqdq movlhps
4202 %if HIGH_BIT_DEPTH == 0
4210 %if HIGH_BIT_DEPTH == 0
4220 %define DIFFOP DIFF_SUMSUB_SSSE3
4221 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
4222 %if HIGH_BIT_DEPTH == 0
4223 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
4224 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
4225 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
4234 %if HIGH_BIT_DEPTH == 0
4238 %undef movdqa ; nehalem doesn't like movaps
4239 %undef movdqu ; movups
4240 %undef punpcklqdq ; or movlhps
4241 %if HIGH_BIT_DEPTH == 0
4246 %define TRANS TRANS_SSE4
4247 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
4255 %if HIGH_BIT_DEPTH == 0
4260 ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
4261 ; it's effectively free.
4262 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
4269 %if HIGH_BIT_DEPTH == 0
4275 %define TRANS TRANS_XOP
4282 %if HIGH_BIT_DEPTH == 0
4284 ; no xop INTRA8_X9: it's slower than avx on bulldozer, for reasons unknown.
4289 %if HIGH_BIT_DEPTH == 0
4290 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
4291 %define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
4292 %define TRANS TRANS_SSE4
4299 %macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
4304 vinserti128 m%1, m%1, [r0+4*r1], 1
4305 vinserti128 m%3, m%3, [r2+4*r3], 1
4306 vinserti128 m%2, m%2, [r0+r4], 1
4307 vinserti128 m%4, m%4, [r2+r5], 1
4312 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
4320 vinserti128 m%3, m%3, [r0+4*r1], 1
4321 vinserti128 m%5, m%5, [r2+4*r3], 1
4322 vinserti128 m%4, m%4, [r0+r4], 1
4323 vinserti128 m%6, m%6, [r2+r5], 1
4328 DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
4331 %macro SATD_START_AVX2 2-3 0
4345 %define TRANS TRANS_SSE4
4347 cglobal pixel_satd_16x8_internal
4348 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
4349 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
4350 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
4351 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
4354 cglobal pixel_satd_16x16, 4,6,8
4355 SATD_START_AVX2 m6, m7
4356 call pixel_satd_16x8_internal
4359 pixel_satd_16x8_internal:
4360 call pixel_satd_16x8_internal
4361 vextracti128 xm0, m6, 1
4366 cglobal pixel_satd_16x8, 4,6,8
4367 SATD_START_AVX2 m6, m7
4368 jmp pixel_satd_16x8_internal
4370 cglobal pixel_satd_8x8_internal
4371 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
4372 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
4375 cglobal pixel_satd_8x16, 4,6,8
4376 SATD_START_AVX2 m6, m7, 1
4377 call pixel_satd_8x8_internal
4382 call pixel_satd_8x8_internal
4383 vextracti128 xm0, m6, 1
4388 cglobal pixel_satd_8x8, 4,6,8
4389 SATD_START_AVX2 m6, m7, 1
4390 call pixel_satd_8x8_internal
4391 vextracti128 xm0, m6, 1
4396 cglobal pixel_sa8d_8x8_internal
4397 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
4398 HADAMARD4_V 0, 1, 2, 3, 4
4399 HADAMARD 8, sumsub, 0, 1, 4, 5
4400 HADAMARD 8, sumsub, 2, 3, 4, 5
4401 HADAMARD 2, sumsub, 0, 1, 4, 5
4402 HADAMARD 2, sumsub, 2, 3, 4, 5
4403 HADAMARD 1, amax, 0, 1, 4, 5
4404 HADAMARD 1, amax, 2, 3, 4, 5
4409 cglobal pixel_sa8d_8x8, 4,6,8
4410 SATD_START_AVX2 m6, m7, 1
4411 call pixel_sa8d_8x8_internal
4412 vextracti128 xm1, m6, 1
4420 cglobal intra_sad_x9_8x8, 5,7,8
4421 %define pred(i,j) [rsp+i*0x40+j*0x20]
4426 movu m5, [r0+0*FENC_STRIDE]
4427 movu m6, [r0+4*FENC_STRIDE]
4428 punpcklqdq m5, [r0+2*FENC_STRIDE]
4429 punpcklqdq m6, [r0+6*FENC_STRIDE]
4431 ; save instruction size: avoid 4-byte memory offsets
4432 lea r0, [intra8x9_h1+128]
4433 %define off(m) (r0+m-(intra8x9_h1+128))
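; (With r0 anchored at intra8x9_h1+128, off(table) folds each table reference
;  into [r0+disp8] -- e.g. [off(intra8x9_h1)] becomes [r0-128] -- so every
;  pshufb below needs only a 1-byte displacement instead of a 4-byte absolute
;  address, provided the tables all sit within +/-128 bytes of that anchor.)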
4435 vpbroadcastq m0, [r2+16]
4442 vpbroadcastq m1, [r2+7]
4443 pshufb m3, m1, [off(intra8x9_h1)]
4444 pshufb m2, m1, [off(intra8x9_h3)]
4452 %define pred(i,j) [r5+i*0x40+j*0x20-0x100]
4454 ; combine the first two
4474 vbroadcasti128 m0, [r2+16]
4475 vbroadcasti128 m2, [r2+17]
4478 PRED4x4_LOWPASS m0, m1, m2, m0, m7
4479 pshufb m1, m0, [off(intra8x9_ddl1)]
4480 pshufb m2, m0, [off(intra8x9_ddl3)]
4489 vextracti128 xm1, m4, 1
4494 vinserti128 m7, m3, xm0, 1
4496 vbroadcasti128 m2, [r2+8]
4497 vbroadcasti128 m0, [r2+7]
4498 vbroadcasti128 m1, [r2+6]
4500 PRED4x4_LOWPASS m0, m1, m2, m0, m4
4501 pshufb m1, m0, [off(intra8x9_ddr1)]
4502 pshufb m2, m0, [off(intra8x9_ddr3)]
4511 %define off(m) (r0+m-(intra8x9_h1+256+128))
4512 %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0]
4514 vpblendd m2, m3, m0, 11110011b
4515 pshufb m1, m2, [off(intra8x9_vr1)]
4516 pshufb m2, m2, [off(intra8x9_vr3)]
4527 pblendw m2, m0, q3330
4529 pshufb m1, m2, [off(intra8x9_hd1)]
4530 pshufb m2, m0, [off(intra8x9_hd3)]
4540 pshufb m1, m7, [off(intra8x9_vl1)]
4541 pshufb m2, m7, [off(intra8x9_vl3)]
4550 vextracti128 xm1, m4, 1
4553 SBUTTERFLY qdq, 3, 4, 7
4557 vpbroadcastd m0, [r2+7]
4559 pshufb m1, m0, [off(intra8x9_hu1)]
4560 pshufb m2, m0, [off(intra8x9_hu3)]
4566 vextracti128 xm2, m1, 1
4574 add r2w, word [r3+16]
4586 add r1, 4*FDEC_STRIDE
4587 mova xm0, [rsp+r3+0x00]
4588 mova xm1, [rsp+r3+0x10]
4589 mova xm2, [rsp+r3+0x20]
4590 mova xm3, [rsp+r3+0x30]
4591 movq [r1+FDEC_STRIDE*-4], xm0
4592 movhps [r1+FDEC_STRIDE*-2], xm0
4593 movq [r1+FDEC_STRIDE*-3], xm1
4594 movhps [r1+FDEC_STRIDE*-1], xm1
4595 movq [r1+FDEC_STRIDE* 0], xm2
4596 movhps [r1+FDEC_STRIDE* 2], xm2
4597 movq [r1+FDEC_STRIDE* 1], xm3
4598 movhps [r1+FDEC_STRIDE* 3], xm3
4602 %endif ; HIGH_BIT_DEPTH
4604 ;=============================================================================
4605 ; SSIM
4606 ;=============================================================================
4608 ;-----------------------------------------------------------------------------
4609 ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
4610 ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
4611 ;-----------------------------------------------------------------------------
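; A hedged scalar sketch of the core's job (ssim_4x4x2_core_ref is a
; hypothetical name): for two horizontally adjacent 4x4 windows, accumulate
; the statistics SSIM needs -- sum of pix1, sum of pix2, sum of both squares,
; and the cross product.
;
;   static void ssim_4x4x2_core_ref( const uint8_t *pix1, intptr_t stride1,
;                                    const uint8_t *pix2, intptr_t stride2,
;                                    int sums[2][4] )
;   {
;       for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;       {
;           int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;           for( int y = 0; y < 4; y++ )
;               for( int x = 0; x < 4; x++ )
;               {
;                   int a = pix1[x+y*stride1];
;                   int b = pix2[x+y*stride2];
;                   s1  += a;
;                   s2  += b;
;                   ss  += a*a + b*b;
;                   s12 += a*b;
;               }
;           sums[z][0] = s1;
;           sums[z][1] = s2;
;           sums[z][2] = ss;
;           sums[z][3] = s12;
;       }
;   }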
4614 movdqu m5, [r0+(%1&1)*r1]
4615 movdqu m6, [r2+(%1&1)*r3]
4617 movq m5, [r0+(%1&1)*r1]
4618 movq m6, [r2+(%1&1)*r3]
4636 ACCUM paddd, 3, 5, %1
4637 ACCUM paddd, 4, 7, %1
4642 cglobal pixel_ssim_4x4x2_core, 4,4,8
4652 pshufd m5, m3, q2301
4655 pshufd m6, m4, q2301
4658 pshufd m1, m1, q3120
4661 punpckhdq m5, m3, m4
4677 ;-----------------------------------------------------------------------------
4678 ; float pixel_ssim_end4( int sum0[5][4], int sum1[5][4], int width )
4679 ;-----------------------------------------------------------------------------
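; A hedged C sketch of the per-position SSIM term evaluated here, for 8-bit
; depth (ssim_end1_ref is a hypothetical name; ssim_c1/ssim_c2 are the
; constants loaded below):
;
;   static float ssim_end1_ref( int s1, int s2, int ss, int s12 )
;   {
;       int vars  = ss*64  - s1*s1 - s2*s2;
;       int covar = s12*64 - s1*s2;
;       return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
;            / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
;   }
;
; pixel_ssim_end4 evaluates up to four such terms at once, where each
; position's s1/s2/ss/s12 is the sum of the four overlapping 4x4 windows
; sum0[i], sum0[i+1], sum1[i] and sum1[i+1], and returns their total.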
4680 cglobal pixel_ssim_end4, 3,3,7
4695 movdqa m5, [ssim_c1]
4696 movdqa m6, [ssim_c2]
4697 TRANSPOSE4x4D 0, 1, 2, 3, 4
4699 ; s1=m0, s2=m1, ss=m2, s12=m3
4705 mulps m2, [pf_64] ; ss*64
4706 mulps m3, [pf_128] ; s12*128
4708 mulps m4, m0 ; s1*s2
4709 mulps m1, m1 ; s2*s2
4710 mulps m0, m0 ; s1*s1
4711 addps m4, m4 ; s1*s2*2
4712 addps m0, m1 ; s1*s1 + s2*s2
4714 subps m3, m4 ; covar*2
4715 addps m4, m5 ; s1*s2*2 + ssim_c1
4716 addps m0, m5 ; s1*s1 + s2*s2 + ssim_c1
4717 addps m2, m6 ; vars + ssim_c2
4718 addps m3, m6 ; covar*2 + ssim_c2
4720 pmaddwd m4, m1, m0 ; s1*s2
4723 pmaddwd m0, m0 ; s1*s1 + s2*s2
4727 psubd m3, m4 ; covar*2
4733 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
4734 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
4735 cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
4736 cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
4743 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
4746 lea r3, [mask_ff + 16]
4747 movdqu m1, [r3 + r2*4]
4749 movdqu m1, [mask_ff + r2*4 + 16]
4755 pshuflw m4, m0, q0032
4757 %if ARCH_X86_64 == 0
4769 ;-----------------------------------------------------------------------------
4770 ; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
4771 ;-----------------------------------------------------------------------------
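; A hedged C sketch (pixel_asd8_ref is a hypothetical name; abs() is from
; <stdlib.h>): asd8 is the absolute value of the summed signed differences
; over an 8-pixel-wide column of rows -- unlike SAD, differences of opposite
; sign cancel before the absolute value is taken.
;
;   static int pixel_asd8_ref( const uint8_t *pix1, intptr_t stride1,
;                              const uint8_t *pix2, intptr_t stride2, int height )
;   {
;       int sum = 0;
;       for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < 8; x++ )
;               sum += pix1[x] - pix2[x];
;       return abs( sum );
;   }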
4773 cglobal pixel_asd8, 5,5
4836 ;=============================================================================
4837 ; Successive Elimination ADS
4838 ;=============================================================================
4847 lea r6, [r4+r5+(mmsize-1)]
4852 %macro ADS_END 1 ; unroll_size
4858 WIN64_RESTORE_XMM rsp
4862 lea r6, [r4+r5+(mmsize-1)]
4871 ;-----------------------------------------------------------------------------
4872 ; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
4873 ; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
4874 ;-----------------------------------------------------------------------------
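; A hedged C sketch of what the SIMD below vectorizes (pixel_ads4_ref is a
; hypothetical name; abs() is from <stdlib.h>): for each candidate position,
; compare the encoded block's four partial sums against the precomputed sums
; of the reference window, add the MV cost, and keep only the positions whose
; estimate beats the threshold.
;
;   static int pixel_ads4_ref( int enc_dc[4], uint16_t *sums, int delta,
;                              uint16_t *cost_mvx, int16_t *mvs,
;                              int width, int thresh )
;   {
;       int nmv = 0;
;       for( int i = 0; i < width; i++, sums++ )
;       {
;           int ads = abs( enc_dc[0] - sums[0] )
;                   + abs( enc_dc[1] - sums[8] )
;                   + abs( enc_dc[2] - sums[delta] )
;                   + abs( enc_dc[3] - sums[delta+8] )
;                   + cost_mvx[i];
;           if( ads < thresh )
;               mvs[nmv++] = i;
;       }
;       return nmv;
;   }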
4876 cglobal pixel_ads4, 5,7
4880 pshufw m6, m6, q2222
4882 pshufw m4, m4, q2222
4907 cglobal pixel_ads2, 5,7
4911 pshufw m6, m6, q2222
4928 cglobal pixel_ads1, 5,7
4951 cglobal pixel_ads4, 5,7,8
4952 vpbroadcastw m7, [r0+ 0]
4953 vpbroadcastw m6, [r0+ 4]
4954 vpbroadcastw m5, [r0+ 8]
4955 vpbroadcastw m4, [r0+12]
4957 cglobal pixel_ads4, 5,7,12
4959 pshuflw m7, m4, q0000
4960 pshuflw m6, m4, q2222
4961 pshufhw m5, m4, q0000
4962 pshufhw m4, m4, q2222
4968 %if ARCH_X86_64 && mmsize == 16
4981 movu m11, [r1+r2+16]
5011 vpbroadcastw m1, r6m
5022 vpermq m1, m1, q3120
5029 cglobal pixel_ads2, 5,7,8
5031 vpbroadcastw m7, [r0+0]
5032 vpbroadcastw m6, [r0+4]
5033 vpbroadcastw m5, r6m
5038 pshuflw m6, m6, q2222
5058 vpermq m1, m1, q3120
5065 cglobal pixel_ads1, 5,7,8
5067 vpbroadcastw m7, [r0]
5068 vpbroadcastw m6, r6m
5080 movu m1, [r1+mmsize]
5084 movu m3, [r3+mmsize]
5093 vpermq m4, m4, q3120
5108 ; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
5111 ; *(uint32_t*)(masks+width) = 0;
5112 ; for( i=0; i<width; i+=8 )
5114 ; uint64_t mask = *(uint64_t*)(masks+i);
5115 ; if( !mask ) continue;
5116 ; for( j=0; j<8; j++ )
5117 ; if( mask & ((uint64_t)255<<(j*8)) )
5125 test r2d, 0xff<<(%1*8)
5132 cglobal pixel_ads_mvs, 0,7,0
5137 ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
5177 cglobal pixel_ads_mvs, 0,7,0
5180 mova m4, [pw_76543210]
5187 %define GLOBAL +r1-$$
5195 xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
5196 movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
5198 ; shuffle counters based on mv mask
5199 pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
5202 paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}