1 ;*****************************************************************************
2 ;* pixel.asm: x86 pixel metrics
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2016 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Alex Izvorski <aizvorksi@gmail.com>
10 ;* Fiona Glaser <fiona@x264.com>
11 ;* Oskar Arvidsson <oskar@irock.se>
13 ;* This program is free software; you can redistribute it and/or modify
14 ;* it under the terms of the GNU General Public License as published by
15 ;* the Free Software Foundation; either version 2 of the License, or
16 ;* (at your option) any later version.
18 ;* This program is distributed in the hope that it will be useful,
19 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;* GNU General Public License for more details.
23 ;* You should have received a copy of the GNU General Public License
24 ;* along with this program; if not, write to the Free Software
25 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27 ;* This program is also available under a commercial proprietary license.
28 ;* For more information, contact us at licensing@x264.com.
29 ;*****************************************************************************
32 %include "x86util.asm"
35 hmul_16p: times 16 db 1
41 mask_ff: times 16 db 0xff
43 mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
44 mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
45 mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
47 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
48 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
49 pf_64: times 4 dd 64.0
50 pf_128: times 4 dd 128.0
52 ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64
53 ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63
55 ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
56 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
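; The constants above are the standard SSIM stabilizers C1 = (K1*L)^2 and
; C2 = (K2*L)^2 with K1 = 0.01, K2 = 0.03 and L the maximum pixel value,
; pre-multiplied by the scale factors noted in the comments (64, plus an
; extra 63 for C2), presumably to match the fixed-point sums in the SSIM
; kernel. Worked out for the 10-bit case:
;   C1 = 0.01*0.01 * 1023*1023 * 64      = 6697.7856
;   C2 = 0.03*0.03 * 1023*1023 * 64 * 63 = 3797644.4352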
58 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
59 mask_10: times 4 dw 0, -1
60 mask_1100: times 2 dd 0, -1
61 pb_pppm: times 4 db 1,1,1,-1
62 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
63 intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0
65 intrax9a_ddlr1: db 6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6
66 intrax9a_ddlr2: db 8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4
67 intrax9a_hdu1: db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0
68 intrax9a_hdu2: db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11
69 intrax9a_vrl1: db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8
70 intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
71 intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
72 intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
73 intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
74 intrax9a_lut: db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0
75 pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
76 pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
77 intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15
79 intrax9b_ddlr1: db 6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6
80 intrax9b_ddlr2: db 8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4
81 intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
82 intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
83 intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
84 intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
85 intrax9b_vh1: db 6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3
86 intrax9b_vh2: db 6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1
87 intrax9b_edge2: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
88 intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
89 intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
90 intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0
93 intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
94 intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
95 intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
96 intra8x9_h4: db 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0
97 intra8x9_ddl1: db 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10
98 intra8x9_ddl2: db 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11
99 intra8x9_ddl3: db 5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14
100 intra8x9_ddl4: db 6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15
101 intra8x9_vl1: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
102 intra8x9_vl2: db 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9
103 intra8x9_vl3: db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10
104 intra8x9_vl4: db 3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11
105 intra8x9_ddr1: db 8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13
106 intra8x9_ddr2: db 7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12
107 intra8x9_ddr3: db 4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9
108 intra8x9_ddr4: db 3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8
109 intra8x9_vr1: db 8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14
110 intra8x9_vr2: db 8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14
111 intra8x9_vr3: db 5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12
112 intra8x9_vr4: db 4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12
113 intra8x9_hd1: db 3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10
114 intra8x9_hd2: db 2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8
115 intra8x9_hd3: db 7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10
116 intra8x9_hd4: db 5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8
117 intra8x9_hu1: db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2
118 intra8x9_hu2: db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0
119 intra8x9_hu3: db 5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15
120 intra8x9_hu4: db 3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15
121 pw_s00112233: dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003
122 pw_s00001111: dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001
124 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
125 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
128 pd_f0: times 4 dd 0xffff0000
130 pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
133 %macro ADS_MVS_SHUFFLE 8
138 %assign y y>>((~y)&1)
147 ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7
168 ;=============================================================================
170 ;=============================================================================
173 ;-----------------------------------------------------------------------------
174 ; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
175 ;-----------------------------------------------------------------------------
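; As a rough C reference (a sketch for readers, not code x264 ships; strides
; are treated as pixel counts here for simplicity), the SSD loops below
; compute:
;
;   #include <stdint.h>
;   static int ssd_wxh( const uint16_t *pix1, intptr_t stride1,
;                       const uint16_t *pix2, intptr_t stride2, int w, int h )
;   {
;       int ssd = 0;
;       for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < w; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               ssd += d * d;
;           }
;       return ssd;
;   }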
177 cglobal pixel_ssd_%1x%2, 4,7,6
181 %define offset0_2 r1*2
184 %define offset1_2 r3*2
189 %define offset0_1 mmsize
191 %define offset0_3 r1+mmsize
192 %define offset1_1 mmsize
194 %define offset1_3 r3+mmsize
196 %define offset0_1 mmsize
197 %define offset0_2 mmsize*2
198 %define offset0_3 mmsize*3
199 %define offset1_1 mmsize
200 %define offset1_2 mmsize*2
201 %define offset1_3 mmsize*3
203 %assign %%n %2/(2*mmsize/%1)
210 mova m2, [r0+offset0_1]
211 mova m3, [r0+offset0_2]
212 mova m4, [r0+offset0_3]
214 psubw m2, [r2+offset1_1]
215 psubw m3, [r2+offset1_2]
216 psubw m4, [r2+offset1_3]
218 lea r0, [r0+r1*(%2/%%n)]
219 lea r2, [r2+r3*(%2/%%n)]
256 %endif ; HIGH_BIT_DEPTH
258 %if HIGH_BIT_DEPTH == 0
259 %macro SSD_LOAD_FULL 5
303 DEINTB %2, %1, %4, %3, 7
320 vinserti128 m%1, m%1, %4, 1
328 vinserti128 m%2, m%2, %6, 1
332 SBUTTERFLY bw, %1, %2, %3
335 %macro SSD_LOAD_HALF 5
336 LOAD 1, 2, [t0+%1], [t0+%3], 1
337 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
338 LOAD 3, 4, [t0+%1], [t0+%3], %5
339 JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
352 punpcklbw m%2, m%1, m%5
354 punpcklbw m%4, m%3, m%5
363 %macro SSD_CORE_SSE2 7-8
365 DEINTB %6, %1, %7, %2, %5
369 DEINTB %6, %3, %7, %4, %5
380 %macro SSD_CORE_SSSE3 7-8
382 punpckhbw m%6, m%1, m%2
383 punpckhbw m%7, m%3, m%4
400 SSD_LOAD_%1 %2,%3,%4,%5,%6
401 SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
408 ;-----------------------------------------------------------------------------
409 ; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
410 ;-----------------------------------------------------------------------------
413 %assign function_align 8
415 %assign function_align 16
417 cglobal pixel_ssd_%1x%2, 0,0,0
418 mov al, %1*%2/mmsize/2
421 jmp mangle(x264_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
426 DECLARE_REG_TMP 0,1,2,3
430 DECLARE_REG_TMP 1,2,3,4
439 %elifidn cpuname, sse2
449 SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
451 SSD_ITER FULL, 0, 0, t1, t3, 2
453 SSD_ITER HALF, 0, 0, t1, t3, 2
458 vextracti128 xm1, m0, 1
486 %define SSD_CORE SSD_CORE_SSE2
487 %define JOIN JOIN_SSE2
494 %define SSD_CORE SSD_CORE_SSSE3
495 %define JOIN JOIN_SSSE3
517 %define LOAD LOAD_AVX2
518 %define JOIN JOIN_AVX2
522 %assign function_align 16
523 %endif ; !HIGH_BIT_DEPTH
525 ;-----------------------------------------------------------------------------
526 ; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
527 ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
529 ; The maximum width this function can handle without risk of overflow is given
530 ; in the following equation: (mmsize in bits)
532 ; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
534 ; For 10-bit pixels the limit works out to 16416 for MMX and 32832 for XMM.
535 ; At sane distortion levels it will take much more than that though.
536 ;-----------------------------------------------------------------------------
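; A hypothetical C helper (for illustration only) that simply evaluates the
; bound above:
;
;   #include <stdint.h>
;   static uint64_t ssd_nv12_width_limit( int mmsize_bits, int bit_depth )
;   {
;       uint64_t max_sq = ((1u << bit_depth) - 1) * (uint64_t)((1u << bit_depth) - 1);
;       return 2 * (mmsize_bits / 32) * 0xFFFFFFFFull / max_sq;
;   }
;   // ssd_nv12_width_limit( 64, 10 )  == 16416
;   // ssd_nv12_width_limit( 128, 10 ) == 32832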
539 cglobal pixel_ssd_nv12_core, 6,7,7
555 mova m1, [r0+r6+mmsize]
557 psubw m1, [r2+r6+mmsize]
558 PSHUFLW m0, m0, q3120
559 PSHUFLW m1, m1, q3120
561 pshufhw m0, m0, q3120
562 pshufhw m1, m1, q3120
565 pmadcswd m2, m0, m0, m2
566 pmadcswd m3, m1, m1, m3
575 %if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
580 %if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
581 ; equation above, putting the width limit at 8208
590 %else ; unfortunately paddq is sse2
591 ; emulate 48 bit precision for mmx2 instead
610 vextracti128 xm0, m4, 1
616 %else ; fixup for mmx2
617 SBUTTERFLY dq, 4, 5, 0
622 SBUTTERFLY dq, 0, 5, 4
630 %endif ; HIGH_BIT_DEPTH
632 %if HIGH_BIT_DEPTH == 0
633 ;-----------------------------------------------------------------------------
634 ; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
635 ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
637 ; This implementation can potentially overflow on image widths >= 11008 (or
638 ; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
639 ; 20). At sane distortion levels it will take much more than that though.
640 ;-----------------------------------------------------------------------------
642 cglobal pixel_ssd_nv12_core, 6,7
653 %if mmsize == 32 ; only 16-byte alignment is guaranteed
666 pmadcswd m4, m2, m2, m4
667 pmadcswd m3, m0, m0, m3
676 %if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
679 pandn m0, m1, m0 ; zero the lower half
700 %endif ; !HIGH_BIT_DEPTH
713 ;=============================================================================
715 ;=============================================================================
719 pxor m6, m6 ; sum squared
720 %if HIGH_BIT_DEPTH == 0
726 %endif ; !HIGH_BIT_DEPTH
730 %if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
768 mova m4, [r0+%1+mmsize]
769 %else ; !HIGH_BIT_DEPTH
776 %endif ; HIGH_BIT_DEPTH
787 ;-----------------------------------------------------------------------------
788 ; int pixel_var_wxh( uint8_t *, intptr_t )
789 ;-----------------------------------------------------------------------------
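; In C terms the VAR_* code below accumulates the pixel sum and the sum of
; squares; the variance itself, sqr - (sum*sum >> log2(w*h)), is formed from
; those two values. A hedged sketch (the packing of the two sums into one
; return value is an assumption here):
;
;   #include <stdint.h>
;   static uint64_t var_wxh( const uint8_t *pix, intptr_t stride, int w, int h )
;   {
;       uint32_t sum = 0, sqr = 0;
;       for( int y = 0; y < h; y++, pix += stride )
;           for( int x = 0; x < w; x++ )
;           {
;               sum += pix[x];
;               sqr += pix[x] * pix[x];
;           }
;       return sum + ((uint64_t)sqr << 32);
;   }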
791 cglobal pixel_var_16x16, 2,3
794 VAR_2ROW 8*SIZEOF_PIXEL, 16
797 cglobal pixel_var_8x16, 2,3
803 cglobal pixel_var_8x8, 2,3
811 cglobal pixel_var_16x16, 2,3,8
817 cglobal pixel_var_8x8, 2,3,8
840 %endif ; HIGH_BIT_DEPTH
842 %if HIGH_BIT_DEPTH == 0
844 cglobal pixel_var_16x16, 2,3,8
857 cglobal pixel_var_8x8, 2,4,8
873 cglobal pixel_var_8x16, 2,4,8
896 %endif ; !HIGH_BIT_DEPTH
899 cglobal pixel_var_16x16, 2,4,7
913 pmovzxbw m1, [r0+r1*2]
920 vextracti128 xm0, m5, 1
921 vextracti128 xm1, m6, 1
943 sub eax, r1d ; sqr - (sum * sum >> shift)
947 ;-----------------------------------------------------------------------------
948 ; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
949 ;-----------------------------------------------------------------------------
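; The VAR2 variants work on the difference between two blocks: accumulate the
; signed per-pixel difference and its square, store the sum of squared
; differences through the int * argument, and return ssd - (sum*sum >> 6) for
; 8x8 (shift 7 for 8x16). A rough sketch of the 8x8 case, assuming that
; interface:
;
;   #include <stdint.h>
;   static int var2_8x8( const uint8_t *a, intptr_t sa,
;                        const uint8_t *b, intptr_t sb, int *ssd_out )
;   {
;       int sum = 0, ssd = 0;
;       for( int y = 0; y < 8; y++, a += sa, b += sb )
;           for( int x = 0; x < 8; x++ )
;           {
;               int d = a[x] - b[x];
;               sum += d;
;               ssd += d * d;
;           }
;       *ssd_out = ssd;
;       return ssd - ((sum * sum) >> 6);
;   }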
950 %macro VAR2_8x8_MMX 2
951 cglobal pixel_var2_8x%1, 5,6
960 psubw m1, [r2+mmsize]
961 %else ; !HIGH_BIT_DEPTH
972 %endif ; HIGH_BIT_DEPTH
992 %macro VAR2_8x8_SSE2 2
993 cglobal pixel_var2_8x%1, 5,6,8
1002 %else ; !HIGH_BIT_DEPTH
1007 DEINTB 0, 1, 2, 3, 7
1008 %endif ; HIGH_BIT_DEPTH
1017 lea r0, [r0+r1*2*SIZEOF_PIXEL]
1018 lea r2, [r2+r3*2*SIZEOF_PIXEL]
1028 %if HIGH_BIT_DEPTH == 0
1029 %macro VAR2_8x8_SSSE3 2
1030 cglobal pixel_var2_8x%1, 5,6,8
1032 pxor m6, m6 ; sum squared
1075 VAR2_8x8_SSSE3 16, 7
1078 VAR2_8x8_SSSE3 16, 7
1080 %macro VAR2_8x8_AVX2 2
1081 cglobal pixel_var2_8x%1, 5,6,6
1083 pxor m4, m4 ; sum squared
1089 vinserti128 m0, m0, [r0+r1], 1
1090 vinserti128 m1, m1, [r2+r3], 1
1096 vinserti128 m1, m1, [r0+r1], 1
1097 vinserti128 m2, m2, [r2+r3], 1
1111 vextracti128 xm0, m3, 1
1112 vextracti128 xm1, m4, 1
1115 VAR2_END %2, xm3, xm4
1122 %endif ; !HIGH_BIT_DEPTH
1124 ;=============================================================================
1126 ;=============================================================================
1130 ; just use shufps on anything post conroe
1132 %elif cpuflag(ssse3) && notcpuflag(atom)
1133 ; join 2x 32 bit and duplicate them
1134 ; emulating shufps is faster on conroe
1138 ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
1150 %macro DIFF_UNPACK_SSE2 5
1159 %macro DIFF_SUMSUB_SSSE3 5
1160 HSUMSUB %1, %2, %3, %4, %5
1165 %macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
1171 %macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
1178 %macro LOAD_DUP_4x8P_PENRYN 8
1179 ; penryn and nehalem run punpcklqdq and movddup in different units
1188 %macro LOAD_SUMSUB_8x2P 9
1189 LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
1190 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1193 %macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
1194 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
1195 LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
1196 LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
1203 %macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
1209 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1212 %macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
1215 DEINTB %1, %2, %3, %4, %5
1218 SUMSUB_BA w, %1, %2, %3
1221 %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
1222 ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
1223 LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
1224 LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
1225 LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
1226 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
1229 %macro LOAD_SUMSUB_16x2P_AVX2 9
1230 ; 2*dst, 2*tmp, mul, 4*ptr
1231 vbroadcasti128 m%1, [%6]
1232 vbroadcasti128 m%3, [%7]
1233 vbroadcasti128 m%2, [%8]
1234 vbroadcasti128 m%4, [%9]
1235 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1238 %macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
1239 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
1240 LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
1241 LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
1248 %macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
1253 vpermq m%3, m%3, q0011
1254 vpermq m%4, m%4, q0011
1255 vpermq m%1, m%1, q0011
1256 vpermq m%2, m%2, q0011
1259 %macro LOAD_SUMSUB8_16x2P_AVX2 9
1260 ; 2*dst, 2*tmp, mul, 4*ptr
1261 LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
1262 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1265 %macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
1266 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
1267 LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
1268 LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
1275 ; in: r4=3*stride1, r5=3*stride2
1276 ; in: %2 = horizontal offset
1277 ; in: %3 = whether we need to increment pix1 and pix2
1280 %macro SATD_4x4_MMX 3
1282 %assign offset %2*SIZEOF_PIXEL
1283 LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
1284 LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
1285 LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
1286 LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
1291 HADAMARD4_2D 4, 5, 6, 7, 3, %%n
1296 ; in: %1 = horizontal if 0, vertical if 1
1297 %macro SATD_8x4_SSE 8-9
1299 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
1301 HADAMARD4_V %2, %3, %4, %5, %6
1302 ; doing the abs first is a slight advantage
1303 ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
1304 ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
1305 HADAMARD 1, max, %2, %4, %6, %7
1315 HADAMARD 1, max, %3, %5, %6, %7
1320 %macro SATD_START_MMX 0
1322 lea r4, [3*r1] ; 3*stride1
1323 lea r5, [3*r3] ; 3*stride2
1326 %macro SATD_END_MMX 0
1330 %else ; !HIGH_BIT_DEPTH
1331 pshufw m1, m0, q1032
1333 pshufw m1, m0, q2301
1337 %endif ; HIGH_BIT_DEPTH
1341 ; FIXME avoid the spilling of regs to hold 3*stride.
1342 ; for small blocks on x86_32, modify pixel pointer instead.
1344 ;-----------------------------------------------------------------------------
1345 ; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1346 ;-----------------------------------------------------------------------------
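; SATD here is the sum of absolute values of a 2D 4x4 Hadamard transform of
; the difference block (x264 additionally halves the final total since the
; transform is unnormalized; that detail is glossed over in this sketch).
; Roughly, in C:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static int satd_4x4( const uint8_t *a, intptr_t sa,
;                        const uint8_t *b, intptr_t sb )
;   {
;       int d[4][4], t[4], sum = 0;
;       for( int y = 0; y < 4; y++, a += sa, b += sb )
;           for( int x = 0; x < 4; x++ )
;               d[y][x] = a[x] - b[x];
;       for( int y = 0; y < 4; y++ )            // horizontal butterflies
;       {
;           t[0] = d[y][0] + d[y][1]; t[1] = d[y][0] - d[y][1];
;           t[2] = d[y][2] + d[y][3]; t[3] = d[y][2] - d[y][3];
;           d[y][0] = t[0] + t[2]; d[y][2] = t[0] - t[2];
;           d[y][1] = t[1] + t[3]; d[y][3] = t[1] - t[3];
;       }
;       for( int x = 0; x < 4; x++ )            // vertical butterflies + abs
;       {
;           t[0] = d[0][x] + d[1][x]; t[1] = d[0][x] - d[1][x];
;           t[2] = d[2][x] + d[3][x]; t[3] = d[2][x] - d[3][x];
;           sum += abs( t[0] + t[2] ) + abs( t[0] - t[2] )
;                + abs( t[1] + t[3] ) + abs( t[1] - t[3] );
;       }
;       return sum;
;   }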
1348 cglobal pixel_satd_16x4_internal
1349 SATD_4x4_MMX m2, 0, 0
1350 SATD_4x4_MMX m1, 4, 0
1352 SATD_4x4_MMX m2, 8, 0
1354 SATD_4x4_MMX m1, 12, 0
1359 cglobal pixel_satd_8x8_internal
1360 SATD_4x4_MMX m2, 0, 0
1361 SATD_4x4_MMX m1, 4, 1
1364 pixel_satd_8x4_internal_mmx2:
1365 SATD_4x4_MMX m2, 0, 0
1366 SATD_4x4_MMX m1, 4, 0
1372 %macro SATD_MxN_MMX 3
1373 cglobal pixel_satd_%1x%2, 4,7
1376 call pixel_satd_%1x%3_internal_mmx2
1383 call pixel_satd_%1x%3_internal_mmx2
1394 SATD_MxN_MMX 16, 16, 4
1395 SATD_MxN_MMX 16, 8, 4
1396 SATD_MxN_MMX 8, 16, 8
1397 %endif ; HIGH_BIT_DEPTH
1399 %if HIGH_BIT_DEPTH == 0
1400 cglobal pixel_satd_16x16, 4,6
1404 call pixel_satd_16x4_internal_mmx2
1408 call pixel_satd_16x4_internal_mmx2
1413 cglobal pixel_satd_16x8, 4,6
1416 call pixel_satd_16x4_internal_mmx2
1419 call pixel_satd_16x4_internal_mmx2
1422 cglobal pixel_satd_8x16, 4,6
1425 call pixel_satd_8x8_internal_mmx2
1428 call pixel_satd_8x8_internal_mmx2
1430 %endif ; !HIGH_BIT_DEPTH
1432 cglobal pixel_satd_8x8, 4,6
1435 call pixel_satd_8x8_internal_mmx2
1438 cglobal pixel_satd_8x4, 4,6
1441 call pixel_satd_8x4_internal_mmx2
1444 cglobal pixel_satd_4x16, 4,6
1446 SATD_4x4_MMX m0, 0, 1
1447 SATD_4x4_MMX m1, 0, 1
1449 SATD_4x4_MMX m1, 0, 1
1451 SATD_4x4_MMX m1, 0, 0
1455 cglobal pixel_satd_4x8, 4,6
1457 SATD_4x4_MMX m0, 0, 1
1458 SATD_4x4_MMX m1, 0, 0
1462 cglobal pixel_satd_4x4, 4,6
1464 SATD_4x4_MMX m0, 0, 0
1467 %macro SATD_START_SSE2 2-3 0
1469 %if HIGH_BIT_DEPTH && %3
1471 %elif cpuflag(ssse3) && notcpuflag(atom)
1483 %macro SATD_END_SSE2 1-2
1504 %macro BACKUP_POINTERS 0
1514 %macro RESTORE_AND_INC_POINTERS 0
1516 lea r0, [r6+8*SIZEOF_PIXEL]
1517 lea r2, [r7+8*SIZEOF_PIXEL]
1524 add r0, 8*SIZEOF_PIXEL
1525 add r2, 8*SIZEOF_PIXEL
1529 %macro SATD_4x8_SSE 3
1535 movhps m0, [r0+4*r1]
1536 movhps m4, [r2+4*r3]
1544 movhps m1, [r0+1*r1]
1545 movhps m5, [r2+1*r3]
1546 movhps m2, [r0+2*r1]
1547 movhps m6, [r2+2*r3]
1553 %else ; !HIGH_BIT_DEPTH
1572 DIFFOP 0, 4, 1, 5, 3
1574 DIFFOP 0, 4, 1, 5, 7
1590 DIFFOP 2, 6, 3, 5, 4
1592 DIFFOP 2, 6, 3, 5, 7
1594 %endif ; HIGH_BIT_DEPTH
1595 SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
1598 ;-----------------------------------------------------------------------------
1599 ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
1600 ;-----------------------------------------------------------------------------
1602 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
1604 %if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
1605 cglobal pixel_satd_4x4, 4, 6, 6
1608 LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
1609 LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
1610 LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
1611 LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
1612 DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
1613 HADAMARD 0, sumsub, 0, 1, 2, 3
1614 HADAMARD 4, sumsub, 0, 1, 2, 3
1615 HADAMARD 1, amax, 0, 1, 2, 3
1621 cglobal pixel_satd_4x8, 4, 6, 8
1626 SATD_4x8_SSE vertical, 0, swap
1631 cglobal pixel_satd_4x16, 4, 6, 8
1636 SATD_4x8_SSE vertical, 0, swap
1637 lea r0, [r0+r1*2*SIZEOF_PIXEL]
1638 lea r2, [r2+r3*2*SIZEOF_PIXEL]
1639 SATD_4x8_SSE vertical, 1, add
1644 cglobal pixel_satd_8x8_internal
1645 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
1646 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
1647 %%pixel_satd_8x4_internal:
1648 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
1649 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
1652 ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
1653 ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
1654 %if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
1655 cglobal pixel_satd_16x4_internal
1656 LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
1659 ; always use horizontal mode here
1660 SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
1661 SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
1664 cglobal pixel_satd_16x8, 4,6,12
1665 SATD_START_SSE2 m10, m7
1669 jmp %%pixel_satd_16x8_internal
1671 cglobal pixel_satd_16x16, 4,6,12
1672 SATD_START_SSE2 m10, m7
1676 call pixel_satd_16x4_internal
1677 call pixel_satd_16x4_internal
1678 %%pixel_satd_16x8_internal:
1679 call pixel_satd_16x4_internal
1680 call pixel_satd_16x4_internal
1683 cglobal pixel_satd_16x8, 4,6,8
1684 SATD_START_SSE2 m6, m7
1686 call pixel_satd_8x8_internal
1687 RESTORE_AND_INC_POINTERS
1688 call pixel_satd_8x8_internal
1691 cglobal pixel_satd_16x16, 4,6,8
1692 SATD_START_SSE2 m6, m7, 1
1694 call pixel_satd_8x8_internal
1695 call pixel_satd_8x8_internal
1696 SATD_ACCUM m6, m0, m7
1697 RESTORE_AND_INC_POINTERS
1698 call pixel_satd_8x8_internal
1699 call pixel_satd_8x8_internal
1700 SATD_END_SSE2 m6, m7
1703 cglobal pixel_satd_8x16, 4,6,8
1704 SATD_START_SSE2 m6, m7
1705 call pixel_satd_8x8_internal
1706 call pixel_satd_8x8_internal
1709 cglobal pixel_satd_8x8, 4,6,8
1710 SATD_START_SSE2 m6, m7
1711 call pixel_satd_8x8_internal
1714 cglobal pixel_satd_8x4, 4,6,8
1715 SATD_START_SSE2 m6, m7
1716 call %%pixel_satd_8x4_internal
1718 %endmacro ; SATDS_SSE2
1733 %endif ; HIGH_BIT_DEPTH
1737 ; sse2 doesn't seem to like the horizontal way of doing things
1738 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
1741 ;-----------------------------------------------------------------------------
1742 ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
1743 ;-----------------------------------------------------------------------------
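; sa8d is the same idea extended to a full 8x8 Hadamard of the difference
; block, meant to model the 8x8 transform better than summing four 4x4 SATDs.
; A compact C sketch (the final rounding shift mirrors the C fallback):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static void hadamard8( int v[8] )
;   {
;       for( int s = 1; s < 8; s <<= 1 )
;           for( int i = 0; i < 8; i += 2*s )
;               for( int j = i; j < i+s; j++ )
;               {
;                   int p = v[j], q = v[j+s];
;                   v[j] = p + q; v[j+s] = p - q;
;               }
;   }
;   static int sa8d_8x8( const uint8_t *a, intptr_t sa,
;                        const uint8_t *b, intptr_t sb )
;   {
;       int d[8][8], col[8], sum = 0;
;       for( int y = 0; y < 8; y++, a += sa, b += sb )
;           for( int x = 0; x < 8; x++ )
;               d[y][x] = a[x] - b[x];
;       for( int y = 0; y < 8; y++ )
;           hadamard8( d[y] );
;       for( int x = 0; x < 8; x++ )
;       {
;           for( int y = 0; y < 8; y++ ) col[y] = d[y][x];
;           hadamard8( col );
;           for( int y = 0; y < 8; y++ ) sum += abs( col[y] );
;       }
;       return ( sum + 2 ) >> 2;
;   }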
1744 cglobal pixel_sa8d_8x8_internal
1747 LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
1748 LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
1750 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
1752 HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
1760 cglobal pixel_sa8d_8x8, 4,8,12
1767 call pixel_sa8d_8x8_internal
1772 %endif ; HIGH_BIT_DEPTH
1778 cglobal pixel_sa8d_16x16, 4,8,12
1785 call pixel_sa8d_8x8_internal ; pix[0]
1786 add r2, 8*SIZEOF_PIXEL
1787 add r0, 8*SIZEOF_PIXEL
1792 call pixel_sa8d_8x8_internal ; pix[8]
1796 call pixel_sa8d_8x8_internal ; pix[8*stride+8]
1797 sub r2, 8*SIZEOF_PIXEL
1798 sub r0, 8*SIZEOF_PIXEL
1800 call pixel_sa8d_8x8_internal ; pix[8*stride]
1803 %if HIGH_BIT_DEPTH == 0
1813 cglobal pixel_sa8d_8x8_internal
1814 %define spill0 [esp+4]
1815 %define spill1 [esp+20]
1816 %define spill2 [esp+36]
1818 LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
1819 HADAMARD4_2D 0, 1, 2, 3, 4
1821 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
1822 HADAMARD4_2D 4, 5, 6, 7, 3
1823 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
1826 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
1829 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
1830 ; could do first HADAMARD4_V here to save spilling later
1831 ; surprisingly, not a win on conroe or even p4
1836 LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
1837 HADAMARD4_V 4, 5, 6, 7, 3
1843 HADAMARD4_V 0, 1, 2, 3, 7
1844 SUMSUB_BADC w, 0, 4, 1, 5, 7
1845 HADAMARD 2, sumsub, 0, 4, 7, 6
1846 HADAMARD 2, sumsub, 1, 5, 7, 6
1847 HADAMARD 1, amax, 0, 4, 7, 6
1848 HADAMARD 1, amax, 1, 5, 7, 6
1852 SUMSUB_BADC w, 2, 6, 3, 7, 4
1853 HADAMARD 2, sumsub, 2, 6, 4, 5
1854 HADAMARD 2, sumsub, 3, 7, 4, 5
1855 HADAMARD 1, amax, 2, 6, 4, 5
1856 HADAMARD 1, amax, 3, 7, 4, 5
1857 %endif ; sse2/non-sse2
1862 %endif ; ifndef mmx2
1864 cglobal pixel_sa8d_8x8, 4,7
1871 call pixel_sa8d_8x8_internal
1876 %endif ; HIGH_BIT_DEPTH
1883 cglobal pixel_sa8d_16x16, 4,7
1890 call pixel_sa8d_8x8_internal
1899 call pixel_sa8d_8x8_internal
1902 add r0, 8*SIZEOF_PIXEL
1903 add r2, 8*SIZEOF_PIXEL
1906 call pixel_sa8d_8x8_internal
1913 mova [esp+64-mmsize], m0
1914 call pixel_sa8d_8x8_internal
1917 %else ; !HIGH_BIT_DEPTH
1918 paddusw m0, [esp+64-mmsize]
1935 %endif ; HIGH_BIT_DEPTH
1941 %endif ; !ARCH_X86_64
1944 ;=============================================================================
1946 ;=============================================================================
1948 ; %1: vertical/horizontal mode
1949 ; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
1951 ; m6, m11-15: tmp regs
1952 %macro SA8D_SATD_8x4 5
1954 LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
1955 HADAMARD 0, sumsub, %2, %3, 6
1956 HADAMARD 0, sumsub, %4, %5, 6
1957 SBUTTERFLY wd, %2, %3, 6
1958 SBUTTERFLY wd, %4, %5, 6
1959 HADAMARD2_2D %2, %4, %3, %5, 6, dq
1965 HADAMARD 0, sumsub, %2, %3, 6
1966 HADAMARD 0, sumsub, %4, %5, 6
1967 SBUTTERFLY qdq, 12, 13, 6
1968 HADAMARD 0, amax, 12, 13, 6
1969 SBUTTERFLY qdq, 14, 15, 6
1971 HADAMARD 0, amax, 14, 15, 6
1974 LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
1975 HADAMARD4_V %2, %3, %4, %5, 6
1977 pabsw m12, m%2 ; doing the abs first is a slight advantage
1981 HADAMARD 1, max, 12, 14, 6, 11
1983 HADAMARD 1, max, 13, 15, 6, 11
1986 %endmacro ; SA8D_SATD_8x4
1988 ; %1: add spilled regs?
1990 %macro SA8D_SATD_ACCUM 2
2011 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
2012 cglobal pixel_sa8d_satd_8x8_internal
2013 SA8D_SATD_8x4 vertical, 0, 1, 2, 3
2014 SA8D_SATD_8x4 vertical, 4, 5, 8, 9
2016 %if vertical ; sse2-style
2017 HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
2018 HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
2019 %else ; complete sa8d
2020 SUMSUB_BADC w, 0, 4, 1, 5, 12
2021 HADAMARD 2, sumsub, 0, 4, 12, 11
2022 HADAMARD 2, sumsub, 1, 5, 12, 11
2023 SUMSUB_BADC w, 2, 8, 3, 9, 12
2024 HADAMARD 2, sumsub, 2, 8, 12, 11
2025 HADAMARD 2, sumsub, 3, 9, 12, 11
2026 HADAMARD 1, amax, 0, 4, 12, 11
2027 HADAMARD 1, amax, 1, 5, 12, 4
2028 HADAMARD 1, amax, 2, 8, 12, 4
2029 HADAMARD 1, amax, 3, 9, 12, 4
2032 ; create sa8d sub results
2040 ;-------------------------------------------------------------------------------
2041 ; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
2042 ;-------------------------------------------------------------------------------
2043 cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
2044 %define temp0 [rsp+0*mmsize]
2045 %define temp1 [rsp+1*mmsize]
2055 call pixel_sa8d_satd_8x8_internal
2056 SA8D_SATD_ACCUM 0, 1
2057 call pixel_sa8d_satd_8x8_internal
2058 SA8D_SATD_ACCUM 1, 0
2059 vextracti128 xm1, m0, 1
2060 vextracti128 xm2, m10, 1
2064 lea r6, [r2+8*SIZEOF_PIXEL]
2065 lea r7, [r0+8*SIZEOF_PIXEL]
2067 call pixel_sa8d_satd_8x8_internal
2068 SA8D_SATD_ACCUM 0, 1
2069 call pixel_sa8d_satd_8x8_internal
2070 SA8D_SATD_ACCUM 1, 1
2075 call pixel_sa8d_satd_8x8_internal
2076 SA8D_SATD_ACCUM 1, 1
2077 call pixel_sa8d_satd_8x8_internal
2078 SA8D_SATD_ACCUM 1, 0
2081 ; xop already has fast horizontal sums
2082 %if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
2083 pmaddwd xm10, [pw_1]
2085 phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2
2086 pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1
2087 paddd xm0, xm1 ; sa8d sa8d satd satd
2106 %endmacro ; SA8D_SATD
2108 ;=============================================================================
2110 ;=============================================================================
2121 ; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
2122 ; and are only retained for old cpus.
2123 %macro INTRA_SA8D_SSE2 0
2125 ;-----------------------------------------------------------------------------
2126 ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
2127 ;-----------------------------------------------------------------------------
2128 cglobal intra_sa8d_x3_8x8, 3,3,14
2131 movq m0, [r0+0*FENC_STRIDE]
2132 movq m1, [r0+1*FENC_STRIDE]
2133 movq m2, [r0+2*FENC_STRIDE]
2134 movq m3, [r0+3*FENC_STRIDE]
2135 movq m4, [r0+4*FENC_STRIDE]
2136 movq m5, [r0+5*FENC_STRIDE]
2137 movq m6, [r0+6*FENC_STRIDE]
2138 movq m7, [r0+7*FENC_STRIDE]
2148 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
2150 ABSW2 m8, m9, m2, m3, m2, m3
2151 ABSW2 m10, m11, m4, m5, m4, m5
2154 ABSW2 m10, m11, m6, m7, m6, m7
2161 ; 1D hadamard of edges
2167 HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
2168 HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
2169 pshuflw m10, m8, q2301
2170 pshuflw m11, m9, q2301
2171 pshufhw m10, m10, q2301
2172 pshufhw m11, m11, q2301
2173 pmullw m8, [pw_pmpmpmpm]
2174 pmullw m11, [pw_pmpmpmpm]
2184 psllw m8, 3 ; left edge
2187 ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum
2196 punpcklqdq m0, m4 ; transpose
2197 psllw m9, 3 ; top edge
2198 psrldq m2, m13, 2 ; 8x7 sum
2199 psubw m0, m9 ; 8x1 sum
2208 punpckhdq m3, m2, m8
2210 pshufd m5, m13, q3311
2213 punpckhqdq m0, m2, m5
2218 movq [r2], m0 ; i8x8_v, i8x8_h
2220 movd [r2+8], m0 ; i8x8_dc
2222 %endif ; ARCH_X86_64
2223 %endmacro ; INTRA_SA8D_SSE2
2226 ; out: m0..m3 = hadamard coefs
2228 cglobal hadamard_load
2229 ; not really a global, but otherwise cycles get attributed to the wrong function in profiling
2231 mova m0, [r0+0*FENC_STRIDEB]
2232 mova m1, [r0+1*FENC_STRIDEB]
2233 mova m2, [r0+2*FENC_STRIDEB]
2234 mova m3, [r0+3*FENC_STRIDEB]
2237 movd m0, [r0+0*FENC_STRIDE]
2238 movd m1, [r0+1*FENC_STRIDE]
2239 movd m2, [r0+2*FENC_STRIDE]
2240 movd m3, [r0+3*FENC_STRIDE]
2246 HADAMARD4_2D 0, 1, 2, 3, 4
2250 %macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
2253 mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
2255 movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
2261 shl %2d, 5 ; log(FDEC_STRIDEB)
2263 movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
2264 pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
2265 pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
2266 pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
2267 %if HIGH_BIT_DEPTH == 0
2275 %define %%sign psignw
2277 %define %%sign pmullw
2279 pshufw %4, %3, q1032
2280 %%sign %4, [pw_ppmmppmm]
2282 pshufw %4, %3, q2301
2283 %%sign %4, [pw_pmpmpmpm]
2286 mova [%1_1d+2*%2], %3
2289 %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
2291 pshufw %4, %1, q1032
2292 pshufw %5, %2, q1032
2293 pshufw %6, %3, q1032
2300 pshufw %4, %1, q1032
2301 pshufw %5, %2, q1032
2302 pshufw %6, %3, q1032
2312 ABSW2 m4, m5, m1, m2, m1, m2
2319 ; out: m0 v, m4 h, m5 dc
2321 %macro SUM4x3 3 ; dc, left, top
2332 punpckldq m0, m2 ; transpose
2334 ABSW2 m4, m5, m4, m5, m2, m3 ; 1x4 sum
2335 ABSW m0, m0, m1 ; 4x1 sum
2338 %macro INTRA_X3_MMX 0
2339 ;-----------------------------------------------------------------------------
2340 ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
2341 ;-----------------------------------------------------------------------------
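; Hedged outline of what the x3 functions produce: the SATD cost of fenc
; against the V, H and DC predictions built from the decoded neighbours, in
; that order (matching the stores at the end of the function). The predict_*
; helpers and the compact 4-byte prediction stride below are illustrative,
; not real x264 symbols; satd_4x4 is the sketch further up:
;
;   void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] )
;   {
;       uint8_t pred[16];
;       predict_4x4_v ( pred, fdec ); res[0] = satd_4x4( fenc, FENC_STRIDE, pred, 4 );
;       predict_4x4_h ( pred, fdec ); res[1] = satd_4x4( fenc, FENC_STRIDE, pred, 4 );
;       predict_4x4_dc( pred, fdec ); res[2] = satd_4x4( fenc, FENC_STRIDE, pred, 4 );
;   }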
2342 cglobal intra_satd_x3_4x4, 3,3
2344 ; stack is 16 byte aligned because abi says so
2345 %define top_1d rsp-8 ; size 8
2346 %define left_1d rsp-16 ; size 8
2348 ; WIN64: stack is 16 byte aligned because abi says so
2349 ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
2351 %define top_1d rsp+8
2356 SCALAR_HADAMARD left, 0, m4, m5
2357 SCALAR_HADAMARD top, 0, m6, m5, m7
2360 pand m6, [sw_f0] ; dc
2363 SUM4x3 m6, [left_1d], [top_1d]
2367 psrlq m1, 16 ; 4x3 sum
2370 SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
2371 movd [r2+0], m0 ; i4x4_v satd
2372 movd [r2+4], m4 ; i4x4_h satd
2373 movd [r2+8], m5 ; i4x4_dc satd
2379 ;-----------------------------------------------------------------------------
2380 ; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
2381 ;-----------------------------------------------------------------------------
2382 cglobal intra_satd_x3_16x16, 0,5
2383 %assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
2384 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
2386 %define sums rsp+64 ; size 56
2387 %define top_1d rsp+32 ; size 32
2388 %define left_1d rsp ; size 32
2406 SCALAR_HADAMARD left, r3, m0, m1
2407 SCALAR_HADAMARD top, r3, m1, m2, m3
2413 pand m6, [sw_f0] ; dc
2424 SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
2427 paddw m0, [sums+ 0] ; i16x16_v satd
2428 paddw m4, [sums+ 8] ; i16x16_h satd
2429 paddw m5, [sums+16] ; i16x16_dc satd
2434 add r0, 4*SIZEOF_PIXEL
2451 punpckhwd m3, m5, m7
2461 add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
2470 HADDD m5, m7 ; DC satd
2471 HADDD m4, m7 ; H satd
2472 HADDD m0, m7 ; the part of V satd that doesn't overlap with DC
2474 psrlq m1, 32 ; DC[1]
2475 paddd m0, m3 ; DC[2]
2476 psrlq m3, 32 ; DC[3]
2481 SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
2488 movd [r2+8], m5 ; i16x16_dc satd
2489 movd [r2+4], m4 ; i16x16_h satd
2490 movd [r2+0], m0 ; i16x16_v satd
2500 ;-----------------------------------------------------------------------------
2501 ; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
2502 ;-----------------------------------------------------------------------------
2503 cglobal intra_satd_x3_8x8c, 0,6
2504 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
2506 %define sums rsp+48 ; size 24
2507 %define dc_1d rsp+32 ; size 16
2508 %define top_1d rsp+16 ; size 16
2509 %define left_1d rsp ; size 16
2519 SCALAR_HADAMARD left, r3, m0, m1
2520 SCALAR_HADAMARD top, r3, m0, m1, m2
2525 movzx t0d, word [left_1d+0]
2526 movzx r3d, word [top_1d+0]
2527 movzx r4d, word [left_1d+8]
2528 movzx r5d, word [top_1d+8]
2529 lea t0d, [t0 + r3 + 16]
2530 lea r3d, [r4 + r5 + 16]
2539 mov [dc_1d+ 0], t0d ; tl
2540 mov [dc_1d+ 4], r5d ; tr
2541 mov [dc_1d+ 8], r4d ; bl
2542 mov [dc_1d+12], r3d ; br
2555 SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)]
2558 paddw m0, [sums+16] ; i4x4_v satd
2559 paddw m4, [sums+8] ; i4x4_h satd
2560 paddw m5, [sums+0] ; i4x4_dc satd
2565 add r0, 4*SIZEOF_PIXEL
2568 add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
2581 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
2587 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
2590 movd [r2+0], m0 ; i8x8c_dc satd
2591 movd [r2+4], m1 ; i8x8c_h satd
2592 movd [r2+8], m2 ; i8x8c_v satd
2595 %endmacro ; INTRA_X3_MMX
2599 %macro PRED4x4_LOWPASS 5
2616 %macro INTRA_X9_PRED 2
2618 movu m1, [r1-1*FDEC_STRIDE-8]
2619 pinsrb m1, [r1+3*FDEC_STRIDE-1], 0
2620 pinsrb m1, [r1+2*FDEC_STRIDE-1], 1
2621 pinsrb m1, [r1+1*FDEC_STRIDE-1], 2
2622 pinsrb m1, [r1+0*FDEC_STRIDE-1], 3
2624 movd mm0, [r1+3*FDEC_STRIDE-4]
2625 punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
2626 movd mm1, [r1+1*FDEC_STRIDE-4]
2627 punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
2631 movu m1, [r1-1*FDEC_STRIDE-8]
2632 movss m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
2634 pshufb m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
2635 psrldq m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
2636 psrldq m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
2637 pavgb m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __
2639 PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
2641 ; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2
2642 ; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1
2643 ; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0
2644 ; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt
2645 pshufb m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
2646 pshufb m3, m0, [%1_ddlr2] ; rows 2,3
2648 ; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2
2649 ; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3
2650 ; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3
2651 ; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3
2652 pslldq m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
2653 palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
2654 pshufb m6, m7, [%1_hdu1]
2655 pshufb m7, m7, [%1_hdu2]
2657 ; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4
2658 ; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4
2659 ; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5
2660 ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
2661 psrldq m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
2662 palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
2663 pshufb m4, m5, [%1_vrl1]
2664 pshufb m5, m5, [%1_vrl2]
2665 %endmacro ; INTRA_X9_PRED
2667 %macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
2668 pshufb m2, m%1, [intrax9b_vh1]
2669 pshufb m3, m%1, [intrax9b_vh2]
2670 mova [pred_buf+0x60], m2
2671 mova [pred_buf+0x70], m3
2672 pshufb m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
2673 pmaddubsw m%1, [hmul_4p]
2674 pshufhw m0, m%1, q2301
2675 pshuflw m0, m0, q2301
2676 psignw m%1, [pw_pmpmpmpm]
2678 psllw m0, 2 ; hadamard(top), hadamard(left)
2680 pshufb m1, m0, [intrax9b_v1]
2681 pshufb m2, m0, [intrax9b_v2]
2683 psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
2685 pand m0, [sw_f0] ; dc
2686 ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
2687 ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs.
2688 ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef.
2689 HADAMARD 0, sumsub, %2, %3, %4, %5
2690 HADAMARD 1, sumsub, %2, %3, %4, %5
2693 imul r3d, 0x01010101
2694 mov [pred_buf+0x80], r3d
2695 mov [pred_buf+0x88], r3d
2696 mov [pred_buf+0x90], r3d
2697 mov [pred_buf+0x98], r3d
2713 SBUTTERFLY qdq, 3, 0, 2
2724 pmaddwd m1, [pw_1] ; v, _, h, dc
2726 %endmacro ; INTRA_X9_VHDC
2728 %macro INTRA_X9_END 2
2730 phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
2737 ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index (this pack+min trick is sketched in C after this macro)
2739 paddw m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
2741 ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
2744 paddw m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
2748 pshuflw m1, m0, q0032
2750 pshuflw m1, m0, q0001
2757 ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
2758 ; 1<<12: undo sign manipulation
2759 lea eax, [rax+r2+(1<<16)+(1<<12)]
2764 ; output the predicted samples
2769 movzx r2d, byte [r2+r3]
2771 movzx r2d, byte [%2_lut+r3]
2774 movq mm0, [pred_buf+r2]
2775 movq mm1, [pred_buf+r2+16]
2776 movd [r1+0*FDEC_STRIDE], mm0
2777 movd [r1+2*FDEC_STRIDE], mm1
2780 movd [r1+1*FDEC_STRIDE], mm0
2781 movd [r1+3*FDEC_STRIDE], mm1
2785 mov r3d, [pred_buf+r2+8*i]
2786 mov [r1+i*FDEC_STRIDE], r3d
2790 %endmacro ; INTRA_X9_END
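; The cost packing used above, in plain C (an illustration of the trick, not
; the exact code; the 0x8000 bias applied by the asm is ignored here): each
; 16-bit lane holds (cost << 3) | mode, so a single unsigned minimum over the
; lanes yields both the best cost and the winning mode at once.
;
;   #include <stdint.h>
;   static uint16_t best_packed( const uint16_t cost[8] )
;   {
;       uint16_t best = 0xFFFF;
;       for( int mode = 0; mode < 8; mode++ )
;       {
;           uint16_t packed = ( cost[mode] << 3 ) | mode;
;           if( packed < best )
;               best = packed;      // best >> 3 = cost, best & 7 = mode
;       }
;       return best;
;   }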
2793 ;-----------------------------------------------------------------------------
2794 ; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
2795 ;-----------------------------------------------------------------------------
2797 cglobal intra_sad_x9_4x4, 3,4,9
2798 %assign pad 0xc0-gprsize-(stack_offset&15)
2799 %define pred_buf rsp
2802 INTRA_X9_PRED intrax9a, m8
2804 INTRA_X9_PRED intrax9a, [rsp+0xa0]
2813 movd m0, [r0+0*FENC_STRIDE]
2814 pinsrd m0, [r0+1*FENC_STRIDE], 1
2815 movd m1, [r0+2*FENC_STRIDE]
2816 pinsrd m1, [r0+3*FENC_STRIDE], 1
2818 movd mm0, [r0+0*FENC_STRIDE]
2819 punpckldq mm0, [r0+1*FENC_STRIDE]
2820 movd mm1, [r0+2*FENC_STRIDE]
2821 punpckldq mm1, [r0+3*FENC_STRIDE]
2842 %define %%zero [pb_0]
2844 pshufb m3, m7, [intrax9a_vh1]
2845 pshufb m5, m7, [intrax9a_vh2]
2846 pshufb m7, [intrax9a_dc]
2861 movzx r3d, word [r2]
2864 punpckhqdq m3, m0 ; h, dc
2865 shufps m3, m2, q2020
2871 INTRA_X9_END 1, intrax9a
2877 ;-----------------------------------------------------------------------------
2878 ; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
2879 ;-----------------------------------------------------------------------------
2880 cglobal intra_satd_x9_4x4, 3,4,16
2881 %assign pad 0xb0-gprsize-(stack_offset&15)
2882 %define pred_buf rsp
2884 INTRA_X9_PRED intrax9b, m15
2891 movd m8, [r0+0*FENC_STRIDE]
2892 movd m9, [r0+1*FENC_STRIDE]
2893 movd m10, [r0+2*FENC_STRIDE]
2894 movd m11, [r0+3*FENC_STRIDE]
2905 pshufd m1, m2, q3232
2908 call .satd_8x4 ; ddr, ddl
2910 pshufd m3, m5, q3232
2913 pshufd m1, m4, q3232
2914 call .satd_8x4 ; vr, vl
2916 pshufd m3, m7, q3232
2919 pshufd m1, m6, q3232
2920 call .satd_8x4 ; hd, hu
2924 punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't
2926 mova m1, [pw_ppmmppmm]
2931 INTRA_X9_VHDC 15, 8, 10, 6, 7
2936 %if notcpuflag(sse4)
2937 pshufhw m0, m0, q3120 ; compensate for different order in unpack
2941 movzx r0d, word [r2]
2943 INTRA_X9_END 0, intrax9b
2946 RESET_MM_PERMUTATION
2957 SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
2960 paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
2963 %else ; !ARCH_X86_64
2964 cglobal intra_satd_x9_4x4, 3,4,8
2965 %assign pad 0x120-gprsize-(stack_offset&15)
2966 %define fenc_buf rsp
2967 %define pred_buf rsp+0x40
2968 %define spill rsp+0xe0
2970 INTRA_X9_PRED intrax9b, [spill+0x20]
2971 mova [pred_buf+0x00], m2
2972 mova [pred_buf+0x10], m3
2973 mova [pred_buf+0x20], m4
2974 mova [pred_buf+0x30], m5
2975 mova [pred_buf+0x40], m6
2976 mova [pred_buf+0x50], m7
2977 movd m4, [r0+0*FENC_STRIDE]
2978 movd m5, [r0+1*FENC_STRIDE]
2979 movd m6, [r0+2*FENC_STRIDE]
2980 movd m0, [r0+3*FENC_STRIDE]
2990 mova [fenc_buf+0x00], m4
2991 mova [fenc_buf+0x10], m5
2992 mova [fenc_buf+0x20], m6
2993 mova [fenc_buf+0x30], m0
2995 pshufd m1, m2, q3232
3005 call .satd_8x4b ; ddr, ddl
3006 mova m3, [pred_buf+0x30]
3007 mova m1, [pred_buf+0x20]
3010 movq [spill+0x08], m0
3013 call .satd_8x4 ; vr, vl
3014 mova m3, [pred_buf+0x50]
3015 mova m1, [pred_buf+0x40]
3018 movq [spill+0x10], m0
3021 call .satd_8x4 ; hd, hu
3022 movq [spill+0x18], m0
3023 mova m1, [spill+0x20]
3024 mova m4, [fenc_buf+0x00]
3025 mova m5, [fenc_buf+0x20]
3026 mova m2, [pw_ppmmppmm]
3029 paddw m4, [fenc_buf+0x10]
3030 paddw m5, [fenc_buf+0x30]
3031 INTRA_X9_VHDC 1, 4, 5, 6, 7
3035 punpckhqdq m1, [spill+0x00]
3036 packssdw m1, [spill+0x10]
3038 pshufhw m1, m1, q3120
3040 pshufhw m0, m0, q3120
3043 movzx r0d, word [r2]
3045 INTRA_X9_END 0, intrax9b
3048 RESET_MM_PERMUTATION
3055 %xdefine fenc_buf fenc_buf+gprsize
3056 psubw m0, [fenc_buf+0x00]
3057 psubw m1, [fenc_buf+0x10]
3058 psubw m2, [fenc_buf+0x20]
3060 psubw m3, [fenc_buf+0x30]
3061 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
3067 %endmacro ; INTRA_X9
3070 ;-----------------------------------------------------------------------------
3071 ; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
3072 ;-----------------------------------------------------------------------------
3073 cglobal intra_sad_x9_8x8, 5,6,9
3083 %assign padbase 0x10
3085 %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
3086 %define pred(i,j) [rsp+i*0x40+j*0x10+padbase]
3089 movq fenc02, [r0+FENC_STRIDE* 0]
3090 movq fenc13, [r0+FENC_STRIDE* 1]
3091 movq fenc46, [r0+FENC_STRIDE* 4]
3092 movq fenc57, [r0+FENC_STRIDE* 5]
3093 movhps fenc02, [r0+FENC_STRIDE* 2]
3094 movhps fenc13, [r0+FENC_STRIDE* 3]
3095 movhps fenc46, [r0+FENC_STRIDE* 6]
3096 movhps fenc57, [r0+FENC_STRIDE* 7]
3098 ; save instruction size: avoid 4-byte memory offsets
3099 lea r0, [intra8x9_h1+128]
3100 %define off(m) (r0+m-(intra8x9_h1+128))
3105 psadbw m1, m0, fenc02
3107 psadbw m2, m0, fenc13
3109 psadbw m3, m0, fenc46
3111 psadbw m0, m0, fenc57
3121 pshufb m1, m0, [off(intra8x9_h1)]
3122 pshufb m2, m0, [off(intra8x9_h2)]
3128 pshufb m3, m0, [off(intra8x9_h3)]
3129 pshufb m2, m0, [off(intra8x9_h4)]
3140 lea r5, [rsp+padbase+0x100]
3141 %define pred(i,j) [r5+i*0x40+j*0x10-0x100]
3153 psadbw m1, m0, fenc02
3155 psadbw m2, m0, fenc13
3157 psadbw m3, m0, fenc46
3159 psadbw m0, m0, fenc57
3168 ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
3169 ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
3170 ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
3171 ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
3172 ; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
3173 ; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
3174 ; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
3175 ; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
3179 pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
3180 PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
3181 pshufb m1, m0, [off(intra8x9_ddl1)]
3182 pshufb m2, m0, [off(intra8x9_ddl2)]
3188 pshufb m2, m0, [off(intra8x9_ddl3)]
3192 pshufb m2, m0, [off(intra8x9_ddl4)]
3201 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
3202 ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
3203 ; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
3204 ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
3205 ; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
3206 ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
3207 ; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
3208 ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
3209 pshufb m1, m3, [off(intra8x9_vl1)]
3210 pshufb m2, m0, [off(intra8x9_vl2)]
3211 pshufb m3, m3, [off(intra8x9_vl3)]
3212 pshufb m0, m0, [off(intra8x9_vl4)]
3227 pextrw [r4+14], m0, 0
3231 lea r5, [rsp+padbase+0x100]
3235 ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
3236 ; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
3237 ; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
3238 ; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
3239 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
3240 ; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
3241 ; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
3242 ; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
3246 pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
3247 PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
3248 pshufb m1, m0, [off(intra8x9_ddr1)]
3249 pshufb m2, m0, [off(intra8x9_ddr2)]
3255 pshufb m2, m0, [off(intra8x9_ddr3)]
3259 pshufb m2, m0, [off(intra8x9_ddr4)]
3269 %define off(m) (r0+m-(intra8x9_h1+256+128))
3270 %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]
3273 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
3274 ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
3275 ; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
3276 ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
3277 ; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
3278 ; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
3279 ; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
3280 ; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
3281 movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
3282 pshufb m1, m2, [off(intra8x9_vr1)]
3283 pshufb m2, m2, [off(intra8x9_vr3)]
3289 pshufb m2, m0, [off(intra8x9_vr2)]
3293 pshufb m2, m0, [off(intra8x9_vr4)]
3302 ; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
3303 ; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
3304 ; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
3305 ; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
3306 ; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
3307 ; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
3308 ; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
3309 ; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
3310 pshufd m2, m3, q0001
3312 pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
3317 punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
3318 pshufb m1, m2, [off(intra8x9_hd1)]
3319 pshufb m2, m2, [off(intra8x9_hd2)]
3325 pshufb m2, m0, [off(intra8x9_hd3)]
3326 pshufb m3, m0, [off(intra8x9_hd4)]
3335 ; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
3340 ; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
3341 ; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
3342 ; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6
3343 ; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
3344 ; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
3345 ; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
3346 ; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
3347 ; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
3349 pinsrb m0, [r2+7], 15 ; Gl7
3356 pshufb m1, m0, [off(intra8x9_hu1)]
3357 pshufb m2, m0, [off(intra8x9_hu2)]
3363 pshufb m2, m0, [off(intra8x9_hu3)]
3364 pshufb m0, m0, [off(intra8x9_hu4)]
3379 movzx r5d, word [r3+16]
3384 phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl
3387 ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index
3390 paddw m0, [off(pw_s00112233)]
3393 pshuflw m1, m0, q0032
3396 ; repack with 3 bit index
3404 ; reverse to phminposuw order
3418 add r1, 4*FDEC_STRIDE
3419 mova m0, [rsp+padbase+r2+0x00]
3420 mova m1, [rsp+padbase+r2+0x10]
3421 mova m2, [rsp+padbase+r2+0x20]
3422 mova m3, [rsp+padbase+r2+0x30]
3423 movq [r1+FDEC_STRIDE*-4], m0
3424 movhps [r1+FDEC_STRIDE*-2], m0
3425 movq [r1+FDEC_STRIDE*-3], m1
3426 movhps [r1+FDEC_STRIDE*-1], m1
3427 movq [r1+FDEC_STRIDE* 0], m2
3428 movhps [r1+FDEC_STRIDE* 2], m2
3429 movq [r1+FDEC_STRIDE* 1], m3
3430 movhps [r1+FDEC_STRIDE* 3], m3
3435 ;-----------------------------------------------------------------------------
3436 ; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
3437 ;-----------------------------------------------------------------------------
3438 cglobal intra_sa8d_x9_8x8, 5,6,16
3439 %assign pad 0x2c0+0x10-gprsize-(stack_offset&15)
3440 %define fenc_buf rsp
3441 %define pred_buf rsp+0x80
3447 movddup m %+ %%i, [r0+%%i*FENC_STRIDE]
3448 pmaddubsw m9, m %+ %%i, m15
3449 punpcklbw m %+ %%i, m8
3450 mova [fenc_buf+%%i*0x10], m9
3454 ; save instruction size: avoid 4-byte memory offsets
3455 lea r0, [intra8x9_h1+0x80]
3456 %define off(m) (r0+m-(intra8x9_h1+0x80))
3457 lea r5, [pred_buf+0x80]
3460 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
3469 ; 1D hadamard of edges
3477 pshufb m9, [intrax3_shuf]
3478 pmaddubsw m8, [pb_pppm]
3479 pmaddubsw m9, [pb_pppm]
3480 HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm]
3481 HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm]
3497 psllw m8, 3 ; left edge
3500 pabsw m8, m8 ; 1x8 sum
3510 punpcklqdq m0, m4 ; transpose
3511 psllw m9, 3 ; top edge
3512 psrldq m10, m11, 2 ; 8x7 sum
3513 psubw m0, m9 ; 8x1 sum
3517 phaddd m10, m8 ; logically phaddw, but this is faster and it won't overflow
3523 pshufb m0, m3, [off(intra8x9_h1)]
3524 pshufb m1, m3, [off(intra8x9_h2)]
3525 pshufb m2, m3, [off(intra8x9_h3)]
3526 pshufb m3, m3, [off(intra8x9_h4)]
3537 PRED4x4_LOWPASS m8, m1, m2, m8, m3
3538 pshufb m0, m8, [off(intra8x9_ddl1)]
3539 pshufb m1, m8, [off(intra8x9_ddl2)]
3540 pshufb m2, m8, [off(intra8x9_ddl3)]
3541 pshufb m3, m8, [off(intra8x9_ddl4)]
3547 pshufb m0, m9, [off(intra8x9_vl1)]
3548 pshufb m1, m8, [off(intra8x9_vl2)]
3549 pshufb m2, m9, [off(intra8x9_vl3)]
3550 pshufb m3, m8, [off(intra8x9_vl4)]
3561 PRED4x4_LOWPASS m8, m1, m2, m8, m3
3562 pshufb m0, m8, [off(intra8x9_ddr1)]
3563 pshufb m1, m8, [off(intra8x9_ddr2)]
3564 pshufb m2, m8, [off(intra8x9_ddr3)]
3565 pshufb m3, m8, [off(intra8x9_ddr4)]
3571 %define off(m) (r0+m-(intra8x9_h1+0x180))
3575 pshufb m0, m2, [off(intra8x9_vr1)]
3576 pshufb m1, m8, [off(intra8x9_vr2)]
3577 pshufb m2, m2, [off(intra8x9_vr3)]
3578 pshufb m3, m8, [off(intra8x9_vr4)]
3585 pshufd m1, m9, q0001
3586 pblendw m1, m8, q3330
3588 pshufd m2, m9, q0001
3592 pshufb m0, m1, [off(intra8x9_hd1)]
3593 pshufb m1, m1, [off(intra8x9_hd2)]
3594 pshufb m2, m8, [off(intra8x9_hd3)]
3595 pshufb m3, m8, [off(intra8x9_hd4)]
3603 pinsrb m8, [r2+7], 15
3610 pshufb m0, m8, [off(intra8x9_hu1)]
3611 pshufb m1, m8, [off(intra8x9_hu2)]
3612 pshufb m2, m8, [off(intra8x9_hu3)]
3613 pshufb m3, m8, [off(intra8x9_hu4)]
3621 pshuflw m1, m0, q0032
3630 movzx r5d, word [r3+16]
3638 ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
3640 paddw m0, [off(pw_s00001111)]
3643 pshuflw m1, m0, q0032
3646 pcmpgtw m2, m1 ; 2nd index bit
3649 ; repack with 3 bit index
3657 lea r3d, [ r3*4+r4+1]
3660 ; reverse to phminposuw order
3674 add r1, 4*FDEC_STRIDE
3675 mova m0, [pred_buf+r2+0x00]
3676 mova m1, [pred_buf+r2+0x10]
3677 mova m2, [pred_buf+r2+0x20]
3678 mova m3, [pred_buf+r2+0x30]
3679 movq [r1+FDEC_STRIDE*-4], m0
3680 movhps [r1+FDEC_STRIDE*-2], m0
3681 movq [r1+FDEC_STRIDE*-3], m1
3682 movhps [r1+FDEC_STRIDE*-1], m1
3683 movq [r1+FDEC_STRIDE* 0], m2
3684 movhps [r1+FDEC_STRIDE* 2], m2
3685 movq [r1+FDEC_STRIDE* 1], m3
3686 movhps [r1+FDEC_STRIDE* 3], m3
3693 %xdefine fenc_buf fenc_buf+gprsize
3706 PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3
3709 psubw m0, [fenc_buf+0x00]
3710 psubw m1, [fenc_buf+0x10]
3713 psubw m2, [fenc_buf+0x20]
3714 psubw m3, [fenc_buf+0x30]
3717 psubw m4, [fenc_buf+0x40]
3718 psubw m5, [fenc_buf+0x50]
3721 psubw m6, [fenc_buf+0x60]
3722 psubw m7, [fenc_buf+0x70]
3723 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14
3728 %endif ; ARCH_X86_64
3729 %endmacro ; INTRA8_X9
3731 ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
3732 ; out: [tmp]=hadamard4, m0=satd
3734 cglobal hadamard_ac_4x4
3740 %else ; !HIGH_BIT_DEPTH
3749 %endif ; HIGH_BIT_DEPTH
3750 HADAMARD4_2D 0, 1, 2, 3, 4
3766 cglobal hadamard_ac_2x2max
3772 SUMSUB_BADC w, 0, 1, 2, 3, 4
3773 ABSW2 m0, m2, m0, m2, m4, m5
3774 ABSW2 m1, m3, m1, m3, m4, m5
3775 HADAMARD 0, max, 0, 2, 4, 5
3776 HADAMARD 0, max, 1, 3, 4, 5
3782 %else ; !HIGH_BIT_DEPTH
3785 %endif ; HIGH_BIT_DEPTH
3801 %endif ; HIGH_BIT_DEPTH
3804 cglobal hadamard_ac_8x8
3810 %endif ; HIGH_BIT_DEPTH
3811 call hadamard_ac_4x4_mmx2
3812 add r0, 4*SIZEOF_PIXEL
3816 call hadamard_ac_4x4_mmx2
3820 call hadamard_ac_4x4_mmx2
3821 sub r0, 4*SIZEOF_PIXEL
3824 call hadamard_ac_4x4_mmx2
3827 mova [rsp+gprsize+8], m5 ; save satd
3832 call hadamard_ac_2x2max_mmx2
3838 SUMSUB_BADC w, 0, 1, 2, 3, 4
3839 HADAMARD 0, sumsub, 0, 2, 4, 5
3840 ABSW2 m1, m3, m1, m3, m4, m5
3841 ABSW2 m0, m2, m0, m2, m4, m5
3842 HADAMARD 0, max, 1, 3, 4, 5
3853 %else ; !HIGH_BIT_DEPTH
3859 %endif ; HIGH_BIT_DEPTH
3860 mova [rsp+gprsize], m6 ; save sa8d
3865 %macro HADAMARD_AC_WXH_SUM_MMX 2
3866 mova m1, [rsp+1*mmsize]
3869 paddd m0, [rsp+2*mmsize]
3870 paddd m1, [rsp+3*mmsize]
3873 mova m2, [rsp+4*mmsize]
3874 paddd m1, [rsp+5*mmsize]
3875 paddd m2, [rsp+6*mmsize]
3877 paddd m1, [rsp+7*mmsize]
3884 %else ; !HIGH_BIT_DEPTH
3886 paddusw m0, [rsp+2*mmsize]
3887 paddusw m1, [rsp+3*mmsize]
3890 mova m2, [rsp+4*mmsize]
3891 paddusw m1, [rsp+5*mmsize]
3892 paddusw m2, [rsp+6*mmsize]
3894 paddusw m1, [rsp+7*mmsize]
3906 %endif ; HIGH_BIT_DEPTH
3909 %macro HADAMARD_AC_WXH_MMX 2
3910 cglobal pixel_hadamard_ac_%1x%2, 2,4
3911 %assign pad 16-gprsize-(stack_offset&15)
3917 call hadamard_ac_8x8_mmx2
3922 call hadamard_ac_8x8_mmx2
3927 lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
3929 call hadamard_ac_8x8_mmx2
3933 call hadamard_ac_8x8_mmx2
3936 HADAMARD_AC_WXH_SUM_MMX %1, %2
3944 add rsp, 128+%1*%2/4+pad
3946 %endmacro ; HADAMARD_AC_WXH_MMX
3948 HADAMARD_AC_WXH_MMX 16, 16
3949 HADAMARD_AC_WXH_MMX 8, 16
3950 HADAMARD_AC_WXH_MMX 16, 8
3951 HADAMARD_AC_WXH_MMX 8, 8
3953 %macro LOAD_INC_8x4W_SSE2 5
3962 %else ; !HIGH_BIT_DEPTH
3974 %endif ; HIGH_BIT_DEPTH
3977 %macro LOAD_INC_8x4W_SSSE3 5
3978 LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
3982 HSUMSUB %1, %2, %3, %4, %5
3985 %macro HADAMARD_AC_SSE2 0
3986 ; in: r0=pix, r1=stride, r2=stride*3
3987 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
3988 cglobal hadamard_ac_8x8
3994 %define spill0 [rsp+gprsize]
3995 %define spill1 [rsp+gprsize+mmsize]
3996 %define spill2 [rsp+gprsize+mmsize*2]
4000 %elif cpuflag(ssse3) && notcpuflag(atom)
4002 ;LOAD_INC loads sumsubs
4006 ;LOAD_INC only unpacks to words
4009 LOAD_INC_8x4W 0, 1, 2, 3, 7
4011 HADAMARD4_2D_SSE 0, 1, 2, 3, 4
4013 HADAMARD4_V 0, 1, 2, 3, 4
4017 LOAD_INC_8x4W 4, 5, 6, 7, 1
4019 HADAMARD4_2D_SSE 4, 5, 6, 7, 1
4021 HADAMARD4_V 4, 5, 6, 7, 1
4026 HADAMARD 1, sumsub, 0, 1, 6, 7
4027 HADAMARD 1, sumsub, 2, 3, 6, 7
4032 HADAMARD 1, sumsub, 4, 5, 1, 0
4033 HADAMARD 1, sumsub, 6, 7, 1, 0
4046 pand m1, [mask_ac4b]
4050 AC_PADD m1, m3, [pw_1]
4052 AC_PADD m1, m2, [pw_1]
4054 AC_PADD m1, m3, [pw_1]
4056 AC_PADD m1, m2, [pw_1]
4058 AC_PADD m1, m3, [pw_1]
4059 AC_PADD m1, m2, [pw_1]
4060 paddw m3, m7, spill2
4062 mova [rsp+gprsize+mmsize*2], m1 ; save satd
4063 paddw m2, m6, spill1
4065 paddw m1, m5, spill0
4072 HADAMARD %%x, amax, 3, 7, 4
4073 HADAMARD %%x, amax, 2, 6, 7, 4
4075 HADAMARD %%x, amax, 1, 5, 6, 7
4076 HADAMARD %%x, sumsub, 0, 4, 5, 6
4078 AC_PADD m2, m3, [pw_1]
4079 AC_PADD m2, m1, [pw_1]
4084 %endif ; HIGH_BIT_DEPTH
4088 AC_PADD m2, m4, [pw_1]
4089 AC_PADD m2, m0, [pw_1]
4090 mova [rsp+gprsize+mmsize], m2 ; save sa8d
4095 HADAMARD_AC_WXH_SSE2 16, 16
4096 HADAMARD_AC_WXH_SSE2 16, 8
4098 HADAMARD_AC_WXH_SSE2 8, 16
4099 HADAMARD_AC_WXH_SSE2 8, 8
4101 %endmacro ; HADAMARD_AC_SSE2
4103 %macro HADAMARD_AC_WXH_SUM_SSE2 2
4104 mova m1, [rsp+2*mmsize]
4107 paddd m0, [rsp+3*mmsize]
4108 paddd m1, [rsp+4*mmsize]
4111 paddd m0, [rsp+5*mmsize]
4112 paddd m1, [rsp+6*mmsize]
4113 paddd m0, [rsp+7*mmsize]
4114 paddd m1, [rsp+8*mmsize]
4119 %else ; !HIGH_BIT_DEPTH
4120 %if %1*%2*16/mmsize >= 128
4121 paddusw m0, [rsp+3*mmsize]
4122 paddusw m1, [rsp+4*mmsize]
4124 %if %1*%2*16/mmsize == 256
4125 paddusw m0, [rsp+5*mmsize]
4126 paddusw m1, [rsp+6*mmsize]
4127 paddusw m0, [rsp+7*mmsize]
4128 paddusw m1, [rsp+8*mmsize]
4132 vextracti128 xm2, m0, 1
4133 vextracti128 xm3, m1, 1
4139 %endif ; HIGH_BIT_DEPTH
4142 ; struct { int satd, sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, intptr_t stride )
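; The two sums are returned packed in a single 64-bit value, which the prototype
; above describes as a two-field struct. A hedged caller-side sketch (the packing
; order -- satd in the low half, sa8d in the high half -- is an assumption based
; on how the sums are stored here):
;
;     uint64_t v    = pixel_hadamard_ac_16x16( pix, stride );
;     uint32_t satd = (uint32_t)v;         /* 4x4 Hadamard AC energy */
;     uint32_t sa8d = (uint32_t)(v >> 32); /* 8x8 Hadamard AC energy */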
4143 %macro HADAMARD_AC_WXH_SSE2 2
4144 cglobal pixel_hadamard_ac_%1x%2, 2,4,11
4148 and rsp, ~(mmsize-1)
4151 call hadamard_ac_8x8
4156 call hadamard_ac_8x8
4158 %if %1==16 && mmsize <= 16
4161 lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
4163 call hadamard_ac_8x8
4167 call hadamard_ac_8x8
4170 HADAMARD_AC_WXH_SUM_SSE2 %1, %2
4173 shr edx, 2 - (%1*%2*16/mmsize >> 8)
4181 %endmacro ; HADAMARD_AC_WXH_SSE2
4185 %if ARCH_X86_64 == 0
4186 cextern pixel_sa8d_8x8_internal_mmx2
4191 %define TRANS TRANS_SSE2
4192 %define DIFFOP DIFF_UNPACK_SSE2
4193 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
4194 %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
4195 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
4196 %define movdqa movaps ; doesn't hurt pre-Nehalem, might as well save size
4197 %define movdqu movups
4198 %define punpcklqdq movlhps
4205 %if HIGH_BIT_DEPTH == 0
4213 %if HIGH_BIT_DEPTH == 0
4223 %define DIFFOP DIFF_SUMSUB_SSSE3
4224 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
4225 %if HIGH_BIT_DEPTH == 0
4226 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
4227 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
4228 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
4237 %if HIGH_BIT_DEPTH == 0
4241 %undef movdqa ; Nehalem doesn't like movaps
4242 %undef movdqu ; movups
4243 %undef punpcklqdq ; or movlhps
4244 %if HIGH_BIT_DEPTH == 0
4249 %define TRANS TRANS_SSE4
4250 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
4258 %if HIGH_BIT_DEPTH == 0
4263 ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
4264 ; it's effectively free.
4265 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
4272 %if HIGH_BIT_DEPTH == 0
4278 %define TRANS TRANS_XOP
4285 %if HIGH_BIT_DEPTH == 0
4287 ; no XOP INTRA8_X9: it's slower than the AVX version on Bulldozer, for reasons unknown.
4292 %if HIGH_BIT_DEPTH == 0
4293 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
4294 %define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
4295 %define TRANS TRANS_SSE4
4302 %macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
4307 vinserti128 m%1, m%1, [r0+4*r1], 1
4308 vinserti128 m%3, m%3, [r2+4*r3], 1
4309 vinserti128 m%2, m%2, [r0+r4], 1
4310 vinserti128 m%4, m%4, [r2+r5], 1
4315 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
4323 vinserti128 m%3, m%3, [r0+4*r1], 1
4324 vinserti128 m%5, m%5, [r2+4*r3], 1
4325 vinserti128 m%4, m%4, [r0+r4], 1
4326 vinserti128 m%6, m%6, [r2+r5], 1
4331 DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
4334 %macro SATD_START_AVX2 2-3 0
4348 %define TRANS TRANS_SSE4
4350 cglobal pixel_satd_16x8_internal
4351 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
4352 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
4353 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
4354 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
4357 cglobal pixel_satd_16x16, 4,6,8
4358 SATD_START_AVX2 m6, m7
4359 call pixel_satd_16x8_internal
4362 pixel_satd_16x8_internal:
4363 call pixel_satd_16x8_internal
4364 vextracti128 xm0, m6, 1
4369 cglobal pixel_satd_16x8, 4,6,8
4370 SATD_START_AVX2 m6, m7
4371 jmp pixel_satd_16x8_internal
4373 cglobal pixel_satd_8x8_internal
4374 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
4375 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
4378 cglobal pixel_satd_8x16, 4,6,8
4379 SATD_START_AVX2 m6, m7, 1
4380 call pixel_satd_8x8_internal
4385 call pixel_satd_8x8_internal
4386 vextracti128 xm0, m6, 1
4391 cglobal pixel_satd_8x8, 4,6,8
4392 SATD_START_AVX2 m6, m7, 1
4393 call pixel_satd_8x8_internal
4394 vextracti128 xm0, m6, 1
4399 cglobal pixel_sa8d_8x8_internal
4400 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
4401 HADAMARD4_V 0, 1, 2, 3, 4
4402 HADAMARD 8, sumsub, 0, 1, 4, 5
4403 HADAMARD 8, sumsub, 2, 3, 4, 5
4404 HADAMARD 2, sumsub, 0, 1, 4, 5
4405 HADAMARD 2, sumsub, 2, 3, 4, 5
4406 HADAMARD 1, amax, 0, 1, 4, 5
4407 HADAMARD 1, amax, 2, 3, 4, 5
4412 cglobal pixel_sa8d_8x8, 4,6,8
4413 SATD_START_AVX2 m6, m7, 1
4414 call pixel_sa8d_8x8_internal
4415 vextracti128 xm1, m6, 1
4423 cglobal intra_sad_x9_8x8, 5,7,8
4424 %define pred(i,j) [rsp+i*0x40+j*0x20]
4429 movu m5, [r0+0*FENC_STRIDE]
4430 movu m6, [r0+4*FENC_STRIDE]
4431 punpcklqdq m5, [r0+2*FENC_STRIDE]
4432 punpcklqdq m6, [r0+6*FENC_STRIDE]
4434 ; save instruction size: avoid 4-byte memory offsets
4435 lea r0, [intra8x9_h1+128]
4436 %define off(m) (r0+m-(intra8x9_h1+128))
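; e.g. off(intra8x9_h1) expands to r0-128 and off(intra8x9_h1+255) to r0+127,
; so every table reference below encodes as a 1-byte displacement from r0.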
4438 vpbroadcastq m0, [r2+16]
4445 vpbroadcastq m1, [r2+7]
4446 pshufb m3, m1, [off(intra8x9_h1)]
4447 pshufb m2, m1, [off(intra8x9_h3)]
4455 %define pred(i,j) [r5+i*0x40+j*0x20-0x100]
4457 ; combine the first two
4477 vbroadcasti128 m0, [r2+16]
4478 vbroadcasti128 m2, [r2+17]
4481 PRED4x4_LOWPASS m0, m1, m2, m0, m7
4482 pshufb m1, m0, [off(intra8x9_ddl1)]
4483 pshufb m2, m0, [off(intra8x9_ddl3)]
4492 vextracti128 xm1, m4, 1
4497 vinserti128 m7, m3, xm0, 1
4499 vbroadcasti128 m2, [r2+8]
4500 vbroadcasti128 m0, [r2+7]
4501 vbroadcasti128 m1, [r2+6]
4503 PRED4x4_LOWPASS m0, m1, m2, m0, m4
4504 pshufb m1, m0, [off(intra8x9_ddr1)]
4505 pshufb m2, m0, [off(intra8x9_ddr3)]
4514 %define off(m) (r0+m-(intra8x9_h1+256+128))
4515 %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0]
4517 vpblendd m2, m3, m0, 11110011b
4518 pshufb m1, m2, [off(intra8x9_vr1)]
4519 pshufb m2, m2, [off(intra8x9_vr3)]
4530 pblendw m2, m0, q3330
4532 pshufb m1, m2, [off(intra8x9_hd1)]
4533 pshufb m2, m0, [off(intra8x9_hd3)]
4543 pshufb m1, m7, [off(intra8x9_vl1)]
4544 pshufb m2, m7, [off(intra8x9_vl3)]
4553 vextracti128 xm1, m4, 1
4556 SBUTTERFLY qdq, 3, 4, 7
4560 vpbroadcastd m0, [r2+7]
4562 pshufb m1, m0, [off(intra8x9_hu1)]
4563 pshufb m2, m0, [off(intra8x9_hu3)]
4569 vextracti128 xm2, m1, 1
4577 add r2w, word [r3+16]
4589 add r1, 4*FDEC_STRIDE
4590 mova xm0, [rsp+r3+0x00]
4591 mova xm1, [rsp+r3+0x10]
4592 mova xm2, [rsp+r3+0x20]
4593 mova xm3, [rsp+r3+0x30]
4594 movq [r1+FDEC_STRIDE*-4], xm0
4595 movhps [r1+FDEC_STRIDE*-2], xm0
4596 movq [r1+FDEC_STRIDE*-3], xm1
4597 movhps [r1+FDEC_STRIDE*-1], xm1
4598 movq [r1+FDEC_STRIDE* 0], xm2
4599 movhps [r1+FDEC_STRIDE* 2], xm2
4600 movq [r1+FDEC_STRIDE* 1], xm3
4601 movhps [r1+FDEC_STRIDE* 3], xm3
4605 %endif ; HIGH_BIT_DEPTH
4607 ;=============================================================================
4608 ; SSIM
4609 ;=============================================================================
4611 ;-----------------------------------------------------------------------------
4612 ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
4613 ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
4614 ;-----------------------------------------------------------------------------
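; A hedged C sketch of what this core computes for each of the two 4x4 blocks
; (z = 0,1): pixel sums s1/s2, sum of squares ss and cross sum s12, along the
; lines of the scalar reference in pixel.c:
;
;     static void ssim_4x4x2_core_ref( const uint8_t *pix1, intptr_t stride1,
;                                      const uint8_t *pix2, intptr_t stride2,
;                                      int sums[2][4] )
;     {
;         for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;         {
;             int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;             for( int y = 0; y < 4; y++ )
;                 for( int x = 0; x < 4; x++ )
;                 {
;                     int a = pix1[x+y*stride1];
;                     int b = pix2[x+y*stride2];
;                     s1  += a;
;                     s2  += b;
;                     ss  += a*a + b*b;
;                     s12 += a*b;
;                 }
;             sums[z][0] = s1; sums[z][1] = s2;
;             sums[z][2] = ss; sums[z][3] = s12;
;         }
;     }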
4617 movdqu m5, [r0+(%1&1)*r1]
4618 movdqu m6, [r2+(%1&1)*r3]
4620 movq m5, [r0+(%1&1)*r1]
4621 movq m6, [r2+(%1&1)*r3]
4639 ACCUM paddd, 3, 5, %1
4640 ACCUM paddd, 4, 7, %1
4645 cglobal pixel_ssim_4x4x2_core, 4,4,8
4655 pshufd m5, m3, q2301
4658 pshufd m6, m4, q2301
4661 pshufd m1, m1, q3120
4664 punpckhdq m5, m3, m4
4680 ;-----------------------------------------------------------------------------
4681 ; float pixel_ssim_end4( int sum0[5][4], int sum1[5][4], int width )
4682 ;-----------------------------------------------------------------------------
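; The arithmetic below follows the usual SSIM formula on the accumulated sums.
; A hedged scalar sketch of one term, mirroring the ss*64 / s12*128 scaling and
; the ssim_c1/ssim_c2 constants used below (SSIM_C1/SSIM_C2 stand in for the
; bit-depth dependent values):
;
;     static float ssim_end1_ref( int s1, int s2, int ss, int s12 )
;     {
;         float fs1 = s1, fs2 = s2;
;         float vars   = ss*64.f   - fs1*fs1 - fs2*fs2;
;         float covar2 = s12*128.f - 2*fs1*fs2;          /* covar*2 */
;         return ( 2*fs1*fs2 + SSIM_C1 ) * ( covar2 + SSIM_C2 )
;              / ( ( fs1*fs1 + fs2*fs2 + SSIM_C1 ) * ( vars + SSIM_C2 ) );
;     }
;
; pixel_ssim_end4 then roughly sums this term over `width` horizontally adjacent
; block pairs, combining the sums from sum0[i], sum0[i+1], sum1[i] and sum1[i+1]
; before the division.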
4683 cglobal pixel_ssim_end4, 2,3
4699 TRANSPOSE4x4D 0, 1, 2, 3, 4
4701 ; s1=m0, s2=m1, ss=m2, s12=m3
4707 mulps m4, m0, m1 ; s1*s2
4708 mulps m0, m0 ; s1*s1
4709 mulps m1, m1 ; s2*s2
4710 mulps m2, [pf_64] ; ss*64
4711 mulps m3, [pf_128] ; s12*128
4712 addps m4, m4 ; s1*s2*2
4713 addps m0, m1 ; s1*s1 + s2*s2
4715 subps m3, m4 ; covar*2
4716 movaps m1, [ssim_c1]
4717 addps m4, m1 ; s1*s2*2 + ssim_c1
4718 addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
4719 movaps m1, [ssim_c2]
4720 addps m2, m1 ; vars + ssim_c2
4721 addps m3, m1 ; covar*2 + ssim_c2
4723 pmaddwd m4, m1, m0 ; s1*s2
4726 pmaddwd m0, m0 ; s1*s1 + s2*s2
4730 psubd m3, m4 ; covar*2
4738 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
4739 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
4740 cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
4741 cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
4748 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
4752 lea r3, [mask_ff + 16]
4755 %xdefine %%mask mask_ff + 16
4758 andps m4, [%%mask + r2*4]
4760 movups m0, [%%mask + r2*4]
4770 pshuflw m4, m0, q0032
4773 %if ARCH_X86_64 == 0
4785 ;-----------------------------------------------------------------------------
4786 ; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
4787 ;-----------------------------------------------------------------------------
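; A hedged C sketch of the intended result: sum the signed per-pixel differences
; over an 8-wide column of `height` rows, then take the absolute value of the
; total (illustrative reference, not the production C code):
;
;     static int pixel_asd8_ref( const uint8_t *pix1, intptr_t stride1,
;                                const uint8_t *pix2, intptr_t stride2, int height )
;     {
;         int sum = 0;
;         for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
;             for( int x = 0; x < 8; x++ )
;                 sum += pix1[x] - pix2[x];
;         return sum < 0 ? -sum : sum;
;     }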
4789 cglobal pixel_asd8, 5,5
4852 ;=============================================================================
4853 ; Successive Elimination ADS
4854 ;=============================================================================
4863 lea r6, [r4+r5+(mmsize-1)]
4868 %macro ADS_END 1 ; unroll_size
4874 WIN64_RESTORE_XMM rsp
4878 lea r6, [r4+r5+(mmsize-1)]
4887 ;-----------------------------------------------------------------------------
4888 ; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
4889 ; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
4890 ;-----------------------------------------------------------------------------
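; A hedged C sketch of the successive-elimination test implemented below. The
; sums[] offsets 0/8/delta/delta+8 (the four 8x8 sub-block sums of the 16x16 DC)
; are an assumption based on x264's scalar reference; names follow the prototype
; above:
;
;     static int pixel_ads4_ref( int enc_dc[4], uint16_t *sums, int delta,
;                                uint16_t *cost_mvx, int16_t *mvs,
;                                int width, int thresh )
;     {
;         int nmv = 0;
;         for( int i = 0; i < width; i++, sums++ )
;         {
;             int ads = abs( enc_dc[0] - sums[0] )
;                     + abs( enc_dc[1] - sums[8] )
;                     + abs( enc_dc[2] - sums[delta] )
;                     + abs( enc_dc[3] - sums[delta+8] )
;                     + cost_mvx[i];
;             if( ads < thresh )
;                 mvs[nmv++] = i;
;         }
;         return nmv;
;     }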
4892 cglobal pixel_ads4, 5,7
4896 pshufw m6, m6, q2222
4898 pshufw m4, m4, q2222
4923 cglobal pixel_ads2, 5,7
4927 pshufw m6, m6, q2222
4944 cglobal pixel_ads1, 5,7
4967 cglobal pixel_ads4, 5,7,8
4968 vpbroadcastw m7, [r0+ 0]
4969 vpbroadcastw m6, [r0+ 4]
4970 vpbroadcastw m5, [r0+ 8]
4971 vpbroadcastw m4, [r0+12]
4973 cglobal pixel_ads4, 5,7,12
4975 pshuflw m7, m4, q0000
4976 pshuflw m6, m4, q2222
4977 pshufhw m5, m4, q0000
4978 pshufhw m4, m4, q2222
4984 %if ARCH_X86_64 && mmsize == 16
4997 movu m11, [r1+r2+16]
5027 vpbroadcastw m1, r6m
5038 vpermq m1, m1, q3120
5045 cglobal pixel_ads2, 5,7,8
5047 vpbroadcastw m7, [r0+0]
5048 vpbroadcastw m6, [r0+4]
5049 vpbroadcastw m5, r6m
5054 pshuflw m6, m6, q2222
5074 vpermq m1, m1, q3120
5081 cglobal pixel_ads1, 5,7,8
5083 vpbroadcastw m7, [r0]
5084 vpbroadcastw m6, r6m
5096 movu m1, [r1+mmsize]
5100 movu m3, [r3+mmsize]
5109 vpermq m4, m4, q3120
5124 ; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
5127 ; *(uint32_t*)(masks+width) = 0;
5128 ; for( i=0; i<width; i+=8 )
5130 ; uint64_t mask = *(uint64_t*)(masks+i);
5131 ; if( !mask ) continue;
5132 ; for( j=0; j<8; j++ )
5133 ; if( mask & (255ULL<<j*8) )
5141 test r2d, 0xff<<(%1*8)
5148 cglobal pixel_ads_mvs, 0,7,0
5153 ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
5193 cglobal pixel_ads_mvs, 0,7,0
5196 mova m4, [pw_76543210]
5203 %define GLOBAL +r1-$$
5211 xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
5212 movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
5214 ; shuffle counters based on mv mask
5215 pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
5218 paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}