;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2014 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

hmul_16p:  times 16 db 1

mask_ff:   times 16 db 0xff

mask_ac4:  times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8:  times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
%if BIT_DEPTH == 10
ssim_c1: times 4 dd 6697.7856    ; .01*.01*1023*1023*64
ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
pf_64:   times 4 dd 64.0
pf_128:  times 4 dd 128.0
%elif BIT_DEPTH == 9
ssim_c1: times 4 dd 1671         ; .01*.01*511*511*64
ssim_c2: times 4 dd 947556       ; .03*.03*511*511*64*63
%else ; 8-bit
ssim_c1: times 4 dd 416          ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963       ; .03*.03*255*255*64*63
%endif
hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10:   times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
pb_pppm:   times 4 db 1,1,1,-1
deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0

intrax9a_ddlr1: db  6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6
intrax9a_ddlr2: db  8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4
intrax9a_hdu1:  db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0
intrax9a_hdu2:  db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11
intrax9a_vrl1:  db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8
intrax9a_vrl2:  db  2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
intrax9a_vh1:   db  6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
intrax9a_vh2:   db  6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
intrax9a_dc:    db  1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
intrax9a_lut:   db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0
pw_s01234567:   dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
pw_s01234657:   dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
intrax9_edge:   db  0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15

intrax9b_ddlr1: db  6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6
intrax9b_ddlr2: db  8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4
intrax9b_hdu1:  db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
intrax9b_hdu2:  db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
intrax9b_vrl1:  db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
intrax9b_vrl2:  db  2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
intrax9b_vh1:   db  6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3
intrax9b_vh2:   db  6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1
intrax9b_edge2: db  6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
intrax9b_v1:    db  0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
intrax9b_v2:    db  2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
intrax9b_lut:   db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0

intra8x9_h1:   db  7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
intra8x9_h2:   db  6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
intra8x9_h3:   db  3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
intra8x9_h4:   db  2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0
intra8x9_ddl1: db  1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10
intra8x9_ddl2: db  2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11
intra8x9_ddl3: db  5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14
intra8x9_ddl4: db  6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15
intra8x9_vl1:  db  0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
intra8x9_vl2:  db  1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9
intra8x9_vl3:  db  2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10
intra8x9_vl4:  db  3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11
intra8x9_ddr1: db  8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13
intra8x9_ddr2: db  7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12
intra8x9_ddr3: db  4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9
intra8x9_ddr4: db  3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8
intra8x9_vr1:  db  8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14
intra8x9_vr2:  db  8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14
intra8x9_vr3:  db  5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12
intra8x9_vr4:  db  4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12
intra8x9_hd1:  db  3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10
intra8x9_hd2:  db  2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8
intra8x9_hd3:  db  7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10
intra8x9_hd4:  db  5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8
intra8x9_hu1:  db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2
intra8x9_hu2:  db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0
intra8x9_hu3:  db  5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15
intra8x9_hu4:  db  3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15
pw_s00112233:  dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003
pw_s00001111:  dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001

transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15

pd_f0: times 4 dd 0xffff0000

pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7

%macro ADS_MVS_SHUFFLE 8
%assign y y>>((~y)&1)

ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7
;=============================================================================
; SSD
;=============================================================================

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
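;
; For reference, each of these functions computes the sum of squared
; differences over the block; a rough C sketch of the semantics (illustrative
; only, with W/H standing in for the %1/%2 macro parameters):
;
;   static int ssd_WxH( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 )
;   {
;       int sum = 0;
;       for( int y = 0; y < H; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < W; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               sum += d*d;
;           }
;       return sum;
;   }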
cglobal pixel_ssd_%1x%2, 4,7,6
%define offset0_2 r1*2
%define offset1_2 r3*2
%define offset0_1 mmsize
%define offset0_3 r1+mmsize
%define offset1_1 mmsize
%define offset1_3 r3+mmsize
%define offset0_1 mmsize
%define offset0_2 mmsize*2
%define offset0_3 mmsize*3
%define offset1_1 mmsize
%define offset1_2 mmsize*2
%define offset1_3 mmsize*3
%assign %%n %2/(2*mmsize/%1)
    mova    m2, [r0+offset0_1]
    mova    m3, [r0+offset0_2]
    mova    m4, [r0+offset0_3]
    psubw   m2, [r2+offset1_1]
    psubw   m3, [r2+offset1_2]
    psubw   m4, [r2+offset1_3]
    lea     r0, [r0+r1*(%2/%%n)]
    lea     r2, [r2+r3*(%2/%%n)]
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
%macro SSD_LOAD_FULL 5
    DEINTB %2, %1, %4, %3, 7
    vinserti128 m%1, m%1, %4, 1
    vinserti128 m%2, m%2, %6, 1
    SBUTTERFLY bw, %1, %2, %3

%macro SSD_LOAD_HALF 5
    LOAD 1, 2, [t0+%1], [t0+%3], 1
    JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
    LOAD 3, 4, [t0+%1], [t0+%3], %5
    JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
    punpcklbw m%2, m%1, m%5
    punpcklbw m%4, m%3, m%5

%macro SSD_CORE_SSE2 7-8
    DEINTB %6, %1, %7, %2, %5
    DEINTB %6, %3, %7, %4, %5

%macro SSD_CORE_SSSE3 7-8
    punpckhbw m%6, m%1, m%2
    punpckhbw m%7, m%3, m%4
    SSD_LOAD_%1 %2,%3,%4,%5,%6
    SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1

;-----------------------------------------------------------------------------
; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%assign function_align 8
%assign function_align 16
cglobal pixel_ssd_%1x%2, 0,0,0
    mov     al, %1*%2/mmsize/2
    jmp mangle(x264_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
DECLARE_REG_TMP 0,1,2,3
DECLARE_REG_TMP 1,2,3,4
%elifidn cpuname, sse2
    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
    SSD_ITER FULL, 0, 0, t1, t3, 2
    SSD_ITER HALF, 0, 0, t1, t3, 2
    vextracti128 xm1, m0, 1
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
%define LOAD LOAD_AVX2
%define JOIN JOIN_AVX2
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; The maximum width this function can handle without risk of overflow is given
; by the following equation (with mmsize in bits):
;
;   2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
;
; For 10-bit this means overflow can only occur at widths >= 16416 with MMX
; and >= 32832 with XMM; at sane distortion levels it will take much more
; than that anyway.
;-----------------------------------------------------------------------------
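;
; What the core loop accumulates, as a C sketch (U and V samples are
; interleaved in NV12, which is why the shuffles below separate even and odd
; lanes before squaring):
;
;   for( int y = 0; y < height; y++, pixuv1 += stride1, pixuv2 += stride2 )
;       for( int x = 0; x < width; x++ )
;       {
;           int du = pixuv1[2*x  ] - pixuv2[2*x  ];
;           int dv = pixuv1[2*x+1] - pixuv2[2*x+1];
;           *ssd_u += du*du;
;           *ssd_v += dv*dv;
;       }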
cglobal pixel_ssd_nv12_core, 6,7,7
    mova    m1, [r0+r6+mmsize]
    psubw   m1, [r2+r6+mmsize]
    PSHUFLW m0, m0, q3120
    PSHUFLW m1, m1, q3120
    pshufhw m0, m0, q3120
    pshufhw m1, m1, q3120
    pmadcswd m2, m0, m0, m2
    pmadcswd m3, m1, m1, m3
%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
                 ; equation above, putting the width limit at 8208
%else ; unfortunately paddq is sse2
      ; emulate 48 bit precision for mmx2 instead
    vextracti128 xm0, m4, 1
%else ; fixup for mmx2
    SBUTTERFLY dq, 4, 5, 0
    SBUTTERFLY dq, 0, 5, 4
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; This implementation can potentially overflow on image widths >= 11008 (or
; 6604 if interlaced), since it is called on blocks of height up to 12 (resp.
; 20). At sane distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
cglobal pixel_ssd_nv12_core, 6,7
%if mmsize == 32 ; only 16-byte alignment is guaranteed
    pmadcswd m4, m2, m2, m4
    pmadcswd m3, m0, m0, m3
%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
    pandn   m0, m1, m0 ; zero the lower half
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
; variance
;=============================================================================
    pxor    m6, m6 ; sum squared
%if HIGH_BIT_DEPTH == 0
%endif ; !HIGH_BIT_DEPTH
%if mmsize == 8 && %1*%2 == 256
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    mova    m4, [r0+%1+mmsize]
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
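;
; The return value packs two partial results: the sum of the pixels and the
; sum of their squares. A C sketch (assuming the packed-uint64_t return
; convention used by x264's C reference):
;
;   uint64_t var_WxH( pixel *pix, intptr_t stride )
;   {
;       uint32_t sum = 0, sqr = 0;
;       for( int y = 0; y < H; y++, pix += stride )
;           for( int x = 0; x < W; x++ )
;           {
;               sum += pix[x];
;               sqr += pix[x]*pix[x];
;           }
;       return sum + ((uint64_t)sqr << 32);
;   }
;
; The caller then derives the variance as sqr - (sum*sum >> shift), which is
; the subtraction performed in VAR_END below.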
cglobal pixel_var_16x16, 2,3
    VAR_2ROW 8*SIZEOF_PIXEL, 16
cglobal pixel_var_8x16, 2,3
cglobal pixel_var_8x8, 2,3
cglobal pixel_var_16x16, 2,3,8
cglobal pixel_var_8x8, 2,3,8
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
cglobal pixel_var_16x16, 2,3,8
cglobal pixel_var_8x8, 2,4,8
cglobal pixel_var_8x16, 2,4,8
cglobal pixel_var_16x16, 2,4,7
    pmovzxbw m1, [r0+r1*2]
    vextracti128 xm0, m5, 1
    vextracti128 xm1, m6, 1
%endif ; !HIGH_BIT_DEPTH

    sub     eax, r1d ; sqr - (sum * sum >> shift)

;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
;-----------------------------------------------------------------------------
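;
; var2 is the variance of the difference between two blocks; the raw SSD is
; also written through the int pointer. A C sketch (assuming the shift is 6
; for 8x8 and 7 for 8x16, matching the second macro argument below):
;
;   int var2_8xH( pixel *fenc, intptr_t i1, pixel *fdec, intptr_t i2, int *ssd )
;   {
;       int sum = 0, sqr = 0;
;       for( int y = 0; y < H; y++, fenc += i1, fdec += i2 )
;           for( int x = 0; x < 8; x++ )
;           {
;               int d = fenc[x] - fdec[x];
;               sum += d;
;               sqr += d*d;
;           }
;       *ssd = sqr;
;       return sqr - (sum*sum >> shift);
;   }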
%macro VAR2_8x8_MMX 2
cglobal pixel_var2_8x%1, 5,6
    psubw   m1, [r2+mmsize]
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH

%macro VAR2_8x8_SSE2 2
cglobal pixel_var2_8x%1, 5,6,8
%else ; !HIGH_BIT_DEPTH
    DEINTB 0, 1, 2, 3, 7
%endif ; HIGH_BIT_DEPTH
    lea     r0, [r0+r1*2*SIZEOF_PIXEL]
    lea     r2, [r2+r3*2*SIZEOF_PIXEL]

%if HIGH_BIT_DEPTH == 0
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 5,6,8
    pxor    m6, m6 ; sum squared
VAR2_8x8_SSSE3 16, 7
VAR2_8x8_SSSE3 16, 7

%macro VAR2_8x8_AVX2 2
cglobal pixel_var2_8x%1, 5,6,6
    pxor    m4, m4 ; sum squared
    vinserti128 m0, m0, [r0+r1], 1
    vinserti128 m1, m1, [r2+r3], 1
    vinserti128 m1, m1, [r0+r1], 1
    vinserti128 m2, m2, [r2+r3], 1
    vextracti128 xm0, m3, 1
    vextracti128 xm1, m4, 1
    VAR2_END %2, xm3, xm4
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
; SATD
;=============================================================================
; just use shufps on anything post conroe
%elif cpuflag(ssse3) && notcpuflag(atom)
; join 2x 32 bit and duplicate them
; emulating shufps is faster on conroe
; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
%macro DIFF_UNPACK_SSE2 5

%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5

%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer

%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer

%macro LOAD_DUP_4x8P_PENRYN 8
; penryn and nehalem run punpcklqdq and movddup in different units

%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5

%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]

%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5

%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    DEINTB %1, %2, %3, %4, %5
    SUMSUB_BA w, %1, %2, %3

%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5

%macro LOAD_SUMSUB_16x2P_AVX2 9
; 2*dst, 2*tmp, mul, 4*ptr
    vbroadcasti128 m%1, [%6]
    vbroadcasti128 m%3, [%7]
    vbroadcasti128 m%2, [%8]
    vbroadcasti128 m%4, [%9]
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5

%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
    LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5

%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
    vpermq  m%3, m%3, q0011
    vpermq  m%4, m%4, q0011
    vpermq  m%1, m%1, q0011
    vpermq  m%2, m%2, q0011

%macro LOAD_SUMSUB8_16x2P_AVX2 9
; 2*dst, 2*tmp, mul, 4*ptr
    LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5

%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]

; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
%macro SATD_4x4_MMX 3
%assign offset %2*SIZEOF_PIXEL
    LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
    LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
    LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
    LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n

; in: %1 = horizontal if 0, vertical if 1
%macro SATD_8x4_SSE 8-9
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
    HADAMARD4_V %2, %3, %4, %5, %6
    ; doing the abs first is a slight advantage
    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
    HADAMARD 1, max, %3, %5, %6, %7

%macro SATD_START_MMX 0
    lea     r4, [3*r1] ; 3*stride1
    lea     r5, [3*r3] ; 3*stride2

%macro SATD_END_MMX 0
%else ; !HIGH_BIT_DEPTH
    pshufw  m1, m0, q1032
    pshufw  m1, m0, q2301
%endif ; HIGH_BIT_DEPTH

; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.

;-----------------------------------------------------------------------------
; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
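;
; SATD is the sum of absolute coefficients of a 4x4 Hadamard transform of the
; difference block; larger sizes are tiled from 4x4 (or 8x4) transforms. As a
; C sketch of the 4x4 building block (matching x264's C reference, which
; halves the final sum):
;
;   int satd_4x4( pixel *p1, intptr_t i1, pixel *p2, intptr_t i2 )
;   {
;       int tmp[4][4], sum = 0;
;       for( int y = 0; y < 4; y++, p1 += i1, p2 += i2 )
;       {
;           int a0 = p1[0]-p2[0], a1 = p1[1]-p2[1];
;           int a2 = p1[2]-p2[2], a3 = p1[3]-p2[3];
;           int s01 = a0+a1, d01 = a0-a1, s23 = a2+a3, d23 = a2-a3;
;           tmp[y][0] = s01+s23; tmp[y][1] = s01-s23;
;           tmp[y][2] = d01+d23; tmp[y][3] = d01-d23;
;       }
;       for( int x = 0; x < 4; x++ )
;       {
;           int s01 = tmp[0][x]+tmp[1][x], d01 = tmp[0][x]-tmp[1][x];
;           int s23 = tmp[2][x]+tmp[3][x], d23 = tmp[2][x]-tmp[3][x];
;           sum += abs(s01+s23) + abs(s01-s23) + abs(d01+d23) + abs(d01-d23);
;       }
;       return sum >> 1;
;   }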
cglobal pixel_satd_16x4_internal
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0
    SATD_4x4_MMX m2, 8, 0
    SATD_4x4_MMX m1, 12, 0

cglobal pixel_satd_8x8_internal
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 1
pixel_satd_8x4_internal_mmx2:
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0

%macro SATD_MxN_MMX 3
cglobal pixel_satd_%1x%2, 4,7
    call pixel_satd_%1x%3_internal_mmx2
    call pixel_satd_%1x%3_internal_mmx2

SATD_MxN_MMX 16, 16, 4
SATD_MxN_MMX 16,  8, 4
SATD_MxN_MMX  8, 16, 8
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
cglobal pixel_satd_16x16, 4,6
    call pixel_satd_16x4_internal_mmx2
    call pixel_satd_16x4_internal_mmx2

cglobal pixel_satd_16x8, 4,6
    call pixel_satd_16x4_internal_mmx2
    call pixel_satd_16x4_internal_mmx2

cglobal pixel_satd_8x16, 4,6
    call pixel_satd_8x8_internal_mmx2
    call pixel_satd_8x8_internal_mmx2
%endif ; !HIGH_BIT_DEPTH

cglobal pixel_satd_8x8, 4,6
    call pixel_satd_8x8_internal_mmx2

cglobal pixel_satd_8x4, 4,6
    call pixel_satd_8x4_internal_mmx2

cglobal pixel_satd_4x16, 4,6
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 1
    SATD_4x4_MMX m1, 0, 1
    SATD_4x4_MMX m1, 0, 0

cglobal pixel_satd_4x8, 4,6
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 0

cglobal pixel_satd_4x4, 4,6
    SATD_4x4_MMX m0, 0, 0

%macro SATD_START_SSE2 2-3 0
%if HIGH_BIT_DEPTH && %3
%elif cpuflag(ssse3) && notcpuflag(atom)

%macro SATD_END_SSE2 1-2

%macro BACKUP_POINTERS 0

%macro RESTORE_AND_INC_POINTERS 0
    lea     r0, [r6+8*SIZEOF_PIXEL]
    lea     r2, [r7+8*SIZEOF_PIXEL]
    add     r0, 8*SIZEOF_PIXEL
    add     r2, 8*SIZEOF_PIXEL

%macro SATD_4x8_SSE 3
    movhps  m0, [r0+4*r1]
    movhps  m4, [r2+4*r3]
    movhps  m1, [r0+1*r1]
    movhps  m5, [r2+1*r3]
    movhps  m2, [r0+2*r1]
    movhps  m6, [r2+2*r3]
%else ; !HIGH_BIT_DEPTH
    DIFFOP 0, 4, 1, 5, 3
    DIFFOP 0, 4, 1, 5, 7
    DIFFOP 2, 6, 3, 5, 4
    DIFFOP 2, 6, 3, 5, 7
%endif ; HIGH_BIT_DEPTH
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3

;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)

%if vertical==0 || HIGH_BIT_DEPTH
cglobal pixel_satd_4x4, 4, 6, 6
    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3

cglobal pixel_satd_4x8, 4, 6, 8
    SATD_4x8_SSE vertical, 0, swap

cglobal pixel_satd_4x16, 4, 6, 8
    SATD_4x8_SSE vertical, 0, swap
    lea     r0, [r0+r1*2*SIZEOF_PIXEL]
    lea     r2, [r2+r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add

cglobal pixel_satd_8x8_internal
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6

; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
cglobal pixel_satd_16x4_internal
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    ; always use horizontal mode here
    SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
    SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10

cglobal pixel_satd_16x8, 4,6,12
    SATD_START_SSE2 m10, m7
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x16, 4,6,12
    SATD_START_SSE2 m10, m7
    call pixel_satd_16x4_internal
    call pixel_satd_16x4_internal
%%pixel_satd_16x8_internal:
    call pixel_satd_16x4_internal
    call pixel_satd_16x4_internal

cglobal pixel_satd_16x8, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal

cglobal pixel_satd_16x16, 4,6,8
    SATD_START_SSE2 m6, m7, 1
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal
    SATD_ACCUM m6, m0, m7
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal
    SATD_END_SSE2 m6, m7

cglobal pixel_satd_8x16, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal

cglobal pixel_satd_8x8, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal

cglobal pixel_satd_8x4, 4,6,8
    SATD_START_SSE2 m6, m7
    call %%pixel_satd_8x4_internal
%endmacro ; SATDS_SSE2
%endif ; HIGH_BIT_DEPTH

; sse2 doesn't seem to like the horizontal way of doing things
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)

;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
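;
; sa8d is the 8x8 analog of satd: an 8x8 Hadamard of the difference block,
; summing absolute coefficients. The C reference scales the raw transform sum
; as (sum+2)>>2, which is the normalization these functions reproduce.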
cglobal pixel_sa8d_8x8_internal
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11

cglobal pixel_sa8d_8x8, 4,8,12
    call pixel_sa8d_8x8_internal
%endif ; HIGH_BIT_DEPTH

cglobal pixel_sa8d_16x16, 4,8,12
    call pixel_sa8d_8x8_internal ; pix[0]
    add     r2, 8*SIZEOF_PIXEL
    add     r0, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8]
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub     r2, 8*SIZEOF_PIXEL
    sub     r0, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8*stride]

%if HIGH_BIT_DEPTH == 0
cglobal pixel_sa8d_8x8_internal
%define spill0 [esp+4]
%define spill1 [esp+20]
%define spill2 [esp+36]
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V 4, 5, 6, 7, 3
    HADAMARD4_V 0, 1, 2, 3, 7
    SUMSUB_BADC w, 0, 4, 1, 5, 7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    SUMSUB_BADC w, 2, 6, 3, 7, 4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
%endif ; ifndef mmx2

cglobal pixel_sa8d_8x8, 4,7
    call pixel_sa8d_8x8_internal
%endif ; HIGH_BIT_DEPTH

cglobal pixel_sa8d_16x16, 4,7
    call pixel_sa8d_8x8_internal
    call pixel_sa8d_8x8_internal
    add     r0, 8*SIZEOF_PIXEL
    add     r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal
    mova    [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal
%else ; !HIGH_BIT_DEPTH
    paddusw m0, [esp+64-mmsize]
%endif ; HIGH_BIT_DEPTH
%endif ; !ARCH_X86_64
;=============================================================================
; SA8D_SATD
;=============================================================================
; %1: vertical/horizontal mode
; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
; m6, m11-15: tmp regs
%macro SA8D_SATD_8x4 5
    LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
    HADAMARD 0, sumsub, %2, %3, 6
    HADAMARD 0, sumsub, %4, %5, 6
    SBUTTERFLY wd, %2, %3, 6
    SBUTTERFLY wd, %4, %5, 6
    HADAMARD2_2D %2, %4, %3, %5, 6, dq
    HADAMARD 0, sumsub, %2, %3, 6
    HADAMARD 0, sumsub, %4, %5, 6
    SBUTTERFLY qdq, 12, 13, 6
    HADAMARD 0, amax, 12, 13, 6
    SBUTTERFLY qdq, 14, 15, 6
    HADAMARD 0, amax, 14, 15, 6
    LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
    HADAMARD4_V %2, %3, %4, %5, 6
    pabsw   m12, m%2 ; doing the abs first is a slight advantage
    HADAMARD 1, max, 12, 14, 6, 11
    HADAMARD 1, max, 13, 15, 6, 11
%endmacro ; SA8D_SATD_8x4

; %1: add spilled regs?
%macro SA8D_SATD_ACCUM 2

%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
cglobal pixel_sa8d_satd_8x8_internal
    SA8D_SATD_8x4 vertical, 0, 1, 2, 3
    SA8D_SATD_8x4 vertical, 4, 5, 8, 9
%if vertical ; sse2-style
    HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
    HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
%else ; complete sa8d
    SUMSUB_BADC w, 0, 4, 1, 5, 12
    HADAMARD 2, sumsub, 0, 4, 12, 11
    HADAMARD 2, sumsub, 1, 5, 12, 11
    SUMSUB_BADC w, 2, 8, 3, 9, 12
    HADAMARD 2, sumsub, 2, 8, 12, 11
    HADAMARD 2, sumsub, 3, 9, 12, 11
    HADAMARD 1, amax, 0, 4, 12, 11
    HADAMARD 1, amax, 1, 5, 12, 4
    HADAMARD 1, amax, 2, 8, 12, 4
    HADAMARD 1, amax, 3, 9, 12, 4
    ; create sa8d sub results

;-------------------------------------------------------------------------------
; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
;-------------------------------------------------------------------------------
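;
; Computes sa8d and satd of the same 16x16 block in one pass, sharing the
; loads and the first transform stages, and returns the two sums packed into
; a single uint64_t (one 32-bit result in each half).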
cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
%define temp0 [rsp+0*mmsize]
%define temp1 [rsp+1*mmsize]
    call pixel_sa8d_satd_8x8_internal
    SA8D_SATD_ACCUM 0, 1
    call pixel_sa8d_satd_8x8_internal
    SA8D_SATD_ACCUM 1, 0
    vextracti128 xm1, m0, 1
    vextracti128 xm2, m10, 1
    lea     r6, [r2+8*SIZEOF_PIXEL]
    lea     r7, [r0+8*SIZEOF_PIXEL]
    call pixel_sa8d_satd_8x8_internal
    SA8D_SATD_ACCUM 0, 1
    call pixel_sa8d_satd_8x8_internal
    SA8D_SATD_ACCUM 1, 1
    call pixel_sa8d_satd_8x8_internal
    SA8D_SATD_ACCUM 1, 1
    call pixel_sa8d_satd_8x8_internal
    SA8D_SATD_ACCUM 1, 0
; xop already has fast horizontal sums
%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
    pmaddwd xm10, [pw_1]
    phaddd  xm0, xm10       ; sa8d1 sa8d2 satd1 satd2
    pshufd  xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1
    paddd   xm0, xm1        ; sa8d  sa8d  satd  satd
%endmacro ; SA8D_SATD
;=============================================================================
; INTRA SATD
;=============================================================================
; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
; and are only retained for old cpus.
%macro INTRA_SA8D_SSE2 0
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
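;
; The x3 functions score three intra prediction modes against the same source
; block in one call; as the stores at the end show, res[0] is V, res[1] is H
; and res[2] is DC.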
cglobal intra_sa8d_x3_8x8, 3,3,14
    movq    m0, [r0+0*FENC_STRIDE]
    movq    m1, [r0+1*FENC_STRIDE]
    movq    m2, [r0+2*FENC_STRIDE]
    movq    m3, [r0+3*FENC_STRIDE]
    movq    m4, [r0+4*FENC_STRIDE]
    movq    m5, [r0+5*FENC_STRIDE]
    movq    m6, [r0+6*FENC_STRIDE]
    movq    m7, [r0+7*FENC_STRIDE]
    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
    ABSW2   m8,  m9,  m2, m3, m2, m3
    ABSW2   m10, m11, m4, m5, m4, m5
    ABSW2   m10, m11, m6, m7, m6, m7
    ; 1D hadamard of edges
    HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
    HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
    pshuflw m10, m8, q2301
    pshuflw m11, m9, q2301
    pshufhw m10, m10, q2301
    pshufhw m11, m11, q2301
    pmullw  m8, [pw_pmpmpmpm]
    pmullw  m11, [pw_pmpmpmpm]
    psllw   m8, 3 ; left edge
    ABSW2   m8, m10, m8, m10, m11, m12 ; 1x8 sum
    punpcklqdq m0, m4 ; transpose
    psllw   m9, 3 ; top edge
    psrldq  m2, m13, 2 ; 8x7 sum
    psubw   m0, m9 ; 8x1 sum
    punpckhdq m3, m2, m8
    pshufd  m5, m13, q3311
    punpckhqdq m0, m2, m5
    movq    [r2], m0 ; i8x8_v, i8x8_h
    movd    [r2+8], m0 ; i8x8_dc
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2

; out: m0..m3 = hadamard coefs
cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
    mova    m0, [r0+0*FENC_STRIDEB]
    mova    m1, [r0+1*FENC_STRIDEB]
    mova    m2, [r0+2*FENC_STRIDEB]
    mova    m3, [r0+3*FENC_STRIDEB]
    movd    m0, [r0+0*FENC_STRIDE]
    movd    m1, [r0+1*FENC_STRIDE]
    movd    m2, [r0+2*FENC_STRIDE]
    movd    m3, [r0+3*FENC_STRIDE]
    HADAMARD4_2D 0, 1, 2, 3, 4

%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
    mova    %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
    movd    %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
    shl     %2d, 5 ; log2(FDEC_STRIDEB)
    movd    %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
    pinsrw  %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
    pinsrw  %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
    pinsrw  %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
%if HIGH_BIT_DEPTH == 0
%define %%sign psignw
%define %%sign pmullw
    pshufw  %4, %3, q1032
    %%sign  %4, [pw_ppmmppmm]
    pshufw  %4, %3, q2301
    %%sign  %4, [pw_pmpmpmpm]
    mova    [%1_1d+2*%2], %3

%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pshufw  %4, %1, q1032
    pshufw  %5, %2, q1032
    pshufw  %6, %3, q1032
    pshufw  %4, %1, q1032
    pshufw  %5, %2, q1032
    pshufw  %6, %3, q1032
    ABSW2   m4, m5, m1, m2, m1, m2

; out: m0 v, m4 h, m5 dc
%macro SUM4x3 3 ; dc, left, top
    punpckldq m0, m2 ; transpose
    ABSW2   m4, m5, m4, m5, m2, m3 ; 1x4 sum
    ABSW    m0, m0, m1 ; 4x1 sum

%macro INTRA_X3_MMX 0
;-----------------------------------------------------------------------------
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_4x4, 3,3
; stack is 16 byte aligned because abi says so
%define top_1d  rsp-8  ; size 8
%define left_1d rsp-16 ; size 8
; WIN64:  stack is 16 byte aligned because abi says so
; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
%define top_1d rsp+8
    SCALAR_HADAMARD left, 0, m4, m5
    SCALAR_HADAMARD top, 0, m6, m5, m7
    pand    m6, [sw_f0] ; dc
    SUM4x3 m6, [left_1d], [top_1d]
    psrlq   m1, 16 ; 4x3 sum
    SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
    movd    [r2+0], m0 ; i4x4_v satd
    movd    [r2+4], m4 ; i4x4_h satd
    movd    [r2+8], m5 ; i4x4_dc satd

;-----------------------------------------------------------------------------
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_16x16, 0,5
%assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+64 ; size 56
%define top_1d  rsp+32 ; size 32
%define left_1d rsp    ; size 32
    SCALAR_HADAMARD left, r3, m0, m1
    SCALAR_HADAMARD top, r3, m1, m2, m3
    pand    m6, [sw_f0] ; dc
    SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
    paddw   m0, [sums+ 0] ; i16x16_v satd
    paddw   m4, [sums+ 8] ; i16x16_h satd
    paddw   m5, [sums+16] ; i16x16_dc satd
    add     r0, 4*SIZEOF_PIXEL
    punpckhwd m3, m5, m7
    add     r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
    HADDD   m5, m7 ; DC satd
    HADDD   m4, m7 ; H satd
    HADDD   m0, m7 ; the part of V satd that doesn't overlap with DC
    psrlq   m1, 32 ; DC[1]
    paddd   m0, m3 ; DC[2]
    psrlq   m3, 32 ; DC[3]
    SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
    movd    [r2+8], m5 ; i16x16_dc satd
    movd    [r2+4], m4 ; i16x16_h satd
    movd    [r2+0], m0 ; i16x16_v satd

;-----------------------------------------------------------------------------
; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_8x8c, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+48 ; size 24
%define dc_1d   rsp+32 ; size 16
%define top_1d  rsp+16 ; size 16
%define left_1d rsp    ; size 16
    SCALAR_HADAMARD left, r3, m0, m1
    SCALAR_HADAMARD top, r3, m0, m1, m2
    movzx   t0d, word [left_1d+0]
    movzx   r3d, word [top_1d+0]
    movzx   r4d, word [left_1d+8]
    movzx   r5d, word [top_1d+8]
    lea     t0d, [t0 + r3 + 16]
    lea     r3d, [r4 + r5 + 16]
    mov     [dc_1d+ 0], t0d ; tl
    mov     [dc_1d+ 4], r5d ; tr
    mov     [dc_1d+ 8], r4d ; bl
    mov     [dc_1d+12], r3d ; br
    SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)]
    paddw   m0, [sums+16] ; i4x4_v satd
    paddw   m4, [sums+8]  ; i4x4_h satd
    paddw   m5, [sums+0]  ; i4x4_dc satd
    add     r0, 4*SIZEOF_PIXEL
    add     r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    movd    [r2+0], m0 ; i8x8c_dc satd
    movd    [r2+4], m1 ; i8x8c_h satd
    movd    [r2+8], m2 ; i8x8c_v satd
%endmacro ; INTRA_X3_MMX

%macro PRED4x4_LOWPASS 5
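; %1 = (%2 + 2*%4 + %3 + 2) >> 2 -- the standard H.264 3-tap lowpass filter
; that produces the "F" (filtered) edge samples referenced in the diagrams
; below; %5 is a temporary. (Formula stated here for reference; the body
; implements it with pavgb plus an lsb correction.)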
%macro INTRA_X9_PRED 2
    movu    m1, [r1-1*FDEC_STRIDE-8]
    pinsrb  m1, [r1+3*FDEC_STRIDE-1], 0
    pinsrb  m1, [r1+2*FDEC_STRIDE-1], 1
    pinsrb  m1, [r1+1*FDEC_STRIDE-1], 2
    pinsrb  m1, [r1+0*FDEC_STRIDE-1], 3
    movd    mm0, [r1+3*FDEC_STRIDE-4]
    punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
    movd    mm1, [r1+1*FDEC_STRIDE-4]
    punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
    movu    m1, [r1-1*FDEC_STRIDE-8]
    movss   m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
    pshufb  m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
    psrldq  m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
    psrldq  m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
    pavgb   m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __
    PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
    ; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2
    ; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1
    ; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0
    ; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt
    pshufb  m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
    pshufb  m3, m0, [%1_ddlr2] ; rows 2,3
    ; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2
    ; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3
    ; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3
    ; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3
    pslldq  m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
    palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
    pshufb  m6, m7, [%1_hdu1]
    pshufb  m7, m7, [%1_hdu2]
    ; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4
    ; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4
    ; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5
    ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
    psrldq  m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
    palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
    pshufb  m4, m5, [%1_vrl1]
    pshufb  m5, m5, [%1_vrl2]
%endmacro ; INTRA_X9_PRED

%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
    pshufb  m2, m%1, [intrax9b_vh1]
    pshufb  m3, m%1, [intrax9b_vh2]
    mova    [pred_buf+0x60], m2
    mova    [pred_buf+0x70], m3
    pshufb  m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
    pmaddubsw m%1, [hmul_4p]
    pshufhw m0, m%1, q2301
    pshuflw m0, m0, q2301
    psignw  m%1, [pw_pmpmpmpm]
    psllw   m0, 2 ; hadamard(top), hadamard(left)
    pshufb  m1, m0, [intrax9b_v1]
    pshufb  m2, m0, [intrax9b_v2]
    psignw  m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
    pand    m0, [sw_f0] ; dc
    ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
    ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs.
    ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef.
    HADAMARD 0, sumsub, %2, %3, %4, %5
    HADAMARD 1, sumsub, %2, %3, %4, %5
    imul    r3d, 0x01010101
    mov     [pred_buf+0x80], r3d
    mov     [pred_buf+0x88], r3d
    mov     [pred_buf+0x90], r3d
    mov     [pred_buf+0x98], r3d
    SBUTTERFLY qdq, 3, 0, 2
    pmaddwd m1, [pw_1] ; v, _, h, dc
%endmacro ; INTRA_X9_VHDC

%macro INTRA_X9_END 2
    phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
    ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
    paddw   m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
    ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
    paddw   m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
    pshuflw m1, m0, q0032
    pshuflw m1, m0, q0001
    ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
    ; 1<<12: undo sign manipulation
    lea     eax, [rax+r2+(1<<16)+(1<<12)]
    ; output the predicted samples
    movzx   r2d, byte [r2+r3]
    movzx   r2d, byte [%2_lut+r3]
    movq    mm0, [pred_buf+r2]
    movq    mm1, [pred_buf+r2+16]
    movd    [r1+0*FDEC_STRIDE], mm0
    movd    [r1+2*FDEC_STRIDE], mm1
    movd    [r1+1*FDEC_STRIDE], mm0
    movd    [r1+3*FDEC_STRIDE], mm1
    mov     r3d, [pred_buf+r2+8*i]
    mov     [r1+i*FDEC_STRIDE], r3d
%endmacro ; INTRA_X9_END

;-----------------------------------------------------------------------------
; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
cglobal intra_sad_x9_4x4, 3,4,9
%assign pad 0xc0-gprsize-(stack_offset&15)
%define pred_buf rsp
    INTRA_X9_PRED intrax9a, m8
    INTRA_X9_PRED intrax9a, [rsp+0xa0]
    movd    m0, [r0+0*FENC_STRIDE]
    pinsrd  m0, [r0+1*FENC_STRIDE], 1
    movd    m1, [r0+2*FENC_STRIDE]
    pinsrd  m1, [r0+3*FENC_STRIDE], 1
    movd    mm0, [r0+0*FENC_STRIDE]
    punpckldq mm0, [r0+1*FENC_STRIDE]
    movd    mm1, [r0+2*FENC_STRIDE]
    punpckldq mm1, [r0+3*FENC_STRIDE]
%define %%zero [pb_0]
    pshufb  m3, m7, [intrax9a_vh1]
    pshufb  m5, m7, [intrax9a_vh2]
    pshufb  m7, [intrax9a_dc]
    movzx   r3d, word [r2]
    punpckhqdq m3, m0 ; h, dc
    shufps  m3, m2, q2020
    INTRA_X9_END 1, intrax9a

;-----------------------------------------------------------------------------
; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
cglobal intra_satd_x9_4x4, 3,4,16
%assign pad 0xb0-gprsize-(stack_offset&15)
%define pred_buf rsp
    INTRA_X9_PRED intrax9b, m15
    movd    m8,  [r0+0*FENC_STRIDE]
    movd    m9,  [r0+1*FENC_STRIDE]
    movd    m10, [r0+2*FENC_STRIDE]
    movd    m11, [r0+3*FENC_STRIDE]
    pshufd  m1, m2, q3232
    call .satd_8x4 ; ddr, ddl
    pshufd  m3, m5, q3232
    pshufd  m1, m4, q3232
    call .satd_8x4 ; vr, vl
    pshufd  m3, m7, q3232
    pshufd  m1, m6, q3232
    call .satd_8x4 ; hd, hu
    punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't
    mova    m1, [pw_ppmmppmm]
    INTRA_X9_VHDC 15, 8, 10, 6, 7
%if notcpuflag(sse4)
    pshufhw m0, m0, q3120 ; compensate for different order in unpack
    movzx   r0d, word [r2]
    INTRA_X9_END 0, intrax9b
    RESET_MM_PERMUTATION
    SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
    pshufd  m1, m0, q0032
    paddd   xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free

%else ; !ARCH_X86_64
cglobal intra_satd_x9_4x4, 3,4,8
%assign pad 0x120-gprsize-(stack_offset&15)
%define fenc_buf rsp
%define pred_buf rsp+0x40
%define spill    rsp+0xe0
    INTRA_X9_PRED intrax9b, [spill+0x20]
    mova    [pred_buf+0x00], m2
    mova    [pred_buf+0x10], m3
    mova    [pred_buf+0x20], m4
    mova    [pred_buf+0x30], m5
    mova    [pred_buf+0x40], m6
    mova    [pred_buf+0x50], m7
    movd    m4, [r0+0*FENC_STRIDE]
    movd    m5, [r0+1*FENC_STRIDE]
    movd    m6, [r0+2*FENC_STRIDE]
    movd    m0, [r0+3*FENC_STRIDE]
    mova    [fenc_buf+0x00], m4
    mova    [fenc_buf+0x10], m5
    mova    [fenc_buf+0x20], m6
    mova    [fenc_buf+0x30], m0
    pshufd  m1, m2, q3232
    call .satd_8x4b ; ddr, ddl
    mova    m3, [pred_buf+0x30]
    mova    m1, [pred_buf+0x20]
    movq    [spill+0x08], m0
    call .satd_8x4 ; vr, vl
    mova    m3, [pred_buf+0x50]
    mova    m1, [pred_buf+0x40]
    movq    [spill+0x10], m0
    call .satd_8x4 ; hd, hu
    movq    [spill+0x18], m0
    mova    m1, [spill+0x20]
    mova    m4, [fenc_buf+0x00]
    mova    m5, [fenc_buf+0x20]
    mova    m2, [pw_ppmmppmm]
    paddw   m4, [fenc_buf+0x10]
    paddw   m5, [fenc_buf+0x30]
    INTRA_X9_VHDC 1, 4, 5, 6, 7
    punpckhqdq m1, [spill+0x00]
    packssdw m1, [spill+0x10]
    pshufhw m1, m1, q3120
    pshufhw m0, m0, q3120
    movzx   r0d, word [r2]
    INTRA_X9_END 0, intrax9b
    RESET_MM_PERMUTATION
%xdefine fenc_buf fenc_buf+gprsize
    psubw   m0, [fenc_buf+0x00]
    psubw   m1, [fenc_buf+0x10]
    psubw   m2, [fenc_buf+0x20]
    psubw   m3, [fenc_buf+0x30]
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
    pshufd  m1, m0, q0032
%endmacro ; INTRA_X9
;-----------------------------------------------------------------------------
; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
;-----------------------------------------------------------------------------
cglobal intra_sad_x9_8x8, 5,6,9
%assign padbase 0x10
%assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
%define pred(i,j) [rsp+i*0x40+j*0x10+padbase]
    movq    fenc02, [r0+FENC_STRIDE* 0]
    movq    fenc13, [r0+FENC_STRIDE* 1]
    movq    fenc46, [r0+FENC_STRIDE* 4]
    movq    fenc57, [r0+FENC_STRIDE* 5]
    movhps  fenc02, [r0+FENC_STRIDE* 2]
    movhps  fenc13, [r0+FENC_STRIDE* 3]
    movhps  fenc46, [r0+FENC_STRIDE* 6]
    movhps  fenc57, [r0+FENC_STRIDE* 7]
    ; save instruction size: avoid 4-byte memory offsets
    lea     r0, [intra8x9_h1+128]
%define off(m) (r0+m-(intra8x9_h1+128))
    psadbw  m1, m0, fenc02
    psadbw  m2, m0, fenc13
    psadbw  m3, m0, fenc46
    psadbw  m0, m0, fenc57
    pshufb  m1, m0, [off(intra8x9_h1)]
    pshufb  m2, m0, [off(intra8x9_h2)]
    pshufb  m3, m0, [off(intra8x9_h3)]
    pshufb  m2, m0, [off(intra8x9_h4)]
    lea     r5, [rsp+padbase+0x100]
%define pred(i,j) [r5+i*0x40+j*0x10-0x100]
    psadbw  m1, m0, fenc02
    psadbw  m2, m0, fenc13
    psadbw  m3, m0, fenc46
    psadbw  m0, m0, fenc57
    ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
    ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
    ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
    ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
    ; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
    ; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
    ; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
    ; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
    pavgb   m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
    PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
    pshufb  m1, m0, [off(intra8x9_ddl1)]
    pshufb  m2, m0, [off(intra8x9_ddl2)]
    pshufb  m2, m0, [off(intra8x9_ddl3)]
    pshufb  m2, m0, [off(intra8x9_ddl4)]
    ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
    ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
    ; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
    ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
    ; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
    ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
    ; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
    ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
    pshufb  m1, m3, [off(intra8x9_vl1)]
    pshufb  m2, m0, [off(intra8x9_vl2)]
    pshufb  m3, m3, [off(intra8x9_vl3)]
    pshufb  m0, m0, [off(intra8x9_vl4)]
    pextrw  [r4+14], m0, 0
    lea     r5, [rsp+padbase+0x100]
    ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
    ; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
    ; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
    ; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
    ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
    ; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
    ; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
    ; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
    pavgb   m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
    PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
    pshufb  m1, m0, [off(intra8x9_ddr1)]
    pshufb  m2, m0, [off(intra8x9_ddr2)]
    pshufb  m2, m0, [off(intra8x9_ddr3)]
    pshufb  m2, m0, [off(intra8x9_ddr4)]
%define off(m) (r0+m-(intra8x9_h1+256+128))
%define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]
    ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
    ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
    ; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
    ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
    ; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
    ; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
    ; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
    ; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
    movsd   m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
    pshufb  m1, m2, [off(intra8x9_vr1)]
    pshufb  m2, m2, [off(intra8x9_vr3)]
    pshufb  m2, m0, [off(intra8x9_vr2)]
    pshufb  m2, m0, [off(intra8x9_vr4)]
    ; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
    ; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
    ; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
    ; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
    ; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
    ; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
    ; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
    ; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
    pshufd  m2, m3, q0001
    pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
    punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
    pshufb  m1, m2, [off(intra8x9_hd1)]
    pshufb  m2, m2, [off(intra8x9_hd2)]
    pshufb  m2, m0, [off(intra8x9_hd3)]
    pshufb  m3, m0, [off(intra8x9_hd4)]
    ; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
    ; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
    ; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
    ; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6
    ; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
    ; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
    ; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
    ; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
    ; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
    pinsrb  m0, [r2+7], 15 ; Gl7
    pshufb  m1, m0, [off(intra8x9_hu1)]
    pshufb  m2, m0, [off(intra8x9_hu2)]
    pshufb  m2, m0, [off(intra8x9_hu3)]
    pshufb  m0, m0, [off(intra8x9_hu4)]
    movzx   r5d, word [r3+16]
    phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl
    ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index
    paddw   m0, [off(pw_s00112233)]
    pshuflw m1, m0, q0032
    ; repack with 3 bit index
    ; reverse to phminposuw order
    add     r1, 4*FDEC_STRIDE
    mova    m0, [rsp+padbase+r2+0x00]
    mova    m1, [rsp+padbase+r2+0x10]
    mova    m2, [rsp+padbase+r2+0x20]
    mova    m3, [rsp+padbase+r2+0x30]
    movq    [r1+FDEC_STRIDE*-4], m0
    movhps  [r1+FDEC_STRIDE*-2], m0
    movq    [r1+FDEC_STRIDE*-3], m1
    movhps  [r1+FDEC_STRIDE*-1], m1
    movq    [r1+FDEC_STRIDE* 0], m2
    movhps  [r1+FDEC_STRIDE* 2], m2
    movq    [r1+FDEC_STRIDE* 1], m3
    movhps  [r1+FDEC_STRIDE* 3], m3

;-----------------------------------------------------------------------------
; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x9_8x8, 5,6,16
%assign pad 0x2c0+0x10-gprsize-(stack_offset&15)
%define fenc_buf rsp
%define pred_buf rsp+0x80
    movddup m %+ %%i, [r0+%%i*FENC_STRIDE]
    pmaddubsw m9, m %+ %%i, m15
    punpcklbw m %+ %%i, m8
    mova    [fenc_buf+%%i*0x10], m9
    ; save instruction size: avoid 4-byte memory offsets
    lea     r0, [intra8x9_h1+0x80]
%define off(m) (r0+m-(intra8x9_h1+0x80))
    lea     r5, [pred_buf+0x80]
    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
    ; 1D hadamard of edges
    pshufb  m9, [intrax3_shuf]
    pmaddubsw m8, [pb_pppm]
    pmaddubsw m9, [pb_pppm]
    HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm]
    HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm]
    psllw   m8, 3 ; left edge
    pabsw   m8, m8 ; 1x8 sum
    punpcklqdq m0, m4 ; transpose
    psllw   m9, 3 ; top edge
    psrldq  m10, m11, 2 ; 8x7 sum
    psubw   m0, m9 ; 8x1 sum
    phaddd  m10, m8 ; logically phaddw, but this is faster and it won't overflow
    pshufb  m0, m3, [off(intra8x9_h1)]
    pshufb  m1, m3, [off(intra8x9_h2)]
    pshufb  m2, m3, [off(intra8x9_h3)]
    pshufb  m3, m3, [off(intra8x9_h4)]
    PRED4x4_LOWPASS m8, m1, m2, m8, m3
    pshufb  m0, m8, [off(intra8x9_ddl1)]
    pshufb  m1, m8, [off(intra8x9_ddl2)]
    pshufb  m2, m8, [off(intra8x9_ddl3)]
    pshufb  m3, m8, [off(intra8x9_ddl4)]
    pshufb  m0, m9, [off(intra8x9_vl1)]
    pshufb  m1, m8, [off(intra8x9_vl2)]
    pshufb  m2, m9, [off(intra8x9_vl3)]
    pshufb  m3, m8, [off(intra8x9_vl4)]
    PRED4x4_LOWPASS m8, m1, m2, m8, m3
    pshufb  m0, m8, [off(intra8x9_ddr1)]
    pshufb  m1, m8, [off(intra8x9_ddr2)]
    pshufb  m2, m8, [off(intra8x9_ddr3)]
    pshufb  m3, m8, [off(intra8x9_ddr4)]
%define off(m) (r0+m-(intra8x9_h1+0x180))
    pshufb  m0, m2, [off(intra8x9_vr1)]
    pshufb  m1, m8, [off(intra8x9_vr2)]
    pshufb  m2, m2, [off(intra8x9_vr3)]
    pshufb  m3, m8, [off(intra8x9_vr4)]
    pshufd  m1, m9, q0001
    pblendw m1, m8, q3330
    pshufd  m2, m9, q0001
    pshufb  m0, m1, [off(intra8x9_hd1)]
    pshufb  m1, m1, [off(intra8x9_hd2)]
    pshufb  m2, m8, [off(intra8x9_hd3)]
    pshufb  m3, m8, [off(intra8x9_hd4)]
    pinsrb  m8, [r2+7], 15
    pshufb  m0, m8, [off(intra8x9_hu1)]
    pshufb  m1, m8, [off(intra8x9_hu2)]
    pshufb  m2, m8, [off(intra8x9_hu3)]
    pshufb  m3, m8, [off(intra8x9_hu4)]
    pshuflw m1, m0, q0032
    movzx   r5d, word [r3+16]
    ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
    paddw   m0, [off(pw_s00001111)]
    pshuflw m1, m0, q0032
    pcmpgtw m2, m1 ; 2nd index bit
    ; repack with 3 bit index
    lea     r3d, [r3*4+r4+1]
    ; reverse to phminposuw order
    add     r1, 4*FDEC_STRIDE
    mova    m0, [pred_buf+r2+0x00]
    mova    m1, [pred_buf+r2+0x10]
    mova    m2, [pred_buf+r2+0x20]
    mova    m3, [pred_buf+r2+0x30]
    movq    [r1+FDEC_STRIDE*-4], m0
    movhps  [r1+FDEC_STRIDE*-2], m0
    movq    [r1+FDEC_STRIDE*-3], m1
    movhps  [r1+FDEC_STRIDE*-1], m1
    movq    [r1+FDEC_STRIDE* 0], m2
    movhps  [r1+FDEC_STRIDE* 2], m2
    movq    [r1+FDEC_STRIDE* 1], m3
    movhps  [r1+FDEC_STRIDE* 3], m3
%xdefine fenc_buf fenc_buf+gprsize
    PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3
    psubw   m0, [fenc_buf+0x00]
    psubw   m1, [fenc_buf+0x10]
    psubw   m2, [fenc_buf+0x20]
    psubw   m3, [fenc_buf+0x30]
    psubw   m4, [fenc_buf+0x40]
    psubw   m5, [fenc_buf+0x50]
    psubw   m6, [fenc_buf+0x60]
    psubw   m7, [fenc_buf+0x70]
    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14
%endif ; ARCH_X86_64
%endmacro ; INTRA8_X9
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
cglobal hadamard_ac_4x4
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    HADAMARD4_2D 0, 1, 2, 3, 4

cglobal hadamard_ac_2x2max
    SUMSUB_BADC w, 0, 1, 2, 3, 4
    ABSW2   m0, m2, m0, m2, m4, m5
    ABSW2   m1, m3, m1, m3, m4, m5
    HADAMARD 0, max, 0, 2, 4, 5
    HADAMARD 0, max, 1, 3, 4, 5
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH

cglobal hadamard_ac_8x8
%endif ; HIGH_BIT_DEPTH
    call hadamard_ac_4x4_mmx2
    add     r0, 4*SIZEOF_PIXEL
    call hadamard_ac_4x4_mmx2
    call hadamard_ac_4x4_mmx2
    sub     r0, 4*SIZEOF_PIXEL
    call hadamard_ac_4x4_mmx2
    mova    [rsp+gprsize+8], m5 ; save satd
    call hadamard_ac_2x2max_mmx2
    SUMSUB_BADC w, 0, 1, 2, 3, 4
    HADAMARD 0, sumsub, 0, 2, 4, 5
    ABSW2   m1, m3, m1, m3, m4, m5
    ABSW2   m0, m2, m0, m2, m4, m5
    HADAMARD 0, max, 1, 3, 4, 5
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    mova    [rsp+gprsize], m6 ; save sa8d

%macro HADAMARD_AC_WXH_SUM_MMX 2
    mova    m1, [rsp+1*mmsize]
    paddd   m0, [rsp+2*mmsize]
    paddd   m1, [rsp+3*mmsize]
    mova    m2, [rsp+4*mmsize]
    paddd   m1, [rsp+5*mmsize]
    paddd   m2, [rsp+6*mmsize]
    paddd   m1, [rsp+7*mmsize]
%else ; !HIGH_BIT_DEPTH
    paddusw m0, [rsp+2*mmsize]
    paddusw m1, [rsp+3*mmsize]
    mova    m2, [rsp+4*mmsize]
    paddusw m1, [rsp+5*mmsize]
    paddusw m2, [rsp+6*mmsize]
    paddusw m1, [rsp+7*mmsize]
%endif ; HIGH_BIT_DEPTH

%macro HADAMARD_AC_WXH_MMX 2
cglobal pixel_hadamard_ac_%1x%2, 2,4
%assign pad 16-gprsize-(stack_offset&15)
    call hadamard_ac_8x8_mmx2
    call hadamard_ac_8x8_mmx2
    lea     r0, [r0+ysub*4+8*SIZEOF_PIXEL]
    call hadamard_ac_8x8_mmx2
    call hadamard_ac_8x8_mmx2
    HADAMARD_AC_WXH_SUM_MMX %1, %2
    add     rsp, 128+%1*%2/4+pad
%endmacro ; HADAMARD_AC_WXH_MMX

HADAMARD_AC_WXH_MMX 16, 16
HADAMARD_AC_WXH_MMX  8, 16
HADAMARD_AC_WXH_MMX 16,  8
HADAMARD_AC_WXH_MMX  8,  8
3962 %macro LOAD_INC_8x4W_SSE2 5
3971 %else ; !HIGH_BIT_DEPTH
3983 %endif ; HIGH_BIT_DEPTH
3986 %macro LOAD_INC_8x4W_SSSE3 5
3987 LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
3991 HSUMSUB %1, %2, %3, %4, %5
3994 %macro HADAMARD_AC_SSE2 0
3995 ; in: r0=pix, r1=stride, r2=stride*3
3996 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
3997 cglobal hadamard_ac_8x8
4003 %define spill0 [rsp+gprsize]
4004 %define spill1 [rsp+gprsize+mmsize]
4005 %define spill2 [rsp+gprsize+mmsize*2]
4009 %elif cpuflag(ssse3) && notcpuflag(atom)
4011 ;LOAD_INC loads sumsubs
4015 ;LOAD_INC only unpacks to words
4018 LOAD_INC_8x4W 0, 1, 2, 3, 7
4020 HADAMARD4_2D_SSE 0, 1, 2, 3, 4
4022 HADAMARD4_V 0, 1, 2, 3, 4
4026 LOAD_INC_8x4W 4, 5, 6, 7, 1
4028 HADAMARD4_2D_SSE 4, 5, 6, 7, 1
4030 HADAMARD4_V 4, 5, 6, 7, 1
4035 HADAMARD 1, sumsub, 0, 1, 6, 7
4036 HADAMARD 1, sumsub, 2, 3, 6, 7
4041 HADAMARD 1, sumsub, 4, 5, 1, 0
4042 HADAMARD 1, sumsub, 6, 7, 1, 0
4055 pand m1, [mask_ac4b]
4059 AC_PADD m1, m3, [pw_1]
4061 AC_PADD m1, m2, [pw_1]
4063 AC_PADD m1, m3, [pw_1]
4065 AC_PADD m1, m2, [pw_1]
4067 AC_PADD m1, m3, [pw_1]
4068 AC_PADD m1, m2, [pw_1]
4069 paddw m3, m7, spill2
4071 mova [rsp+gprsize+mmsize*2], m1 ; save satd
4072 paddw m2, m6, spill1
4074 paddw m1, m5, spill0
4081 HADAMARD %%x, amax, 3, 7, 4
4082 HADAMARD %%x, amax, 2, 6, 7, 4
4084 HADAMARD %%x, amax, 1, 5, 6, 7
4085 HADAMARD %%x, sumsub, 0, 4, 5, 6
4087 AC_PADD m2, m3, [pw_1]
4088 AC_PADD m2, m1, [pw_1]
4093 %endif ; HIGH_BIT_DEPTH
4097 AC_PADD m2, m4, [pw_1]
4098 AC_PADD m2, m0, [pw_1]
4099 mova [rsp+gprsize+mmsize], m2 ; save sa8d
4104 HADAMARD_AC_WXH_SSE2 16, 16
4105 HADAMARD_AC_WXH_SSE2 16, 8
4107 HADAMARD_AC_WXH_SSE2 8, 16
4108 HADAMARD_AC_WXH_SSE2 8, 8
4110 %endmacro ; HADAMARD_AC_SSE2
4112 %macro HADAMARD_AC_WXH_SUM_SSE2 2
4113 mova m1, [rsp+2*mmsize]
4116 paddd m0, [rsp+3*mmsize]
4117 paddd m1, [rsp+4*mmsize]
4120 paddd m0, [rsp+5*mmsize]
4121 paddd m1, [rsp+6*mmsize]
4122 paddd m0, [rsp+7*mmsize]
4123 paddd m1, [rsp+8*mmsize]
4128 %else ; !HIGH_BIT_DEPTH
4129 %if %1*%2*16/mmsize >= 128
4130 paddusw m0, [rsp+3*mmsize]
4131 paddusw m1, [rsp+4*mmsize]
4133 %if %1*%2*16/mmsize == 256
4134 paddusw m0, [rsp+5*mmsize]
4135 paddusw m1, [rsp+6*mmsize]
4136 paddusw m0, [rsp+7*mmsize]
4137 paddusw m1, [rsp+8*mmsize]
4141 vextracti128 xm2, m0, 1
4142 vextracti128 xm3, m1, 1
4148 %endif ; HIGH_BIT_DEPTH
; struct { int satd, sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
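;
; Roughly (a hedged sketch; the scalar reference lives in pixel.c): per 8x8
; block, hadamard_ac_8x8 produces a satd-style sum of |coefficients| over the
; four 4x4 Hadamard transforms minus their DC terms, and an sa8d-style sum
; over the single 8x8 Hadamard minus its DC. The WXH wrapper below sums the
; per-block partials spilled to the stack and normalizes both totals before
; packing them into the two halves of the return value.
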
%macro HADAMARD_AC_WXH_SSE2 2
cglobal pixel_hadamard_ac_%1x%2, 2,4,11
    and rsp, ~(mmsize-1)
    call hadamard_ac_8x8
    call hadamard_ac_8x8
%if %1==16 && mmsize <= 16
    lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
    call hadamard_ac_8x8
    call hadamard_ac_8x8
    HADAMARD_AC_WXH_SUM_SSE2 %1, %2
    shr edx, 2 - (%1*%2*16/mmsize >> 8)
%endmacro ; HADAMARD_AC_WXH_SSE2

%if ARCH_X86_64 == 0
cextern pixel_sa8d_8x8_internal_mmx2

%define TRANS TRANS_SSE2
%define DIFFOP DIFF_UNPACK_SSE2
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
%if HIGH_BIT_DEPTH == 0
%if HIGH_BIT_DEPTH == 0

%define DIFFOP DIFF_SUMSUB_SSSE3
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%if HIGH_BIT_DEPTH == 0
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
%if HIGH_BIT_DEPTH == 0

%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
%if HIGH_BIT_DEPTH == 0
%define TRANS TRANS_SSE4
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
%if HIGH_BIT_DEPTH == 0

; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
; it's effectively free.
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%if HIGH_BIT_DEPTH == 0

%define TRANS TRANS_XOP
%if HIGH_BIT_DEPTH == 0
; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
%if HIGH_BIT_DEPTH == 0
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
%define TRANS TRANS_SSE4
%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
    vinserti128 m%1, m%1, [r0+4*r1], 1
    vinserti128 m%3, m%3, [r2+4*r3], 1
    vinserti128 m%2, m%2, [r0+r4], 1
    vinserti128 m%4, m%4, [r2+r5], 1
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
    vinserti128 m%3, m%3, [r0+4*r1], 1
    vinserti128 m%5, m%5, [r2+4*r3], 1
    vinserti128 m%4, m%4, [r0+r4], 1
    vinserti128 m%6, m%6, [r2+r5], 1
    DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7

%macro SATD_START_AVX2 2-3 0
%define TRANS TRANS_SSE4

cglobal pixel_satd_16x8_internal
    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6

cglobal pixel_satd_16x16, 4,6,8
    SATD_START_AVX2 m6, m7
    call pixel_satd_16x8_internal
satd_16x8_internal:
    call pixel_satd_16x8_internal
    vextracti128 xm0, m6, 1

cglobal pixel_satd_16x8, 4,6,8
    SATD_START_AVX2 m6, m7
    jmp satd_16x8_internal
cglobal pixel_satd_8x8_internal
    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6

cglobal pixel_satd_8x16, 4,6,8
    SATD_START_AVX2 m6, m7, 1
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal
    vextracti128 xm0, m6, 1

cglobal pixel_satd_8x8, 4,6,8
    SATD_START_AVX2 m6, m7, 1
    call pixel_satd_8x8_internal
    vextracti128 xm0, m6, 1

cglobal pixel_sa8d_8x8_internal
    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
    HADAMARD4_V 0, 1, 2, 3, 4
    HADAMARD 8, sumsub, 0, 1, 4, 5
    HADAMARD 8, sumsub, 2, 3, 4, 5
    HADAMARD 2, sumsub, 0, 1, 4, 5
    HADAMARD 2, sumsub, 2, 3, 4, 5
    HADAMARD 1, amax, 0, 1, 4, 5
    HADAMARD 1, amax, 2, 3, 4, 5
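    ; the amax passes use max(|a+b|, |a-b|) = |a|+|b| to fuse the final
    ; butterfly stage with the absolute-value accumulation of the
    ; transform coefficients.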

cglobal pixel_sa8d_8x8, 4,6,8
    SATD_START_AVX2 m6, m7, 1
    call pixel_sa8d_8x8_internal
    vextracti128 xm1, m6, 1

cglobal intra_sad_x9_8x8, 5,7,8
%define pred(i,j) [rsp+i*0x40+j*0x20]
    movu m5, [r0+0*FENC_STRIDE]
    movu m6, [r0+4*FENC_STRIDE]
    punpcklqdq m5, [r0+2*FENC_STRIDE]
    punpcklqdq m6, [r0+6*FENC_STRIDE]
    ; save instruction size: avoid 4-byte memory offsets
    lea r0, [intra8x9_h1+128]
%define off(m) (r0+m-(intra8x9_h1+128))
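; r0 is biased 128 bytes into the table so that every off(...) operand
; resolves to [r0+disp] with -128 <= disp < 128, i.e. a 1-byte displacement.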
    vpbroadcastq m0, [r2+16]
    vpbroadcastq m1, [r2+7]
    pshufb m3, m1, [off(intra8x9_h1)]
    pshufb m2, m1, [off(intra8x9_h3)]
%define pred(i,j) [r5+i*0x40+j*0x20-0x100]
    ; combine the first two
    vbroadcasti128 m0, [r2+16]
    vbroadcasti128 m2, [r2+17]
    PRED4x4_LOWPASS m0, m1, m2, m0, m7
    pshufb m1, m0, [off(intra8x9_ddl1)]
    pshufb m2, m0, [off(intra8x9_ddl3)]
    vextracti128 xm1, m4, 1
    vinserti128 m7, m3, xm0, 1
    vbroadcasti128 m2, [r2+8]
    vbroadcasti128 m0, [r2+7]
    vbroadcasti128 m1, [r2+6]
    PRED4x4_LOWPASS m0, m1, m2, m0, m4
    pshufb m1, m0, [off(intra8x9_ddr1)]
    pshufb m2, m0, [off(intra8x9_ddr3)]
%define off(m) (r0+m-(intra8x9_h1+256+128))
%define pred(i,j) [r5+i*0x40+j*0x20-0x1C0]
    vpblendd m2, m3, m0, 11110011b
    pshufb m1, m2, [off(intra8x9_vr1)]
    pshufb m2, m2, [off(intra8x9_vr3)]
    pblendw m2, m0, q3330
    pshufb m1, m2, [off(intra8x9_hd1)]
    pshufb m2, m0, [off(intra8x9_hd3)]
    pshufb m1, m7, [off(intra8x9_vl1)]
    pshufb m2, m7, [off(intra8x9_vl3)]
    vextracti128 xm1, m4, 1
    SBUTTERFLY qdq, 3, 4, 7
    vpbroadcastd m0, [r2+7]
    pshufb m1, m0, [off(intra8x9_hu1)]
    pshufb m2, m0, [off(intra8x9_hu3)]
    vextracti128 xm2, m1, 1
    add r2w, word [r3+16]
    add r1, 4*FDEC_STRIDE
    mova xm0, [rsp+r3+0x00]
    mova xm1, [rsp+r3+0x10]
    mova xm2, [rsp+r3+0x20]
    mova xm3, [rsp+r3+0x30]
    movq   [r1+FDEC_STRIDE*-4], xm0
    movhps [r1+FDEC_STRIDE*-2], xm0
    movq   [r1+FDEC_STRIDE*-3], xm1
    movhps [r1+FDEC_STRIDE*-1], xm1
    movq   [r1+FDEC_STRIDE* 0], xm2
    movhps [r1+FDEC_STRIDE* 2], xm2
    movq   [r1+FDEC_STRIDE* 1], xm3
    movhps [r1+FDEC_STRIDE* 3], xm3
%endif ; HIGH_BIT_DEPTH
;=============================================================================
; SSIM
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
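; A scalar sketch of what one call produces (hedged; it mirrors the C
; reference in pixel.c): statistics for two horizontally adjacent 4x4 blocks.
;
;     for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;     {
;         int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;             {
;                 int a = pix1[x+y*stride1], b = pix2[x+y*stride2];
;                 s1 += a; s2 += b; ss += a*a + b*b; s12 += a*b;
;             }
;         sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
;     }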
    movdqu m5, [r0+(%1&1)*r1]
    movdqu m6, [r2+(%1&1)*r3]
    movq m5, [r0+(%1&1)*r1]
    movq m6, [r2+(%1&1)*r3]
    ACCUM paddd, 3, 5, %1
    ACCUM paddd, 4, 7, %1

cglobal pixel_ssim_4x4x2_core, 4,4,8
    pshufd m5, m3, q2301
    pshufd m6, m4, q2301
    pshufd m1, m1, q3120
    punpckhdq m5, m3, m4

;-----------------------------------------------------------------------------
; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
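; Each lane evaluates one window whose statistics (s1 = sum(a), s2 = sum(b),
; ss = sum(a*a)+sum(b*b), s12 = sum(a*b), 64 samples per window) were
; aggregated by the caller. A sketch of the per-lane math below, using the
; ssim_c1/ssim_c2 constants from the data section:
;
;     vars  = ss*64 - s1*s1 - s2*s2
;     covar = s12*64 - s1*s2
;     ssim  = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
;           / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
;
; The four lane results are then summed into the float return value.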
cglobal pixel_ssim_end4, 3,3,7
    movdqa m5, [ssim_c1]
    movdqa m6, [ssim_c2]
    TRANSPOSE4x4D 0, 1, 2, 3, 4
    ; s1=m0, s2=m1, ss=m2, s12=m3
    mulps m2, [pf_64]  ; ss*64
    mulps m3, [pf_128] ; s12*128
    mulps m4, m0 ; s1*s2
    mulps m1, m1 ; s2*s2
    mulps m0, m0 ; s1*s1
    addps m4, m4 ; s1*s2*2
    addps m0, m1 ; s1*s1 + s2*s2
    subps m3, m4 ; covar*2
    addps m4, m5 ; s1*s2*2 + ssim_c1
    addps m0, m5 ; s1*s1 + s2*s2 + ssim_c1
    addps m2, m6 ; vars + ssim_c2
    addps m3, m6 ; covar*2 + ssim_c2
    pmaddwd m4, m1, m0 ; s1*s2
    pmaddwd m0, m0 ; s1*s1 + s2*s2
    psubd m3, m4 ; covar*2
    cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
    je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
    lea r3, [mask_ff + 16]
    movdqu m1, [r3 + r2*4]
    movdqu m1, [mask_ff + r2*4 + 16]
    pshuflw m4, m0, q0032
%if ARCH_X86_64 == 0

;-----------------------------------------------------------------------------
; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
;-----------------------------------------------------------------------------
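; A scalar sketch (hedged; the reference lives in pixel.c). Signed
; differences are accumulated across the whole 8-wide block and the absolute
; value is taken once at the end, so differences of opposite sign cancel:
;
;     int sum = 0;
;     for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
;         for( int x = 0; x < 8; x++ )
;             sum += pix1[x] - pix2[x];
;     return abs( sum );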
cglobal pixel_asd8, 5,5

;=============================================================================
; Successive Elimination ADS
;=============================================================================
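; Successive elimination prunes motion-search candidates without computing
; their SAD: |sum(enc) - sum(ref)| <= SAD(enc, ref), so the absolute
; difference of precomputed block sums, plus the candidate's MV cost, is a
; lower bound, and any candidate whose bound already exceeds the threshold
; is rejected.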
    lea r6, [r4+r5+(mmsize-1)]

%macro ADS_END 1 ; unroll_size
    WIN64_RESTORE_XMM rsp
    lea r6, [r4+r5+(mmsize-1)]

;-----------------------------------------------------------------------------
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
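; A scalar sketch (hedged; it mirrors the C reference in pixel.c). enc_dc[]
; holds the four sub-block sums of the encoded block, sums[] the precomputed
; sub-block sums of the reference plane, so each |difference| lower-bounds
; one partial SAD:
;
;     int nmv = 0;
;     for( int i = 0; i < width; i++, sums++ )
;     {
;         int ads = abs( enc_dc[0] - sums[0] )
;                 + abs( enc_dc[1] - sums[8] )
;                 + abs( enc_dc[2] - sums[delta] )
;                 + abs( enc_dc[3] - sums[delta+8] )
;                 + cost_mvx[i];
;         if( ads < thresh )
;             mvs[nmv++] = i;
;     }
;     return nmv;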
cglobal pixel_ads4, 5,7
    pshufw m6, m6, q2222
    pshufw m4, m4, q2222

cglobal pixel_ads2, 5,7
    pshufw m6, m6, q2222

cglobal pixel_ads1, 5,7

cglobal pixel_ads4, 5,7,8
    vpbroadcastw m7, [r0+ 0]
    vpbroadcastw m6, [r0+ 4]
    vpbroadcastw m5, [r0+ 8]
    vpbroadcastw m4, [r0+12]

cglobal pixel_ads4, 5,7,12
    pshuflw m7, m4, q0000
    pshuflw m6, m4, q2222
    pshufhw m5, m4, q0000
    pshufhw m4, m4, q2222
%if ARCH_X86_64 && mmsize == 16
    movu m11, [r1+r2+16]
    vpbroadcastw m1, r6m
    vpermq m1, m1, q3120

cglobal pixel_ads2, 5,7,8
    vpbroadcastw m7, [r0+0]
    vpbroadcastw m6, [r0+4]
    vpbroadcastw m5, r6m
    pshuflw m6, m6, q2222
    vpermq m1, m1, q3120

cglobal pixel_ads1, 5,7,8
    vpbroadcastw m7, [r0]
    vpbroadcastw m6, r6m
    movu m1, [r1+mmsize]
    movu m3, [r3+mmsize]
    vpermq m4, m4, q3120
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv = 0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i=0; i<width; i+=8 )
;     {
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j=0; j<8; j++ )
;             if( mask & (255<<j*8) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }
    test r2d, 0xff<<(%1*8)

cglobal pixel_ads_mvs, 0,7,0
    ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)

cglobal pixel_ads_mvs, 0,7,0
    mova m4, [pw_76543210]
%define GLOBAL +r1-$$
    xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
    movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
    ; shuffle counters based on mv mask
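    ; (ads_mvs_shuffle appears to act as a compaction LUT indexed by the
    ; mask: it left-packs the word indices of the surviving candidates so
    ; they can be stored contiguously into mvs[])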
    pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
    paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}