;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
mask_ff:   times 16 db 0xff
           times 16 db 0
%if BIT_DEPTH == 10
ssim_c1:   times 4 dd 6697.7856    ; .01*.01*1023*1023*64
ssim_c2:   times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
pf_64:     times 4 dd 64.0
pf_128:    times 4 dd 128.0
%elif BIT_DEPTH == 9
ssim_c1:   times 4 dd 1671         ; .01*.01*511*511*64
ssim_c2:   times 4 dd 947556       ; .03*.03*511*511*64*63
%else ; 8-bit
ssim_c1:   times 4 dd 416          ; .01*.01*255*255*64
ssim_c2:   times 4 dd 235963       ; .03*.03*255*255*64*63
%endif
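; Where the constants come from, as a hedged C sketch (not part of the build):
; the standard SSIM stabilizers c1 = (.01*L)^2 and c2 = (.03*L)^2, pre-scaled
; because the code below works on 64-sample sums rather than means (c2 also
; carries the extra factor of 63 from the variance of sums):
;
;     #include <stdio.h>
;     int main( void )
;     {
;         for( int bit_depth = 8; bit_depth <= 10; bit_depth++ )
;         {
;             double L = (1 << bit_depth) - 1;  /* peak sample value */
;             printf( "%2d-bit: c1 = %.4f, c2 = %.4f\n", bit_depth,
;                     .01*.01*L*L*64,           /* matches ssim_c1 */
;                     .03*.03*L*L*64*63 );      /* matches ssim_c2 */
;         }
;         return 0;
;     }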
mask_ac4:      dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b:     dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8:      dw 0, -1, -1, -1, -1, -1, -1, -1
hmul_4p:       times 2 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10:       times 4 dw 0, -1
mask_1100:     times 2 dd 0, -1
pb_pppm:       times 4 db 1,1,1,-1
deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
intrax3_shuf:  db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0

intrax9a_ddlr1: db  6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6
intrax9a_ddlr2: db  8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4
intrax9a_hdu1:  db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0
intrax9a_hdu2:  db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11
intrax9a_vrl1:  db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8
intrax9a_vrl2:  db  2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
intrax9a_vh1:   db  6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
intrax9a_vh2:   db  6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
intrax9a_dc:    db  1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
intrax9a_lut:   db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0
pw_s01234567:   dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
pw_s01234657:   dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
intrax9_edge:   db  0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15

intrax9b_ddlr1: db  6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6
intrax9b_ddlr2: db  8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4
intrax9b_hdu1:  db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
intrax9b_hdu2:  db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
intrax9b_vrl1:  db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
intrax9b_vrl2:  db  2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
intrax9b_vh1:   db  6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3
intrax9b_vh2:   db  6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1
intrax9b_edge2: db  6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
intrax9b_v1:    db  0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
intrax9b_v2:    db  2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
intrax9b_lut:   db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0

intra8x9_h1:   db  7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
intra8x9_h2:   db  6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
intra8x9_h3:   db  3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
intra8x9_h4:   db  2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0
intra8x9_ddl1: db  1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10
intra8x9_ddl2: db  2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11
intra8x9_ddl3: db  5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14
intra8x9_ddl4: db  6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15
intra8x9_vl1:  db  0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
intra8x9_vl2:  db  1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9
intra8x9_vl3:  db  2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10
intra8x9_vl4:  db  3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11
intra8x9_ddr1: db  8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13
intra8x9_ddr2: db  7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12
intra8x9_ddr3: db  4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9
intra8x9_ddr4: db  3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8
intra8x9_vr1:  db  8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14
intra8x9_vr2:  db  8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14
intra8x9_vr3:  db  5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12
intra8x9_vr4:  db  4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12
intra8x9_hd1:  db  3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10
intra8x9_hd2:  db  2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8
intra8x9_hd3:  db  7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10
intra8x9_hd4:  db  5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8
intra8x9_hu1:  db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2
intra8x9_hu2:  db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0
intra8x9_hu3:  db  5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15
intra8x9_hu4:  db  3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15
pw_s00112233:  dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003
pw_s00001111:  dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001

transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15

sq_0f: dq 0xffffffff, 0
pd_f0: times 4 dd 0xffff0000
SECTION .text

;=============================================================================
; SSD
;=============================================================================
;-----------------------------------------------------------------------------
; int pixel_ssd_MxN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
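; Reference semantics, as a hedged C sketch (not part of the build; the real
; C fallback lives in pixel.c). Each specialization below computes this for
; one fixed MxN:
;
;     #include <stdint.h>
;     static int ssd_MxN( const uint16_t *p1, intptr_t s1,
;                         const uint16_t *p2, intptr_t s2, int M, int N )
;     {
;         int ssd = 0;
;         for( int y = 0; y < N; y++, p1 += s1, p2 += s2 )
;             for( int x = 0; x < M; x++ )
;             {
;                 int d = p1[x] - p2[x];
;                 ssd += d*d; /* sum of squared differences */
;             }
;         return ssd;
;     }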
cglobal pixel_ssd_%1x%2, 4,7,6
%define offset0_2 r1*2
%define offset1_2 r3*2
%define offset0_1 mmsize
%define offset0_3 r1+mmsize
%define offset1_1 mmsize
%define offset1_3 r3+mmsize
%define offset0_1 mmsize
%define offset0_2 mmsize*2
%define offset0_3 mmsize*3
%define offset1_1 mmsize
%define offset1_2 mmsize*2
%define offset1_3 mmsize*3
%assign %%n %2/(2*mmsize/%1)
    mova    m2, [r0+offset0_1]
    mova    m3, [r0+offset0_2]
    mova    m4, [r0+offset0_3]
    psubw   m2, [r2+offset1_1]
    psubw   m3, [r2+offset1_2]
    psubw   m4, [r2+offset1_3]
    lea     r0, [r0+r1*(%2/%%n)]
    lea     r2, [r2+r3*(%2/%%n)]
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
%macro SSD_LOAD_FULL 5
    DEINTB %2, %1, %4, %3, 7
%macro SSD_LOAD_HALF 5
    LOAD 1, 2, [t0+%1], [t0+%3], 1
    JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
    LOAD 3, 4, [t0+%1], [t0+%3], %5
    JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
    punpcklbw m%2, m%1, m%5
    punpcklbw m%4, m%3, m%5
%macro SSD_CORE_SSE2 7-8
    DEINTB %6, %1, %7, %2, %5
    DEINTB %6, %3, %7, %4, %5
%macro SSD_CORE_SSSE3 7-8
    punpckhbw m%6, m%1, m%2
    punpckhbw m%7, m%3, m%4
    SSD_LOAD_%1 %2,%3,%4,%5,%6
    SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1

;-----------------------------------------------------------------------------
; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%assign function_align 8
%assign function_align 16
cglobal pixel_ssd_%1x%2, 0,0,0
    mov     al, %1*%2/mmsize/2
    jmp mangle(x264_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
DECLARE_REG_TMP 0,1,2,3
DECLARE_REG_TMP 1,2,3,4
%elifidn cpuname, sse2
    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
    SSD_ITER FULL, 0, 0, t1, t3, 2
    SSD_ITER HALF, 0, 0, t1, t3, 2
%define SSD_CORE SSD_CORE_SSE2
%define JOIN     JOIN_SSE2
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN     JOIN_SSSE3
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; The maximum width this function can handle without risk of overflow is given
; by the following equation: (mmsize in bits)
;
;   2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
;
; For 10-bit MMX this means overflow becomes possible only at widths >= 16416,
; and for XMM at widths >= 32832. At sane distortion levels it will take much
; more than that though.
;-----------------------------------------------------------------------------
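; A hedged sanity check of the bound above in C (not part of the build):
;
;     #include <stdio.h>
;     int main( void )
;     {
;         const int mmsize_bits[2] = { 64, 128 }; /* MMX, XMM */
;         for( int i = 0; i < 2; i++ )
;         {
;             double limit = 2.0 * (mmsize_bits[i]/32)
;                          * 4294967295.0      /* 2^32 - 1     */
;                          / (1023.0*1023.0);  /* (2^10 - 1)^2 */
;             printf( "mmsize=%3d: width limit ~ %.0f\n", mmsize_bits[i], limit );
;         }
;         return 0; /* prints ~16416 for MMX and ~32832 for XMM */
;     }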
cglobal pixel_ssd_nv12_core, 6,7,7
    mova    m1, [r0+r6+mmsize]
    psubw   m1, [r2+r6+mmsize]
    PSHUFLW m0, m0, q3120
    PSHUFLW m1, m1, q3120
    pshufhw m0, m0, q3120
    pshufhw m1, m1, q3120
%if mmsize == 16 ; using HADDD would remove the mmsize/32 part from the
                 ; equation above, putting the width limit at 8208
%else ; unfortunately paddq is sse2
      ; emulate 48 bit precision for mmx2 instead
%else ; fixup for mmx2
    SBUTTERFLY dq, 4, 5, 0
    SBUTTERFLY dq, 0, 5, 4
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; This implementation can potentially overflow on image widths >= 11008 (or
; 6604 if interlaced), since it is called on blocks of height up to 12 (resp.
; 20). At sane distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
cglobal pixel_ssd_nv12_core, 6,7
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
; variance
;=============================================================================
    pxor    m6, m6 ; sum squared
%if HIGH_BIT_DEPTH == 0
%endif ; !HIGH_BIT_DEPTH
%if mmsize == 8 && %1*%2 == 256
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    mova    m4, [r0+%1+mmsize]
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
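; Return-value layout, as a hedged C sketch (this mirrors x264's C fallback:
; sum in the low 32 bits, sum of squares in the high 32 bits; the caller
; derives the variance as sqr - (sum*sum >> shift), cf. the comment below):
;
;     #include <stdint.h>
;     static uint64_t var_wxh( const uint8_t *pix, intptr_t stride, int w, int h )
;     {
;         uint32_t sum = 0, sqr = 0;
;         for( int y = 0; y < h; y++, pix += stride )
;             for( int x = 0; x < w; x++ )
;             {
;                 sum += pix[x];
;                 sqr += pix[x] * pix[x];
;             }
;         return sum + ((uint64_t)sqr << 32);
;     }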
cglobal pixel_var_16x16, 2,3
    VAR_2ROW 8*SIZEOF_PIXEL, 16
cglobal pixel_var_8x16, 2,3
cglobal pixel_var_8x8, 2,3
cglobal pixel_var_16x16, 2,3,8
cglobal pixel_var_8x8, 2,3,8
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
cglobal pixel_var_16x16, 2,3,8
cglobal pixel_var_8x8, 2,4,8
cglobal pixel_var_8x16, 2,4,8
%endif ; !HIGH_BIT_DEPTH
    sub     eax, r1d ; sqr - (sum * sum >> shift)
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
;-----------------------------------------------------------------------------
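; Hedged C sketch of the semantics (variance of the difference signal; the
; sum of squared differences is also returned through the int pointer):
;
;     #include <stdint.h>
;     static int var2_8x8( const uint8_t *p1, intptr_t s1,
;                          const uint8_t *p2, intptr_t s2, int *ssd )
;     {
;         int sum = 0, sqr = 0;
;         for( int y = 0; y < 8; y++, p1 += s1, p2 += s2 )
;             for( int x = 0; x < 8; x++ )
;             {
;                 int d = p1[x] - p2[x];
;                 sum += d;
;                 sqr += d*d;
;             }
;         *ssd = sqr;
;         return sqr - (sum*sum >> 6); /* 6 = log2(8*8); the 8x16 variant uses 7 */
;     }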
%macro VAR2_8x8_MMX 2
cglobal pixel_var2_8x%1, 5,6
    psubw   m1, [r2+mmsize]
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH

%macro VAR2_8x8_SSE2 2
cglobal pixel_var2_8x%1, 5,6,8
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    lea     r0, [r0+r1*2*SIZEOF_PIXEL]
    lea     r2, [r2+r3*2*SIZEOF_PIXEL]

%if HIGH_BIT_DEPTH == 0
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 5,6,8
    pxor    m6, m6 ; sum squared
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
; SATD
;=============================================================================
; just use shufps on anything post conroe
; join 2x 32 bit and duplicate them
; emulating shufps is faster on conroe
; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
%macro DIFF_UNPACK_SSE2 5
%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5
%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
%macro LOAD_DUP_4x8P_PENRYN 8
; penryn and nehalem run punpcklqdq and movddup in different units
%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    DEINTB %1, %2, %3, %4, %5
    SUMSUB_BA w, %1, %2, %3
%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5

; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
%macro SATD_4x4_MMX 3
%assign offset %2*SIZEOF_PIXEL
    LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
    LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
    LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
    LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n

%macro SATD_8x4_SSE 8-9
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
    HADAMARD4_V %2, %3, %4, %5, %6
    ; doing the abs first is a slight advantage
    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
    HADAMARD 1, max, %3, %5, %6, %7

%macro SATD_START_MMX 0
    lea     r4, [3*r1] ; 3*stride1
    lea     r5, [3*r3] ; 3*stride2

%macro SATD_END_MMX 0
%else ; !HIGH_BIT_DEPTH
    pshufw  m1, m0, q1032
    pshufw  m1, m0, q2301
%endif ; HIGH_BIT_DEPTH

; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.

;-----------------------------------------------------------------------------
; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
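; What SATD computes, as a hedged plain-C sketch (the asm below produces the
; same quantity with packed arithmetic, building larger sizes from these
; 4x4/8x4 kernels):
;
;     #include <stdint.h>
;     #include <stdlib.h>
;     static int satd_4x4_ref( const uint8_t *p1, intptr_t s1,
;                              const uint8_t *p2, intptr_t s2 )
;     {
;         int d[4][4], sum = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;                 d[y][x] = p1[y*s1+x] - p2[y*s2+x];
;         for( int y = 0; y < 4; y++ ) /* horizontal 4-point hadamard */
;         {
;             int a0 = d[y][0]+d[y][1], a1 = d[y][0]-d[y][1];
;             int a2 = d[y][2]+d[y][3], a3 = d[y][2]-d[y][3];
;             d[y][0] = a0+a2; d[y][1] = a1+a3;
;             d[y][2] = a0-a2; d[y][3] = a1-a3;
;         }
;         for( int x = 0; x < 4; x++ ) /* vertical hadamard + abs-sum */
;         {
;             int a0 = d[0][x]+d[1][x], a1 = d[0][x]-d[1][x];
;             int a2 = d[2][x]+d[3][x], a3 = d[2][x]-d[3][x];
;             sum += abs(a0+a2) + abs(a1+a3) + abs(a0-a2) + abs(a1-a3);
;         }
;         return sum >> 1;
;     }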
cglobal pixel_satd_16x4_internal
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0
    SATD_4x4_MMX m2, 8, 0
    SATD_4x4_MMX m1, 12, 0

cglobal pixel_satd_8x8_internal
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 1
pixel_satd_8x4_internal_mmx2:
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0

%macro SATD_MxN_MMX 3
cglobal pixel_satd_%1x%2, 4,7
    call pixel_satd_%1x%3_internal_mmx2
    call pixel_satd_%1x%3_internal_mmx2
SATD_MxN_MMX 16, 16, 4
SATD_MxN_MMX 16,  8, 4
SATD_MxN_MMX  8, 16, 8
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
cglobal pixel_satd_16x16, 4,6
    call pixel_satd_16x4_internal_mmx2
    call pixel_satd_16x4_internal_mmx2

cglobal pixel_satd_16x8, 4,6
    call pixel_satd_16x4_internal_mmx2
    call pixel_satd_16x4_internal_mmx2

cglobal pixel_satd_8x16, 4,6
    call pixel_satd_8x8_internal_mmx2
    call pixel_satd_8x8_internal_mmx2
%endif ; !HIGH_BIT_DEPTH

cglobal pixel_satd_8x8, 4,6
    call pixel_satd_8x8_internal_mmx2

cglobal pixel_satd_8x4, 4,6
    call pixel_satd_8x4_internal_mmx2

cglobal pixel_satd_4x16, 4,6
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 1
    SATD_4x4_MMX m1, 0, 1
    SATD_4x4_MMX m1, 0, 0

cglobal pixel_satd_4x8, 4,6
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 0

cglobal pixel_satd_4x4, 4,6
    SATD_4x4_MMX m0, 0, 0

%macro SATD_START_SSE2 2-3 0
%if HIGH_BIT_DEPTH && %3
%elif cpuflag(ssse3)
%macro SATD_END_SSE2 1-2
%macro BACKUP_POINTERS 0
%macro RESTORE_AND_INC_POINTERS 0
    lea     r0, [r6+8*SIZEOF_PIXEL]
    lea     r2, [r7+8*SIZEOF_PIXEL]
    add     r0, 8*SIZEOF_PIXEL
    add     r2, 8*SIZEOF_PIXEL

%macro SATD_4x8_SSE 2
    movhps  m0, [r0+4*r1]
    movhps  m4, [r2+4*r3]
    movhps  m1, [r0+1*r1]
    movhps  m5, [r2+1*r3]
    movhps  m2, [r0+2*r1]
    movhps  m6, [r2+2*r3]
%else ; !HIGH_BIT_DEPTH
%if cpuflag(ssse3) && %1==1
    DIFFOP 0, 4, 1, 5, 3
    DIFFOP 0, 4, 1, 5, 7
%if cpuflag(ssse3) && %1==1
    DIFFOP 2, 6, 3, 5, 4
    DIFFOP 2, 6, 3, 5, 7
%endif ; HIGH_BIT_DEPTH
    SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 7, %2

;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_satd_4x4, 4, 6, 6
    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3

cglobal pixel_satd_4x8, 4, 6, 8
%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
    SATD_4x8_SSE 0, swap

cglobal pixel_satd_4x16, 4, 6, 8
%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
    SATD_4x8_SSE 0, swap
    lea     r0, [r0+r1*2*SIZEOF_PIXEL]
    lea     r2, [r2+r3*2*SIZEOF_PIXEL]

cglobal pixel_satd_8x8_internal
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6

%if HIGH_BIT_DEPTH == 0 && UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
cglobal pixel_satd_16x4_internal
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
    SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10

cglobal pixel_satd_16x8, 4,6,12
    SATD_START_SSE2 m10, m7
%if notcpuflag(ssse3)
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x16, 4,6,12
    SATD_START_SSE2 m10, m7
%if notcpuflag(ssse3)
    call pixel_satd_16x4_internal
    call pixel_satd_16x4_internal
%%pixel_satd_16x8_internal:
    call pixel_satd_16x4_internal
    call pixel_satd_16x4_internal

cglobal pixel_satd_16x8, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal

cglobal pixel_satd_16x16, 4,6,8
    SATD_START_SSE2 m6, m7, 1
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal
    SATD_ACCUM m6, m0, m7
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal
    SATD_END_SSE2 m6, m7

cglobal pixel_satd_8x16, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal
    call pixel_satd_8x8_internal

cglobal pixel_satd_8x8, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal

cglobal pixel_satd_8x4, 4,6,8
    SATD_START_SSE2 m6, m7
    call %%pixel_satd_8x4_internal
%endmacro ; SATDS_SSE2
%endif ; HIGH_BIT_DEPTH
%else ; sse2 doesn't seem to like the horizontal way of doing things
%define vertical (cpuflags == cpuflags_sse2)

;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
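; SA8D is the same idea as SATD but with a full 8x8 hadamard, normalized by
; >>2 (with rounding) since each transform dimension doubles the gain. A
; hedged C sketch; butterfly ordering doesn't affect the abs-sum:
;
;     #include <stdint.h>
;     #include <stdlib.h>
;     static void hadamard8( int v[8] )
;     {
;         for( int step = 1; step < 8; step *= 2 )
;             for( int i = 0; i < 8; i += 2*step )
;                 for( int j = i; j < i+step; j++ )
;                 {
;                     int a = v[j], b = v[j+step];
;                     v[j] = a+b;
;                     v[j+step] = a-b;
;                 }
;     }
;     static int sa8d_8x8_ref( const uint8_t *p1, intptr_t s1,
;                              const uint8_t *p2, intptr_t s2 )
;     {
;         int d[8][8], sum = 0;
;         for( int y = 0; y < 8; y++ )
;             for( int x = 0; x < 8; x++ )
;                 d[y][x] = p1[y*s1+x] - p2[y*s2+x];
;         for( int y = 0; y < 8; y++ )
;             hadamard8( d[y] );
;         for( int x = 0; x < 8; x++ )
;         {
;             int col[8];
;             for( int y = 0; y < 8; y++ ) col[y] = d[y][x];
;             hadamard8( col );
;             for( int y = 0; y < 8; y++ ) sum += abs( col[y] );
;         }
;         return (sum + 2) >> 2;
;     }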
cglobal pixel_sa8d_8x8_internal
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11

cglobal pixel_sa8d_8x8, 4,8,12
    call pixel_sa8d_8x8_internal
%endif ; HIGH_BIT_DEPTH

cglobal pixel_sa8d_16x16, 4,8,12
    call pixel_sa8d_8x8_internal ; pix[0]
    add     r2, 8*SIZEOF_PIXEL
    add     r0, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8]
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub     r2, 8*SIZEOF_PIXEL
    sub     r0, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8*stride]

%if HIGH_BIT_DEPTH == 0
cglobal pixel_sa8d_8x8_internal
%define spill0 [esp+4]
%define spill1 [esp+20]
%define spill2 [esp+36]
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V 4, 5, 6, 7, 3
    HADAMARD4_V 0, 1, 2, 3, 7
    SUMSUB_BADC w, 0, 4, 1, 5, 7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    SUMSUB_BADC w, 2, 6, 3, 7, 4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
%endif ; ifndef mmx2

cglobal pixel_sa8d_8x8, 4,7
    call pixel_sa8d_8x8_internal
%endif ; HIGH_BIT_DEPTH

cglobal pixel_sa8d_16x16, 4,7
    call pixel_sa8d_8x8_internal
    call pixel_sa8d_8x8_internal
    add     r0, 8*SIZEOF_PIXEL
    add     r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal
    mova    [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal
%else ; !HIGH_BIT_DEPTH
    paddusw m0, [esp+64-mmsize]
%endif ; HIGH_BIT_DEPTH
%endif ; !ARCH_X86_64
;=============================================================================
; INTRA SATD
;=============================================================================
; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
; and are only retained for old cpus.
%macro INTRA_SA8D_SSE2 0
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x3_8x8, 3,3,14
    movq    m0, [r0+0*FENC_STRIDE]
    movq    m1, [r0+1*FENC_STRIDE]
    movq    m2, [r0+2*FENC_STRIDE]
    movq    m3, [r0+3*FENC_STRIDE]
    movq    m4, [r0+4*FENC_STRIDE]
    movq    m5, [r0+5*FENC_STRIDE]
    movq    m6, [r0+6*FENC_STRIDE]
    movq    m7, [r0+7*FENC_STRIDE]
    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
    ABSW2   m8,  m9,  m2, m3, m2, m3
    ABSW2   m10, m11, m4, m5, m4, m5
    ABSW2   m10, m11, m6, m7, m6, m7
    ; 1D hadamard of edges
    HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
    HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
    pshuflw m10, m8, q2301
    pshuflw m11, m9, q2301
    pshufhw m10, m10, q2301
    pshufhw m11, m11, q2301
    pmullw  m8, [pw_pmpmpmpm]
    pmullw  m11, [pw_pmpmpmpm]
    psllw   m8, 3 ; left edge
    ABSW2   m8, m10, m8, m10, m11, m12 ; 1x8 sum
    punpcklqdq m0, m4 ; transpose
    psllw   m9, 3 ; top edge
    psrldq  m2, m13, 2 ; 8x7 sum
    psubw   m0, m9 ; 8x1 sum
    punpckhdq m3, m2, m8
    pshufd  m5, m13, q3311
    punpckhqdq m0, m2, m5
    movq    [r2], m0 ; i8x8_v, i8x8_h
    movd    [r2+8], m0 ; i8x8_dc
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2

; out: m0..m3 = hadamard coefs
cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
    mova    m0, [r0+0*FENC_STRIDEB]
    mova    m1, [r0+1*FENC_STRIDEB]
    mova    m2, [r0+2*FENC_STRIDEB]
    mova    m3, [r0+3*FENC_STRIDEB]
    movd    m0, [r0+0*FENC_STRIDE]
    movd    m1, [r0+1*FENC_STRIDE]
    movd    m2, [r0+2*FENC_STRIDE]
    movd    m3, [r0+3*FENC_STRIDE]
    HADAMARD4_2D 0, 1, 2, 3, 4

%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
    mova    %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
    movd    %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
    shl     %2d, 5 ; log(FDEC_STRIDEB)
    movd    %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
    pinsrw  %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
    pinsrw  %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
    pinsrw  %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
%if HIGH_BIT_DEPTH == 0
%define %%sign psignw
%define %%sign pmullw
    pshufw  %4, %3, q1032
    %%sign  %4, [pw_ppmmppmm]
    pshufw  %4, %3, q2301
    %%sign  %4, [pw_pmpmpmpm]
    mova    [%1_1d+2*%2], %3

%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pshufw  %4, %1, q1032
    pshufw  %5, %2, q1032
    pshufw  %6, %3, q1032
    pshufw  %4, %1, q1032
    pshufw  %5, %2, q1032
    pshufw  %6, %3, q1032
    ABSW2   m4, m5, m1, m2, m1, m2

; out: m0 v, m4 h, m5 dc
%macro SUM4x3 3 ; dc, left, top
    punpckldq m0, m2 ; transpose
    ABSW2   m4, m5, m4, m5, m2, m3 ; 1x4 sum
    ABSW    m0, m0, m1 ; 4x1 sum

%macro INTRA_X3_MMX 0
;-----------------------------------------------------------------------------
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_4x4, 3,3
; stack is 16 byte aligned because abi says so
%define top_1d  rsp-8  ; size 8
%define left_1d rsp-16 ; size 8
; WIN64: stack is 16 byte aligned because abi says so
; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
%define top_1d rsp+8
    SCALAR_HADAMARD left, 0, m4, m5
    SCALAR_HADAMARD top, 0, m6, m5, m7
    pand    m6, [sw_f0] ; dc
    SUM4x3 m6, [left_1d], [top_1d]
    psrlq   m1, 16 ; 4x3 sum
    SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
    movd    [r2+0], m0 ; i4x4_v satd
    movd    [r2+4], m4 ; i4x4_h satd
    movd    [r2+8], m5 ; i4x4_dc satd

;-----------------------------------------------------------------------------
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_16x16, 0,5
%assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+64 ; size 56
%define top_1d  rsp+32 ; size 32
%define left_1d rsp    ; size 32
    SCALAR_HADAMARD left, r3, m0, m1
    SCALAR_HADAMARD top, r3, m1, m2, m3
    pand    m6, [sw_f0] ; dc
    SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
    paddw   m0, [sums+ 0] ; i16x16_v satd
    paddw   m4, [sums+ 8] ; i16x16_h satd
    paddw   m5, [sums+16] ; i16x16_dc satd
    add     r0, 4*SIZEOF_PIXEL
    punpckhwd m3, m5, m7
    add     r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
    HADDD   m5, m7 ; DC satd
    HADDD   m4, m7 ; H satd
    HADDD   m0, m7 ; the part of V satd that doesn't overlap with DC
    psrlq   m1, 32 ; DC[1]
    paddd   m0, m3 ; DC[2]
    psrlq   m3, 32 ; DC[3]
    SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
    movd    [r2+8], m5 ; i16x16_dc satd
    movd    [r2+4], m4 ; i16x16_h satd
    movd    [r2+0], m0 ; i16x16_v satd

;-----------------------------------------------------------------------------
; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_8x8c, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+48 ; size 24
%define dc_1d   rsp+32 ; size 16
%define top_1d  rsp+16 ; size 16
%define left_1d rsp    ; size 16
    SCALAR_HADAMARD left, r3, m0, m1
    SCALAR_HADAMARD top, r3, m0, m1, m2
    movzx   t0d, word [left_1d+0]
    movzx   r3d, word [top_1d+0]
    movzx   r4d, word [left_1d+8]
    movzx   r5d, word [top_1d+8]
    lea     t0d, [t0 + r3 + 16]
    lea     r3d, [r4 + r5 + 16]
    mov     [dc_1d+ 0], t0d ; tl
    mov     [dc_1d+ 4], r5d ; tr
    mov     [dc_1d+ 8], r4d ; bl
    mov     [dc_1d+12], r3d ; br
    SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)]
    paddw   m0, [sums+16] ; i4x4_v satd
    paddw   m4, [sums+8]  ; i4x4_h satd
    paddw   m5, [sums+0]  ; i4x4_dc satd
    add     r0, 4*SIZEOF_PIXEL
    add     r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    movd    [r2+0], m0 ; i8x8c_dc satd
    movd    [r2+4], m1 ; i8x8c_h satd
    movd    [r2+8], m2 ; i8x8c_v satd
%endmacro ; INTRA_X3_MMX

%macro PRED4x4_LOWPASS 5
%macro INTRA_X9_PRED 2
    movu    m1, [r1-1*FDEC_STRIDE-8]
    pinsrb  m1, [r1+3*FDEC_STRIDE-1], 0
    pinsrb  m1, [r1+2*FDEC_STRIDE-1], 1
    pinsrb  m1, [r1+1*FDEC_STRIDE-1], 2
    pinsrb  m1, [r1+0*FDEC_STRIDE-1], 3
    movd    mm0, [r1+3*FDEC_STRIDE-4]
    punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
    movd    mm1, [r1+1*FDEC_STRIDE-4]
    punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
    movu    m1, [r1-1*FDEC_STRIDE-8]
    movss   m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
    pshufb  m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
    psrldq  m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
    psrldq  m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
    pavgb   m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __
    PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2
; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1
; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0
; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt
    pshufb  m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
    pshufb  m3, m0, [%1_ddlr2] ; rows 2,3
; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2
; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3
; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3
; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3
    pslldq  m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
    palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
    pshufb  m6, m7, [%1_hdu1]
    pshufb  m7, m7, [%1_hdu2]
; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4
; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4
; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5
; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
    psrldq  m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
    palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
    pshufb  m4, m5, [%1_vrl1]
    pshufb  m5, m5, [%1_vrl2]
%endmacro ; INTRA_X9_PRED

%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
    pshufb  m2, m%1, [intrax9b_vh1]
    pshufb  m3, m%1, [intrax9b_vh2]
    mova    [pred_buf+0x60], m2
    mova    [pred_buf+0x70], m3
    pshufb  m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
    pmaddubsw m%1, [hmul_4p]
    pshufhw m0, m%1, q2301
    pshuflw m0, m0, q2301
    psignw  m%1, [pw_pmpmpmpm]
    psllw   m0, 2 ; hadamard(top), hadamard(left)
    pshufb  m1, m0, [intrax9b_v1]
    pshufb  m2, m0, [intrax9b_v2]
    psignw  m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
    pand    m0, [sw_f0] ; dc
; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs.
; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef.
    HADAMARD 0, sumsub, %2, %3, %4, %5
    HADAMARD 1, sumsub, %2, %3, %4, %5
    imul    r3d, 0x01010101
    mov     [pred_buf+0x80], r3d
    mov     [pred_buf+0x88], r3d
    mov     [pred_buf+0x90], r3d
    mov     [pred_buf+0x98], r3d
    SBUTTERFLY qdq, 3, 0, 2
    pmaddwd m1, [pw_1] ; v, _, h, dc
%endmacro ; INTRA_X9_VHDC

%macro INTRA_X9_END 2
    phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
    ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
    paddw   m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
    ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
    paddw   m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
    pshuflw m1, m0, q0032
    pshuflw m1, m0, q0001
    ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
    ; 1<<12: undo sign manipulation
    lea     eax, [rax+r2+(1<<16)+(1<<12)]
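; The packed-min trick used above, as a hedged scalar C sketch: each score
; fits in 13 bits, so (score<<3)+mode packs value and mode into one word and
; a single unsigned min yields both at once (the 0x8000 bias in pw_s0123...
; exists only because pminsw is signed):
;
;     #include <stdint.h>
;     static uint32_t pick_best( const uint16_t scores[8] ) /* scores < 1<<13 */
;     {
;         uint16_t best = 0xffff;
;         for( int i = 0; i < 8; i++ )
;         {
;             uint16_t packed = (scores[i] << 3) + i;
;             if( packed < best )
;                 best = packed; /* min carries the mode in the low 3 bits */
;         }
;         return ((uint32_t)(best >> 3) << 16) | (best & 7); /* cost<<16 | mode */
;     }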
    ; output the predicted samples
    movzx   r2d, byte [r2+r3]
    movzx   r2d, byte [%2_lut+r3]
    movq    mm0, [pred_buf+r2]
    movq    mm1, [pred_buf+r2+16]
    movd    [r1+0*FDEC_STRIDE], mm0
    movd    [r1+2*FDEC_STRIDE], mm1
    movd    [r1+1*FDEC_STRIDE], mm0
    movd    [r1+3*FDEC_STRIDE], mm1
    mov     r3d, [pred_buf+r2+8*i]
    mov     [r1+i*FDEC_STRIDE], r3d
%endmacro ; INTRA_X9_END

;-----------------------------------------------------------------------------
; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
cglobal intra_sad_x9_4x4, 3,4,9
%assign pad 0xc0-gprsize-(stack_offset&15)
%define pred_buf rsp
    INTRA_X9_PRED intrax9a, m8
    INTRA_X9_PRED intrax9a, [rsp+0xa0]
    movd    m0, [r0+0*FENC_STRIDE]
    pinsrd  m0, [r0+1*FENC_STRIDE], 1
    movd    m1, [r0+2*FENC_STRIDE]
    pinsrd  m1, [r0+3*FENC_STRIDE], 1
    movd    mm0, [r0+0*FENC_STRIDE]
    punpckldq mm0, [r0+1*FENC_STRIDE]
    movd    mm1, [r0+2*FENC_STRIDE]
    punpckldq mm1, [r0+3*FENC_STRIDE]
%define %%zero [pb_0]
    pshufb  m3, m7, [intrax9a_vh1]
    pshufb  m5, m7, [intrax9a_vh2]
    pshufb  m7, [intrax9a_dc]
    movzx   r3d, word [r2]
    punpckhqdq m3, m0 ; h, dc
    shufps  m3, m2, q2020
    INTRA_X9_END 1, intrax9a

;-----------------------------------------------------------------------------
; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
cglobal intra_satd_x9_4x4, 3,4,16
%assign pad 0xb0-gprsize-(stack_offset&15)
%define pred_buf rsp
    INTRA_X9_PRED intrax9b, m15
    movd    m8, [r0+0*FENC_STRIDE]
    movd    m9, [r0+1*FENC_STRIDE]
    movd    m10, [r0+2*FENC_STRIDE]
    movd    m11, [r0+3*FENC_STRIDE]
    pshufd  m1, m2, q3232
    call .satd_8x4 ; ddr, ddl
    pshufd  m3, m5, q3232
    pshufd  m1, m4, q3232
    call .satd_8x4 ; vr, vl
    pshufd  m3, m7, q3232
    pshufd  m1, m6, q3232
    call .satd_8x4 ; hd, hu
    punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't
    mova    m1, [pw_ppmmppmm]
    INTRA_X9_VHDC 15, 8, 10, 6, 7
%if notcpuflag(sse4)
    pshufhw m0, m0, q3120 ; compensate for different order in unpack
    movzx   r0d, word [r2]
    INTRA_X9_END 0, intrax9b
    RESET_MM_PERMUTATION
    SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 13, 14, 0, swap
    pshufd  m1, m0, q0032
    paddd   xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free

%else ; !ARCH_X86_64
cglobal intra_satd_x9_4x4, 3,4,8
%assign pad 0x120-gprsize-(stack_offset&15)
%define fenc_buf rsp
%define pred_buf rsp+0x40
%define spill    rsp+0xe0
    INTRA_X9_PRED intrax9b, [spill+0x20]
    mova    [pred_buf+0x00], m2
    mova    [pred_buf+0x10], m3
    mova    [pred_buf+0x20], m4
    mova    [pred_buf+0x30], m5
    mova    [pred_buf+0x40], m6
    mova    [pred_buf+0x50], m7
    movd    m4, [r0+0*FENC_STRIDE]
    movd    m5, [r0+1*FENC_STRIDE]
    movd    m6, [r0+2*FENC_STRIDE]
    movd    m0, [r0+3*FENC_STRIDE]
    mova    [fenc_buf+0x00], m4
    mova    [fenc_buf+0x10], m5
    mova    [fenc_buf+0x20], m6
    mova    [fenc_buf+0x30], m0
    pshufd  m1, m2, q3232
    call .satd_8x4b ; ddr, ddl
    mova    m3, [pred_buf+0x30]
    mova    m1, [pred_buf+0x20]
    movq    [spill+0x08], m0
    call .satd_8x4 ; vr, vl
    mova    m3, [pred_buf+0x50]
    mova    m1, [pred_buf+0x40]
    movq    [spill+0x10], m0
    call .satd_8x4 ; hd, hu
    movq    [spill+0x18], m0
    mova    m1, [spill+0x20]
    mova    m4, [fenc_buf+0x00]
    mova    m5, [fenc_buf+0x20]
    mova    m2, [pw_ppmmppmm]
    paddw   m4, [fenc_buf+0x10]
    paddw   m5, [fenc_buf+0x30]
    INTRA_X9_VHDC 1, 4, 5, 6, 7
    punpckhqdq m1, [spill+0x00]
    packssdw m1, [spill+0x10]
    pshufhw m1, m1, q3120
    pshufhw m0, m0, q3120
    movzx   r0d, word [r2]
    INTRA_X9_END 0, intrax9b
    RESET_MM_PERMUTATION
%xdefine fenc_buf fenc_buf+gprsize
    psubw   m0, [fenc_buf+0x00]
    psubw   m1, [fenc_buf+0x10]
    psubw   m2, [fenc_buf+0x20]
    psubw   m3, [fenc_buf+0x30]
    SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 0, swap
    pshufd  m1, m0, q0032
%endmacro ; INTRA_X9

;-----------------------------------------------------------------------------
; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
;-----------------------------------------------------------------------------
cglobal intra_sad_x9_8x8, 5,6,9
%assign padbase 0x10
%assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
%define pred(i,j) [rsp+i*0x40+j*0x10+padbase]
    movq    fenc02, [r0+FENC_STRIDE* 0]
    movq    fenc13, [r0+FENC_STRIDE* 1]
    movq    fenc46, [r0+FENC_STRIDE* 4]
    movq    fenc57, [r0+FENC_STRIDE* 5]
    movhps  fenc02, [r0+FENC_STRIDE* 2]
    movhps  fenc13, [r0+FENC_STRIDE* 3]
    movhps  fenc46, [r0+FENC_STRIDE* 6]
    movhps  fenc57, [r0+FENC_STRIDE* 7]
    ; save instruction size: avoid 4-byte memory offsets
    lea     r0, [intra8x9_h1+128]
%define off(m) (r0+m-(intra8x9_h1+128))
    psadbw  m1, m0, fenc02
    psadbw  m2, m0, fenc13
    psadbw  m3, m0, fenc46
    psadbw  m0, m0, fenc57
    pshufb  m1, m0, [off(intra8x9_h1)]
    pshufb  m2, m0, [off(intra8x9_h2)]
    pshufb  m3, m0, [off(intra8x9_h3)]
    pshufb  m2, m0, [off(intra8x9_h4)]
    lea     r5, [rsp+padbase+0x100]
%define pred(i,j) [r5+i*0x40+j*0x10-0x100]
    psadbw  m1, m0, fenc02
    psadbw  m2, m0, fenc13
    psadbw  m3, m0, fenc46
    psadbw  m0, m0, fenc57
; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
    pavgb   m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
    PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
    pshufb  m1, m0, [off(intra8x9_ddl1)]
    pshufb  m2, m0, [off(intra8x9_ddl2)]
    pshufb  m2, m0, [off(intra8x9_ddl3)]
    pshufb  m2, m0, [off(intra8x9_ddl4)]
; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
    pshufb  m1, m3, [off(intra8x9_vl1)]
    pshufb  m2, m0, [off(intra8x9_vl2)]
    pshufb  m3, m3, [off(intra8x9_vl3)]
    pshufb  m0, m0, [off(intra8x9_vl4)]
    pextrw  [r4+14], m0, 0
    lea     r5, [rsp+padbase+0x100]
; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
    pavgb   m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
    PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
    pshufb  m1, m0, [off(intra8x9_ddr1)]
    pshufb  m2, m0, [off(intra8x9_ddr2)]
    pshufb  m2, m0, [off(intra8x9_ddr3)]
    pshufb  m2, m0, [off(intra8x9_ddr4)]
%define off(m) (r0+m-(intra8x9_h1+256+128))
%define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]
; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
    movsd   m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
    pshufb  m1, m2, [off(intra8x9_vr1)]
    pshufb  m2, m2, [off(intra8x9_vr3)]
    pshufb  m2, m0, [off(intra8x9_vr2)]
    pshufb  m2, m0, [off(intra8x9_vr4)]
; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
    pshufd  m2, m3, q0001
    pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
    punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
    pshufb  m1, m2, [off(intra8x9_hd1)]
    pshufb  m2, m2, [off(intra8x9_hd2)]
    pshufb  m2, m0, [off(intra8x9_hd3)]
    pshufb  m3, m0, [off(intra8x9_hd4)]
    ; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6
; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
    pinsrb  m0, [r2+7], 15 ; Gl7
    pshufb  m1, m0, [off(intra8x9_hu1)]
    pshufb  m2, m0, [off(intra8x9_hu2)]
    pshufb  m2, m0, [off(intra8x9_hu3)]
    pshufb  m0, m0, [off(intra8x9_hu4)]
    movzx   r5d, word [r3+16]
    phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl
    ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index
    paddw   m0, [off(pw_s00112233)]
    pshuflw m1, m0, q0032
    ; repack with 3 bit index
    ; reverse to phminposuw order
    add     r1, 4*FDEC_STRIDE
    mova    m0, [rsp+padbase+r2+0x00]
    mova    m1, [rsp+padbase+r2+0x10]
    mova    m2, [rsp+padbase+r2+0x20]
    mova    m3, [rsp+padbase+r2+0x30]
    movq    [r1+FDEC_STRIDE*-4], m0
    movhps  [r1+FDEC_STRIDE*-2], m0
    movq    [r1+FDEC_STRIDE*-3], m1
    movhps  [r1+FDEC_STRIDE*-1], m1
    movq    [r1+FDEC_STRIDE* 0], m2
    movhps  [r1+FDEC_STRIDE* 2], m2
    movq    [r1+FDEC_STRIDE* 1], m3
    movhps  [r1+FDEC_STRIDE* 3], m3
;-----------------------------------------------------------------------------
; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x9_8x8, 5,6,16
%assign pad 0x2c0+0x10-gprsize-(stack_offset&15)
%define fenc_buf rsp
%define pred_buf rsp+0x80
    movddup m %+ %%i, [r0+%%i*FENC_STRIDE]
    pmaddubsw m9, m %+ %%i, m15
    punpcklbw m %+ %%i, m8
    mova    [fenc_buf+%%i*0x10], m9
    ; save instruction size: avoid 4-byte memory offsets
    lea     r0, [intra8x9_h1+0x80]
%define off(m) (r0+m-(intra8x9_h1+0x80))
    lea     r5, [pred_buf+0x80]
    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
    ; 1D hadamard of edges
    pshufb  m9, [intrax3_shuf]
    pmaddubsw m8, [pb_pppm]
    pmaddubsw m9, [pb_pppm]
    HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm]
    HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm]
    psllw   m8, 3 ; left edge
    pabsw   m8, m8 ; 1x8 sum
    punpcklqdq m0, m4 ; transpose
    psllw   m9, 3 ; top edge
    psrldq  m10, m11, 2 ; 8x7 sum
    psubw   m0, m9 ; 8x1 sum
    phaddd  m10, m8 ; logically phaddw, but this is faster and it won't overflow
    pshufb  m0, m3, [off(intra8x9_h1)]
    pshufb  m1, m3, [off(intra8x9_h2)]
    pshufb  m2, m3, [off(intra8x9_h3)]
    pshufb  m3, m3, [off(intra8x9_h4)]
    PRED4x4_LOWPASS m8, m1, m2, m8, m3
    pshufb  m0, m8, [off(intra8x9_ddl1)]
    pshufb  m1, m8, [off(intra8x9_ddl2)]
    pshufb  m2, m8, [off(intra8x9_ddl3)]
    pshufb  m3, m8, [off(intra8x9_ddl4)]
    pshufb  m0, m9, [off(intra8x9_vl1)]
    pshufb  m1, m8, [off(intra8x9_vl2)]
    pshufb  m2, m9, [off(intra8x9_vl3)]
    pshufb  m3, m8, [off(intra8x9_vl4)]
    PRED4x4_LOWPASS m8, m1, m2, m8, m3
    pshufb  m0, m8, [off(intra8x9_ddr1)]
    pshufb  m1, m8, [off(intra8x9_ddr2)]
    pshufb  m2, m8, [off(intra8x9_ddr3)]
    pshufb  m3, m8, [off(intra8x9_ddr4)]
%define off(m) (r0+m-(intra8x9_h1+0x180))
    pshufb  m0, m2, [off(intra8x9_vr1)]
    pshufb  m1, m8, [off(intra8x9_vr2)]
    pshufb  m2, m2, [off(intra8x9_vr3)]
    pshufb  m3, m8, [off(intra8x9_vr4)]
    pshufd  m1, m9, q0001
    pblendw m1, m8, q3330
    pshufd  m2, m9, q0001
    pshufb  m0, m1, [off(intra8x9_hd1)]
    pshufb  m1, m1, [off(intra8x9_hd2)]
    pshufb  m2, m8, [off(intra8x9_hd3)]
    pshufb  m3, m8, [off(intra8x9_hd4)]
    pinsrb  m8, [r2+7], 15
    pshufb  m0, m8, [off(intra8x9_hu1)]
    pshufb  m1, m8, [off(intra8x9_hu2)]
    pshufb  m2, m8, [off(intra8x9_hu3)]
    pshufb  m3, m8, [off(intra8x9_hu4)]
    pshuflw m1, m0, q0032
    movzx   r5d, word [r3+16]
    ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
    paddw   m0, [off(pw_s00001111)]
    pshuflw m1, m0, q0032
    pcmpgtw m2, m1 ; 2nd index bit
    ; repack with 3 bit index
    lea     r3d, [r3*4+r4+1]
    ; reverse to phminposuw order
    add     r1, 4*FDEC_STRIDE
    mova    m0, [pred_buf+r2+0x00]
    mova    m1, [pred_buf+r2+0x10]
    mova    m2, [pred_buf+r2+0x20]
    mova    m3, [pred_buf+r2+0x30]
    movq    [r1+FDEC_STRIDE*-4], m0
    movhps  [r1+FDEC_STRIDE*-2], m0
    movq    [r1+FDEC_STRIDE*-3], m1
    movhps  [r1+FDEC_STRIDE*-1], m1
    movq    [r1+FDEC_STRIDE* 0], m2
    movhps  [r1+FDEC_STRIDE* 2], m2
    movq    [r1+FDEC_STRIDE* 1], m3
    movhps  [r1+FDEC_STRIDE* 3], m3
%xdefine fenc_buf fenc_buf+gprsize
    PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3
    psubw   m0, [fenc_buf+0x00]
    psubw   m1, [fenc_buf+0x10]
    psubw   m2, [fenc_buf+0x20]
    psubw   m3, [fenc_buf+0x30]
    psubw   m4, [fenc_buf+0x40]
    psubw   m5, [fenc_buf+0x50]
    psubw   m6, [fenc_buf+0x60]
    psubw   m7, [fenc_buf+0x70]
    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14
%endif ; ARCH_X86_64
%endmacro ; INTRA8_X9
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
cglobal hadamard_ac_4x4
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    HADAMARD4_2D 0, 1, 2, 3, 4

cglobal hadamard_ac_2x2max
    SUMSUB_BADC w, 0, 1, 2, 3, 4
    ABSW2   m0, m2, m0, m2, m4, m5
    ABSW2   m1, m3, m1, m3, m4, m5
    HADAMARD 0, max, 0, 2, 4, 5
    HADAMARD 0, max, 1, 3, 4, 5
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH

cglobal hadamard_ac_8x8
%endif ; HIGH_BIT_DEPTH
    call hadamard_ac_4x4_mmx2
    add     r0, 4*SIZEOF_PIXEL
    call hadamard_ac_4x4_mmx2
    call hadamard_ac_4x4_mmx2
    sub     r0, 4*SIZEOF_PIXEL
    call hadamard_ac_4x4_mmx2
    mova    [rsp+gprsize+8], m5 ; save satd
    call hadamard_ac_2x2max_mmx2
    SUMSUB_BADC w, 0, 1, 2, 3, 4
    HADAMARD 0, sumsub, 0, 2, 4, 5
    ABSW2   m1, m3, m1, m3, m4, m5
    ABSW2   m0, m2, m0, m2, m4, m5
    HADAMARD 0, max, 1, 3, 4, 5
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    mova    [rsp+gprsize], m6 ; save sa8d

%macro HADAMARD_AC_WXH_SUM_MMX 2
    mova    m1, [rsp+1*mmsize]
    paddd   m0, [rsp+2*mmsize]
    paddd   m1, [rsp+3*mmsize]
    mova    m2, [rsp+4*mmsize]
    paddd   m1, [rsp+5*mmsize]
    paddd   m2, [rsp+6*mmsize]
    paddd   m1, [rsp+7*mmsize]
%else ; !HIGH_BIT_DEPTH
    paddusw m0, [rsp+2*mmsize]
    paddusw m1, [rsp+3*mmsize]
    mova    m2, [rsp+4*mmsize]
    paddusw m1, [rsp+5*mmsize]
    paddusw m2, [rsp+6*mmsize]
    paddusw m1, [rsp+7*mmsize]
%endif ; HIGH_BIT_DEPTH

%macro HADAMARD_AC_WXH_MMX 2
cglobal pixel_hadamard_ac_%1x%2, 2,4
%assign pad 16-gprsize-(stack_offset&15)
    call hadamard_ac_8x8_mmx2
    call hadamard_ac_8x8_mmx2
    lea     r0, [r0+ysub*4+8*SIZEOF_PIXEL]
    call hadamard_ac_8x8_mmx2
    call hadamard_ac_8x8_mmx2
    HADAMARD_AC_WXH_SUM_MMX %1, %2
    add     rsp, 128+%1*%2/4+pad
%endmacro ; HADAMARD_AC_WXH_MMX

HADAMARD_AC_WXH_MMX 16, 16
HADAMARD_AC_WXH_MMX  8, 16
HADAMARD_AC_WXH_MMX 16,  8
HADAMARD_AC_WXH_MMX  8,  8

%macro LOAD_INC_8x4W_SSE2 5
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%macro LOAD_INC_8x4W_SSSE3 5
    LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
    HSUMSUB %1, %2, %3, %4, %5

%macro HADAMARD_AC_SSE2 0
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal hadamard_ac_8x8
%define spill0 [rsp+gprsize]
%define spill1 [rsp+gprsize+16]
%define spill2 [rsp+gprsize+32]
%elif cpuflag(ssse3)
    ;LOAD_INC loads sumsubs
    ;LOAD_INC only unpacks to words
    LOAD_INC_8x4W 0, 1, 2, 3, 7
    HADAMARD4_2D_SSE 0, 1, 2, 3, 4
    HADAMARD4_V 0, 1, 2, 3, 4
    LOAD_INC_8x4W 4, 5, 6, 7, 1
    HADAMARD4_2D_SSE 4, 5, 6, 7, 1
    HADAMARD4_V 4, 5, 6, 7, 1
    HADAMARD 1, sumsub, 0, 1, 6, 7
    HADAMARD 1, sumsub, 2, 3, 6, 7
    HADAMARD 1, sumsub, 4, 5, 1, 0
    HADAMARD 1, sumsub, 6, 7, 1, 0
    pand    m1, [mask_ac4b]
    AC_PADD m1, m3, [pw_1]
    AC_PADD m1, m2, [pw_1]
    AC_PADD m1, m3, [pw_1]
    AC_PADD m1, m2, [pw_1]
    AC_PADD m1, m3, [pw_1]
    AC_PADD m1, m2, [pw_1]
    mova    [rsp+gprsize+32], m1 ; save satd
    HADAMARD %%x, amax, 3, 7, 4
    HADAMARD %%x, amax, 2, 6, 7, 4
    HADAMARD %%x, amax, 1, 5, 6, 7
    HADAMARD %%x, sumsub, 0, 4, 5, 6
    AC_PADD m2, m3, [pw_1]
    AC_PADD m2, m1, [pw_1]
%endif ; HIGH_BIT_DEPTH
    AC_PADD m2, m4, [pw_1]
    AC_PADD m2, m0, [pw_1]
    mova    [rsp+gprsize+16], m2 ; save sa8d

HADAMARD_AC_WXH_SSE2 16, 16
HADAMARD_AC_WXH_SSE2  8, 16
HADAMARD_AC_WXH_SSE2 16,  8
HADAMARD_AC_WXH_SSE2  8,  8
%endmacro ; HADAMARD_AC_SSE2

%macro HADAMARD_AC_WXH_SUM_SSE2 2
    mova    m1, [rsp+2*mmsize]
    paddd   m0, [rsp+3*mmsize]
    paddd   m1, [rsp+4*mmsize]
    paddd   m0, [rsp+5*mmsize]
    paddd   m1, [rsp+6*mmsize]
    paddd   m0, [rsp+7*mmsize]
    paddd   m1, [rsp+8*mmsize]
%else ; !HIGH_BIT_DEPTH
    paddusw m0, [rsp+3*mmsize]
    paddusw m1, [rsp+4*mmsize]
    paddusw m0, [rsp+5*mmsize]
    paddusw m1, [rsp+6*mmsize]
    paddusw m0, [rsp+7*mmsize]
    paddusw m1, [rsp+8*mmsize]
%endif ; HIGH_BIT_DEPTH
; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
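; Caller-side view, as a hedged C sketch of the packed return value (satd in
; the low half, sa8d in the high half; both are AC-only and get their final
; scaling from the shr instructions in the macro below):
;
;     uint64_t v    = pixel_hadamard_ac_16x16( pix, stride );
;     uint32_t satd = (uint32_t)v;         /* low 32 bits  */
;     uint32_t sa8d = (uint32_t)(v >> 32); /* high 32 bits */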
%macro HADAMARD_AC_WXH_SSE2 2
cglobal pixel_hadamard_ac_%1x%2, 2,3,11
%assign pad 16-gprsize-(stack_offset&15)
    call hadamard_ac_8x8
    call hadamard_ac_8x8
    lea     r0, [r0+ysub*4+8*SIZEOF_PIXEL]
    call hadamard_ac_8x8
    call hadamard_ac_8x8
    HADAMARD_AC_WXH_SUM_SSE2 %1, %2
    shr     edx, 2 - (%1*%2 >> 8)
    add     rsp, 16+%1*%2/2+pad
%endmacro ; HADAMARD_AC_WXH_SSE2

%if ARCH_X86_64 == 0
cextern pixel_sa8d_8x8_internal_mmx2
%define TRANS TRANS_SSE2
%define DIFFOP DIFF_UNPACK_SSE2
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
%if HIGH_BIT_DEPTH == 0
%define DIFFOP DIFF_SUMSUB_SSSE3
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%if HIGH_BIT_DEPTH == 0
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
%if HIGH_BIT_DEPTH == 0
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
%if HIGH_BIT_DEPTH == 0
%define TRANS TRANS_SSE4
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
%if HIGH_BIT_DEPTH == 0
%if HIGH_BIT_DEPTH == 0
%define TRANS TRANS_XOP
%if HIGH_BIT_DEPTH == 0
; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
;=============================================================================
; SSIM
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
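; Reference semantics, as a hedged C sketch (two adjacent 4x4 windows per
; call; the four sums per window feed pixel_ssim_end below):
;
;     #include <stdint.h>
;     static void ssim_4x4x2_core_ref( const uint8_t *pix1, intptr_t stride1,
;                                      const uint8_t *pix2, intptr_t stride2,
;                                      int sums[2][4] )
;     {
;         for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;         {
;             int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;             for( int y = 0; y < 4; y++ )
;                 for( int x = 0; x < 4; x++ )
;                 {
;                     int a = pix1[x+y*stride1];
;                     int b = pix2[x+y*stride2];
;                     s1  += a;
;                     s2  += b;
;                     ss  += a*a + b*b;
;                     s12 += a*b;
;                 }
;             sums[z][0] = s1;
;             sums[z][1] = s2;
;             sums[z][2] = ss;
;             sums[z][3] = s12;
;         }
;     }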
    movdqu  m5, [r0+(%1&1)*r1]
    movdqu  m6, [r2+(%1&1)*r3]
    movq    m5, [r0+(%1&1)*r1]
    movq    m6, [r2+(%1&1)*r3]
    ACCUM paddd, 3, 5, %1
    ACCUM paddd, 4, 7, %1

cglobal pixel_ssim_4x4x2_core, 4,4,8
    pshufd  m5, m3, q2301
    pshufd  m6, m4, q2301
    pshufd  m1, m1, q3120
    punpckhdq m5, m3, m4
;-----------------------------------------------------------------------------
; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
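; The per-window formula being evaluated, as a hedged C sketch (this mirrors
; x264's scalar ssim_end1 for the 8-bit integer path; the 10-bit path uses
; the float constants and pf_64/pf_128 scaling seen above; ssim_c1/ssim_c2
; are the constants defined at the top of this file):
;
;     static float ssim_end1( int s1, int s2, int ss, int s12 )
;     {
;         int vars  = ss*64 - s1*s1 - s2*s2;
;         int covar = s12*64 - s1*s2;
;         return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
;              / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
;     }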
cglobal pixel_ssim_end4, 3,3,7
    movdqa  m5, [ssim_c1]
    movdqa  m6, [ssim_c2]
    TRANSPOSE4x4D 0, 1, 2, 3, 4
    ; s1=m0, s2=m1, ss=m2, s12=m3
    mulps   m2, [pf_64]  ; ss*64
    mulps   m3, [pf_128] ; s12*128
    mulps   m4, m0 ; s1*s2
    mulps   m1, m1 ; s2*s2
    mulps   m0, m0 ; s1*s1
    addps   m4, m4 ; s1*s2*2
    addps   m0, m1 ; s1*s1 + s2*s2
    subps   m3, m4 ; covar*2
    addps   m4, m5 ; s1*s2*2 + ssim_c1
    addps   m0, m5 ; s1*s1 + s2*s2 + ssim_c1
    addps   m2, m6 ; vars + ssim_c2
    addps   m3, m6 ; covar*2 + ssim_c2
    pmaddwd m4, m1, m0 ; s1*s2
    pmaddwd m0, m0 ; s1*s1 + s2*s2
    psubd   m3, m4 ; covar*2
    cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
    je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
    lea     r3, [mask_ff + 16]
    movdqu  m1, [r3 + r2*4]
    movdqu  m1, [mask_ff + r2*4 + 16]
    pshuflw m4, m0, q0032
%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
;-----------------------------------------------------------------------------
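; Hedged C sketch of the intended semantics (note: the absolute value of the
; summed signed differences, not a SAD):
;
;     #include <stdint.h>
;     #include <stdlib.h>
;     static int asd8_ref( const uint8_t *pix1, intptr_t stride1,
;                          const uint8_t *pix2, intptr_t stride2, int height )
;     {
;         int sum = 0;
;         for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
;             for( int x = 0; x < 8; x++ )
;                 sum += pix1[x] - pix2[x];
;         return abs( sum );
;     }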
cglobal pixel_asd8, 5,5

;=============================================================================
; Successive Elimination ADS
;=============================================================================
%macro ADS_END 1 ; unroll_size
    WIN64_RESTORE_XMM rsp

;-----------------------------------------------------------------------------
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
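; Hedged C sketch of the semantics (this mirrors the C reference in pixel.c:
; a cheap lower bound on each candidate's SAD, computed from precomputed DC
; sums, used to prune motion candidates before the full SAD):
;
;     #include <stdint.h>
;     #include <stdlib.h>
;     static int ads4_ref( int enc_dc[4], uint16_t *sums, int delta,
;                          uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;     {
;         int nmv = 0;
;         for( int i = 0; i < width; i++, sums++ )
;         {
;             int ads = abs( enc_dc[0] - sums[0] )
;                     + abs( enc_dc[1] - sums[8] )
;                     + abs( enc_dc[2] - sums[delta] )
;                     + abs( enc_dc[3] - sums[delta+8] )
;                     + cost_mvx[i];
;             if( ads < thresh )
;                 mvs[nmv++] = i;
;         }
;         return nmv;
;     }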
cglobal pixel_ads4, 5,7
    pshufw  mm6, mm6, q2222
    pshufw  mm4, mm4, q2222
    movq    mm3, [r1+r2+16]

cglobal pixel_ads2, 5,7
    pshufw  mm6, mm6, q2222

cglobal pixel_ads1, 5,7

cglobal pixel_ads4, 5,7,12
    pshuflw xmm7, xmm4, 0
    pshuflw xmm6, xmm4, q2222
    pshufhw xmm5, xmm4, 0
    pshufhw xmm4, xmm4, q2222
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpckhqdq xmm5, xmm5
    punpckhqdq xmm4, xmm4
    pshuflw xmm8, r6m, 0
    punpcklqdq xmm8, xmm8
    movdqu  xmm11, [r1+r2]
    psubw   xmm0, xmm10, xmm7
    movdqu  xmm10, [r1+16]
    psubw   xmm1, xmm10, xmm6
    ABSW    xmm0, xmm0, xmm2
    ABSW    xmm1, xmm1, xmm3
    psubw   xmm2, xmm11, xmm5
    movdqu  xmm11, [r1+r2+16]
    psubw   xmm3, xmm11, xmm4
    ABSW    xmm2, xmm2, xmm1
    ABSW    xmm3, xmm3, xmm1
    psubusw xmm1, xmm8, xmm0
    movdqu  xmm1, [r1+16]
    ABSW    xmm0, xmm0, xmm2
    ABSW    xmm1, xmm1, xmm3
    movdqu  xmm2, [r1+r2]
    movdqu  xmm3, [r1+r2+16]
    ABSW    xmm2, xmm2, xmm1
    ABSW    xmm3, xmm3, xmm1
    pshuflw xmm1, xmm1, 0
    punpcklqdq xmm1, xmm1

cglobal pixel_ads2, 5,7,8
    pshuflw xmm7, xmm6, 0
    pshuflw xmm6, xmm6, q2222
    pshuflw xmm5, xmm5, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpcklqdq xmm5, xmm5
    movdqu  xmm1, [r1+r2]
    ABSW    xmm0, xmm0, xmm2
    ABSW    xmm1, xmm1, xmm3
    psubusw xmm1, xmm5, xmm0

cglobal pixel_ads1, 5,7,8
    pshuflw xmm7, xmm7, 0
    pshuflw xmm6, xmm6, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    movdqu  xmm1, [r1+16]
    movdqu  xmm3, [r3+16]
    ABSW    xmm0, xmm0, xmm4
    ABSW    xmm1, xmm1, xmm5
    psubusw xmm4, xmm6, xmm0
    psubusw xmm5, xmm6, xmm1
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv = 0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i = 0; i < width; i += 8 )
;     {
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j = 0; j < 8; j++ )
;             if( mask & (255<<j*8) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }
    test    r2d, 0xff<<(%1*8)

cglobal pixel_ads_mvs, 0,7,0
    ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)