;*****************************************************************************
;* pixel.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*          Fiona Glaser <fiona@x264.com>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86util.asm"
pw_00ff:   times 8 dw 0xff
ssim_c1:   times 4 dd 416    ; .01*.01*255*255*64
ssim_c2:   times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff:   times 16 db 0xff
mask_ac4:  dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8:  dw 0, -1, -1, -1, -1, -1, -1, -1
hsub_mul:  times 8 db 1, -1
hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10:   times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
%macro HADDD 2 ; sum junk
    pmaddwd %1, [pw_1 GLOBAL]
;=============================================================================
;=============================================================================
%macro SSD_LOAD_FULL 5
    DEINTB %2, %1, %4, %3, 7
%macro SSD_LOAD_HALF 5
    LOAD 1, 2, [t0+%1], [t0+%3], 1
    JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
    LOAD 3, 4, [t0+%1], [t0+%3], %5
    JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
%macro SSD_CORE_SSE2 7-8
    DEINTB %6, %1, %7, %2, %5
    DEINTB %6, %3, %7, %4, %5
%macro SSD_CORE_SSSE3 7-8
    SSD_LOAD_%1 %2,%3,%4,%5,%6
    SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
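; As a reference for what the SSD_* macros below compute, here is a plain-C
; sketch (assuming 8-bit pixels; this is the intent only, not the asm's
; dispatch or loop structure):
;
; static int ssd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
;                     int w, int h )
; {
;     int sum = 0;
;     for( int y = 0; y < h; y++, pix1 += i_pix1, pix2 += i_pix2 )
;         for( int x = 0; x < w; x++ )
;         {
;             int d = pix1[x] - pix2[x];
;             sum += d * d;
;         }
;     return sum;
; }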
%assign function_align 8
%assign function_align 16
cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
    mov al, %1*%2/mmsize/2
    jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop)
DECLARE_REG_TMP 0,1,2,3
DECLARE_REG_TMP 1,2,3,4
    mova m7, [hsub_mul GLOBAL]
    mova m7, [pw_00ff GLOBAL]
    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
    SSD_ITER FULL, 0, 0, t1, t3, 2
    SSD_ITER HALF, 0, 0, t1, t3, 2
SSD 16, 16, sse2slow, 8
SSD  8,  8, sse2slow, 8
SSD 16,  8, sse2slow, 8
SSD  8, 16, sse2slow, 8
SSD  8,  4, sse2slow, 8
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
%assign function_align 16
;=============================================================================
;=============================================================================
    pxor m6, m6 ; sum squared
    mova m7, [pw_00ff GLOBAL]
;-----------------------------------------------------------------------------
; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
;-----------------------------------------------------------------------------
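; Plain-C sketch of what the var kernels return (a sketch of the intent, not
; the asm's register layout): the sum of squared pixels minus the squared sum
; scaled by the pixel count, i.e. variance times the block area:
;
; static uint32_t var_wxh( uint8_t *pix, int i_stride, int w, int h, int shift )
; {
;     uint32_t sum = 0, sqr = 0;
;     for( int y = 0; y < h; y++, pix += i_stride )
;         for( int x = 0; x < w; x++ )
;         {
;             sum += pix[x];
;             sqr += pix[x] * pix[x];
;         }
;     return sqr - (sum * sum >> shift); /* shift = log2(w*h): 8 for 16x16, 6 for 8x8 */
; }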
cglobal x264_pixel_var_16x16_mmxext, 2,3
cglobal x264_pixel_var_8x8_mmxext, 2,3
cglobal x264_pixel_var_16x16_sse2, 2,3,8
cglobal x264_pixel_var_8x8_sse2, 2,4,8
    sub eax, r1d ; sqr - (sum * sum >> shift)
;-----------------------------------------------------------------------------
; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
;-----------------------------------------------------------------------------
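; Hedged C sketch of the var2 semantics: the same sum/sum-of-squares idea
; applied to the difference of two blocks, with the plain SSD of the
; difference assumed to be written through the int* argument:
;
; static int var2_8x8( uint8_t *pix1, int i_stride1,
;                      uint8_t *pix2, int i_stride2, int *ssd )
; {
;     int sum = 0;
;     uint32_t sqr = 0;
;     for( int y = 0; y < 8; y++, pix1 += i_stride1, pix2 += i_stride2 )
;         for( int x = 0; x < 8; x++ )
;         {
;             int d = pix1[x] - pix2[x];
;             sum += d;
;             sqr += d * d;
;         }
;     *ssd = sqr;
;     return sqr - (sum * sum >> 6);
; }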
cglobal x264_pixel_var2_8x8_mmxext, 5,6
cglobal x264_pixel_var2_8x8_sse2, 5,6,8
cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
    pxor m6, m6 ; sum squared
    mova m7, [hsub_mul GLOBAL]
;=============================================================================
;=============================================================================
%define TRANS TRANS_SSE2
; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
; join 2x 32 bit and duplicate them
; emulating shufps is faster on conroe
; just use shufps on anything post conroe
%macro DIFF_UNPACK_SSE2 5
%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5
%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
%macro LOAD_DUP_4x8P_PENRYN 8
; penryn and nehalem run punpcklqdq and movddup in different units
%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    DEINTB %1, %2, %3, %4, %5
    SUMSUB_BA m%1, m%2, m%3
%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
%macro SATD_4x4_MMX 3
    LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2]
    LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2]
    LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
    LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2]
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
%macro SATD_8x4_SSE 8-9
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
    HADAMARD4_V m%2, m%3, m%4, m%5, m%6
    ; doing the abs first is a slight advantage
    ABS4 m%2, m%4, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
    HADAMARD 1, max, %3, %5, %6, %7
%macro SATD_START_MMX 0
    lea r4, [3*r1] ; 3*stride1
    lea r5, [3*r3] ; 3*stride2
%macro SATD_END_MMX 0
    pshufw m1, m0, 01001110b
    pshufw m1, m0, 10110001b
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
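; For reference, a C sketch of 4x4 SATD: the sum of absolute values of the 2D
; Hadamard transform of the difference block, halved as x264 conventionally
; does. The larger block sizes below are built by summing 4x4 (or 8x4) tiles.
; abs() is from <stdlib.h>; this is a sketch of the math, not the asm schedule:
;
; static int satd_4x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
; {
;     int tmp[4][4], sum = 0;
;     for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
;     {
;         int a0 = pix1[0] - pix2[0], a1 = pix1[1] - pix2[1];
;         int a2 = pix1[2] - pix2[2], a3 = pix1[3] - pix2[3];
;         int t0 = a0+a1, t1 = a0-a1, t2 = a2+a3, t3 = a2-a3;
;         tmp[i][0] = t0+t2; tmp[i][1] = t1+t3;
;         tmp[i][2] = t0-t2; tmp[i][3] = t1-t3;
;     }
;     for( int i = 0; i < 4; i++ )
;     {
;         int t0 = tmp[0][i]+tmp[1][i], t1 = tmp[0][i]-tmp[1][i];
;         int t2 = tmp[2][i]+tmp[3][i], t3 = tmp[2][i]-tmp[3][i];
;         sum += abs(t0+t2) + abs(t0-t2) + abs(t1+t3) + abs(t1-t3);
;     }
;     return sum >> 1;
; }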
cglobal x264_pixel_satd_16x4_internal_mmxext
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0
    SATD_4x4_MMX m2, 8, 0
    SATD_4x4_MMX m1, 12, 0
cglobal x264_pixel_satd_8x8_internal_mmxext
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 1
x264_pixel_satd_8x4_internal_mmxext:
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0
cglobal x264_pixel_satd_16x16_mmxext, 4,6
    call x264_pixel_satd_16x4_internal_mmxext
    call x264_pixel_satd_16x4_internal_mmxext
cglobal x264_pixel_satd_16x8_mmxext, 4,6
    call x264_pixel_satd_16x4_internal_mmxext
    call x264_pixel_satd_16x4_internal_mmxext
cglobal x264_pixel_satd_8x16_mmxext, 4,6
    call x264_pixel_satd_8x8_internal_mmxext
    call x264_pixel_satd_8x8_internal_mmxext
cglobal x264_pixel_satd_8x8_mmxext, 4,6
    call x264_pixel_satd_8x8_internal_mmxext
cglobal x264_pixel_satd_8x4_mmxext, 4,6
    call x264_pixel_satd_8x4_internal_mmxext
cglobal x264_pixel_satd_4x8_mmxext, 4,6
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 0
cglobal x264_pixel_satd_4x4_mmxext, 4,6
    SATD_4x4_MMX m0, 0, 0
%macro SATD_START_SSE2 3
    mova %3, [hmul_8p GLOBAL]
%macro SATD_END_SSE2 2
%macro BACKUP_POINTERS 0
%macro RESTORE_AND_INC_POINTERS 0
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
    mova m4, [hmul_4p GLOBAL]
    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3
cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
    mova m7, [hmul_4p GLOBAL]
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6, swap
cglobal x264_pixel_satd_8x8_internal_%1
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
x264_pixel_satd_8x4_internal_%1:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
cglobal x264_pixel_satd_16x4_internal_%1
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
    SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
cglobal x264_pixel_satd_16x8_%1, 4,6,12
    SATD_START_SSE2 %1, m10, m7
    mova m7, [pw_00ff GLOBAL]
    jmp x264_pixel_satd_16x8_internal_%1
cglobal x264_pixel_satd_16x16_%1, 4,6,12
    SATD_START_SSE2 %1, m10, m7
    mova m7, [pw_00ff GLOBAL]
    call x264_pixel_satd_16x4_internal_%1
    call x264_pixel_satd_16x4_internal_%1
x264_pixel_satd_16x8_internal_%1:
    call x264_pixel_satd_16x4_internal_%1
    call x264_pixel_satd_16x4_internal_%1
    SATD_END_SSE2 %1, m10
cglobal x264_pixel_satd_16x8_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    RESTORE_AND_INC_POINTERS
    call x264_pixel_satd_8x8_internal_%1
cglobal x264_pixel_satd_16x16_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
    RESTORE_AND_INC_POINTERS
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
cglobal x264_pixel_satd_8x16_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
cglobal x264_pixel_satd_8x8_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
cglobal x264_pixel_satd_8x4_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x4_internal_%1
%endmacro ; SATDS_SSE2
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
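; Same idea as SATD but with an 8x8 transform: sa8d is the sum of absolute
; values of the 8x8 Hadamard transform of the difference block, rounded and
; divided by 4. A hedged C sketch follows; hadamard8() is an illustrative
; helper (any unnormalized 8-point Walsh-Hadamard butterfly ordering gives
; the same absolute sum):
;
; static void hadamard8( int d[8] )
; {
;     for( int s = 1; s < 8; s <<= 1 )        /* three butterfly stages */
;         for( int i = 0; i < 8; i++ )
;             if( !(i & s) )
;             {
;                 int a = d[i], b = d[i+s];
;                 d[i]   = a + b;
;                 d[i+s] = a - b;
;             }
; }
;
; static int sa8d_8x8( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
; {
;     int d[8][8], col[8], sum = 0;
;     for( int y = 0; y < 8; y++, pix1 += i_pix1, pix2 += i_pix2 )
;     {
;         for( int x = 0; x < 8; x++ )
;             d[y][x] = pix1[x] - pix2[x];
;         hadamard8( d[y] );                  /* transform rows */
;     }
;     for( int x = 0; x < 8; x++ )
;     {
;         for( int y = 0; y < 8; y++ ) col[y] = d[y][x];
;         hadamard8( col );                   /* transform columns */
;         for( int y = 0; y < 8; y++ ) sum += abs( col[y] );
;     }
;     return (sum + 2) >> 2;
; }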
cglobal x264_pixel_sa8d_8x8_internal_%1
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
%ifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    HADAMARD4_V m0, m1, m2, m8, m6
    HADAMARD4_V m4, m5, m3, m9, m6
    SUMSUB_BADC m0, m4, m1, m5, m6
    HADAMARD 2, sumsub, 0, 4, 6, 11
    HADAMARD 2, sumsub, 1, 5, 6, 11
    SUMSUB_BADC m2, m3, m8, m9, m6
    HADAMARD 2, sumsub, 2, 3, 6, 11
    HADAMARD 2, sumsub, 8, 9, 6, 11
    HADAMARD 1, amax, 0, 4, 6, 11
    HADAMARD 1, amax, 1, 5, 6, 4
    HADAMARD 1, amax, 2, 3, 6, 4
    HADAMARD 1, amax, 8, 9, 6, 4
    SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1
cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
    mova m7, [hmul_8p GLOBAL]
    call x264_pixel_sa8d_8x8_internal_%1
cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
    mova m7, [hmul_8p GLOBAL]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
cglobal x264_pixel_sa8d_8x8_internal_%1
%define spill0 [esp+4]
%define spill1 [esp+20]
%define spill2 [esp+36]
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
    mova m7, [hmul_8p GLOBAL]
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V m4, m5, m6, m7, m3
    HADAMARD4_V m0, m1, m2, m3, m7
    SUMSUB_BADC m0, m4, m1, m5, m7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    SUMSUB_BADC m2, m6, m3, m7, m4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
%endif ; ifndef mmxext
cglobal x264_pixel_sa8d_8x8_%1, 4,7
    call x264_pixel_sa8d_8x8_internal_%1
cglobal x264_pixel_sa8d_16x16_%1, 4,7
    call x264_pixel_sa8d_8x8_internal_%1
    call x264_pixel_sa8d_8x8_internal_%1
    paddusw m0, [esp+48]
    call x264_pixel_sa8d_8x8_internal_%1
    paddusw m0, [esp+48]
    mova [esp+64-mmsize], m0
    call x264_pixel_sa8d_8x8_internal_%1
    paddusw m0, [esp+64-mmsize]
%endif ; !ARCH_X86_64
;=============================================================================
;=============================================================================
%macro INTRA_SA8D_SSE2 1
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
    movq m0, [r0+0*FENC_STRIDE]
    movq m1, [r0+1*FENC_STRIDE]
    movq m2, [r0+2*FENC_STRIDE]
    movq m3, [r0+3*FENC_STRIDE]
    movq m4, [r0+4*FENC_STRIDE]
    movq m5, [r0+5*FENC_STRIDE]
    movq m6, [r0+6*FENC_STRIDE]
    movq m7, [r0+7*FENC_STRIDE]
    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
    movzx r0d, word [r1+0]
    add r0w, word [r1+16]
    ABS4 m8, m9, m10, m11, m12, m13
    ABS2 m10, m11, m13, m14
    movdqa m14, m15 ; 7x8 sum
    movdqa m8, [r1+0] ; left edge
    ABS1 m9, m11 ; 1x8 sum
    punpcklqdq m0, m4 ; transpose
    movdqa m1, [r1+16] ; top edge
    psrldq m2, 2 ; 8x7 sum
    psubw m0, m1 ; 8x1 sum
    movdqa m7, [pw_1 GLOBAL]
    pshufd m5, m15, 0xf5
    movq [r2], m3 ; i8x8_v, i8x8_h
    movd [r2+8], m3 ; i8x8_dc
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
; out: m0..m3 = hadamard coefs
    movd m0, [r0+0*FENC_STRIDE]
    movd m1, [r0+1*FENC_STRIDE]
    movd m2, [r0+2*FENC_STRIDE]
    movd m3, [r0+3*FENC_STRIDE]
    HADAMARD4_2D 0, 1, 2, 3, 4
    SAVE_MM_PERMUTATION load_hadamard
%macro SCALAR_SUMSUB 4
%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
    shl %1d, 5 ; log(FDEC_STRIDE)
    movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
    movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
    movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
    movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [left_1d+2*%1+0], %2w
    mov [left_1d+2*%1+2], %3w
    mov [left_1d+2*%1+4], %4w
    mov [left_1d+2*%1+6], %5w
%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
    movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
    movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
    movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
    movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [top_1d+2*%1+0], %2w
    mov [top_1d+2*%1+2], %3w
    mov [top_1d+2*%1+4], %4w
    mov [top_1d+2*%1+6], %5w
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    mov qword [sums+0], 0
    mov qword [sums+8], 0
    mov qword [sums+16], 0
; in: m0..m3 (4x4), m7 (3x4)
; out: m0 v, m4 h, m5 dc
%macro SUM4x3 3 ; dc, left, top
    punpckldq m0, m2 ; transpose
    ABS2 m4, m5, m2, m3 ; 1x4 sum
    ABS1 m0, m1 ; 4x1 sum
%macro INTRA_SATDS_MMX 1
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
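; What the x3 functions return, written in straightforward form (conceptual C
; sketch only; predict_4x4_* and satd_4x4 are illustrative helper names). The
; asm below does NOT build the predicted blocks -- it folds the left/top
; neighbors in via the 1D scalar Hadamard transforms prepared above -- but the
; result is the SATD cost of the v/h/dc predictions, in that order:
;
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] )
; {
;     predict_4x4_v ( fdec ); res[0] = satd_4x4( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
;     predict_4x4_h ( fdec ); res[1] = satd_4x4( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
;     predict_4x4_dc( fdec ); res[2] = satd_4x4( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
; }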
cglobal x264_intra_satd_x3_4x4_%1, 2,6
; stack is 16 byte aligned because abi says so
%define top_1d rsp-8 ; size 8
%define left_1d rsp-16 ; size 8
; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
%define top_1d esp+8
    SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
    SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
    lea t0d, [t0d + r0d + 4]
    SUM4x3 t0d, [left_1d], [top_1d]
    psrlq m1, 16 ; 4x3 sum
    SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
    movd [r2+0], m0 ; i4x4_v satd
    movd [r2+4], m4 ; i4x4_h satd
    movd [r2+8], m5 ; i4x4_dc satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_%1, 0,7
%assign stack_pad 88
%assign stack_pad 88 + ((stack_offset+88+4)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums rsp+64 ; size 24
%define top_1d rsp+32 ; size 32
%define left_1d rsp ; size 32
    SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
    SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6
    SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
    paddw m0, [sums+0] ; i16x16_v satd
    paddw m4, [sums+8] ; i16x16_h satd
    paddw m5, [sums+16] ; i16x16_dc satd
    add r0, 4*FENC_STRIDE-16
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    movd [r2+8], m2 ; i16x16_dc satd
    movd [r2+4], m1 ; i16x16_h satd
    movd [r2+0], m0 ; i16x16_v satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_%1, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums rsp+48 ; size 24
%define dc_1d rsp+32 ; size 16
%define top_1d rsp+16 ; size 16
%define left_1d rsp ; size 16
    SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
    SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5
    movzx t2d, word [left_1d+0]
    movzx r3d, word [top_1d+0]
    movzx r4d, word [left_1d+8]
    movzx r5d, word [top_1d+8]
    mov [dc_1d+ 0], t2d ; tl
    mov [dc_1d+ 4], r5d ; tr
    mov [dc_1d+ 8], r4d ; bl
    mov [dc_1d+12], r3d ; br
    SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
    paddw m0, [sums+16] ; i4x4_v satd
    paddw m4, [sums+8] ; i4x4_h satd
    paddw m5, [sums+0] ; i4x4_dc satd
    add r0, 4*FENC_STRIDE-8
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    movd [r2+0], m0 ; i8x8c_dc satd
    movd [r2+4], m1 ; i8x8c_h satd
    movd [r2+8], m2 ; i8x8c_v satd
%endmacro ; INTRA_SATDS_MMX
%macro ABS_MOV_SSSE3 2
%macro ABS_MOV_MMX 2
%define ABS_MOV ABS_MOV_MMX
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
cglobal x264_hadamard_ac_4x4_mmxext
    HADAMARD4_2D 0, 1, 2, 3, 4
    SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
cglobal x264_hadamard_ac_2x2max_mmxext
    SUMSUB_BADC m0, m1, m2, m3, m4
    ABS4 m0, m2, m1, m3, m4, m5
    HADAMARD 0, max, 0, 2, 4, 5
    HADAMARD 0, max, 1, 3, 4, 5
    SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext
cglobal x264_hadamard_ac_8x8_mmxext
    mova m6, [mask_ac4 GLOBAL]
    call x264_hadamard_ac_4x4_mmxext
    call x264_hadamard_ac_4x4_mmxext
    call x264_hadamard_ac_4x4_mmxext
    call x264_hadamard_ac_4x4_mmxext
    mova [rsp+gprsize+8], m5 ; save satd
    call x264_hadamard_ac_2x2max_mmxext
    SUMSUB_BADC m0, m1, m2, m3, m4
    HADAMARD 0, sumsub, 0, 2, 4, 5
    ABS4 m1, m3, m0, m2, m4, m5
    HADAMARD 0, max, 1, 3, 4, 5
    mova [rsp+gprsize], m6 ; save sa8d
    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
%macro HADAMARD_AC_WXH_MMX 2
cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
%assign pad 16-gprsize-(stack_offset&15)
    call x264_hadamard_ac_8x8_mmxext
    call x264_hadamard_ac_8x8_mmxext
    lea r0, [r0+ysub*4+8]
    call x264_hadamard_ac_8x8_mmxext
    call x264_hadamard_ac_8x8_mmxext
    paddusw m0, [rsp+0x10]
    paddusw m1, [rsp+0x18]
    paddusw m1, [rsp+0x28]
    paddusw m2, [rsp+0x30]
    paddusw m1, [rsp+0x38]
    pand m3, [pw_1 GLOBAL]
    add rsp, 128+%1*%2/4+pad
%endmacro ; HADAMARD_AC_WXH_MMX
HADAMARD_AC_WXH_MMX 16, 16
HADAMARD_AC_WXH_MMX  8, 16
HADAMARD_AC_WXH_MMX 16,  8
HADAMARD_AC_WXH_MMX  8,  8
%macro LOAD_INC_8x4W_SSE2 5
%macro LOAD_INC_8x4W_SSSE3 5
    LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
    HSUMSUB %1, %2, %3, %4, %5
%macro HADAMARD_AC_SSE2 1
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal x264_hadamard_ac_8x8_%1
%define spill0 [rsp+gprsize]
%define spill1 [rsp+gprsize+16]
%define spill2 [rsp+gprsize+32]
    ;LOAD_INC loads sumsubs
    mova m7, [hmul_8p GLOBAL]
    ;LOAD_INC only unpacks to words
    LOAD_INC_8x4W 0, 1, 2, 3, 7
    HADAMARD4_2D_SSE 0, 1, 2, 3, 4
    HADAMARD4_V m0, m1, m2, m3, m4
    LOAD_INC_8x4W 4, 5, 6, 7, 1
    HADAMARD4_2D_SSE 4, 5, 6, 7, 1
    HADAMARD4_V m4, m5, m6, m7, m1
    HADAMARD 1, sumsub, 0, 1, 6, 7
    HADAMARD 1, sumsub, 2, 3, 6, 7
    HADAMARD 1, sumsub, 4, 5, 1, 0
    HADAMARD 1, sumsub, 6, 7, 1, 0
    SUMSUB_BA m0, m4; m2
    pand m1, [mask_ac4b GLOBAL]
    pand m1, [mask_ac4 GLOBAL]
    mova [rsp+gprsize+32], m1 ; save satd
    HADAMARD 2, amax, 3, 7, 4
    HADAMARD 2, amax, 2, 6, 7, 4
    HADAMARD 2, amax, 1, 5, 6, 7
    HADAMARD 2, sumsub, 0, 4, 5, 6
    HADAMARD 4, amax, 3, 7, 4
    HADAMARD 4, amax, 2, 6, 7, 4
    HADAMARD 4, amax, 1, 5, 6, 7
    HADAMARD 4, sumsub, 0, 4, 5, 6
    pand m0, [mask_ac8 GLOBAL]
    mova [rsp+gprsize+16], m0 ; save sa8d
    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
HADAMARD_AC_WXH_SSE2 16, 16, %1
HADAMARD_AC_WXH_SSE2  8, 16, %1
HADAMARD_AC_WXH_SSE2 16,  8, %1
HADAMARD_AC_WXH_SSE2  8,  8, %1
%endmacro ; HADAMARD_AC_SSE2
; struct { int satd; int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
%macro HADAMARD_AC_WXH_SSE2 3
cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11
%assign pad 16-gprsize-(stack_offset&15)
    call x264_hadamard_ac_8x8_%3
    call x264_hadamard_ac_8x8_%3
    lea r0, [r0+ysub*4+8]
    call x264_hadamard_ac_8x8_%3
    call x264_hadamard_ac_8x8_%3
    paddusw m0, [rsp+0x30]
    paddusw m1, [rsp+0x40]
    paddusw m0, [rsp+0x50]
    paddusw m1, [rsp+0x60]
    paddusw m0, [rsp+0x70]
    paddusw m1, [rsp+0x80]
    shr edx, 2 - (%1*%2 >> 8)
    add rsp, 16+%1*%2/2+pad
%endmacro ; HADAMARD_AC_WXH_SSE2
cextern x264_pixel_sa8d_8x8_internal_mmxext
%define TRANS TRANS_SSE2
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%define DIFFOP DIFF_UNPACK_SSE2
%define JDUP JDUP_SSE2
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
HADAMARD_AC_SSE2 sse2
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
%define ABS_MOV ABS_MOV_SSSE3
%define DIFFOP DIFF_SUMSUB_SSSE3
%define JDUP JDUP_CONROE
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
HADAMARD_AC_SSE2 ssse3
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
%define TRANS TRANS_SSE4
%define JDUP JDUP_PENRYN
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
HADAMARD_AC_SSE2 sse4
;=============================================================================
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
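; C sketch of the core: for two horizontally adjacent 4x4 blocks it gathers
; the four SSIM partial sums (sum of pix1, sum of pix2, sum of both squares,
; sum of the products). A sketch of the intent, not the SIMD schedule:
;
; static void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                              const uint8_t *pix2, int stride2, int sums[2][4] )
; {
;     for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;     {
;         int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;             {
;                 int a = pix1[x+y*stride1];
;                 int b = pix2[x+y*stride2];
;                 s1  += a;  s2 += b;
;                 ss  += a*a + b*b;
;                 s12 += a*b;
;             }
;         sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
;     }
; }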
    movq m5, [r0+(%1&1)*r1]
    movq m6, [r2+(%1&1)*r3]
cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
    movdqa m7, [pw_1 GLOBAL]
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
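; The per-position SSIM term this kernel evaluates, as a C sketch (constants
; match ssim_c1/ssim_c2 above; the sums cover 64 pixels, hence the factor 64).
; end4 is assumed to accumulate this over `width` positions, combining the
; corresponding entries of sum0 and sum1:
;
; static float ssim_end1( int s1, int s2, int ss, int s12 )
; {
;     int vars  = ss*64  - s1*s1 - s2*s2;
;     int covar = s12*64 - s1*s2;
;     return (float)(2*s1*s2 + 416) * (float)(2*covar + 235963)
;          / ((float)(s1*s1 + s2*s2 + 416) * (float)(vars + 235963));
; }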
cglobal x264_pixel_ssim_end4_sse2, 3,3,7
    movdqa m5, [ssim_c1 GLOBAL]
    movdqa m6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D 0, 1, 2, 3, 4
; s1=m0, s2=m1, ss=m2, s12=m3
    pmaddwd m4, m0 ; s1*s2
    pmaddwd m0, m0 ; s1*s1 + s2*s2
    psubd m3, m4 ; covar*2
    cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
    je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
    lea r3, [mask_ff + 16 GLOBAL]
    movdqu m1, [r3 + r2*4]
    movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
;=============================================================================
; Successive Elimination ADS
;=============================================================================
%macro ADS_START 1 ; unroll_size
    movsxd r5, dword r5m
%define ABS1 ABS1_MMX
;-----------------------------------------------------------------------------
; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
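; Rough C equivalent of the ads kernels (a hedged sketch; the exact sums[]
; offsets are an assumption about how the precomputed row sums are laid out).
; Each candidate position's cost is the absolute difference between the
; encoded block's 8x8 sub-sums (enc_dc) and the corresponding reference-window
; sums, plus the MV cost; positions under thresh survive. The SIMD versions
; below instead emit a byte mask per position and let x264_pixel_ads_mvs
; (further down) compact the surviving indices:
;
; static int ads4( int enc_dc[4], uint16_t *sums, int delta,
;                  uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
; {
;     int nmv = 0;
;     for( int i = 0; i < width; i++, sums++ )
;     {
;         int ads = abs( enc_dc[0] - sums[0] )
;                 + abs( enc_dc[1] - sums[8] )
;                 + abs( enc_dc[2] - sums[delta] )
;                 + abs( enc_dc[3] - sums[delta+8] )
;                 + cost_mvx[i];
;         if( ads < thresh )
;             mvs[nmv++] = i;
;     }
;     return nmv;
; }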
cglobal x264_pixel_ads4_mmxext, 4,7
    pshufw mm6, mm6, 0xAA
    pshufw mm4, mm4, 0xAA
    movq mm3, [r1+r2+16]
    pshufw mm1, [r10+stack_offset+56], 0
%elifdef ARCH_X86_64
    pshufw mm1, [r10+8], 0
    pshufw mm1, [ebp+stack_offset+28], 0
cglobal x264_pixel_ads2_mmxext, 4,7
    pshufw mm6, mm6, 0xAA
cglobal x264_pixel_ads1_mmxext, 4,7
cglobal x264_pixel_ads4_%1, 4,7,12
    pshuflw xmm7, xmm4, 0
    pshuflw xmm6, xmm4, 0xAA
    pshufhw xmm5, xmm4, 0
    pshufhw xmm4, xmm4, 0xAA
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpckhqdq xmm5, xmm5
    punpckhqdq xmm4, xmm4
    pshuflw xmm8, r6m, 0
    punpcklqdq xmm8, xmm8
    movdqu xmm11, [r1+r2]
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r1+r2+16]
    movdqu xmm1, [r1+16]
    movdqu xmm2, [r1+r2]
    movdqu xmm3, [r1+r2+16]
    movd xmm1, [ebp+stack_offset+28]
    pshuflw xmm1, xmm1, 0
    punpcklqdq xmm1, xmm1
cglobal x264_pixel_ads2_%1, 4,7,8
    pshuflw xmm7, xmm6, 0
    pshuflw xmm6, xmm6, 0xAA
    pshuflw xmm5, xmm5, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpcklqdq xmm5, xmm5
    movdqu xmm1, [r1+r2]
cglobal x264_pixel_ads1_%1, 4,7,8
    pshuflw xmm7, xmm7, 0
    pshuflw xmm6, xmm6, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r3+16]
%define ABS1 ABS1_SSSE3
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
;     *(uint32_t*)(masks+width) = 0;
;     for( i=0; i<width; i+=8 )
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j=0; j<8; j++ )
;             if( mask & (255<<j*8) )
cglobal x264_pixel_ads_mvs, 0,7,0
; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
    mov dword [rsp+r9], 0
    test edi, 0xff<<(%1*8)
    mov ebx, [ebp+stack_offset+20] ; mvs
    mov edi, [ebp+stack_offset+24] ; width
    mov dword [esp+edi], 0
    mov ebp, [esp+esi+4]
    mov edx, [esp+esi+8]
    test %2, 0xff<<(%1*8)