;*****************************************************************************
;* pixel.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorski@gmail.com>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86util.asm"
pw_00ff:   times 8 dw 0xff
ssim_c1:   times 4 dd 416    ; .01*.01*255*255*64
ssim_c2:   times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff:   times 16 db 0xff
mask_ac4:  dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8:  dw 0, -1, -1, -1, -1, -1, -1, -1
hsub_mul:  times 8 db 1, -1
hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10:   times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
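; A quick check of the SSIM constants above (standard SSIM with K1=.01, K2=.03,
; L=255, scaled by the 64-pixel accumulation used by the ssim functions below):
;   ssim_c1 = .01*.01*255*255*64    = 416.16    -> 416
;   ssim_c2 = .03*.03*255*255*64*63 = 235962.72 -> 235963
; Roughly, hsub_mul is a pmaddubsw multiplier: with pix1/pix2 bytes interleaved,
; the 1,-1 pattern turns each word lane into pix1-pix2 in a single instruction;
; hmul_4p mixes 1 and -1 so half the lanes get sums and half get differences for
; the first Hadamard stage.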
%macro HADDD 2 ; sum junk
    pmaddwd %1, [pw_1 GLOBAL]
;=============================================================================
; SSD
;=============================================================================
%macro SSD_LOAD_FULL 5
    DEINTB %2, %1, %4, %3, 7
%macro SSD_LOAD_HALF 5
    LOAD 1, 2, [t0+%1], [t0+%3], 1
    JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
    LOAD 3, 4, [t0+%1], [t0+%3], %5
    JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
%macro SSD_CORE_SSE2 7-8
    DEINTB %6, %1, %7, %2, %5
    DEINTB %6, %3, %7, %4, %5
%macro SSD_CORE_SSSE3 7-8
    SSD_LOAD_%1 %2,%3,%4,%5,%6
    SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
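; For reference, a plain-C model of what the ssd functions compute (a sketch with a
; hypothetical helper name, not the code from pixel.c):
;
; static int ssd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int w, int h )
; {
;     int ssd = 0;
;     for( int y = 0; y < h; y++, pix1 += i_pix1, pix2 += i_pix2 )
;         for( int x = 0; x < w; x++ )
;         {
;             int d = pix1[x] - pix2[x];
;             ssd += d*d;
;         }
;     return ssd;
; }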
%assign function_align 8
%assign function_align 16
cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
    mov al, %1*%2/mmsize/2
    jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop)
DECLARE_REG_TMP 0,1,2,3
DECLARE_REG_TMP 1,2,3,4
    mova m7, [hsub_mul GLOBAL]
    mova m7, [pw_00ff GLOBAL]
    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
    SSD_ITER FULL, 0, 0, t1, t3, 2
    SSD_ITER HALF, 0, 0, t1, t3, 2
SSD 16, 16, sse2slow, 8
SSD  8,  8, sse2slow, 8
SSD 16,  8, sse2slow, 8
SSD  8, 16, sse2slow, 8
SSD  8,  4, sse2slow, 8
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
%assign function_align 16
;=============================================================================
; variance
;=============================================================================
    pxor m6, m6    ; sum squared
    mova m7, [pw_00ff GLOBAL]
    sub eax, r1d   ; sqr - (sum * sum >> shift)
;-----------------------------------------------------------------------------
; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
;-----------------------------------------------------------------------------
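; Roughly what the var functions return (a sketch with a hypothetical helper name;
; the shift is log2 of the pixel count, 8 for 16x16 and 6 for 8x8, matching the
; "sqr - (sum * sum >> shift)" epilogue used below):
;
; static int var_wxh( uint8_t *pix, int i_stride, int w, int h, int shift )
; {
;     uint32_t sum = 0, sqr = 0;
;     for( int y = 0; y < h; y++, pix += i_stride )
;         for( int x = 0; x < w; x++ )
;         {
;             sum += pix[x];
;             sqr += pix[x] * pix[x];
;         }
;     return sqr - (uint32_t)(((uint64_t)sum * sum) >> shift);
; }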
cglobal x264_pixel_var_16x16_mmxext, 2,3
cglobal x264_pixel_var_8x8_mmxext, 2,3
cglobal x264_pixel_var_16x16_sse2, 2,3,8
cglobal x264_pixel_var_8x8_sse2, 2,4,8
    sub eax, r1d   ; sqr - (sum * sum >> shift)
;-----------------------------------------------------------------------------
; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
;-----------------------------------------------------------------------------
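; Hedged scalar model of var2_8x8 (an assumption based on how the encoder uses it,
; not on code shown here): the variance of the difference between two 8x8 blocks,
; with their SSD also written through the last argument.
;
; static int var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd )
; {
;     int sum = 0;
;     uint32_t sqr = 0;
;     for( int y = 0; y < 8; y++, pix1 += i_stride1, pix2 += i_stride2 )
;         for( int x = 0; x < 8; x++ )
;         {
;             int d = pix1[x] - pix2[x];
;             sum += d;
;             sqr += d * d;
;         }
;     *ssd = sqr;
;     return sqr - (sum * sum >> 6);   /* 6 = log2(64 pixels) */
; }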
cglobal x264_pixel_var2_8x8_mmxext, 5,6
cglobal x264_pixel_var2_8x8_sse2, 5,6,8
cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
    pxor m6, m6    ; sum squared
    mova m7, [hsub_mul GLOBAL]
;=============================================================================
; SATD
;=============================================================================
%define TRANS TRANS_SSE2
    ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
    ; join 2x 32 bit and duplicate them
    ; emulating shufps is faster on conroe
    ; just use shufps on anything post conroe
%macro DIFF_UNPACK_SSE2 5
%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5
%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
%macro LOAD_DUP_4x8P_PENRYN 8
    ; penryn and nehalem run punpcklqdq and movddup in different units
%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
    ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    DEINTB %1, %2, %3, %4, %5
    SUMSUB_BA m%1, m%2, m%3
%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
    ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
%macro SATD_4x4_MMX 3
    LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2]
    LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2]
    LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
    LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2]
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
%macro SATD_8x4_SSE 8-9
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
    HADAMARD4_V m%2, m%3, m%4, m%5, m%6
    ; doing the abs first is a slight advantage
    ABS4 m%2, m%4, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
    HADAMARD 1, max, %3, %5, %6, %7
%macro SATD_START_MMX 0
    lea r4, [3*r1] ; 3*stride1
    lea r5, [3*r3] ; 3*stride2
%macro SATD_END_MMX 0
    pshufw m1, m0, 01001110b
    pshufw m1, m0, 10110001b
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
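; What satd measures, in plain C (a sketch with a hypothetical helper name; the real
; reference lives in pixel.c): the residual is run through a 4x4 Hadamard transform
; and the absolute coefficients are summed, with the un-normalized total halved.
;
; static int satd_4x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
; {
;     int d[4][4], satd = 0;
;     for( int y = 0; y < 4; y++, pix1 += i_pix1, pix2 += i_pix2 )
;         for( int x = 0; x < 4; x++ )
;             d[y][x] = pix1[x] - pix2[x];
;     for( int y = 0; y < 4; y++ )              /* horizontal butterflies */
;     {
;         int a0 = d[y][0] + d[y][1], a1 = d[y][0] - d[y][1];
;         int a2 = d[y][2] + d[y][3], a3 = d[y][2] - d[y][3];
;         d[y][0] = a0 + a2; d[y][2] = a0 - a2;
;         d[y][1] = a1 + a3; d[y][3] = a1 - a3;
;     }
;     for( int x = 0; x < 4; x++ )              /* vertical butterflies + |.| sum */
;     {
;         int a0 = d[0][x] + d[1][x], a1 = d[0][x] - d[1][x];
;         int a2 = d[2][x] + d[3][x], a3 = d[2][x] - d[3][x];
;         satd += abs(a0 + a2) + abs(a0 - a2) + abs(a1 + a3) + abs(a1 - a3);
;     }
;     return satd >> 1;
; }
; Larger block sizes sum satd_4x4 over all their 4x4 sub-blocks.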
cglobal x264_pixel_satd_16x4_internal_mmxext
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0
    SATD_4x4_MMX m2, 8, 0
    SATD_4x4_MMX m1, 12, 0
cglobal x264_pixel_satd_8x8_internal_mmxext
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 1
x264_pixel_satd_8x4_internal_mmxext:
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0
cglobal x264_pixel_satd_16x16_mmxext, 4,6
    call x264_pixel_satd_16x4_internal_mmxext
    call x264_pixel_satd_16x4_internal_mmxext
cglobal x264_pixel_satd_16x8_mmxext, 4,6
    call x264_pixel_satd_16x4_internal_mmxext
    call x264_pixel_satd_16x4_internal_mmxext
cglobal x264_pixel_satd_8x16_mmxext, 4,6
    call x264_pixel_satd_8x8_internal_mmxext
    call x264_pixel_satd_8x8_internal_mmxext
cglobal x264_pixel_satd_8x8_mmxext, 4,6
    call x264_pixel_satd_8x8_internal_mmxext
cglobal x264_pixel_satd_8x4_mmxext, 4,6
    call x264_pixel_satd_8x4_internal_mmxext
cglobal x264_pixel_satd_4x8_mmxext, 4,6
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 0
cglobal x264_pixel_satd_4x4_mmxext, 4,6
    SATD_4x4_MMX m0, 0, 0
%macro SATD_START_SSE2 3
    mova %3, [hmul_8p GLOBAL]
%macro SATD_END_SSE2 2
%macro BACKUP_POINTERS 0
%macro RESTORE_AND_INC_POINTERS 0
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
    mova m4, [hmul_4p GLOBAL]
    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3
cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
    mova m7, [hmul_4p GLOBAL]
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6, swap
cglobal x264_pixel_satd_8x8_internal_%1
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
x264_pixel_satd_8x4_internal_%1:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
cglobal x264_pixel_satd_16x4_internal_%1
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
    SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
cglobal x264_pixel_satd_16x8_%1, 4,6,12
    SATD_START_SSE2 %1, m10, m7
    mova m7, [pw_00ff GLOBAL]
    jmp x264_pixel_satd_16x8_internal_%1
cglobal x264_pixel_satd_16x16_%1, 4,6,12
    SATD_START_SSE2 %1, m10, m7
    mova m7, [pw_00ff GLOBAL]
    call x264_pixel_satd_16x4_internal_%1
    call x264_pixel_satd_16x4_internal_%1
x264_pixel_satd_16x8_internal_%1:
    call x264_pixel_satd_16x4_internal_%1
    call x264_pixel_satd_16x4_internal_%1
    SATD_END_SSE2 %1, m10
cglobal x264_pixel_satd_16x8_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    RESTORE_AND_INC_POINTERS
    call x264_pixel_satd_8x8_internal_%1
cglobal x264_pixel_satd_16x16_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
    RESTORE_AND_INC_POINTERS
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
cglobal x264_pixel_satd_8x16_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
cglobal x264_pixel_satd_8x8_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
cglobal x264_pixel_satd_8x4_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x4_internal_%1
%endmacro ; SATDS_SSE2
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
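; sa8d is the 8x8-transform analogue of satd: the 8x8 residual goes through an
; 8-point Hadamard transform in both directions and the absolute coefficients are
; summed (with the un-normalized total scaled down on return, as with satd), so it
; tracks the cost of the 8x8 transform the way satd tracks the 4x4 one.  16x16 sa8d
; is the sum over the four 8x8 sub-blocks.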
cglobal x264_pixel_sa8d_8x8_internal_%1
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
%ifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
    HADAMARD4_V m0, m1, m2, m8, m6
    HADAMARD4_V m4, m5, m3, m9, m6
    SUMSUB_BADC m0, m4, m1, m5, m6
    HADAMARD 2, sumsub, 0, 4, 6, 11
    HADAMARD 2, sumsub, 1, 5, 6, 11
    SUMSUB_BADC m2, m3, m8, m9, m6
    HADAMARD 2, sumsub, 2, 3, 6, 11
    HADAMARD 2, sumsub, 8, 9, 6, 11
    HADAMARD 1, amax, 0, 4, 6, 11
    HADAMARD 1, amax, 1, 5, 6, 4
    HADAMARD 1, amax, 2, 3, 6, 4
    HADAMARD 1, amax, 8, 9, 6, 4
    SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1
cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
    mova m7, [hmul_8p GLOBAL]
    call x264_pixel_sa8d_8x8_internal_%1
cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
    mova m7, [hmul_8p GLOBAL]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
cglobal x264_pixel_sa8d_8x8_internal_%1
%define spill0 [esp+4]
%define spill1 [esp+20]
%define spill2 [esp+36]
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
    mova m7, [hmul_8p GLOBAL]
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V m4, m5, m6, m7, m3
    HADAMARD4_V m0, m1, m2, m3, m7
    SUMSUB_BADC m0, m4, m1, m5, m7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    SUMSUB_BADC m2, m6, m3, m7, m4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
%endif ; ifndef mmxext
cglobal x264_pixel_sa8d_8x8_%1, 4,7
    call x264_pixel_sa8d_8x8_internal_%1
cglobal x264_pixel_sa8d_16x16_%1, 4,7
    call x264_pixel_sa8d_8x8_internal_%1
    call x264_pixel_sa8d_8x8_internal_%1
    paddusw m0, [esp+48]
    call x264_pixel_sa8d_8x8_internal_%1
    paddusw m0, [esp+48]
    mova [esp+64-mmsize], m0
    call x264_pixel_sa8d_8x8_internal_%1
    paddusw m0, [esp+64-mmsize]
%endif ; !ARCH_X86_64
;=============================================================================
; INTRA SATD
;=============================================================================
%macro INTRA_SA8D_SSE2 1
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
    movq m0, [r0+0*FENC_STRIDE]
    movq m1, [r0+1*FENC_STRIDE]
    movq m2, [r0+2*FENC_STRIDE]
    movq m3, [r0+3*FENC_STRIDE]
    movq m4, [r0+4*FENC_STRIDE]
    movq m5, [r0+5*FENC_STRIDE]
    movq m6, [r0+6*FENC_STRIDE]
    movq m7, [r0+7*FENC_STRIDE]
    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
    movzx r0d, word [r1+0]
    add r0w, word [r1+16]
    ABS4 m8, m9, m10, m11, m12, m13
    ABS2 m10, m11, m13, m14
    movdqa m14, m15 ; 7x8 sum
    movdqa m8, [r1+0] ; left edge
    ABS1 m9, m11 ; 1x8 sum
    punpcklqdq m0, m4 ; transpose
    movdqa m1, [r1+16] ; top edge
    psrldq m2, 2 ; 8x7 sum
    psubw m0, m1 ; 8x1 sum
    movdqa m7, [pw_1 GLOBAL]
    pshufd m5, m15, 0xf5
    movq [r2], m3 ; i8x8_v, i8x8_h
    movd [r2+8], m3 ; i8x8_dc
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
; out: m0..m3 = hadamard coefs
    movd m0, [r0+0*FENC_STRIDE]
    movd m1, [r0+1*FENC_STRIDE]
    movd m2, [r0+2*FENC_STRIDE]
    movd m3, [r0+3*FENC_STRIDE]
    HADAMARD4_2D 0, 1, 2, 3, 4
    SAVE_MM_PERMUTATION load_hadamard
%macro SCALAR_SUMSUB 4
%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
    shl %1d, 5 ; log(FDEC_STRIDE)
    movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
    movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
    movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
    movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [left_1d+2*%1+0], %2w
    mov [left_1d+2*%1+2], %3w
    mov [left_1d+2*%1+4], %4w
    mov [left_1d+2*%1+6], %5w
%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
    movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
    movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
    movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
    movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [top_1d+2*%1+0], %2w
    mov [top_1d+2*%1+2], %3w
    mov [top_1d+2*%1+4], %4w
    mov [top_1d+2*%1+6], %5w
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    mov qword [sums+0], 0
    mov qword [sums+8], 0
    mov qword [sums+16], 0
; in: m0..m3 (4x4), m7 (3x4)
; out: m0 v, m4 h, m5 dc
%macro SUM4x3 3 ; dc, left, top
    punpckldq m0, m2 ; transpose
    ABS2 m4, m5, m2, m3 ; 1x4 sum
    ABS1 m0, m1 ; 4x1 sum
%macro INTRA_SATDS_MMX 1
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
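; The x3 functions score three intra prediction modes against one fenc block per
; call.  Roughly (a sketch using hypothetical predict_* helpers; result ordering
; varies per variant, see the stores at the end of each function):
;
;   predict_v ( fdec );  res[0] = satd( fenc, FENC_STRIDE, fdec, FDEC_STRIDE );
;   predict_h ( fdec );  res[1] = satd( fenc, FENC_STRIDE, fdec, FDEC_STRIDE );
;   predict_dc( fdec );  res[2] = satd( fenc, FENC_STRIDE, fdec, FDEC_STRIDE );
;
; except that the asm computes one Hadamard transform of fenc and reuses it for all
; three modes, deriving the V/H/DC terms from 1D transforms of the top/left edges.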
cglobal x264_intra_satd_x3_4x4_%1, 2,6
; stack is 16 byte aligned because abi says so
%define top_1d  rsp-8  ; size 8
%define left_1d rsp-16 ; size 8
; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
%define top_1d esp+8
    SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
    SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
    lea t0d, [t0d + r0d + 4]
    SUM4x3 t0d, [left_1d], [top_1d]
    psrlq m1, 16 ; 4x3 sum
    SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
    movd [r2+0], m0 ; i4x4_v satd
    movd [r2+4], m4 ; i4x4_h satd
    movd [r2+8], m5 ; i4x4_dc satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_%1, 0,7
%assign stack_pad 88
%assign stack_pad 88 + ((stack_offset+88+4)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+64 ; size 24
%define top_1d  rsp+32 ; size 32
%define left_1d rsp    ; size 32
    SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
    SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6
    SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
    paddw m0, [sums+0]  ; i16x16_v satd
    paddw m4, [sums+8]  ; i16x16_h satd
    paddw m5, [sums+16] ; i16x16_dc satd
    add r0, 4*FENC_STRIDE-16
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    movd [r2+8], m2 ; i16x16_dc satd
    movd [r2+4], m1 ; i16x16_h satd
    movd [r2+0], m0 ; i16x16_v satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_%1, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+48 ; size 24
%define dc_1d   rsp+32 ; size 16
%define top_1d  rsp+16 ; size 16
%define left_1d rsp    ; size 16
    SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
    SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5
    movzx t2d, word [left_1d+0]
    movzx r3d, word [top_1d+0]
    movzx r4d, word [left_1d+8]
    movzx r5d, word [top_1d+8]
    mov [dc_1d+ 0], t2d ; tl
    mov [dc_1d+ 4], r5d ; tr
    mov [dc_1d+ 8], r4d ; bl
    mov [dc_1d+12], r3d ; br
    SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
    paddw m0, [sums+16] ; i4x4_v satd
    paddw m4, [sums+8]  ; i4x4_h satd
    paddw m5, [sums+0]  ; i4x4_dc satd
    add r0, 4*FENC_STRIDE-8
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    movd [r2+0], m0 ; i8x8c_dc satd
    movd [r2+4], m1 ; i8x8c_h satd
    movd [r2+8], m2 ; i8x8c_v satd
%endmacro ; INTRA_SATDS_MMX
%macro ABS_MOV_SSSE3 2
%macro ABS_MOV_MMX 2
%define ABS_MOV ABS_MOV_MMX
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
cglobal x264_hadamard_ac_4x4_mmxext
    HADAMARD4_2D 0, 1, 2, 3, 4
    SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
cglobal x264_hadamard_ac_2x2max_mmxext
    SUMSUB_BADC m0, m1, m2, m3, m4
    ABS4 m0, m2, m1, m3, m4, m5
    HADAMARD 0, max, 0, 2, 4, 5
    HADAMARD 0, max, 1, 3, 4, 5
    SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext
cglobal x264_hadamard_ac_8x8_mmxext
    mova m6, [mask_ac4 GLOBAL]
    call x264_hadamard_ac_4x4_mmxext
    call x264_hadamard_ac_4x4_mmxext
    call x264_hadamard_ac_4x4_mmxext
    call x264_hadamard_ac_4x4_mmxext
    mova [rsp+gprsize+8], m5 ; save satd
    call x264_hadamard_ac_2x2max_mmxext
    SUMSUB_BADC m0, m1, m2, m3, m4
    HADAMARD 0, sumsub, 0, 2, 4, 5
    ABS4 m1, m3, m0, m2, m4, m5
    HADAMARD 0, max, 1, 3, 4, 5
    mova [rsp+gprsize], m6 ; save sa8d
    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
%macro HADAMARD_AC_WXH_MMX 2
cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
%assign pad 16-gprsize-(stack_offset&15)
    call x264_hadamard_ac_8x8_mmxext
    call x264_hadamard_ac_8x8_mmxext
    lea r0, [r0+ysub*4+8]
    call x264_hadamard_ac_8x8_mmxext
    call x264_hadamard_ac_8x8_mmxext
    paddusw m0, [rsp+0x10]
    paddusw m1, [rsp+0x18]
    paddusw m1, [rsp+0x28]
    paddusw m2, [rsp+0x30]
    paddusw m1, [rsp+0x38]
    pand m3, [pw_1 GLOBAL]
    add rsp, 128+%1*%2/4+pad
%endmacro ; HADAMARD_AC_WXH_MMX
HADAMARD_AC_WXH_MMX 16, 16
HADAMARD_AC_WXH_MMX  8, 16
HADAMARD_AC_WXH_MMX 16,  8
HADAMARD_AC_WXH_MMX  8,  8
%macro LOAD_INC_8x4W_SSE2 5
%macro LOAD_INC_8x4W_SSSE3 5
    LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
    HSUMSUB %1, %2, %3, %4, %5
%macro HADAMARD_AC_SSE2 1
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal x264_hadamard_ac_8x8_%1
%define spill0 [rsp+gprsize]
%define spill1 [rsp+gprsize+16]
%define spill2 [rsp+gprsize+32]
    ;LOAD_INC loads sumsubs
    mova m7, [hmul_8p GLOBAL]
    ;LOAD_INC only unpacks to words
    LOAD_INC_8x4W 0, 1, 2, 3, 7
    HADAMARD4_2D_SSE 0, 1, 2, 3, 4
    HADAMARD4_V m0, m1, m2, m3, m4
    LOAD_INC_8x4W 4, 5, 6, 7, 1
    HADAMARD4_2D_SSE 4, 5, 6, 7, 1
    HADAMARD4_V m4, m5, m6, m7, m1
    HADAMARD 1, sumsub, 0, 1, 6, 7
    HADAMARD 1, sumsub, 2, 3, 6, 7
    HADAMARD 1, sumsub, 4, 5, 1, 0
    HADAMARD 1, sumsub, 6, 7, 1, 0
    SUMSUB_BA m0, m4 ; m2
    pand m1, [mask_ac4b GLOBAL]
    pand m1, [mask_ac4 GLOBAL]
    mova [rsp+gprsize+32], m1 ; save satd
    HADAMARD 2, amax, 3, 7, 4
    HADAMARD 2, amax, 2, 6, 7, 4
    HADAMARD 2, amax, 1, 5, 6, 7
    HADAMARD 2, sumsub, 0, 4, 5, 6
    HADAMARD 4, amax, 3, 7, 4
    HADAMARD 4, amax, 2, 6, 7, 4
    HADAMARD 4, amax, 1, 5, 6, 7
    HADAMARD 4, sumsub, 0, 4, 5, 6
    pand m0, [mask_ac8 GLOBAL]
    mova [rsp+gprsize+16], m0 ; save sa8d
    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
HADAMARD_AC_WXH_SSE2 16, 16, %1
HADAMARD_AC_WXH_SSE2  8, 16, %1
HADAMARD_AC_WXH_SSE2 16,  8, %1
HADAMARD_AC_WXH_SSE2  8,  8, %1
%endmacro ; HADAMARD_AC_SSE2
; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
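; Rough scalar model of what hadamard_ac measures per 8x8 block (a sketch, not the
; exact code below): satd_ac sums |coef| over the four 4x4 Hadamard transforms with
; each 4x4 DC term masked out, and sa8d_ac sums |coef| over the single 8x8 Hadamard
; transform with its DC masked out, i.e. texture energy with the block averages
; removed, which is what the mask_ac* constants implement.  The two totals come
; back as the satd/sa8d pair described above.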
%macro HADAMARD_AC_WXH_SSE2 3
cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11
%assign pad 16-gprsize-(stack_offset&15)
    call x264_hadamard_ac_8x8_%3
    call x264_hadamard_ac_8x8_%3
    lea r0, [r0+ysub*4+8]
    call x264_hadamard_ac_8x8_%3
    call x264_hadamard_ac_8x8_%3
    paddusw m0, [rsp+0x30]
    paddusw m1, [rsp+0x40]
    paddusw m0, [rsp+0x50]
    paddusw m1, [rsp+0x60]
    paddusw m0, [rsp+0x70]
    paddusw m1, [rsp+0x80]
    shr edx, 2 - (%1*%2 >> 8)
    add rsp, 16+%1*%2/2+pad
%endmacro ; HADAMARD_AC_WXH_SSE2

cextern x264_pixel_sa8d_8x8_internal_mmxext
%define TRANS TRANS_SSE2
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%define DIFFOP DIFF_UNPACK_SSE2
%define JDUP JDUP_SSE2
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
HADAMARD_AC_SSE2 sse2
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
%define ABS_MOV ABS_MOV_SSSE3
%define DIFFOP DIFF_SUMSUB_SSSE3
%define JDUP JDUP_CONROE
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
HADAMARD_AC_SSE2 ssse3
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
%define TRANS TRANS_SSE4
%define JDUP JDUP_PENRYN
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
HADAMARD_AC_SSE2 sse4
;=============================================================================
; SSIM
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
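; Per call this accumulates, for two horizontally adjacent 4x4 windows (a hedged
; description of the output layout, matching the s1/s2/ss/s12 usage in ssim_end4):
;   sums[i][0] = s1  = sum(pix1)
;   sums[i][1] = s2  = sum(pix2)
;   sums[i][2] = ss  = sum(pix1*pix1 + pix2*pix2)
;   sums[i][3] = s12 = sum(pix1*pix2)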
    movq m5, [r0+(%1&1)*r1]
    movq m6, [r2+(%1&1)*r3]
cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
    movdqa m7, [pw_1 GLOBAL]
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
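; What this evaluates per window group (a sketch of the math, written with the
; ssim_c1/ssim_c2 constants from the top of the file; s1,s2,ss,s12 are the sums
; produced by the 4x4x2 core above, combined over 64 pixels):
;   vars  = 64*ss  - s1*s1 - s2*s2
;   covar = 64*s12 - s1*s2
;   ssim  = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
;         / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
; and the function returns the sum of ssim over `width` window groups as a float.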
cglobal x264_pixel_ssim_end4_sse2, 3,3,7
    movdqa m5, [ssim_c1 GLOBAL]
    movdqa m6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D 0, 1, 2, 3, 4
    ; s1=m0, s2=m1, ss=m2, s12=m3
    pmaddwd m4, m0 ; s1*s2
    pmaddwd m0, m0 ; s1*s1 + s2*s2
    psubd m3, m4 ; covar*2
    cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
    je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
    lea r3, [mask_ff + 16 GLOBAL]
    movdqu m1, [r3 + r2*4]
    movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
;=============================================================================
; Successive Elimination ADS
;=============================================================================
%macro ADS_START 1 ; unroll_size
    movsxd r5, dword r5m
%define ABS1 ABS1_MMX
;-----------------------------------------------------------------------------
; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
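; Scalar model of ads4 (a sketch with hypothetical naming; ads1/ads2 are the same
; idea with one or two sub-block DCs).  For each candidate position it compares the
; precomputed 8x8 DC sums against the encoded block's DCs, adds the MV cost, and
; keeps positions under the threshold:
;
; static int ads4( int enc_dc[4], uint16_t *sums, int delta,
;                  uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
; {
;     int nmv = 0;
;     for( int i = 0; i < width; i++, sums++ )
;     {
;         int ads = abs( enc_dc[0] - sums[0] )
;                 + abs( enc_dc[1] - sums[8] )
;                 + abs( enc_dc[2] - sums[delta] )
;                 + abs( enc_dc[3] - sums[delta+8] )
;                 + cost_mvx[i];
;         if( ads < thresh )
;             mvs[nmv++] = i;
;     }
;     return nmv;
; }
;
; The asm below vectorizes the loop and emits a byte mask of passing positions,
; which x264_pixel_ads_mvs (further down) compacts into the mvs list.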
cglobal x264_pixel_ads4_mmxext, 4,7
    pshufw mm6, mm6, 0xAA
    pshufw mm4, mm4, 0xAA
    movq mm3, [r1+r2+16]
    pshufw mm1, [r10+stack_offset+56], 0
%elifdef ARCH_X86_64
    pshufw mm1, [r10+8], 0
    pshufw mm1, [ebp+stack_offset+28], 0
cglobal x264_pixel_ads2_mmxext, 4,7
    pshufw mm6, mm6, 0xAA
cglobal x264_pixel_ads1_mmxext, 4,7
cglobal x264_pixel_ads4_%1, 4,7,12
    pshuflw xmm7, xmm4, 0
    pshuflw xmm6, xmm4, 0xAA
    pshufhw xmm5, xmm4, 0
    pshufhw xmm4, xmm4, 0xAA
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpckhqdq xmm5, xmm5
    punpckhqdq xmm4, xmm4
    pshuflw xmm8, r6m, 0
    punpcklqdq xmm8, xmm8
    movdqu xmm11, [r1+r2]
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r1+r2+16]
    movdqu xmm1, [r1+16]
    movdqu xmm2, [r1+r2]
    movdqu xmm3, [r1+r2+16]
    movd xmm1, [ebp+stack_offset+28]
    pshuflw xmm1, xmm1, 0
    punpcklqdq xmm1, xmm1
cglobal x264_pixel_ads2_%1, 4,7,8
    pshuflw xmm7, xmm6, 0
    pshuflw xmm6, xmm6, 0xAA
    pshuflw xmm5, xmm5, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpcklqdq xmm5, xmm5
    movdqu xmm1, [r1+r2]
cglobal x264_pixel_ads1_%1, 4,7,8
    pshuflw xmm7, xmm7, 0
    pshuflw xmm6, xmm6, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r3+16]
%define ABS1 ABS1_SSSE3
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv = 0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i=0; i<width; i+=8 )
;     {
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j=0; j<8; j++ )
;             if( mask & (255<<j*8) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }
cglobal x264_pixel_ads_mvs, 0,7,0
    ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
    mov dword [rsp+r9], 0
    test edi, 0xff<<(%1*8)
    mov ebx, [ebp+stack_offset+20] ; mvs
    mov edi, [ebp+stack_offset+24] ; width
    mov dword [esp+edi], 0
    mov ebp, [esp+esi+4]
    mov edx, [esp+esi+8]
    test %2, 0xff<<(%1*8)