;*****************************************************************************
;* pixel.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
;*****************************************************************************

%include "x86util.asm"

pw_00ff:   times 8 dw 0xff
ssim_c1:   times 4 dd 416    ; .01*.01*255*255*64
ssim_c2:   times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff:   times 16 db 0xff

mask_ac4:  dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8:  dw 0, -1, -1, -1, -1, -1, -1, -1
hsub_mul:  times 8 db 1, -1
hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1

mask_10:   times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
%macro HADDD 2 ; sum junk
    pmaddwd %1, [pw_1 GLOBAL]
;=============================================================================
; SSD
;=============================================================================
%macro SSD_LOAD_FULL 5
    DEINTB %2, %1, %4, %3, 7

%macro SSD_LOAD_HALF 5
    LOAD 1, 2, [r0+%1], [r0+%3], 1
    JOIN 1, 2, 3, 4, [r2+%2], [r2+%4], 1
    LOAD 3, 4, [r0+%1], [r0+%3], %5
    JOIN 3, 4, 5, 6, [r2+%2], [r2+%4], %5
%macro SSD_CORE_SSE2 7-8
    DEINTB %6, %1, %7, %2, %5
    DEINTB %6, %3, %7, %4, %5

%macro SSD_CORE_SSSE3 7-8

    SSD_LOAD_%1 %2,%3,%4,%5,%7
    SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
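; For reference, the computation in rough C (a sketch, not the exact x264 C
; fallback; w/h stand for the %1/%2 of each instantiation below):
;     int ssd_wxh( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
;     {
;         int sum = 0;
;         for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
;             for( int x = 0; x < w; x++ )
;             {
;                 int d = pix1[x] - pix2[x];
;                 sum += d*d;
;             }
;         return sum;
;     }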
cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
    mova m7, [hsub_mul GLOBAL]
    mova m7, [pw_00ff GLOBAL]
    SSD_ITER FULL, 0, 0, mmsize, mmsize, i, 0
    SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, 1
    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1, 0
    SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/4-1
    SSD_ITER FULL, 0, 0, r1, r3, i, 1
    SSD_ITER FULL, 0, 0, r1, r3, 1, i<%2/4-1
    SSD_ITER HALF, 0, 0, r1, r3, i, i<%2/4-1
SSD 16, 16, sse2slow, 8
SSD 16,  8, sse2slow, 8
SSD  8, 16, sse2slow, 8
SSD  8,  8, sse2slow, 8
SSD  8,  4, sse2slow, 8
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
;=============================================================================
; variance
;=============================================================================
    pxor m6, m6 ; sum squared
    mova m7, [pw_00ff GLOBAL]

    sub eax, r1d ; sqr - (sum * sum >> shift)
;-----------------------------------------------------------------------------
; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
;-----------------------------------------------------------------------------
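; Rough C equivalent (a sketch; "shift" is log2(w*h), i.e. 8 for 16x16 and
; 6 for 8x8, matching the "sqr - (sum * sum >> shift)" step above):
;     uint32_t var_wxh( uint8_t *pix, int stride )
;     {
;         uint32_t sum = 0, sqr = 0;
;         for( int y = 0; y < h; y++, pix += stride )
;             for( int x = 0; x < w; x++ )
;             {
;                 sum += pix[x];
;                 sqr += pix[x] * pix[x];
;             }
;         return sqr - (sum * sum >> shift);
;     }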
cglobal x264_pixel_var_16x16_mmxext, 2,3

cglobal x264_pixel_var_8x8_mmxext, 2,3

cglobal x264_pixel_var_16x16_sse2, 2,3,8

cglobal x264_pixel_var_8x8_sse2, 2,4,8
;=============================================================================
; SATD
;=============================================================================
%define TRANS TRANS_SSE2

; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
; join 2x 32 bit and duplicate them
; emulating shufps is faster on conroe
; just use shufps on anything post conroe

%macro DIFF_UNPACK_SSE2 5

%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5

%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer

%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer

%macro LOAD_DUP_4x8P_PENRYN 8
; penryn and nehalem run punpcklqdq and movddup in different units

%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5

%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]

%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5

%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    DEINTB %1, %2, %3, %4, %5
    SUMSUB_BA m%1, m%2, m%3

%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5

; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
%macro SATD_4x4_MMX 3
    LOAD_DIFF m4, m3, none, [r0+%2],      [r2+%2]
    LOAD_DIFF m5, m3, none, [r0+r1+%2],   [r2+r3+%2]
    LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
    LOAD_DIFF m7, m3, none, [r0+r4+%2],   [r2+r5+%2]
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n

%macro SATD_8x4_SSE 8-9
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
    HADAMARD4_V m%2, m%3, m%4, m%5, m%6
    ; doing the abs first is a slight advantage
    ABS4 m%2, m%4, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
    HADAMARD 1, max, %3, %5, %6, %7

%macro SATD_START_MMX 0
    lea r4, [3*r1] ; 3*stride1
    lea r5, [3*r3] ; 3*stride2

%macro SATD_END_MMX 0
    pshufw m1, m0, 01001110b
    pshufw m1, m0, 10110001b

; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
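; What one 4x4 SATD computes, in rough C (a sketch of the standard algorithm,
; not the exact x264 C fallback; larger sizes just sum sub-block satds):
;     #define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
;         int t0 = s0 + s1, t1 = s0 - s1, t2 = s2 + s3, t3 = s2 - s3;\
;         d0 = t0 + t2; d2 = t0 - t2; d1 = t1 + t3; d3 = t1 - t3;\
;     }
;     int satd_4x4( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
;     {
;         int tmp[4][4], a0, a1, a2, a3, sum = 0;
;         for( int i = 0; i < 4; i++, pix1 += stride1, pix2 += stride2 )
;             HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3],
;                        pix1[0]-pix2[0], pix1[1]-pix2[1],
;                        pix1[2]-pix2[2], pix1[3]-pix2[3] );
;         for( int i = 0; i < 4; i++ )
;         {
;             HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
;             sum += abs(a0) + abs(a1) + abs(a2) + abs(a3);
;         }
;         return sum >> 1;
;     }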
cglobal x264_pixel_satd_16x4_internal_mmxext
    SATD_4x4_MMX m2,  0, 0
    SATD_4x4_MMX m1,  4, 0
    SATD_4x4_MMX m2,  8, 0
    SATD_4x4_MMX m1, 12, 0

cglobal x264_pixel_satd_8x8_internal_mmxext
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 1
x264_pixel_satd_8x4_internal_mmxext:
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0

cglobal x264_pixel_satd_16x16_mmxext, 4,6
    call x264_pixel_satd_16x4_internal_mmxext
    call x264_pixel_satd_16x4_internal_mmxext

cglobal x264_pixel_satd_16x8_mmxext, 4,6
    call x264_pixel_satd_16x4_internal_mmxext
    call x264_pixel_satd_16x4_internal_mmxext

cglobal x264_pixel_satd_8x16_mmxext, 4,6
    call x264_pixel_satd_8x8_internal_mmxext
    call x264_pixel_satd_8x8_internal_mmxext

cglobal x264_pixel_satd_8x8_mmxext, 4,6
    call x264_pixel_satd_8x8_internal_mmxext

cglobal x264_pixel_satd_8x4_mmxext, 4,6
    call x264_pixel_satd_8x4_internal_mmxext

cglobal x264_pixel_satd_4x8_mmxext, 4,6
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 0

cglobal x264_pixel_satd_4x4_mmxext, 4,6
    SATD_4x4_MMX m0, 0, 0
%macro SATD_START_SSE2 3
    mova %3, [hmul_8p GLOBAL]

%macro SATD_END_SSE2 2

%macro BACKUP_POINTERS 0

%macro RESTORE_AND_INC_POINTERS 0
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
    mova m4, [hmul_4p GLOBAL]
    LOAD_DUP_2x4P m2, m5, [r2],      [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0],      [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3

cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
    mova m7, [hmul_4p GLOBAL]
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6, swap

cglobal x264_pixel_satd_8x8_internal_%1
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
x264_pixel_satd_8x4_internal_%1:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
cglobal x264_pixel_satd_16x4_internal_%1
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
    SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10

cglobal x264_pixel_satd_16x8_%1, 4,6,12
    SATD_START_SSE2 %1, m10, m7
    mova m7, [pw_00ff GLOBAL]
    jmp x264_pixel_satd_16x8_internal_%1

cglobal x264_pixel_satd_16x16_%1, 4,6,12
    SATD_START_SSE2 %1, m10, m7
    mova m7, [pw_00ff GLOBAL]
    call x264_pixel_satd_16x4_internal_%1
    call x264_pixel_satd_16x4_internal_%1
x264_pixel_satd_16x8_internal_%1:
    call x264_pixel_satd_16x4_internal_%1
    call x264_pixel_satd_16x4_internal_%1
    SATD_END_SSE2 %1, m10
%else
cglobal x264_pixel_satd_16x8_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    RESTORE_AND_INC_POINTERS
    call x264_pixel_satd_8x8_internal_%1

cglobal x264_pixel_satd_16x16_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
    RESTORE_AND_INC_POINTERS
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1
%endif
cglobal x264_pixel_satd_8x16_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1
    call x264_pixel_satd_8x8_internal_%1

cglobal x264_pixel_satd_8x8_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x8_internal_%1

cglobal x264_pixel_satd_8x4_%1, 4,6,8
    SATD_START_SSE2 %1, m6, m7
    call x264_pixel_satd_8x4_internal_%1
%endmacro ; SATDS_SSE2
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
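; sa8d is the 8x8 analog of satd: one 8x8 Hadamard transform of the residual
; instead of four 4x4 ones, summing absolute transform coefficients. By
; x264's convention the final 8x8 sum is normalized as (sum+2)>>2 rather
; than satd's >>1 (stated here as a reminder; the rounding is applied at the
; end of the full function).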
cglobal x264_pixel_sa8d_8x8_internal_%1
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
%ifidn %1, sse2 ; sse2 doesn't seem to like the horizontal way of doing things
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else
    HADAMARD4_V m0, m1, m2, m8, m6
    HADAMARD4_V m4, m5, m3, m9, m6
    SUMSUB_BADC m0, m4, m1, m5, m6
    HADAMARD 2, sumsub, 0, 4, 6, 11
    HADAMARD 2, sumsub, 1, 5, 6, 11
    SUMSUB_BADC m2, m3, m8, m9, m6
    HADAMARD 2, sumsub, 2, 3, 6, 11
    HADAMARD 2, sumsub, 8, 9, 6, 11
    HADAMARD 1, amax, 0, 4, 6, 11
    HADAMARD 1, amax, 1, 5, 6, 4
    HADAMARD 1, amax, 2, 3, 6, 4
    HADAMARD 1, amax, 8, 9, 6, 4
%endif
    SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1
cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
    mova m7, [hmul_8p GLOBAL]
    call x264_pixel_sa8d_8x8_internal_%1

cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
    mova m7, [hmul_8p GLOBAL]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
cglobal x264_pixel_sa8d_8x8_internal_%1
%define spill0 [esp+4]
%define spill1 [esp+20]
%define spill2 [esp+36]
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax

    mova m7, [hmul_8p GLOBAL]
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V m4, m5, m6, m7, m3
    HADAMARD4_V m0, m1, m2, m3, m7
    SUMSUB_BADC m0, m4, m1, m5, m7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    SUMSUB_BADC m2, m6, m3, m7, m4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
%endif ; ifndef mmxext
cglobal x264_pixel_sa8d_8x8_%1, 4,7
    call x264_pixel_sa8d_8x8_internal_%1

cglobal x264_pixel_sa8d_16x16_%1, 4,7
    call x264_pixel_sa8d_8x8_internal_%1
    call x264_pixel_sa8d_8x8_internal_%1
    call x264_pixel_sa8d_8x8_internal_%1
    mova [esp+64-mmsize], m0
    call x264_pixel_sa8d_8x8_internal_%1
    paddusw m0, [esp+64-mmsize]
%endif ; !ARCH_X86_64
;=============================================================================
; INTRA SATD
;=============================================================================
1004 %macro INTRA_SA8D_SSE2 1
1007 ;-----------------------------------------------------------------------------
1008 ; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
1009 ;-----------------------------------------------------------------------------
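; The x3 functions score three intra prediction modes at once: res[0..2]
; receive the V, H and DC costs (see the i8x8_v/i8x8_h/i8x8_dc stores below).
; Rather than materializing each predicted block, the prediction is folded in
; via the 1-D Hadamard of the edge pixels passed in edges[2][8].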
cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
    movq m0, [r0+0*FENC_STRIDE]
    movq m1, [r0+1*FENC_STRIDE]
    movq m2, [r0+2*FENC_STRIDE]
    movq m3, [r0+3*FENC_STRIDE]
    movq m4, [r0+4*FENC_STRIDE]
    movq m5, [r0+5*FENC_STRIDE]
    movq m6, [r0+6*FENC_STRIDE]
    movq m7, [r0+7*FENC_STRIDE]
    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
    movzx r0d, word [r1+0]
    add r0w, word [r1+16]
    ABS4 m8, m9, m10, m11, m12, m13
    ABS2 m10, m11, m13, m14
    movdqa m14, m15 ; 7x8 sum
    movdqa m8, [r1+0] ; left edge
    ABS1 m9, m11 ; 1x8 sum
    punpcklqdq m0, m4 ; transpose
    movdqa m1, [r1+16] ; top edge
    psrldq m2, 2 ; 8x7 sum
    psubw m0, m1 ; 8x1 sum
    movdqa m7, [pw_1 GLOBAL]
    pshufd m5, m15, 0xf5
    movq [r2], m3 ; i8x8_v, i8x8_h
    movd [r2+8], m3 ; i8x8_dc
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
; out: m0..m3 = hadamard coefs
    movd m0, [r0+0*FENC_STRIDE]
    movd m1, [r0+1*FENC_STRIDE]
    movd m2, [r0+2*FENC_STRIDE]
    movd m3, [r0+3*FENC_STRIDE]
    HADAMARD4_2D 0, 1, 2, 3, 4
    SAVE_MM_PERMUTATION load_hadamard
%macro SCALAR_SUMSUB 4

%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
    shl %1d, 5 ; log2(FDEC_STRIDE)
    movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
    movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
    movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
    movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [left_1d+2*%1+0], %2w
    mov [left_1d+2*%1+2], %3w
    mov [left_1d+2*%1+4], %4w
    mov [left_1d+2*%1+6], %5w

%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
    movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
    movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
    movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
    movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [top_1d+2*%1+0], %2w
    mov [top_1d+2*%1+2], %3w
    mov [top_1d+2*%1+4], %4w
    mov [top_1d+2*%1+6], %5w
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b

    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b

    mov qword [sums+0],  0
    mov qword [sums+8],  0
    mov qword [sums+16], 0
; in: m0..m3 (4x4), m7 (3x4)
; out: m0 v, m4 h, m5 dc
%macro SUM4x3 3 ; dc, left, top
    punpckldq m0, m2 ; transpose
    ABS2 m4, m5, m2, m3 ; 1x4 sum
    ABS1 m0, m1 ; 4x1 sum
%macro INTRA_SATDS_MMX 1

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
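; Same x3 idea at 4x4 scale: a predicted block's satd differs from the
; source's only in the terms touched by the prediction's 1-D profile, so one
; 4x4 Hadamard of fenc is shared by all three modes and corrected with the
; scalar edge Hadamards computed by SCALAR_HADAMARD_LEFT/TOP above.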
cglobal x264_intra_satd_x3_4x4_%1, 2,6
    ; stack is 16-byte aligned because the ABI says so
%define top_1d  rsp-8  ; size 8
%define left_1d rsp-16 ; size 8

    ; stack is 16-byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
%define top_1d esp+8

    SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
    SCALAR_HADAMARD_TOP  0, r0, r3, r4, r5
    lea t0d, [t0d + r0d + 4]
    SUM4x3 t0d, [left_1d], [top_1d]
    psrlq m1, 16 ; 4x3 sum
    SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
    movd [r2+0], m0 ; i4x4_v satd
    movd [r2+4], m4 ; i4x4_h satd
    movd [r2+8], m5 ; i4x4_dc satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_%1, 0,7
%assign stack_pad 88
%assign stack_pad 88 + ((stack_offset+88+4)&15)
    ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+64 ; size 24
%define top_1d  rsp+32 ; size 32
%define left_1d rsp    ; size 32

    SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
    SCALAR_HADAMARD_TOP  t0, r3, r4, r5, r6
    SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
    paddw m0, [sums+0]  ; i16x16_v satd
    paddw m4, [sums+8]  ; i16x16_h satd
    paddw m5, [sums+16] ; i16x16_dc satd
    add r0, 4*FENC_STRIDE-16
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    movd [r2+8], m2 ; i16x16_dc satd
    movd [r2+4], m1 ; i16x16_h satd
    movd [r2+0], m0 ; i16x16_v satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_%1, 0,6
    ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+48 ; size 24
%define dc_1d   rsp+32 ; size 16
%define top_1d  rsp+16 ; size 16
%define left_1d rsp    ; size 16

    SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
    SCALAR_HADAMARD_TOP  t0, t2, r3, r4, r5
    movzx t2d, word [left_1d+0]
    movzx r3d, word [top_1d+0]
    movzx r4d, word [left_1d+8]
    movzx r5d, word [top_1d+8]
    mov [dc_1d+ 0], t2d ; tl
    mov [dc_1d+ 4], r5d ; tr
    mov [dc_1d+ 8], r4d ; bl
    mov [dc_1d+12], r3d ; br
    SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
    paddw m0, [sums+16] ; i4x4_v satd
    paddw m4, [sums+8]  ; i4x4_h satd
    paddw m5, [sums+0]  ; i4x4_dc satd
    add r0, 4*FENC_STRIDE-8
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
    movd [r2+0], m0 ; i8x8c_dc satd
    movd [r2+4], m1 ; i8x8c_h satd
    movd [r2+8], m2 ; i8x8c_v satd
%endmacro ; INTRA_SATDS_MMX
%macro ABS_MOV_SSSE3 2

%macro ABS_MOV_MMX 2

%define ABS_MOV ABS_MOV_MMX
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
cglobal x264_hadamard_ac_4x4_mmxext
    HADAMARD4_2D 0, 1, 2, 3, 4
    SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
cglobal x264_hadamard_ac_2x2max_mmxext
    SUMSUB_BADC m0, m1, m2, m3, m4
    ABS4 m0, m2, m1, m3, m4, m5
    HADAMARD 0, max, 0, 2, 4, 5
    HADAMARD 0, max, 1, 3, 4, 5
    SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext
cglobal x264_hadamard_ac_8x8_mmxext
    mova m6, [mask_ac4 GLOBAL]
    call x264_hadamard_ac_4x4_mmxext
    call x264_hadamard_ac_4x4_mmxext
    call x264_hadamard_ac_4x4_mmxext
    call x264_hadamard_ac_4x4_mmxext
    mova [rsp+gprsize+8], m5 ; save satd
    call x264_hadamard_ac_2x2max_mmxext
    SUMSUB_BADC m0, m1, m2, m3, m4
    HADAMARD 0, sumsub, 0, 2, 4, 5
    ABS4 m1, m3, m0, m2, m4, m5
    HADAMARD 0, max, 1, 3, 4, 5
    mova [rsp+gprsize], m6 ; save sa8d
    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
%macro HADAMARD_AC_WXH_MMX 2
cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
%assign pad 16-gprsize-(stack_offset&15)
    call x264_hadamard_ac_8x8_mmxext
    call x264_hadamard_ac_8x8_mmxext
    lea r0, [r0+ysub*4+8]
    call x264_hadamard_ac_8x8_mmxext
    call x264_hadamard_ac_8x8_mmxext
    paddusw m0, [rsp+0x10]
    paddusw m1, [rsp+0x18]
    paddusw m1, [rsp+0x28]
    paddusw m2, [rsp+0x30]
    paddusw m1, [rsp+0x38]
    pand m3, [pw_1 GLOBAL]
    add rsp, 128+%1*%2/4+pad
%endmacro ; HADAMARD_AC_WXH_MMX

HADAMARD_AC_WXH_MMX 16, 16
HADAMARD_AC_WXH_MMX  8, 16
HADAMARD_AC_WXH_MMX 16,  8
HADAMARD_AC_WXH_MMX  8,  8
%macro LOAD_INC_8x4W_SSE2 5

%macro LOAD_INC_8x4W_SSSE3 5
    LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
    HSUMSUB %1, %2, %3, %4, %5
%macro HADAMARD_AC_SSE2 1
; in:  r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal x264_hadamard_ac_8x8_%1
%define spill0 [rsp+gprsize]
%define spill1 [rsp+gprsize+16]
%define spill2 [rsp+gprsize+32]
    ; LOAD_INC loads sumsubs
    mova m7, [hmul_8p GLOBAL]
    ; LOAD_INC only unpacks to words
    LOAD_INC_8x4W 0, 1, 2, 3, 7
    HADAMARD4_2D_SSE 0, 1, 2, 3, 4
    HADAMARD4_V m0, m1, m2, m3, m4
    LOAD_INC_8x4W 4, 5, 6, 7, 1
    HADAMARD4_2D_SSE 4, 5, 6, 7, 1
    HADAMARD4_V m4, m5, m6, m7, m1
    HADAMARD 1, sumsub, 0, 1, 6, 7
    HADAMARD 1, sumsub, 2, 3, 6, 7
    HADAMARD 1, sumsub, 4, 5, 1, 0
    HADAMARD 1, sumsub, 6, 7, 1, 0
    SUMSUB_BA m0, m4 ; m2
    pand m1, [mask_ac4b GLOBAL]
    pand m1, [mask_ac4 GLOBAL]
    mova [rsp+gprsize+32], m1 ; save satd
    HADAMARD 2, amax, 3, 7, 4
    HADAMARD 2, amax, 2, 6, 7, 4
    HADAMARD 2, amax, 1, 5, 6, 7
    HADAMARD 2, sumsub, 0, 4, 5, 6
    HADAMARD 4, amax, 3, 7, 4
    HADAMARD 4, amax, 2, 6, 7, 4
    HADAMARD 4, amax, 1, 5, 6, 7
    HADAMARD 4, sumsub, 0, 4, 5, 6
    pand m0, [mask_ac8 GLOBAL]
    mova [rsp+gprsize+16], m0 ; save sa8d
    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
HADAMARD_AC_WXH_SSE2 16, 16, %1
HADAMARD_AC_WXH_SSE2  8, 16, %1
HADAMARD_AC_WXH_SSE2 16,  8, %1
HADAMARD_AC_WXH_SSE2  8,  8, %1
%endmacro ; HADAMARD_AC_SSE2
; struct { int satd, sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
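; A rough caller-side sketch (hypothetical snippet; the packed 64-bit return
; with satd in the low half and sa8d in the high half is an assumption based
; on the struct comment above):
;     uint64_t cost = x264_pixel_hadamard_ac_16x16_sse2( pix, stride );
;     int satd = (uint32_t)cost; /* sum of AC coefficients, 4x4 transforms */
;     int sa8d = cost >> 32;     /* sum of AC coefficients, 8x8 transforms */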
%macro HADAMARD_AC_WXH_SSE2 3
cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11
%assign pad 16-gprsize-(stack_offset&15)
    call x264_hadamard_ac_8x8_%3
    call x264_hadamard_ac_8x8_%3
    lea r0, [r0+ysub*4+8]
    call x264_hadamard_ac_8x8_%3
    call x264_hadamard_ac_8x8_%3
    paddusw m0, [rsp+0x30]
    paddusw m1, [rsp+0x40]
    paddusw m0, [rsp+0x50]
    paddusw m1, [rsp+0x60]
    paddusw m0, [rsp+0x70]
    paddusw m1, [rsp+0x80]
    shr edx, 2 - (%1*%2 >> 8)
    add rsp, 16+%1*%2/2+pad
%endmacro ; HADAMARD_AC_WXH_SSE2
cextern x264_pixel_sa8d_8x8_internal_mmxext

%define TRANS TRANS_SSE2
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%define DIFFOP DIFF_UNPACK_SSE2
%define JDUP JDUP_SSE2
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
HADAMARD_AC_SSE2 sse2
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
%define ABS_MOV ABS_MOV_SSSE3
%define DIFFOP DIFF_SUMSUB_SSSE3
%define JDUP JDUP_CONROE
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
HADAMARD_AC_SSE2 ssse3
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; or movups
%undef punpcklqdq ; or movlhps
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3

%define TRANS TRANS_SSE4
%define JDUP JDUP_PENRYN
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
HADAMARD_AC_SSE2 sse4
;=============================================================================
; SSIM
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
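; Rough C reference (a sketch; two horizontally adjacent 4x4 blocks, each
; producing {s1, s2, ss, s12} in the sums[2][4] layout):
;     void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                           const uint8_t *pix2, int stride2, int sums[2][4] )
;     {
;         for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;         {
;             int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;             for( int y = 0; y < 4; y++ )
;                 for( int x = 0; x < 4; x++ )
;                 {
;                     int a = pix1[x+y*stride1];
;                     int b = pix2[x+y*stride2];
;                     s1  += a;
;                     s2  += b;
;                     ss  += a*a + b*b;
;                     s12 += a*b;
;                 }
;             sums[z][0] = s1;
;             sums[z][1] = s2;
;             sums[z][2] = ss;
;             sums[z][3] = s12;
;         }
;     }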
    movq m5, [r0+(%1&1)*r1]
    movq m6, [r2+(%1&1)*r3]

cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
    movdqa m7, [pw_1 GLOBAL]
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
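; Per-block SSIM term in rough C (a sketch mirroring the register comments
; below; ssim_c1/ssim_c2 are the constants defined at the top of this file):
;     float ssim_end1( int s1, int s2, int ss, int s12 )
;     {
;         int vars  = ss*64 - s1*s1 - s2*s2;
;         int covar = s12*64 - s1*s2;
;         return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
;              / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
;     }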
cglobal x264_pixel_ssim_end4_sse2, 3,3,7
    movdqa m5, [ssim_c1 GLOBAL]
    movdqa m6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D 0, 1, 2, 3, 4
    ; s1=m0, s2=m1, ss=m2, s12=m3
    pmaddwd m4, m0 ; s1*s2
    pmaddwd m0, m0 ; s1*s1 + s2*s2
    psubd m3, m4 ; covar*2
    cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
    je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
    lea r3, [mask_ff + 16 GLOBAL]
    movdqu m1, [r3 + r2*4]
    movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
;=============================================================================
; Successive Elimination ADS
;=============================================================================

%macro ADS_START 1 ; unroll_size
    movsxd r5, dword r5m

%define ABS1 ABS1_MMX
;-----------------------------------------------------------------------------
; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
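; Rough C reference for ads4 (a sketch; enc_dc holds the four sub-block DC
; sums of the source block, sums/delta index precomputed candidate sums; the
; asm variants instead write a byte mask that x264_pixel_ads_mvs turns into
; an mv list):
;     int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
;                     uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;     {
;         int nmv = 0;
;         for( int i = 0; i < width; i++, sums++ )
;         {
;             int ads = abs( enc_dc[0] - sums[0] )
;                     + abs( enc_dc[1] - sums[8] )
;                     + abs( enc_dc[2] - sums[delta] )
;                     + abs( enc_dc[3] - sums[delta+8] )
;                     + cost_mvx[i];
;             if( ads < thresh )
;                 mvs[nmv++] = i;
;         }
;         return nmv;
;     }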
cglobal x264_pixel_ads4_mmxext, 4,7
    pshufw mm6, mm6, 0xAA
    pshufw mm4, mm4, 0xAA
    movq mm3, [r1+r2+16]
%ifdef WIN64
    pshufw mm1, [r10+stack_offset+56], 0
%elifdef ARCH_X86_64
    pshufw mm1, [r10+8], 0
%else
    pshufw mm1, [ebp+stack_offset+28], 0
%endif
cglobal x264_pixel_ads2_mmxext, 4,7
    pshufw mm6, mm6, 0xAA

cglobal x264_pixel_ads1_mmxext, 4,7
cglobal x264_pixel_ads4_%1, 4,7,12
    pshuflw xmm7, xmm4, 0
    pshuflw xmm6, xmm4, 0xAA
    pshufhw xmm5, xmm4, 0
    pshufhw xmm4, xmm4, 0xAA
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpckhqdq xmm5, xmm5
    punpckhqdq xmm4, xmm4
    pshuflw xmm8, r6m, 0
    punpcklqdq xmm8, xmm8
    movdqu xmm11, [r1+r2]

    movdqu xmm1, [r1+16]
    movdqu xmm3, [r1+r2+16]

    movdqu xmm1, [r1+16]
    movdqu xmm2, [r1+r2]
    movdqu xmm3, [r1+r2+16]

    movd xmm1, [ebp+stack_offset+28]
    pshuflw xmm1, xmm1, 0
    punpcklqdq xmm1, xmm1
cglobal x264_pixel_ads2_%1, 4,7,8
    pshuflw xmm7, xmm6, 0
    pshuflw xmm6, xmm6, 0xAA
    pshuflw xmm5, xmm5, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpcklqdq xmm5, xmm5
    movdqu xmm1, [r1+r2]

cglobal x264_pixel_ads1_%1, 4,7,8
    pshuflw xmm7, xmm7, 0
    pshuflw xmm6, xmm6, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r3+16]
%define ABS1 ABS1_SSSE3
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv = 0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i = 0; i < width; i += 8 )
;     {
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j = 0; j < 8; j++ )
;             if( mask & ((uint64_t)255<<(j*8)) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }
cglobal x264_pixel_ads_mvs, 0,7,0
    ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
    mov dword [rsp+r9], 0
    test edi, 0xff<<(%1*8)

    mov ebx, [ebp+stack_offset+20] ; mvs
    mov edi, [ebp+stack_offset+24] ; width
    mov dword [esp+edi], 0
    mov ebp, [esp+esi+4]
    mov edx, [esp+esi+8]
    test %2, 0xff<<(%1*8)